def addCategoryToScan(url):
    """Record the refinement links found on a category page.

    Fetches ``url`` with ``retrievePage``, reads the text of the
    ``h1#breadCrumb`` heading, and saves one ``Amazon_Textbook_Section_NR``
    per ``#bestRefinement > a`` link, titled ``"<breadcrumb> <link text>"``
    and pointing at the link's ``href``.  When the page has no such links,
    a single record for the page itself (``title=breadcrumb, url=url``) is
    saved instead.

    Args:
        url: Address of the page to inspect.

    Raises:
        IndexError: if the page has no ``h1#breadCrumb`` element.
    """
    import logging

    doc = pq(retrievePage(url))
    breadcrumb = toAscii(doc('h1#breadCrumb')[0].text_content())
    links = doc('#bestRefinement > a')

    # Assuming `pq` is pyquery's PyQuery: a selection is never None; a
    # selector that matches nothing yields an empty (falsy) collection.
    # The previous `is None` test therefore never fired and pages without
    # refinement links were silently dropped.
    if not links:
        section = Amazon_Textbook_Section_NR(title=breadcrumb, url=url)
        try:
            section.save()
        except Exception:
            # Best-effort save: keep going, but leave a trace instead of
            # swallowing the failure (and no longer trap KeyboardInterrupt
            # or SystemExit).
            logging.getLogger(__name__).exception(
                "could not save section for %s", url)
        return

    for link in links:
        # NOTE(review): len() of an lxml element is its number of child
        # elements, so an anchor that contains only text is skipped here.
        # Kept as-is to preserve behaviour -- confirm whether that is
        # intended.
        if len(link):
            section = Amazon_Textbook_Section_NR(
                title=breadcrumb + " " + link.text_content(),
                url=link.get('href'))
            section.save()
def addCategoryToScan(url):
    """Record the format refinements listed on a category page.

    The page at ``url`` is fetched with ``retrievePage`` and parsed with
    ``lhtml.fromstring``.  The text of the ``h1`` whose id is
    ``breadCrumb`` becomes the title prefix.  ``getFormatContainer`` picks
    one container out of the elements whose class is exactly
    ``refinementContainer``:

    * if it returns ``None``, one ``Amazon_Textbook_Section_NR`` is saved
      for the page itself (``title=breadcrumb, url=url``);
    * otherwise, for every ``div`` with class ``refinement`` inside that
      container that holds at least one ``a`` element, a record titled
      ``"<breadcrumb> <link text>"`` with the first link's ``href`` is
      saved.

    Args:
        url: Address of the page to inspect.

    Raises:
        IndexError: if the page has no ``h1`` with id ``breadCrumb``.
    """
    page = lhtml.fromstring(retrievePage(url))
    heading = page.xpath("//h1[@id='breadCrumb']")[0]
    breadcrumb = toAscii(heading.text_content())

    format_box = getFormatContainer(
        page.xpath(".//*[@class='refinementContainer']"))

    # Legacy note from the original author (Apr 27): at that time this
    # fallback was reportedly the branch always taken.
    if format_box is None:
        Amazon_Textbook_Section_NR(title=breadcrumb, url=url).save()
        return

    for refinement in format_box.xpath(".//div[@class='refinement']"):
        anchors = refinement.cssselect("a")
        if not anchors:
            # Refinement without a link: nothing to record.
            continue
        link = anchors[0]
        section = Amazon_Textbook_Section_NR(
            title=breadcrumb + " " + link.text_content(),
            url=link.get('href'))
        section.save()