def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']

    #TODO: test if this xpath should include other types of pages
    description_text_holder = hxs.select(
        "//p[@class='subtitle grey']/text()").extract()
    description_title_holder = hxs.select(
        "//h1/text()[normalize-space()!='']").extract()

    if description_text_holder:
        item['description_text'] = description_text_holder[0]
        item['description_title'] = description_title_holder[0]

        description_tokenized = Utils.normalize_text(item['description_text'])
        item['description_wc'] = len(description_tokenized)

        (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
            item['description_title'], item['description_text'])
    else:
        item['description_wc'] = 0

    yield item
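# All the callbacks in this file lean on two helpers from a shared Utils module
# that are not shown here: Utils.normalize_text (the tokenization behind the
# description_wc word count) and Utils.phrases_freq (keyword count/density of
# the title's words within the description). A minimal sketch of what they are
# assumed to do -- the real implementations may differ:

import re

class Utils(object):

    @staticmethod
    def normalize_text(text):
        # lowercase, drop punctuation, split on whitespace
        return re.sub(r"[^\w\s]", " ", text.lower()).split()

    @staticmethod
    def phrases_freq(title, text):
        # count occurrences of each title token in the description,
        # and express each count as a fraction of the description length
        tokens = Utils.normalize_text(text)
        counts = {}
        for word in Utils.normalize_text(title):
            counts[word] = tokens.count(word)
        total = len(tokens)
        density = dict((word, float(count) / total if total else 0.0)
                       for word, count in counts.items())
        return counts, density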
def _parse_category(self, response):
    category = response.meta['category']
    parent = response.meta.get('parent', {})

    category['catid'] = self._get_catid()
    category['url'] = response.url
    category['parent_text'] = parent.get('text')
    category['parent_url'] = parent.get('url')
    category['parent_catid'] = parent.get('catid')
    category['grandparent_text'] = parent.get('parent_text')
    category['grandparent_url'] = parent.get('parent_url')
    category['level'] = parent.get('level', 0) + 1
    category['department_text'] = response.meta['department']['text']
    category['department_url'] = response.meta['department']['url']
    category['department_id'] = response.meta['department']['catid']

    #category['description_text'] = self._description_text.first(response)
    description_text = first(response.xpath(self._xpath_description_text).extract())
    if description_text:
        category['description_wc'] = len(Utils.normalize_text(description_text))
    keywords = first(response.xpath(self._xpath_keywords).extract())
    if description_text:
        category['description_text'] = description_text
    if description_text and keywords:
        (category['keyword_count'],
         category['keyword_density']) = Utils.phrases_freq(keywords, description_text)

    if category.get('nr_products') is None:
        nr_products = re_find(
            '\d+', first(response.css(self._css_product_numbers_text).extract()))
        category['nr_products'] = int(nr_products) if nr_products is not None else None

    subcategory_links = LinkExtractor(restrict_xpaths=self._xpath_category_links)
    for link in subcategory_links.extract_links(response):
        text, nr_products = re.search('(.+?) \((\d+)\) *', link.text).groups()
        nr_products = int(nr_products)
        child = CategoryItem(text=text, nr_products=nr_products)
        meta = {'category': child,
                'department': response.meta['department'],
                'parent': category}
        yield Request(link.url, callback=self._parse_category, meta=meta)

    yield category
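# _parse_category uses two tiny helpers, first() and re_find(), that are not
# defined in this section. A plausible sketch, assuming they behave like their
# names suggest (first element / first regex match, or a default of None):

import re

def first(sequence, default=None):
    # first element of a (possibly empty) sequence of extracted strings
    for element in sequence:
        return element
    return default

def re_find(pattern, text, default=None):
    # first match of pattern in text; default when text is None or no match
    if text is None:
        return default
    match = re.search(pattern, text)
    return match.group() if match else default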
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']

    # extract number of products if available
    #TODO check
    count_holder = hxs.select(
        "//div[@class='recordCount']/span[@id='RecordCount_1']/text()")
    if count_holder:
        item['nr_products'] = int(count_holder.extract()[0])
    #TODO
    # try to change URL "Category" to "SubCategory", see if you find the product count there

    # extract description if available
    description_holders = hxs.select("//div[@id='bcaShopWindowSEO']")
    # if the list is not empty and contains at least one non-whitespace item
    if description_holders:
        description_texts = description_holders.select(
            ".//text()[not(ancestor::h2)]").extract()
        # replace all whitespace with one space, strip, and remove empty texts; then join them
        item['description_text'] = " ".join([
            re.sub("\s+", " ", description_text.strip())
            for description_text in description_texts
            if description_text.strip()
        ])

        tokenized = Utils.normalize_text(item['description_text'])
        item['description_wc'] = len(tokenized)

        description_title = description_holders.select(".//h2/text()").extract()
        if description_title:
            item['description_title'] = description_title[0].strip()
            (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                item['description_title'], item['description_text'])
    else:
        item['description_wc'] = 0

    yield item

    parent = item

    #TODO
    # extract and parse subcategories
    subcats = hxs.select("//dl[@class='categoryList primaryNav']/dd/a")
    for subcat in subcats:
        item = CategoryItem()
        item['text'] = subcat.select("text()").extract()[0].strip()
        #TODO: check out some huge URLs
        item['url'] = self.clean_url(subcat.select("@href").extract()[0])
        item['parent_text'] = parent['text']
        item['parent_url'] = parent['url']
        item['level'] = parent['level'] - 1

        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        yield Request(url=item['url'], callback=self.parseCategory,
                      meta={"item": item,
                            "department_text": response.meta['department_text'],
                            "department_url": response.meta['department_url'],
                            "department_id": response.meta['department_id']})
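# self.clean_url above is defined elsewhere in this spider. Given the
# neighbouring TODO about "huge URLs", it is assumed to absolutize the href and
# drop query/fragment noise; a hypothetical sketch (the base URL handling and
# the exact cleanup rules are guesses):

import urlparse  # urllib.parse on Python 3

def clean_url(url, base_url):
    # resolve relative hrefs against the site root, then strip query and fragment
    absolute = urlparse.urljoin(base_url, url)
    scheme, netloc, path, _, _ = urlparse.urlsplit(absolute)
    return urlparse.urlunsplit((scheme, netloc, path, '', ''))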
def parseSubcategory(self, response):
    hxs = HtmlXPathSelector(response)

    subcategory = response.meta['item']

    # yield this subcategory
    yield subcategory

    # if subcategory was special, we'll mark all subsubcategories as special
    if 'special' in subcategory:
        special = True
    else:
        special = False

    # get its subcategories
    subsubcategories = hxs.select(
        "//div[@class='product-category-expanded']//h3[@class='title']")

    for subsubcategory in subsubcategories:
        item = CategoryItem()
        item['text'] = subsubcategory.select("a/text()").extract()[0]
        item['url'] = Utils.add_domain(
            subsubcategory.select("a/@href").extract()[0], self.base_url)

        if special:
            item['special'] = 1

        item['parent_text'] = subcategory['text']
        item['parent_url'] = subcategory['url']
        item['department_text'] = subcategory['department_text']
        item['department_url'] = subcategory['department_url']
        item['department_id'] = subcategory['department_id']
        item['level'] = subcategory['level'] - 1

        description_text_holder = subsubcategory.select(
            "following-sibling::p[@class='description'][1]/text()").extract()
        if description_text_holder:
            item['description_text'] = description_text_holder[0]
            item['description_title'] = item['text']

            description_tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(description_tokenized)

            (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        # parse subcategory page to get product count, or further subsubcategory
        yield Request(item['url'], callback=self.parseSubcategoryPage,
                      meta={'item': item})
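# Utils.add_domain, used here and in several spiders below, absolutizes
# relative hrefs against the spider's root URL. A minimal sketch of the
# assumed behaviour:

import urlparse  # urllib.parse on Python 3

def add_domain(url, root_url):
    # leave absolute URLs untouched, join relative ones onto the site root
    if url.startswith("http://") or url.startswith("https://"):
        return url
    return urlparse.urljoin(root_url, url)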
def _populate_from_html(self, response):
    """Set HTML-dependent fields"""
    category = response.meta['category']

    #description = response.xpath('//div[@class="category-description std"]/*[not(a[@class="viewAllCats"])]')
    description = response.xpath('//div[@class="category-description std"]/node()')
    description = SelectorList(
        filter(lambda itm: not len(itm.css('.viewAllCats')), description))
    # strip before the `or None` so an empty description can't reach .strip() as None
    description = ' '.join(description.extract()).strip(' \n\r\t') or None

    desc_title = (response.css('.category-title h1::text').extract() or [None])[0]

    self._set_value(category, 'description_text', description)
    self._set_value(category, 'description_title', desc_title)

    tokenized = Utils.normalize_text(description) if description else []
    category['description_wc'] = len(tokenized)

    if description and desc_title:
        category['keyword_count'], category['keyword_density'] = \
            Utils.phrases_freq(desc_title, description)
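# self._set_value is not shown in this section; from the way it is called with
# possibly-None description values, it presumably assigns a field only when
# the value is truthy. A hedged sketch:

def _set_value(self, item, key, value):
    # only set the field when there is something to set, so absent
    # descriptions don't end up as None/empty entries on the item
    if value:
        item[key] = value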
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    # get parent item from response, extract additional info and return it
    item = response.meta['parent']

    # add department name, url and id for item
    item['department_text'] = response.meta['department_text']
    item['department_url'] = response.meta['department_url']
    item['department_id'] = response.meta['department_id']

    # extract product count if available
    nr_items_holder = hxs.select(
        "//div[@id='showing']/strong[position()=2]/text()").extract()
    if nr_items_holder:
        item['nr_products'] = int(nr_items_holder[0])

    # extract description if available
    # these are descriptions for services pages
    desc_title_holder = hxs.select(
        "//div[@id='searchstate']/a[position()=2]/text()").extract()
    if desc_title_holder:
        item['description_title'] = desc_title_holder[0].strip()
    desc_content_holder = hxs.select(
        "//div[@class='content']/h3/text()").extract()
    if desc_content_holder:
        item['description_text'] = desc_content_holder[0].strip()

        tokenized = Utils.normalize_text(item['description_text'])
        item['description_wc'] = len(tokenized)

        (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
            item['description_title'], item['description_text'])
    else:
        item['description_wc'] = 0

    yield item

    # extract its subcategories
    #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
    subcats_holders = hxs.select(
        "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
    if subcats_holders:
        subcats_holder = subcats_holders[0]
        # these are subcategories if they are preceded by the title "Shop ..."
        title = subcats_holder.select(
            "parent::node()/preceding-sibling::node()//text()").extract()[0]
        if title.startswith("Shop"):
            subcats = subcats_holder.select(".//li/a")
            for subcat in subcats:
                item = CategoryItem()
                item['text'] = subcat.select("text()").extract()[0].strip()
                item['url'] = Utils.add_domain(
                    subcat.select("@href").extract()[0],
                    "http://www.bestbuy.com")
                parent = response.meta['parent']
                item['level'] = int(response.meta['level']) - 1

                # if parent was special, this category is special too
                if 'special' in parent:
                    item['special'] = 1

                item['parent_text'] = parent['text']
                item['parent_url'] = parent['url']

                request = Request(url=item['url'], callback=self.parseCategory,
                                  meta={'parent': item,
                                        'level': item['level'],
                                        'department_text': response.meta['department_text'],
                                        'department_url': response.meta['department_url'],
                                        'department_id': response.meta['department_id']})
                yield request
def parseDept(self, response):
    # for "copy & print" there's an exception, we don't need zipcode

    # # use selenium to complete the zipcode form and get the first results page
    # driver = webdriver.Firefox()
    # driver.get(response.url)

    # # set a hardcoded value for zipcode
    # zipcode = "12345"

    # textbox = driver.find_element_by_name("zipCode")
    # textbox.send_keys(zipcode)

    # button = driver.find_element_by_id("submitLink")
    # button.click()

    # cookie = {"zipcode": zipcode}
    # driver.add_cookie(cookie)

    # time.sleep(5)

    # # convert html to "nice format"
    # text_html = driver.page_source.encode('utf-8')
    # #print "TEXT_HTML", text_html
    # html_str = str(text_html)

    # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
    # resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)

    # hxs = HtmlXPathSelector(resp_for_scrapy)

    #TODO: doesn't extract Televisions for ex

    hxs = HtmlXPathSelector(response)
    categories = hxs.select("//h2/a")

    root_url = "http://www.staples.com"

    # from parent's page:
    item = response.meta['parent']

    # add department name, url and id to item
    item['department_text'] = response.meta['department_text']
    item['department_url'] = response.meta['department_url']
    item['department_id'] = response.meta['department_id']

    # extract number of items, if any
    nritems_holder = hxs.select(
        "//div[@class='perpage']/span[@class='note']/text()").extract()
    if nritems_holder:
        m = re.findall("[0-9]+\s*items", nritems_holder[0])
        if m:
            item['nr_products'] = int("".join(re.findall("[0-9]+", m[0])))
        # else:
        #     print "NOT MATCH ", nritems_holder[0]

    # extract description, if any
    description_texts = hxs.select(
        "//h2[@class='seo short']//text() | //h2[@class='seo short long']//text()"
    ).extract()
    if description_texts and reduce(
            lambda x, y: x or y,
            [line.strip() for line in description_texts]):
        # replace all whitespace with one space, strip, and remove empty texts; then join them
        item['description_text'] = " ".join([
            re.sub("\s+", " ", description_text.strip())
            for description_text in description_texts
            if description_text.strip()
        ])

        if item['description_text']:
            item['description_title'] = item['text']

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                item['description_title'], item['description_text'])
        else:
            # if no description is found
            #print 'desc_holder but no desc_text ', response.url
            item['description_wc'] = 0
    else:
        item['description_wc'] = 0

    # yield item the request came from (parent)
    yield item

    # extract subcategories
    for category in categories:
        # there are pages that don't have categories
        item = CategoryItem()
        text = category.select("text()").extract()
        if text:
            item['text'] = text[0]
        url = category.select("@href").extract()
        if url:
            item['url'] = root_url + url[0]
        item['level'] = int(response.meta['level'] - 1)

        if 'text' in response.meta['parent']:
            item['parent_text'] = response.meta['parent']['text']
        else:
            print 'no text in parent ', response.meta['parent']
        item['parent_url'] = response.url

        # yield the item after passing it through request and collecting additional info
        #yield item

        # extract subcategories if any
        zipcode = "12345"
        request = Request(item['url'], callback=self.parseDept,
                          cookies={"zipcode": zipcode},
                          headers={"Cookie": "zipcode=" + zipcode},
                          meta={"dont_redirect": True,
                                "dont_merge_cookies": True,
                                "parent": item,
                                "level": item['level'],
                                "department_text": response.meta["department_text"],
                                "department_url": response.meta["department_url"],
                                "department_id": response.meta["department_id"]})
        yield request
def parseCategory(self, response):

    # if we are getting blocked by captcha, solve and redirect back here
    # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
    if self.has_captcha(response.body) and \
            ('retry_count' not in response.meta or response.meta['retry_count'] > 0):
        # meta of response will contain number of retries left if set
        yield self.solve_captcha_and_redirect(response, self.parseCategory)
        return

    hxs = HtmlXPathSelector(response)

    # extract additional info for received parent and return it
    item = response.meta['item']

    # extract product count if available and not already extracted
    # (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
    if 'nr_products' not in item:
        prod_count_holder = hxs.select(
            "//h2[@class='resultCount']/span/text()").extract()
        if prod_count_holder:
            prod_count = prod_count_holder[0]

            # extract number
            # for paged results: Showing ... out of ... Results
            m = re.match(".*\s*of\s+([0-9,]+)\s+Results\s*", prod_count)
            # for one page results: Showing ... Result(s)
            if not m:
                m = re.match(".*\s+([0-9,]+)\s+Results?\s*", prod_count)
            if m:
                item['nr_products'] = int(re.sub(",", "", m.group(1)))

    # extract description if available
    # only extracts descriptions that contain a h2. is that good?
    desc_holders = hxs.select(
        "//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
    # select the one among these with the most text
    #TODO: another idea: check if the holder has a h2 item
    if desc_holders:
        maxsize = 0
        max_desc_holder = desc_holders[0]
        for desc_holder in desc_holders:
            size = len(" ".join(desc_holder.select(".//text()").extract()))
            if size > maxsize:
                maxsize = size
                max_desc_holder = desc_holder
        desc_holder = max_desc_holder

        desc_title = desc_holder.select("h2/text()").extract()
        if desc_title:
            item['description_title'] = desc_title[0].strip()

        description_texts = desc_holder.select(
            ".//text()[not(ancestor::h2)]").extract()

        # if the list is not empty and contains at least one non-whitespace item
        # if there is a description title or the description body is large enough
        size_threshold = 50
        if (description_texts and reduce(
                lambda x, y: x or y,
                [line.strip() for line in description_texts])):  # and \
            #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            if desc_title:
                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                    item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0
    else:
        item['description_wc'] = 0

    # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, and no product count was found,
    # add info from that url
    extra_category = self.find_matching_key(
        item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

    # crawl lower level categories
    if item['level'] > self.LEVEL_BARRIER:
        if extra_category:
            # collect number of products from this alternate URL
            # this will also extract subcategories and their count
            yield Request(self.EXTRA_TOPLEVEL_CATEGORIES_URLS[extra_category],
                          callback=self.extractSubcategories,
                          meta={'item': item})
        else:
            # extract subcategories and their count for category even if not in extra_...
            yield Request(item['url'],
                          callback=self.extractSubcategories,
                          meta={'item': item})
    else:
        yield item
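# has_captcha and solve_captcha_and_redirect are helpers defined elsewhere in
# this spider. The sketch below only models the retry bookkeeping implied by
# the guard above (the real helper presumably also submits a captcha
# solution); the marker strings and the default of 3 retries are assumptions:

from scrapy.http import Request

def has_captcha(self, body):
    # crude detection of the robot-check interstitial
    return "Robot Check" in body or "captcha" in body.lower()

def solve_captcha_and_redirect(self, response, callback):
    # re-request the same URL with one fewer retry left,
    # so the guard in parseCategory eventually gives up
    retries = response.meta.get('retry_count', 3) - 1
    meta = dict(response.meta, retry_count=retries)
    return Request(response.url, callback=callback, meta=meta, dont_filter=True)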
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag that indicates that for this spider, nr of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) triples
        # to eliminate duplicates
        # (adding department_url makes sure that if one entire department is found as a
        # subcategory of another for ex, both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded values for special categories' item count. Currently used for 'Value of the day',
        # which typically has a fixed number of products and nowhere to extract it from the page
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]")

        # #TODO: check this
        #     item['nr_products'] = -1
        #     yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description
            # and number of products and add them to the item
            yield Request(item['url'], callback=self.parseCategory,
                          meta={'item': item,
                                'department_text': item['text'],
                                'department_url': item['url'],
                                'department_id': department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError, e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n", level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()

            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name,
            # ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/"
                "ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]")

            # then filter by regex only ones whose text contains at least one letter
            # (e.g. for customer rating subcats, which have no name, only a picture
            # with the nr of stars - we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"), subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select("@title").extract()[0]
                    # replace &nbsp; with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp;", " ").strip()

                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()

                    # if there is a count field, clean it up:
                    # eliminate parentheses surrounding the number
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)", subcategory_prodcount[0].strip())
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[0].strip()
                    else:
                        # if there is no product count in a separate element,
                        # try to extract it from the subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract()[0] \
                            .replace("&nbsp;", " ").replace(u"\xa0", " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:
        # first search for the description id they usually use,
        # second one is used more rarely and also with some false positives,
        # so it needs to be checked for text length as well

        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with the basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with much text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description, look for its parent
        # (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN,
            # or string length for entire text content? I think it means entire text content. We're ok
            description_holder = hxs.select(
                "//p/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than
                # DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) + "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                # and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(desc_winner.select("count(*)").extract()[0])
                    children_new = float(desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate
            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description;
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() ").extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element
                # or an <h1> element, which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                    [line.strip() for line in description_texts]):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's larger than 4096 characters and not found with the main rule,
                # it's probably not a description; it causes problems to the PHP script as well. Ignore it
                if len(description_text) < 4096 or found:
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ", item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes there is no description title because of malformed html;
                # if we can find description text but not description title,
                # title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select("./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                        item['description_title'], item['description_text'])
            else:
                item['description_wc'] = 0
        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # find if there is a wc field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count found
        if 'nr_products' in item:
            yield item
        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']")

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in the menu
                # # on the left under a "Shop by Category" header
                # subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in the menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create a new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            ########################################

            if subcategories_links:
                # new categories are subcategories of the current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip
                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0], self.root_url)
                    text = subcategory.select("text()").extract()
                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something other than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again, if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']) not in self.crawled:
                        yield Request(item['url'], callback=self.parseCategory,
                                      meta={'item': item,
                                            'department_text': response.meta['department_text'],
                                            'department_url': response.meta['department_url'],
                                            'department_id': response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending parent and collecting nr products: send all of these subcats
                # as a list in meta, pass it on, and when the list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
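# DESC_LEN and DESC_PAR_LEN are module-level constants referenced by the
# description heuristics above; their values are taken from the inline
# comments ("> DESC_LEN (200) characters", "DESC_PAR_LEN (30)"):

DESC_LEN = 200      # minimum text-node length for a fallback description holder
DESC_PAR_LEN = 30   # minimum paragraph length counted when ranking holders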
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    # output received parent element after extracting additional info
    item = response.meta['parent']

    # add department name, url and id to item
    item['department_text'] = response.meta['department_text']
    item['department_url'] = response.meta['department_url']
    item['department_id'] = response.meta['department_id']

    # extract number of items if available
    prod_count_holder = hxs.select(
        "//span[@id='productCount']/text()").extract()
    if prod_count_holder:
        item['nr_products'] = int(prod_count_holder[0].strip())

    # extract description if available
    desc_holder = hxs.select("//div[@id='catalogCopyBlock']")
    if desc_holder:
        item['description_title'] = desc_holder.select("h2/text()").extract()[0]
        description_texts = desc_holder.select("p/text()").extract()
        # if the list is not empty and contains at least one non-whitespace item
        if description_texts and reduce(
                lambda x, y: x or y,
                [line.strip() for line in description_texts]):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0
    else:
        item['description_wc'] = 0

    yield item

    chapters = hxs.select("//li[@class='nav_cat_item_bold']")
    for chapter in chapters:
        #TODO: still includes some special categories (like "Coming Soon" in men)

        # exclude "Brands" chapter
        chapter_name = chapter.select("span/text()").extract()
        if not chapter_name or "brands" in chapter_name[0]:
            continue

        subcats = chapter.select("ul/li/a")
        for subcat in subcats:
            item = CategoryItem()
            text = subcat.select('text()').extract()[0]
            # if it starts with "Shop all", ignore it
            if re.match("Shop [aA]ll.*", text):
                continue
            else:
                item['text'] = text

            # remove unnecessary suffix from URL
            url = subcat.select('@href').extract()[0]
            m = re.match("(.*\?id=[0-9]+)&?.*", url)
            if m:
                item['url'] = m.group(1)
            else:
                item['url'] = url

            item['level'] = int(response.meta['level']) - 1
            item['parent_text'] = response.meta['parent']['text']
            item['parent_url'] = response.url

            #yield item
            yield Request(item['url'], callback=self.parseCategory,
                          meta={'parent': item,
                                'level': item['level'],
                                'department_text': response.meta['department_text'],
                                'department_url': response.meta['department_url'],
                                'department_id': response.meta['department_id']})
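# CategoryItem is the scrapy Item subclass shared by all the spiders in this
# section; its declaration isn't shown here. A sketch reconstructed from the
# fields the callbacks above and below actually set (the field list may be
# incomplete):

from scrapy.item import Item, Field

class CategoryItem(Item):
    text = Field()
    url = Field()
    catid = Field()
    level = Field()
    special = Field()
    nr_products = Field()
    classification = Field()
    parent_text = Field()
    parent_url = Field()
    parent_catid = Field()
    grandparent_text = Field()
    grandparent_url = Field()
    department_text = Field()
    department_url = Field()
    department_id = Field()
    description_text = Field()
    description_title = Field()
    description_wc = Field()
    keyword_count = Field()
    keyword_density = Field()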
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    # extract additional info for received parent and return it
    item = response.meta['item']

    # extract product count if available and not already extracted
    # (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
    if 'nr_products' not in item:
        prod_count_holder = hxs.select(
            "//h2[@class='resultCount']/span/text()").extract()
        if prod_count_holder:
            prod_count = prod_count_holder[0]
            # extract number
            m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
            if m:
                item['nr_products'] = int(re.sub(",", "", m.group(1)))

    # extract description if available
    # only extracts descriptions that contain a h2. is that good?
    desc_holders = hxs.select(
        "//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
    # select the one among these with the most text
    #TODO: another idea: check if the holder has a h2 item
    if desc_holders:
        maxsize = 0
        max_desc_holder = desc_holders[0]
        for desc_holder in desc_holders:
            size = len(" ".join(desc_holder.select(".//text()").extract()))
            if size > maxsize:
                maxsize = size
                max_desc_holder = desc_holder
        desc_holder = max_desc_holder

        desc_title = desc_holder.select("h2/text()").extract()
        if desc_title:
            item['description_title'] = desc_title[0].strip()

        description_texts = desc_holder.select(
            ".//text()[not(ancestor::h2)]").extract()

        # if the list is not empty and contains at least one non-whitespace item
        # if there is a description title or the description body is large enough
        size_threshold = 50
        if (description_texts and reduce(
                lambda x, y: x or y,
                [line.strip() for line in description_texts])):  # and \
            #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            if desc_title:
                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                    item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0
    else:
        item['description_wc'] = 0

    # if item is found among extra_toplevel_categories_urls, and no product count was found,
    # add info from that url
    extra_category = self.find_matching_key(
        item['text'], self.extra_toplevel_categories_urls)

    #yield item

    # crawl level 0 categories (only for their product count and subcategories - no descriptions...)
    if 'nr_products' not in item or item['level'] > self.LEVEL_BARRIER:
        if extra_category:
            # collect number of products from this alternate URL
            # this will also extract subcategories and their count
            yield Request(self.extra_toplevel_categories_urls[extra_category],
                          callback=self.extract_nrprods_and_subcats,
                          meta={'item': item})
        else:
            # extract subcategories and their count for category even if not in extra_...
            yield Request(item['url'],
                          callback=self.extract_nrprods_and_subcats,
                          meta={'item': item})
    else:
        yield item
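# find_matching_key appears in both Amazon-style variants above but is defined
# elsewhere. From its call sites it maps a category name to the key of the
# extra toplevel-categories dict it best matches. A hedged sketch, assuming a
# simple case-insensitive containment test:

def find_matching_key(self, text, dictionary):
    # return the first dict key matching the category text in either
    # direction, ignoring case; None when nothing matches
    text = text.lower().strip()
    for key in dictionary:
        if key.lower() in text or text in key.lower():
            return key
    return None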
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']

    # extract number of products if available
    nrproducts_holder = hxs.select(
        "//div[@class='resultsfilterBottom']/div[@class='itemsShowresult']/strong[2]/text()"
    ).extract()
    if nrproducts_holder:
        item['nr_products'] = int(nrproducts_holder[0])

    # extract description if available
    description_holders = hxs.select("//div[@class='textBlock']")
    # if the list is not empty and contains at least one non-whitespace item
    if description_holders:
        description_texts = description_holders.select(
            ".//text()[not(ancestor::h2)]").extract()
        # replace all whitespace with one space, strip, and remove empty texts; then join them
        desc_text = " ".join([
            re.sub("\s+", " ", description_text.strip())
            for description_text in description_texts
            if description_text.strip()
        ])
        if desc_text:
            item['description_text'] = desc_text

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)
        else:
            item['description_wc'] = 0

        description_title = description_holders.select(".//h2/text()").extract()
        if description_title:
            item['description_title'] = description_title[0].strip()
            if desc_text:
                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                    item['description_title'], item['description_text'])
    else:
        item['description_wc'] = 0

    self.parsed_urls.append(item['url'])

    yield item

    # extract subcategories
    product_links = hxs.select(
        "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
    ).extract()
    # only extract subcategories if product links not found on page
    if not product_links:
        parent = item

        # search for a link to "See All Products"
        seeall = hxs.select(
            "//span[text()='See All Products']/parent::node()/@href").extract()
        if seeall:
            # pass the page with subcategories menu to a method to parse it
            #print 'parsing seeall: from ', response.url, ' to ', Utils.add_domain(seeall[0], "http://www.tigerdirect.com")
            yield Request(url=Utils.add_domain(seeall[0], "http://www.tigerdirect.com"),
                          callback=self.parseSubcats,
                          meta={'parent': parent,
                                'department_text': response.meta['department_text'],
                                'department_url': response.meta['department_url'],
                                'department_id': response.meta['department_id']})
        else:
            # pass the current page (with subcategories menu on it) to a method to parse it
            #print 'parsing for subcategories ', response.url
            yield Request(url=response.url, callback=self.parseSubcats,
                          meta={'parent': parent,
                                'department_text': response.meta['department_text'],
                                'department_url': response.meta['department_url'],
                                'department_id': response.meta['department_id']})
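# Caveat on the second branch above: re-requesting response.url with a new
# callback is normally dropped by Scrapy's duplicate-request filter, since
# that exact URL was just fetched. If the re-visit is intended, the request
# needs dont_filter=True (an assumption -- the project may instead configure
# its own dupefilter):

yield Request(url=response.url, callback=self.parseSubcats,
              dont_filter=True,  # bypass the dupefilter for the same-URL re-visit
              meta={'parent': parent,
                    'department_text': response.meta['department_text'],
                    'department_url': response.meta['department_url'],
                    'department_id': response.meta['department_id']})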
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    item = response.meta['item']

    # Add department text, url and id to item
    item['department_text'] = response.meta['department_text']
    item['department_url'] = response.meta['department_url']
    item['department_id'] = response.meta['department_id']

    # assign unique id
    item['catid'] = self.id_count
    self.id_count += 1

    # Extract subcategories breakdown if any ("classification" field)
    classification_criteria = hxs.select(
        "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
    classification_dictionary = {}
    for criterion in classification_criteria:
        criterion_name = criterion.select(
            ".//text()[normalize-space()!='']").extract()[0].strip()

        # extract subcategories by this criterion:
        # find first subcategories list element following this criterion name,
        # ignore if subcategory text starts with "See " ("See fewer", "See more")
        subcategories = criterion.select(
            "following-sibling::div[contains(@class,'accordionContainer')][1]/"
            "ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]")

        # then filter by regex only ones whose text contains at least one letter
        # (e.g. for customer rating subcats, which have no name, only a picture
        # with the nr of stars - we don't want them)
        subcategories = filter(
            lambda x: x.select("text()").re(".*[A-Za-z]+.*"), subcategories)

        # if we found these, create the classification dictionary
        if criterion_name and subcategories:
            subcategories_list = []
            for subcategory in subcategories:
                subcategory_name = subcategory.select("@title").extract()[0]
                # replace &nbsp; with space, trim
                subcategory_name = subcategory_name.replace("&nbsp;", " ").strip()

                # extract product count
                subcategory_prodcount = subcategory.select(
                    "span[@class='count']/text()").extract()

                # if there is a count field, clean it up:
                # eliminate parentheses surrounding the number
                if subcategory_prodcount:
                    m = re.match("\(([0-9]+)\)", subcategory_prodcount[0].strip())
                    if m:
                        subcategory_prodcount = m.group(1)
                    else:
                        subcategory_prodcount = subcategory_prodcount[0].strip()
                else:
                    # if there is no product count in a separate element,
                    # try to extract it from the subcategory name
                    subcategory_name = subcategory.select(
                        ".//text()[normalize-space()!='']").extract()[0] \
                        .replace("&nbsp;", " ").replace(u"\xa0", " ").strip()
                    m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                    if m:
                        subcategory_prodcount = m.group(2)
                        subcategory_name = m.group(1).strip()

                if subcategory_name and subcategory_prodcount:
                    subcategory_item = {
                        "name": subcategory_name,
                        "nr_products": int(subcategory_prodcount)
                    }
                    subcategories_list.append(subcategory_item)

            classification_dictionary[criterion_name] = subcategories_list

    if classification_dictionary:
        item['classification'] = classification_dictionary

    ##########################################################################################
    #
    # Extract description title, text, wordcount, and keyword density (if any)

    ###########################################
    #TODO:
    # Exceptions:
    # http://www.walmart.com/cp/5431?povid=cat1078944-env506746-moduleA030213-lLinkLHNRelatedCategories2Pharmacy
    #   - finds wrong title (also wrong description holder - too high level)
    # http://www.walmart.com/cp/1102793?povid=cat1094926-env999999-moduleA030713-lLinkLHNLearnmoreAbouttheprogram
    #   - finds description, actually no description, CustomPOV... with large text inside, hard to fix
    # http://brands.walmart.com/fishing/essential-rods-and-reels/
    #   - finds description, actually no description. Just an element with much text
    # http://brands.walmart.com/fishing/get-salty-with-your-bass-skills/
    #   - finds description, actually no description. Just an element with much text
    # http://instoresnow.walmart.com/article.aspx?Center=Pets&id=104225
    #   - finds description, actually no description. Just an element with much text
    # http://brands.walmart.com/fishing/turn-a-kid-on-to-flyfishing/
    #   - finds description, actually no description. Just an element with much text
    # http://www.walmart.com/cp/1094926?povid=cat121828-env999999-moduleA030713-lLinkGNAV1_Campaign_EmpoweringWomenTogether
    #   - finds description, actually no description. Just an element with much text
    # http://www.walmart.com/ip/Straight-Talk-Samsung-Galaxy-S-III/23573710?povid=cat1105910-env542259-moduleA092613-lLinkLHNWhatsNewSamsungSIIIStraightTalk
    #   - finds description, actually no description. Just an element with much text
    # http://www.walmart.com/cp/Bakery/120764
    #   - finds description, actually no description. Just an element with much text, also title problem
    # http://www.walmart.com/cp/1078665
    #   - not a description, also imperfect title extraction
    # http://www.walmart.com/cp/1101244?povid=cat1100706-env999999-module122012-LHN_HealthyLivingTips
    #   - wrong title extraction, extracts too much as a description holder
    # http://www.walmart.com/cp/flexible-spending-account/555326
    #   - finds description though no description, just large text (also bad title extraction)

    # Idea for excluding elements with much text that are false positives:
    # check if element is composed of many sibling paragraphs or so

    ###########################################

    # first search for the description id they usually use,
    # second one is used more rarely and also with some false positives,
    # so it needs to be checked for text length as well

    # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
    description_holder = hxs.select(
        "//div[@id='detailedPageDescriptionCopyBlock']")

    # flag to tell if we found it with the basic rule
    found = True

    if not description_holder:
        found = False
        description_holder = hxs.select(
            "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
            + str(DESC_LEN) + "]/parent::*/parent::*")

    # if none was found, try to find an element with much text (> DESC_LEN (200) characters)
    # this is going to be a paragraph in the description, look for its parent
    # (containing the entire description)
    if not description_holder:
        #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
        #TODO: !!does this mean string length for one paragraph is > DESC_LEN,
        # or string length for entire text content? I think it means entire text content. We're ok
        description_holder = hxs.select(
            "//p/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")

    # select element among these with most text
    if description_holder:
        desc_winner = description_holder[0]
        max_text = 0
        for desc_candidate in description_holder:
            # consider only text that is under a <p> tag and that has more than
            # DESC_PAR_LEN (30) characters - then it's likely a description paragraph
            description_texts = desc_candidate.select(
                ".//p//text()[string-length()>" + str(DESC_PAR_LEN) + "]").extract()
            text_len = len(" ".join(description_texts))
            if text_len > max_text:
                max_text = text_len
                desc_winner = desc_candidate
            # if text length is the same, assume one of them is parent of the other
            # and select the one with greater depth (fewer children)
            elif text_len == max_text and text_len != 0:
                children_old = float(desc_winner.select("count(*)").extract()[0])
                children_new = float(desc_candidate.select("count(*)").extract()[0])
                if children_new < children_old:
                    desc_winner = desc_candidate
        description_holder = desc_winner

    # try to find description title in <b> tag in the holder;
    # if it's not found, try to find it in the first <p> of the description;
    # if found there, exclude it from the description body
    if description_holder:
        #TODO:
        # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
        # to fix Money Center problem. but maybe it's not always inside p?
        description_title = description_holder.select(
            ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() ").extract()
        if description_title:
            # this will implicitly get the first occurrence of either a <b> element
            # or an <h1> element, which is likely to be the title (the title usually comes first)
            item['description_title'] = description_title[0].strip()

        description_texts = description_holder.select(
            "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
            | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]").extract()

        # if the list is not empty and contains at least one non-whitespace item
        if description_texts and reduce(
                lambda x, y: x or y,
                [line.strip() for line in description_texts]):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            description_text = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            # if it's larger than 4096 characters and not found with the main rule,
            # it's probably not a description; it causes problems to the PHP script as well. Ignore it
            if len(description_text) < 4096 or found:
                item['description_text'] = description_text

                # replace line breaks with space
                item['description_text'] = re.sub("\n+", " ", item['description_text'])

        if 'description_text' in item:
            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            # sometimes there is no description title because of malformed html;
            # if we can find description text but not description title,
            # title is probably malformed - get first text in div instead
            if 'description_title' not in item:
                desc_texts = description_holder.select("./text()").extract()
                desc_texts = [text for text in desc_texts if text.strip()]
                if desc_texts:
                    item['description_title'] = desc_texts[0].strip()

            if 'description_title' in item:
                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(
                    item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0
    else:
        item['description_wc'] = 0

    #
    ##################################################################################

    # Extract product count

    # find if there is a wc field on the page
    wc_field = hxs.select(
        "//div[@class='mrl mod-toggleItemCount']/span/text() |\
        //div[@class='SPRecordCount']/text()").extract()
    if wc_field:
        m1 = re.match("([0-9]+) Results", wc_field[0])
        if m1:
            item['nr_products'] = int(m1.group(1))
        m2 = re.match(
            "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
        if m2:
            item['nr_products'] = int(m2.group(1))
        yield item
    else:
        # look for links to subcategory pages in menu
        subcategories_links = hxs.select(
            "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']")

        if not subcategories_links:
            # # if we haven't found them, try to find subcategories in the menu
            # # on the left under a "Shop by Category" header
            # subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

            # if we haven't found them, try to find subcategories in the menu on the left - get almost anything
            subcategories_links = hxs.select(
                "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                and not(contains(text(), 'Featured Brands'))]\
                /following-sibling::node()//a")

        # if we found them, create a new category for each and parse it from the beginning

        #TODO
        ########################################
        # Exceptions - doesn't find anything for:
        #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
        #
        ########################################

        if subcategories_links:
            # new categories are subcategories of the current one - calculate and store their level
            parent_item = item
            level = parent_item['level'] - 1

            #print "URL ", response.url, " CALLING PARSEPAGE"
            for subcategory in subcategories_links:

                # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                # if level < -3 assume we've been there and skip
                if level < -3:
                    continue

                item = CategoryItem()
                item['url'] = Utils.add_domain(
                    subcategory.select("@href").extract()[0], self.root_url)
                text = subcategory.select("text()").extract()
                if text:
                    item['text'] = text[0].strip()
                else:
                    # usually means it's something other than what we need
                    #TODO: check
                    continue
                    #print "no text for subcategory ", item, response.url

                # # take care of unicode
                # item['text'] = item['text'].encode("utf-8", errors=ignore)

                item['level'] = level

                item['parent_text'] = parent_item['text']
                item['parent_url'] = parent_item['url']
                item['parent_catid'] = parent_item['catid']

                if 'parent_text' in parent_item:
                    item['grandparent_text'] = parent_item['parent_text']
                if 'parent_url' in parent_item:
                    item['grandparent_url'] = parent_item['parent_url']

                # if parent's parents are missing, level must be at least 0
                if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                    assert level >= 0

                # send subcategory items to be parsed again, if not already crawled
                if (item['url'], item['parent_url'],
                        response.meta['department_url']) not in self.crawled:
                    yield Request(item['url'], callback=self.parseCategory,
                                  meta={'item': item,
                                        'department_text': response.meta['department_text'],
                                        'department_url': response.meta['department_url'],
                                        'department_id': response.meta['department_id']})
                    self.crawled.append((item['url'], item['parent_url'],
                                         response.meta['department_url']))

            # return current item
            # idea for sending parent and collecting nr products: send all of these subcats
            # as a list in meta, pass it on, and when the list becomes empty, yield the parent
            yield parent_item
            #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

        # if we can't find either products on the page or subcategory links
        else:
            #print "URL", response.url, " NO SUBCATs"
            #item['nr_products'] = 0
            yield item
def parseCategory(self, response):
    #TODO: add extraction of additional category info
    sel = Selector(response)

    #TODO: a lot of redirects. maybe for item, set 'url' to the one to which it was redirected? (response.url)
    item = response.meta['item']

    # Description extraction needs to be done first because it can be found in regular /c/ pages
    # that are first passed to this method.
    # For other info (item count, subcategories), the spider will redirect to a different page
    # if necessary (where description won't be available)

    # extract description
    description_texts = sel.xpath("//div[@class='subpart']/p//text()").extract()

    # second try at finding descriptions
    if not description_texts:
        description_texts = sel.xpath("//div[@id='SEO_TEXT']//text()").extract()

    # replace all whitespace with one space, strip, and remove empty texts; then join them
    if description_texts:
        item['description_text'] = " ".join([
            re.sub("\s+", " ", description_text.strip())
            for description_text in description_texts
            if description_text.strip()
        ])

        tokenized = Utils.normalize_text(item['description_text'])
        item['description_wc'] = len(tokenized)
    else:
        item['description_wc'] = 0

    # try to extract item count - if alternative extraction needs to be done,
    # this item's parsing will be redirected through a different method and returned here

    # extract item count
    nr_products_node = sel.xpath("//ul[@class='results']//strong/text()")
    if nr_products_node:
        # nr of products is in the second of these nodes
        nr_products = nr_products_node.extract()[1].strip()
        item['nr_products'] = int(nr_products)

    # alternative item count: try on same page, but with /sb/ instead of /c/ in url
    if not nr_products_node:
        m = re.match("http://www\.target\.com/c/(.*)", response.url)
        if m:
            new_url = "http://www.target.com/sb/" + m.group(1)

            # retry to this same method but with new url
            #TODO: will miss descriptions. leave it to the end of the method then.
            # but I want subcats from that one too?
            #OR extract it in secondary method and send it back to original url
            yield Request(new_url, callback=self.parseCategory, meta={'item': item})
        else:
            # neither a /c/ nor an /sb/ URL
            if "/sb/" not in response.url:
                print "DOES NOT MATCH", response.url

    # alternative item count extraction 2 (dynamically generated content)
    if not nr_products_node:
        # extract dynamically loaded data by making an additional request
        # (the one made by the page itself to load the data);
        # extract url and parameters from form data
        form = sel.xpath("//form[@name='dynamicAjaxFrm1']")
        if form:
            form_action = form.xpath("@action").extract()[0]
            form_inputs = form.xpath("input")
            # build string of parameters from input names and values
            param_dict = {
                form_input.xpath("@name").extract()[0]:
                    form_input.xpath("@value").extract()[0]
                for form_input in form_inputs
            }
            param_string = urllib.urlencode(param_dict)
            # build url to make request to
            new_url = "http://www.target.com" + form_action + "&" + param_string

            # if this url was found, redirect request to new method to extract item count as well;
            # that method will yield the item.
            # only redirect to this method if we weren't already redirected from it - to avoid a redirect loop
            if 'redirected' not in response.meta or not response.meta['redirected']:
                yield Request(new_url,
                              callback=self.parseCategoryDyncontent,
                              meta={'item': item})
                return

    #TODO: add description title as category name if no title available?
    # then also add the keyword/density count

    yield item

    if 'parent_url' in item:
        self.crawled_urls.append((item['url'], item['parent_url']))

    # extract subcategories (if we haven't reached level barrier)
    if item['level'] <= self.LEVEL_BARRIER:
        return

    parent_item = item

    # "shop categories" menu
    #subcategories = sel.xpath("//h3[text() = 'shop categories']/following-sibling::ul/li/a")
    #TODO: replace the not startswith with != ?
    subcategories_menu = sel.xpath(
        "//h3[starts-with(text(), 'shop ') and not(starts-with(text(), 'shop by')) \
        and not(starts-with(text(), 'shop for')) and not(starts-with(text(), 'shop favorite')) and not(contains(text(), ' size'))]")
    subcategories = subcategories_menu.xpath("following-sibling::ul/li/a")

    for subcategory in subcategories:
        subcategory_item = CategoryItem()
        subcategory_item['text'] = subcategory.xpath("text()").extract()[0].strip()
        subcategory_item['url'] = self.build_url(subcategory.xpath("@href").extract()[0])

        # filter duplicates
        if (subcategory_item['url'], parent_item['url']) in self.crawled_urls:
            # print subcategory_item['url']
            # print parent_item['url']
            continue

        # assign next available category id
        self.catid += 1
        subcategory_item['catid'] = self.catid

        subcategory_item['level'] = parent_item['level'] - 1
        subcategory_item['parent_url'] = parent_item['url']
        subcategory_item['parent_text'] = parent_item['text']
        subcategory_item['parent_catid'] = parent_item['catid']
        subcategory_item['department_text'] = parent_item['department_text']
        subcategory_item['department_url'] = parent_item['department_url']
        subcategory_item['department_id'] = parent_item['department_id']

        # send this subcategory to be further parsed
        yield Request(subcategory_item['url'],
                      callback=self.parseCategory,
                      meta={'item': subcategory_item})
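# parseCategoryDyncontent is the callback the dynamic-content branch above
# redirects to, but it isn't shown in this section. A hedged sketch of what it
# presumably does: pull the item count out of the AJAX response, then hand the
# item back to parseCategory with the 'redirected' flag set so the loop guard
# above prevents a second round-trip. The results-counter markup reused here
# is a hypothetical assumption:

from scrapy.selector import Selector
from scrapy.http import Request

def parseCategoryDyncontent(self, response):
    item = response.meta['item']

    # hypothetical: the AJAX payload carries the same results counter markup
    nr_products_node = Selector(response).xpath(
        "//ul[@class='results']//strong/text()").extract()
    if len(nr_products_node) > 1:
        item['nr_products'] = int(nr_products_node[1].strip())

    # return to the main parsing flow, marking that we've already been redirected
    return Request(item['url'], callback=self.parseCategory,
                   meta={'item': item, 'redirected': True},
                   dont_filter=True)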