def extractSubcategoriesFromMenu(self, hxs):
    # extract subcategories for regular page structure
    subcategories = hxs.select(
        "//h2[text()='Department']/following-sibling::ul[1]/li/a")
    # only try "Shop by Department" if there is no "Department", otherwise it might cause problems when both are present.
    # e.g. http://www.amazon.com/Watches-Mens-Womens-Kids-Accessories/b/ref=sd_allcat_watches/187-9021585-5419616?ie=UTF8&node=377110011
    if not subcategories:
        subcategories = hxs.select(
            "(//h2 | //h3)[text()='Shop by Department']/following-sibling::ul[1]/li/a"
        )

    for subcategory in subcategories:
        # if we have a subcategory URL and product count with the expected format, extract them; otherwise move on.
        # there is an exception to this refinement link rule - then extract info directly from the subcategory node,
        # but only if len(text) > 1 (otherwise we catch all the little arrows for parent categories)
        if not subcategory.select("span[@class='refinementLink']"):
            if len(subcategory.select(".//text()").extract()[0].strip()) > 1:
                # so it's not that little arrow thing
                subcategory_text_holder = subcategory.select(
                    "text()[normalize-space()!='']").extract()
                if subcategory_text_holder:
                    subcategory_text = subcategory_text_holder[0].strip()
                else:
                    continue
                subcategory_url_holder = subcategory.select("@href").extract()
                if subcategory_url_holder:
                    subcategory_url = Utils.add_domain(
                        subcategory_url_holder[0], "http://www.amazon.com")
                else:
                    continue
                subcategory_prodcount_holder = None
            else:
                continue
        else:
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")
            subcategory_text = subcategory.select(
                "span[@class='refinementLink']//text()").extract()[0].strip()
            # extract product count, clean it of commas and parentheses
            subcategory_prodcount_holder = subcategory.select(
                "span[@class='narrowValue']/text()").extract()

        # if there's also a product count available in the menu, extract it
        if subcategory_prodcount_holder:
            subcategory_prodcount = subcategory_prodcount_holder[0].replace(
                "&nbsp;", " ").strip()
            m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
            if m:
                subcategory_prodcount = m.group(1).replace(",", "")
        else:
            subcategory_prodcount = None

        yield (subcategory_text, subcategory_url, subcategory_prodcount)
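# The extractors in this section rely on Utils.add_domain from spiders_utils, which is not shown
# here. A minimal sketch of the assumed behavior - prefix the site domain to relative hrefs and
# leave absolute URLs untouched - is below; the real helper may differ.

import urlparse

def add_domain_sketch(url, domain):
    # already absolute - return unchanged
    if url.startswith("http://") or url.startswith("https://"):
        return url
    # otherwise resolve the relative href against the site domain
    return urlparse.urljoin(domain, url)

# example: add_domain_sketch("/dp/B00EXAMPLE", "http://www.amazon.com")
# -> "http://www.amazon.com/dp/B00EXAMPLE"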
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    subcats_links = hxs.select(
        "//h2[contains(text(),'categories')]/following-sibling::ul[1]/li/a")
    for subcat_link in subcats_links:
        # extract name
        subcat_name = subcat_link.select("span/text()").extract()[0].strip()
        # extract url, add domain
        subcat_url = Utils.add_domain(
            subcat_link.select("@href").extract()[0], self.base_url)

        # send subcategories to be further parsed.
        # if brand filter is set, send to parseSubcategory for brands to be extracted etc
        if self.brands:
            yield Request(url=subcat_url,
                          callback=self.parseSubcategory,
                          meta={'category': subcat_name})
        # if brand filter is not set, send directly to extract products
        else:
            yield Request(url=subcat_url,
                          callback=self.parseBrandPage,
                          meta={'category': subcat_name})
def parse_resultsPage(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    result = hxs.select(
        "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink'][position()<2]/@href"
    ).extract()
    if result:
        item['walmart_full_url'] = Utils.add_domain(result[0],
                                                    "http://www.walmart.com")
        # the id should be somewhere in the full URL as well
        if self.valid_result(item['walmart_full_url'], item['walmart_id']):
            return item
        else:
            # search again, but select the result that contains the id
            # OBS: not optimal, should do the selecting here
            return Request(response.url,
                           callback=self.parse_resultsPage2,
                           meta={"item": item})
    else:
        # try to find the result by using the product name instead:
        # get the product name from the product page, then search by it
        return Request(item['walmart_short_url'],
                       callback=self.getProductName,
                       meta={"item": item})
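# parse_resultsPage above and parse_resultsPage2 below both call self.valid_result, which is not
# defined in this section (and the two call sites pass the URL and the Walmart id in different
# orders). A minimal sketch, under the assumption that the check is simply "does the product id
# appear in the candidate URL", accepting the arguments in either order:

import re

def valid_result_sketch(first, second):
    # one argument is the Walmart id, the other is the candidate product URL;
    # figure out which is which by checking which one looks like a bare numeric id
    if re.match("[0-9]+$", str(first)):
        walmart_id, url = str(first), str(second)
    else:
        walmart_id, url = str(second), str(first)
    # the result is considered valid if the id occurs in the URL
    return walmart_id in url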
def parseSubcats(self, response):
    hxs = HtmlXPathSelector(response)
    parent = response.meta['parent']
    # extract subcategories
    subcats_links = hxs.select(
        "//div[@class='sideNav']/div[@class='innerWrap'][1]//ul/li/a")
    for subcat_link in subcats_links:
        item = CategoryItem()
        item['url'] = Utils.add_domain(
            subcat_link.select("@href").extract()[0],
            "http://www.tigerdirect.com")
        item['text'] = subcat_link.select("text()").extract()[0]
        item['parent_text'] = parent['text']
        item['parent_url'] = parent['url']
        item['level'] = parent['level'] - 1
        item['department_text'] = response.meta['department_text']
        item['department_id'] = response.meta['department_id']

        #print 'passing to parse category ', item

        # there are some loops in their categories tree, so we need to check this
        # to avoid infinite loops while crawling
        if item['url'] not in self.parsed_urls:
            yield Request(url=item['url'], callback=self.parseCategory,
                          meta={'item': item,
                                'department_text': response.meta['department_text'],
                                'department_url': response.meta['department_url'],
                                'department_id': response.meta['department_id']})
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    origin_product_id = response.meta['origin_product_id']
    current_query = response.meta['query']

    # all product items from all queries
    items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'],
                    self.results[origin_product_id]['search_requests']), [])
    # all product urls from all queries
    product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'],
                           self.results[origin_product_id]['search_requests']), [])
    product_urls = set(product_urls)

    # get search results for the received results page and add them to product_urls to be parsed.
    # Note: the xpath below ignores Sponsored links (which is good)
    results = hxs.select("//div[@class='a-row a-spacing-small']/a")
    for result in results:
        product_url = result.select("@href").extract()[0]

        # remove the part after "/ref" containing details about the search query
        m = re.match("(.*)/ref=(.*)", product_url)
        if m:
            product_url = m.group(1)

        product_url = Utils.add_domain(product_url, self.domain)

        self.results[origin_product_id]['search_requests'][current_query][
            'search_results'].append(product_url)

    # extract product info from product pages (send request to parse first URL in list).
    # add as meta all that was received as meta, will pass it on to the reduceResults function in the end.
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed.

    # send the request further to parse product pages only if we gathered all the product URLs from all the queries
    # (there are no more pending requests);
    # otherwise send them back to parseResults and wait for the next query, saving all product URLs in search_results.
    # this way we avoid duplicates
    if product_urls and ('pending_requests' not in response.meta
                         or not response.meta['pending_requests']):
        next_product_url = product_urls.pop()
        request = Request(next_product_url,
                          callback=self.parse_product_amazon,
                          meta=response.meta)
        # remove the url you've just consumed
        self.remove_result_from_queue(origin_product_id, next_product_url)

        return request

    # if there were no results, the request will never get back to reduceResults,
    # so send it from here so it can parse the next queries.
    # add to the response the URLs of the products to crawl we have so far, items (handles the case when it was not created yet)
    # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
    else:
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
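# parseResults above assumes a per-product bookkeeping structure in self.results, keyed first by the
# originating product id and then by each search query tried for it. The initialization is not part of
# this section; the example below only illustrates the assumed shape (the id, query and URL are made up).

example_results = {
    "origin_product_1": {
        "search_requests": {
            "samsung 55 inch led tv": {
                # product page URLs collected so far for this query
                "search_results": ["http://www.amazon.com/dp/B00EXAMPLE"],
                # parsed product items for this query
                "product_items": [],
            },
        },
    },
}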
def parseSubcategory(self, response):
    hxs = HtmlXPathSelector(response)

    subcategory = response.meta['item']

    # yield this subcategory
    yield subcategory

    # if subcategory was special, we'll mark all subsubcategories as special
    if 'special' in subcategory:
        special = True
    else:
        special = False

    # get its subcategories
    subsubcategories = hxs.select(
        "//div[@class='product-category-expanded']//h3[@class='title']")

    for subsubcategory in subsubcategories:
        item = CategoryItem()
        item['text'] = subsubcategory.select("a/text()").extract()[0]
        item['url'] = Utils.add_domain(
            subsubcategory.select("a/@href").extract()[0], self.base_url)

        if special:
            item['special'] = 1

        item['parent_text'] = subcategory['text']
        item['parent_url'] = subcategory['url']
        item['department_text'] = subcategory['department_text']
        item['department_url'] = subcategory['department_url']
        item['department_id'] = subcategory['department_id']
        item['level'] = subcategory['level'] - 1

        description_text_holder = subsubcategory.select(
            "following-sibling::p[@class='description'][1]/text()").extract()
        if description_text_holder:
            item['description_text'] = description_text_holder[0]
            item['description_title'] = item['text']
            description_tokenized = Utils.normalize_text(
                item['description_text'])
            item['description_wc'] = len(description_tokenized)

            (item['keyword_count'],
             item['keyword_density']) = Utils.phrases_freq(
                 item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        # parse subcategory page to get the product count, or further subsubcategories
        yield Request(item['url'],
                      callback=self.parseSubcategoryPage,
                      meta={'item': item})
def extractSubcategoriesSports(self, hxs):
    subcategories = hxs.select(
        "//h3[text()='Shop by Sport']/following-sibling::ul[1]/li/a")
    for subcategory in subcategories:
        subcategory_name = subcategory.select("text()").extract()[0]
        subcategory_url = Utils.add_domain(
            subcategory.select("@href").extract()[0], "http://www.amazon.com")
        yield (subcategory_name, subcategory_url, None)
def parsePage_tigerdirect(self, response):
    hxs = HtmlXPathSelector(response)
    #print "IN PARSEPAGE ", response.url

    # without the "resultsWrap" div, these are found on pages we don't want as well
    product_links = hxs.select(
        "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
    ).extract()
    for product_link in product_links:
        item = ProductItem()
        item['product_url'] = Utils.add_domain(product_link,
                                               "http://www.tigerdirect.com")
        # remove CatId from URL (generates duplicates)
        m = re.match("(.*)&CatId=[0-9]+", item['product_url'])
        if m:
            item['product_url'] = m.group(1)
        yield item

    # parse next pages (if results spread on more than 1 page)
    #TODO: not sure if all of them are extracted
    next_page = hxs.select("//a[@title='Next page']")
    if next_page:
        #print "next page : ", response.url, " + ", next_page
        page_nr = response.meta['page'] + 1
        # base_url = response.meta['base_url']
        # # remove trailing "&" character at the end of the URL
        # m = re.match("(.*)&", base_url)
        # if m:
        #     base_url = m.group(1)
        # yield Request(url = base_url + "&page=%d" % page_nr, callback = self.parsePage_tigerdirect,\
        #     meta = {'page' : page_nr, 'base_url' : response.meta['base_url']})
        next_page_url = Utils.add_domain(
            next_page.select("@href").extract()[0],
            "http://www.tigerdirect.com")
        yield Request(url=next_page_url,
                      callback=self.parsePage_tigerdirect,
                      meta={'page': page_nr})

    # if we can't find product links, search for links to the subcategories pages and parse them for product links
    if not product_links:
        yield Request(url=response.url,
                      callback=self.parseSubcats_tigerdirect)
def extractSubcategoriesAccessories(self, hxs):
    subcategories = hxs.select("//a[contains(text(),'Shop All')]")
    for subcategory in subcategories:
        # extract words after "Shop All" - that is the subcategory name
        subcategory_text_full = subcategory.select("text()").extract()[0]
        m = re.match("Shop All (.*)", subcategory_text_full)
        subcategory_name = m.group(1).strip()
        subcategory_url = Utils.add_domain(
            subcategory.select("@href").extract()[0], "http://www.amazon.com")
        yield (subcategory_name, subcategory_url, None)
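# The three extractSubcategories* generators above (menu, sports, accessories) all yield
# (name, url, product_count) tuples. A hypothetical caller that tries them in order and stops at
# the first one producing results might look like the sketch below; the actual Amazon category
# spider may wire them together differently.

def extract_subcategories_sketch(self, hxs):
    for extractor in (self.extractSubcategoriesFromMenu,
                      self.extractSubcategoriesSports,
                      self.extractSubcategoriesAccessories):
        subcategories = list(extractor(hxs))
        if subcategories:
            return subcategories
    return []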
def parseSubcats_tigerdirect(self, response):
    hxs = HtmlXPathSelector(response)

    # search for a link to "See All Products"
    seeall = hxs.select(
        "//span[text()='See All Products']/parent::node()/@href").extract()
    if seeall:
        # pass the new page to this same method, to be handled by the else branch of this if statement
        yield Request(url=Utils.add_domain(seeall[0],
                                           "http://www.tigerdirect.com"),
                      callback=self.parseSubcats_tigerdirect)
    else:
        # extract subcategories
        subcats_links = hxs.select(
            "//div[@class='sideNav']/div[@class='innerWrap'][1]//ul/li/a")
        for subcat_link in subcats_links:
            subcat_url = Utils.add_domain(
                subcat_link.select("@href").extract()[0],
                "http://www.tigerdirect.com")
            yield Request(url=subcat_url,
                          callback=self.parsePage_tigerdirect,
                          meta={'page': 1, 'base_url': subcat_url})
def extract_result_products(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    results = hxs.select(
        "//div[@class='list-item-info']/div[@class='sku-title']/h4/a")

    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        product_name_holder = result.select("text()").extract()
        if product_name_holder:
            item['product_name'] = product_name_holder[0].strip()
        else:
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + response.meta.get('origin_url', ''),
                     level=log.ERROR)

        item['product_url'] = Utils.clean_url(
            Utils.add_domain(
                result.select("@href").extract()[0],
                "http://www.bestbuy.com"))

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_name' in response.meta:
            item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        model_holder = result.select(
            "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
        ).extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        price_holder = result.select(
            "../../../../div[@class='list-item-price']//div[@class='price-block']//div[@class='medium-item-price']/text()[normalize-space()]"
        ).extract()
        if price_holder:
            price = price_holder[0].strip()
            price = re.sub(",", "", price)
            price = float(price)
            item['product_target_price'] = price

        items.append(item)

    return items
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get the last request's items and add the results to them
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']"
    )

    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        #TODO: usually the manufacturer is in bold, so maybe use that
        product_name = " ".join(result.select(".//text()").extract())

        # append text that is in <span>, if any
        span_text = result.select("./span/text()")
        #TODO: use span text differently, as it is more important/relevant (bold)?
        for text in span_text:
            product_name += " " + text.extract()

        item['product_name'] = product_name

        rel_url = result.select("@href").extract()[0]
        root_url = "http://www.walmart.com"
        item['product_url'] = Utils.add_domain(rel_url, root_url)

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']

        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = items
    return self.reduceResults(response)
def parseBrand(self, response):
    hxs = HtmlXPathSelector(response)

    # category of items on current page
    category = response.meta['category']

    # set parameters in meta specifying current product count and total product count for this brand,
    # to be used for deciding on the stop criteria for pagination
    if 'total_product_count' in response.meta:
        product_count = response.meta['total_product_count']
        cur_product_count = response.meta['current_product_count']
    else:
        # extract number of products for this brand
        product_count = int(
            hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
        cur_product_count = 0

    # extract products from this page
    product_links = hxs.select(
        "//h3[@class='productTitle']/a/@href").extract()
    # add domain
    product_urls = map(lambda x: Utils.add_domain(x, self.base_url),
                       product_links)

    for product_url in product_urls:
        item = ProductItem()
        # remove parameters in url
        item['product_url'] = Utils.clean_url(product_url)
        item['category'] = category
        yield item

    # add nr of extracted products to current product count
    cur_product_count += len(product_urls)

    # get next page if any
    next_page = self.build_next_page_url(
        response.url,
        product_count,
        cur_product_count,
        first=('total_product_count' not in response.meta))
    if next_page:
        yield Request(url=next_page,
                      callback=self.parseBrand,
                      meta={
                          'total_product_count': product_count,
                          'current_product_count': cur_product_count,
                          'category': category
                      })
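# parseBrand relies on self.build_next_page_url, which is not included in this section. A rough
# sketch of the assumed contract - return the URL of the next results page, or None once the
# current count reaches the total - is below, assuming pagination uses a "No=<offset>" style
# query parameter (the real parameter name and logic may differ per site).

import re

def build_next_page_url_sketch(self, url, product_count, cur_product_count, first=False):
    # stop when we have already covered all products for this brand
    if cur_product_count >= product_count:
        return None
    # on the first page the parameter is not present yet - append it; otherwise replace it
    if first:
        separator = "&" if "?" in url else "?"
        return url + separator + "No=" + str(cur_product_count)
    return re.sub("No=[0-9]+", "No=" + str(cur_product_count), url)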
def parseSubcategory(self, response):
    hxs = HtmlXPathSelector(response)

    #print "SUBCATEGORY:", response.url

    # extract link to the page containing brands (look for link to 'more')
    brands_menu_page = hxs.select(
        "//h4[contains(text(),'Brand')]/following-sibling::ul[1]/li[@class='more']/a/@data-overlay-url"
    ).extract()
    if brands_menu_page:
        # send request for brands pages to be extracted
        yield Request(url=Utils.add_domain(brands_menu_page[0],
                                           self.base_url),
                      callback=self.parseBrandsMenu,
                      meta={'category': response.meta['category']})
    else:
        # if there is no 'more' link, extract brand pages directly from this page (it means they are all here)
        brands_pages = hxs.select(
            "//h4[contains(text(),'Brand')]/following-sibling::ul[1]/li/a")
        for brand_page in brands_pages:
            brand_name = brand_page.select(
                "span[@class='facet-str-name']/text()").extract()[0]
            brand_url = Utils.add_domain(
                brand_page.select("@href").extract()[0], self.base_url)

            # filter brands if it applies
            if self.brands and not self.name_matches_brands(brand_name):
                self.log("Omitting brand " + brand_name, level=log.INFO)
                continue

            # send request for the brand page to be parsed and its products extracted
            yield Request(url=brand_url,
                          callback=self.parseBrandPage,
                          meta={'category': response.meta['category']})
def parse_resultsPage2(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']

    results = hxs.select(
        "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']/@href"
    ).extract()
    for result in results:
        # if the result URL contains the id, this is the correct result
        if self.valid_result(item['walmart_id'], result):
            product_url = Utils.add_domain(result, "http://www.walmart.com")
            item['walmart_full_url'] = product_url
            return item

    # no results matching the condition were found
    self.log("No results for short_url (didn't find any URLs containing id) "
             + item['walmart_short_url'] + "\n",
             level=log.ERROR)
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get the last request's items and add the results to them
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//div[@class='hproduct']/div[@class='info-main']/h3/a")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        item['product_name'] = result.select("text()").extract()[0].strip()
        item['product_url'] = Utils.clean_url(
            Utils.add_domain(
                result.select("@href").extract()[0],
                "http://www.bestbuy.com"))

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']

        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            # assert self.by_id
        # else:
        #     assert not self.by_id

        model_holder = result.select(
            "parent::node()/parent::node()//strong[@itemprop='model']/text()"
        ).extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = items
    return self.reduceResults(response)
def extract_results(self, response):
    hxs = HtmlXPathSelector(response)

    # TODO: check this xpath and extractions
    results = hxs.select("//h4[@class='tile-heading']/a")
    product_urls = set()

    # try xpath for old page version
    if not results:
        results = hxs.select(
            "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']"
        )

    for result in results:
        product_url = result.select("@href").extract()[0]
        product_url = Utils.add_domain(product_url, "http://www.walmart.com")
        product_urls.add(product_url)

    return list(product_urls)
def parseBrandPage(self, response):
    hxs = HtmlXPathSelector(response)

    # category of items on this page
    category = response.meta['category']

    # extract item count
    if 'total_item_count' in response.meta:
        total_item_count = response.meta['total_item_count']
    else:
        total_item_count = int(
            hxs.select("//p[@id='filtered-products-count']").re("[0-9]+")[0])

    # extract product holders. not extracting the <a> element directly because each product holder
    # has many a elements (all just as good, but we only want one)
    product_holders = hxs.select("//div[@class='product ']")
    for product_holder in product_holders:
        # extract first link in product holder
        product_link = product_holder.select(".//a/@href").extract()[0]
        product_url = Utils.add_domain(product_link, self.base_url)

        item = ProductItem()
        item['product_url'] = product_url
        item['category'] = category

        yield item

    # crawl next pages if any left
    if 'offset' not in response.meta:
        offset = 0
    else:
        offset = response.meta['offset']

    next_page = self.build_next_page_url(response.url, total_item_count,
                                         offset)

    # if there are more products to crawl, send a new request
    if next_page:
        yield Request(url=next_page,
                      callback=self.parseBrandPage,
                      meta={
                          'offset': offset + 1,
                          'total_item_count': total_item_count,
                          'category': category
                      })
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    brands_links = hxs.select("//li[contains(@class,'brandsSel')]/a")
    for brand_link in brands_links:
        brand_name = brand_link.select(
            "text()[normalize-space()]").extract()[0].strip()
        brand_url = Utils.add_domain(
            brand_link.select("@href").extract()[0], self.base_url)

        # filter brand if brand filter is set
        if self.brands and not self.name_matches_brands(brand_name):
            self.log("Omitting brand " + brand_name, level=log.INFO)
            continue

        # crawl brand page if it passed the filter
        yield Request(url=brand_url,
                      callback=self.parseBrand,
                      meta={'category': response.meta['category']})
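# The brand-filtering methods above call self.name_matches_brands, not shown in this section.
# A minimal sketch, assuming self.brands is a list of brand names passed to the spider and that
# matching is a case-insensitive substring test; the real matching may be stricter.

def name_matches_brands_sketch(self, brand_name):
    name = brand_name.strip().lower()
    for brand in self.brands:
        if brand.strip().lower() in name:
            return True
    return False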
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    categories_links = hxs.select("//div[@class='nav baseLevel']/ul/li/a")

    for category_link in categories_links:
        category_name = category_link.select("text()").extract()[0]
        category_url = Utils.add_domain(
            category_link.select("@href").extract()[0], self.base_url)

        # if brand filter is set, send to parseCategory to extract brand pages from the menu
        if self.brands:
            yield Request(url=category_url,
                          callback=self.parseCategory,
                          meta={'category': category_name})
        # if we're extracting all brands, send it directly to extract products from it
        else:
            yield Request(url=category_url,
                          callback=self.parseBrand,
                          meta={'category': category_name})
def parseBrandsMenu(self, response):
    hxs = HtmlXPathSelector(response)

    # extract links to brands pages
    brands_links = hxs.select("//ul/li/a")
    for brand_link in brands_links:
        brand_name = brand_link.select(
            "@data-facet-option-value").extract()[0]

        # filter brands if it applies
        if self.brands and not self.name_matches_brands(brand_name):
            self.log("Omitting brand " + brand_name, level=log.INFO)
            continue

        # build brand url
        try:
            # extract brand id
            brand_id = brand_link.select(
                "@data-facet-option-id").extract()[0]
            # extract base url for brand page
            brand_base_url = Utils.add_domain(
                hxs.select("//form/@action").extract()[0], self.base_url)
            # extract relative url parameters for brand page
            brand_relative_url_params = hxs.select(
                "//input/@value").extract()[0]
            # extract catId parameter
            cat_id_param = re.findall("catId=[0-9]+(?=&|$)",
                                      brand_relative_url_params)[0]
            # build brand page url
            brand_page_url = brand_base_url + "?" + cat_id_param + "+" + str(
                brand_id)
            #print brand_page_url

            yield Request(url=brand_page_url,
                          callback=self.parseBrandPage,
                          meta={'category': response.meta['category']})
        except Exception, e:
            self.log("Couldn't extract brand page from menu: " + str(e),
                     level=log.ERROR)
def parse(self, response):
    hxs = HtmlXPathSelector(response)

    #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
    #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
    #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")
    parent_links = hxs.select(
        "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
    )

    # #TODO: check this
    # item['nr_products'] = -1
    # yield item
    #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

    department_id = 0

    for link in parent_links:
        item = CategoryItem()

        #TO remove:
        # # link to artificial parent category
        # item['parent_catid'] = 0

        item['text'] = link.select('text()').extract()[0]
        item['url'] = link.select('@href').extract()[0]

        # add domain if relative URL
        item['url'] = Utils.add_domain(item['url'], self.root_url)

        item['level'] = 1

        department_id += 1

        # send category page to the parseCategory function to extract description and number of products and add them to the item
        yield Request(item['url'], callback=self.parseCategory,
                      meta={'item': item,
                            'department_text': item['text'],
                            'department_url': item['url'],
                            'department_id': department_id})
def parse(self, response):

    if self.product_name:

        # can only use this option if self.target_site has been initialized
        # (usually true for spiders for retailer sites, not true for manufacturers' sites)
        if not self.target_site:
            self.log(
                "You can't use the product_name option without setting the target site to search on\n",
                level=log.ERROR)
            raise CloseSpider(
                "\nYou can't use the product_name option without setting the target site to search on\n"
            )

        search_query = self.build_search_query(self.product_name)
        search_pages = self.build_search_pages(search_query)

        request = Request(search_pages[self.target_site],
                          callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True
            ## print "SET AMAZON COOKIES"

        request.meta['origin_name'] = self.product_name
        request.meta['query'] = search_query

        # just use empty product model and url, for compatibility, also pending_requests
        request.meta['origin_model'] = ''
        request.meta['origin_url'] = ''
        request.meta['pending_requests'] = []

        yield request

    # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
    product_urls = []
    # if we have a single product URL, create a list of URLs containing it
    if self.product_url:
        product_urls.append(self.product_url)

    # if we have a file with a list of URLs, create a list with the URLs found there
    if self.product_urls_file:
        f = open(self.product_urls_file, "r")
        for line in f:
            product_urls.append(line.strip())
        f.close()

    for product_url in product_urls:
        # extract site domain
        # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
        # origin_site = ""
        # if m:
        #     origin_site = m.group(1)
        # else:
        #     sys.stderr.write('Can\'t extract domain from URL.\n')
        origin_site = Utils.extract_domain(product_url)

        request = Request(product_url, callback=self.parseURL)
        request.meta['origin_site'] = origin_site
        if origin_site == 'staples':
            zipcode = "12345"
            request.cookies = {"zipcode": zipcode}
            request.meta['dont_redirect'] = True
        yield request

    # if we have a file with Walmart ids, create a list of the ids there
    if self.walmart_ids_file:
        walmart_ids = []
        f = open(self.walmart_ids_file, "r")
        for line in f:
            if "," in line:
                id_string = line.strip().split(",")[0]
            else:
                id_string = line.strip()
            if re.match("[0-9]+", id_string):
                walmart_ids.append(id_string)
        f.close()

        self.by_id = True

        for walmart_id in walmart_ids:
            # create Walmart URLs based on these IDs
            walmart_url = Utils.add_domain(walmart_id,
                                           "http://www.walmart.com/ip/")
            request = Request(walmart_url, callback=self.parseURL)
            #request.meta['origin_site'] = 'walmart'
            yield request
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//ul[@class='products']//div[@class='product ']//h3//a")

    for result in results:
        item = SearchItem()
        product_url = result.select("@href").extract()[0] if result.select(
            "@href") else None
        product_name = result.select("@title").extract()[0] if result.select(
            "@title") else None

        # assert name is not abbreviated
        # empirically, this only seems to produce false positives, so removed
        # assert '...' not in product_name

        # skip the result if there is no product name or url
        if product_name and product_url:
            # clean url
            item['product_url'] = Utils.add_domain(product_url,
                                                   self.base_url)
            item['product_name'] = product_name
        else:
            self.log("No product name: " + str(response.url) +
                     " from product: " + response.meta['origin_url'],
                     level=log.ERROR)
            continue

        # add url, name and model of product to be matched (from origin site)
        item['origin_url'] = response.meta['origin_url']
        item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product model from name
        product_model_extracted = ProcessText.extract_model_from_name(
            item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted

        #TODO: extract price, brand?

        # add result to items
        items.add(item)

    # extract product info from product pages (send request to parse first URL in list).
    # add as meta all that was received as meta, will pass it on to the reduceResults function in the end.
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed.

    # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not.
    # if there are, reduceResults will send the next one back here; if not, it will return the final result

    response.meta['items'] = items

    # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
    #TODO: do we still need this?
    response.meta['parsed'] = True
    # only send the response we have as an argument, no need to make a new request
    return self.reduceResults(response)
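# parseResults above calls ProcessText.extract_model_from_name to pull a model number out of a
# product title. The real implementation is not part of this section; the sketch below is only a
# simple heuristic for illustration (look for a token mixing letters and digits, e.g. "WF45H6300AG").

import re

def extract_model_from_name_sketch(product_name):
    for token in product_name.split():
        token = token.strip("(),;")
        # model-like tokens are usually 4+ characters and contain both letters and digits
        if len(token) >= 4 and re.search("[0-9]", token) and re.search("[A-Za-z]", token):
            return token
    return None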
def parseCategory(self, response):
    hxs = HtmlXPathSelector(response)

    # get parent item from response, extract additional info and return it
    item = response.meta['parent']

    # add department name, url and id to item
    item['department_text'] = response.meta['department_text']
    item['department_url'] = response.meta['department_url']
    item['department_id'] = response.meta['department_id']

    # extract product count if available
    nr_items_holder = hxs.select(
        "//div[@id='showing']/strong[position()=2]/text()").extract()
    if nr_items_holder:
        item['nr_products'] = int(str(nr_items_holder[0]))

    # extract description if available
    # these are descriptions for services pages
    desc_title_holder = hxs.select(
        "//div[@id='searchstate']/a[position()=2]/text()").extract()
    if desc_title_holder:
        item['description_title'] = desc_title_holder[0].strip()
    desc_content_holder = hxs.select(
        "//div[@class='content']/h3/text()").extract()
    if desc_content_holder:
        item['description_text'] = desc_content_holder[0].strip()
        tokenized = Utils.normalize_text(item['description_text'])
        item['description_wc'] = len(tokenized)
        (item['keyword_count'],
         item['keyword_density']) = Utils.phrases_freq(
             item['description_title'], item['description_text'])
    else:
        item['description_wc'] = 0

    yield item

    # extract its subcategories
    #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
    subcats_holders = hxs.select(
        "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
    if subcats_holders:
        subcats_holder = subcats_holders[0]
        # these are subcategories if they are preceded by the title "Shop ..."
        title = subcats_holder.select(
            "parent::node()/preceding-sibling::node()//text()").extract()[0]
        if str(title).startswith("Shop"):
            subcats = subcats_holder.select(".//li/a")
            for subcat in subcats:
                item = CategoryItem()
                item['text'] = subcat.select("text()").extract()[0].strip()
                item['url'] = Utils.add_domain(
                    subcat.select("@href").extract()[0],
                    "http://www.bestbuy.com")
                parent = response.meta['parent']
                item['level'] = int(response.meta['level']) - 1

                # if parent was special, this category is special too
                if 'special' in parent:
                    item['special'] = 1
                item['parent_text'] = parent['text']
                item['parent_url'] = parent['url']

                request = Request(
                    url=item['url'],
                    callback=self.parseCategory,
                    meta={'parent': item,
                          'level': item['level'],
                          'department_text': response.meta['department_text'],
                          'department_url': response.meta['department_url'],
                          'department_id': response.meta['department_id']})
                yield request
def parsePage(self, response):
    #print "IN PARSEPAGE"
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']

    if 'parent_item' in response.meta:
        parent_item = response.meta['parent_item']
        item['parent_text'] = parent_item['text']
        item['parent_url'] = parent_item['url']
        if 'parent_text' in parent_item:
            item['grandparent_text'] = parent_item['parent_text']
            item['grandparent_url'] = parent_item['parent_url']
        if 'nr_products' not in parent_item:
            parent_nr_products = 0
        else:
            parent_nr_products = parent_item['nr_products']

    # initialize product URL list
    if 'products' not in response.meta:
        products = []
    else:
        products = response.meta['products']

    # # if this is the first page, initialize number of products
    # if 'nr_products' not in item:
    #     old_nr_products = 0
    # else:
    #     old_nr_products = item['nr_products']

    # find number of products on this page
    product_links = hxs.select(
        "//a[@class='prodLink ListItemLink']/@href").extract()
    # gather all products in this (sub)category
    products += product_links

    #this_nr_products = len(product_links)
    #item['nr_products'] = old_nr_products + this_nr_products
    # if 'parent_item' in response.meta:
    #     parent_item['nr_products'] = parent_nr_products + item['nr_products']

    # find URL to next page, parse it as well
    next_page = hxs.select(
        "//a[@class='link-pageNum' and text()=' Next ']/@href").extract()
    if next_page:
        page_url = Utils.add_domain(next_page[0], self.root_url)
        request = Request(url=page_url,
                          callback=self.parsePage,
                          meta={'item': item, 'products': products})
        if 'parent_item' in response.meta:
            request.meta['parent_item'] = parent_item
        yield request

    # if no next page, return current results; and return parent category page
    else:
        item['nr_products'] = len(set(products))
        yield item
def parse(self, response):
    hxs = HtmlXPathSelector(response)

    # extract departments
    departments = hxs.select("//h2")
    department_id = 0

    for department in departments:
        department_item = CategoryItem()
        department_text = department.select("text()").extract()[0]
        department_item['department_text'] = department_text

        # #TODO: add department_url, from sherwin-williams.com ...? get department list from there
        # and match with departments from here by seeing if names match
        department_item['department_id'] = department_id
        department_item['text'] = department_text
        department_item['level'] = 1

        # get categories in department
        categories = department.select("following-sibling::ul[1]/li")

        # extract department url from one of its categories' urls (it's not available directly)
        category_ex = categories[0]
        category_ex_url = Utils.add_domain(
            category_ex.select("a/@href").extract()[0], self.base_url)
        # extract first part of url
        m = re.match("(http://www.sherwin\-williams\.com/[^/]+)/.*",
                     category_ex_url)
        department_url = m.group(1)
        department_item['department_url'] = department_url
        department_item['url'] = department_url

        for category in categories:
            item = CategoryItem()
            #TODO: special if 'Services'? or Specifications, or Ads...
            category_text = category.select("a/text()").extract()[0]
            category_url = Utils.add_domain(
                category.select("a/@href").extract()[0], self.base_url)
            item['text'] = category_text
            item['url'] = category_url

            # if it's not a 'Products' category, mark it and all its subcategories as special
            if category_text != 'Products':
                item['special'] = 1
                special = True
            else:
                special = False

            item['department_id'] = department_id
            item['department_text'] = department_text
            item['department_url'] = department_url
            item['parent_text'] = department_text
            item['parent_url'] = department_url
            item['level'] = 0

            #TODO: do we need description_wc here as well?

            yield Request(item['url'],
                          callback=self.parseCategory,
                          meta={'item': item})

            # get subcategories in category
            subcategories = category.select("ul/li")
            for subcategory in subcategories:
                item = CategoryItem()
                item['text'] = subcategory.select("a/text()").extract()[0]
                item['url'] = Utils.add_domain(
                    subcategory.select("a/@href").extract()[0],
                    self.base_url)
                item['department_id'] = department_id
                item['department_text'] = department_text
                item['department_url'] = department_url
                item['parent_text'] = category_text
                item['parent_url'] = category_url
                item['level'] = -1

                # if parent is special, category is special
                if special:
                    item['special'] = 1

                yield Request(item['url'],
                              callback=self.parseSubcategory,
                              meta={'item': item})

        department_id += 1

        # return department
        yield department_item
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag that indicates that for this spider, nr of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) pairs to eliminate duplicates
        # (adding department_url makes sure that if one entire department is found as a subcategory of another, for example,
        # both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded values for special categories' item count. Currently used for 'Value of the day',
        # which typically has a fixed number of products and nowhere to extract it from the page
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")
        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to the parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback=self.parseCategory,
                          meta={'item': item,
                                'department_text': item['text'],
                                'department_url': item['url'],
                                'department_id': department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError, e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n",
                     level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name,
            # ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex only ones whose text contains at least one letter
            # (for ex, for customer rating subcats, they have no name, only a picture with nr of stars, we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp; with space, trim
                    subcategory_name = subcategory_name.replace(
                        "&nbsp;", " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is no count field, extract prodcount from subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate parentheses surrounding the number and convert it to int
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in a separate element, try to extract it from the subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract()[0].replace(
                                "&nbsp;", " ").replace(u"\xa0", " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)

                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:
        # first search for the description id they usually use;
        # the second one is used more rarely and also with some false positives, so it needs to be checked for text length as well

        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with the basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with much text (> DESC_LEN (200) characters);
        # this is going to be a paragraph in the description, look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length for the entire text content?
            # I think it means entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters -
                # then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                # and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate
            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description;
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element or an <h1> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                    [line.strip() for line in description_texts]):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's larger than 4096 characters and not found with the main rule, it's probably not a description;
                # it causes problems to the PHP script as well. Ignore it
                if len(description_text) < 4096 or found:
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub(
                        "\n+", " ", item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes there is no description title here because of malformed html.
                # if we can find description text but not description title, the title is probably malformed -
                # get the first text in the div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0
        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # find if there is a wc field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*",
                wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count was found
        if 'nr_products' in item:
            yield item
        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in the menu on the left under a "Shop by Category" header
                # subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in the menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create a new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            # http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of the current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip
                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()
                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something else than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again, if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']) not in self.crawled:
                        yield Request(item['url'], callback=self.parseCategory,
                                      meta={'item': item,
                                            'department_text': response.meta['department_text'],
                                            'department_url': response.meta['department_url'],
                                            'department_id': response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item.
                # idea for sending parent and collecting nr products: send all of these subcats as a list in meta,
                # pass it on, and when the list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
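# Several of the category spiders above compute description word counts and keyword density with
# Utils.normalize_text and Utils.phrases_freq from spiders_utils. Those helpers are not shown in
# this section; the sketch below only captures the assumed contract (normalize_text returns a list
# of lowercased word tokens, phrases_freq returns how many times the title phrase occurs in the
# text and that count relative to the text's word count). The real implementations may differ.

import re

def normalize_text_sketch(text):
    # lowercase and split into word tokens
    return re.findall("[a-z0-9]+", text.lower())

def phrases_freq_sketch(title, text):
    title_tokens = normalize_text_sketch(title)
    text_tokens = normalize_text_sketch(text)
    if not title_tokens or not text_tokens:
        return (0, 0.0)
    # count non-overlapping occurrences of the full title phrase in the text
    phrase = " ".join(title_tokens)
    body = " ".join(text_tokens)
    count = body.count(phrase)
    density = float(count * len(title_tokens)) / len(text_tokens)
    return (count, density)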
def parseResults_samsung(self, response):
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    # add product URLs to be parsed to this list
    if 'search_results' not in response.meta:
        product_urls = set()
    else:
        product_urls = response.meta['search_results']

    #TODO: implement support for multiple results pages?

    # if we find any results to this, it means we are already on a product page
    results = hxs.select("//ul[@class='product-info']")
    if results:
        product_urls.add(response.url)
        # it also means it's an exact match, so stop the search here
        response.meta['pending_requests'] = []
        response.meta['threshold'] = 0.2
        # # also temporarily lower threshold
        # self.threshold = 0.2
    else:
        # try to see if this is a results page then.
        # Content seems to be generated with javascript - open page with selenium,
        # extract its content, then return it back here

        # try to see if the page contains what we need, or we need to try it with selenium
        results = hxs.select(
            "//input[contains(@id,'detailpageurl')]/@value")

        if not results:
            print 'NO RESULTS: ', response.url

            #results = []

            # COMMENTED FOR TESTING
            # use selenium
            request = self.get_samsung_results(response.url)

            # get body of request
            request_body = request.body
            resp_for_scrapy = TextResponse('none', 200, {}, request_body, [],
                                           None)
            hxs = HtmlXPathSelector(resp_for_scrapy)

            #print "PAGE_SOURCE: ", page_source
            results = hxs.select(
                "//input[contains(@id,'detailpageurl')]/@value")
        else:
            print 'WE ALREADY HAD RESULTS! '

        print 'RESULTS: ', results

        for result in results:
            product_url = Utils.add_domain(result.extract().strip(),
                                           "http://www.samsung.com")
            product_urls.add(product_url)

    if product_urls and ('pending_requests' not in response.meta
                         or not response.meta['pending_requests']):
        request = Request(product_urls.pop(),
                          callback=self.parse_product_samsung,
                          meta=response.meta)

        request.meta['items'] = items
        # this will be the new product_urls list with the first item popped
        request.meta['search_results'] = product_urls

        return request

    # if there were no results, the request will never get back to reduceResults
    else:
        # # we are finished and should close the driver
        # if self.driver:
        #     self.driver.close()

        response.meta['items'] = items
        response.meta['parsed'] = True
        response.meta['search_results'] = product_urls
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
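# parseResults_samsung calls self.get_samsung_results to render a javascript-generated results page
# and hands the returned object's .body to a TextResponse. The helper itself is not in this section;
# a minimal sketch using selenium, assuming the spider keeps a webdriver instance in self.driver, is
# shown below. The real method may wait for specific elements instead of sleeping a fixed time.

import time
from selenium import webdriver

class RenderedPage(object):
    # tiny wrapper so the caller can read .body like it does on a downloaded response
    def __init__(self, body):
        self.body = body

def get_samsung_results_sketch(self, url):
    if not getattr(self, 'driver', None):
        self.driver = webdriver.Firefox()
    self.driver.get(url)
    # give the javascript on the page a moment to populate the result list
    time.sleep(5)
    return RenderedPage(self.driver.page_source.encode('utf-8'))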
#!/usr/bin/python

# Get ids from a CSV file containing one on each line, and generate Walmart product URLs based on them

import sys
import re

from spiders_utils import Utils

base_url = "http://www.walmart.com/ip/"

with open(sys.argv[1]) as idsfile:
    for line in idsfile:
        # if there are other fields, ignore them (get the first one)
        if "," in line:
            id_string = line.strip().split(",")[0]
        else:
            id_string = line.strip()
        # if it's not a number, ignore it (could be a header line)
        if re.match("[0-9]+", id_string):
            # generate the URL and output it
            url = Utils.add_domain(id_string, base_url)
            print url
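# Example invocation (assuming the script above is saved as make_walmart_urls.py and ids.csv has one
# Walmart id per line, optionally followed by other comma-separated fields):
#
#   python make_walmart_urls.py ids.csv > walmart_urls.txt
#
# Each valid numeric id is turned into a product URL like http://www.walmart.com/ip/12345678.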