def parse(self, response):
    for url in self.prod_urls:
        domain = Utils.extract_domain(url)
        # TODO: pass some cookie with country value for sites where price, for example, is displayed in local currency
        if domain != 'staples':
            yield Request(url, callback=self.parseProdpage, meta={"site": domain})
        else:
            # for staples we need extra cookies
            yield Request(url, callback=self.parseProdpage,
                          cookies={"zipcode": "1234"},
                          headers={"Cookie": "zipcode=" + "1234"},
                          meta={"dont_redirect": True, "dont_merge_cookies": True, "site": domain})
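# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the staples branch
# above sets the zipcode both as a Scrapy cookie and as a raw "Cookie" header,
# plus dont_redirect/dont_merge_cookies in meta. A hypothetical helper like
# the one below (the name and default zipcode are assumptions) could build
# such requests in one place if the same pattern is needed elsewhere.
# ---------------------------------------------------------------------------
from scrapy.http import Request

def build_zipcode_request(url, callback, zipcode="12345", extra_meta=None):
    """Build a Request that carries a hardcoded zipcode cookie (sketch)."""
    meta = {"dont_redirect": True, "dont_merge_cookies": True}
    if extra_meta:
        meta.update(extra_meta)
    return Request(url, callback=callback,
                   cookies={"zipcode": zipcode},
                   headers={"Cookie": "zipcode=" + zipcode},
                   meta=meta)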
def parse(self, response):
    if self.product_name:
        # can only use this option if self.target_site has been initialized
        # (usually true for spiders for retailers' sites, not true for manufacturers' sites)
        if not self.target_site:
            self.log("You can't use the product_name option without setting the target site to search on\n",
                     level=log.ERROR)
            raise CloseSpider("\nYou can't use the product_name option without setting the target site to search on\n")

        search_query = self.build_search_query(self.product_name)
        search_pages = self.build_search_pages(search_query)

        request = Request(search_pages[self.target_site], callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True
            ## print "SET AMAZON COOKIES"

        request.meta['origin_name'] = self.product_name
        request.meta['query'] = search_query
        # just use empty product model and url, for compatibility, also pending_requests
        request.meta['origin_model'] = ''
        request.meta['origin_url'] = ''
        request.meta['pending_requests'] = []

        yield request

    # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
    product_urls = []
    # if we have a single product URL, create a list of URLs containing it
    if self.product_url:
        product_urls.append(self.product_url)

    # if we have a file with a list of URLs, create a list with URLs found there
    if self.product_urls_file:
        f = open(self.product_urls_file, "r")
        for line in f:
            product_urls.append(line.strip())
        f.close()

    for product_url in product_urls:
        # extract site domain
        # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
        # origin_site = ""
        # if m:
        #     origin_site = m.group(1)
        # else:
        #     sys.stderr.write('Can\'t extract domain from URL.\n')
        origin_site = Utils.extract_domain(product_url)

        request = Request(product_url, callback=self.parseURL)
        request.meta['origin_site'] = origin_site
        if origin_site == 'staples':
            zipcode = "12345"
            request.cookies = {"zipcode": zipcode}
            request.meta['dont_redirect'] = True
        yield request

    # if we have a file with Walmart ids, create a list of the ids there
    if self.walmart_ids_file:
        walmart_ids = []
        f = open(self.walmart_ids_file, "r")
        for line in f:
            if "," in line:
                id_string = line.strip().split(",")[0]
            else:
                id_string = line.strip()
            if re.match("[0-9]+", id_string):
                walmart_ids.append(id_string)
        f.close()

        self.by_id = True

        for walmart_id in walmart_ids:
            # create Walmart URLs based on these IDs
            walmart_url = Utils.add_domain(walmart_id, "http://www.walmart.com/ip/")
            request = Request(walmart_url, callback=self.parseURL)
            #request.meta['origin_site'] = 'walmart'
            yield request
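# ---------------------------------------------------------------------------
# Illustrative sketch only: the parse() methods above rely on
# Utils.extract_domain() to map a URL to a retailer name ('staples',
# 'walmart', 'macys', ...). The project's real implementation is not shown
# here; the stand-in below (name and regex are assumptions) just documents
# what the callers expect from it, in the spirit of the commented-out
# "http://www1?\.([^\.]+)\.com.*" regex above.
# ---------------------------------------------------------------------------
import re
from urlparse import urlparse  # Python 2, matching the rest of this code

def extract_domain_sketch(url):
    """Return the second-level domain, e.g. 'walmart' for
    'http://www.walmart.com/ip/12345', or None if none is found (sketch)."""
    netloc = urlparse(url).netloc  # e.g. 'www.walmart.com'
    m = re.search(r"(?:www\d*\.)?([^.]+)\.com", netloc)
    return m.group(1) if m else None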
def parse(self, response):
    items = []

    # extract site domain
    site = Utils.extract_domain(response.url)
    if not site:
        return items

    # handle staples televisions
    if site == 'staples':

        ############################################
        #
        # # Use selenium - not necessary anymore
        #
        # zipcode = "12345"
        #
        # hxs = HtmlXPathSelector(response)
        # # return Request(self.cat_page, callback = self.parsePage_staples, cookies = {"zipcode" : zipcode}, meta = {"dont_redirect" : False})
        #
        # # use selenium to complete the zipcode form and get the first results page
        # driver = webdriver.Firefox()
        # driver.get(response.url)
        #
        # # set a hardcoded value for zipcode
        # zipcode = "12345"
        # textbox = driver.find_element_by_name("zipCode")
        # if textbox.is_displayed():
        #     textbox.send_keys(zipcode)
        #     button = driver.find_element_by_id("submitLink")
        #     button.click()
        #     cookie = {"zipcode": zipcode}
        #     driver.add_cookie(cookie)
        #     time.sleep(5)
        #
        # # convert html to "nice format"
        # text_html = driver.page_source.encode('utf-8')
        # #print "TEXT_HTML", text_html
        # html_str = str(text_html)
        #
        # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
        # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
        # #resp_for_scrapy = TextResponse(html_str)
        #
        # # pass first page to parsePage function to extract products
        # items += self.parsePage_staples(resp_for_scrapy)
        #
        # # use selenium to get next page, while there is a next page
        # next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
        # while (next_page):
        #     next_page.click()
        #     time.sleep(5)
        #
        #     # convert html to "nice format"
        #     text_html = driver.page_source.encode('utf-8')
        #     #print "TEXT_HTML", text_html
        #     html_str = str(text_html)
        #
        #     # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
        #     resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
        #     #resp_for_scrapy = TextResponse(html_str)
        #
        #     # pass first page to parsePage function to extract products
        #     items += self.parsePage_staples(resp_for_scrapy)
        #
        #     hxs = HtmlXPathSelector(resp_for_scrapy)
        #     next = hxs.select("//li[@class='pageNext']/a")
        #     next_page = None
        #     if next:
        #         next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
        #
        #     #TODO: this doesn't work
        #     # try:
        #     #     next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
        #     #     break
        #     # except NoSuchElementException:
        #     #     # if there are no more pages exit the loop
        #     #     driver.close()
        #     #     return items
        #
        # driver.close()
        # return items
        #
        ##############################################

        zipcode = "12345"
        request = Request(response.url, callback=self.parsePage_staples,
                          cookies={"zipcode": zipcode},
                          headers={"Cookie": "zipcode=" + zipcode},
                          meta={"dont_redirect": True, "dont_merge_cookies": True})
        return request

    # handle bloomingdales sneakers
    if site == 'bloomingdales':
        driver = webdriver.Firefox()
        driver.get(response.url)

        # use selenium to select USD currency
        link = driver.find_element_by_xpath("//li[@id='bl_nav_account_flag']//a")
        link.click()
        time.sleep(5)
        button = driver.find_element_by_id("iShip_shipToUS")
        button.click()
        time.sleep(10)

        # convert html to "nice format"
        text_html = driver.page_source.encode('utf-8')
        html_str = str(text_html)

        # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
        resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)

        # parse first page with parsePage_bloomingdales function
        items += self.parsePage_bloomingdales(resp_for_scrapy)
        hxs = HtmlXPathSelector(resp_for_scrapy)

        # while there is a next page, get it and pass it to parsePage_bloomingdales
        next_page_url = hxs.select("//li[@class='nextArrow']//a")
        while next_page_url:
            # use selenium to click on the next page arrow and retrieve the resulting page, if any
            next = driver.find_element_by_xpath("//li[@class='nextArrow']//a")
            next.click()
            time.sleep(5)

            # convert html to "nice format"
            text_html = driver.page_source.encode('utf-8')
            html_str = str(text_html)

            # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)

            # pass the page to parsePage function to extract products
            items += self.parsePage_bloomingdales(resp_for_scrapy)
            hxs = HtmlXPathSelector(resp_for_scrapy)
            next_page_url = hxs.select("//li[@class='nextArrow']//a")

        driver.close()
        return items

    # works for both product list pages and higher level pages with links in the left side menu to the product list page
    if site == 'walmart':
        hxs = HtmlXPathSelector(response)
        # try to see if it's not a product page but branches into further subcategories; select the "See all..." page URL
        #! this has a space after the div class, maybe in other pages it doesn't
        seeall = hxs.select("//div[@class='CustomSecondaryNav ']//li[last()]/a/@href").extract()
        if seeall:
            root_url = "http://www.walmart.com"
            page_url = root_url + seeall[0]
            # send the page to parsePage and extract product URLs
            request = Request(page_url, callback=self.parsePage_walmart)
            return request
        # if we can't find the link to the product list page, try to parse this as the product list page
        else:
            return Request(response.url, callback=self.parsePage_walmart)

    # works for both product list pages and higher level pages with links in the left side menu to the product list page
    if site == 'amazon':
        hxs = HtmlXPathSelector(response)
        # select first "see more" list ("All Televisions")
        seeall = hxs.select("//p[@class='seeMore'][1]/a/@href").extract()
        root_url = "http://www.amazon.com"
        # if we can find the "see all" link, follow it and pass it to parsePage to extract product URLs
        if seeall:
            page_url = root_url + seeall[0]
            return Request(page_url, callback=self.parsePage_amazon)
        # otherwise, try to parse the current page as a product list page
        else:
            return Request(response.url, callback=self.parsePage_amazon)

    # works for both product list pages and higher level pages with links in the left side menu to the product list page
    if site == 'bestbuy':
        hxs = HtmlXPathSelector(response)
        # try to see if it's not a product page but branches into further subcategories; select the "See all..." page URL
        seeall_list = hxs.select("//ul[@class='search']")
        if seeall_list:
            seeall = seeall_list[0].select("li[1]/a/@href").extract()
            if seeall:
                root_url = "http://www.bestbuy.com"
                page_url = root_url + seeall[0]
                # send the page to parsePage and extract product URLs
                return Request(page_url, callback=self.parsePage_bestbuy)
            else:
                return Request(response.url, callback=self.parsePage_bestbuy)
        # if we can't find the link to the product list page, try to parse this as the product list page
        else:
            return Request(response.url, callback=self.parsePage_bestbuy)

    if site == 'nordstrom':
        hxs = HtmlXPathSelector(response)
        return Request(response.url, callback=self.parsePage_nordstrom)

    if site == 'macys':
        hxs = HtmlXPathSelector(response)
        m = re.match("http://www1.macys.com/shop(.*)\?id=([0-9]+).*", self.cat_page)
        cat_id = 0
        if m:
            cat_id = int(m.group(2))
        productids_request = "http://www1.macys.com/catalog/category/facetedmeta?edge=hybrid&categoryId=%d&pageIndex=1&sortBy=ORIGINAL&productsPerPage=40&" % cat_id
        return Request(productids_request, callback=self.parse_macys,
                       headers={"Cookie": "shippingCountry=US"},
                       meta={'dont_merge_cookies': True, "cat_id": cat_id, "page_nr": 1})

    if site == 'williams-sonoma':
        return Request(url=self.cat_page, callback=self.parsePage_sonoma)

    # TODO: is the list of product numbers ok for all pages? got it from the laptops category request;
    #       seems to work for others as well even though it's not the same
    if site == 'overstock':
        # # get category, and if it's laptops treat it specially using the hardcoded url
        # m = re.match("http://www.overstock.com/[^/]+/([^/]+)/.*", self.cat_page)
        # if m and m.group(1) == "Laptops":
        return Request(url=self.cat_page + "&index=1&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true",
                       callback=self.parsePage_overstock,
                       headers={"Referer": self.cat_page + "&page=2", "X-Requested-With": "XMLHttpRequest"},
                       meta={"index": 1})
        # else:
        #     return Request(url = self.cat_page, callback = self.parsePage_overstock)

    if site == 'newegg':
        return Request(url=self.cat_page, callback=self.parsePage_newegg, meta={'page': 1})

    if site == 'tigerdirect':
        # add as meta the page number and the base URL to which to append the page number if necessary
        return Request(url=self.cat_page, callback=self.parsePage_tigerdirect,
                       meta={'page': 1, 'base_url': self.cat_page})
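# ---------------------------------------------------------------------------
# Illustrative sketch: the bloomingdales branch above (and the commented-out
# staples code) repeats the "hack" of wrapping Selenium's page_source in a
# Scrapy TextResponse so the existing parsePage_* parsers can run on it.
# A hypothetical helper (name assumed, not part of the original code) that
# isolates just that step:
# ---------------------------------------------------------------------------
from scrapy.http import TextResponse

def selenium_page_to_response(driver, url='none'):
    """Wrap the current Selenium page source in a TextResponse (sketch)."""
    html_str = str(driver.page_source.encode('utf-8'))
    # same positional arguments as the original hack:
    # url, status, headers, body, flags, request
    return TextResponse(url, 200, {}, html_str, [], None)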