Example #1
    def parse(self, response):
        for url in self.prod_urls:
            domain = Utils.extract_domain(url)
            # TODO: pass a cookie with the country value for sites where, for example,
            # the price is displayed in the local currency
            if domain != 'staples':
                yield Request(url,
                              callback=self.parseProdpage,
                              meta={"site": domain})
            # for staples we need extra cookies (zip code)
            else:
                yield Request(url,
                              callback=self.parseProdpage,
                              cookies={"zipcode": "1234"},
                              headers={"Cookie": "zipcode=1234"},
                              meta={"dont_redirect": True,
                                    "dont_merge_cookies": True,
                                    "site": domain})
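Note on the Staples branch above: setting "dont_merge_cookies": True disables Scrapy's cookie middleware for that request, so the cookies dict alone would not reach the site; the raw Cookie header is what actually carries the zip code. A minimal, self-contained sketch of the same pattern (the staples_request helper name is illustrative, not part of the original spider):

from scrapy.http import Request

def staples_request(url, zipcode="1234", callback=None):
    # Bypass the cookie middleware and send the zip code as a raw Cookie header,
    # mirroring the else-branch of Example #1.
    return Request(url,
                   callback=callback,
                   cookies={"zipcode": zipcode},
                   headers={"Cookie": "zipcode=" + zipcode},
                   meta={"dont_redirect": True,
                         "dont_merge_cookies": True,
                         "site": "staples"})
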
Example #2
    def parse(self, response):

        if self.product_name:

            # can only use this option if self.target_site has been initialized
            # (usually true for retailer-site spiders, not for manufacturer-site spiders)
            if not self.target_site:
                self.log(
                    "You can't use the product_name option without setting the target site to search on\n",
                    level=log.ERROR)
                raise CloseSpider(
                    "\nYou can't use the product_name option without setting the target site to search on\n"
                )

            search_query = self.build_search_query(self.product_name)
            search_pages = self.build_search_pages(search_query)

            request = Request(search_pages[self.target_site],
                              callback=self.parseResults)

            # set amazon cookies
            if self.target_site == 'amazon' and self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookie'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request.meta['origin_name'] = self.product_name
            request.meta['query'] = search_query

            # just use empty product model and url, for compatibility, also pending_requests
            request.meta['origin_model'] = ''
            request.meta['origin_url'] = ''
            request.meta['pending_requests'] = []

            yield request

        # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
        product_urls = []
        # if we have a single product URL, create a list of URLs containing it
        if self.product_url:
            product_urls.append(self.product_url)

        # if we have a file with a list of URLs, create a list with URLs found there
        if self.product_urls_file:
            with open(self.product_urls_file, "r") as f:
                for line in f:
                    product_urls.append(line.strip())

        for product_url in product_urls:
            # extract site domain

            # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
            # origin_site = ""
            # if m:
            # 	origin_site = m.group(1)
            # else:
            # 	sys.stderr.write('Can\'t extract domain from URL.\n')
            origin_site = Utils.extract_domain(product_url)

            request = Request(product_url, callback=self.parseURL)
            request.meta['origin_site'] = origin_site
            if origin_site == 'staples':
                zipcode = "12345"
                request.cookies = {"zipcode": zipcode}
                request.meta['dont_redirect'] = True
            yield request

        # if we have a file with Walmart ids, create a list of the ids there
        if self.walmart_ids_file:
            walmart_ids = []
            with open(self.walmart_ids_file, "r") as f:
                for line in f:
                    if "," in line:
                        id_string = line.strip().split(",")[0]
                    else:
                        id_string = line.strip()
                    if re.match("[0-9]+", id_string):
                        walmart_ids.append(id_string)

            self.by_id = True

            for walmart_id in walmart_ids:
                # create Walmart URLs based on these IDs
                walmart_url = Utils.add_domain(walmart_id,
                                               "http://www.walmart.com/ip/")
                request = Request(walmart_url, callback=self.parseURL)
                #request.meta['origin_site'] = 'walmart'
                yield request
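
For the Walmart-IDs branch above, Utils.add_domain is assumed to simply prepend the product-page prefix to a bare numeric id. A minimal sketch of that assumption (illustrative only; the real helper lives in the project's Utils module):

def add_domain(product_id, prefix="http://www.walmart.com/ip/"):
    # e.g. add_domain("12345") -> "http://www.walmart.com/ip/12345" (the id is a placeholder)
    return prefix + product_id
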
Example #3
    def parse(self, response):

        items = []

        # extract site domain
        site = Utils.extract_domain(response.url)
        if not site:
            return items

        # handle staples televisions
        if site == 'staples':

            ############################################
            #
            # # Use selenium - not necessary anymore

            # # zipcode = "12345"

            # # hxs = HtmlXPathSelector(response)
            # # return Request(self.cat_page, callback = self.parsePage_staples, cookies = {"zipcode" : zipcode}, meta = {"dont_redirect" : False})
            # # use selenium to complete the zipcode form and get the first results page
            # driver = webdriver.Firefox()
            # driver.get(response.url)

            # # set a hardcoded value for zipcode
            # zipcode = "12345"
            # textbox = driver.find_element_by_name("zipCode")

            # if textbox.is_displayed():
            # 	textbox.send_keys(zipcode)

            # 	button = driver.find_element_by_id("submitLink")
            # 	button.click()

            # 	cookie = {"zipcode": zipcode}
            # 	driver.add_cookie(cookie)

            # 	time.sleep(5)

            # # convert html to "nice format"
            # text_html = driver.page_source.encode('utf-8')
            # #print "TEXT_HTML", text_html
            # html_str = str(text_html)

            # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
            # #resp_for_scrapy = TextResponse(html_str)

            # # pass first page to parsePage function to extract products
            # items += self.parsePage_staples(resp_for_scrapy)

            # # use selenium to get next page, while there is a next page
            # next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
            # while (next_page):
            # 	next_page.click()
            # 	time.sleep(5)

            # 	# convert html to "nice format"
            # 	text_html = driver.page_source.encode('utf-8')
            # 	#print "TEXT_HTML", text_html
            # 	html_str = str(text_html)

            # 	# this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            # 	resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
            # 	#resp_for_scrapy = TextResponse(html_str)

            # 	# pass first page to parsePage function to extract products
            # 	items += self.parsePage_staples(resp_for_scrapy)

            # 	hxs = HtmlXPathSelector(resp_for_scrapy)
            # 	next = hxs.select("//li[@class='pageNext']/a")
            # 	next_page = None
            # 	if next:
            # 		next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")

            # 	#TODO: this doesn't work
            # 	# try:
            # 	# 	next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
            # 	# 	break
            # 	# except NoSuchElementException:
            # 	# 	# if there are no more pages exit the loop
            # 	# 	driver.close()
            # 	# 	return items

            # driver.close()

            # return items
            #
            ##############################################

            zipcode = "12345"
            request = Request(response.url,
                              callback=self.parsePage_staples,
                              cookies={"zipcode": zipcode},
                              headers={"Cookie": "zipcode=" + zipcode},
                              meta={"dont_redirect": True,
                                    "dont_merge_cookies": True})
            return request

        # handle bloomingdales sneakers
        if site == 'bloomingdales':
            driver = webdriver.Firefox()
            driver.get(response.url)

            # use selenium to select USD currency
            link = driver.find_element_by_xpath(
                "//li[@id='bl_nav_account_flag']//a")
            link.click()
            time.sleep(5)
            button = driver.find_element_by_id("iShip_shipToUS")
            button.click()
            time.sleep(10)

            # convert html to "nice format"
            text_html = driver.page_source.encode('utf-8')
            html_str = str(text_html)

            # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)

            # parse first page with parsePage_bloomingdales function
            items += self.parsePage_bloomingdales(resp_for_scrapy)
            hxs = HtmlXPathSelector(resp_for_scrapy)

            # while there is a next page get it and pass it to parsePage_bloomingdales
            next_page_url = hxs.select("//li[@class='nextArrow']//a")

            while next_page_url:

                # use selenium to click on next page arrow and retrieve the resulted page if any
                next = driver.find_element_by_xpath(
                    "//li[@class='nextArrow']//a")
                next.click()

                time.sleep(5)

                # convert html to "nice format"
                text_html = driver.page_source.encode('utf-8')
                html_str = str(text_html)

                # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
                resp_for_scrapy = TextResponse('none', 200, {}, html_str, [],
                                               None)

                # pass the page to parsePage function to extract products
                items += self.parsePage_bloomingdales(resp_for_scrapy)

                hxs = HtmlXPathSelector(resp_for_scrapy)
                next_page_url = hxs.select("//li[@class='nextArrow']//a")

            driver.close()

            return items

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'walmart':
            hxs = HtmlXPathSelector(response)

            # try to see if it's not a product page but branches into further subcategories, select "See all..." page URL
            #! this has a space after the div class, maybe in other pages it doesn't
            seeall = hxs.select(
                "//div[@class='CustomSecondaryNav ']//li[last()]/a/@href"
            ).extract()
            if seeall:
                root_url = "http://www.walmart.com"
                page_url = root_url + seeall[0]
                # send the page to parsePage and extract product URLs
                request = Request(page_url, callback=self.parsePage_walmart)
                return request
            # if you can't find the link to the product list page, try to parse this as the product list page
            else:
                return Request(response.url, callback=self.parsePage_walmart)

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'amazon':
            hxs = HtmlXPathSelector(response)
            # select first see more list ("All Televisions")
            seeall = hxs.select("//p[@class='seeMore'][1]/a/@href").extract()
            root_url = "http://www.amazon.com"

            # if we can find see all link, follow it and pass it to parsePage to extract product URLs
            if seeall:
                page_url = root_url + seeall[0]
                return Request(page_url, callback=self.parsePage_amazon)

            # otherwise, try to parse current page as product list page
            else:
                return Request(response.url, callback=self.parsePage_amazon)

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'bestbuy':
            hxs = HtmlXPathSelector(response)

            # try to see if it's not a product page but branches into further subcategories, select "See all..." page URL
            seeall_list = hxs.select("//ul[@class='search']")
            if seeall_list:
                seeall = seeall_list[0].select("li[1]/a/@href").extract()
                if seeall:
                    root_url = "http://www.bestbuy.com"
                    page_url = root_url + seeall[0]

                    # send the page to parsePage and extract product URLs
                    return Request(page_url, callback=self.parsePage_bestbuy)

                else:
                    return Request(response.url,
                                   callback=self.parsePage_bestbuy)

            # if you can't find the link to the product list page, try to parse this as the product list page
            else:
                return Request(response.url, callback=self.parsePage_bestbuy)

        if site == 'nordstrom':
            hxs = HtmlXPathSelector(response)

            return Request(response.url, callback=self.parsePage_nordstrom)

        if site == 'macys':

            hxs = HtmlXPathSelector(response)

            m = re.match("http://www1.macys.com/shop(.*)\?id=([0-9]+).*",
                         self.cat_page)
            cat_id = 0
            if m:
                cat_id = int(m.group(2))
            productids_request = "http://www1.macys.com/catalog/category/facetedmeta?edge=hybrid&categoryId=%d&pageIndex=1&sortBy=ORIGINAL&productsPerPage=40&" % cat_id
            return Request(productids_request,
                           callback=self.parse_macys,
                           headers={"Cookie": "shippingCountry=US"},
                           meta={
                               'dont_merge_cookies': True,
                               "cat_id": cat_id,
                               "page_nr": 1
                           })

        if site == 'williams-sonoma':

            return Request(url=self.cat_page, callback=self.parsePage_sonoma)

        # TODO: is the list of product numbers ok for all pages? got it from the laptops
        # category request; seems to work for the others as well even though it's not the same
        if site == 'overstock':
            # # get category, and if it's laptops treat it specially using the hardcoded url
            # m = re.match("http://www.overstock.com/[^/]+/([^/]+)/.*", self.cat_page)
            # if m and m.group(1) == "Laptops":
            return Request(url=self.cat_page + "&index=1&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true",
                           callback=self.parsePage_overstock,
                           headers={"Referer": self.cat_page + "&page=2",
                                    "X-Requested-With": "XMLHttpRequest"},
                           meta={"index": 1})
            # else:
            # 	return Request(url = self.cat_page, callback = self.parsePage_overstock)

        if site == 'newegg':
            return Request(url=self.cat_page,
                           callback=self.parsePage_newegg,
                           meta={'page': 1})

        if site == 'tigerdirect':
            # pass as meta the page number and the base URL to which to append the page number if necessary
            return Request(url=self.cat_page,
                           callback=self.parsePage_tigerdirect,
                           meta={'page': 1, 'base_url': self.cat_page})
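
The parse method above dispatches on the site domain through a long chain of if blocks. As an illustrative sketch only (not the original code), the simpler branches could be driven by a dispatch table from domain to callback; the per-site cookies, headers, meta and the "See all..." subcategory handling that several branches need are deliberately omitted here:

from scrapy.http import Request

# Hypothetical dispatch table: domain -> name of the page-parsing callback on the spider.
SITE_CALLBACKS = {
    "nordstrom": "parsePage_nordstrom",
    "walmart": "parsePage_walmart",
    "amazon": "parsePage_amazon",
    "bestbuy": "parsePage_bestbuy",
}

def dispatch_by_site(spider, response, site):
    # 'site' is the domain already extracted with Utils.extract_domain in the spider
    callback_name = SITE_CALLBACKS.get(site)
    if callback_name is None:
        return None
    return Request(response.url, callback=getattr(spider, callback_name))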