Example no. 1
0
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                if 'request' in item:
                    req = request_from_dict(pickle.loads(item['request']),
                                            self.spider)
                else:
                    req = Request(item['url'], meta=make_splash_meta({}))
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'],
                              meta=make_splash_meta({}))

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                if key != 'request':
                    req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                if 'request' in item:
                    req = request_from_dict(pickle.loads(item['request']), self.spider)
                else:
                    req = Request(item['url'], meta=make_splash_meta({}))
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'], meta=make_splash_meta({}))

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                if key != 'request':
                    req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
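
Both variants above build their fallback Request with meta=make_splash_meta({}), a helper that is not part of the excerpt. A minimal, hypothetical sketch of what such a scrapy-splash style helper might return (the endpoint and argument defaults below are assumptions, not taken from the example):

    # Hypothetical sketch of a make_splash_meta helper; the endpoint and args are
    # illustrative defaults, not the values used by the original project.
    def make_splash_meta(meta):
        meta = dict(meta) if meta else {}
        meta.setdefault('splash', {
            'endpoint': 'render.html',  # Splash HTTP API endpoint returning rendered HTML
            'args': {'wait': 2.0},      # give the page time to render before snapshotting
        })
        return meta
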
    def next_request(self):
        """
        Logic to handle getting a new url request, from a bunch of
        different queues
        """
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}".format(url=item["url"]))
            try:
                req = Request(item["url"])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request("http://" + item["url"])

            if "meta" in item:
                item = item["meta"]

            # defaults not in schema
            if "curdepth" not in item:
                item["curdepth"] = 0
            if "retry_times" not in item:
                item["retry_times"] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if "useragent" in item and item["useragent"] is not None:
                req.headers["User-Agent"] = item["useragent"]
            if "cookie" in item and item["cookie"] is not None:
                if isinstance(item["cookie"], dict):
                    req.cookies = item["cookie"]
                elif isinstance(item["cookie"], basestring):
                    req.cookies = self.parse_cookie(item["cookie"])

            return req

        return None
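
All three variants fall back to self.parse_cookie when the queued item carries its cookies as a raw string rather than a dict. That helper is not shown; a minimal sketch of the behaviour it is assumed to have, written here as a plain function rather than a scheduler method:

    # Hypothetical sketch of the parse_cookie helper assumed by the snippets above:
    # split a 'name=value; name2=value2' cookie string into a dict.
    def parse_cookie(cookie_string):
        cookies = {}
        for pair in cookie_string.split(';'):
            if '=' not in pair:
                continue
            name, _, value = pair.partition('=')
            cookies[name.strip()] = value.strip()
        return cookies
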
Example no. 4
0
    def process_request(self, request: Request, spider):
        if spider.name == 'cookiespider':
            cookies = CookieUtils.getCookies()
            logging.info(
                "=================================== handled by the cookie middleware, cookies: %s" %
                cookies)
            request.cookies = cookies
        return None
    def make_requests_from_url(self, url, id=None, attr=None):
        #request = Request(url,headers={"Accept-Encoding": "gzip,deflate,sdch","Accept-Language": "en-US,en;q=0.8" , "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36" , "Accept": "*/*" ,"Referer": "https://www.amazon.de" , "Connection": "keep-alive" }, dont_filter=True)
        request = Request(url,headers={'Origin': 'https://www.amazon.de', 'Referer':'https://www.amazon.de', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.8', 'Upgrade-Insecure-Requests': '1' , 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36, LuminadBot/1.0 ([email protected])', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' ,'Cache-Control': 'max-age=0' }, dont_filter=True)
        request.cookies ={'s_pers':'%20s_fid%3D300B8810F7CDBDE1-10092DE00A8359D7%7C1558680220920%3B%20s_dl%3D1%7C1495610020921%3B%20gpv_page%3DDE%253AAZ%253ASOA-Landing%7C1495610020924%3B%20s_ev15%3D%255B%255B%2527AZDEGNOSellC%2527%252C%25271495608209183%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608216403%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220916%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220925%2527%255D%255D%7C1653374620925%3B%20s_eVar26%3DAmazon%2520Services%2520DE%7C1498200220927%3B', 'amznacsleftnav-656eac4a-695b-3a6a-946f-db61e4deb392':'1', 'amznacsleftnav-fdfd699f-c863-3b78-85b2-8a649c6b58f6':'1', 'x-amz-captcha-1':'1508482986892769', 'x-amz-captcha-2':'hw3RhTh0tvhX81cdMFkFgQ==', 'lc-acbde':'de_DE', 'session-id':'261-5677163-4561642', 'ubid-acbde':'259-8821950-7904223', 'a-ogbcbff':'1', 'x-acbde':'"V0z3CSC5jraR2B7OY6OiPR3wrDO7GbRjA9fTg2AJTorXXbAPToPEDvMAo8KTh@7M"', 'at-acbde':'Atza|IwEBIHwqc3CD45BqlJs_5aa-V8dGYqRemzUHaOJhdARXf-o6rlAp0DANlQO8ZPGB23Uek573IjBb2qkX4mlZWKna1Xn3pOzTpiUd0SQO7gh-uTZnxF5r2p22mMsR4_clEZvBBlZBMJYXD6HPxW7_sEYtklqCkY-Br197rDnz9KPza3y5u7XzgezJIBdXCaeq4vAqo9Wrl0uG0RGKSr41-4rKK9hpnGK1nN4UbO_qWxnLSwzA6LwgXczqe0C5EyH1HIp12IlKFB7OgxIEsH0QZAiT0eh0D7sFwlVG6eHfqPNWfix03SZ7apAC7C7jQ-vw1lmICAeJciD9QmumuCNEDDCT-GGWCkrAh-gxMRhKpm7Q5_gOtJijbqoLi3VfPO9QrCA7hYW8Atc-kFRIW3Y6vtRc8OZzZipCneewy-Rj_xYUMFVWMCmHs_ljfe2W6vxWgiRfmyw', 'sess-at-acbde':'"NbwPRqfG4oPuznYLUmFM5Y5JSvyizaA9ZJz6vTkNQL4="', 'sst-acbde':'Sst1|PQEs5smXCO43G8WIotdsANHyCEBZ9TkcZ_OdLYTgnk2mCfAy4Z5W77Y7zX74BQuxS7UKtfnUM6KkKhmcu01A2Fq7xshyjesDvnQDYp9QYcrFDvlceaVvpWqQfpEt2Q9XIM0VQFdd2EMpXc4C9QlehgHT0URfOlUmC47BkfeJr5dpb4Pv_dbnFASQli0k7Cln9sN_Vf4Wqz4km-6UTpsNlVJxJE48_RK6Zsk7bklH_cpJE8tfltiPzdhyhY2oDh7SieUx6CNKphxtIezjzr-0SbD8cg', 'x-wl-uid':'11PAl+O2T6FeY67SmgtWeMBtyZ538YMsy2Zcpov67B4kL2DVIv3Nx7rEprTLBkI4W3ZZ954YAADFuG1oAMSt9uIgNhk3yQfBCY6pDMJUcXUzK6rFTPF4tPnrWr3utKPzHqJATwvQOHKE=', 'session-token':'"tzfdQwuhV4SLJ9/PfV3QSfg2b3LxOcRlqovsFb3AsrqZSnkxHCjhgMsO3d7NbIS7rOee9CPoh7Lxo8LF7EdVopNDFYLMzzOtDGVhnY4czMEVNS5VHAxjtdaDvRNDJC0OloD0EvRMDfHeXG70D93/wWVNfqU0c6nKEv0yTLU7pFpIbTicUYQQFeDZYf9tPQEepQxbZ1pBOU+0FjTwWUj3SnNdDf/SVmmk+feDLRuqn+WcP6w6CPQ1G03W/TACUuIHBz9mSMRFPU0il4m+s0KyzA=="', 'csm-hit':'s-F8Q4HD9WHE8M6GMQKQT4|1519186540551', 'session-id-time':'2082754801l' }
        #request.cookies ={ 'x-wl-uid':'1yOwLjX2WnY9mLM7WsqYh6e6V1fXMd1ZMNtSL2K4PXEdSmASj6jCPPBezf56CZBu8dNd+B0dbGk6FSb6sv3/5Z2bObc/d7RBUn4jelvgzhpzxeiQQPCByKtKt+rFfaF6lordo7OBLv6I=', 's_vn':'1538041742354%26vn%3D1','s_fid':'7FA70D7094115718-2F7725F9CDA62241', 'regStatus':'pre-register', 's_nr':'1506673939908-Repeat', 's_vnum':'1938673939908%26vn%3D1', 's_dslv':'1506673939908', 'JSESSIONID':'7D8C49FEC5F5D74FBFB8C44B4582E920', 'skin':'noskin','session-token':'fMF7GsLbD9OFUtBEffIAbQYQ+k+oGY4qtqc4L+jpdCrQuiLu4c9Hm8YSsbtiO5c9mfQ3IRuuQojX/N/SOZ1vcQVF58RRX0RpMeXLEPvV50aTQq+f/s/rV8yGoETGydD/29yEVxxEqc4cWCblz5+V28+sOHeSSoUiYwysN7+jUIC+ICgHh8EJAM1aQiONRz31', 'ubid-main':'131-1502033-8002851', 'session-id-time':'2082787201l', 'session-id':'143-4281452-3926723', 'csm-hit':'%7B%22tb%22%3A%223FYTGMTG10SZNP3AYFTN%2Bs-TWA04Y4WMDA93A0N8PZQ%7C1507802966608%22%7D' }

        if id:request.meta['id']=id
        if attr:request.meta['attr']=attr
        # set the meta['item'] to use the item in the next call back
        return request
    def request_from_feed(self, item):
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], string_types):
                req.cookies = self.parse_cookie(item['cookie'])
        return req
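
Note that request_from_feed tests the cookie value against string_types instead of the Python 2-only basestring used in the other examples, which keeps the isinstance check working on both interpreter lines. Assuming the six compatibility library is the source of that name, the relevant import and behaviour are:

    # string_types comes from the six compatibility layer:
    # (str,) on Python 3, (basestring,) on Python 2.
    from six import string_types

    print(isinstance('sid=abc123; lang=en', string_types))  # True on both Python 2 and 3
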
Example no. 7
0
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
Example no. 8
0
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
                % (item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None
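
This variant resolves the queued 'callback' and 'errback' names through a get_method utility rather than the bare getattr used in the next function. The utility itself is not shown; a minimal sketch of what it is assumed to do (look the name up on the spider and swallow missing attributes instead of raising):

    import logging

    # Hypothetical sketch of the get_method utility used above: resolve a method
    # by name on an object, returning None and logging if the name is missing.
    def get_method(obj, name):
        try:
            return getattr(obj, name)
        except AttributeError:
            logging.getLogger(__name__).warning("Unable to find method '%s' on %s", name, obj)
            return None
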
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
Example no. 10
0
    def parseURL(self, response):

        site = response.meta['origin_site']
        hxs = HtmlXPathSelector(response)

        product_model = ""

        product_brand = ""
        product_price = ""

        #############################################################
        # Extract product attributes (differently depending on site)

        if site == 'staples':

            product_name = hxs.select("//h1/text()").extract()[0]

            model_nodes = hxs.select(
                "//p[@class='itemModel']/text()").extract()
            if model_nodes:
                model_node = model_nodes[0]

                # pass re.UNICODE via flags=; as a fourth positional argument it
                # would be interpreted as re.sub's count parameter
                model_node = re.sub(r"\W", " ", model_node, flags=re.UNICODE)
                m = re.match("(.*)Model:(.*)", model_node.encode("utf-8"),
                             re.UNICODE)

                if m:
                    product_model = m.group(2).strip()

        elif site == 'walmart':
            product_name_holder = hxs.select(
                "//h1[@class='productTitle']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()

                # get integer part of product price
                product_price_big = hxs.select(
                    "//span[@class='bigPriceText1']/text()").extract()

                if not product_price_big:
                    self.log("Didn't find product price: " + response.url +
                             "\n",
                             level=log.DEBUG)
                # if there is a range of prices take their average
                if len(product_price_big) > 1:

                    # remove $ and .
                    product_price_min = re.sub("[\$\.,]", "",
                                               product_price_big[0])
                    product_price_max = re.sub("[\$\.,]", "",
                                               product_price_big[-1])

                    #TODO: check if they're ints?
                    product_price_big = (int(product_price_min) +
                                         int(product_price_max)) / 2.0

                elif product_price_big:
                    product_price_big = int(
                        re.sub("[\$\.,]", "", product_price_big[0]))

                # get fractional part of price
                #TODO - not that important

                if product_price_big:
                    product_price = product_price_big

            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                # return the item as a non-matched item
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                # remove unnecessary parameters
                m = re.match("(.*)\?enlargedSearch.*", item['origin_url'])
                if m:
                    item['origin_url'] = m.group(1)
                #item['origin_id'] = self.extract_walmart_id(item['origin_url'])
                if self.name != 'manufacturer':
                    # don't return empty matches in manufacturer spider
                    yield item
                return

            #TODO: if it contains 2 words, first could be brand - also add it in similar_names function
            product_model_holder = hxs.select(
                "//td[contains(text(),'Model')]/following-sibling::*/text()"
            ).extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        #TODO: for the sites below, complete with missing logic, for not returning empty elements in manufacturer spider
        elif site == 'newegg':
            product_name_holder = hxs.select(
                "//span[@itemprop='name']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()
            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                yield item
                return
            product_model_holder = hxs.select(
                "//dt[text()='Model']/following-sibling::*/text()").extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        else:
            raise CloseSpider("Unsupported site: " + site)

        if site == 'staples':
            zipcode = "12345"
            cookies = {"zipcode": zipcode}
        else:
            cookies = {}

        #######################################################################
        # Create search queries to the second site, based on product attributes

        request = None

        #TODO: search by alternative model numbers?

        #TODO: search by model number extracted from product name? Don't I do that implicitly? no, but in combinations

        # if there is no product model, try to extract it
        if not product_model:
            product_model = ProcessText.extract_model_from_name(product_name)

            # for logging purposes, set this back to the empty string if it wasn't found (so was None)
            if not product_model:
                product_model = ""

            # product_model_index = ProcessText.extract_model_nr_index(product_name)
            # if product_model_index >= 0:
            # 	product_model = product_name[product_model_index]

            ## print "MODEL EXTRACTED: ", product_model, " FROM NAME ", product_name

        # if there is no product brand, get first word in name, assume it's the brand
        product_brand_extracted = ""
        #product_name_tokenized = ProcessText.normalize(product_name)
        product_name_tokenized = [
            word.lower() for word in product_name.split(" ")
        ]
        #TODO: maybe extract brand as word after 'by', if 'by' is somewhere in the product name
        if len(product_name_tokenized) > 0 and re.match(
                "[a-z]*", product_name_tokenized[0]):
            product_brand_extracted = product_name_tokenized[0].lower()

        # if we are in manufacturer spider, set target_site to manufacturer site

        # for manufacturer spider set target_site of request to brand extracted from name for this particular product
        if self.name == 'manufacturer':

            #TODO: restore commented code; if brand not found, try to search for it on every manufacturer site (build queries fo every supported site)
            # hardcode target site to sony
            #self.target_site = 'sony'
            #self.target_site = product_brand_extracted

            #target_site = product_brand_extracted

            # can only go on if site is supported
            # (use dummy query)
            #if target_site not in self.build_search_pages("").keys():
            if product_brand_extracted not in self.build_search_pages(
                    "").keys():

                product_brands_extracted = set(
                    self.build_search_pages("").keys()).intersection(
                        set(product_name_tokenized))

                if product_brands_extracted:
                    product_brand_extracted = product_brands_extracted.pop()
                    #target_site = product_brand_extracted
                else:
                    # give up and return item without match
                    self.log(
                        "Manufacturer site not supported (" +
                        product_brand_extracted +
                        ") or not able to extract brand from product name (" +
                        product_name + ")\n",
                        level=log.ERROR)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

            # if specific site is not set, search on manufacturer site as extracted from name
            if not self.manufacturer_site:
                target_site = product_brand_extracted
            else:
                # if it's set, continue only if it matches extracted brand
                if self.manufacturer_site != product_brand_extracted:
                    self.log(
                        "Will abort matching for product, extracted brand does not match specified manufacturer option ("
                        + product_brand_extracted + ")\n",
                        level=log.INFO)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

                else:
                    target_site = product_brand_extracted

                    # # try to match it without specific site (manufacturer spider will try to search on all manufacturer sites)
                    # target_site = None

        # for other (site specific) spiders, set target_site of request to class variable self.target_site set in class "constructor" (init_sub)
        else:
            target_site = self.target_site

        # 1) Search by model number
        if product_model:

            #TODO: model was extracted with ProcessText.extract_model_from_name(), without lowercasing, should I lowercase before adding it to query?
            query1 = self.build_search_query(product_model)
            search_pages1 = self.build_search_pages(query1)
            #page1 = search_pages1[self.target_site]
            page1 = search_pages1[target_site]

            request1 = Request(page1, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request1.cookies = self.amazon_cookies
                request1.headers['Cookies'] = self.amazon_cookie_header
                #request1.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request1.meta['query'] = query1
            request1.meta['target_site'] = target_site

            request = request1

        # 2) Search by product full name
        query2 = self.build_search_query(product_name)
        search_pages2 = self.build_search_pages(query2)
        #page2 = search_pages2[self.target_site]
        page2 = search_pages2[target_site]
        request2 = Request(page2, callback=self.parseResults)

        # set cookies for amazon
        if (self.target_site == 'amazon' and self.cookies_file):
            request2.cookies = self.amazon_cookies
            request2.headers['Cookies'] = self.amazon_cookie_header
            #request2.meta['dont_merge_cookies'] = True

        request2.meta['query'] = query2
        request2.meta['target_site'] = target_site

        pending_requests = []

        if not request:
            request = request2
        else:
            pending_requests.append(request2)

        # 3) Search by combinations of words in product's name
        # create queries

        for words in ProcessText.words_combinations(product_name,
                                                    fast=self.fast):
            query3 = self.build_search_query(" ".join(words))
            search_pages3 = self.build_search_pages(query3)
            #page3 = search_pages3[self.target_site]
            page3 = search_pages3[target_site]
            request3 = Request(page3, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request3.cookies = self.amazon_cookies
                request3.headers['Cookies'] = self.amazon_cookie_header
                #request3.meta['dont_merge_cookies'] = True

            request3.meta['query'] = query3
            request3.meta['target_site'] = target_site

            pending_requests.append(request3)

        request.meta['pending_requests'] = pending_requests
        #request.meta['origin_site'] =
        # product page from source site
        #TODO: clean this URL? for walmart it added something with ?enlargedsearch=True
        request.meta['origin_url'] = response.url

        request.meta['origin_name'] = product_name
        request.meta['origin_model'] = product_model
        if product_price:
            request.meta['origin_price'] = product_price

        # origin product brand as extracted from name (basically the first word in the name)
        request.meta['origin_brand_extracted'] = product_brand_extracted

        # if self.by_id:
        # 	request.meta['origin_id'] = self.extract_walmart_id(response.url)

        #self.target_site = product_brand_extracted
        #TODO: should this be here??
        target_site = product_brand_extracted

        # print "SENDING REQUEST FOR ", product_name, response.url

        yield request
Example no. 11
0
    def parse(self, response):

        if self.product_name:

            # can only use this option if self.target_site has been initialized (usually true for spiders for retailer sites, not true for manufacturers' sites)
            if not self.target_site:
                self.log(
                    "You can't use the product_name option without setting the target site to search on\n",
                    level=log.ERROR)
                raise CloseSpider(
                    "\nYou can't use the product_name option without setting the target site to search on\n"
                )

            search_query = self.build_search_query(self.product_name)
            search_pages = self.build_search_pages(search_query)

            request = Request(search_pages[self.target_site],
                              callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request.meta['origin_name'] = self.product_name
            request.meta['query'] = search_query

            # just use empty product model and url, for compatibility, also pending_requests
            request.meta['origin_model'] = ''
            request.meta['origin_url'] = ''
            request.meta['pending_requests'] = []

            yield request

        # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
        product_urls = []
        # if we have a single product URL, create a list of URLs containing it
        if self.product_url:
            product_urls.append(self.product_url)

        # if we have a file with a list of URLs, create a list with URLs found there
        if self.product_urls_file:
            f = open(self.product_urls_file, "r")
            for line in f:
                product_urls.append(line.strip())
            f.close()

        for product_url in product_urls:
            # extract site domain

            # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
            # origin_site = ""
            # if m:
            # 	origin_site = m.group(1)
            # else:
            # 	sys.stderr.write('Can\'t extract domain from URL.\n')
            origin_site = Utils.extract_domain(product_url)

            request = Request(product_url, callback=self.parseURL)
            request.meta['origin_site'] = origin_site
            if origin_site == 'staples':
                zipcode = "12345"
                request.cookies = {"zipcode": zipcode}
                request.meta['dont_redirect'] = True
            yield request

        # if we have a file with Walmart ids, create a list of the ids there
        if self.walmart_ids_file:
            walmart_ids = []
            f = open(self.walmart_ids_file, "r")
            for line in f:
                if "," in line:
                    id_string = line.strip().split(",")[0]
                else:
                    id_string = line.strip()
                if re.match("[0-9]+", id_string):
                    walmart_ids.append(id_string)
            f.close()

            self.by_id = True

            for walmart_id in walmart_ids:
                # create Walmart URLs based on these IDs
                walmart_url = Utils.add_domain(walmart_id,
                                               "http://www.walmart.com/ip/")
                request = Request(walmart_url, callback=self.parseURL)
                #request.meta['origin_site'] = 'walmart'
                yield request
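
Several of these snippets attach cookies either by assigning request.cookies after construction (as above for the staples zipcode) or by passing them to the Request constructor; both routes are picked up by Scrapy's cookie handling when the request is scheduled. A short usage sketch (the URL and zipcode value are placeholders):

    from scrapy import Request

    # Passing cookies at construction time...
    req = Request('http://www.example.com/', cookies={'zipcode': '12345'})

    # ...or assigning them afterwards, as the spiders above do.
    req2 = Request('http://www.example.com/')
    req2.cookies = {'zipcode': '12345'}
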
Example no. 12
0
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s' % (
                item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None
Example no. 13
0
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        # print "PARSE AMAZON FOR", response.meta['origin_url'], "RESULTS PAGE", response.url

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # add product URLs to be parsed to this list
        if 'search_results' not in response.meta:
            product_urls = set()
        else:
            product_urls = response.meta['search_results']

        # get search results for received results page and add them to product_urls to be parsed
        results = hxs.select("//h3[@class='newaps']/a")
        for result in results:
            product_url = result.select("@href").extract()[0]

            # remove the part after "/ref" containing details about the search query
            m = re.match("(.*)/ref=(.*)", product_url)
            if m:
                product_url = m.group(1)

            product_url = Utils.add_domain(product_url,
                                           "http://www.amazon.com")

            product_urls.add(product_url)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request further to parse product pages only if we gathered all the product URLs from all the queries
        # (there are no more pending requests)
        # otherwise send them back to parseResults and wait for the next query, save all product URLs in search_results
        # this way we avoid duplicates
        if product_urls and ('pending_requests' not in response.meta
                             or not response.meta['pending_requests']):
            request = Request(product_urls.pop(),
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            if self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
            request.meta['items'] = items

            # this will be the new product_urls list with the first item popped
            request.meta['search_results'] = product_urls

            return request

        # if there were no results, the request will never get back to reduceResults
        # so send it from here so it can parse the next queries
        # add to the response the URLs of the products to crawl we have so far, items (handles case when it was not created yet)
        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        else:
            response.meta['items'] = items
            response.meta['parsed'] = True
            response.meta['search_results'] = product_urls
            # only send the response we have as an argument, no need to make a new request

            # print "RETURNING TO REDUCE RESULTS", response.meta['origin_url']
            return self.reduceResults(response)
Example no. 14
0
    def parse_product_amazon(self, response):

        # print "PARSE AMAZON PRODUCT FOR", response.meta['origin_url'], response.url

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # if 'origin_id' in response.meta:
        # 	item['origin_id'] = response.meta['origin_id']
        # 	assert self.by_id
        # else:
        # 	assert not self.by_id

        # extract product name
        #TODO: id='title' doesn't work for all, should I use a 'contains' or something?
        # extract titles that are not empty (ignoring whitespace)
        # eliminate "Amazon Prime Free Trial"

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name = filter(
            lambda x: not x.startswith("Amazon Prime"),
            hxs.select("//h1//text()[normalize-space()!='']").extract())
        if not product_name:
            # print "NO PRODUCT NAME FOR", response.url
            self.log("Error: No product name: " + str(response.url) +
                     " for walmart product " + origin_url,
                     level=log.ERROR)

            # assume there is a captcha to crack
            # check if there is a form on the page - that means it's probably the captcha form
            forms = hxs.select("//form")
            if forms:

                # solve captcha
                captcha_text = None
                image = hxs.select(".//img/@src").extract()
                if image:
                    captcha_text = self.CB.solve_captcha(image[0])

                # value to use if there was an exception
                if not captcha_text:
                    captcha_text = ''

                # create a FormRequest to this same URL, with everything needed in meta
                # items, cookies and search_urls not changed from previous response so no need to set them again

                # redo the entire request (no items will be lost)
                return [
                    FormRequest.from_response(
                        response,
                        callback=self.parse_product_amazon,
                        formdata={'field-keywords': captcha_text},
                        meta=response.meta)
                ]

        else:
            item['product_name'] = product_name[0].strip()

            # extract product model number
            model_number_holder = hxs.select(
                "//tr[@class='item-model-number']/td[@class='value']/text() | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text()"
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
             "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            # add result to items
            items.add(item)

        # print "STILL IN parse_product FOR", response.url

        product_urls = response.meta['search_results']

        # try to send request to parse next product, try until url for next product url is valid (response not 404)
        # this is needed because if next product url is not valid, this request will not be sent and all info about this match (stored in request meta) will be lost

        # find first valid next product url
        next_product_url = None
        if product_urls:
            next_product_url = product_urls.pop()
        while (product_urls and not self.is_valid_url(next_product_url)):
            # print "404 FROM", next_product_url
            next_product_url = product_urls.pop()

        # handle corner case of bad next product url
        if not product_urls and next_product_url and not self.is_valid_url(
                next_product_url):
            next_product_url = None

        # if a next product url was found, send new request back to parse_product_url
        if next_product_url:
            request = Request(next_product_url,
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            if self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO parse_product FOR", response.meta['origin_url'], response.url, "NEXT IS", next_product_url
            respcode = urllib.urlopen(next_product_url)

            return request

        # if no next valid product url was found
        else:
            # we are done, send the response back to reduceResults (no need to make a new request)
            # add as meta newly added items
            # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
            # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO reduce_results FOR", response.meta['origin_url'], response.url

            return self.reduceResults(response)
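
The product-page loop above leans on a self.is_valid_url helper (not included) plus a blocking urllib.urlopen probe before requesting the next URL. A minimal, hypothetical sketch of such a validity check using only the Python 3 standard library (the names and the 404-only criterion are assumptions):

    import urllib.error
    import urllib.request

    # Hypothetical sketch of an is_valid_url helper: issue a HEAD request and treat
    # anything other than a 404 as a usable product page.
    def is_valid_url(url, timeout=10):
        head = urllib.request.Request(url, method='HEAD')
        try:
            with urllib.request.urlopen(head, timeout=timeout) as response:
                return response.status != 404
        except urllib.error.HTTPError as err:
            return err.code != 404
        except urllib.error.URLError:
            # DNS failures, refused connections, timeouts, etc.
            return False
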