Example #1
    def build_search_query(self, product_name):
        # join the normalized (lowercased) words with "+" instead of spaces
        search_query = "+".join(
            ProcessText.normalize(product_name,
                                  stem=False,
                                  exclude_stopwords=False))
        return search_query
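
A minimal usage sketch (not part of the original spider): assuming ProcessText.normalize lowercases and tokenizes the product name, build_search_query turns it into a "+"-joined query string. The stand-in normalize below is hypothetical; the real helper may stem or drop stopwords differently.

def normalize(name, stem=False, exclude_stopwords=False):
    # hypothetical stand-in for ProcessText.normalize: lowercase, split on
    # whitespace, optionally drop a few stopwords
    stopwords = {"the", "a", "an", "of", "with"}
    tokens = [w.lower() for w in name.split()]
    if exclude_stopwords:
        tokens = [w for w in tokens if w not in stopwords]
    return tokens

print "+".join(normalize("Samsung Galaxy S5 16GB"))
# -> samsung+galaxy+s5+16gb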
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']

        # all product items from all queries (currently unused in this method)
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])

        result_items = self.extract_result_products(response)
        for item in result_items:

            for field in self.results[origin_product_id][
                    'origin_product'].keys():
                item[field] = self.results[origin_product_id][
                    'origin_product'][field]

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            # add result to items
            self.results[origin_product_id]['search_requests'][current_query][
                'product_items'].append(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
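
The sum(map(...), []) call above flattens the per-query 'product_items' lists into one flat list. A toy illustration of the idiom with a stand-in for the search_requests dict; the equivalent list comprehension is usually clearer:

# stand-in for self.results[origin_product_id]['search_requests']
search_requests = {
    "query1": {"product_items": ["itemA", "itemB"]},
    "query2": {"product_items": ["itemC"]},
}

# sum(..., []) concatenates the per-query lists into a single list
items = sum(map(lambda q: search_requests[q]['product_items'],
                search_requests), [])

# equivalent list comprehension
items2 = [i for q in search_requests
          for i in search_requests[q]['product_items']]

assert sorted(items) == sorted(items2)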
Example #3
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//ul[@class='products']//div[@class='product ']//h3//a")
        for result in results:
            item = SearchItem()

            product_url = result.select("@href").extract()[0] if result.select(
                "@href") else None
            product_name = result.select(
                "@title").extract()[0] if result.select("@title") else None

            # assert name is not abbreviated
            # empirically, this only seems to produce false positives, so removed
            # assert '...' not in product_name

            # skip this result if there is no product name
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(product_url,
                                                       self.base_url)

                item['product_name'] = product_name
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
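
Utils.add_domain turns the relative product links found on results pages into absolute URLs. A hypothetical stand-in built on urlparse.urljoin (the project's real helper may differ in details):

import urlparse

def add_domain(url, base_url):
    # join a possibly relative product URL with the site's base URL;
    # absolute URLs pass through unchanged
    return urlparse.urljoin(base_url, url)

print add_domain("/product/123", "http://www.example.com")
# -> http://www.example.com/product/123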
Example #4
    def parse_product_amazon(self, response):

        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']
        origin_url = self.results[origin_product_id]['origin_product'][
            'origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        for field in self.results[origin_product_id]['origin_product'].keys():
            item[field] = self.results[origin_product_id]['origin_product'][
                field]

        # all product items from all queries
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name_node = hxs.select(
            '//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
        product_name = None
        if not product_name_node:
            product_name_node = hxs.select(
                '//h1[@id="aiv-content-title"]//text()').extract()
        if not product_name_node:
            product_name_node = hxs.select(
                '//div[@id="title_feature_div"]/h1//text()').extract()

        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            # needs special treatment
            product_name_node = hxs.select(
                '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()'
            ).extract()
            if product_name_node:
                product_name = " ".join(product_name_node).strip()

        if not product_name:

            # log this error:
            # if the number of retries is not exhausted, it might just be a captcha page, not an insurmountable error
            if 'captcha_retries' in response.meta and response.meta[
                    'captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.WARNING)
            else:
                # if it comes from a solved captcha page, then it's an error if it's still not found
                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.ERROR)

                # try this: don't remove captcha_retries from meta, may cause infinite loops, works
                # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
                # del response.meta['captcha_retries']
            # if we have reached maximum number of retries, do nothing (item just won't be added to the "items" list)

            # if we haven't reached maximum retries, try again
            if response.meta.get('captcha_retries', 0) \
                    <= self.MAX_CAPTCHA_RETRIES:

                # assume there is a captcha to crack
                # check if there is a form on the page - that means it's probably the captcha form
                forms = hxs.select("//form")
                if forms:

                    # solve captcha
                    captcha_text = None
                    image = hxs.select(".//img/@src").extract()
                    if image:
                        captcha_text = self.CB.solve_captcha(image[0])

                    # value to use if there was an exception
                    if not captcha_text:
                        captcha_text = ''

                    # create a FormRequest to this same URL, with everything needed in meta
                    # items, cookies and search_urls not changed from previous response so no need to set them again

                    # redo the entire request (no items will be lost)
                    meta = response.meta
                    # flag indicating how many times we already retried to solve captcha
                    if 'captcha_retries' in meta:
                        meta['captcha_retries'] += 1
                    else:
                        meta['captcha_retries'] = 1
                    return [
                        FormRequest.from_response(
                            response,
                            callback=self.parse_product_amazon,
                            formdata={'field-keywords': captcha_text},
                            meta=meta)
                    ]

        else:
            item['product_name'] = product_name

            # extract product model number
            model_number_holder = hxs.select(
                """//tr[@class='item-model-number']/td[@class='value']/text() |
             //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() |
             //span/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/span[2]/text()"""
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            upc_node = hxs.select(
                "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()"
            ).extract()
            if upc_node:
                upc = upc_node[0].strip().split()
                item['product_upc'] = upc

            manufacturer_code_node = hxs.select(
                "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()"
            ).extract()
            if manufacturer_code_node:
                manufacturer_code = manufacturer_code_node[0].strip()
                item['manufacturer_code'] = manufacturer_code

            try:
                # for lowest level category:
                # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
                # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
                # "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

                # for top-level category:
                bestsellers_rank = hxs.select(
                    "//tr[@id='SalesRank']/td[@class='value']/text()" +
                    " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
                item['bestsellers_rank'] = int(
                    re.sub(",", "", "".join(bestsellers_rank[1:])))
            except Exception as e:
                if self.output == 6 or self.bestsellers_link:
                    self.log("Didn't find product rank: " + str(e) + " " +
                             response.url + "\n",
                             level=log.INFO)

            asin_node = hxs.select(
                "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()"
            ).extract()
            if asin_node:
                item['product_asin'] = asin_node[0].strip()

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
                "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)",
                             product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(2))
                    currency = m.group(1)
                    if currency != "$":
                        item[
                            'product_target_price'] = Utils.convert_to_dollars(
                                item['product_target_price'], currency)
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            try:
                item['product_category_tree'] = \
                    filter(None, map(lambda c: c.strip(), hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
            except:
                pass

            try:
                item['product_keywords'] = hxs.select(
                    "//meta[@name='keywords']/@content").extract()[0]
            except:
                pass

            try:
                product_image = hxs.select(
                    "//img[@id='landingImage']/@src").extract()[0]
                item['product_image_url'] = product_image
                item['product_image_encoded'] = ProcessText.encode_image(
                    product_image)
            except:
                pass

            # add result to items
            self.results[origin_product_id]['search_requests'][current_query][
                'product_items'].append(item)
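
The price extraction above strips thousands separators and then matches a currency symbol ($ or \xa3, the pound sign) followed by the amount. A quick sketch of that matching; Utils.convert_to_dollars is a project-specific helper and is not reproduced here:

import re

def parse_price(price_text):
    # remove commas separating orders of magnitude (e.g. "1,299.99")
    price_text = re.sub(",", "", price_text.strip())
    m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)", price_text)
    if not m:
        return None
    return m.group(1), float(m.group(2))

print parse_price("$1,299.99")  # -> ('$', 1299.99)
print parse_price("\xa349.50")  # -> ('\xa3', 49.5)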
Example #5
    def reduceResults(self, response):

        # print "IN REDUCE RESULTS"

        items = response.meta['items']
        #site = response.meta['origin_site']

        #TODO: do we still need this?
        if 'parsed' not in response.meta:

            # pass to specific parse results function (in derived class)
            return self.parseResults(response)

        else:
            del response.meta['parsed']

        ## print stuff
        self.log("PRODUCT: " + response.meta['origin_name'].encode("utf-8") +
                 " MODEL: " + response.meta['origin_model'].encode("utf-8"),
                 level=log.DEBUG)
        self.log("QUERY: " + response.meta['query'], level=log.DEBUG)
        self.log("MATCHES: ", level=log.DEBUG)
        for item in items:
            self.log(item['product_name'].encode("utf-8"), level=log.DEBUG)
        self.log('\n', level=log.DEBUG)

        # if there is a pending request (current request used product model, and pending request is to use product name),
        # continue with that one and send current results to it as metadata
        if 'pending_requests' in response.meta:
            # yield first request in queue and send the other ones as metadata
            pending_requests = response.meta['pending_requests']

            if pending_requests:
                # print "PENDING REQUESTS FOR", response.meta['origin_url'], response.meta['origin_name']
                request = pending_requests[0]

                # update pending requests
                request.meta['pending_requests'] = pending_requests[1:]

                request.meta['items'] = items

                #request.meta['origin_site'] = response.meta['origin_site']
                # product page from source site
                request.meta['origin_url'] = response.meta['origin_url']
                request.meta['origin_name'] = response.meta['origin_name']
                request.meta['origin_model'] = response.meta['origin_model']
                if 'origin_price' in response.meta:
                    request.meta['origin_price'] = response.meta[
                        'origin_price']
                request.meta['origin_brand_extracted'] = response.meta[
                    'origin_brand_extracted']
                if 'threshold' in response.meta:
                    request.meta['threshold'] = response.meta['threshold']

                # if 'origin_id' in response.meta:
                # 	request.meta['origin_id'] = response.meta['origin_id']
                # 	assert self.by_id
                # else:
                # 	assert not self.by_id

                # used for result product URLs
                if 'search_results' in response.meta:
                    request.meta['search_results'] = response.meta[
                        'search_results']

                return request

            # if there are no more pending requests, use cumulated items to find best match and send it as a result
            else:

                # print "DONE FOR ", response.meta['origin_url'], response.meta['origin_name']

                best_match = None

                if items:

                    # from all results, select the product whose name is most similar with the original product's name
                    # if there was a specific threshold set in request, use that, otherwise, use the class variable
                    if 'threshold' in response.meta:
                        threshold = response.meta['threshold']
                    else:
                        threshold = self.threshold

                    if 'origin_price' in response.meta:
                        product_price = response.meta['origin_price']
                        ## print "PRICE:", product_price
                    else:
                        product_price = None
                        ## print "NO PRICE"
                    best_match = ProcessText.similar(
                        response.meta['origin_name'],
                        response.meta['origin_model'], product_price, items,
                        threshold)

                    # #self.log( "ALL MATCHES: ", level=log.WARNING)
                    # for item in items:
                    # 	## print item['product_name'].encode("utf-8")
                    # ## print '\n'

                self.log("FINAL: " + str(best_match), level=log.WARNING)
                self.log("\n----------------------------------------------\n",
                         level=log.WARNING)

                if not best_match:
                    # if there are no results but the option was to include original product URL, create an item with just that
                    # output item if match not found for either output type
                    #if self.output == 2:
                    item = SearchItem()
                    #item['origin_site'] = site

                    item['origin_url'] = response.meta['origin_url']
                    item['origin_name'] = response.meta['origin_name']
                    if 'origin_model' in response.meta:
                        item['origin_model'] = response.meta['origin_model']

                    # if 'origin_id' in response.meta:
                    # 	item['origin_id'] = response.meta['origin_id']
                    # 	assert self.by_id
                    # else:
                    # 	assert not self.by_id
                    return [item]

                return best_match

        else:
            # output item if match not found
            item = SearchItem()
            #item['origin_site'] = site

            # print "DONE FOR ", response.meta['origin_name']

            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            # if 'origin_id' in response.meta:
            # 	item['origin_id'] = response.meta['origin_id']
            # 	assert self.by_id
            # else:
            # 	assert not self.by_id

            #TODO: uncomment below - it should not have been in if/else branch!

            self.log("FINAL: " + str(item), level=log.WARNING)
            self.log("\n----------------------------------------------\n",
                     level=log.WARNING)

            return [item]
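
reduceResults is the reduce step of a small map/reduce loop over search queries: while pending_requests is non-empty it forwards the accumulated items to the next request, and once the queue is drained it picks the best match. A minimal, Scrapy-free sketch of that control flow (the callables stand in for the queued search requests; the real version chains requests through callbacks instead of looping):

def reduce_results(pending_requests, items, pick_best):
    # pop one pending "request" at a time; when none remain, reduce
    while pending_requests:
        query = pending_requests.pop(0)
        items.extend(query())          # the parseResults step
    return pick_best(items)            # the final reduce step

queries = [lambda: ["match by model"], lambda: ["match by name"]]
print reduce_results(queries, [], lambda found: found[0] if found else None)
# -> match by model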
Example #6
    def parseURL(self, response):

        site = response.meta['origin_site']
        hxs = HtmlXPathSelector(response)

        product_model = ""

        product_brand = ""
        product_price = ""

        #############################################################
        # Extract product attributes (differently depending on site)

        if site == 'staples':

            product_name = hxs.select("//h1/text()").extract()[0]

            model_nodes = hxs.select(
                "//p[@class='itemModel']/text()").extract()
            if model_nodes:
                model_node = model_nodes[0]

                # note: flags must be passed by keyword (the 4th positional arg of re.sub is count)
                model_node = re.sub("\W", " ", model_node, flags=re.UNICODE)
                m = re.match("(.*)Model:(.*)", model_node.encode("utf-8"),
                             re.UNICODE)

                if m:
                    product_model = m.group(2).strip()

        elif site == 'walmart':
            product_name_holder = hxs.select(
                "//h1[@class='productTitle']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()

                # get integer part of product price
                product_price_big = hxs.select(
                    "//span[@class='bigPriceText1']/text()").extract()

                if not product_price_big:
                    self.log("Didn't find product price: " + response.url +
                             "\n",
                             level=log.DEBUG)
                # if there is a range of prices take their average
                if len(product_price_big) > 1:

                    # remove $ and .
                    product_price_min = re.sub("[\$\.,]", "",
                                               product_price_big[0])
                    product_price_max = re.sub("[\$\.,]", "",
                                               product_price_big[-1])

                    #TODO: check if they're ints?
                    product_price_big = (int(product_price_min) +
                                         int(product_price_max)) / 2.0

                elif product_price_big:
                    product_price_big = int(
                        re.sub("[\$\.,]", "", product_price_big[0]))

                # get fractional part of price
                #TODO - not that important

                if product_price_big:
                    product_price = product_price_big

            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                # return the item as a non-matched item
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                # remove unnecessary parameters
                m = re.match("(.*)\?enlargedSearch.*", item['origin_url'])
                if m:
                    item['origin_url'] = m.group(1)
                #item['origin_id'] = self.extract_walmart_id(item['origin_url'])
                if self.name != 'manufacturer':
                    # don't return empty matches in manufacturer spider
                    yield item
                return

            #TODO: if it contains 2 words, first could be brand - also add it in similar_names function
            product_model_holder = hxs.select(
                "//td[contains(text(),'Model')]/following-sibling::*/text()"
            ).extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        #TODO: for the sites below, complete with missing logic, for not returning empty elements in manufacturer spider
        elif site == 'newegg':
            product_name_holder = hxs.select(
                "//span[@itemprop='name']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()
            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                yield item
                return
            product_model_holder = hxs.select(
                "//dt[text()='Model']/following-sibling::*/text()").extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        else:
            raise CloseSpider("Unsupported site: " + site)

        if site == 'staples':
            zipcode = "12345"
            cookies = {"zipcode": zipcode}
        else:
            cookies = {}

        #######################################################################
        # Create search queries to the second site, based on product attributes

        request = None

        #TODO: search by alternative model numbers?

        #TODO: search by model number extracted from product name? Don't I do that implicitly? no, but in combinations

        # if there is no product model, try to extract it
        if not product_model:
            product_model = ProcessText.extract_model_from_name(product_name)

            # for logging purposes, set this back to the empty string if it wasn't found (so was None)
            if not product_model:
                product_model = ""

            # product_model_index = ProcessText.extract_model_nr_index(product_name)
            # if product_model_index >= 0:
            # 	product_model = product_name[product_model_index]

            ## print "MODEL EXTRACTED: ", product_model, " FROM NAME ", product_name

        # if there is no product brand, get first word in name, assume it's the brand
        product_brand_extracted = ""
        #product_name_tokenized = ProcessText.normalize(product_name)
        product_name_tokenized = [
            word.lower() for word in product_name.split(" ")
        ]
        #TODO: maybe extract brand as word after 'by', if 'by' is somewhere in the product name
        if len(product_name_tokenized) > 0 and re.match(
                "[a-z]+", product_name_tokenized[0]):
            product_brand_extracted = product_name_tokenized[0].lower()

        # if we are in manufacturer spider, set target_site to manufacturer site

        # for manufacturer spider set target_site of request to brand extracted from name for this particular product
        if self.name == 'manufacturer':

            #TODO: restore commented code; if brand not found, try to search for it on every manufacturer site (build queries fo every supported site)
            # hardcode target site to sony
            #self.target_site = 'sony'
            #self.target_site = product_brand_extracted

            #target_site = product_brand_extracted

            # can only go on if site is supported
            # (use dummy query)
            #if target_site not in self.build_search_pages("").keys():
            if product_brand_extracted not in self.build_search_pages(
                    "").keys():

                product_brands_extracted = set(
                    self.build_search_pages("").keys()).intersection(
                        set(product_name_tokenized))

                if product_brands_extracted:
                    product_brand_extracted = product_brands_extracted.pop()
                    #target_site = product_brand_extracted
                else:
                    # give up and return item without match
                    self.log(
                        "Manufacturer site not supported (" +
                        product_brand_extracted +
                        ") or not able to extract brand from product name (" +
                        product_name + ")\n",
                        level=log.ERROR)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

            # if specific site is not set, search on manufacturer site as extracted from name
            if not self.manufacturer_site:
                target_site = product_brand_extracted
            else:
                # if it's set, continue only if it matches extracted brand
                if self.manufacturer_site != product_brand_extracted:
                    self.log(
                        "Will abort matching for product, extracted brand does not match specified manufacturer option ("
                        + product_brand_extracted + ")\n",
                        level=log.INFO)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

                else:
                    target_site = product_brand_extracted

                    # # try to match it without specific site (manufacturer spider will try to search on all manufacturer sites)
                    # target_site = None

        # for other (site specific) spiders, set target_site of request to class variable self.target_site set in class "constructor" (init_sub)
        else:
            target_site = self.target_site

        # 1) Search by model number
        if product_model:

            #TODO: model was extracted with ProcessText.extract_model_from_name(), without lowercasing, should I lowercase before adding it to query?
            query1 = self.build_search_query(product_model)
            search_pages1 = self.build_search_pages(query1)
            #page1 = search_pages1[self.target_site]
            page1 = search_pages1[target_site]

            request1 = Request(page1, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request1.cookies = self.amazon_cookies
                request1.headers['Cookies'] = self.amazon_cookie_header
                #request1.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request1.meta['query'] = query1
            request1.meta['target_site'] = target_site

            request = request1

        # 2) Search by product full name
        query2 = self.build_search_query(product_name)
        search_pages2 = self.build_search_pages(query2)
        #page2 = search_pages2[self.target_site]
        page2 = search_pages2[target_site]
        request2 = Request(page2, callback=self.parseResults)

        # set cookies for amazon
        if (self.target_site == 'amazon' and self.cookies_file):
            request2.cookies = self.amazon_cookies
            request2.headers['Cookies'] = self.amazon_cookie_header
            #request2.meta['dont_merge_cookies'] = True

        request2.meta['query'] = query2
        request2.meta['target_site'] = target_site

        pending_requests = []

        if not request:
            request = request2
        else:
            pending_requests.append(request2)

        # 3) Search by combinations of words in product's name
        # create queries

        for words in ProcessText.words_combinations(product_name,
                                                    fast=self.fast):
            query3 = self.build_search_query(" ".join(words))
            search_pages3 = self.build_search_pages(query3)
            #page3 = search_pages3[self.target_site]
            page3 = search_pages3[target_site]
            request3 = Request(page3, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request3.cookies = self.amazon_cookies
                request3.headers['Cookies'] = self.amazon_cookie_header
                #request3.meta['dont_merge_cookies'] = True

            request3.meta['query'] = query3
            request3.meta['target_site'] = target_site

            pending_requests.append(request3)

        request.meta['pending_requests'] = pending_requests
        #request.meta['origin_site'] =
        # product page from source site
        #TODO: clean this URL? for walmart it added something with ?enlargedsearch=True
        request.meta['origin_url'] = response.url

        request.meta['origin_name'] = product_name
        request.meta['origin_model'] = product_model
        if product_price:
            request.meta['origin_price'] = product_price

        # origin product brand as extracted from name (basically the first word in the name)
        request.meta['origin_brand_extracted'] = product_brand_extracted

        # if self.by_id:
        # 	request.meta['origin_id'] = self.extract_walmart_id(response.url)

        #self.target_site = product_brand_extracted
        #TODO: should this be here??
        target_site = product_brand_extracted

        # print "SENDING REQUEST FOR ", product_name, response.url

        yield request
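
A sketch of the price arithmetic in the walmart branch above: the integer parts of the endpoint prices are averaged when a range is displayed. The input strings are hypothetical examples of bigPriceText1 contents:

import re

def walmart_price(price_texts):
    # strip "$", "." and "," and parse the remainder as an integer
    values = [int(re.sub("[\$\.,]", "", p)) for p in price_texts]
    if len(values) > 1:
        # a range of prices: take the average of its endpoints
        return (values[0] + values[-1]) / 2.0
    return values[0]

print walmart_price(["$199"])          # -> 199
print walmart_price(["$199", "$249"])  # -> 224.0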
Example #7
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        #results = hxs.select("//ul[@class='productsListView']/li")
        results = hxs.select("//li[contains(@class,'tile standard')]")
        for result in results:
            item = SearchItem()
            product_title_holder = result.select(
                ".//div[@class='tileInfo']/a[contains(@class,'productTitle')]")
            product_url = product_title_holder.select("@href").extract()
            product_name = product_title_holder.select("@title").extract()

            #print "ITEM", product_name

            # skip this result if there is no product name
            if product_name and product_url:
                # clean url
                m = re.match("(.*)#prodSlot*", url.extract(), product_url[0])
                if m:
                    item['product_url'] = m.group(1)
                else:
                    item['product_url'] = product_url[0]
                item['product_name'] = product_name[0]
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            # extract price
            #! extracting regular price and not discount price when discounts available?
            price_holder = result.select(
                ".//p[@class='regularprice-label']//text()[contains(.,'$')]"
            ).extract()

            # second attempt at finding price
            if not price_holder:
                price_holder = result.select(
                    ".//*[contains(@class, 'price price-label')]/text()[contains(.,'$')]"
                ).extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                # if more than one match, it will get the first one
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.DEBUG)

            # extract product brand
            brand_holder = product_title_holder.select(
                "parent::node()//a[contains(@class,'productBrand')]/a/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
                self.log("Extracted brand: " + item['product_brand'] +
                         " from results page: " + str(response.url),
                         level=log.DEBUG)

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
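
The URL cleaning in this spider drops the "#prodSlot..." fragment that results pages append to product links. A short demonstration of the (corrected) regex, with a made-up URL:

import re

def clean_product_url(url):
    # drop the "#prodSlot..." fragment appended on results pages
    m = re.match("(.*)#prodSlot.*", url)
    return m.group(1) if m else url

print clean_product_url("http://www.example.com/p/item/-/A-123#prodSlot=medium_1_1")
# -> http://www.example.com/p/item/-/A-123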
Example #8
    def parse_product_target(self, response):

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product name

        #TODO: is this general enough?
        product_name = hxs.select(
            "//h2[@class='product-name item']/span[@itemprop='name']/text()"
        ).extract()

        # if you can't find product name in product page, use the one extracted from results page
        if not product_name:
            item['product_name'] = response.meta['product_name']
            self.log(
                "Error: product name not found on product page, extracted from results page: "
                + item['product_name'] + " " + origin_url,
                level=log.INFO)
        else:
            item['product_name'] = product_name[0].strip()

        if not item['product_name']:
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + origin_url,
                     level=log.INFO)

        else:
            #TODO: no model number field?
            model_number_holder = None
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                #print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            #TODO: no brand field?

            # extract price
            #! extracting list price and not discount price when discounts available?
            #TODO: complete this with other types of pages
            price_holder = hxs.select(
                "//span[@class='offerPrice']/text()").extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    sys.stderr.write("Didn't match product price: " +
                                     product_target_price + " " +
                                     response.url + "\n")

            else:
                sys.stderr.write("Didn't find product price: " + response.url +
                                 "\n")

            # add result to items
            items.add(item)

        # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
        product_urls_and_names = response.meta['search_results']

        if product_urls_and_names:
            product_url_and_name = product_urls_and_names.pop()
            request = Request(product_url_and_name[0],
                              callback=self.parse_product_target,
                              meta=response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)

            # send product name with request as well
            request.meta['product_name'] = product_url_and_name[1]
            request.meta['search_results'] = product_urls_and_names

            return request
        else:
            # otherwise, we are done, send the response back to reduceResults (no need to make a new request)
            # add the newly added items as meta
            # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
            # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
    def parse_product_amazon(self, response):

        # print "PARSE AMAZON PRODUCT FOR", response.meta['origin_url'], response.url

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # if 'origin_id' in response.meta:
        # 	item['origin_id'] = response.meta['origin_id']
        # 	assert self.by_id
        # else:
        # 	assert not self.by_id

        # extract product name
        #TODO: id='title' doesn't work for all, should I use a 'contains' or something?
        # extract titles that are not empty (ignoring whitespace)
        # eliminate "Amazon Prime Free Trial"

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name = filter(
            lambda x: not x.startswith("Amazon Prime"),
            hxs.select("//h1//text()[normalize-space()!='']").extract())
        if not product_name:
            # print "NO PRODUCT NAME FOR", response.url
            self.log("Error: No product name: " + str(response.url) +
                     " for walmart product " + origin_url,
                     level=log.ERROR)

            # assume there is a captcha to crack
            # check if there is a form on the page - that means it's probably the captcha form
            forms = hxs.select("//form")
            if forms:

                # solve captcha
                captcha_text = None
                image = hxs.select(".//img/@src").extract()
                if image:
                    captcha_text = self.CB.solve_captcha(image[0])

                # value to use if there was an exception
                if not captcha_text:
                    captcha_text = ''

                # create a FormRequest to this same URL, with everything needed in meta
                # items, cookies and search_urls not changed from previous response so no need to set them again

                # redo the entire request (no items will be lost)
                return [
                    FormRequest.from_response(
                        response,
                        callback=self.parse_product_amazon,
                        formdata={'field-keywords': captcha_text},
                        meta=response.meta)
                ]

        else:
            item['product_name'] = product_name[0].strip()

            # extract product model number
            model_number_holder = hxs.select(
                "//tr[@class='item-model-number']/td[@class='value']/text() | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text()"
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
             "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            # add result to items
            items.add(item)

        # print "STILL IN parse_product FOR", response.url

        product_urls = response.meta['search_results']

        # try to send a request to parse the next product; keep trying until the next product url is valid (response not 404)
        # this is needed because if the next product url is not valid, the request will not be sent and all info about this match (stored in request meta) will be lost

        # find first valid next product url
        next_product_url = None
        if product_urls:
            next_product_url = product_urls.pop()
        while (product_urls and not self.is_valid_url(next_product_url)):
            # print "404 FROM", next_product_url
            next_product_url = product_urls.pop()

        # handle corner case of bad next product url
        if not product_urls and next_product_url and not self.is_valid_url(
                next_product_url):
            next_product_url = None

        # if a next product url was found, send new request back to parse_product_url
        if next_product_url:
            request = Request(next_product_url,
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            if self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO parse_product FOR", response.meta['origin_url'], response.url, "NEXT IS", next_product_url

            return request

        # if no next valid product url was found
        else:
            # we are done, send the response back to reduceResults (no need to make a new request)
            # add the newly added items as meta
            # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
            # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO reduce_results FOR", response.meta['origin_url'], response.url

            return self.reduceResults(response)
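
The captcha handling above re-submits the same page as a FormRequest carrying the solved text; Example #4 additionally bounds the retries with a captcha_retries counter kept in request meta. A minimal sketch of that bookkeeping (solve stands in for CB.solve_captcha):

MAX_CAPTCHA_RETRIES = 3

def captcha_formdata(meta, solve):
    # increment the retry counter kept in request meta; give up past the limit
    meta['captcha_retries'] = meta.get('captcha_retries', 0) + 1
    if meta['captcha_retries'] > MAX_CAPTCHA_RETRIES:
        return None                      # retries exhausted, drop the item
    return {'field-keywords': solve()}   # formdata for the FormRequest

meta = {}
for _ in range(5):
    print captcha_formdata(meta, lambda: "XKCD42")
# prints the formdata dict three times, then None twice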
Example #10
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//h3[@class='productTitle']/a")
        for result in results:
            item = SearchItem()
            product_url = result.select("@href").extract()[0]
            # extract all text in <a> (contains product name inside <strong>, and size(ml) directly in text())

            # node containing full product name if the displayed one is abbreviated. use this one if exists, and displayed one if it doesn't
            product_name_node = result.select("strong/abbr/@title")
            product_name = (product_name_node.extract()[0]
                            if product_name_node
                            else result.select("strong/text()").extract()[0])
            # assert name is not abbreviated
            assert '...' not in product_name
            # add product quantity
            product_quantity_node = result.select(
                "text()[normalize-space()!='']")
            product_quantity = (product_quantity_node.extract()[0].strip()
                                if product_quantity_node else "")
            product_name_full = product_name + " " + product_quantity

            #print "ITEM", product_name

            # skip this result if there is no product name
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(
                    Utils.clean_url(product_url), self.base_url)

                item['product_name'] = product_name_full
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
    def extract_product_data(self, response, item):
        hxs = HtmlXPathSelector(response)

        #TODO: is this general enough?
        product_name = hxs.select(
            "//h2[@class='product-name item']/span[@itemprop='name']/text()"
        ).extract()

        # if you can't find product name in product page, use the one extracted from results page
        if not product_name:
            # item['product_name'] = response.meta['product_name']
            self.log("Error: product name not found on product page: " +
                     str(response.url),
                     level=log.INFO)
        else:
            item['product_name'] = product_name[0].strip()

        if 'product_name' not in item or not item['product_name']:
            self.log("Error: No product name: " + str(response.url),
                     level=log.INFO)

        else:
            # consider DPCI as model number
            # TODO: not sure if the best approach, maybe in the future add separate field "DPCI"
            # TODO: may make things worse where there is also an actual model number in the name?

            DPCI_holder = hxs.select(
                "//li[contains(strong/text(), 'DPCI')]/text()").re("[0-9\-]+")
            # try hidden tag
            if not DPCI_holder:
                DPCI_holder = hxs.select(
                    "//input[@id='dpciHidden']/@value").extract()

            if DPCI_holder:
                item['product_upc'] = [DPCI_holder[0].strip()]
            # if no product model explicitly on the page, try to extract it from name

            # no model to extract directly from page for target
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted
            #print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            #TODO: no brand field?

            # extract price
            #! extracting list price and not discount price when discounts available?
            #TODO: complete this with other types of pages
            price_holder = hxs.select(
                "//span[@class='offerPrice']/text()").extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    sys.stderr.write("Didn't match product price: " +
                                     product_target_price + " " +
                                     response.url + "\n")

            else:
                sys.stderr.write("Didn't find product price: " + response.url +
                                 "\n")

            return item
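
The DPCI lookup above applies a digits-and-dashes regex to the spec-list text (Scrapy's .re() runs re.findall over the selected text). A short sketch with a hypothetical snippet of page text:

import re

# hypothetical text of the spec-list <li> that holds the DPCI
li_text = ": 058-34-0436"

DPCI_holder = re.findall("[0-9\-]+", li_text)
if DPCI_holder:
    print DPCI_holder[0].strip()  # -> 058-34-0436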