Ejemplo n.º 1
0
    def extract_product_data(self, response, item):
        """Populate ``item`` with the product name and price found in ``response``.

        The name is read from the ``product_name product_title`` heading.
        The price is the first text node of a ``price`` / ``pricestring``
        div; a leading "from " is dropped, and the amount is passed to
        ``Utils.convert_to_dollars`` together with the matched currency
        symbol.

        Returns the updated ``item``, or ``None`` when the page has no
        product name (such products are ignored).  A missing or unparseable
        price is only logged: the item is still returned, without
        ``product_target_price``.
        """
        hxs = HtmlXPathSelector(response)

        try:
            item['product_name'] = hxs.select(
                "//h2[@class='product_name product_title']/span[@itemprop='name']/text()"
            ).extract()[0]
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + item['origin_url'],
                     level=log.INFO)
            # ignore products with no name
            return

        price_nodes = hxs.select(
            "//div[@class='price']/text() | //div[@class='pricestring']/text()"
        ).extract()

        if not price_nodes:
            # Indexing an empty result would raise IndexError and abort the
            # whole callback; a product without a price is still usable.
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)
            return item

        price = price_nodes[0].strip()

        # prices may be shown as "from <amount>"
        if price.startswith("from "):
            price = price[5:].strip()

        m = re.match("(\xa3|\$)([0-9]+\.?[0-9]*)", price)
        if not m:
            self.log("Didn't match product price: " + price + " " +
                     response.url + "\n",
                     level=log.WARNING)
        else:
            price_amount = m.group(2)
            price_currency = m.group(1)
            price_value = Utils.convert_to_dollars(float(price_amount),
                                                   price_currency)
            item['product_target_price'] = price_value

        return item
Ejemplo n.º 2
0
    def extract_result_products(self, response):
        """Build one ``SearchItem`` per search-result entry found in ``response``.

        Every ``innerWrapper`` div is treated as one result.  Results that
        lack a name or a link are logged and skipped.  The price is optional:
        when present and parseable it is stored, converted through
        ``Utils.convert_to_dollars``, in ``product_target_price``.

        Returns the list of items that were built.
        """
        selector = HtmlXPathSelector(response)
        collected = []

        for wrapper in selector.select("//div[@class='innerWrapper']"):
            entry = SearchItem()
            names = wrapper.select(
                ".//div[@class='shortDescription']/a/text()").extract()
            links = wrapper.select(
                ".//div[@class='shortDescription']/a/@href").extract()

            # a result is only usable when it has both a name and a link
            if not (names and links):
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            entry['product_url'] = "http://www1.macys.com" + links[0]
            entry['product_name'] = names[0].strip()

            # NOTE(review): this takes the first price span; when a discount
            # is shown that may be the regular price -- confirm.
            prices = wrapper.select(
                "div[@class='prices']/span/text()").extract()

            if not prices:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.DEBUG)
            else:
                # drop thousands separators (e.g. 2,000) before matching
                price_text = prices[0].strip().replace(",", "")
                # optional leading label, then currency symbol and amount;
                # only the first occurrence is considered
                parsed = re.match(
                    "([a-zA-Z\.\s]+)?(\xa3|\$)([0-9]+\.?[0-9]*)", price_text)
                if parsed:
                    amount = float(parsed.group(3))
                    symbol = parsed.group(2)
                    entry['product_target_price'] = Utils.convert_to_dollars(
                        amount, symbol)
                else:
                    self.log("Didn't match product price: " +
                             price_text + " " + response.url + "\n",
                             level=log.WARNING)

            collected.append(entry)

        return collected
Ejemplo n.º 3
0
    def parse_product_maplin(self, response):
        """Build a ``SearchItem`` for the product page in ``response``.

        Origin information (url, name and, when present, model / upc / brand)
        is copied from ``response.meta``.  The product name comes from the
        ``h1[@itemprop='name']`` heading and the price from the
        ``meta[@itemprop='price']`` content attribute; the price is converted
        with ``Utils.convert_to_dollars`` using the pound symbol.

        Returns ``None`` early when no product name is found.

        NOTE(review): the visible code reads ``items`` from the meta but never
        uses it and does not return the item -- the snippet looks truncated;
        confirm against the full source.
        """

        hxs = HtmlXPathSelector(response)

        # collection of results carried along in the request meta
        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        # optional origin fields are copied only when the meta carries them
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']
        if 'origin_upc' in response.meta:
            item['origin_upc'] = response.meta['origin_upc']
        if 'origin_brand' in response.meta:
            item['origin_brand'] = response.meta['origin_brand']


        product_name_node = hxs.select("//h1[@itemprop='name']/text()").extract()
        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
            # TODO: is returning nothing here OK? Probably yes.
            return

        item['product_name'] = product_name

        # extract product model number
        # TODO: no model?
        # TODO: no upc?
        # TODO: no brand?
        # TODO: add code extraction
        
        # extract price
        price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
        # no fallback selectors are tried here; a missing price is silently skipped
        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",","",product_target_price)
            try:
                product_target_price = float(product_target_price)

                # convert to dollars (assume pounds)
                product_target_price = Utils.convert_to_dollars(product_target_price, u'\xa3')
                item['product_target_price'] = product_target_price
            except Exception, ex:
                # a price that cannot be parsed or converted is logged and skipped
                self.log("Couldn't convert product price: " + response.url + "\n", level=log.WARNING)
Ejemplo n.º 4
0
    def extract_product_data(self, response, item):
        """Fill ``item`` with name, price, model and brand scraped from ``response``.

        The name (``h1.product-title``) is mandatory: without it the problem
        is logged and ``None`` is returned.  Price, model and brand are
        best-effort; any failure while reading them leaves the matching field
        unset.

        Returns the updated ``item``.
        """
        selector = HtmlXPathSelector(response)

        try:
            item['product_name'] = selector.xpath(
                "//h1[@class='product-title']/text()").extract()[0]
        except:
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + item['origin_url'],
                     level=log.INFO)
            # products without a name are ignored
            return

        price_block = selector.select("//p[@class='price']")

        if price_block:
            try:
                # amount and currency live in separate spans of the price block
                amount_text = price_block.select(
                    "span[@itemprop='price']/text()").extract()[0]
                currency_text = price_block.select(
                    "span[@class='smaller']/text()").extract()[0]

                # drop thousands separators before validating the number
                amount_text = amount_text.replace(",", "")

                amount_ok = re.match("[0-9]+\.?[0-9]*", amount_text)
                currency_ok = re.match("(\xa3)|(\$)", currency_text)
                if amount_ok and currency_ok:
                    item['product_target_price'] = Utils.convert_to_dollars(
                        float(amount_text), currency_text)
                else:
                    self.log("Didn't match product price: " + amount_text +
                             currency_text + " " + response.url + "\n",
                             level=log.WARNING)
            except Exception:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

        # optional fields: take the first match, silently skip when absent
        optional_fields = (
            ('product_model', "//strong[@itemprop='mpn']/text()"),
            ('product_brand', "//div[@itemprop='manufacturer']/meta/@content"),
        )
        for field, query in optional_fields:
            try:
                item[field] = selector.select(query).extract()[0]
            except Exception:
                pass

        return item
Ejemplo n.º 5
0
    def extract_product_data(self, response, item):
        """Fill ``item`` with name, price, model and brand scraped from ``response``.

        The name (``h1[@itemprop='name']``) is mandatory: without it the
        problem is logged and ``None`` is returned.  The price is read from
        the ``itemprop='price'`` span inside ``p.price`` and converted with
        ``Utils.convert_to_dollars``.  Model and brand are taken from the rows
        of the ``product_additional_details_container`` table whose text
        starts with "Model" / "Brand"; they are skipped silently when absent.

        Returns the updated ``item``.
        """
        hxs = HtmlXPathSelector(response)

        try:
            item['product_name'] = hxs.xpath(
                "//h1[@itemprop='name']/text()").extract()[0]
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + item['origin_url'],
                     level=log.INFO)
            # ignore products with no name
            return

        try:
            price = hxs.select(
                "//p[@class='price']/span[@itemprop='price']/text()").extract(
                )[0]
            # remove commas separating orders of magnitude (ex 2,000)
            price = re.sub(",", "", price)

            m = re.match("(\xa3|\$)([0-9]+\.?[0-9]*)", price)
            if not m:
                # Log the scraped text itself: the amount / currency locals
                # only exist after a successful match, so referencing them
                # here would raise NameError and be mis-reported below as
                # "Didn't find product price".
                self.log("Didn't match product price: " + price + " " +
                         response.url + "\n",
                         level=log.WARNING)
            else:
                price_amount = m.group(2)
                price_currency = m.group(1)
                price_value = Utils.convert_to_dollars(float(price_amount),
                                                       price_currency)
                item['product_target_price'] = price_value
        except Exception:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)

        # model and brand are optional; a missing row leaves the field unset
        try:
            item['product_model'] = hxs.select(
                "//div[@id='product_additional_details_container']"
                "//tr[starts-with(.//text()[normalize-space()], 'Model')]/td/text()"
            ).extract()[0].strip()
        except Exception:
            pass

        try:
            item['product_brand'] = hxs.select(
                "//div[@id='product_additional_details_container']"
                "//tr[starts-with(.//text()[normalize-space()], 'Brand')]/td/text()"
            ).extract()[0].strip()
        except Exception:
            pass

        return item
Ejemplo n.º 6
0
    def extract_product_data(self, response, item):
        """Fill ``item`` with name, brand, model and price scraped from ``response``.

        The name (``h1#itemTitle``) is mandatory: without it the problem is
        logged and ``None`` is returned.  Brand and model are read from the
        value that follows the matching ``attrLabels`` cell ("Brand",
        "Model", falling back to "MPN").  The price is the first
        currency/amount pair found in the ``itemprop='price'`` or
        ``mm-saleDscPrc`` span; non-dollar amounts go through
        ``Utils.convert_to_dollars``.

        Returns the updated ``item``.
        """
        hxs = HtmlXPathSelector(response)

        # extract product name
        product_name = hxs.select("//h1[@id='itemTitle']/text()").extract()
        if not product_name:
            self.log("Error: No product name: " + str(response.url),
                     level=log.INFO)
            return None

        item['product_name'] = product_name[0]

        # First non-empty text of the first non-empty node following the
        # label cell whose text contains the given word.
        attr_xpath = (
            "//td[@class='attrLabels'][contains(normalize-space(),'%s')]"
            "/following-sibling::node()[normalize-space()!=''][1]"
            "//text()[normalize-space()!='']")

        # extract product brand
        product_brand_holder = hxs.select(attr_xpath % 'Brand').extract()
        if product_brand_holder:
            item['product_brand'] = product_brand_holder[0]

        # extract product model, falling back to the MPN row
        product_model_holder = hxs.select(attr_xpath % 'Model').extract()
        if not product_model_holder:
            product_model_holder = hxs.select(attr_xpath % 'MPN').extract()

        if product_model_holder:
            item['product_model'] = product_model_holder[0]

        # TODO: upc?

        price_holder = hxs.select(
            "//span[@itemprop='price']/text() | //span[@id='mm-saleDscPrc']/text()"
        )
        try:
            # ``re`` with two groups returns a flat list
            # [currency, amount, currency, amount, ...].  The XPath union can
            # match several nodes, so take the first pair explicitly instead
            # of unpacking the whole list (which fails on more than one match).
            price_parts = price_holder.re("(\$|\xa3)([0-9\.]+)")
            currency, price = price_parts[0], price_parts[1]
            if currency != "$":
                price = Utils.convert_to_dollars(float(price), currency)
            item['product_target_price'] = float(price)
        except Exception:
            # no match at all (IndexError) or an amount that is not a number
            self.log("No price: " + str(response.url), level=log.WARNING)

        return item
Ejemplo n.º 7
0
    def parse_product_currys(self, response):
        """Parse one product page and chain to the next pending product url.

        A ``SearchItem`` is built from the page (origin data comes from
        ``response.meta``) and added to the ``items`` collection carried in
        the meta.  If ``search_results`` still holds a url, a new ``Request``
        for it is returned with this method as callback; otherwise the meta
        is flagged as ``parsed`` and the result of ``self.reduceResults`` is
        returned.

        Returns ``None`` early when the page has no product name.
        """
        selector = HtmlXPathSelector(response)
        meta = response.meta

        items = meta['items']
        origin_url = meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        item['origin_url'] = origin_url
        item['origin_name'] = meta['origin_name']

        # optional origin fields are copied only when present in the meta
        for optional in ('origin_model', 'origin_upc', 'origin_brand'):
            if optional in meta:
                item[optional] = meta[optional]

        names = selector.select("//span[@itemprop='name']/text()").extract()
        if not names:
            self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
            return

        item['product_name'] = names[0].strip()

        # NOTE(review): this is the same XPath as the product name, so the
        # brand ends up being the (unstripped) name -- confirm the intended
        # selector.
        brands = selector.select("//span[@itemprop='name']/text()").extract()
        if brands:
            item['product_brand'] = brands[0]

        # price: pound symbol followed by the amount
        prices = selector.select("//span[@class='currentPrice']/ins/text()").extract()
        if not prices:
            self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)
        else:
            # drop thousands separators (e.g. 2,000) before matching
            price_text = prices[0].strip().replace(",", "")
            parsed = re.match("(\xa3)([0-9]+\.?[0-9]*)", price_text)
            if parsed:
                item['product_target_price'] = float(parsed.group(2))
                item['product_target_price'] = Utils.convert_to_dollars(
                    item['product_target_price'], parsed.group(1))
            else:
                self.log("Didn't match product price: " + price_text + " " + response.url + "\n", level=log.WARNING)

        items.add(item)

        # Everything about this match lives in the request meta, so it has to
        # be handed over to whichever request / call comes next.
        pending_urls = meta['search_results']
        upcoming_url = pending_urls.pop() if pending_urls else None

        if not upcoming_url:
            # Nothing left to visit: mark the meta as fully parsed and hand
            # the response straight to reduceResults (no new request needed).
            meta['parsed'] = True
            meta['items'] = items
            return self.reduceResults(response)

        follow_up = Request(upcoming_url, callback=self.parse_product_currys, meta=meta)
        follow_up.meta['items'] = items
        # the pending list no longer contains the url that was just popped
        follow_up.meta['search_results'] = pending_urls
        return follow_up
Ejemplo n.º 8
0
    def parse_product_amazon(self, response):
        """Parse a product page and record the result for its origin product.

        ``response.meta`` must carry ``origin_product_id`` and ``query``;
        they select the entry of ``self.results`` that the parsed item is
        appended to.  All fields of the stored origin product are copied
        onto the new ``SearchItem``.

        When no product name can be found and the page contains a form, the
        page is treated as a captcha: the first image on it is passed to
        ``self.CB.solve_captcha`` and a one-element list holding a
        ``FormRequest`` (with this method as callback) is returned, with
        ``captcha_retries`` incremented in the meta.  Retries stop once
        ``captcha_retries`` exceeds ``self.MAX_CAPTCHA_RETRIES``.

        When a name is found, model, UPC, manufacturer code, bestsellers
        rank, ASIN, brand, price, category tree, keywords and image are
        extracted on a best-effort basis and the item is appended to the
        ``product_items`` list of the current query.

        NOTE(review): the visible code returns nothing on the non-captcha
        paths and never uses ``items`` / ``product_urls`` -- the snippet may
        be truncated; confirm against the full source.
        """

        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']
        origin_url = self.results[origin_product_id]['origin_product'][
            'origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        # copy every stored field of the origin product onto the new item
        for field in self.results[origin_product_id]['origin_product'].keys():
            item[field] = self.results[origin_product_id]['origin_product'][
                field]

        # all product items from all queries (per-query lists concatenated)
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries, de-duplicated
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        # try the known title locations in turn, stopping at the first hit
        product_name_node = hxs.select(
            '//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
        product_name = None
        if not product_name_node:
            product_name_node = hxs.select(
                '//h1[@id="aiv-content-title"]//text()').extract()
        if not product_name_node:
            product_name_node = hxs.select(
                '//div[@id="title_feature_div"]/h1//text()').extract()

        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            # needs special treatment: this layout splits the title over
            # several text nodes, which are joined with spaces
            product_name_node = hxs.select(
                '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()'
            ).extract()
            if product_name_node:
                product_name = " ".join(product_name_node).strip()

        if not product_name:

            # log this error:
            # if the number of retries was not exhausted, it might just be a captcha page, not an insurmountable error
            if 'captcha_retries' in response.meta and response.meta[
                    'captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.WARNING)
            else:
                # if it comes from a solved captcha page, then it's an error if it's still not found
                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.ERROR)

                # try this: don't remove captcha_retries from meta, may cause infinite loops, works
                # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
                # del response.meta['captcha_retries']
            # if we have reached maximum number of retries, do nothing (item just won't be added to the "items" list)

            # if we haven't reached maximum retries, try again
            if 'captcha_retries' not in response.meta \
                or 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                # assume there is a captcha to crack
                # check if there is a form on the page - that means it's probably the captcha form
                forms = hxs.select("//form")
                if forms:

                    # solve captcha using the first image found on the page
                    captcha_text = None
                    image = hxs.select(".//img/@src").extract()
                    if image:
                        captcha_text = self.CB.solve_captcha(image[0])

                    # value to use if no image was found or the solver returned nothing
                    if not captcha_text:
                        captcha_text = ''

                    # create a FormRequest to this same URL, with everything needed in meta
                    # items, cookies and search_urls not changed from previous response so no need to set them again

                    # redo the entire request (no items will be lost)
                    meta = response.meta
                    # counter of how many times we already retried to solve the captcha
                    if 'captcha_retries' in meta:
                        meta['captcha_retries'] += 1
                    else:
                        meta['captcha_retries'] = 1
                    return [
                        FormRequest.from_response(
                            response,
                            callback=self.parse_product_amazon,
                            formdata={'field-keywords': captcha_text},
                            meta=meta)
                    ]

        else:
            item['product_name'] = product_name

            # extract product model number
            model_number_holder = hxs.select(
                """//tr[@class='item-model-number']/td[@class='value']/text() |
             //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() |
             //span/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/span[2]/text()"""
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            upc_node = hxs.select(
                "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()"
            ).extract()
            if upc_node:
                # split on whitespace: product_upc is stored as a list of strings
                upc = upc_node[0].strip().split()
                item['product_upc'] = upc

            manufacturer_code_node = hxs.select(
                "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()"
            ).extract()
            if manufacturer_code_node:
                manufacturer_code = manufacturer_code_node[0].strip()
                item['manufacturer_code'] = manufacturer_code

            try:
                # for lowest level category:
                # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
                # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
                # "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

                # for top-level category:
                bestsellers_rank = hxs.select(
                    "//tr[@id='SalesRank']/td[@class='value']/text()" +
                    " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
                # drop the leading '#' and the thousands separators
                item['bestsellers_rank'] = int(
                    re.sub(",", "", "".join(bestsellers_rank[1:])))
            except Exception, e:
                # a missing rank is only reported when one of these settings is active
                if self.output == 6 or self.bestsellers_link:
                    self.log("Didn't find product rank: " + str(e) + " " +
                             response.url + "\n",
                             level=log.INFO)

            asin_node = hxs.select(
                "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()"
            ).extract()
            if asin_node:
                item['product_asin'] = asin_node[0].strip()

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
                "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)",
                             product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(2))
                    currency = m.group(1)
                    # dollar amounts are stored as they are; anything else is converted
                    if currency != "$":
                        item[
                            'product_target_price'] = Utils.convert_to_dollars(
                                item['product_target_price'], currency)
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            # the remaining fields are optional: failures are silently ignored
            try:
                item['product_category_tree'] = \
                    filter(None, map(lambda c: c.strip(), hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
            except:
                pass

            try:
                item['product_keywords'] = hxs.select(
                    "//meta[@name='keywords']/@content").extract()[0]
            except:
                pass

            try:
                product_image = hxs.select(
                    "//img[@id='landingImage']/@src").extract()[0]
                item['product_image_url'] = product_image
                item['product_image_encoded'] = ProcessText.encode_image(
                    product_image)
            except:
                pass

            # add result to the items collected for the current query
            self.results[origin_product_id]['search_requests'][current_query][
                'product_items'].append(item)
Ejemplo n.º 9
0
    def extract_product_data(self, response, item):
        """Fill ``item`` with name, price, model, brand and UPC from ``response``.

        The name is looked up in three places in turn; when none yields a
        value ``None`` is returned, and the failure is logged unless the page
        carries an "out of stock" notice.  Price, model, brand and UPC are
        best-effort: any failure leaves the matching field unset.

        Returns the updated ``item``.
        """
        selector = HtmlXPathSelector(response)

        # candidate locations of the product name, most specific first
        name_queries = (
            "//h1[starts-with(@class,'title')]//text()",
            "//div[@class='pdp_title']//text()[normalize-space()!='']",
            "//h1//text()",
        )
        for query in name_queries:
            try:
                item['product_name'] = selector.xpath(
                    query).extract()[0].strip()
            except:
                continue
            break
        else:
            # out of stock products return 404s with this text, not the
            # actual product page
            sold_out = selector.xpath(
                "//strong[contains(text(),'out of stock')]").extract()
            if not sold_out:
                self.log("Error: No product name: " +
                         str(response.url) + " from product: " +
                         item['origin_url'],
                         level=log.ERROR)
            # products without a name are ignored
            return None

        price_contents = selector.select(
            "//meta[@itemprop='price']/@content").extract()

        if price_contents:
            try:
                # first character is the currency symbol, the rest the amount
                raw_price = price_contents[0]
                currency_text = raw_price[0]
                amount_text = raw_price[1:].replace(",", "")

                amount_ok = re.match("[0-9]+\.?[0-9]*", amount_text)
                currency_ok = re.match("(\xa3)|(\$)", currency_text)
                if amount_ok and currency_ok:
                    item['product_target_price'] = Utils.convert_to_dollars(
                        float(amount_text), currency_text)
                else:
                    self.log("Didn't match product price: " + amount_text +
                             currency_text + " " + response.url + "\n",
                             level=log.WARNING)
            except Exception:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

        # model: the token following "Style No." in the description list
        try:
            style_lines = selector.select(
                "//div[@class='prod_description1']//li[contains(text(), 'Style')]/text()"
            ).re("[sS]tyle +[nN]o\.? +[a-zA-Z0-9]+")
            style_match = re.match("[sS]tyle +[nN]o\.? +([a-zA-Z0-9]+)",
                                   style_lines[0])
            item['product_model'] = style_match.group(1)
        except Exception:
            pass

        try:
            brands = selector.select(
                "//meta[@itemprop='brand']/@content").extract()
            item['product_brand'] = brands[0]
        except Exception:
            pass

        # UPC: read from the inline script that mentions "Upc"
        try:
            script_text = selector.select(
                "//script[contains(text(),'Upc')]/text()").extract()[0]
            upc_match = re.match('.*"skuUpcCode":"([0-9a-zA-Z]+)".*',
                                 script_text,
                                 re.DOTALL | re.MULTILINE)
            item['product_upc'] = upc_match.group(1)
        except Exception:
            pass

        return item