Ejemplo n.º 1
0
    def parse_items(self, response):
        """Yield the page's product, then its price and EAN identifiers.

        The ProductItem is always yielded first. A ``price`` ProductIdItem
        follows when the price XPath extracts a truthy value, and an ``EAN``
        ProductIdItem follows when the ``data-ean`` attribute does.

        Raises IndexError before anything is yielded when ``response.url``
        has fewer than six "/"-separated parts.
        """

        def make_id(kind, value):
            # Identifier items repeat the product's key and name.
            id_item = ProductIdItem()
            id_item['source_internal_id'] = item["source_internal_id"]
            id_item['ProductName'] = item["ProductName"]
            id_item['ID_kind'] = kind
            id_item['ID_value'] = value
            return id_item

        item = ProductItem()
        item['TestUrl'] = response.url

        og_title = self.extract(
            response.xpath('//meta[@property="og:title"]/@content'))
        # Drop the " | EP:" fragment from the og:title text.
        item['ProductName'] = og_title.replace(" | EP:", "")

        og_image = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        item['PicURL'] = og_image

        brand = self.extract(
            response.xpath("//div[@class='product-details-left']/a//@title"))
        item['ProductManufacturer'] = brand

        # The sixth "/"-separated part of the URL is used as the source id.
        url_parts = str(response.url).split("/")
        item['source_internal_id'] = url_parts[5]
        yield item

        raw_price = self.extract(response.xpath(
            "//div/div[@class='product-details-price']//div/text()"))
        if raw_price:
            # Remove every "." and any trailing "," / "-" characters.
            cleaned = raw_price.replace(".", "").rstrip(",-")
            yield make_id("price", cleaned)

        ean_code = self.extract(response.xpath(
            "//div[@class='product-flixdata']/@data-ean"))
        if ean_code:
            yield make_id("EAN", ean_code)
Ejemplo n.º 2
0
    def parse_product(self, response):
        """Yield the product for ``response`` followed by its MPN / EAN ids.

        ``response.meta['item']`` is read for the keys ``'ocn'`` (original
        category name) and ``'name'`` (product name). The ProductItem is
        yielded first; an ``MPN`` and/or ``EAN`` ProductIdItem follows for
        each identifier that has a real value on the page.
        """
        item = response.meta['item']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = item['ocn']
        product['ProductName'] = item['name']
        product['PicURL'] = get_full_url(
            response.url,
            self.extract(response.xpath('//img[@itemprop="image"]/@src')))
        product["ProductManufacturer"] = self.extract(
            response.xpath('//span[@itemprop="brand"]/text()'))
        yield product

        id_sources = (
            ("MPN",
             '//div[text()="Partnumber"]/parent::div/div[contains(@class,"value")]/text()'),
            ("EAN",
             '//div[text()="EAN"]/parent::div/div[contains(@class,"value")]/text()'),
        )
        for kind, xpath in id_sources:
            value = self.extract(response.xpath(xpath))
            # Skip missing values and a bare "-" (presumably the site's
            # placeholder for "no value" -- confirm). The previous
            # ``value.strip() > '-'`` comparison was lexicographic and also
            # discarded real ids starting with characters that sort below
            # "-", such as "#", "(" or "+".
            if not value or value.strip() in ('', '-'):
                continue

            product_id = ProductIdItem()
            product_id['ProductName'] = item['name']
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id
Ejemplo n.º 3
0
    def parse_product(self, response):
        """Yield the product, its MPN / EAN ids and the first review request.

        The product's ``source_internal_id`` is taken from group 1 of
        ``self.source_internal_id_re`` applied to ``response.url``. When the
        URL does not match, the product and any ids are still yielded (the
        ids without ``source_internal_id``), but no review request is made,
        because the review URL is built from that id.

        ``response.meta["category"]["category_path"]`` must be present.
        """
        product_xpaths = {
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductName": "//h1[@class='productHeading']//text()",
            "ProductManufacturer": "//h1[@class='productHeading']/text()"
        }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        match = re.search(self.source_internal_id_re, response.url)
        if match:
            product['source_internal_id'] = match.group(1)

        product['TestUrl'] = response.url
        product["OriginalCategoryName"] = response.meta["category"][
            "category_path"]
        yield product

        # The id is only set when the URL matched; indexing it
        # unconditionally raised KeyError for non-matching URLs.
        has_sii = 'source_internal_id' in product

        id_sources = (
            ("MPN", "//span[@id='productMPN']/text()"),
            ("EAN", "//span[@id='productEAN']/text()"),
        )
        for kind, xpath in id_sources:
            value = self.extract(response.xpath(xpath))
            if not value:
                continue
            product_id = ProductIdItem()
            if has_sii:
                product_id['source_internal_id'] = product[
                    "source_internal_id"]
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id

        if not has_sii:
            # Without the id there is nothing to build the review URL from.
            return

        sii = product['source_internal_id']

        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = sii
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)
        request = Request(url=review_url, callback=self.parse_reviews)

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'], sii)
        request.meta['last_user_review'] = last_user_review

        request.meta['bv_id'] = sii
        request.meta['product'] = product
        request.meta['filter_other_sources'] = False

        yield request
Ejemplo n.º 4
0
    def parse_product(self, response):
        """Yield category, product and identifier items for a product page.

        Flow:
          * If no product name can be extracted, the page is treated as
            blocked: the request is re-issued through ``self._retry`` and
            nothing else is yielded.
          * If a category path is found, a CategoryItem is yielded unless
            ``self.should_skip_category`` returns true, in which case
            parsing stops before anything is yielded.
          * The ProductItem is yielded, then a ``tweakers_id`` ProductIdItem,
            then one ``EAN`` ProductIdItem per EAN entry that parses as an
            integer (non-numeric entries are skipped).
        """
        product = ProductItem()
        product_name_xpath = "//*[@itemprop='name']/a/text()"
        pic_url_xpath = "//div[@class='imageCarousel']//img/@src"
        manufacturer_xpath = "//td[@class='spec-index-column'][text()='Merk']/following-sibling::td//text()"
        sii_xpath = "//td[@class='spec-index-column'][text()='Tweakers ID']/following-sibling::td//text()"
        product['TestUrl'] = response.url
        product['ProductName'] = self.extract(
            response.xpath(product_name_xpath))
        if not product['ProductName']:  # blocked
            request = self._retry(response.request)
            yield request
            return

        category_path_xpath = "//li[@id='tweakbaseBreadcrumbCategory']/a/text()"
        category_path = self.extract(response.xpath(category_path_xpath))
        if category_path:
            category = CategoryItem()
            category['category_path'] = category_path
            product['OriginalCategoryName'] = category_path

            if self.should_skip_category(category):
                return

            yield category

        product['PicURL'] = self.extract(response.xpath(pic_url_xpath))
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))
        product['source_internal_id'] = self.extract(response.xpath(sii_xpath))
        yield product

        tweakers_kind = ProductIdItem()
        tweakers_kind['source_internal_id'] = product['source_internal_id']
        tweakers_kind['ProductName'] = product["ProductName"]
        tweakers_kind['ID_kind'] = "tweakers_id"
        tweakers_kind['ID_value'] = product["source_internal_id"]
        yield tweakers_kind

        eans_xpath = "//td[@class='spec-index-column'][text()='EAN']/following-sibling::td/span/text()"
        eans = self.extract_list(response.xpath(eans_xpath))
        for ean in eans:
            # ``except ValueError:`` is valid on both Python 2 and 3; the
            # old ``except ValueError, e:`` form is Python-2-only and bound
            # a name that was never used.
            try:
                ean_value = int(ean)
            except ValueError:
                # Not a numeric EAN; skip this entry.
                continue

            ean_item = ProductIdItem()
            ean_item['source_internal_id'] = product["source_internal_id"]
            ean_item['ProductName'] = product["ProductName"]
            ean_item['ID_kind'] = "EAN"
            ean_item['ID_value'] = ean_value
            yield ean_item
Ejemplo n.º 5
0
    def parse_product(self, response):
        """Yield category, product, ids and the review request for a page.

        Nothing is yielded when the ``PS_DTN`` meta tag gives no product
        name, or when a category is found and
        ``self.should_skip_category`` returns true for it. Otherwise the
        order is: CategoryItem (if a category path was found), ProductItem,
        ``philips_id`` ProductIdItem, ``EAN`` ProductIdItem (if present),
        and finally the request returned by ``self.start_reviews``.
        """
        item = ProductItem()
        item['TestUrl'] = response.url
        item['ProductManufacturer'] = self.brand_name

        raw_name = self.extract(
            response.xpath("//meta[@name='PS_DTN']/@content"))
        if not raw_name:
            return

        # The stored product name is prefixed with the brand.
        item['ProductName'] = self.brand_name + ' ' + raw_name

        group_key = self.extract(
            response.xpath("//meta[@name='ISS_GROUP_KEY_NEW']/@content"))
        if group_key:
            category_item = CategoryItem()
            category_item['category_path'] = group_key
            item['OriginalCategoryName'] = group_key

            if self.should_skip_category(category_item):
                return
            yield category_item

        item['PicURL'] = self.extract(
            response.xpath("//meta[@name='ISS_IMAGE']/@content"))
        item['source_internal_id'] = self.extract(
            response.xpath("//meta[@name='PHILIPS.METRICS.PRODUCTID']/@content"))
        yield item

        # We were using product MPNs as philips_id,
        # do the same thing in alaScrapy spider
        yield ProductIdItem.from_product(item,
                                         kind='philips_id',
                                         value=raw_name)

        gtin = self.extract(
            response.xpath("//meta[@name='PS_GTIN']/@content"))
        if gtin:
            yield ProductIdItem.from_product(item, kind='EAN', value=gtin)

        reviews_request = self.start_reviews(response,
                                             item,
                                             filter_other_sources=False)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 6
0
    def parse(self, response):
        """Yield category, product, ids and reviews for a product page.

        Order of yielded objects: the CategoryItem, the ProductItem, the
        value of ``self.get_rm_kidval(product, response)``, an ``MPN`` and
        an ``EAN`` ProductIdItem when the page provides them, then every
        review produced by ``self._parse_reviews`` on a Selenium-loaded
        copy of the page.

        The product's ``source_internal_id`` is group 1 of
        ``self.source_internal_id_re`` applied to ``response.url``; when the
        URL does not match, the id is left unset and the MPN / EAN items are
        yielded without it.
        """
        # Must use only product_page
        category_xpaths = {
            "category_leaf": "//*[@id='moreFrom-catLink']/a/text()",
            "category_path": "//*[@id='moreFrom-catLink']/a/text()"
        }

        product_xpaths = {
            "PicURL": "(//li[@class='productImageItem'])[1]//img/@src",
            "ProductName": "//h1[@class='productHeading']//text()",
            "ProductManufacturer": "//h1[@class='productHeading']/text()"
        }

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        match = re.search(self.source_internal_id_re, response.url)
        if match:
            product['source_internal_id'] = match.group(1)
        product["OriginalCategoryName"] = category["category_path"]
        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        # The id is only set when the URL matched; indexing it
        # unconditionally raised KeyError for non-matching URLs.
        has_sii = "source_internal_id" in product

        id_sources = (
            ("MPN", "//span[@id='productMPN']/text()"),
            ("EAN", "//span[@id='productEAN']/text()"),
        )
        for kind, xpath in id_sources:
            value = self.extract(response.xpath(xpath))
            if not value:
                continue
            product_id = ProductIdItem()
            if has_sii:
                product_id['source_internal_id'] = product[
                    "source_internal_id"]
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            for review in self._parse_reviews(selector, browser, product):
                yield review
Ejemplo n.º 7
0
    def parse_product(self, response):
        """Yield the product and its identifier items for a product page.

        Yields, in order: the ProductItem, a ``prisjakt_id`` ProductIdItem,
        a ``size_internal_hdd`` ProductIdItem when the 4th matching table
        row has text, and a ``first_publish_date`` ProductIdItem taken from
        the 5th matching table row, or from the 6th when the 5th is not all
        digits. No date item is yielded when neither row is all digits.

        The source id is the text after the first "--p" in the first
        ``link[@data-route-id='initial']`` href; IndexError is raised
        before anything is yielded when that marker is absent.
        """

        def make_id(kind, value):
            # Identifier items repeat the product's key and name.
            entry = ProductIdItem()
            entry['source_internal_id'] = item["source_internal_id"]
            entry['ProductName'] = item["ProductName"]
            entry['ID_kind'] = kind
            entry['ID_value'] = value
            return entry

        item = ProductItem()

        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = self.extract(response.xpath(
            "(//div[@class='Breadcrumbs-sc-11q7umm-0 dsQddb']//text())[last()]"))
        item['ProductName'] = self.extract(response.xpath('//h1/text()'))
        item['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        item['ProductManufacturer'] = self.extract(response.xpath(
            "//div[@class='RelatedPage-sc-1i89wok-8 ZlMGA']/a/text()"))
        initial_href = self.extract(
            response.xpath("(//link[@data-route-id='initial']/@href)[1]"))
        item['source_internal_id'] = str(initial_href).split("--p")[1]
        yield item

        yield make_id("prisjakt_id", item["source_internal_id"])

        hdd_size = self.extract(response.xpath(
            "//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][4]/td/text()"))
        if hdd_size:
            yield make_id("size_internal_hdd", hdd_size)

        # The date is looked up in row 5 first and row 6 as a fallback; only
        # the first all-digit value is emitted.
        date_row_xpaths = (
            "//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][5]/td/text()",
            "//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][6]/td/text()",
        )
        for row_xpath in date_row_xpaths:
            candidate = self.extract(response.xpath(row_xpath))
            if candidate.isdigit():
                yield make_id("first_publish_date", candidate)
                break
Ejemplo n.º 8
0
    def init_item_by_xpaths(self, response, item_type, fields, selector=None):
        """Create an item of ``item_type`` and fill it from XPath queries.

        Args:
            response: response whose ``url`` becomes ``TestUrl`` for review
                and product items, and which backs the default selector.
            item_type: one of ``'review'``, ``'product'``, ``'product_id'``
                or ``'category'``.
            fields: mapping of item field name -> XPath expression.
            selector: selector to run the XPaths against; a
                ``Selector(response=response)`` is built when falsy.

        Returns:
            The new item, with every key of ``fields`` set to the result of
            ``self.extract_all`` on the matching XPath. For review items
            the ``TestPros`` and ``TestCons`` fields are extracted with
            ``" ; "`` passed as the second argument.

        Raises:
            Exception: if ``item_type`` is not one of the supported names.
        """
        if not selector:
            selector = Selector(response=response)

        if item_type not in ('review', 'product', 'product_id', 'category'):
            raise Exception("Invalid item type: %s" % item_type)

        if item_type == "review":
            item = ReviewItem()
        elif item_type == "product":
            item = ProductItem()
        elif item_type == "product_id":
            item = ProductIdItem()
        elif item_type == "category":
            item = CategoryItem()

        if item_type in ('review', 'product'):
            item["TestUrl"] = response.url

        for field in fields:
            # TODO: maybe check field.
            # Tuple membership: the previous ("TestPros, TestCons") was one
            # string, so this was a substring test that also matched names
            # such as "Test" or "Cons".
            if item_type == "review" and field in ("TestPros", "TestCons"):
                item[field] = self.extract_all(selector.xpath(fields[field]),
                                               " ; ")
            else:
                item[field] = self.extract_all(selector.xpath(fields[field]))
        return item
Ejemplo n.º 9
0
    def parse_product(self, response):
        """Yield category, product, id and review request for a page.

        The CategoryItem is always yielded first, even when
        ``self.should_skip_category`` then stops further parsing. Parsing
        also stops, after the category, when the product has no
        ``source_internal_id``. Otherwise the ProductItem, a
        ``debenhams_id`` ProductIdItem and the request from
        ``self.start_reviews`` follow.
        """
        breadcrumb = CategoryItem()
        breadcrumb['category_path'] = self.extract_all(
            response.xpath("//span[contains(@class, 'breadcrumb')]/a/text()"),
            separator=' | ')
        yield breadcrumb
        if self.should_skip_category(breadcrumb):
            return

        item = self.init_item_by_xpaths(response, "product", {
            "ProductName": "//h1/text()",
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductManufacturer": "//meta[@property='brand']/@content",
            "source_internal_id": "//meta[@property='product_number']/@content",
        })

        sii = item['source_internal_id']
        if not sii:
            return

        item['OriginalCategoryName'] = breadcrumb['category_path']
        yield item

        # The site's own product number doubles as the identifier value.
        site_id = ProductIdItem()
        site_id['source_internal_id'] = item["source_internal_id"]
        site_id['ProductName'] = item["ProductName"]
        site_id['ID_kind'] = "debenhams_id"
        site_id['ID_value'] = item["source_internal_id"]
        yield site_id

        reviews_request = self.start_reviews(
            response, item, filter_other_sources=False)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 10
0
    def parse_product(self, response):
        """Yield the product, its EAN ids and a request for its reviews.

        ``response.meta['category']['category_path']`` supplies the original
        category name. The EAN text is split on "," and one ``EAN``
        ProductIdItem is yielded per part, with surrounding spaces removed.
        A review request (callback ``self.parse_reviews``, product stored in
        ``meta``) is yielded only when the page links to a review page.
        """
        item = ProductItem()

        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = response.meta['category']['category_path']
        item['ProductName'] = self.extract(response.xpath('//h1//text()'))
        item['PicURL'] = self.extract(
            response.xpath('//div[@class="data"]/div/img/@src'))
        item['ProductManufacturer'] = self.extract(response.xpath(
            "//strong[contains(@class,'property-name') and contains(text(),'Hersteller')]/following-sibling::span/a[1]/text()"))
        yield item

        ean_text = self.extract(response.xpath(
            '//strong[contains(text(),"EAN")]/parent::div/span/text()'))
        if ean_text:
            for raw_ean in ean_text.split(','):
                ean_item = ProductIdItem()
                ean_item['ProductName'] = item["ProductName"]
                ean_item['ID_kind'] = "EAN"
                ean_item['ID_value'] = raw_ean.strip(' ')
                yield ean_item

        reviews_href = self.extract(response.xpath(
            "//div[@id='product-head-reviews']//a[@class='headbutton']/@href"))
        if not reviews_href:
            return

        reviews_request = Request(url=get_full_url(response, reviews_href),
                                  callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 11
0
    def parse_items(self, response):
        """Yield price / EAN identifier items and the product for a page.

        A separate ProductIdItem is yielded for each identifier found:
        ``price`` when the price XPath matches, ``EAN`` when the EAN span
        has text. Previously one item was reused for both, so the EAN
        overwrote the price and an item without ``ID_kind`` / ``ID_value``
        was emitted when neither was found.

        The ProductItem is yielded only when a carousel image is found.
        """
        name = self.extract(
            response.xpath('//*[@id="cart_quantity"]/div/div[2]/h1/text()'))
        sku = self.extract(response.xpath('//span[@class="sku-model"]/text()'))

        price = response.xpath(
            '//*[@id="priceCol"]/div[2]/text()').extract()
        if price:
            price_id = ProductIdItem()
            price_id['ProductName'] = name
            price_id['source_internal_id'] = sku
            price_id['ID_kind'] = 'price'
            # NOTE(review): parses the repr of the extracted list and takes
            # its 5th whitespace-separated token; kept as-is because it
            # depends on the page layout -- confirm it is still right.
            price_id['ID_value'] = str(price).split()[4].replace(
                "u'\\xa0", "").replace("*", "")
            yield price_id

        EAN_id_xpath = '//span[@class="product-ean"]/text()'
        EAN_id = self.extract(response.xpath(EAN_id_xpath))
        if EAN_id:
            ean_id = ProductIdItem()
            ean_id['ProductName'] = name
            ean_id['source_internal_id'] = sku
            ean_id['ID_kind'] = "EAN"
            ean_id['ID_value'] = EAN_id
            yield ean_id

        product = ProductItem()
        product['source_internal_id'] = sku
        product['ProductName'] = name
        picture = response.xpath(
            '//*[@id="bImageCarousel"]/div/div[1]/a/img').extract()
        # NOTE(review): the product is emitted only when an image exists;
        # confirm that skipping image-less pages is intended.
        if picture:
            # Pulls the first attribute value out of the repr of the
            # extracted <img> markup and strips quotes and the "alt" token.
            product['PicURL'] = str(picture).split('=')[1].replace("alt", "").replace("\'", "").replace(" \"", "").replace("\"", "")
            product['OriginalCategoryName'] = self.extract(response.xpath(
                '//*[@id="bBreadcrumb"]/ol/li/a/span/text()'))
            product['TestUrl'] = response.url
            yield product
Ejemplo n.º 12
0
    def parse_product(self, response):
        """Yield the product, an optional MPN id and the reviews request.

        ``response.meta['ocn']`` supplies the original category name. When
        an item number (MPN) is found it is appended to the product name
        and an ``MPN`` ProductIdItem is yielded right after the product.
        The last object yielded is a request to the Bazaarvoice reviews API
        with the product stored in ``meta`` and ``self.parse_reviews`` as
        callback.
        """
        item = ProductItem()

        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = response.meta['ocn']
        heading = self.extract(
            response.xpath('//h1[@id="productNameHeader"]/text()'))
        item['PicURL'] = self.extract(
            response.xpath('//img[@id="_imgLarge"]/@src'))
        item['source_internal_id'] = self.extract(
            response.xpath('//span[@class="jsSwatchSku"]/text()'))

        item_number = self.extract(
            response.xpath('//p[contains(text(),"Item Number")]/span/text()'))

        # The product is yielded in both cases; only the name differs.
        if item_number:
            item["ProductName"] = heading + ' ' + item_number
        else:
            item["ProductName"] = heading
        yield item

        if item_number:
            mpn_item = ProductIdItem()
            mpn_item['ProductName'] = item["ProductName"]
            mpn_item['source_internal_id'] = item['source_internal_id']
            mpn_item['ID_kind'] = "MPN"
            mpn_item['ID_value'] = item_number
            yield mpn_item

        # NOTE(review): the literal "s" before the last %s is sent verbatim
        # in front of the product id -- confirm that prefix is intended.
        url_template = 'http://api.bazaarvoice.com/data/reviews.json?apiversion=%s&passkey=%s&Filter=ProductId:s%s' \
                       '&Sort=SubmissionTime:desc&Limit=100'
        reviews_url = url_template % (
            self.bv_version, self.bv_key, item['source_internal_id'])

        reviews_request = Request(url=reviews_url, callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 13
0
    def parse_product(self, response):
        """Yield the product, its MPN ids and a request for its reviews.

        The manufacturer is fixed to ``'Sony'`` and the product name is the
        manufacturer followed by the page's model text. ``PicURL`` is only
        set when the image meta tag is present. The first
        ``data-model_ids`` attribute value has its outer "[" / "]" removed
        and is split on ","; one ``MPN`` ProductIdItem is yielded per part.
        The reviews request targets ``response.url + '/reviews-ratings'``.
        """
        item = ProductItem()

        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = self.extract(
            response.xpath('//a[contains(@class,"breadcrumb")]/text()'))
        model_text = self.extract(
            response.xpath('//span[@itemprop="model"]/text()'))
        image_url = self.extract(response.xpath(
            '//meta[@name="analytics-product-image_url"]/@content'))
        if image_url:
            item['PicURL'] = get_full_url(response, image_url)
        item['ProductManufacturer'] = 'Sony'
        item['ProductName'] = item['ProductManufacturer'] + ' ' + model_text
        yield item

        model_ids = self.extract(response.xpath('//@data-model_ids'))
        if model_ids:
            without_brackets = model_ids.strip('[').strip(']')
            for model_id in without_brackets.split(','):
                mpn_item = ProductIdItem()
                mpn_item['ProductName'] = item["ProductName"]
                mpn_item['ID_kind'] = "MPN"
                mpn_item['ID_value'] = model_id
                yield mpn_item

        reviews_request = Request(url=response.url + '/reviews-ratings',
                                  callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 14
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The value is the text following a ``<b>`` element whose text
        contains "Price"; None is returned when nothing is extracted.
        """
        extracted = self.extract(response.xpath(
            "//b[contains(text(),'Price')]/following-sibling::text()"))
        if not extracted:
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=extracted)
Ejemplo n.º 15
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The value is the text of a ``<p class="cost-text">`` element; None
        is returned when nothing is extracted.
        """
        cost_text = self.extract(
            response.xpath('//p[@class="cost-text"]/text()'))
        if not cost_text:
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=cost_text)
Ejemplo n.º 16
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The value is the link text inside the first ``price-msrp`` div;
        None is returned when nothing is extracted.
        """
        msrp_text = self.extract(
            response.xpath("(//div[@class='price-msrp'])[1]/a/text()"))
        if not msrp_text:
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=msrp_text)
Ejemplo n.º 17
0
 def product_id(self, product, kind='', value=''):
     """Build a ProductIdItem of ``kind`` / ``value`` for ``product``.

     ``source_internal_id`` is copied only when the product has it;
     ``ProductName`` is always copied.
     """
     id_item = ProductIdItem()
     # Not every product carries a source id, so copy it conditionally.
     if "source_internal_id" in product:
         id_item['source_internal_id'] = product["source_internal_id"]
     id_item['ProductName'] = product["ProductName"]
     id_item['ID_kind'] = kind
     id_item['ID_value'] = value
     return id_item
Ejemplo n.º 18
0
    def parse_items(self, response):
        """Yield review, price id and product for a professional review page.

        The three items are yielded (in that order) only when the page has
        price text and the review date is later than
        ``self.stored_last_date``; otherwise nothing is yielded.

        The review date is the third space-separated token of the text in
        ``div.offers > small`` and is passed through ``date_format`` with
        the ``'%d.%m.%Y'`` format. The price value is the part of the price
        text before the first space.
        """
        product = self.init_item_by_xpaths(response, "product", {
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductManufacturer":
            "//tr[@class='marke-hersteller']/td/a/text()"
        })
        review = self.init_item_by_xpaths(response, "review", {
            "TestSummary": "//div[@id='review_body']/div[1]/p/text()",
            "TestVerdict": "(//div[@id='review_body']/div/p/text())[last()]",
            "TestTitle": "(//title/text())[1]",
            "Author": "//span/meta[@itemprop='author']/@content",
            "TestPros": "//div[@class='list-advantages']/ul/li/div/text()",
            "TestCons": "//div[@class='list-disadvantages']/ul/li/div/text()",
            "SourceTestRating": "//span/meta[@itemprop='ratingValue']/@content"
        })

        # Product name = manufacturer + model, shared by review and product.
        model = self.extract(
            response.xpath("//tr[@class='modell']/td/span/text()"))
        full_name = product['ProductManufacturer'] + " " + model
        review['ProductName'] = full_name
        product['ProductName'] = full_name

        sii = self.extract(
            response.xpath("//div/meta[@itemprop='productID']/@content"))
        review['source_internal_id'] = sii
        product['source_internal_id'] = sii

        # Fall back to the first lazy-loaded image when og:image is empty.
        if not product['PicURL']:
            product['PicURL'] = self.extract(
                response.xpath("(//div/a/img/@data-src)[1]"))

        if review['SourceTestRating']:
            review['SourceTestScale'] = "5"

        review["DBaseCategoryName"] = "PRO"

        offers_text = self.extract(
            response.xpath("//div[@class='offers']/small/text()"))
        date_token = str(offers_text).split(" ")[2]
        review['TestDateText'] = date_format(date_token, '%d.%m.%Y')

        price_text = self.extract(
            response.xpath("//div[@class='price']/text()")).encode('utf-8')
        if not price_text:
            # Pages without a price yield nothing at all.
            return

        price_id = ProductIdItem()
        price_id['ID_kind'] = 'price'
        price_id['ID_value'] = str(price_text).split(' ')[0]
        price_id['ProductName'] = product['ProductName']
        price_id['source_internal_id'] = product['source_internal_id']

        published = datetime.strptime(review['TestDateText'], "%Y-%m-%d")
        if published > self.stored_last_date:
            yield review
            yield price_id
            yield product
Ejemplo n.º 19
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The value is the text preceding the first ``<br>`` of a paragraph
        inside ``div.bbcode``; None is returned when nothing is extracted.
        """
        leading_text = self.extract(response.xpath(
            '//div[@class="bbcode"]/p/br[1]/preceding-sibling::text()'))
        if not leading_text:
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=leading_text)
Ejemplo n.º 20
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The price text is read from the paragraph(s) following an ``<h2>``
        whose text contains "Price", encoded as UTF-8, and the value is the
        part between the first and second "$".

        Returns None when no text is extracted or when the text has no "$".
        The missing-"$" case previously raised IndexError.
        """
        price_xpath = "(//h2[contains(text(),'Price')]"\
            "/following-sibling::p)/text()"
        price_str = (self.extract(response.xpath(price_xpath))).encode('utf-8')

        if not price_str:
            return None

        parts = price_str.split("$")
        if len(parts) < 2:
            # No dollar amount in the text; nothing usable to report.
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=parts[1])
Ejemplo n.º 21
0
    def parse_price(self, product, response):
        """Return a ``price`` ProductIdItem for ``product``, or None.

        The three candidate locations are combined into one XPath union.
        The extracted text is encoded as UTF-8 and leading "$" characters
        are removed; None is returned when nothing is extracted.
        """
        candidates = [
            "//div[@class='price']/a/text()",
            "//a[@class='price']/text()",
            "//span[@class='msrp']/text()",
        ]
        raw = self.extract(response.xpath("|".join(candidates)))
        encoded = raw.encode('utf-8')
        if not encoded:
            return None

        return ProductIdItem.from_product(product,
                                          kind='price',
                                          value=encoded.lstrip('$'))
Ejemplo n.º 22
0
    def parse_product(self, response):
        """Yield the product, its UPC / MPN ids and one request per review.

        Nothing is yielded when the page has no "read full review" links.
        Otherwise the ProductItem comes first (category taken from
        ``response.meta['category']['category_path']``), then a ``UPC``
        and/or ``MPN`` ProductIdItem when the spec table provides them,
        then a request per review link with ``self.parse_review`` as
        callback and the product stored in ``meta``.
        """
        review_urls = self.extract_list(
            response.xpath('//a[@class="readFullReviewLink"]/@href'))
        if not review_urls:
            return

        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category'][
            'category_path']
        product['ProductName'] = self.extract(
            response.xpath('//h1/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//div[@class="enlargeText"]/a/@href'))
        yield product

        id_sources = (
            ("UPC",
             '//td[contains(text(),"UPC")]/parent::tr/td[@class=""]/text()'),
            ("MPN",
             '//td[contains(text(),"MPN")]/parent::tr/td[@class=""]/text()'),
        )
        for kind, xpath in id_sources:
            value = self.extract(response.xpath(xpath))
            if not value:
                continue
            product_id = ProductIdItem()
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id

        anchor = '#tabAnchor'
        for review_url in review_urls:
            # Remove only a trailing "#tabAnchor". ``str.strip`` treats its
            # argument as a set of characters and trimmed both ends, which
            # corrupted URLs (e.g. a leading "htt" or a trailing "r").
            if review_url.endswith(anchor):
                review_url = review_url[:-len(anchor)]
            review_url = get_full_url(response, review_url)
            request = Request(url=review_url, callback=self.parse_review)
            request.meta['product'] = product
            yield request
Ejemplo n.º 23
0
    def parse_product(self, response):
        """Yield the reviews request, then category, ids and product.

        The Selenium request for the reviews page is yielded first, with
        the product and its ``officedepot_id`` item stored in ``meta`` and
        ``self.parse_reviews`` as callback. The category from
        ``self.handle_category``, the ``officedepot_id`` item, the ``MPN``
        item and the product follow, in that order.
        """

        def make_id(kind, value):
            # Identifier items repeat the product's key and name.
            id_item = ProductIdItem()
            id_item['source_internal_id'] = item['source_internal_id']
            id_item['ProductName'] = item['ProductName']
            id_item['ID_kind'] = kind
            id_item['ID_value'] = value
            return id_item

        reviews_url_template = 'http://reviews.officedepot.com/2563/%s/reviews.htm'

        category_item = self.handle_category(
            response, '//div[@id="siteBreadcrumb"]//a')

        item = ProductItem()
        item['TestUrl'] = response.url
        item['ProductName'] = self.extract(
            response.xpath('//*[@itemprop="name"]/text()'))
        item['source_internal_id'] = self.extract(
            response.xpath('//*[@id="basicInfoCustomerSku"]/text()'))
        item['ProductManufacturer'] = self.extract(
            response.xpath('//*[@id="attributebrand_namekey"]/text()'))
        item['PicURL'] = self.extract(
            response.xpath('//*[@id="mainSkuProductImage"]/@src'))
        item['OriginalCategoryName'] = category_item['category_path']

        # The customer SKU doubles as the site-specific identifier value.
        site_id = make_id('officedepot_id', item['source_internal_id'])
        manufacturer_sku = make_id(
            'MPN',
            self.extract(
                response.xpath('//*[@id="basicInfoManufacturerSku"]/text()')))

        reviews_request = self.selenium_request(
            url=reviews_url_template % item['source_internal_id'],
            callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        reviews_request.meta['product_id'] = site_id
        yield reviews_request

        yield category_item
        yield site_id
        yield manufacturer_sku
        yield item
Ejemplo n.º 24
0
    def parse_product(self, response):
        """Yield the product, its MPN id and the user reviews of a page.

        Pages without any ``div.review`` block yield nothing. When no product
        is passed in through ``response.meta['product']`` the product is
        scraped from the page and yielded, followed by an MPN id item if one
        is found. If that page links to a "more" review page, a request for
        it (handled by this same callback, with the product attached) is
        yielded and parsing stops. In every other case one ``ReviewItem`` is
        yielded per review block.
        """
        review_nodes = response.xpath('//div[@class="review"]')
        if not review_nodes:
            return

        product = response.meta.get('product')

        if not product:
            product = ProductItem()
            product['TestUrl'] = response.url
            product['OriginalCategoryName'] = response.meta['category']['category_path']
            product['ProductName'] = self.extract(
                response.xpath('//span[@itemprop="name"]/text()'))
            product['PicURL'] = self.extract(
                response.xpath('//div[@class="main-image"]/a/img/@src'))

            # Prefer the brand link; fall back to the "Developer" label row.
            manufacturer = self.extract(
                response.xpath('//div[@itemprop="brand"]//a/text()'))
            if not manufacturer:
                manufacturer = self.extract_all(response.xpath(
                    '//div[@class="label"][contains(text(),"Developer")]'
                    '/following-sibling::div[@class="value"]//text()'))
            product['ProductManufacturer'] = manufacturer
            yield product

            mpn_value = self.extract(response.xpath(
                '//div[@class="label"][contains(text(),"Manufacturer")]'
                '/following-sibling::div[@class="value"]/text()'))
            if mpn_value:
                mpn_item = ProductIdItem()
                mpn_item['ProductName'] = product["ProductName"]
                mpn_item['ID_kind'] = "MPN"
                mpn_item['ID_value'] = mpn_value
                yield mpn_item

            more_reviews_href = self.extract(
                response.xpath('//a[@class="more"]/@href'))
            if more_reviews_href:
                # The reviews are scraped from the dedicated page instead of
                # this one, so hand the product over and stop here.
                follow_up = Request(
                    url=get_full_url(response, more_reviews_href),
                    callback=self.parse_product)
                follow_up.meta['product'] = product
                yield follow_up
                return

        for node in review_nodes:
            raw_date = self.extract(
                node.xpath('.//div[@class="author"]/text()[last()]'))
            rating_alt = self.extract(
                node.xpath('.//span[@class="ratingImage"]/img/@alt'))

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['TestDateText'] = date_format(raw_date, '')
            # Only the first space-separated token of the alt text is kept.
            user_review['SourceTestRating'] = rating_alt.split(' ')[0]
            user_review['Author'] = self.extract(
                node.xpath('.//div[@class="author"]/b/text()'))
            user_review['TestTitle'] = self.extract(
                node.xpath('.//div[@class="title"]/text()[last()]'))
            user_review['TestSummary'] = self.extract_all(
                node.xpath('.//div[@class="body"]//text()'))
            yield user_review
Ejemplo n.º 25
0
    def parse_product_json(self, response):
        """Yield category, product and id items from the page's JSON-LD data.

        Nothing is yielded when the page has no ``Product`` JSON-LD node or
        when that node has no ``category``. The category item is always
        yielded; the product and its id items only follow when
        ``should_skip_category`` does not reject the category.

        Price and SKU id items are only yielded when the corresponding value
        is actually present in the JSON-LD data.
        """
        product_json_ld = extruct_helper.extract_json_ld(
            response.body, 'Product')
        if not product_json_ld:
            return

        raw_category = product_json_ld.get('category', '')
        if not raw_category:
            return

        # The JSON-LD category uses '/' between levels; items use ' | '.
        ocn = ' | '.join(raw_category.split('/'))
        category = CategoryItem()
        category['category_path'] = ocn
        yield category

        if self.should_skip_category(category):
            return

        product = extruct_helper.product_item_from_product_json_ld(
            product_json_ld)
        product['source_id'] = self.spider_conf['source_id']
        product['TestUrl'] = response.url
        product['source_internal_id'] = product_json_ld.get('productID', '')
        product['OriginalCategoryName'] = ocn
        yield product

        # Product Price Item
        # ----------------------------------------
        # ``offers`` may be a single offer object or a list of them; use the
        # first offer in the latter case.
        offers = product_json_ld.get('offers') or {}
        if isinstance(offers, (list, tuple)):
            offers = offers[0] if offers else {}
        if not isinstance(offers, dict):
            offers = {}

        price = offers.get('price', '')
        if price not in (None, ''):
            currency = offers.get('priceCurrency') or ''
            # ``price`` may be a number rather than text, so format it
            # instead of concatenating strings.
            price_str = (u'%s %s' % (price, currency)).strip()
            yield ProductIdItem.from_product(product,
                                             kind='price',
                                             value=price_str)

        # Product SKU Item
        # ----------------------------------------
        sku_str = product_json_ld.get('sku', '')
        if sku_str not in (None, ''):
            yield ProductIdItem.from_product(product,
                                             kind='SKU',
                                             value=sku_str)
Ejemplo n.º 26
0
    def parse_product(self, response):
        """Yield the product, its MPN id and the customer reviews of a page.

        The product is always yielded. An MPN ``ProductIdItem`` follows when
        the page exposes a model value, and one ``ReviewItem`` is yielded for
        every customer review block on the page.

        Review dates written relative to today ("N day(s) ...") are converted
        to an absolute ``YYYY-MM-DD`` date; any other date text is passed to
        ``date_format``.
        """
        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category'][
            'category_path']
        product['ProductName'] = self.extract(
            response.xpath('//h1/span/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//img[@itemprop="image"]/@src'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//span[@class="brand-logo"]/img/@alt'))
        product['source_internal_id'] = self.extract(
            response.xpath('//span[@itemprop="productid"]/text()'))
        yield product

        id_value = self.extract(
            response.xpath('//span[@itemprop="model"]/text()'))
        if id_value:
            product_id = ProductIdItem()
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = id_value
            product_id['source_internal_id'] = product['source_internal_id']
            yield product_id

        reviews = response.xpath(
            '//div[contains(@class,"customer-review-item")]')

        for review in reviews:
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            date = self.extract(review.xpath('.//li[@class="date"]/text()'))
            # ``\d+`` is required here: a single ``\d`` would only capture the
            # last digit, turning e.g. "12 days" into 2 days.
            date_match = re.search(r'(\d+) day', date)
            if date_match:
                review_date = datetime.date.today() - datetime.timedelta(
                    days=int(date_match.group(1)))
                user_review['TestDateText'] = review_date.strftime('%Y-%m-%d')
            else:
                user_review['TestDateText'] = date_format(date, '')
            user_review['SourceTestRating'] = self.extract(
                review.xpath('.//div[@class="rating-score"]/text()'))
            user_review['Author'] = self.extract(
                review.xpath('.//li[@class="name"]/text()'))
            user_review['TestTitle'] = self.extract(
                review.xpath('.//h3/text()'))
            user_review['TestSummary'] = self.extract_all(
                review.xpath('.//p/text()|.//span[@class="hidden"]/text()'))
            yield user_review
Ejemplo n.º 27
0
    def parse_product(self, response):
        """Yield the product, its price id and the first review-page request.

        The product is built from the page markup and yielded first. A price
        ``ProductIdItem`` follows only when a price node exists and its text
        parses as a number. Finally a request for the first page of reviews
        is yielded, carrying the product name, the item id, the page number
        and the latest already-stored review date in ``meta``; ``max_idx`` is
        added when an anchor with a review count is found.
        """
        category = response.meta['category']
        soup = BeautifulSoup(response.body, "lxml")
        item_id = response.url.split('/')[-1].strip()
        source_id = self.spider_conf['source_id']

        brand_link = soup.find('a', {'id': 'WMItemBrandLnk'})
        image = soup.find('img', {'class': 'product-image'})

        product = ProductItem()
        # Set explicitly so the price id item below can copy it.
        product['source_id'] = source_id
        product['source_internal_id'] = item_id
        product['ProductName'] = soup.find('h1', {
            'itemprop': 'name'
        }).text.strip()
        product['ProductManufacturer'] = (
            brand_link.text.strip() if brand_link is not None else '')
        product['OriginalCategoryName'] = category['category_path']
        # The picture is optional: a missing image must not abort the parse.
        product['PicURL'] = (
            image.get('src', '').strip() if image is not None else '')
        product['TestUrl'] = response.url
        yield product

        price = soup.find('div', {'itemprop': 'price'})
        if price is not None:
            # Drop the currency sign and thousands separators before parsing.
            price_text = price.text.replace('$', '').replace(',', '').strip()
            try:
                amount = float(price_text)
            except ValueError:
                # Not a plain number: emit no price id rather than a
                # half-filled item.
                amount = None
            if amount is not None:
                product_id = ProductIdItem()
                product_id['source_id'] = product['source_id']
                product_id['ProductName'] = product['ProductName']
                product_id['source_internal_id'] = product['source_internal_id']
                product_id['ID_kind'] = 'price'
                # Stored with two decimals and a decimal comma.
                product_id['ID_value'] = format(
                    round(amount, 2), ".2f").replace('.', ',')
                yield product_id

        latest_review_date = get_latest_user_review_date_by_sii(
            self.mysql_manager, source_id, item_id)

        review_page = 1
        reviews_link = reviews_link_pattern % (item_id, str(review_page))
        request = Request(reviews_link, callback=self.parse_review)
        request.meta['ProductName'] = product['ProductName']
        request.meta['item_id'] = item_id
        request.meta['review_page'] = review_page
        request.meta['latest_review_date'] = latest_review_date
        anchors = soup.find_all('a', {'class': 'js-product-anchor'})
        for anchor in anchors:
            if 'reviews' in anchor.text:
                # The anchor text is the review count followed by "reviews".
                request.meta['max_idx'] = int(
                    anchor.text.replace('reviews', '').strip())
                break
        yield request
Ejemplo n.º 28
0
    def parse_product(self, response):
        """Yield category, product id, product and the first review request.

        A ``CategoryItem`` is yielded when the breadcrumb provides a category
        name. Everything else depends on finding the internal product id in
        the inline script that mentions ``productId``: without a match no
        product is yielded at all. With a match the id item, the product and
        a request for the first review page (offset 0) are yielded, the
        request carrying the latest stored review date in its ``meta``.
        """
        product = self.init_item_by_xpaths(response, "product", {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "ProductName": "//h1//text()",
            "OriginalCategoryName": "//li[contains(@class, 'item category')][last()]/a/text()",
            "ProductManufacturer": "//th[@class='col label' and text()='Brand']/"
                                   "following-sibling::*/text()",
        })

        if product.get('OriginalCategoryName', ''):
            category_href = self.extract(response.xpath(
                "//li[contains(@class, 'item category')][last()]/a/@href"))
            category = CategoryItem()
            category['category_url'] = get_full_url(response, category_href)
            category['category_leaf'] = product['OriginalCategoryName']
            # Only the leaf is scraped, so the path is the leaf itself.
            category['category_path'] = category['category_leaf']
            yield category

        script_text = self.extract(response.xpath(
            "//script[@type='text/javascript']"
            "[contains(text(),'productId')]/text()"))
        id_match = re.search(self.source_internal_id_re, script_text)
        if not id_match:
            return

        internal_id = id_match.group(1).upper()
        product["source_internal_id"] = internal_id

        id_item = ProductIdItem()
        id_item['source_internal_id'] = internal_id
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = "richersounds_id"
        id_item['ID_value'] = internal_id
        yield id_item
        yield product

        review_params = dict(self.bv_base_params, bv_id=internal_id, offset=0)
        review_request = Request(url=self.get_review_url(**review_params),
                                 callback=self.parse_reviews)
        review_request.meta.update({
            'last_user_review': incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                internal_id),
            'bv_id': internal_id,
            'product': product,
            'filter_other_sources': False,
        })
        yield review_request
Ejemplo n.º 29
0
    def parse_items(self, response):
        """Yield the product, its professional review and an optional price.

        Product and review are initialised from xpaths and share the
        ``source_internal_id`` taken from the seventh '/'-separated segment
        of the URL. When no product name is found on the page the review
        title is used instead. A price id item is yielded only when the first
        space-separated token of the price text consists of digits.
        """
        product_xpaths = {
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductName": "//div[@class='news-single-item']/dl/dd[1]/text()"
        }

        review_xpaths = {
            "TestSummary": "//div[@class='news-single-teaser']/h3/text()",
            "Author": "//meta[@name='author']/@content",
            "TestTitle": "//div[@class='news-single-item']/h2/text()",
            "ProductName": "//div[@class='news-single-item']/dl/dd[1]/text()",
            "TestVerdict":
            "(//div[@class='news-single-text']/p/text())[last()]"
        }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        source_internal_id = str(response.url).split("/")[6]
        review['source_internal_id'] = source_internal_id
        product['source_internal_id'] = source_internal_id

        # Fall back to the review title when the page has no product name.
        if not review['ProductName']:
            review['ProductName'] = review['TestTitle']

        if not product['ProductName']:
            product['ProductName'] = review['TestTitle']

        review["DBaseCategoryName"] = "PRO"

        date = self.extract(
            response.xpath("//div[@class='news-single-timedata']/text()"))
        review['TestDateText'] = date_format(date, '%d.%m.%Y')

        yield product
        yield review

        price = self.extract(
            response.xpath("//div[@class='news-single-item']/dl/dd[2]/text()"))
        if price:
            # Work on the text directly: encoding to bytes and wrapping the
            # result in str() turns it into "b'...'" on Python 3, which
            # never passes the isdigit() check.
            pricevalue = price.split(' ')[0]
            if pricevalue.isdigit():
                product_id = ProductIdItem()
                product_id['ID_kind'] = 'price'
                product_id['ID_value'] = pricevalue
                product_id['ProductName'] = product['ProductName']
                product_id['source_internal_id'] = product[
                    'source_internal_id']
                yield product_id
Ejemplo n.º 30
0
    def parse(self, response):
        """Yield id, category, product and review items for a product page.

        Must only be used on product pages. The category path is built from
        the breadcrumb links after the "home" link, without the last one.
        After the static items are yielded, the page is opened in a Selenium
        browser, the rating link is clicked and the resulting selector is
        handed to ``parse_reviews``, whose items are yielded in turn.
        """
        category_xpaths = {
            "category_leaf":
            "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a[last()-1]/text()"
        }
        category_path_xpath = "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a/text()"

        product_xpaths = {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "source_internal_id": "//form[@id='productSheet']/@data-product",
            "ProductName": "//div[@itemprop='name']/h1/text()",
            "ProductManufacturer": "//*[@class='nameBrand']/text()"
        }
        category_path_selector = response.xpath(category_path_xpath)
        # Drop the last breadcrumb entry; the leaf xpath above likewise
        # targets the second-to-last link.
        category_path_selector = category_path_selector[:-1]

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        category["category_path"] = self.extract_all(category_path_selector,
                                                     separator=' | ')

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product["OriginalCategoryName"] = category["category_path"]

        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "conforama_fr_id"
        product_id['ID_value'] = product["source_internal_id"]
        yield product_id

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        reviews_xpath = "//a[@id='rating']"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.click(reviews_xpath)

            # parse_reviews reads its context from response.meta.
            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector=selector):
                yield review