Ejemplo n.º 1
0
    def parse_product(self, response):
        """Parse a product page into a product item, its IDs and a review request.

        Yields, in order: the ``ProductItem``; a ``sku`` product-id item when
        a SKU was extracted; an ``MPN`` product-id item when a product ID was
        extracted; and a ``Request`` for the review page whose callback is
        ``self.parse_reviews`` and whose ``meta['product']`` is the product.
        """
        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category']['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1[contains(@class, "title")]/text()'))
        product['PicURL'] = self.extract(response.xpath('//meta[@property="og:image"]/@content'))
        product['ProductManufacturer'] = self.extract(
            response.xpath(self.brand_xpath))
        product['source_internal_id'] = self.extract(
            response.xpath('//span[@class="details" and text()="SKU"]/following::span/text()'))
        yield product

        if product['source_internal_id']:
            yield self.product_id(product, kind='sku', value=product['source_internal_id'])

        id_value = self.extract(response.xpath('//span[@itemprop="productID"]/text()'))
        if id_value:
            yield self.product_id(product, kind='MPN', value=id_value)

        # The review URL is built from the last URL segment without its
        # ".aspx" extension. str.rstrip('.aspx') would instead remove any
        # trailing run of the characters ".", "a", "s", "p", "x"
        # (e.g. "lamps.aspx" -> "lam"), so the suffix is cut off explicitly.
        page_name = response.url.split('/')[-1]
        if page_name.endswith('.aspx'):
            page_name = page_name[:-len('.aspx')]
        review_url = self.review_url_prefix + page_name

        request = Request(url=get_full_url(response, review_url), callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
Ejemplo n.º 2
0
    def parse_review(self, response):
        """Build a review item and its product item from a review page.

        Title, author and summary come from ``init_item_by_xpaths``; the
        product name is derived from the review title by removing
        review/video wording and site branding. Yields the review first,
        then the product.
        """
        review = self.init_item_by_xpaths(response, "review", {
            "TestTitle": "//meta[@property='og:title']/@content",
            "Author": "//div[@class='meta']/a/text()",
            "TestSummary": "//meta[@name='description']/@content"
        })
        product = ProductItem()

        # Fall back to the Open Graph description when the plain one is empty.
        if not review['TestSummary']:
            fallback_xpath = "//meta[@property='og:description']/@content"
            review['TestSummary'] = self.extract(response.xpath(fallback_xpath))

        page_url = response.url

        # The fifth "/"-separated piece of the URL is used as the internal id.
        source_id = str(page_url).split('/')[4]
        review['source_internal_id'] = source_id
        product['source_internal_id'] = source_id

        # Derive the product name from the (UTF-8 encoded) review title.
        raw_title = (review['TestTitle']).encode('utf-8')
        if 'review' in raw_title:
            name = raw_title.replace(" review", "")
        elif 'Review' in raw_title:
            name = raw_title.replace(" Review", "")
        elif 'Video' in raw_title:
            name = raw_title.replace(" Video", "").split(":")[0]
        elif ':' in raw_title:
            name = str(raw_title).split(":")[0]
        else:
            name = raw_title

        # Remove the remaining boilerplate fragments, in this exact order.
        boilerplate = (
            " - Carryology - Exploring better ways to carry",
            " Video",
            "Drive By",
            ":",
            " |",
            " Carryology",
        )
        for fragment in boilerplate:
            name = name.replace(fragment, "")

        review['ProductName'] = name
        product['ProductName'] = name

        # The rating and its scale are only set when a score is on the page.
        score = self.extract(
            response.xpath("//div[@class='bar']/span[@class='score']/text()"))
        if score:
            review['SourceTestRating'] = score
            review['SourceTestScale'] = '10'
        review['TestUrl'] = page_url

        raw_date = self.extract(
            response.xpath("//div[@class='meta']/text()[2]"))
        cleaned_date = str(raw_date).lstrip(", ")
        review['TestDateText'] = date_format(cleaned_date, "%B %d, %Y")
        review['DBaseCategoryName'] = 'PRO'

        product['TestUrl'] = page_url
        product['OriginalCategoryName'] = self.extract(
            response.xpath("//div[@class='breadcrumbs']//span/text()"))
        product['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))

        yield review
        yield product
Ejemplo n.º 3
0
    def parse_items(self, response):
        """Yield the product item, then a product-id item per identifier found.

        Two identifiers are looked up after the product has been yielded: the
        price (stored with ``ID_kind`` "price") and the EAN. Each is emitted
        only when the corresponding XPath produced a value.
        """
        page_url = response.url

        product = ProductItem()
        product['TestUrl'] = page_url
        og_title = self.extract(
            response.xpath('//meta[@property="og:title"]/@content'))
        product['ProductName'] = og_title.replace(" | EP:", "")
        product['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        product['ProductManufacturer'] = self.extract(
            response.xpath("//div[@class='product-details-left']/a//@title"))
        product['source_internal_id'] = str(page_url).split("/")[5]
        yield product

        # (ID kind, XPath, clean-up applied to the extracted text)
        id_sources = (
            ("price",
             "//div/div[@class='product-details-price']//div/text()",
             lambda text: text.replace(".", "").rstrip(",-")),
            ("EAN",
             "//div[@class='product-flixdata']/@data-ean",
             lambda text: text),
        )
        for kind, xpath, clean in id_sources:
            extracted = self.extract(response.xpath(xpath))
            if not extracted:
                continue
            product_id = ProductIdItem()
            product_id['source_internal_id'] = product["source_internal_id"]
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = kind
            product_id['ID_value'] = clean(extracted)
            yield product_id
Ejemplo n.º 4
0
    def parse(self, response):
        """Yield the product, the item from ``get_rm_kidval``, then the reviews.

        Product fields are read from the raw body with BeautifulSoup. Reviews
        come from ``self._parse_reviews`` over a Selenium-driven page; the
        "see all reviews" link is clicked first when the response contains it.
        """
        see_all_xpath = "//a[contains(@class,'seeAllReviews')]"
        soup = BeautifulSoup(response.body, "lxml")

        product = ProductItem()
        # The internal id is the second-to-last "/"-separated piece of the URL.
        product['source_internal_id'] = response.url.split('/')[-2].strip()

        name_tag = soup.find('span', {'itemprop': 'name'})
        product['ProductName'] = name_tag.text.strip()

        maker_tag = soup.find('span', {'itemprop': 'manufacturer'})
        product['ProductManufacturer'] = maker_tag.text.strip()

        # Join the breadcrumb titles into a single category path.
        breadcrumb = soup.find('ul', {'class': 'Breadcrumb-list'})
        crumbs = breadcrumb.find_all('span', {'itemprop': 'title'})
        product['OriginalCategoryName'] = ' > '.join(
            [crumb.text.strip() for crumb in crumbs])

        preview = soup.find('img', {'class': 'js-ProductVisuals-imagePreview'})
        product['PicURL'] = preview['src'].strip()
        product['TestUrl'] = response.url

        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            if response.xpath(see_all_xpath):
                selector = browser.click(see_all_xpath)
            for review in self._parse_reviews(selector, product, browser):
                yield review
Ejemplo n.º 5
0
    def parse_product(self, response):
        """Yield the product, its internal-id item and the review request.

        The product name and picture come from the page's JSON-LD "Product"
        entry; when there is none, nothing is yielded. The internal id is the
        last segment of the URL path.
        """
        category_path = response.meta['category']['category_path']

        json_ld = extruct_helper.extract_json_ld(response.text, 'Product')
        if not json_ld:
            # TODO: add fallback plan?
            return

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category_path
        product['ProductName'] = json_ld.get('name', '')
        product['PicURL'] = json_ld.get('image', '')
        # str.split always returns at least one element, so [-1] is safe.
        product["source_internal_id"] = urlparse(response.url).path.split('/')[-1]
        yield product

        yield self.product_id(product,
                              kind='reevoo_internal_id',
                              value=product['source_internal_id'])

        # TODO: test if the url is valid or not?
        reviews_request = Request(
            self.review_url_format.format(product["source_internal_id"]),
            callback=self.parse_review)
        reviews_request.meta['product'] = product
        yield reviews_request
Ejemplo n.º 6
0
    def parse_product(self, response):
        """Yield the product, its user reviews and a request for the pro review.

        Nothing is yielded when the page has no review section. The expert
        review link, when present, is followed with ``self.parse_review`` and
        carries the product in ``meta['product']``.
        """
        review_sections = response.xpath('//section[article[contains(@class,"review")]]')
        if not review_sections:
            return

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = 'Cell Phones'
        product['ProductName'] = self.extract(response.xpath('//meta[@itemprop="name"]/@content'))
        og_image = self.extract(response.xpath('//meta[@property="og:image"]/@content'))
        product['PicURL'] = get_full_url(response, og_image)
        product['ProductManufacturer'] = self.extract(response.xpath('//meta[@itemprop="brand"]/@content'))
        yield product

        for article in review_sections.xpath('./article[@itemprop="review"]'):
            posted = self.extract(article.xpath('.//span[@class="time"]/text()'))

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['TestDateText'] = date_format(posted, '')
            user_review['SourceTestRating'] = self.extract(
                article.xpath('.//meta[@itemprop="ratingValue"]/@content'))
            user_review['Author'] = self.extract(
                article.xpath('.//span[@itemprop="author"]/text()'))
            user_review['TestPros'] = self.extract_all(
                article.xpath('.//div[contains(@class,"positives")]/text()'), '; ')
            user_review['TestCons'] = self.extract_all(
                article.xpath('.//div[contains(@class,"negatives")]/text()'), '; ')
            yield user_review

        expert_link = self.extract(
            review_sections.xpath('./article[contains(@class,"expert")]/div/a/@href'))
        if expert_link:
            expert_request = Request(url=get_full_url(response, expert_link),
                                     callback=self.parse_review)
            expert_request.meta['product'] = product
            yield expert_request
Ejemplo n.º 7
0
    def parse_product(self, response):
        """Yield the product item and a request for its Bazaarvoice reviews.

        The review request is only issued when a SKU was extracted, because
        the SKU is part of the review URL. The request's callback is
        ``self.parse_review`` and the product travels in ``meta['product']``.
        """
        product = ProductItem()
        # Function-call form prints the same text on Python 2 and is also
        # valid syntax on Python 3 (the bare ``print x`` statement is not).
        print(response.url)
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = self.extract_all(
            response.xpath('//ol[@id="breadcrumb-list"]/li/a/text()'), "->")
        product['ProductName'] = self.extract(
            response.xpath('//div[@class="type-subhead-alt-regular"]//text()'))
        product['PicURL'] = self.extract(
            response.xpath(
                '//div[@data-slide-number="0"]/div[@class="zoomable hammer-wrapper"]/img/@data-img-path'
            ))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//meta[@id="schemaorg-brand-name"]/@content'))
        product['source_internal_id'] = self.extract(
            response.xpath(
                '//span[@id="sku-value" and @itemprop="productID"]/text()'))
        yield product

        sku = product['source_internal_id']
        if not sku:
            # Without a SKU the review URL would be malformed (or the string
            # concatenation would fail), so no review request is made.
            return

        request = Request(url="http://bestbuy.ugc.bazaarvoice.com/3545w/" +
                          sku +
                          "/reviews.djs?format=embeddedhtml",
                          callback=self.parse_review)
        request.meta['product'] = product
        yield request
Ejemplo n.º 8
0
    def parse_product(self, response):
        """Yield the product, one EAN item per listed EAN, and the review request.

        The EAN text is treated as a comma-separated list. The review request
        is issued only when a review link is found; it is handled by
        ``self.parse_reviews`` and carries the product in ``meta['product']``.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category']['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1//text()'))
        product['PicURL'] = self.extract(response.xpath('//div[@class="data"]/div/img/@src'))
        product['ProductManufacturer'] = self.extract(response.xpath(
            "//strong[contains(@class,'property-name') and contains(text(),'Hersteller')]/following-sibling::span/a[1]/text()"))
        yield product

        ean_text = self.extract(response.xpath('//strong[contains(text(),"EAN")]/parent::div/span/text()'))
        if ean_text:
            for ean in ean_text.split(','):
                productid = ProductIdItem()
                productid['ProductName'] = product["ProductName"]
                productid['ID_kind'] = "EAN"
                productid['ID_value'] = ean.strip(' ')
                yield productid

        review_link = self.extract(response.xpath(
            "//div[@id='product-head-reviews']//a[@class='headbutton']/@href"))
        if review_link:
            request = Request(url=get_full_url(response, review_link),
                              callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
Ejemplo n.º 9
0
    def parse_review(self, response):
        """Yield the product item and then the user review item for a page.

        The category is taken from ``response.meta['category']``. The review
        uses a fixed rating scale of "5"; no rating value is extracted.
        """
        # "hearder" was a typo for the HTML <header> element.
        product_name_xpath = "//header[@class='gutter-top']/h1[@itemprop='name']/text()"
        # <meta> elements carry their value in @content, not in a text node.
        pic_url_xpath = "//meta[@property='og:image']/@content"

        product = ProductItem()
        product['ProductName'] = self.extract(response.xpath(product_name_xpath))
        product['OriginalCategoryName'] = response.meta['category']
        product['PicURL'] = self.extract(response.xpath(pic_url_xpath))

        yield product

        test_title_xpath = "//meta[@property='og:title']/@content"
        # The original expression lacked the "/" step separator and the "@"
        # on the attribute test, which made it an invalid XPath.
        test_summary_xpath = ("//div[@class='segment-article gutter-bottom-lg']"
                              "/div[@class='row']/div/p/text()")
        # NOTE(review): author and date are read from the same element;
        # confirm whether the text needs to be split.
        author_xpath = ".//span[@class='review-created-by']/text()"
        test_date_text_xpath = ".//span[@class='review-created-by']/text()"
        # TODO: the rating is only referenced as an image URL
        # (.//span[@class='review-rating']/img/@src) and is not parsed yet.

        review = ReviewItem()
        review["TestUrl"] = response.url
        review["DBaseCategoryName"] = "USER"
        review["SourceTestScale"] = "5"
        review["ProductName"] = product["ProductName"]
        review["TestTitle"] = self.extract_all(response.xpath(test_title_xpath))
        review["TestSummary"] = self.extract_all(response.xpath(test_summary_xpath), " ")
        review["Author"] = self.extract(response.xpath(author_xpath))
        review["TestDateText"] = self.extract(response.xpath(test_date_text_xpath))

        # The review was previously built but never emitted.
        yield review
Ejemplo n.º 10
0
    def parse_reviews(self, response):
        """Yield a review item and a product item for every article on the page.

        Each article gets its own ``ReviewItem`` and ``ProductItem``. The
        items used to be created once and re-yielded after being overwritten,
        so every consumer still holding an earlier item saw the data of the
        latest article.
        """
        for content in response.xpath('//article[@class="post-content"]'):
            title = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//text()'))
            test_url = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//a/@href'))
            author = self.extract(
                content.xpath('.//span[@itemprop="name"]/text()'))
            date_str = self.extract_all(
                content.xpath('.//meta[@itemprop="datePublished"]/@content'))
            date = date_format(date_str, '%Y-%m-%d')
            pic = self.extract(content.xpath('.//img/@src'))
            summary = self.extract_all(
                content.xpath('.//div[@itemprop="articleBody"]//text()'))
            # The internal id is the second-to-last "/"-separated URL piece.
            sid = test_url.split('/')[-2]

            product = ProductItem()
            product['ProductName'] = title
            product['PicURL'] = pic
            product['source_internal_id'] = sid
            product['TestUrl'] = test_url

            review = ReviewItem()
            review['ProductName'] = title
            review['TestTitle'] = title
            review['TestSummary'] = summary
            review['TestUrl'] = test_url
            review['DBaseCategoryName'] = 'pro'
            review['source_internal_id'] = sid
            review['TestDateText'] = date
            review['Author'] = author

            yield review
            yield product
Ejemplo n.º 11
0
    def parse_product(self, response):
        """Yield the product item, then MPN and EAN items when present.

        Name and category come from ``response.meta['item']``. An identifier
        is emitted only when its extracted text, once stripped, is neither
        empty nor a lone "-".
        """
        item = response.meta['item']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = item['ocn']
        product['ProductName'] = item['name']
        # NOTE(review): get_full_url is given the URL string here, not the
        # response object -- confirm the helper accepts that.
        product['PicURL'] = get_full_url(
            response.url,
            self.extract(response.xpath('//img[@itemprop="image"]/@src')))
        product["ProductManufacturer"] = self.extract(
            response.xpath('//span[@itemprop="brand"]/text()'))
        yield product

        mpn_id_xpath = '//div[text()="Partnumber"]/parent::div/div[contains(@class,"value")]/text()'
        ean_id_xpath = '//div[text()="EAN"]/parent::div/div[contains(@class,"value")]/text()'
        identifiers = (
            ("MPN", self.extract(response.xpath(mpn_id_xpath))),
            ("EAN", self.extract(response.xpath(ean_id_xpath))),
        )

        for kind, value in identifiers:
            # The previous check, ``value.strip() > '-'``, compared strings
            # lexicographically, which also rejected real values starting
            # with a character that sorts before "-" (e.g. "(" or "+").
            if (value or '').strip() in ('', '-'):
                continue
            product_id = ProductIdItem()
            product_id['ProductName'] = item['name']
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id
Ejemplo n.º 12
0
    def parse_product(self, response):
        """Yield the product, an MPN item when available, and the review request.

        When an item number is found it is appended to the product name and
        also emitted as an "MPN" product-id item. The review request targets
        the Bazaarvoice reviews endpoint and is handled by
        ``self.parse_reviews`` with the product in ``meta['product']``.
        """
        name = self.extract(
            response.xpath('//h1[@id="productNameHeader"]/text()'))
        mpn = self.extract(
            response.xpath('//p[contains(text(),"Item Number")]/span/text()'))

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['ocn']
        product['PicURL'] = self.extract(
            response.xpath('//img[@id="_imgLarge"]/@src'))
        product['source_internal_id'] = self.extract(
            response.xpath('//span[@class="jsSwatchSku"]/text()'))
        product["ProductName"] = (name + ' ' + mpn) if mpn else name
        yield product

        if mpn:
            product_id = ProductIdItem()
            product_id['ProductName'] = product["ProductName"]
            product_id['source_internal_id'] = product['source_internal_id']
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = mpn
            yield product_id

        url_template = 'http://api.bazaarvoice.com/data/reviews.json?apiversion=%s&passkey=%s&Filter=ProductId:s%s' \
                       '&Sort=SubmissionTime:desc&Limit=100'
        reviews_url = url_template % (
            self.bv_version, self.bv_key, product['source_internal_id'])

        request = Request(url=reviews_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
Ejemplo n.º 13
0
    def parse_reviews(self, response):
        """Yield the product item followed by one user review per opinion row."""
        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category']['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/a/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//meta[@itemprop="brand"]/@content'))
        product['source_internal_id'] = self.extract(
            response.xpath('//@data-product-id'))
        yield product

        for row in response.xpath('//li[@class="opinion-row"]'):
            published = self.extract(
                row.xpath('.//meta[@itemprop="datePublished"]/@content'))

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            # Fields shared with the product are copied over unchanged.
            for shared in ('ProductName', 'TestUrl', 'source_internal_id'):
                user_review[shared] = product[shared]
            user_review['TestDateText'] = date_format(published, "%Y %m %d")
            user_review['SourceTestRating'] = self.extract(
                row.xpath('.//meta[@itemprop="ratingValue"]/@content'))
            user_review['Author'] = self.extract(row.xpath('.//h4/text()'))
            user_review['TestTitle'] = self.extract(
                row.xpath('.//div[contains(@class,"grade-text")]/text()'))
            user_review['TestSummary'] = self.extract_all(
                row.xpath('.//div[@itemprop="description"]/text()'))
            yield user_review
Ejemplo n.º 14
0
    def parse_product_review(self, response):
        """Yield a PRO review item and its product item for a review page.

        Pages whose Open Graph title contains one of the excluded phrases
        yield nothing. Author, date, internal id and category are read from
        ``response.meta``.
        """
        og_title = response.xpath('//meta[@property="og:title"]/@content').get()

        excluded_phrases = ('Headphone Battle', 'Comparison', 'Comparisons')
        if any(phrase in og_title for phrase in excluded_phrases):
            return

        # REVIEW ITEM ------------------------------------------------------
        review = self.init_item_by_xpaths(response, "review", {
            'TestTitle': '//meta[@property="og:title"]/@content',
            'TestSummary': '//meta[@property="og:description"]/@content',
        })
        review['ProductName'] = self.get_product_name_based_on_title(
            review['TestTitle'])
        review['Author'] = response.meta.get('author')
        review['TestDateText'] = response.meta.get('date')
        review['DBaseCategoryName'] = 'PRO'
        # The id is supplied by the caller through meta; an earlier,
        # disabled variant derived it from the page's shortlink instead.
        review['source_internal_id'] = response.meta.get('sid')

        # PRODUCT ITEM -----------------------------------------------------
        product = ProductItem()
        product['source_internal_id'] = review['source_internal_id']
        product['OriginalCategoryName'] = response.meta.get('cat')
        product['ProductName'] = review['ProductName']
        product['PicURL'] = response.xpath(
            '//meta[@property="og:image"]/@content').get()
        product['TestUrl'] = response.url

        yield review
        yield product
Ejemplo n.º 15
0
    def init_item_by_xpaths(self, response, item_type, fields, selector=None):
        """Create an item of the given type and fill it from XPath expressions.

        Args:
            response: The response being parsed; supplies ``TestUrl`` for
                review and product items and the default selector.
            item_type: One of "review", "product", "product_id", "category".
            fields: Mapping of item field name to the XPath that fills it.
            selector: Optional selector to evaluate the XPaths against; a
                selector over ``response`` is used when this is falsy.

        Returns:
            The populated item.

        Raises:
            Exception: If ``item_type`` is not one of the supported types.
        """
        if not selector:
            selector = Selector(response=response)

        if item_type == "review":
            item = ReviewItem()
        elif item_type == "product":
            item = ProductItem()
        elif item_type == "product_id":
            item = ProductIdItem()
        elif item_type == "category":
            item = CategoryItem()
        else:
            raise Exception("Invalid item type: %s" % item_type)

        if item_type in ('review', 'product'):
            item["TestUrl"] = response.url

        for field in fields:
            # TODO: maybe check field.
            # This must be a tuple of names: the previous single string
            # "TestPros, TestCons" turned the test into a substring match,
            # so any field name contained in that text (e.g. "Test") also
            # took the pros/cons branch.
            if item_type == "review" and field in ("TestPros", "TestCons"):
                item[field] = self.extract_all(selector.xpath(fields[field]),
                                               " ; ")
            else:
                item[field] = self.extract_all(selector.xpath(fields[field]))
        return item
Ejemplo n.º 16
0
    def parse_product(self, response):
        """Yield the product, its MPN item and, when possible, the review request.

        The review page id is taken from the ``onclick`` attribute of the
        reviews link. When the link or the id is missing, no review request
        is issued (previously this raised and aborted the callback).
        """
        product = ProductItem()
        mpn = self.extract(response.xpath('//span[@id="lblMfgPartNo"]/text()'))

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['ocn']
        product['PicURL'] = self.extract(
            response.xpath('//meta[@itemprop="image"]/@content'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//meta[@itemprop="brand"]/@content'))
        product['ProductName'] = product['ProductManufacturer'] + ' ' + mpn
        product['source_internal_id'] = self.extract(
            response.xpath('//span[@id="lblCatalog"]/text()'))
        yield product

        product_id = self.product_id(product)
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = mpn
        product_id['source_internal_id'] = product['source_internal_id']
        yield product_id

        review_onclick = self.extract(
            response.xpath('//a[@name="aReviews"]/@onclick'))
        # ``or ''`` keeps re.findall from failing on a missing attribute.
        id_match = re.findall(r"','([\d]+)'", review_onclick or '')
        if not id_match:
            # No review link / page id on this product page.
            return

        review_url = 'http://www.buydig.com/shop/productreviews.aspx?sku=&pageid=%s&srt=DateNew&lmt=50' % id_match[0]
        request = Request(url=review_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
Ejemplo n.º 17
0
 def parse_reviews(self, response):
     """Yield the product item followed by one user review per review box.

     The product id and category are read from ``response.meta``; the
     picture URL is built from the product id.
     """
     product_id = response.meta['product_id']
     category = response.meta['category']

     product = ProductItem()
     product['TestUrl'] = response.url
     product['OriginalCategoryName'] = category['category_path']
     product['ProductName'] = self.extract(response.xpath('//span[@class="fn"]/text()'))
     product['PicURL'] = 'http://geizhals.at/p/' + product_id + '.jpg'
     product['source_internal_id'] = product_id
     yield product

     for box in response.xpath('//li[contains(@class,"gh_box")]'):
         userbox_text = self.extract(box.xpath('.//div[@class="userbox"]/text()'))
         # strip() removes any leading/trailing "a", "m" and space characters.
         posted = userbox_text.strip('am ')

         user_review = ReviewItem()
         user_review['DBaseCategoryName'] = "USER"
         user_review['ProductName'] = product['ProductName']
         user_review['TestUrl'] = product['TestUrl']
         user_review['TestDateText'] = date_format(posted, "%d.%m.%Y %H:%M")
         user_review['SourceTestRating'] = self.extract(
             box.xpath('.//span[@itemprop="rating"]/text()'))
         user_review['Author'] = self.extract(
             box.xpath('.//span[contains(@class,"nick")]/text()'))
         user_review['TestTitle'] = self.extract(box.xpath('.//h3//text()'))
         user_review['TestSummary'] = self.extract_all(
             box.xpath('.//div[@itemprop="description"]//text()'))
         user_review['source_internal_id'] = product['source_internal_id']
         yield user_review
Ejemplo n.º 18
0
    def parse_product(self, response):
        """Yield the product, an MPN item when available, and the review request.

        The review request goes to the comments endpoint for the product's
        internal id, is handled by ``self.parse_reviews`` and carries the
        product in ``meta['product']``.
        """
        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['ocn']
        product['ProductName'] = self.extract_all(response.xpath(
                '//div[@class="hilo-navegacion"]/descendant::span[last()]/text()'))
        product['PicURL'] = self.extract(response.xpath('//a[@id="imagen-principal-1"]/@href'))
        product['ProductManufacturer'] = self.extract(response.xpath('//span[@itemprop="brand"]/text()'))
        product['source_internal_id'] = self.extract(response.xpath('//li[@id="id_articulo"]/@data-id'))
        yield product

        mpn = self.extract(response.xpath('//span[@itemprop="productID"]/@content'))
        if mpn:
            # Remove only a leading "mpn:" label. str.strip('mpn:') strips
            # any of the characters "m", "p", "n", ":" from BOTH ends, which
            # mangles part numbers such as "mpn:pn-450m" (-> "-450").
            prefix = 'mpn:'
            if mpn.startswith(prefix):
                mpn = mpn[len(prefix):]

            product_id = self.product_id(product)
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = mpn
            product_id['source_internal_id'] = product['source_internal_id']
            yield product_id

        review_url = 'http://www.pccomponentes.com/comentarios/inc_pagina_comentarios.php?id_articulo=%s' \
                     '&orden=recientes' % product['source_internal_id']
        request = Request(url=review_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
Ejemplo n.º 19
0
    def test_product(self):
        """Check that ProductItem stores every field it is given and its _name."""
        # (field, value) pairs, kept in a list so the set/check order is fixed.
        expected_fields = [
            ('source_id', 7654321),
            ('source_internal_id', "Squibobble12387"),
            ('ProductName', "Awesome fake product #1"),
            ('OriginalCategoryName', "Fake products"),
            ('PicURL',
             "http://totes.fake.website.com/fake_products/pics/fake_product_of_awesome.jpg"),
            ('ProductManufacturer', "ACME"),
            ('TestUrl',
             "http://totes.fake.website.com/fake_products/fake_product_of_awesome.html"),
        ]

        product = ProductItem()
        for field, value in expected_fields:
            product[field] = value

        assert product._name == "product", "ProductItem _name field incorrect"
        for field, value in expected_fields:
            assert product[field] == value, \
                "ProductItem %s incorrectly set" % field
Ejemplo n.º 20
0
    def parse_product(self, response):
        """Yield the product, an MPN item when available, and its user reviews.

        The category is read from a ``sectionValue = "..."`` assignment in an
        inline script. Like the picture URL, it is only set when found; a
        missing assignment used to raise IndexError and abort the callback.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        ocn_script = self.extract(response.xpath(
            '//script[@type="text/javascript"][contains(text(),"sectionValue")]/text()'))
        # ``or ''`` keeps re.findall from failing when no script was found.
        ocn_match = re.findall(r'sectionValue = "([^"]+)"', ocn_script or '')
        if ocn_match:
            product['OriginalCategoryName'] = ocn_match[0]
        product['ProductName'] = self.extract(response.xpath('//h1/span[@itemprop="name"]/text()'))
        pic_url = self.extract(response.xpath('//ul/li[1]/img[@itemprop="image"]/@src'))
        if pic_url:
            product['PicURL'] = get_full_url(response, pic_url)
        product['ProductManufacturer'] = 'HP'
        yield product

        mpn = self.extract_list(response.xpath('//span[@class="prodNum"]/text()'))
        if mpn:
            product_id = self.product_id(product)
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = mpn[0]
            yield product_id

        for review in response.xpath('//div[@itemprop="review"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['TestDateText'] = self.extract(review.xpath('./meta[@itemprop="datePublished"]/@content'))
            user_review['SourceTestRating'] = self.extract(review.xpath('.//span[@itemprop="ratingValue"]/text()'))
            user_review['Author'] = self.extract(review.xpath('.//span[@itemprop="author"]/text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//span[@itemprop="name"]/text()'))
            user_review['TestSummary'] = self.extract_all(review.xpath('.//span[@itemprop="description"]//text()'))
            yield user_review
Ejemplo n.º 21
0
    def parse_product_review(self, response):
        """Yield a PRO review item and its product item for a review page.

        Title and summary come from Open Graph meta tags; the date, internal
        id and category come from ``response.meta``. Author, pros and cons
        are set only when the page provides them.
        """
        # REVIEW ITEM ------------------------------------------------------
        review_xpaths = {
            'TestTitle': '//meta[@property="og:title"]/@content',
            'TestSummary': '//meta[@property="og:description"]/@content',
        }

        # Create the review
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # 'ProductName'
        title = review['TestTitle']
        review['ProductName'] = self.get_product_name_based_on_title(title)

        # 'Author'
        # The author text was extracted before but never stored on the item.
        authors_xpath = '//span[@class="entry-info-author"]//text()'
        authors = response.xpath(authors_xpath).get()
        if authors:
            review['Author'] = authors

        # 'TestDateText'
        review['TestDateText'] = response.meta.get('date')

        # 'DBaseCategoryName'
        review['DBaseCategoryName'] = 'PRO'

        # 'TestPros'  'TestCons'
        pros_xpath = '//div[@class="color-green plus-wrapper col-sm-6 '\
                     'col-xs-12"]/ul/li//text()'
        pros = response.xpath(pros_xpath).getall()
        if pros:
            review['TestPros'] = ";".join(pros)

        cons_xpath = '//div[@class="color-red minus-wrapper '\
                     'col-sm-6 col-xs-12"]/ul/li//text()'
        cons = response.xpath(cons_xpath).getall()
        if cons:
            review['TestCons'] = ";".join(cons)

        # 'source_internal_id'
        review['source_internal_id'] = response.meta.get('sid')
        # ------------------------------------------------------------------

        # PRODUCT ITEM -----------------------------------------------------
        product = ProductItem()
        product['source_internal_id'] = review['source_internal_id']
        product['OriginalCategoryName'] = response.meta.get('cat')
        product['ProductName'] = review['ProductName']

        pic_url_xpath = '//meta[@property="og:image"]/@content'
        product['PicURL'] = response.xpath(pic_url_xpath).get()

        product['TestUrl'] = response.url
        # ------------------------------------------------------------------

        yield review
        yield product
Ejemplo n.º 22
0
    def parse_product(self, response):
        """Yield the Lenovo product, its MPN if present, and a review request.

        The review request targets the Bazaarvoice batch API and carries the
        product in ``request.meta['product']`` for ``parse_reviews``.
        """
        item = ProductItem()
        item['TestUrl'] = response.url

        category_sel = response.xpath('//meta[@name="ProductName"]/@content')
        item['OriginalCategoryName'] = self.extract(category_sel)

        name_sel = response.xpath(
            '//h1[@class="bar_3-heading"]/text() |'
            '//h1[@itemprop="name"]/text()')
        item['ProductName'] = self.extract(name_sel)

        image_sources = self.extract_list(response.xpath(
            '(//img[contains(@class,"heroImg")]/@src) |'
            '(//div[@class="productImg"]/img/@src) |'
            '//div[@id="galleria-stage"]//@src'))
        if image_sources:
            # Only the first matching image is kept.
            item['PicURL'] = get_full_url(response, image_sources[0])

        item['ProductManufacturer'] = 'Lenovo'
        yield item

        part_number = self.extract(
            response.xpath('//meta[@name="PartNumber"]/@content'))
        if part_number:
            mpn_item = self.product_id(item)
            mpn_item['ID_kind'] = "MPN"
            mpn_item['ID_value'] = part_number
            yield mpn_item

        category_id = self.extract(
            response.xpath('//meta[@name="metacategoryidentifier"]/@content'))

        # The Bazaarvoice product id is "<category id>_<bv_id>".
        url_template = (
            'http://api.bazaarvoice.com/data/batch.json?passkey=%s&apiversion=%s'
            '&displaycode=%s&resource.q0=reviews&filter.q0=isratingsonly:eq:false'
            '&filter.q0=productid:eq:%s_%s'
            '&filter.q0=contentlocale:eq:en_US&sort.q0=submissiontime:desc&limit.q0=100&offset.q0=0'
        )
        reviews_url = url_template % (
            self.bv_key, self.bv_version, self.bv_code, category_id, self.bv_id)

        reviews_request = Request(url=reviews_url, callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 23
0
    def parse_product(self, response):
        """Yield the Sony product, one MPN item per model id, and a review request.

        The product name is the manufacturer followed by the page's
        ``itemprop="model"`` text. The review request points at the
        ``/reviews-ratings`` sub-page and carries the product in its meta.
        """
        item = ProductItem()

        item['TestUrl'] = response.url
        breadcrumb = response.xpath('//a[contains(@class,"breadcrumb")]/text()')
        item['OriginalCategoryName'] = self.extract(breadcrumb)

        model_name = self.extract(
            response.xpath('//span[@itemprop="model"]/text()'))

        image_url = self.extract(response.xpath(
            '//meta[@name="analytics-product-image_url"]/@content'))
        if image_url:
            item['PicURL'] = get_full_url(response, image_url)

        item['ProductManufacturer'] = 'Sony'
        item['ProductName'] = item['ProductManufacturer'] + ' ' + model_name
        yield item

        raw_ids = self.extract(response.xpath('//@data-model_ids'))
        if raw_ids:
            # The attribute holds a bracketed, comma-separated list.
            for model_id in raw_ids.strip('[').strip(']').split(','):
                mpn_item = ProductIdItem()
                mpn_item['ProductName'] = item["ProductName"]
                mpn_item['ID_kind'] = "MPN"
                mpn_item['ID_value'] = model_id
                yield mpn_item

        reviews_request = Request(url=response.url + '/reviews-ratings',
                                  callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 24
0
    def parse_product(self, response):
        """Yield the product and a Bazaarvoice review request for it.

        Nothing is yielded when the page has no ``addedItemInput`` value,
        which is used as the product's ``source_internal_id``.
        """
        internal_id = self.extract(
            response.xpath('//input[@class="addedItemInput"]/@value'))
        if not internal_id:
            return

        item = ProductItem()
        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = response.meta['category']['category_path']

        name_sel = response.xpath('//h1[@itemprop="name"]/text()')
        item['ProductName'] = self.extract(name_sel)

        image_sel = response.xpath('//img[@itemprop="image"]/@src')
        item['PicURL'] = self.extract(image_sel)

        brand_sel = response.xpath(
            '//span[contains(text(),"Brand")]/following-sibling::text()')
        item['ProductManufacturer'] = self.extract(brand_sel)

        item['source_internal_id'] = internal_id
        yield item

        url_template = (
            'http://api.bazaarvoice.com/data/batch.json?passkey=%s&apiversion=%s'
            '&displaycode=%s&resource.q0=reviews&filter.q0=isratingsonly:eq:false'
            '&filter.q0=productid:eq:%s'
            '&filter.q0=contentlocale:eq:en_US&sort.q0=submissiontime:desc&limit.q0=100&offset.q0=0'
        )
        reviews_url = url_template % (
            self.bv_key, self.bv_version, self.bv_code, item['source_internal_id'])

        reviews_request = Request(url=reviews_url, callback=self.parse_reviews)
        reviews_request.meta['product'] = item
        yield reviews_request
Ejemplo n.º 25
0
    def parse_items(self, response):
        """Yield one product-id item and, when a picture is found, the product.

        The id item carries either the price (``ID_kind == 'price'``) or the
        EAN (``ID_kind == 'EAN'``); when both are present the EAN overwrites
        the price because a single item is reused for both.
        """
        product_id = ProductIdItem()
        price = response.xpath(
                    '//*[@id="priceCol"]/div[2]/text()').extract()
        product_id['ProductName'] = self.extract(
                    response.xpath('//*[@id="cart_quantity"]/div/div[2]/h1/text()'))
        product_id['source_internal_id'] = self.extract(response.xpath('//span[@class="sku-model"]/text()'))
        if price:
            product_id['ID_kind'] = 'price'
            # The price is taken from the string form of the extracted list:
            # its fifth whitespace-separated token, with "u'\xa0" and "*"
            # removed.
            # NOTE(review): the "u'" prefix looks like a Python 2 unicode repr;
            # confirm this still matches on the interpreter in use. Raises
            # IndexError if the string form has fewer than five tokens.
            product_id['ID_value'] = str(price).split()[4].replace(
                "u'\\xa0", "").replace("*", "")
        EAN_id_xpath = '//span[@class="product-ean"]/text()'
        EAN_id = self.extract(response.xpath(EAN_id_xpath))
        if EAN_id:
            # Overwrites the price kind/value set above, if any.
            product_id['ID_kind'] = "EAN"
            product_id['ID_value'] = EAN_id
        yield product_id

        product = ProductItem()
        product['source_internal_id'] = self.extract(response.xpath('//span[@class="sku-model"]/text()'))
        product['ProductName'] = self.extract(response.xpath(
            '//*[@id="cart_quantity"]/div/div[2]/h1/text()'))
        picture = response.xpath(
            '//*[@id="bImageCarousel"]/div/div[1]/a/img').extract()
        if picture:
            # The picture URL is cut out of the string form of the extracted
            # <img> markup: the text after the first "=", minus "alt" and
            # quote characters.
            # NOTE(review): presumably this assumes src is the first attribute
            # of the tag; verify against the page markup.
            product['PicURL'] = str(picture).split('=')[1].replace("alt", "").replace("\'", "").replace(" \"", "").replace("\"", "")
            product['OriginalCategoryName'] = self.extract(response.xpath(
                '//*[@id="bBreadcrumb"]/ol/li/a/span/text()'))
            product['TestUrl'] = response.url
            # NOTE(review): the product is yielded only inside this branch, so
            # pages without a carousel image produce no product item; confirm
            # this is intended.
            yield product
Ejemplo n.º 26
0
    def parse_product(self, response):
        """Yield the product, then one user review per opinion block on the page.

        Opinion blocks containing a link whose text includes
        "Opinia z serwisu Ceneo.pl" are excluded by the XPath.
        """
        item = ProductItem()
        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = response.meta['category']['category_path']

        name_sel = response.xpath('//h1[@itemprop="itemreviewed"]/text()')
        item['ProductName'] = self.extract(name_sel)

        image_sel = response.xpath(
            '//div[@class="productPhotoGallery"]/div/img/@src')
        item['PicURL'] = self.extract(image_sel)

        brand_sel = response.xpath(
            '//div[@class="manufacturer"]//span[not(text()="brak")]/text()')
        item['ProductManufacturer'] = self.extract(brand_sel)
        yield item

        opinions_xpath = (
            '//div[@class="opinion"]'
            '[not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]')
        for opinion in response.xpath(opinions_xpath):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = item['ProductName']
            entry['TestUrl'] = item['TestUrl']

            raw_date = self.extract(opinion.xpath('.//span[@class="date"]/text()'))
            entry['TestDateText'] = date_format(raw_date, "%Y-%m-%d")

            entry['SourceTestRating'] = self.extract(
                opinion.xpath('.//span[@class="points"]/text()'))
            entry['Author'] = self.extract_all(
                opinion.xpath('.//*[@class="profileName"]//text()'))
            entry['TestSummary'] = self.extract_all(
                opinion.xpath('.//div[@class="text"]//text()'))
            entry['TestPros'] = self.extract_all(
                opinion.xpath('.//ul[@class="pluses"]//span/text()'), '; ')
            entry['TestCons'] = self.extract_all(
                opinion.xpath('.//ul[@class="minuses"]//span/text()'), '; ')
            yield entry
Ejemplo n.º 27
0
    def parse_product(self, response):
        """Yield the product, then one user review per ``<article itemscope>``.

        A review's ``TestDateText`` is set only when its date text contains a
        ten-character run of digits and slashes, parsed as ``%d/%m/%Y``.
        """
        item = ProductItem()

        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = response.meta['category']['category_path']
        item['ProductName'] = self.extract(response.xpath('//h1/text()'))

        image_sel = response.xpath('//div[@class="images"]/a/img/@src')
        item['PicURL'] = self.extract(image_sel)

        brand_sel = response.xpath(
            '//span[text()="Marca"]/parent::li/span[@class="value"]/text()')
        item['ProductManufacturer'] = self.extract(brand_sel)

        id_sel = response.xpath('//input[@id="prodId"]/@value')
        item['source_internal_id'] = self.extract(id_sel)
        yield item

        for article in response.xpath('//article[@itemscope]'):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = item['ProductName']
            entry['source_internal_id'] = item['source_internal_id']
            entry['TestUrl'] = item['TestUrl']

            raw_date = self.extract(article.xpath('.//div[@class="date"]/text()'))
            found_dates = re.findall(r'[\d/]{10}', raw_date)
            if found_dates:
                entry['TestDateText'] = date_format(found_dates[0], "%d/%m/%Y")

            entry['SourceTestRating'] = self.extract(
                article.xpath('.//span[@itemprop="ratingValue"]/text()'))
            entry['Author'] = self.extract(article.xpath('.//h2/a/text()'))
            entry['TestTitle'] = self.extract(article.xpath('.//h3/a/text()'))
            entry['TestSummary'] = self.extract_all(
                article.xpath('.//p[@itemprop="reviewBody"]/text()'))
            entry['TestPros'] = self.extract_all(
                article.xpath('.//div[@class="pro"]//li/text()'), '; ')
            entry['TestCons'] = self.extract_all(
                article.xpath('.//div[@class="con"]//li/text()'), '; ')
            yield entry
Ejemplo n.º 28
0
    def parse_product(self, response):
        """Yield the product (and its MPN when found), then its reviews.

        The product name is the manufacturer followed by the MPN when one is
        found, otherwise by the ``itemprop="name"`` heading. Reviews are
        loaded through a ``SeleniumBrowser`` session and produced by
        ``parse_reviews``.
        """
        category_path = response.meta['category']['category_path']

        item = ProductItem()
        item['TestUrl'] = response.url
        item['OriginalCategoryName'] = category_path
        item["ProductManufacturer"] = self.extract(
            response.xpath('//a[@class="brand"]/text()'))
        item['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        item['source_internal_id'] = self.extract(
            response.xpath('//div[@id="pdpFRdivMain"]/@data-productid'))

        part_number = self.extract(response.xpath(
            '//dt[@data-cerberus="txt_pdp_sizetitle"]/parent::dl/dd[not(contains(text(),"Taille"))]/text()'))
        if not part_number:
            heading = self.extract(response.xpath('//h2[@itemprop="name"]/text()'))
            item['ProductName'] = item["ProductManufacturer"] + ' ' + heading
        else:
            item['ProductName'] = item["ProductManufacturer"] + ' ' + part_number
            # The id item is created after the name is set and is emitted
            # before the product itself.
            mpn_item = self.product_id(item)
            mpn_item['ID_kind'] = "MPN"
            mpn_item['ID_value'] = part_number
            yield mpn_item
        yield item

        reviews_href = self.extract(
            response.xpath('//a[@class="read-reviews"]/@href'))
        reviews_url = get_full_url(response, reviews_href)
        with SeleniumBrowser(self, response) as browser:
            page = browser.get(reviews_url, timeout=10)

            # State handed to parse_reviews through the response meta.
            response.meta['browser'] = browser
            response.meta['product'] = item
            response.meta['_been_in_decorator'] = True

            for entry in self.parse_reviews(response, page, incremental=True):
                yield entry
Ejemplo n.º 29
0
    def parse_product(self, response):
        """Yield the product, then one user review per ``itemprop="review"`` item.

        Args:
            response: The product page; ``response.meta['category']`` must
                provide ``category_path``.

        Yields:
            The ``ProductItem`` first, then a ``ReviewItem`` for every
            ``<li itemprop="review">`` on the page.
        """
        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category']['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))

        pic_url = self.extract(response.xpath(
            '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src'))
        # Resolve the image URL only when one was extracted; an empty value
        # has nothing to resolve and must not be stored as a picture URL.
        if pic_url:
            product['PicURL'] = get_full_url(response, pic_url)

        product['ProductManufacturer'] = self.extract(response.xpath(
            '//td[text()="Constructeur"]/following-sibling::td/text()'))
        yield product

        for review in response.xpath('//li[@itemprop="review"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            date = self.extract(
                review.xpath('.//span[@itemprop="datePublished"]/text()'))
            user_review['TestDateText'] = date_format(date, '%d/%m/%Y')
            user_review['SourceTestRating'] = self.extract(
                review.xpath('.//span[@itemprop="ratingValue"]/text()'))
            user_review['Author'] = self.extract(
                review.xpath('.//span[@itemprop="author"]/text()'))
            user_review['TestTitle'] = self.extract(
                review.xpath('.//div[@itemprop="name"]/text()'))
            user_review['TestSummary'] = self.extract_all(
                review.xpath('.//blockquote/text()'))
            yield user_review
Ejemplo n.º 30
0
    def parse_product_review(self, response):
        """Yield the professional review found on a review page, then its product.

        Title, author and summary come from the XPaths passed to
        ``init_item_by_xpaths``. The product name is the page heading with
        " Review" / " review" removed. The source id is the last path
        segment of the URL. The date and category come from
        ``response.meta`` (keys ``date`` and ``cat``) and are ``None`` when
        absent.

        Args:
            response: The downloaded review page.

        Yields:
            The review item, followed by a ``ProductItem`` sharing its
            ``ProductName`` and ``source_internal_id``.
        """
        # REVIEW ----------------------------------------------------------
        review_xpaths = {
            'TestTitle': '//meta[@property="og:title"]/@content',
            'Author': '(//span[@class="author vcard"])[1]/text()',
            'TestSummary': '//meta[@property="og:description"]/@content'
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        header = self.extract(
            response.xpath("//h1[@class='title-primary']/text()"))
        review['ProductName'] = header.replace(" Review",
                                               "").replace(" review", "")

        review['TestDateText'] = response.meta.get('date')
        review['DBaseCategoryName'] = 'PRO'

        # Pros and cons are stored independently, so a review that lists
        # only one of the two keeps it instead of losing both.
        pros_xpath = '//div[@class="review-points__pros"]//li/'\
                     'span[@class="point"]/text()'
        pros = ";".join(response.xpath(pros_xpath).getall())
        if pros:
            review['TestPros'] = pros

        cons_xpath = '//div[@class="review-points__cons"]//li/'\
                     'span[@class="point"]/text()'
        cons = ";".join(response.xpath(cons_xpath).getall())
        if cons:
            review['TestCons'] = cons

        review['source_internal_id'] = response.url.split('/')[-1]

        # PRODUCT -------------------------------------------------------------
        product = ProductItem()
        product['source_internal_id'] = review['source_internal_id']
        product['OriginalCategoryName'] = response.meta.get('cat')
        product['ProductName'] = review['ProductName']

        # Only the first og:image tag is used.
        pic_url_xpath = '(//meta[@property="og:image"])[1]/@content'
        product['PicURL'] = response.xpath(pic_url_xpath).get()

        product['TestUrl'] = response.url

        yield review
        yield product