def parse_items(self, response):
    """Yield the product found on ``response`` plus its price and EAN ids.

    The product item is always yielded first. A ``price`` id item and an
    ``EAN`` id item follow, each only when its XPath extracts a non-empty
    value.
    """
    def first(xpath):
        # Shorthand for the extract-one-value pattern used throughout.
        return self.extract(response.xpath(xpath))

    item = ProductItem()
    item['TestUrl'] = response.url
    og_title = first('//meta[@property="og:title"]/@content')
    item['ProductName'] = og_title.replace(" | EP:", "")
    item['PicURL'] = first('//meta[@property="og:image"]/@content')
    item['ProductManufacturer'] = first(
        "//div[@class='product-details-left']/a//@title")
    # The internal id is the sixth "/"-separated piece of the page URL.
    item['source_internal_id'] = str(response.url).split("/")[5]
    yield item

    def build_id(kind, value):
        # Every id item repeats the product's id and name.
        id_item = ProductIdItem()
        id_item['source_internal_id'] = item["source_internal_id"]
        id_item['ProductName'] = item["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    raw_price = first(
        "//div/div[@class='product-details-price']//div/text()")
    if raw_price:
        # Drop dots and any trailing ',' / '-' characters from the price.
        yield build_id("price", raw_price.replace(".", "").rstrip(",-"))

    ean_code = first("//div[@class='product-flixdata']/@data-ean")
    if ean_code:
        yield build_id("EAN", ean_code)
def parse_product(self, response):
    """Yield the product on ``response`` followed by its MPN and EAN ids.

    The product name and original category name come from the ``item``
    mapping stored in ``response.meta``; picture URL and manufacturer are
    read from the page. An id item is emitted only when the matching spec
    value is non-empty and is not a bare ``-``.
    """
    def has_value(text):
        # The previous check was ``text.strip() > '-'``: a lexicographic
        # comparison that also rejected real values whose first character
        # sorts before '-' (for example '(' or '#'). Presumably the page
        # shows '-' for a missing value, so test for exactly that.
        cleaned = text.strip()
        return bool(cleaned) and cleaned != '-'

    item = response.meta['item']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = item['ocn']
    product['ProductName'] = item['name']
    product['PicURL'] = get_full_url(
        response.url,
        self.extract(response.xpath('//img[@itemprop="image"]/@src')))
    product["ProductManufacturer"] = self.extract(
        response.xpath('//span[@itemprop="brand"]/text()'))
    yield product

    mpn_id_xpath = ('//div[text()="Partnumber"]/parent::div'
                    '/div[contains(@class,"value")]/text()')
    ean_id_xpath = ('//div[text()="EAN"]/parent::div'
                    '/div[contains(@class,"value")]/text()')
    mpn_id = self.extract(response.xpath(mpn_id_xpath))
    ean_id = self.extract(response.xpath(ean_id_xpath))

    # MPN first, then EAN, matching the original emission order.
    for kind, value in (("MPN", mpn_id), ("EAN", ean_id)):
        if has_value(value):
            product_id = ProductIdItem()
            product_id['ProductName'] = item['name']
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id
def parse_product(self, response):
    """Yield product, MPN/EAN ids and the first review request for a page.

    The internal id is captured from ``response.url`` with
    ``self.source_internal_id_re``. When the URL does not match, the
    product and any id items are still yielded (without an internal id),
    but no review request is built because it needs that id.
    """
    product_xpaths = {
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductName": "//h1[@class='productHeading']//text()",
        "ProductManufacturer": "//h1[@class='productHeading']/text()"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)

    match = re.search(self.source_internal_id_re, response.url)
    has_sii = match is not None
    if has_sii:
        product['source_internal_id'] = match.group(1)
    product['TestUrl'] = response.url
    product["OriginalCategoryName"] = response.meta["category"][
        "category_path"]
    yield product

    def build_id(kind, value):
        id_item = ProductIdItem()
        # Previously read unconditionally, which raised KeyError whenever
        # the URL regex had not matched.
        if has_sii:
            id_item['source_internal_id'] = product["source_internal_id"]
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    mpn_value = self.extract(
        response.xpath("//span[@id='productMPN']/text()"))
    if mpn_value:
        yield build_id("MPN", mpn_value)

    ean_value = self.extract(
        response.xpath("//span[@id='productEAN']/text()"))
    if ean_value:
        yield build_id("EAN", ean_value)

    if not has_sii:
        # Without an internal id there is nothing to query reviews for.
        return

    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = product['source_internal_id']
    bv_params['offset'] = 0
    review_url = self.get_review_url(**bv_params)

    request = Request(url=review_url, callback=self.parse_reviews)
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    request.meta['last_user_review'] = last_user_review
    request.meta['bv_id'] = product['source_internal_id']
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_product(self, response):
    """Yield category, product and id items for one product page.

    When no product name can be extracted the page is treated as blocked:
    a retry request from ``self._retry`` is yielded and parsing stops.
    Parsing also stops, yielding nothing further, when
    ``should_skip_category`` accepts the category. Otherwise the method
    yields the category (if a breadcrumb was found), the product, a
    ``tweakers_id`` id item and one ``EAN`` id item per numeric EAN.
    """
    product_name_xpath = "//*[@itemprop='name']/a/text()"
    pic_url_xpath = "//div[@class='imageCarousel']//img/@src"
    manufacturer_xpath = ("//td[@class='spec-index-column'][text()='Merk']"
                          "/following-sibling::td//text()")
    sii_xpath = ("//td[@class='spec-index-column'][text()='Tweakers ID']"
                 "/following-sibling::td//text()")
    category_path_xpath = "//li[@id='tweakbaseBreadcrumbCategory']/a/text()"
    eans_xpath = ("//td[@class='spec-index-column'][text()='EAN']"
                  "/following-sibling::td/span/text()")

    product = ProductItem()
    product['TestUrl'] = response.url
    product['ProductName'] = self.extract(
        response.xpath(product_name_xpath))
    if not product['ProductName']:
        # blocked
        yield self._retry(response.request)
        return

    category_path = self.extract(response.xpath(category_path_xpath))
    if category_path:
        category = CategoryItem()
        category['category_path'] = category_path
        product['OriginalCategoryName'] = category_path
        if self.should_skip_category(category):
            return
        yield category

    product['PicURL'] = self.extract(response.xpath(pic_url_xpath))
    product['ProductManufacturer'] = self.extract(
        response.xpath(manufacturer_xpath))
    product['source_internal_id'] = self.extract(response.xpath(sii_xpath))
    yield product

    def build_id(kind, value):
        id_item = ProductIdItem()
        id_item['source_internal_id'] = product["source_internal_id"]
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    yield build_id("tweakers_id", product["source_internal_id"])

    for ean in self.extract_list(response.xpath(eans_xpath)):
        # Only the conversion is guarded; the old Python-2-only
        # ``except ValueError, e`` also wrapped the yield and bound an
        # unused name.
        try:
            ean_number = int(ean)
        except ValueError:
            continue
        yield build_id("EAN", ean_number)
def parse_product(self, response):
    """Yield category, product, id items and the review request for a page.

    Nothing is yielded when the product-name meta tag is empty, and
    parsing stops without yielding when ``should_skip_category`` accepts
    the category built from the page.
    """
    def meta_content(name):
        # All values on this page are read from <meta name=...> tags.
        return self.extract(
            response.xpath("//meta[@name='%s']/@content" % name))

    item = ProductItem()
    item['TestUrl'] = response.url
    item['ProductManufacturer'] = self.brand_name

    raw_name = meta_content('PS_DTN')
    if not raw_name:
        return
    item['ProductName'] = self.brand_name + ' ' + raw_name

    group_key = meta_content('ISS_GROUP_KEY_NEW')
    if group_key:
        category = CategoryItem()
        category['category_path'] = group_key
        item['OriginalCategoryName'] = group_key
        if self.should_skip_category(category):
            return
        yield category

    item['PicURL'] = meta_content('ISS_IMAGE')
    item['source_internal_id'] = meta_content('PHILIPS.METRICS.PRODUCTID')
    yield item

    # We were using product MPNs as philips_id,
    # do the same thing in alaScrapy spider
    yield ProductIdItem.from_product(item, kind='philips_id',
                                     value=raw_name)

    gtin = meta_content('PS_GTIN')
    if gtin:
        yield ProductIdItem.from_product(item, kind='EAN', value=gtin)

    review_request = self.start_reviews(response, item,
                                        filter_other_sources=False)
    review_request.meta['product'] = item
    yield review_request
def parse(self, response):
    """Yield category, product, id items and reviews for a product page.

    Must be used on product pages only. Items come out in this order:
    category, product, the item from ``get_rm_kidval``, MPN and EAN id
    items (each only when present), then every review produced by
    ``_parse_reviews`` on a Selenium-loaded copy of the page.
    """
    # Must use only product_page
    category = self.init_item_by_xpaths(response, "category", {
        "category_leaf": "//*[@id='moreFrom-catLink']/a/text()",
        "category_path": "//*[@id='moreFrom-catLink']/a/text()"
    })
    product = self.init_item_by_xpaths(response, "product", {
        "PicURL": "(//li[@class='productImageItem'])[1]//img/@src",
        "ProductName": "//h1[@class='productHeading']//text()",
        "ProductManufacturer": "//h1[@class='productHeading']/text()"
    })

    id_match = re.search(self.source_internal_id_re, response.url)
    if id_match:
        product['source_internal_id'] = id_match.group(1)
    product["OriginalCategoryName"] = category["category_path"]

    yield category
    yield product
    yield self.get_rm_kidval(product, response)

    # (ID_kind, XPath) pairs, emitted in this order when a value exists.
    id_sources = (
        ("MPN", "//span[@id='productMPN']/text()"),
        ("EAN", "//span[@id='productEAN']/text()"),
    )
    for kind, xpath in id_sources:
        value = self.extract(response.xpath(xpath))
        if not value:
            continue
        id_item = ProductIdItem()
        id_item['source_internal_id'] = product["source_internal_id"]
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        yield id_item

    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url)
        for review in self._parse_reviews(selector, browser, product):
            yield review
def parse_product(self, response):
    """Yield the product on ``response`` and its id items.

    After the product come a ``prisjakt_id`` id item, a
    ``size_internal_hdd`` id item when the fourth spec row has text, and
    a ``first_publish_date`` id item taken from the fifth spec row, or
    from the sixth when the fifth is not all digits.
    """
    def first(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = first(
        "(//div[@class='Breadcrumbs-sc-11q7umm-0 dsQddb']//text())[last()]")
    product['ProductName'] = first('//h1/text()')
    product['PicURL'] = first('//meta[@property="og:image"]/@content')
    product['ProductManufacturer'] = first(
        "//div[@class='RelatedPage-sc-1i89wok-8 ZlMGA']/a/text()")
    # The id is whatever follows the first "--p" in the route link href.
    route_href = first("(//link[@data-route-id='initial']/@href)[1]")
    product['source_internal_id'] = str(route_href).split("--p")[1]
    yield product

    def build_id(kind, value):
        id_item = ProductIdItem()
        id_item['source_internal_id'] = product["source_internal_id"]
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    yield build_id("prisjakt_id", product["source_internal_id"])

    hdd_size = first("//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][4]/td/text()")
    if hdd_size:
        yield build_id("size_internal_hdd", hdd_size)

    # Try the fifth spec row first and fall back to the sixth; at most
    # one date id is emitted.
    date_xpaths = (
        "//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][5]/td/text()",
        "//tr[@class='TableRow-sc-41ik9-2 dBYNIg'][6]/td/text()",
    )
    for date_xpath in date_xpaths:
        candidate = first(date_xpath)
        if candidate.isdigit():
            yield build_id("first_publish_date", candidate)
            break
def init_item_by_xpaths(self, response, item_type, fields, selector=None):
    """Build an item of ``item_type`` and fill it from XPath expressions.

    Args:
        response: Response the item is built from; its URL is stored as
            ``TestUrl`` on review and product items.
        item_type: One of ``'review'``, ``'product'``, ``'product_id'``
            or ``'category'``.
        fields: Mapping of item field name to the XPath that fills it.
        selector: Optional selector to evaluate the XPaths on; when
            falsy, one is created from ``response``.

    Returns:
        The populated item.

    Raises:
        Exception: If ``item_type`` is not one of the supported names.
    """
    if not selector:
        selector = Selector(response=response)

    if item_type not in ('review', 'product', 'product_id', 'category'):
        raise Exception("Invalid item type: %s" % item_type)

    if item_type == "review":
        item = ReviewItem()
    elif item_type == "product":
        item = ProductItem()
    elif item_type == "product_id":
        item = ProductIdItem()
    else:
        item = CategoryItem()

    if item_type in ('review', 'product'):
        item["TestUrl"] = response.url

    # The original tested ``field in ("TestPros, TestCons")``, which is a
    # substring test against a single string; a real tuple restricts the
    # " ; " separator to exactly these two fields.
    joined_fields = ("TestPros", "TestCons")
    for field, xpath in fields.items():
        # TODO: maybe check field.
        nodes = selector.xpath(xpath)
        if item_type == "review" and field in joined_fields:
            item[field] = self.extract_all(nodes, " ; ")
        else:
            item[field] = self.extract_all(nodes)
    return item
def parse_product(self, response):
    """Yield category, product, id item and review request for a page.

    The category is yielded before the skip check, so a skipped category
    is still emitted. Parsing also stops when no ``source_internal_id``
    could be extracted for the product.
    """
    breadcrumbs = response.xpath(
        "//span[contains(@class, 'breadcrumb')]/a/text()")
    category = CategoryItem()
    category['category_path'] = self.extract_all(breadcrumbs,
                                                 separator=' | ')
    yield category
    if self.should_skip_category(category):
        return

    product = self.init_item_by_xpaths(response, "product", {
        "ProductName": "//h1/text()",
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductManufacturer": "//meta[@property='brand']/@content",
        "source_internal_id": "//meta[@property='product_number']/@content"
    })
    sii = product['source_internal_id']
    if not sii:
        return

    product['OriginalCategoryName'] = category['category_path']
    yield product

    # The site's own product number doubles as the id value.
    id_item = ProductIdItem()
    id_item['source_internal_id'] = sii
    id_item['ProductName'] = product["ProductName"]
    id_item['ID_kind'] = "debenhams_id"
    id_item['ID_value'] = sii
    yield id_item

    review_request = self.start_reviews(response, product,
                                        filter_other_sources=False)
    review_request.meta['product'] = product
    yield review_request
def parse_product(self, response):
    """Yield the product, its EAN ids and a request for its review page.

    One ``EAN`` id item is yielded per comma-separated value found on the
    page. The review request is yielded only when a review link exists
    and carries the product in ``request.meta['product']``.
    """
    def first(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = first('//h1//text()')
    product['PicURL'] = first('//div[@class="data"]/div/img/@src')
    product['ProductManufacturer'] = first(
        "//strong[contains(@class,'property-name') and "
        "contains(text(),'Hersteller')]/following-sibling::span/a[1]/text()")
    yield product

    ean_text = first(
        '//strong[contains(text(),"EAN")]/parent::div/span/text()')
    if ean_text:
        for raw_ean in ean_text.split(','):
            id_item = ProductIdItem()
            id_item['ProductName'] = product["ProductName"]
            id_item['ID_kind'] = "EAN"
            # Only spaces are trimmed from each entry.
            id_item['ID_value'] = raw_ean.strip(' ')
            yield id_item

    review_href = first(
        "//div[@id='product-head-reviews']//a[@class='headbutton']/@href")
    if review_href:
        follow_up = Request(url=get_full_url(response, review_href),
                            callback=self.parse_reviews)
        follow_up.meta['product'] = product
        yield follow_up
def parse_items(self, response):
    """Yield price and EAN id items, then the product, for one page.

    The original reused a single ``ProductIdItem`` for both kinds, so an
    EAN overwrote the price, and the item was yielded even when neither
    kind had been set. Each id kind now gets its own item and is yielded
    only when a value was found.
    """
    name = self.extract(
        response.xpath('//*[@id="cart_quantity"]/div/div[2]/h1/text()'))
    sku = self.extract(response.xpath('//span[@class="sku-model"]/text()'))

    def build_id(kind, value):
        id_item = ProductIdItem()
        id_item['ProductName'] = name
        id_item['source_internal_id'] = sku
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    price = response.xpath('//*[@id="priceCol"]/div[2]/text()').extract()
    if price:
        # NOTE(review): this picks the fifth whitespace-separated token of
        # the *repr* of the extracted list and strips repr artefacts from
        # it; kept unchanged because the page markup is not visible here,
        # but it is fragile -- confirm against a live page.
        price_value = str(price).split()[4].replace(
            "u'\\xa0", "").replace("*", "")
        yield build_id('price', price_value)

    EAN_id = self.extract(
        response.xpath('//span[@class="product-ean"]/text()'))
    if EAN_id:
        yield build_id("EAN", EAN_id)

    product = ProductItem()
    product['source_internal_id'] = sku
    product['ProductName'] = name
    picture = response.xpath(
        '//*[@id="bImageCarousel"]/div/div[1]/a/img').extract()
    if picture:
        # Takes the text after the first '=' in the repr of the extracted
        # <img> markup and removes "alt" and quote characters from it.
        product['PicURL'] = str(picture).split('=')[1].replace(
            "alt", "").replace("\'", "").replace(" \"", "").replace("\"", "")
    product['OriginalCategoryName'] = self.extract(response.xpath(
        '//*[@id="bBreadcrumb"]/ol/li/a/span/text()'))
    product['TestUrl'] = response.url
    yield product
def parse_product(self, response):
    """Yield the product, an optional MPN id and the reviews API request.

    When an item number is present it is appended to the product name and
    also emitted as an ``MPN`` id item right after the product. The final
    request targets the Bazaarvoice reviews endpoint and carries the
    product in ``request.meta['product']``.
    """
    def first(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['ocn']
    heading = first('//h1[@id="productNameHeader"]/text()')
    product['PicURL'] = first('//img[@id="_imgLarge"]/@src')
    product['source_internal_id'] = first(
        '//span[@class="jsSwatchSku"]/text()')
    item_number = first('//p[contains(text(),"Item Number")]/span/text()')

    if item_number:
        product["ProductName"] = heading + ' ' + item_number
    else:
        product["ProductName"] = heading
    yield product

    if item_number:
        mpn_item = ProductIdItem()
        mpn_item['ProductName'] = product["ProductName"]
        mpn_item['source_internal_id'] = product['source_internal_id']
        mpn_item['ID_kind'] = "MPN"
        mpn_item['ID_value'] = item_number
        yield mpn_item

    url_template = (
        'http://api.bazaarvoice.com/data/reviews.json?apiversion=%s&passkey=%s&Filter=ProductId:s%s'
        '&Sort=SubmissionTime:desc&Limit=100')
    reviews_url = url_template % (self.bv_version, self.bv_key,
                                  product['source_internal_id'])
    reviews_request = Request(url=reviews_url, callback=self.parse_reviews)
    reviews_request.meta['product'] = product
    yield reviews_request
def parse_product(self, response):
    """Yield the product, its MPN ids and a request for its reviews page.

    The manufacturer is fixed to ``'Sony'`` and prefixed to the model to
    form the product name. ``PicURL`` is set only when the image meta tag
    has content. One ``MPN`` id item is yielded per comma-separated entry
    of the first ``data-model_ids`` attribute.
    """
    def first(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = first(
        '//a[contains(@class,"breadcrumb")]/text()')
    model = first('//span[@itemprop="model"]/text()')
    image_path = first(
        '//meta[@name="analytics-product-image_url"]/@content')
    if image_path:
        product['PicURL'] = get_full_url(response, image_path)
    product['ProductManufacturer'] = 'Sony'
    product['ProductName'] = product['ProductManufacturer'] + ' ' + model
    yield product

    model_ids = first('//@data-model_ids')
    if model_ids:
        # Strip the surrounding square brackets, then split on commas.
        for model_id in model_ids.strip('[').strip(']').split(','):
            mpn_item = ProductIdItem()
            mpn_item['ProductName'] = product["ProductName"]
            mpn_item['ID_kind'] = "MPN"
            mpn_item['ID_value'] = model_id
            yield mpn_item

    reviews_request = Request(url=response.url + '/reviews-ratings',
                              callback=self.parse_reviews)
    reviews_request.meta['product'] = product
    yield reviews_request
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    The price is the text node following a ``<b>`` element whose text
    contains "Price"; ``None`` is returned when nothing is extracted.
    """
    found = self.extract(response.xpath(
        "//b[contains(text(),'Price')]/following-sibling::text()"))
    if not found:
        return None
    return ProductIdItem.from_product(product, kind='price', value=found)
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    The price is read from the text of ``<p class="cost-text">``;
    ``None`` is returned when nothing is extracted.
    """
    found = self.extract(response.xpath('//p[@class="cost-text"]/text()'))
    if not found:
        return None
    return ProductIdItem.from_product(product, kind='price', value=found)
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    The price is the link text inside the first ``price-msrp`` div;
    ``None`` is returned when nothing is extracted.
    """
    found = self.extract(
        response.xpath("(//div[@class='price-msrp'])[1]/a/text()"))
    if not found:
        return None
    return ProductIdItem.from_product(product, kind='price', value=found)
def product_id(self, product, kind='', value=''):
    """Return a ``ProductIdItem`` of ``kind``/``value`` for ``product``.

    The product name is always copied; ``source_internal_id`` is copied
    only when the product has that key.
    """
    id_item = ProductIdItem()
    id_item['ID_kind'] = kind
    id_item['ID_value'] = value
    id_item['ProductName'] = product["ProductName"]
    if "source_internal_id" in product:
        id_item['source_internal_id'] = product["source_internal_id"]
    return id_item
def parse_items(self, response):
    """Yield the review, optional price id and product of a review page.

    Items are yielded only when the review date is later than
    ``self.stored_last_date``. The price id item is built, and yielded,
    only when a price was extracted; the original yielded ``product_id``
    unconditionally, raising ``UnboundLocalError`` on pages with no
    price.

    NOTE(review): the original layout was flattened onto one line; this
    assumes the date check gates all three yields -- confirm.
    """
    product_xpaths = {
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductManufacturer": "//tr[@class='marke-hersteller']/td/a/text()"
    }
    review_xpaths = {
        "TestSummary": "//div[@id='review_body']/div[1]/p/text()",
        "TestVerdict": "(//div[@id='review_body']/div/p/text())[last()]",
        "TestTitle": "(//title/text())[1]",
        "Author": "//span/meta[@itemprop='author']/@content",
        "TestPros": "//div[@class='list-advantages']/ul/li/div/text()",
        "TestCons": "//div[@class='list-disadvantages']/ul/li/div/text()",
        "SourceTestRating": "//span/meta[@itemprop='ratingValue']/@content"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    productname = self.extract(
        response.xpath("//tr[@class='modell']/td/span/text()"))
    productmanu = product['ProductManufacturer']
    review['ProductName'] = productmanu + " " + productname
    product['ProductName'] = review['ProductName']

    source_internal_id = self.extract(
        response.xpath("//div/meta[@itemprop='productID']/@content"))
    review['source_internal_id'] = source_internal_id
    product['source_internal_id'] = source_internal_id

    if not product['PicURL']:
        # Fall back to the first lazily loaded image on the page.
        product['PicURL'] = self.extract(
            response.xpath("(//div/a/img/@data-src)[1]"))

    if review['SourceTestRating']:
        review['SourceTestScale'] = "5"
    review["DBaseCategoryName"] = "PRO"

    date_text = self.extract(
        response.xpath("//div[@class='offers']/small/text()"))
    # The date is the third space-separated token of that text.
    date = str(date_text).split(" ")[2]
    review['TestDateText'] = date_format(date, '%d.%m.%Y')

    price = self.extract(
        response.xpath("//div[@class='price']/text()")).encode('utf-8')
    product_id = None
    if price:
        product_id = ProductIdItem()
        product_id['ID_kind'] = 'price'
        product_id['ID_value'] = str(price).split(' ')[0]
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = product['source_internal_id']

    review_date = datetime.strptime(review['TestDateText'], "%Y-%m-%d")
    if review_date > self.stored_last_date:
        yield review
        if product_id is not None:
            yield product_id
        yield product
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    The price is the text that precedes the first ``<br>`` of a paragraph
    inside the ``bbcode`` div; ``None`` is returned when nothing is
    extracted.
    """
    found = self.extract(response.xpath(
        '//div[@class="bbcode"]/p/br[1]'
        '/preceding-sibling::text()'))
    if not found:
        return None
    return ProductIdItem.from_product(product, kind='price', value=found)
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    The price text is read from the paragraph following an ``<h2>`` whose
    text contains "Price", and the value stored is the part between the
    first and second ``$`` (or the end of the text). ``None`` is returned
    when no text is found or when it contains no ``$``; the original
    raised ``IndexError`` in the latter case.
    """
    price_xpath = "(//h2[contains(text(),'Price')]"\
                  "/following-sibling::p)/text()"
    price_str = (self.extract(response.xpath(price_xpath))).encode('utf-8')
    if not price_str:
        return None

    # Byte literal: the text was just encoded to UTF-8 bytes.
    pieces = price_str.split(b"$")
    if len(pieces) < 2:
        return None
    return ProductIdItem.from_product(product, kind='price',
                                      value=pieces[1])
def parse_price(self, product, response):
    """Return a ``price`` id item for ``product``, or ``None``.

    Three alternative locations are queried in one XPath union. The
    extracted text is UTF-8 encoded and leading ``$`` characters are
    removed; ``None`` is returned when nothing is extracted.
    """
    price_nodes = response.xpath(
        "//div[@class='price']/a/text()|"
        "//a[@class='price']/text()|"
        "//span[@class='msrp']/text()")
    encoded = (self.extract(price_nodes)).encode('utf-8')
    if not encoded:
        return None
    return ProductIdItem.from_product(product, kind='price',
                                      value=encoded.lstrip('$'))
def parse_product(self, response):
    """Yield product, UPC/MPN ids and one request per full-review link.

    Nothing is yielded when the page has no "read full review" links.
    Each review request carries the product in
    ``request.meta['product']``.
    """
    review_urls = self.extract_list(
        response.xpath('//a[@class="readFullReviewLink"]/@href'))
    if not review_urls:
        return

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="enlargeText"]/a/@href'))
    yield product

    # Spec-table rows are located by label; UPC is emitted before MPN.
    for kind in ("UPC", "MPN"):
        value = self.extract(response.xpath(
            '//td[contains(text(),"%s")]/parent::tr/td[@class=""]/text()'
            % kind))
        if value:
            product_id = ProductIdItem()
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = kind
            product_id['ID_value'] = value
            yield product_id

    anchor = '#tabAnchor'
    for review_url in review_urls:
        # ``str.strip('#tabAnchor')`` removes any of those *characters*
        # from both ends (turning "http://..." into "p://..." and eating
        # trailing letters of the path); drop the literal suffix instead.
        if review_url.endswith(anchor):
            review_url = review_url[:-len(anchor)]
        request = Request(url=get_full_url(response, review_url),
                          callback=self.parse_review)
        request.meta['product'] = product
        yield request
def parse_product(self, response):
    """Yield the review request, category, id items and product.

    The Selenium review request is yielded first, followed by the
    category, the ``officedepot_id`` id item, the ``MPN`` id item and
    finally the product. The request carries the product and the
    ``officedepot_id`` item in its ``meta``.
    """
    def first(xpath):
        return self.extract(response.xpath(xpath))

    category = self.handle_category(
        response, '//div[@id="siteBreadcrumb"]//a')

    product = ProductItem()
    product['TestUrl'] = response.url
    product['ProductName'] = first('//*[@itemprop="name"]/text()')
    product['source_internal_id'] = first(
        '//*[@id="basicInfoCustomerSku"]/text()')
    product['ProductManufacturer'] = first(
        '//*[@id="attributebrand_namekey"]/text()')
    product['PicURL'] = first('//*[@id="mainSkuProductImage"]/@src')
    product['OriginalCategoryName'] = category['category_path']

    def build_id(kind, value):
        id_item = ProductIdItem()
        id_item['source_internal_id'] = product['source_internal_id']
        id_item['ProductName'] = product['ProductName']
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    site_id_item = build_id('officedepot_id', product['source_internal_id'])
    mpn_item = build_id(
        'MPN', first('//*[@id="basicInfoManufacturerSku"]/text()'))

    reviews_template = 'http://reviews.officedepot.com/2563/%s/reviews.htm'
    reviews_request = self.selenium_request(
        url=reviews_template % product['source_internal_id'],
        callback=self.parse_reviews)
    reviews_request.meta['product'] = product
    reviews_request.meta['product_id'] = site_id_item

    yield reviews_request
    yield category
    yield site_id_item
    yield mpn_item
    yield product
def parse_product(self, response):
    """Yield product data and user reviews from a product or review page.

    Nothing happens when the page has no review blocks. On a first visit
    (no product in ``response.meta``) the product and an optional MPN id
    are yielded; if a "more" link exists, a request for it is yielded
    with this same method as callback and parsing stops. Otherwise one
    ``USER`` review item is yielded per review block.
    """
    review_blocks = response.xpath('//div[@class="review"]')
    if not review_blocks:
        return

    product = response.meta['product'] if "product" in response.meta else None

    if not product:
        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category'][
            'category_path']
        product['ProductName'] = self.extract(
            response.xpath('//span[@itemprop="name"]/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//div[@class="main-image"]/a/img/@src'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//div[@itemprop="brand"]//a/text()'))
        if not product['ProductManufacturer']:
            # Fall back to the "Developer" row when no brand is shown.
            product['ProductManufacturer'] = self.extract_all(
                response.xpath(
                    '//div[@class="label"][contains(text(),"Developer")]'
                    '/following-sibling::div[@class="value"]//text()'))
        yield product

        manufacturer_code = self.extract(response.xpath(
            '//div[@class="label"][contains(text(),"Manufacturer")]'
            '/following-sibling::div[@class="value"]/text()'))
        if manufacturer_code:
            mpn_item = ProductIdItem()
            mpn_item['ProductName'] = product["ProductName"]
            mpn_item['ID_kind'] = "MPN"
            mpn_item['ID_value'] = manufacturer_code
            yield mpn_item

        more_href = self.extract(response.xpath('//a[@class="more"]/@href'))
        if more_href:
            # Reviews are read from the linked page instead of this one.
            follow_up = Request(url=get_full_url(response, more_href),
                                callback=self.parse_product)
            follow_up.meta['product'] = product
            yield follow_up
            return

    for block in review_blocks:
        entry = ReviewItem()
        entry['DBaseCategoryName'] = "USER"
        entry['ProductName'] = product['ProductName']
        entry['TestUrl'] = product['TestUrl']
        posted = self.extract(
            block.xpath('.//div[@class="author"]/text()[last()]'))
        entry['TestDateText'] = date_format(posted, '')
        stars = self.extract(
            block.xpath('.//span[@class="ratingImage"]/img/@alt'))
        # The rating is the first space-separated token of the alt text.
        entry['SourceTestRating'] = stars.split(' ')[0]
        entry['Author'] = self.extract(
            block.xpath('.//div[@class="author"]/b/text()'))
        entry['TestTitle'] = self.extract(
            block.xpath('.//div[@class="title"]/text()[last()]'))
        entry['TestSummary'] = self.extract_all(
            block.xpath('.//div[@class="body"]//text()'))
        yield entry
def parse_product_json(self, response):
    """Yield category, product, price and SKU items from JSON-LD data.

    Nothing is yielded when the page has no ``Product`` JSON-LD block or
    the block has no category. The category is always yielded once
    built; product and id items follow only when the category is not
    skipped. Price and SKU id items are emitted only when a value is
    present.
    """
    product_json_ld = extruct_helper.extract_json_ld(
        response.body, 'Product')
    if not product_json_ld:
        return

    raw_category = product_json_ld.get('category', '')
    if not raw_category:
        return

    # "a/b/c" in the JSON-LD becomes the "a | b | c" category path.
    ocn = ' | '.join(raw_category.split('/'))
    category = CategoryItem()
    category['category_path'] = ocn
    yield category

    if self.should_skip_category(category):
        return

    product = extruct_helper.product_item_from_product_json_ld(
        product_json_ld)
    product['source_id'] = self.spider_conf['source_id']
    product['TestUrl'] = response.url
    product['source_internal_id'] = product_json_ld.get('productID', '')
    product['OriginalCategoryName'] = ocn
    yield product

    # Product Price Item
    # ----------------------------------------
    offers = product_json_ld.get('offers', {})
    price = offers.get('price', '')
    currency_str = offers.get('priceCurrency', '')
    if price not in ('', None):
        # %-formatting accepts numeric prices, which the original string
        # concatenation rejected with TypeError; strip() drops the
        # trailing space left when no currency is given.
        price_str = ('%s %s' % (price, currency_str)).strip()
        yield ProductIdItem.from_product(product, kind='price',
                                         value=price_str)

    # Product SKU Item
    # ----------------------------------------
    sku_str = product_json_ld.get('sku', '')
    if sku_str:
        yield ProductIdItem.from_product(product, kind='SKU',
                                         value=sku_str)
def parse_product(self, response):
    """Yield the product, an optional MPN id and the on-page user reviews.

    Review dates written as "<N> day(s) ..." are converted to an absolute
    date by subtracting N days from today; any other text is passed to
    ``date_format``.
    """
    def first(node, xpath):
        return self.extract(node.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = first(response, '//h1/span/text()')
    product['PicURL'] = first(response, '//img[@itemprop="image"]/@src')
    product['ProductManufacturer'] = first(
        response, '//span[@class="brand-logo"]/img/@alt')
    product['source_internal_id'] = first(
        response, '//span[@itemprop="productid"]/text()')
    yield product

    id_value = first(response, '//span[@itemprop="model"]/text()')
    if id_value:
        product_id = ProductIdItem()
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = id_value
        product_id['source_internal_id'] = product['source_internal_id']
        yield product_id

    reviews = response.xpath(
        '//div[contains(@class,"customer-review-item")]')
    for review in reviews:
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = first(review, './/li[@class="date"]/text()')
        # ``\d+`` captures the whole number; the original ``\d`` kept only
        # the last digit, so "12 days" was read as 2 days.
        date_match = re.findall(r'(\d+) day', date)
        if date_match:
            review_date = datetime.date.today() - datetime.timedelta(
                days=int(date_match[0]))
            user_review['TestDateText'] = review_date.strftime('%Y-%m-%d')
        else:
            user_review['TestDateText'] = date_format(date, '')

        user_review['SourceTestRating'] = first(
            review, './/div[@class="rating-score"]/text()')
        user_review['Author'] = first(review, './/li[@class="name"]/text()')
        user_review['TestTitle'] = first(review, './/h3/text()')
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//p/text()|.//span[@class="hidden"]/text()'))
        yield user_review
def parse_product(self, response):
    """Yield the product, an optional price id and the first review request.

    The page is parsed with BeautifulSoup. The price id item is yielded
    only when a price element exists and its text converts to a number;
    the original swallowed every exception with a bare ``except`` and
    then yielded the item even when it had no value. The review request
    carries paging state in ``request.meta``.
    """
    category = response.meta['category']
    soup = BeautifulSoup(response.body, "lxml")
    item_id = response.url.split('/')[-1].strip()

    product = ProductItem()
    product['source_internal_id'] = item_id
    product['ProductName'] = soup.find('h1', {
        'itemprop': 'name'
    }).text.strip()
    brand_link = soup.find('a', {'id': 'WMItemBrandLnk'})
    product['ProductManufacturer'] = (
        brand_link.text.strip() if brand_link else '')
    product['OriginalCategoryName'] = category['category_path']
    product['PicURL'] = soup.find(
        'img', {'class': 'product-image'})['src'].strip()
    product['TestUrl'] = response.url
    yield product

    price = soup.find('div', {'itemprop': 'price'})
    if price:
        try:
            amount = float(price.text.replace('$', ''))
        except (TypeError, ValueError):
            # Unparseable price text: emit no price id rather than an
            # item without a value.
            amount = None
        if amount is not None:
            product_id = ProductIdItem()
            # NOTE(review): ``product['source_id']`` is never assigned in
            # this method; presumably the item class supplies it --
            # confirm.
            product_id['source_id'] = product['source_id']
            product_id['ProductName'] = product['ProductName']
            product_id['source_internal_id'] = product['source_internal_id']
            product_id['ID_kind'] = 'price'
            # Two decimals with a comma as the decimal separator.
            product_id['ID_value'] = format(
                round(amount, 2), ".2f").replace('.', ',')
            yield product_id

    latest_review_date = get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'], item_id)

    review_page = 1
    reviews_link = reviews_link_pattern % (item_id, str(review_page))
    request = Request(reviews_link, callback=self.parse_review)
    request.meta['ProductName'] = product['ProductName']
    request.meta['item_id'] = item_id
    request.meta['review_page'] = review_page
    request.meta['latest_review_date'] = latest_review_date

    # The first anchor whose text mentions "reviews" provides 'max_idx'.
    anchors = soup.find_all('a', {'class': 'js-product-anchor'})
    for anchor in anchors:
        if 'reviews' in anchor.text:
            request.meta['max_idx'] = int(
                anchor.text.replace('reviews', '').strip())
            break
    yield request
def parse_product(self, response):
    """Yield category, id item, product and the first review request.

    The internal id is captured from an inline script with
    ``self.source_internal_id_re`` and upper-cased. When no id is found
    the product is still yielded, but neither the id item nor the review
    request is produced; the original raised ``KeyError`` while building
    the request in that case.
    """
    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "ProductName": "//h1//text()",
        "OriginalCategoryName":
            "//li[contains(@class, 'item category')][last()]/a/text()",
        "ProductManufacturer": "//th[@class='col label' and text()='Brand']/"
                               "following-sibling::*/text()"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)

    bv_config_data = self.extract(
        response.xpath("//script[@type='text/javascript']"
                       "[contains(text(),'productId')]/text()"))

    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category_url = self.extract(response.xpath(
            "//li[contains(@class, 'item category')][last()]/a/@href"))
        category['category_url'] = get_full_url(response, category_url)
        category['category_leaf'] = product['OriginalCategoryName']
        category['category_path'] = category['category_leaf']
        yield category

    match = re.search(self.source_internal_id_re, bv_config_data)
    if match:
        product["source_internal_id"] = match.group(1).upper()
        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "richersounds_id"
        product_id['ID_value'] = product["source_internal_id"]
        yield product_id

    yield product

    if not match:
        # The review URL is keyed by the internal id; nothing to request.
        return

    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = product['source_internal_id']
    bv_params['offset'] = 0
    review_url = self.get_review_url(**bv_params)

    request = Request(url=review_url, callback=self.parse_reviews)
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"]
    )
    request.meta['last_user_review'] = last_user_review
    request.meta['bv_id'] = product['source_internal_id']
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_items(self, response):
    """Parse a review article page into product, review and price items.

    Yields the product item, then the review item, and finally -- when the
    price field starts with an all-digit token -- a ProductIdItem of kind
    ``price``.
    """
    # Product and review read their name from the same node.
    name_xpath = "//div[@class='news-single-item']/dl/dd[1]/text()"

    product = self.init_item_by_xpaths(response, "product", {
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductName": name_xpath,
    })
    review = self.init_item_by_xpaths(response, "review", {
        "TestSummary": "//div[@class='news-single-teaser']/h3/text()",
        "Author": "//meta[@name='author']/@content",
        "TestTitle": "//div[@class='news-single-item']/h2/text()",
        "ProductName": name_xpath,
        "TestVerdict": "(//div[@class='news-single-text']/p/text())[last()]",
    })

    # The internal id is the 7th "/"-separated piece of the page URL.
    internal_id = str(response.url).split("/")[6]
    review['source_internal_id'] = internal_id
    product['source_internal_id'] = internal_id

    # Fall back to the review title when no product name was extracted.
    for item in (review, product):
        if not item['ProductName']:
            item['ProductName'] = review['TestTitle']

    review["DBaseCategoryName"] = "PRO"

    published = self.extract(
        response.xpath("//div[@class='news-single-timedata']/text()"))
    review['TestDateText'] = date_format(published, '%d.%m.%Y')

    yield product
    yield review

    price_text = self.extract(
        response.xpath("//div[@class='news-single-item']/dl/dd[2]/text()"))
    if not price_text:
        return

    # Only the part before the first space is considered, and it must
    # consist of digits only.
    leading_token = str(price_text.encode('utf-8')).split(' ')[0]
    if not leading_token.isdigit():
        return

    price_item = ProductIdItem()
    price_item['ID_kind'] = 'price'
    price_item['ID_value'] = leading_token
    price_item['ProductName'] = product['ProductName']
    price_item['source_internal_id'] = product['source_internal_id']
    yield price_item
def parse(self, response):
    """Parse a product page and then its reviews through a Selenium browser.

    Must only be used on product pages.

    Yields, in order: the ``conforama_fr_id`` ProductIdItem, the
    CategoryItem, the ProductItem, whatever ``self.get_rm_kidval`` returns,
    and then every item produced by ``self.parse_reviews`` after the rating
    link has been clicked in the browser.
    """
    import logging  # local import: only needed for the debug trace below

    category_xpaths = {
        "category_leaf": "//div[@id='breadcrumb']/a[@class='home']"
                         "/following-sibling::a[last()-1]/text()"
    }
    category_path_xpath = ("//div[@id='breadcrumb']/a[@class='home']"
                           "/following-sibling::a/text()")
    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "source_internal_id": "//form[@id='productSheet']/@data-product",
        "ProductName": "//div[@itemprop='name']/h1/text()",
        "ProductManufacturer": "//*[@class='nameBrand']/text()",
    }

    # Drop the last breadcrumb entry; the leaf xpath above likewise picks
    # the second-to-last link.
    category_path_selector = response.xpath(category_path_xpath)[:-1]

    category = self.init_item_by_xpaths(response, "category", category_xpaths)
    category["category_path"] = self.extract_all(
        category_path_selector, separator=' | ')
    # Was a bare debug ``print``; keep the trace but send it to the logger
    # at DEBUG level so stdout stays clean.
    logging.getLogger(__name__).debug("category: %r", category)

    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product["OriginalCategoryName"] = category["category_path"]

    product_id = ProductIdItem()
    product_id['source_internal_id'] = product["source_internal_id"]
    product_id['ProductName'] = product["ProductName"]
    product_id['ID_kind'] = "conforama_fr_id"
    product_id['ID_value'] = product["source_internal_id"]

    yield product_id
    yield category
    yield product
    yield self.get_rm_kidval(product, response)

    reviews_xpath = "//a[@id='rating']"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.click(reviews_xpath)

        # parse_reviews receives the browser and the items via response.meta.
        response.meta['browser'] = browser
        response.meta['product'] = product
        response.meta['product_id'] = product_id
        # NOTE(review): presumably tells a browser-handling decorator on
        # parse_reviews that a browser is already open -- confirm.
        response.meta['_been_in_decorator'] = True

        # Iterate inside the ``with`` block so the browser handed over in
        # response.meta is still inside its context while reviews are parsed.
        for review in self.parse_reviews(response, selector=selector):
            yield review