def _parse_related(self, response):
    """Collect related products from two page layouts and store them
    under ``key`` in the product's ``related_products`` mapping, then
    return the next related-products request.
    """
    product = response.meta['product']
    key = response.meta['key']

    found = []
    # Layout 1: tracked image anchors; the title is the img alt text.
    for anchor in response.xpath('//a[@data-tracking-id="prd_img"]'):
        titles = anchor.xpath('./img/@alt').extract()
        hrefs = anchor.xpath('./@href').extract()
        if titles and hrefs:
            found.append(RelatedProduct(titles[0], hrefs[0]))
    # Layout 2: mini recommendation widget; the title is the anchor @title.
    for anchor in response.css('div.recom-mini-item a.image-wrapper'):
        titles = anchor.xpath('./@title').extract()
        hrefs = anchor.xpath('./@href').extract()
        if titles and hrefs:
            found.append(RelatedProduct(titles[0], hrefs[0]))

    bucket = product['related_products']
    if bucket.get(key):
        bucket[key] += found
    else:
        bucket[key] = found
    return self._generate_related_request(response)
def parse_related_products(self, response):
    """Extract the 'recommended' and 'also_liked' related products
    from the page and return them as a dict of lists."""
    related_products = {'also_liked': [], 'recommended': []}

    for sel in response.xpath(
            '//ul[@class="products-grid"]/li/div[@class="thumbnail"]'):
        link = is_empty(sel.xpath('a/@href').extract())
        name = is_empty(sel.xpath('a/@title').extract())
        if link and name:
            related_products['recommended'].append(
                RelatedProduct(url=link, title=name))

    for sel in response.xpath(
            '//div[@class="thumbnail"]/div[@class="caption"]/div[@class="product-name"]'
    ):
        link = is_empty(sel.xpath('a/@href').extract())
        name = is_empty(sel.xpath('a/@title').extract())
        if link and name:
            related_products['also_liked'].append(
                RelatedProduct(url=link, title=name))

    return related_products
def _parse_related_products(self, response):
    """Parse the AJAX "also browsed" / "also bought" panels.

    The response body wraps an HTML fragment in a JS literal
    (``html:'...'``); the fragment is extracted and scraped for two
    related-product sections, stored on the product keyed by each
    section's heading text.

    Fixes: the original handled both sections with duplicated code,
    and the first section unconditionally replaced any pre-existing
    ``related_products`` dict while the second merged into it; both
    sections now merge consistently.
    """
    product = response.meta['product']
    text = is_empty(re.findall('html:\'(.*)\'}', response.body))
    if text:
        html = Selector(text=text)
        for class_name in ('grid_14 saled_view flt_rgt',
                           'grid_1 saled_view flt_rgt'):
            heading, related = self._extract_related_section(
                html, class_name)
            if related:
                if 'related_products' in product:
                    product['related_products'][heading] = related
                else:
                    product['related_products'] = {heading: related}
    return product

def _extract_related_section(self, html, class_name):
    """Return ``(heading, [RelatedProduct])`` for one panel div.

    Item URLs are redirect-style; the real target is everything from
    the last embedded ``http`` onward, percent-decoded.
    """
    heading = is_empty(
        html.xpath('//div[@class="%s"]/div/p/strong/text()'
                   % class_name).extract())
    related = []
    for sel in html.xpath('//div[@class="%s"]/div/ul/li/a' % class_name):
        url = is_empty(sel.xpath('@href').extract())
        if url:
            related.append(RelatedProduct(
                title=is_empty(
                    sel.xpath('./div[@class="ftProductDesc"]/text()')
                    .extract()),
                url=urllib.unquote('http' + url.split('http')[-1])))
    return heading, related
def _parse_related_products(self, response, style_id):
    """Collect sibling styles on the page as related products.

    Every ``article`` except the current style (``Style<style_id>``)
    becomes a RelatedProduct whose URL is the page URL plus the item's
    ``data-targetitem`` fragment (dashes stripped).
    """
    product = response.meta['product']
    items = response.xpath(
        '//section[@class="ProductDetail"]/article[not(@id="Style{id}")]'.
        format(id=style_id))
    if items:
        related_prods = []
        for item in items:
            # Title: the item's h1/h2 heading, if present.
            title = None
            title_sel = item.xpath('.//div[@class="Title"]//h1/text() |'
                                   './/div[@class="Title"]//h2/text()')
            if title_sel:
                title = is_empty(title_sel.extract()).strip()
            # Url: built from the page URL and the data-targetitem id.
            url = None
            targetitem = item.xpath('.//@data-targetitem')
            thumb_link = item.xpath('.//div[@class="StyleThumb"]/a/@href')
            if targetitem and thumb_link:
                targetitem = is_empty(targetitem.extract())
                url = '{url}#{id}'.format(url=response.url,
                                          id=targetitem.replace('-', ''))
            # Fix: the original reused `url` for the raw SelectorList, so
            # a non-empty thumb link without a targetitem could pass a
            # SelectorList through as the URL. Only append real strings.
            if url and title:
                related_prods.append(RelatedProduct(title=title, url=url))
        product['related_products'] = related_prods
def parse_related_product(self, response):
    """Parse related products out of inline JS ``push()`` calls.

    Each ``currentItemObjArray[i].items.push({...})`` literal is
    scanned for its ``name`` and ``url`` fields.
    """
    product = response.meta.get("product")
    reqs = response.meta.get("reqs")
    # Raw strings and escaped dots: the original non-raw patterns
    # relied on '.' matching the literal dot by accident and emitted
    # invalid-escape warnings on modern Python.
    data = re.findall(
        r"currentItemObjArray\[\d\]\.items\.push\(([^}]*)}\);",
        response.body)
    rp = []
    for item in data:
        title = is_empty(re.findall(r'name:"([^"]*)', item))
        url = is_empty(re.findall(r'url:"([^"]*)', item))
        if title and url:
            rp.append(RelatedProduct(
                title=title,
                url=url,
            ))
    if rp:
        product["related_products"] = rp
    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _populate_from_html(self, response, prod):
    """Fill description, reseller_id and recommended related products
    from the product page HTML (after the JS-based population)."""
    self._populate_from_js(response, prod)

    # Description: concatenated, stripped text of the accordion node.
    description = ''.join(
        chunk.strip() for chunk in response.xpath(
            '//div[contains(@class,"pdp-accordion-description-description")]'
            '//text()').extract())
    cond_set_value(prod, 'description', description)

    # Reseller id: the productId query parameter of the page URL.
    matches = re.findall("[Pp]roduct[Ii][dD]=([^\&]+)", response.url)
    cond_set_value(prod, 'reseller_id', matches[0] if matches else None)

    # "Like that" carousel -> recommended related products.
    recommended = []
    for item in response.xpath('//div[@class="pdp-likethat"]'
                               '//li[contains(@class,"portrait-item")]'):
        hrefs = item.xpath(
            './/div[@class="portrait-item-header"]/a/@href').extract()
        link = urlparse.urljoin(response.url, hrefs[0]) if hrefs else None
        name = ' '.join(chunk.strip() for chunk in item.xpath(
            './/div[@class="portrait-item-header"]//text()').extract())
        if link and name:
            recommended.append(RelatedProduct(name, link))
    cond_set_value(
        prod.setdefault('related_products', {}),
        'recommended',
        recommended,
    )
    self._unify_price(prod)
def _populate_from_html(self, response, prod):
    """Fill description and recommended related products from the
    product page HTML."""
    # Description: concatenated, stripped text of the accordion node.
    description = ''.join(
        chunk.strip() for chunk in response.xpath(
            '//div[contains(@class,"pdp-accordion-description-description")]'
            '//text()').extract())
    cond_set_value(prod, 'description', description)

    # "Like that" carousel -> recommended related products.
    recommended = []
    for item in response.xpath('//div[@class="pdp-likethat"]'
                               '//li[contains(@class,"portrait-item")]'):
        hrefs = item.xpath(
            './/div[@class="portrait-item-header"]/a/@href').extract()
        link = urlparse.urljoin(response.url, hrefs[0]) if hrefs else None
        name = ' '.join(chunk.strip() for chunk in item.xpath(
            './/div[@class="portrait-item-header"]//text()').extract())
        if link and name:
            recommended.append(RelatedProduct(name, link))
    cond_set_value(
        prod.setdefault('related_products', {}),
        'recommended',
        recommended,
    )
def make_list(rlist):
    """Build RelatedProduct entries from selectors containing an <a>.

    Entries whose anchor lacks an href or text are skipped; the
    original indexed ``extract()[0]`` unconditionally and raised
    IndexError on such entries.
    """
    prodlist = []
    for r in rlist:
        hrefs = r.xpath("a/@href").extract()
        texts = r.xpath("a/text()").extract()
        if hrefs and texts:
            prodlist.append(RelatedProduct(texts[0], hrefs[0]))
    return prodlist
def _parse_related_product(self, response):
    """Scrape the product carousel for related products.

    Relative URLs are made absolute against the spider's first
    allowed domain.
    """
    related_products = []
    carousel_items = response.xpath(
        '//div[@name="carousel"][@data-carousel-type="product,"]/./'
        '/div[contains(@class, "carousel_item")]')
    for item in carousel_items:
        name = is_empty(
            item.xpath('././/div[@data-click-track="carousel_product_name"]/./'
                       '/strong/text()').extract()
        )
        link = is_empty(
            item.xpath('./div[@class="productimage "]'
                       '/a/@href').extract()
        )
        if name and link:
            if self.allowed_domains[0] not in link:
                link = 'http://www.{domain}{url}'.format(
                    domain=self.allowed_domains[0], url=link
                )
            related_products.append(RelatedProduct(url=link, title=name))
    return related_products
def _parse_rr(self, response):
    """Parse a recommendations response into related products keyed by
    the strategy title, then chain to the reviews request when a
    BazaarVoice URL is pending in the meta."""
    product = response.meta['product']
    title = response.xpath(
        "//div[contains(@id, 'rr_strategy')]/text()").extract()
    urls = re.findall("a_href = \"([^\"]*)", response.body)
    names = response.xpath(
        "//div[contains(@class, 'medium')]/text()").extract()
    product['related_products'] = {}
    prodlist = [RelatedProduct(name, url)
                for name, url in dict(zip(names, urls)).items()]
    if prodlist and title:
        product['related_products'] = {title[0]: prodlist}
    bvurl = response.meta.get('bvurl')
    if bvurl:
        new_meta = response.meta.copy()
        new_meta['product'] = product
        new_meta['handle_httpstatus_list'] = [404]
        return Request(bvurl, meta=new_meta, callback=self._parse_bv,
                       dont_filter=True)
    return product
def _parse_related_products(self, response):
    """Collect cross-sell products from the PDP cross-sell list."""
    related_products = []
    for item in response.xpath('//*[@id="PDPCrossSellContent"]/li'):
        name = is_empty(
            item.xpath(
                '././/span[@class="productTitle"]/./'
                '/a[@class="productTitleLink"]/text()'
            ).extract()
        )
        link = is_empty(
            item.xpath(
                '././/span[@class="productTitle"]/./'
                '/a[@class="productTitleLink"]/@href'
            ).extract()
        )
        if link and name:
            related_products.append(
                RelatedProduct(url=link, title=name.strip()))
    return related_products
def populate_related(self, response):
    """Accumulate related products across two passes.

    The first pass stores results as 'more_items_to_consider' and
    issues a follow-up request to this same callback; the second pass
    stores its results as 'related_products'.
    """
    product = response.meta['product']
    prev_related = product.get('related_products')
    # Key depends on whether an earlier pass already stored relations.
    title = "related_products" if prev_related else "more_items_to_consider"

    related = []
    for item in response.xpath('//div[@class="pad homeProduct"]'):
        names = item.xpath(
            './/div[@class="cert-product-title"]/text()'
        ).extract()
        links = item.xpath('.//a/@href').extract()
        if names and links:
            related.append(RelatedProduct(names[0].strip(), links[0]))

    if related:
        if prev_related:
            prev_related[title] = related
            product['related_products'] = prev_related
        else:
            product['related_products'] = {title: related}

    jsessionid = response.meta.get('jsessionid')
    product_id = response.meta.get('product_id')
    if jsessionid and product_id:
        # here we make request for additional "related products" field
        url = self.generate_related_url(jsessionid, product_id, 'v1_th_rr')
        return Request(url, callback=self.populate_related,
                       meta={'product': product}, dont_filter=True)
    return product
def parse_related_product(self, response):
    """Parse the certona JSONP response for 'buyers also bought'
    related products."""
    product = response.meta['product']
    related_prods = []
    raw = response.body_as_unicode()
    try:
        # Strip the JSONP wrapper: certonaResx.showResponse(...);
        raw = raw.replace(u'certonaResx.showResponse(', '')[:-2]
        payload = json.loads(raw)
        html = payload['Resonance']['Response'][2]['output']
    except Exception as e:
        self.log(
            'Error during parsing related products page: {}'.format(e))
        return product
    sel = Selector(text=html)
    names = sel.xpath('//h4/text()').extract()  # Title
    # NOTE(review): the "url" here is the image src, not a product
    # page link — verify this is intended.
    links = sel.xpath('//img/@src').extract()  # Img url
    for name, link in zip(names, links):
        if link and name:
            related_prods.append(RelatedProduct(title=name, url=link))
    product['related_products'] = {}
    if related_prods:
        product['related_products'][
            'buyers_also_bought'] = related_prods
    return product
def populate_recommendations(self, response):
    """Map recommendation ids to names/URLs from the JSON body, then
    optionally chain to the reviews request.

    Fixes: the original re-parsed ``response.body`` with json.loads on
    every iteration of the key loop; it is now parsed once. A line of
    commented-out dead code was removed.
    """
    ids_data = response.meta['ids_data']
    product = response.meta['product']
    related = product.get('related_products', {})
    body = json.loads(response.body)
    for key, ids in ids_data.items():
        recomm = []
        for item_id in ids:
            try:
                name = body['recommendedProducts'][item_id]['name']
                url = body['recommendedProducts'][item_id]['canonicalUrl']
            except KeyError:
                # Id missing from the payload; skip this recommendation.
                pass
            else:
                recomm.append(
                    RelatedProduct(name, "http://www.ebuyer.com/" + url))
        related[key] = recomm
    # pls note that server return to spider and site 8 products,
    # but displayed by javascript only 6.
    product['related_products'] = related
    sku = response.meta['item_id']
    url = self.REVIEW_URL.format(sku=sku)
    if self.POPULATE_REVIEWS:
        meta = response.meta.copy()
        meta['handle_httpstatus_list'] = [404]
        return Request(url, callback=self.populate_reviews, meta=meta)
    return product
def _parse_related_products(self, response):
    """Build related-product groups from embedded strategy/item JSON.

    ``REQ_STRATEGY`` and ``REQ_ITEM`` are spider-level compiled
    regexes (declared elsewhere) that pair each strategy id with its
    item payloads. Groups are keyed by the strategy message; some
    messages are filtered out depending on the ``alsoviewed`` flag in
    the request meta. Results go into product['related_products'].
    """
    product = response.meta['product']
    rp = {}
    for id_, stategy in self.REQ_STRATEGY.findall(
            response.body_as_unicode()):
        message = json.loads(stategy)['strategy_message']
        # "Recently viewed" is never treated as a related group.
        if re.match('[Rr]ecently [Vv]iewed', message):
            continue
        # Depending on the alsoviewed flag, skip one of the two
        # recommendation flavours.
        if response.meta['alsoviewed']:
            if re.match('[Rr]ecommended *[Ff]or', message):
                continue
        elif re.match('[Pp]eople *[Aa]lso', message):
            continue
        rp[message] = []
        # Collect only items belonging to this strategy id.
        for iid, data in self.REQ_ITEM.findall(response.body):
            if not iid == id_:
                continue
            # fix json syntax (trailing commas before closing brace)
            data = re.sub(', *}', '}', data)
            data = json.loads(data)
            title = data['ProductName']
            url = data['ProductUrl']
            # The real target URL is carried in the 'ct' query param
            # of the tracking URL.
            url = unquote(re.search('&ct=([^&]+)', url).group(1))
            rp[message].append(RelatedProduct(title=title, url=url))
    cond_set_value(product, 'related_products', rp)
def parse_related_product(self, response):
    """Parse bloomReach related products from a JSON response.

    Fixes: the original used a bare ``except: pass`` that silently
    swallowed every error (including typos); it is narrowed to the
    parse/lookup errors that can actually occur here, and logged.
    """
    meta = response.meta.copy()
    product = response.meta['product']
    reqs = meta.get('reqs', [])
    try:
        jsonresponse = json.loads(response.body_as_unicode())
        related_prods = []
        if jsonresponse and jsonresponse['bloomReach']['relatedProducts']:
            for prod in jsonresponse['bloomReach']['relatedProducts']:
                related_prods.append(
                    RelatedProduct(title=prod['title'], url=prod['url']))
        product['related_products'] = {}
        if related_prods:
            product['related_products'][
                'buyers_also_bought'] = related_prods
    except (ValueError, KeyError, TypeError) as e:
        # Malformed JSON or unexpected payload shape; leave the
        # product's existing data untouched.
        self.log('Failed to parse related products: {}'.format(e))
    if reqs:
        return self.send_next_request(reqs, response)
    else:
        return product
def _handle_related_product(self, response, rel_product_type):
    """Build related products from the JSON ``d.Items`` payload.

    Returns the list on success, or None (after logging a warning)
    when the body is not valid JSON or lacks the expected keys.
    """
    related_products = []
    try:
        payload = json.loads(response.body_as_unicode())
        items = payload['d']['Items']
        if items:
            for entry in items:
                full_url = '{www}{domain}{url}'.format(
                    www='http://www.',
                    domain=self.allowed_domains[0],
                    url=entry['Href']
                )
                related_products.append(RelatedProduct(
                    title=entry['Name'], url=full_url))
        return related_products
    except (KeyError, ValueError):
        self.log("Impossible to get {0} products info in {1}".format(
            rel_product_type, response.url
        ), WARNING)
    return None
def parse_related_products(self):
    """parse response from richrelevance api"""
    # Strip tabs and short runs of spaces left by pretty-printing.
    body = re.sub('\t|\s{2,6}', '', self.response.body)
    raw_blocks = re.findall('json\s?=\s?(\{.+?\});', body)
    raw_blocks = [re.sub('\t|\s{2,6}', '', _) for _ in raw_blocks]
    parsed = [hjson.loads(_) for _ in raw_blocks]
    # Items pushed separately via JS are merged into their block.
    for idx, chunk in re.findall(
            '\[(\d+)\]\.json\.items\.push\((\{.+?\})\);', body):
        parsed[int(idx)]['items'].append(hjson.loads(chunk))

    related_products = []
    for block in parsed:
        entries = []
        for item in block['items']:
            name = item.get('name', '')
            link = item.get('link_url', '')
            if not name or not link:
                continue
            # The real target URL is carried in the 'ct' query param.
            link = parse_qs(urlparse(link).query).get('ct', [''])[0]
            if not link:
                continue
            if link.startswith('/'):
                link = '%s%s' % (self.base_url, link)
            entries.append(RelatedProduct(title=name, url=link))
        if entries:
            related_products.append({block['message']: entries})
    cond_set_value(self.product, 'related_products', related_products)
def _parse_related_products(self, response):
    """Parse the recommendations JSON; store the products under a key
    chosen by the schema title, then continue with the variants or
    reviews requests as appropriate."""
    product = response.meta['product']
    internet_no = response.meta.get('internet_no', None)
    if response.status in response.meta['handle_httpstatus_list']:
        # No further pages were found. Check the request payload.
        return product
    data = json.loads(response.body_as_unicode())
    related_prods = []
    for prod in data['schemas'][0]['products']:
        related_prods.append(RelatedProduct(
            prod['productName'],
            urlparse.urljoin(product['url'], prod['canonicalURL'])
        ))
    if related_prods:
        if 'THE HOME DEPOT RECOMMENDS' in data['schemas'][0]['title']:
            product['related_products'] = {'recommended': related_prods}
        else:
            product['related_products'] = {
                'buyers_also_bought': related_prods}
    skus = response.meta.get('skus', None)
    if not skus:
        if internet_no:
            return Request(
                url=self.REVIEWS_URL % internet_no,
                callback=self.parse_buyer_reviews,
                meta={"product": product},
                dont_filter=True,
            )
        return product
    return self._gen_variants_requests(response, product, skus, internet_no)
def _parse_related_products(self, response):
    """Group featured products by their section 'message' heading.

    The body is scanned with one alternation regex that yields, in
    document order, (message) headings and (name)/(linkurl) pairs;
    a small state machine pairs each name with the following linkurl
    and files the pair under the most recent heading. Headings in
    ``forbidden_featured_names`` are dropped. Results are merged into
    product['related_products'].
    """
    meta = response.meta.copy()
    product = meta['product']
    related_products = product.get('related_products', {})
    reqs = meta.get('reqs')
    body = response.body_as_unicode()
    # Each match tuple has exactly one populated (key, value) pair;
    # the other groups are empty strings.
    data = re.findall(
        r'"(message)":\s*"(.*?)\^.*?"|"(name)":\s*"(.*?)"|"(linkurl)":\s*"(.*?)"',
        body
    )
    if data:
        feat_prod_list = []
        featured_prods = dict()
        url_ready = name_ready = False
        last_message = False
        forbidden_featured_names = ['also_viewed', 'top_sellers']
        for item in data:
            # Make a dir of two tuples
            item_list = filter(None, list(item))
            i = iter(item_list)
            values_dict = dict(izip(i, i))
            keys = values_dict.keys()
            if 'message' in keys:
                # A new section starts: flush the previous section's
                # products (unless its heading is forbidden).
                if feat_prod_list:
                    if last_message not in forbidden_featured_names:
                        featured_prods[last_message] = feat_prod_list
                    feat_prod_list = []
                # Normalize the heading to its last two words,
                # lowercase, underscore-joined (e.g. "also_viewed").
                last_message = values_dict['message'].lower()
                last_message = '_'.join(
                    last_message.split(' ')[-2:]
                )
            elif 'name' in keys:
                title = values_dict['name']
                name_ready = True
            elif 'linkurl' in keys:
                url = values_dict['linkurl']
                url_ready = True
            # Emit a product once both halves of the pair are seen.
            if url_ready and name_ready:
                feat_prod_list.append(
                    RelatedProduct(**{'url': url, 'title': title})
                )
                url_ready = name_ready = False
        # Flush the final section.
        if last_message not in forbidden_featured_names:
            featured_prods[last_message] = feat_prod_list
        related_products.update(featured_prods)
        product['related_products'] = related_products
    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _parse_recomendar(self, response):
    """Parse the JSON 'interest' HTML fragment into related products,
    then optionally chain to the marketplace request."""
    product = response.meta['product']
    try:
        jdata = json.loads(response.body)
    except ValueError:
        return product
    if "interest" in jdata:
        sel = Selector(text=jdata["interest"])
        titles = sel.xpath("//h6/a/text()").extract()
        links = sel.xpath("//h6/a/@href").extract()
        heading = is_empty(sel.xpath(
            '//div[contains(@class, "row")]/div/'
            'h3[contains(@class, "slider-header")]/text()'
        ).extract(), "Customers Also Viewed").strip()
        if not titles:
            return product
        product["related_products"] = {
            heading: [RelatedProduct(title=k, url=v)
                      for k, v in dict(zip(titles, links)).items()]
        }
    if "mtp_link" in response.meta:
        return Request(url=response.meta["mtp_link"],
                       callback=self.parse_marketplace,
                       meta={"product": product})
    return product
def _populate_related_products(self, response, product):
    """Populate the 'You might also like' related products via link
    extraction from the might_like list."""
    xpath = '//ul[contains(@class, "might_like")]/li/' \
            'div[contains(@class, "product_description")]/a'
    links = LinkExtractor(restrict_xpaths=xpath).extract_links(response)
    related = []
    for link in links:
        related.append(RelatedProduct(
            url=urljoin(response.url, link.url),
            title=link.text.strip()))
    cond_set_value(product, 'related_products',
                   {'You might also like': related})
def _carousel_getitems(self, carousel):
    """Yield RelatedProduct entries from the first carousel selector;
    yields nothing when the carousel list is empty."""
    if not carousel:
        return
    for entry in carousel[0].css('.item_ph'):
        desc = entry.css('.desc2')
        hrefs = desc.css('::attr(href)')
        texts = desc.css('::text')
        if hrefs and texts:
            yield RelatedProduct(
                url=urljoin('http://quill.com', hrefs[0].extract()),
                title=_strip_non_ascii(texts[0].extract()))
def parse_related_product(self, response):
    """Parse 'customers also bought' and 'recommended' sections.

    The body arrives with escaped backslashes which are stripped
    before parsing as HTML. The three near-identical extraction loops
    of the original are decomposed into one helper.
    """
    product = response.meta['product']
    related_products = {'customers_also_bought': [], 'recommended': []}
    data = response.body_as_unicode().replace('\\', '')
    if data:
        sel = Selector(text=data)
        # Customers Also Bought: text links plus image tiles.
        # NOTE(review): for the image tiles the "url" is the img @src,
        # not a product page link — verify this is intended.
        related_products['customers_also_bought'] = (
            self._extract_related(
                sel.xpath('//div[contains(@class, "descSideSell")]'),
                'a/text()', 'a/@href') +
            self._extract_related(
                sel.xpath('//div[contains(@class,"imgSideSell")]/img'),
                '@title', '@src'))
        # Recommended
        related_products['recommended'] = self._extract_related(
            sel.xpath('//div[contains(@class, "wrap_description")]'),
            'a/span/text()', 'a/@href')
    if related_products:
        product['related_products'] = related_products
    return product

@staticmethod
def _extract_related(items, title_xpath, url_xpath):
    """Return RelatedProduct entries for selectors yielding both a
    title and a URL via the given relative xpaths."""
    result = []
    for item in items:
        title = is_empty(item.xpath(title_xpath).extract())
        url = is_empty(item.xpath(url_xpath).extract())
        if title and url:
            result.append(RelatedProduct(url=url, title=title))
    return result
def _populate_related_products(self, response, product):
    """Group related products by the heading of each relation panel.

    Fixes: the original indexed ``extract()[0]`` / ``css(...)[0]``
    unconditionally, raising IndexError for a panel without an h2
    heading or a link without href/text; such entries are now skipped.
    """
    related_products = {}
    for panel in response.css('.relProPanel'):
        headings = panel.css('h2::text').extract()
        if not headings:
            continue
        products = []
        for link in panel.css('a.productDesc'):
            hrefs = link.css('::attr(href)')
            texts = link.css('::text')
            if not hrefs or not texts:
                continue
            url = urlparse.urljoin(response.url, hrefs[0].extract())
            products.append(RelatedProduct(title=texts[0].extract(),
                                           url=url))
        related_products[headings[0]] = products
    cond_set_value(product, 'related_products', related_products)
def parse_rp_lookup(self, response):
    """Store related products from the lookup JSON on the product,
    keyed by the relation name carried in the request meta."""
    baseurl = 'http://waitrose.com'
    jsondata = json.loads(response.body_as_unicode())
    products = []
    for p in jsondata['products']:
        products.append(RelatedProduct(
            title=p['name'],
            url=urlparse.urljoin(baseurl, p['url'])))
    cond_set_value(response.meta['product'], 'related_products',
                   {response.meta['relation']: products})
    return response.meta['product']
def _build_related_products_array(text, product):
    """Build RelatedProduct entries from an HTML fragment's image
    grid, resolving links against the product's own URL."""
    sel = Selector(text=text)
    base_url = product.get('url')
    related_products = []
    for element in sel.xpath('//li[contains(@class, "imagegrid")]'):
        hrefs = element.xpath('.//a/@href').extract()
        names = element.xpath('.//p[@class="name"]/text()').extract()
        if hrefs and names:
            related_products.append(RelatedProduct(
                url=urlparse.urljoin(base_url, hrefs[0]),
                title=names[0]))
    return related_products
def _populate_from_html(self, response, prod):
    """Populate the product from page HTML: title, price, stock
    state, image, description, brand, reseller_id and related
    products.

    Fixes: the original indexed ``price_div[0]`` unconditionally and
    raised IndexError on pages without an offers block; the price is
    now only set when that block exists.
    """
    # title
    title = response.css('h2[itemprop=name]::text')
    cond_set(prod, 'title', title.extract())
    # price (guarded against a missing offers block)
    price_div = response.css('[itemprop=offers] > [itemprop=price]')
    if price_div:
        price_div = price_div[0]
        currency = price_div.css('[itemprop=priceCurrency]::attr(content)')
        price = price_div.css('[itemprop=price]::attr(content)')
        if currency and price:
            prod['price'] = Price(currency[0].extract(),
                                  price[0].extract())
    # out of stock
    cond_set_value(prod, 'is_out_of_stock',
                   response.css('.out_of_stock_box'), bool)
    # image
    img = response.css('.vip_gallery [itemprop=image] ::attr(src)')
    cond_set(prod, 'image_url', img.extract())
    # description, merged with details
    desc = response.xpath('//div[@itemprop="description"]/p | '
                          '//ul[@class="linear_list"]')
    cond_set_value(prod, 'description', ''.join(desc.extract()))
    # brand
    brand = response.css('input[name=brand_name] ::attr(value)')
    cond_set(prod, 'brand', brand.extract())
    # reseller_id: numeric id before the URL's file extension
    reseller_id = re.findall(r"-(\d+)\.", response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(prod, "reseller_id", reseller_id)
    # related products, keyed by the "more by brand" heading
    related = []
    rel_key = ' '.join(
        response.xpath('//div[@class="moreby_brand"]'
                       '/a/h2//text()').extract())
    for rel_item in response.css('#morefrom_slider > ul > li'):
        r_hr = rel_item.css('a::attr(href)')
        r_t = rel_item.css('a > span::text')
        if not r_hr or not r_t:
            continue
        related.append(RelatedProduct(r_t[0].extract(),
                                      r_hr[0].extract()))
    related_products = {rel_key: related}
    if related_products and related_products.values()[0]:
        cond_set_value(prod, 'related_products', related_products)
def _parse_related_products(self, response):
    """Collect related products from the slick carousel; returns None
    when nothing was found."""
    found = []
    anchors = response.xpath(
        './/*[@class="slick-track"]/div[contains(@class, "item ")]/a[@tabindex]'
    )
    for anchor in anchors:
        names = anchor.xpath('text()').extract()
        hrefs = anchor.xpath('@href').extract()
        if names and hrefs:
            found.append(RelatedProduct(
                title=names[0],
                url=urljoin(response.url, hrefs[0])))
    return found or None
def populate_recommendations(self, response):
    """Build related products from the h4 anchors and attach them to
    the product carried in the request meta."""
    related = []
    for item in response.xpath('//li/div/h4'):
        names = item.xpath('.//a/text()').extract()
        hrefs = item.xpath('.//a/@href').extract()
        if names and hrefs:
            related.append(RelatedProduct(
                title=names[0],
                url="http://www.maplin.co.uk/" + hrefs[0]))
    product = response.meta['product']
    product['related_products'] = related
    return product