def _scrape_product_links(self, response):
    """Yield ``(target, ProductItem())`` pairs for the products in ``response``.

    When ``self.retailer_id`` is set, the body is parsed as JSON and every
    entry's ``product_link`` is wrapped in an unfiltered ``Request`` whose
    meta carries ``fire=True``.  Otherwise the product hrefs are read from
    the listing markup and yielded as absolute URL strings.
    """
    if not self.retailer_id:
        hrefs = response.xpath(
            '//div[@class="productWrapper"]'
            '//div[@class="productInfo2"]//a[@class="productHdr"]/@href'
        ).extract()
        absolute_urls = [response.urljoin(href) for href in hrefs]
        for absolute_url in absolute_urls:
            yield absolute_url, ProductItem()
        return

    # Every link is collected before the first yield, so a malformed entry
    # fails before any request is emitted.
    product_urls = [entry['product_link'] for entry in json.loads(response.body)]
    for product_url in product_urls:
        # response.meta is reused rather than copied, so the flag is also
        # written onto the response's own meta dict.
        shared_meta = response.meta
        shared_meta['fire'] = True
        yield (
            Request(url=product_url, meta=shared_meta, dont_filter=True),
            ProductItem(),
        )
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` pairs for the products in ``response``.

    With ``self.retailer_id`` set, the body is parsed as JSON and each
    entry's ``product_link`` is resolved against ``response.url`` and passed
    through ``self._add_akamai``.  Otherwise the page is scraped: if the
    desktop tile container is present, URLs are built from the tile ids via
    ``self.PRODUCT_URL``; if not, the standard product anchors are used.
    """
    if self.retailer_id:
        resolved = [
            self._add_akamai(
                urlparse.urljoin(response.url, entry['product_link']))
            for entry in json.loads(response.body)
        ]
        for url in resolved:
            yield url, ProductItem()
        return

    tile_container = '//div[@class="stp--new-product-tile-container desktop"]'
    if response.xpath(tile_container):
        skus = response.xpath(
            tile_container + '/div[@class="tile-container"]/@id'
        ).extract()
        for sku in skus:
            yield self.PRODUCT_URL.format(sku=sku), ProductItem()
    else:
        hrefs = response.xpath(
            '//a[@class="standard-type__product_link"]/@href'
        ).extract()
        for href in hrefs:
            yield self._add_akamai(href), ProductItem()
def start_requests(self):
    """Yield the initial requests for this crawl.

    One search request is produced per entry in ``self.searchterms``.  Then,
    when configured, a single-product request is produced for
    ``self.product_url`` and one request for each ``||||``-separated URL in
    ``self.products_url``.
    """
    for term in self.searchterms:
        search_url = self.url_formatter.format(
            self.SEARCH_URL,
            search_term=urllib.quote_plus(term.encode('utf-8')),
        )
        request_meta = {'search_term': term, 'remaining': self.quantity}
        yield Request(search_url, meta=request_meta)

    if self.product_url:
        single = ProductItem()
        single['is_single_result'] = True
        single['link'] = self.product_url
        single['search_term'] = ''
        yield Request(
            self.product_url,
            self._parse_single_product,
            meta={'product': single},
        )

    if self.products_url:
        for url in self.products_url.split('||||'):
            item = ProductItem()
            item['link'] = url
            item['search_term'] = ''
            yield Request(
                url,
                self._parse_single_product,
                meta={'product': item},
            )
def _scrape_product_links(self, response):
    """Yield ``(absolute_url, ProductItem())`` for each product link.

    Links come from the JSON body (``product_link`` of every entry) when
    ``self.retailer_id`` is set, and from the product-name list anchors
    otherwise.  Either way they are resolved against ``response.url``.
    """
    if self.retailer_id:
        raw_links = [entry['product_link']
                     for entry in json.loads(response.body)]
    else:
        raw_links = response.xpath(
            '//div[@class="product-name-list"]/a/@href').extract()

    for raw_link in raw_links:
        yield urlparse.urljoin(response.url, raw_link), ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link in ``response``.

    With ``self.retailer_id`` set, the ``product_link`` values from the JSON
    body are yielded unchanged.  Otherwise the item heading links are
    selected with CSS and made absolute with ``response.urljoin``.
    """
    if self.retailer_id:
        targets = [entry['product_link']
                   for entry in json.loads(response.body)]
    else:
        hrefs = response.css(
            'div[data-selenium="itemDetail"] a[data-selenium="itemHeadingLink"]::attr(href)'
        ).extract()
        targets = [response.urljoin(href) for href in hrefs]

    for target in targets:
        yield target, ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link.

    With ``self.retailer_id`` set, the links are fetched from
    ``self.API_URL`` with a synchronous ``requests.get`` and ``response`` is
    not consulted.  Otherwise the hrefs of the search-result title anchors
    are yielded exactly as extracted.
    """
    if self.retailer_id:
        # NOTE(review): this call blocks and has no timeout -- confirm that
        # is acceptable inside a spider callback.
        api_url = self.API_URL.format(retailer_id=self.retailer_id)
        entries = requests.get(api_url).json()
        targets = [entry['product_link'] for entry in entries]
    else:
        targets = response.xpath(
            '//div[contains(@class, "serp-results")]/div[@class="product"]'
            '/a[@class="title"]/@href').extract()

    for target in targets:
        yield target, ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(Request, ProductItem())`` pairs for a JSON products response.

    The body is decoded as UTF-8 (undecodable bytes ignored) and parsed as
    JSON; a parse failure is logged and nothing is yielded.  Two payload
    shapes are handled:

    * a mapping with an integer-like ``"shown"`` count: the first ``shown``
      entries of ``"nugsProducts"`` supply ``manufacturerPartNumber``;
    * anything else (e.g. a list of entries): each entry's ``product_link``
      is turned into a part number via ``get_mfr_part_num_from_url``.

    Every part number becomes a POST request to ``self.product_api`` with
    the JSON payload from ``get_product_payload``.
    """
    self.logger.info("Start parsing products response")
    try:
        json_response = json.loads(response.body.decode("utf-8", "ignore"))
    except TypeError as e:
        # Log the exception itself: ``e.message`` is empty for multi-arg
        # exceptions and does not exist on Python 3.
        self.logger.error("%sJson respone cannot be parsed", e)
        return
    except Exception as e:
        self.logger.error("%s", e)
        return

    def build_request(mfr_part_id):
        """Create the POST request for one manufacturer part number."""
        payload = json.dumps(
            self.get_product_payload(json_response, mfr_part_id))
        # response.meta is reused rather than copied, as before.
        meta = response.meta
        meta['fire'] = True
        return scrapy.Request(
            url=self.product_api,
            method='POST',
            body=payload,
            meta=meta,
            headers={'Content-Type': 'application/json'},
            callback=self.parse,
            dont_filter=True,
        )

    try:
        num_products = int(json_response["shown"])
    except (KeyError, TypeError, ValueError):
        # No usable "shown" count: the payload is not the counted mapping
        # shape, so it is treated as a plain sequence of entries below.
        num_products = None

    if num_products is None:
        if json_response:
            for item in json_response:
                mfr_part_id = self.get_mfr_part_num_from_url(
                    item["product_link"])
                yield build_request(mfr_part_id), ProductItem()
    else:
        for i in range(num_products):
            mfr_part_id = json_response["nugsProducts"][i][
                "manufacturerPartNumber"]
            yield build_request(mfr_part_id), ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link.

    With ``self.retailer_id`` set, the links are fetched from
    ``self.API_URL`` with a synchronous ``requests.get``.  Otherwise the
    hrefs of the main product image anchors are yielded as extracted.
    """
    if not self.retailer_id:
        for href in response.xpath(
                "//a[@class='productMainImage']/@href").extract():
            yield href, ProductItem()
        return

    # NOTE(review): blocking call without a timeout -- confirm intended.
    api_url = self.API_URL.format(retailer_id=self.retailer_id)
    entries = requests.get(api_url).json()
    product_urls = [entry['product_link'] for entry in entries]
    for product_url in product_urls:
        yield (product_url, ProductItem())
def _scrape_product_links(self, response):
    """Yield ``(absolute_url, ProductItem())`` for each product link.

    Three XPath selectors are tried in order and the first non-empty result
    is used.  When ``self.retailer_id`` is set, the ``product_link`` values
    from the JSON-decoded body are appended to those.  All links are
    resolved against ``response.url`` before being yielded.
    """
    hrefs = response.xpath(
        '//a[contains(@property, "url")]/@href').extract()
    if not hrefs:
        hrefs = response.xpath(
            './/div[@class="product-info"]'
            '/a[contains(@class, "product-title")]/@href').extract()
    if not hrefs:
        hrefs = response.xpath(
            '//a[@class="product-title scTrack pfm"]/@href').extract()

    collected = list(hrefs)
    if self.retailer_id:
        collected.extend(
            entry['product_link'] for entry in json.loads(response.body))

    absolute_urls = [urlparse.urljoin(response.url, href)
                     for href in collected]
    for absolute_url in absolute_urls:
        yield absolute_url, ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(href, ProductItem())`` for every search-result title link.

    The hrefs are yielded exactly as extracted from the page.
    """
    title_hrefs = (
        '//div[contains(@class, "serp-results")]/div[@class="product"]'
        '/a[@class="title"]/@href'
    )
    for href in response.xpath(title_hrefs).extract():
        yield href, ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link.

    With ``self.retailer_id`` set, the links are fetched from
    ``self.API_URL`` with a synchronous ``requests.get``.  Otherwise the
    hrefs inside the ``productTitle`` headers are yielded as extracted.
    """
    if not self.retailer_id:
        for href in response.xpath(
                "//header[@class='productTitle']/a/@href").extract():
            yield href, ProductItem()
        return

    # NOTE(review): blocking call without a timeout -- confirm intended.
    api_url = self.API_URL.format(retailer_id=self.retailer_id)
    entries = requests.get(api_url).json()
    product_urls = [entry['product_link'] for entry in entries]
    for product_url in product_urls:
        yield (product_url, ProductItem())
def _get_products(self, response):
    """Yield products or requests for ``response``.

    A response whose URL contains ``officedepot.com/a/products`` yields one
    ``ProductItem`` flagged ``search_redirected_to_product``.  Any other
    response is delegated to the parent class implementation.
    """
    if "officedepot.com/a/products" not in response.url:
        parent_results = super(
            OfficedepotProductsSpider, self)._get_products(response)
        for result in parent_results:
            yield result
        return

    yield ProductItem(search_redirected_to_product=True)
def _scrape_product_links(self, response):
    """Yield ``(absolute_url, ProductItem())`` for each product link.

    Links come from ``self.API_URL`` (synchronous ``requests.get``) when
    ``self.retailer_id`` is set, and from the search-result anchors
    otherwise.  Either way they are resolved against ``response.url``.
    """
    if self.retailer_id:
        # NOTE(review): blocking call without a timeout -- confirm intended.
        api_url = self.API_URL.format(retailer_id=self.retailer_id)
        raw_links = [entry['product_link']
                     for entry in requests.get(api_url).json()]
    else:
        raw_links = response.xpath(
            '//div[@class="search-results"]'
            '//a[@class="search-result-product-url"]/@href').extract()

    for raw_link in raw_links:
        yield urlparse.urljoin(response.url, raw_link), ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each link in the JSON body.

    With ``self.retailer_id`` set, the body is a sequence of entries with a
    ``product_link`` key.  Otherwise it is a mapping: ``total`` is stored on
    ``self.total_matches`` and the ``link`` of each ``results`` entry is
    yielded.
    """
    if self.retailer_id:
        targets = [entry['product_link']
                   for entry in json.loads(response.body)]
    else:
        payload = json.loads(response.body)
        self.total_matches = payload['total']
        targets = [result['link'] for result in payload['results']]

    for target in targets:
        yield target, ProductItem()
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link.

    With ``self.retailer_id`` set, only ``product_link`` values from the
    JSON body that contain ``officedepot`` are kept.  Otherwise the links
    are scraped with an XPath selector, falling back to a CSS selector when
    the XPath finds nothing.
    """
    if self.retailer_id:
        candidates = (entry['product_link']
                      for entry in json.loads(response.body))
        targets = [url for url in candidates if 'officedepot' in url]
    else:
        targets = response.xpath(
            '//div[contains(@class, "descriptionFull")]//a[contains(@class, "med_txt")]/@href'
        ).extract()
        if not targets:
            targets = response.css('.desc_text a::attr("href")').extract()

    for target in targets:
        yield target, ProductItem()
def parse_product(self, response):
    """Return a new ``ProductItem`` filled from a product page.

    Each field is delegated to the matching ``_parse_*`` helper.  The link
    is the response URL, and stock status, locale and condition are fixed
    values.
    """
    product = ProductItem()
    product['name'] = self._parse_name(response)
    product['brand'] = self._parse_brand(response)
    product['image'] = self._parse_image(response)
    product['link'] = response.url
    product['model'] = self._parse_model(response)
    # Stock is not scraped here; every product gets the same status.
    product['productstockstatus'] = self.STOCK_STATUS[
        'CALL_FOR_AVAILABILITY']
    product['ean'] = self._parse_ean(response)
    product['categories'] = self._parse_categories(response)
    product['sku'] = self._parse_sku(response)
    product['currencycode'] = self._parse_currency_code(response)
    product['locale'] = 'en-GB'
    product['price'] = self._parse_price(response)
    product['gallery'] = self._parse_gallery(response)
    product['features'] = self._parse_features(response)
    product['condition'] = 1
    return product
def _set_product_meta(self, response):
    """Attach a single-result ``ProductItem`` to ``response.meta``.

    The item carries the spider's site name, the search term taken from
    ``response.meta`` and a value of 1 for every count and ranking field.
    The same ``response`` object is returned.
    """
    item = ProductItem()
    item['site'] = self.site_name
    item['search_term'] = response.meta['search_term']
    for counter_field in ('total_matches', 'results_per_page',
                          'scraped_results_per_page', 'ranking'):
        item[counter_field] = 1
    response.meta['product'] = item
    return response
def _scrape_product_links(self, response):
    """Yield ``(url, ProductItem())`` for each product link in ``response``.

    With ``self.retailer_id`` set, the ``product_link`` values from the JSON
    body are yielded unchanged.  Otherwise the product header links are
    scraped and made absolute with ``response.urljoin``.
    """
    if self.retailer_id:
        targets = [entry['product_link']
                   for entry in json.loads(response.body)]
    else:
        hrefs = response.xpath(
            '//div[@class="productWrapper"]'
            '//div[@class="productInfo2"]//a[@class="productHdr"]/@href'
        ).extract()
        targets = [response.urljoin(href) for href in hrefs]

    for target in targets:
        yield target, ProductItem()
def parse_product(self, response):
    """Parse a JSON product response into a product item.

    The product is taken from a copy of ``response.meta`` (a new
    ``ProductItem`` if absent).  The body is decoded as UTF-8 with
    undecodable bytes ignored, parsed as JSON and handed to
    ``parse_product_item``.

    Returns the result of ``parse_product_item``, or ``None`` after logging
    when the body cannot be parsed.
    """
    meta = response.meta.copy()
    product = meta.get('product', ProductItem())
    try:
        json_response = json.loads(response.body.decode("utf-8", "ignore"))
    except TypeError as e:
        # Log the exception itself: ``e.message`` is empty for multi-arg
        # exceptions and does not exist on Python 3.
        self.logger.error("%sJson respone cannot be parsed", e)
    except Exception as e:
        self.logger.error("%s", e)
    else:
        return self.parse_product_item(json_response, product)
    return None
def _scrape_product_links(self, response):
    """Yield ``(target, ProductItem())`` pairs for the product links.

    With ``self.retailer_id`` set, links are fetched from ``self.API_URL``
    (synchronous ``requests.get``), resolved against ``response.url`` and
    wrapped in unfiltered ``Request`` objects whose meta sets ``fire`` and
    ``dont_redirect``.  Otherwise the product-list anchors are scraped and
    yielded as absolute URL strings.
    """
    if not self.retailer_id:
        hrefs = response.css(
            'div.product-list a.ac-product-link::attr(href)').extract()
        for href in hrefs:
            yield urlparse.urljoin(response.url, href), ProductItem()
        return

    # NOTE(review): blocking call without a timeout -- confirm intended.
    api_url = self.API_URL.format(retailer_id=self.retailer_id)
    raw_links = [entry['product_link']
                 for entry in requests.get(api_url).json()]
    for raw_link in raw_links:
        absolute_url = urlparse.urljoin(response.url, raw_link)
        # response.meta is reused rather than copied, so both flags are
        # also written onto the response's own meta dict.
        shared_meta = response.meta
        shared_meta['fire'] = True
        shared_meta['dont_redirect'] = True  # stops 301 redirects
        yield (
            Request(url=absolute_url, meta=shared_meta, dont_filter=True),
            ProductItem(),
        )
def _scrape_product_links(self, response):
    """Yield ``(absolute_url, ProductItem())`` for each product link.

    Three XPath selectors are tried in order and the first non-empty result
    is used.  Links are resolved against ``response.url``.
    """
    hrefs = response.xpath(
        '//a[contains(@property, "url")]/@href').extract()
    if not hrefs:
        hrefs = response.xpath(
            './/div[@class="product-info"]'
            '/a[contains(@class, "product-title")]/@href').extract()
    if not hrefs:
        hrefs = response.xpath(
            '//a[@class="product-title scTrack pfm"]/@href').extract()

    absolute_urls = [urlparse.urljoin(response.url, href) for href in hrefs]
    for absolute_url in absolute_urls:
        yield absolute_url, ProductItem()
def parse_product(self, response):
    """Return a new ``ProductItem`` filled from a product page.

    Each field is delegated to the matching ``_parse_*`` helper.  Currency,
    locale and condition are fixed values, and the sale price repeats the
    parsed price.
    """
    product = ProductItem()
    product['name'] = self._parse_name(response)
    product['brand'] = self._parse_brand(response)
    product['image'] = self._parse_image(response)
    product['link'] = response.url
    product['model'] = self._parse_model(response)
    product['categories'] = self._parse_categories(response)
    product['unspec'] = self._parse_unspec(response)
    product['currencycode'] = 'USD'
    product['locale'] = 'en-US'

    # The price is parsed once and reused for the sale price.
    parsed_price = self._parse_price(response)
    product['price'] = parsed_price
    product['saleprice'] = parsed_price

    product['instore'] = self._parse_instore(response)
    product['shiptostore'] = self._parse_shiptostore(response)
    product['shippingphrase'] = self._parse_shippingphrase(response)
    product['productstockstatus'] = self._parse_stock_status(response)
    product['gallery'] = self._parse_gallery(response)
    product['features'] = self._parse_features(response)
    product['condition'] = 1
    return product
def parse_product(self, response):
    """Fill and return the product for a product page.

    The product is taken from ``response.meta`` (a new ``ProductItem`` if
    absent).  Some fields go through ``cond_set`` / ``cond_set_value``;
    sale price, gallery, shipping phrase, ship-to-store, retailer key and
    features are assigned directly.
    """
    product = response.meta.get('product', ProductItem())

    cond_set_value(product, 'locale', 'en_US')
    cond_set(product, 'name', self.parse_name(response), conv=string.strip)
    cond_set(product, 'image', self.parse_image(response))
    cond_set_value(product, 'brand', self.parse_brand(response))
    cond_set_value(product, 'sku', self.parse_sku(response))

    # The price is parsed once and reused for the sale price.
    parsed_price = self.parse_price(response)
    cond_set_value(product, 'price', parsed_price)
    product['saleprice'] = parsed_price

    cond_set_value(product, 'model', self._parse_model(response))
    product['gallery'] = self._parse_gallery(response)
    cond_set_value(product, 'productstockstatus',
                   self._parse_product_stock_status(response))
    cond_set_value(product, 'categories', self._parse_categories(response))
    cond_set_value(product, 'manufacturer',
                   self._parse_manufacturer(response), conv=string.strip)
    product['shippingphrase'] = self._parse_shippingphrase(response)
    product['shiptostore'] = self._parse_shiptostore(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    product['features'] = self._parse_features(response)
    return product
def parse_product(self, response):
    """Return a new ``ProductItem`` filled from a product page.

    The response URL is also appended to ``self.link``.  Each scraped field
    is delegated to the matching ``_parse_*`` helper.  ``ean`` and
    ``gallery`` are always ``None``, and the sale price repeats the parsed
    price.
    """
    product = ProductItem()
    self.link.append(response.url)

    product['name'] = self._parse_name(response)
    product['brand'] = self._parse_brand(response)
    product['image'] = self._parse_image(response)
    product['link'] = response.url
    product['model'] = self._parse_model(response)
    product['upc'] = self._parse_upc(response)
    product['ean'] = None
    product['currencycode'] = 'USD'
    product['locale'] = 'en-US'

    # The price is parsed once and reused for the sale price.
    parsed_price = self._parse_price(response)
    product['price'] = parsed_price
    product['saleprice'] = parsed_price

    product['sku'] = self._parse_sku(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    product['instore'] = self._parse_instore(response)
    product['shiptostore'] = self._parse_shiptostore(response)
    product['shippingphrase'] = self._parse_shippingphrase(response)
    product['productstockstatus'] = self._parse_stock_status(response)
    product['gallery'] = None
    product['features'] = self._parse_features(response)
    product['condition'] = 1
    return product
def parse_product(self, response):
    """Return a new ``ProductItem`` filled from a product page.

    Each scraped field is delegated to the matching ``_parse_*`` helper.
    The parsed model is stored as both ``model`` and ``mpn``.  Currency,
    locale and condition are fixed values.
    """
    product = ProductItem()
    product['name'] = self._parse_name(response)
    product['brand'] = self._parse_brand(response)
    product['image'] = self._parse_image(response)
    product['link'] = response.url

    # The model doubles as the manufacturer part number.
    parsed_model = self._parse_model(response)
    product['model'] = parsed_model
    product['mpn'] = parsed_model

    product['ean'] = self._parse_ean(response)
    product['categories'] = self._parse_categories(response)
    product['currencycode'] = 'GBP'
    # NOTE(review): 'en-UK' is not the usual tag for British English
    # ('en-GB') -- confirm what downstream consumers expect.
    product['locale'] = 'en-UK'
    product['price'] = self._parse_price(response)
    product['sku'] = self._parse_sku(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    product['productstockstatus'] = self._parse_stock_status(response)
    product['gallery'] = self._parse_gallery(response)
    product['features'] = self._parse_features(response)
    product['condition'] = 1
    return product
def parse_product(self, response):
    """Fill and return the product for a product page.

    Returns:
        ``None`` when the body contains the 'Good thing this is not
        permanent' marker; a retry ``Request`` for the same URL when the
        maintenance banner is present; otherwise the populated product,
        taken from ``response.meta`` or newly created.

    A response with status 429 is re-fetched once with ``requests`` before
    parsing.  The upc, manufacturer, shiptostore and features fields are
    not populated by this method.
    """
    product = response.meta.get('product', ProductItem())

    if 'Good thing this is not permanent' in response.body_as_unicode():
        return None

    maintenance_error = response.xpath(
        './/*[contains(text(), "The site is currently under maintenance.")]'
    )
    if maintenance_error:
        self.log(
            "Website under maintenance error, retrying request: {}".format(
                response.url), WARNING)
        return Request(response.url, callback=self.parse_product,
                       meta=response.meta, dont_filter=True)

    if response.status == 429:
        # Re-fetch synchronously, but keep the Scrapy response type and only
        # swap the body.  Replacing ``response`` with the ``requests``
        # object would break the ``response.meta`` assignment further down,
        # because ``requests.Response`` has no ``meta`` attribute.
        refetched = requests.get(url=response.url, timeout=5)
        response = response.replace(body=refetched.content)

    product['name'] = self._parse_name(response)
    product['image'] = self._parse_image(response)
    product['model'] = self._parse_model(response)
    product['currencycode'] = 'USD'
    product['locale'] = 'en-US'
    product['sku'] = self._parse_sku(response)
    product['categories'] = self._parse_categories(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    product['instore'] = self._parse_instore(response)

    # The product is stored in meta before the stock-status helper runs.
    response.meta['product'] = product
    oos = self._parse_product_stock_status(response)
    cond_set_value(product, 'productstockstatus', oos)

    product['gallery'] = self._parse_gallery(response)
    product['condition'] = 1
    product['price'] = self._parse_price(response)
    return product
def parse_product(self, response):
    """Fill the product from a product page and request its price.

    Returns:
        The product flagged ``not_found`` when the body contains the 'Good
        thing this is not permanent' marker; a retry ``Request`` for the
        same URL when the maintenance banner is present; otherwise a
        ``Request`` to ``self.PRICE_URL`` handled by ``self._parse_price``.
        If that request cannot be built, the error is logged and the
        product is returned without price data.

    A response with status 429 is re-fetched once with ``requests`` before
    parsing.
    """
    meta = response.meta.copy()
    product = meta.get('product', ProductItem())
    # Store the product in the meta sent with the price request; a newly
    # created item would otherwise never reach that request.
    meta['product'] = product

    if 'Good thing this is not permanent' in response.body_as_unicode():
        product['not_found'] = True
        return product

    maintenance_error = response.xpath(
        './/*[contains(text(), "The site is currently under maintenance.")]')
    if maintenance_error:
        self.log(
            "Website under maintenance error, retrying request: {}".format(
                response.url), WARNING)
        return Request(response.url, callback=self.parse_product,
                       meta=response.meta, dont_filter=True)

    if response.status == 429:
        # Re-fetch synchronously, but keep the Scrapy response type and only
        # swap the body, so the selector calls used on it keep working.
        # NOTE(review): assumes the _parse_* helpers use the Scrapy
        # response API -- confirm.
        refetched = requests.get(url=response.url, timeout=5)
        response = response.replace(body=refetched.content)

    product['name'] = self._parse_name(response)
    product['image'] = self._parse_image(response)
    product['model'] = self._parse_model(response)
    product['upc'] = self._parse_upc(response)
    product['currencycode'] = 'USD'
    product['locale'] = 'en-US'
    sku = self._parse_sku(response)
    product['sku'] = sku
    product['manufacturer'] = self._parse_manufacturer(response)
    product['categories'] = self._parse_categories(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    product['instore'] = self._parse_instore(response)
    product['shiptostore'] = self._parse_shiptostore(response)
    product['gallery'] = self._parse_gallery(response)
    product['features'] = self._parse_features(response)
    product['condition'] = 1

    # The price comes from a separate endpoint, parameterised by values
    # embedded in the page's JS data.
    js_data = self.parse_js_data(response)
    try:
        sku_value = product.get("sku", "")
        prod_doc_key = js_data['prod_doc_key']
        if sku_value:
            # Replace the last path segment of the doc key with the sku.
            segments = prod_doc_key.split("/")[:-1]
            segments.append(sku_value)
            prod_doc_key = "/".join(segments)

        metadata = js_data['metadata']
        price_url = self.PRICE_URL.format(
            sku=sku,
            metadata__coming_soon_flag=metadata['coming_soon_flag'],
            metadata__price_in_cart_flag=metadata['price_in_cart_flag'],
            prod_doc_key=prod_doc_key,
            metadata__product_type__id=metadata['product_type']['id'],
            metadata__preorder_flag=metadata['preorder_flag'],
            street_date=time.time(),
            metadata__channel_availability_for__id=
            metadata['channel_availability_for']['id'],
            metadata__backorder_flag=metadata['backorder_flag'],
        )
        return Request(
            url=price_url,
            dont_filter=True,
            callback=self._parse_price,
            meta=meta,
            headers={"Referer": None,
                     "X-Requested-With": "XMLHttpRequest",
                     'User-Agent': 'Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)'}
        )
    except Exception:
        # Best effort: any failure building the price request is logged and
        # the product is returned as parsed so far.
        self.log(
            "Error while forming request for base product data: {}".format(
                traceback.format_exc()), WARNING)
        return product
def parse_product(self, response):
    """Return a new ``ProductItem`` filled from a product page.

    Each scraped field is delegated to the matching ``_parse_*`` helper.
    Locale, ship-to-store and condition are fixed values.
    """
    product = ProductItem()
    product['name'] = self._parse_name(response)
    product['brand'] = self._parse_brand(response)
    product['image'] = self._parse_image(response)
    product['link'] = response.url
    product['model'] = self._parse_model(response)
    product['categories'] = self._parse_categories(response)
    product['sku'] = self._parse_sku(response)
    product['retailer_key'] = self._parse_retailer_key(response)
    # The second retailer key is stored in the ``ean`` field.
    product['ean'] = self._parse_retailer_key2(response)
    # NOTE(review): this overwrites the sku assigned above with the mpn --
    # confirm whether a separate 'mpn' field was intended.
    product['sku'] = self._parse_mpn(response)
    product['currencycode'] = self._parse_currency_code(response)
    product['locale'] = 'en-gb'
    product['price'] = self._parse_price(response)
    product['shiptostore'] = 1
    product['productstockstatus'] = self._parse_stock_status(response)
    product['gallery'] = self._parse_gallery(response)
    product['features'] = self._parse_features(response)
    product['condition'] = 1
    return product
def parse_product(self, response):
    """Yield the product filled from a product page.

    The product is taken from ``response.meta`` (a new ``ProductItem`` if
    absent).  Each scraped field is delegated to the matching ``_parse_*``
    helper.  ``ean`` is always ``None``; currency and locale are fixed.
    """
    item = response.meta.get('product', ProductItem())

    # Identity
    item['name'] = self._parse_name(response)
    item['brand'] = self._parse_brand(response)
    item['image'] = self._parse_image(response)
    item['link'] = response.url
    item['model'] = self._parse_model(response)
    item['upc'] = self._parse_upc(response)
    item['ean'] = None

    # Pricing
    item['currencycode'] = 'USD'
    item['locale'] = 'en-US'
    item['price'] = self._parse_price(response)
    item['saleprice'] = self._parse_sale_price(response)

    # Retailer identifiers and availability
    item['sku'] = self._parse_sku(response)
    item['retailer_key'] = self._parse_retailer_key(response)
    item['instore'] = self._parse_instore(response)
    item['productstockstatus'] = self._parse_stock_status(response)

    # Descriptive content
    item['categories'] = self._parse_categories(response)
    item['gallery'] = self._parse_gallery(response)
    item['features'] = self._parse_features(response)
    item['condition'] = self._parse_condition(response)

    yield item