class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
    ===========
    Main spider for crawling Waitrose website and searching for products.
    Settings for XPaths etc are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.

    This variant drives a headless Firefox via Selenium so the
    "infinite scrolling" product list can be fully expanded before parsing.
    """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    # Placeholder only; replaced with store-specific settings in __init__.
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
            scrapy crawl waitrose -a csv_file=waitrose_input.csv
        Input CSV file should be in data directory.
        If CSV file not specified, defaults to {name}_input.csv
        e.g. waitrose_input.csv.
        Output files are written to: supermarket_scraper/output/[spider name]
        Output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        ########## Fix for infinite scrolling #############
        # Headless X display + real Firefox; used below to click the
        # "load more" element repeatedly so that every product ends up in a
        # single page source.
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        #self.driver.maximize_window()
        self.driver.set_window_size(1920, 1080)
        time.sleep(3)  # give the browser time to come up
        self.tb = 'tb none'  # most recent traceback text (debug aid)
        # Make sure the browser/display are torn down when the spider closes.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        ########## Fix for infinite scrolling #############
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.

        We don't need to nest searches here because Waitrose website allows
        us to construct URLs directly, instead of having to navigate through
        several layers of menus.
        """
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results.

        NOTE: only one seed request is yielded here; the per-search category
        URLs are built and fetched with Selenium in parse_start_url().
        """
        #search_list = self.get_searches()
        # Build URLs based on base URL + sub-categories
        #for s in search_list:
        #    search_meta = {}
        #    product_url = ''
        #    search_meta = s.get_meta_map()
        product1_url = "http://www.waitrose.com/shop/Browse/Groceries/"
        log.msg("Spider: start_requests() yielding URL: " + product1_url,
                level=log.DEBUG)
        yield Request(url=product1_url)

    def parse_start_url(self, response):
        """Default function to parse responses from base URL:
        For each configured search, drive Selenium to the category page,
        click the 'load more' element until it disappears, then parse the
        fully expanded page source and yield a ProductItem per product.
        Products that fail extraction are skipped, not fatal.
        """
        ########## Fix for infinite scrolling #############
        search_list = self.get_searches()
        for s in search_list:
            search_meta = {}
            product_url = ''
            metadata = s.get_meta_map()
            # Category URL: base/sub1/sub2/sub3/
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            self.driver.maximize_window()
            time.sleep(1)
            self.driver.get(product_url)
            time.sleep(2)
            log.msg("Spider: parse_start_url :: " + product_url,
                    level=log.DEBUG)
            sel = Selector(text=self.driver.page_source)
            #i=0
            # Keep scrolling to the bottom and clicking the "load more"
            # element until it disappears or can no longer be clicked.
            while True:
                try:
                    #i = i + 1
                    next_element = self.driver.find_element_by_xpath(
                        self.settings.next_page_xpath)
                    debug_text_class = next_element.get_attribute('href')
                    #log.msg("Spider: parse_start_url :: Inside while :: next element"+str(debug_text_class), level=log.DEBUG)
                    self.driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                    try:
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                    except:
                        # Click failed (e.g. wait timed out): assume no more
                        # pages and stop expanding.
                        self.tb = traceback.format_exc()
                        #print '------------------inside button click exception i count--------------' ,i
                        #print 'ERROR TRACE ::: ',self.tb
                        #log.msg("Spider: parse_start_url :: Inside Exception handling :: Load more button Click "+str(self.tb), level=log.DEBUG)
                        break
                    time.sleep(2)
                except NoSuchElementException:
                    # "Load more" element gone: all products are loaded.
                    self.tb = traceback.format_exc()
                    #print '------------------ End of infinite scrolling/NoSuchElementException :: i count --------------' ,i
                    #print 'ERROR TRACE ::: ',self.tb
                    #log.msg("Spider: parse_start_url :: End of infinite scrolling/NoSuchElementException "+str(self.tb), level=log.DEBUG)
                    break
                except:
                    self.tb = traceback.format_exc()
                    #print '------------------inside Exception handling:: i count --------------' ,i
                    #print 'ERROR TRACE ::: ',self.tb
                    #log.msg("Spider: parse_start_url :: inside infinite scrolling exception handling "+ str(self.tb), level=log.DEBUG)
                    break
            # Re-parse the now fully expanded page.
            sel = Selector(text=self.driver.page_source)
            products = sel.xpath(self.settings.products_xpath)
            log.msg("Spider: parsing response for URL: " + response.url +
                    " for ONS item " + metadata['ons_item_name'],
                    level=log.DEBUG)
            product_counter = len(products)
            #print 'Spider: parsing response for URL: total no. of products:: ',product_counter
            log.msg("Spider: parse_start_url :: total no. of products:: " +
                    str(product_counter), level=log.DEBUG)
            for product in products:
                # Create an item for each entry
                item = ProductItem()
                #UPPER case product name for storage to make searching easier
                try:
                    item['product_name'] = (product.xpath(
                        self.settings.product_name_xpath).extract()[0]
                    ).upper()
                except:
                    continue
                log.msg("Spider: Response for URL: " + response.url +
                        " found " + item['product_name'].encode('utf-8'),
                        level=log.DEBUG)
                try:
                    item['store'] = self.store
                    item['ons_item_no'] = metadata['ons_item_no']
                    item['ons_item_name'] = metadata['ons_item_name']
                    item['product_type'] = metadata['store_sub3']
                    item['search_string'] = metadata['search_terms']
                except:
                    continue
                #Default matches to 1.0 and modify later
                try:
                    item['search_matches'] = 1.0
                    # Save price string and convert it to number later
                    item['item_price_str'] = product.xpath(
                        self.settings.raw_price_xpath).extract()[0].strip()
                    x = item['item_price_str'][0]
                    #print('test', x)
                    #pos = item['item_price_str'].index('\xc2')
                    #item['item_price_str'] = item['item_price_str'][:].strip()
                    #print(item['item_price_str'][4])
                    # Leading 'N' appears to mark a promotional prefix
                    # (presumably "NOW") -- first 3 chars dropped; TODO confirm.
                    if item['item_price_str'][0] == 'N':
                        item['item_price_str'] = item['item_price_str'][
                            3:].strip()
                    else:
                        item['item_price_str'] = item[
                            'item_price_str'][:].strip()
                    # Try getting the volume and putting it on the end of the product name
                    volume = product.xpath(
                        self.settings.volume_xpath).extract()
                    if volume:
                        item['product_name'] = item[
                            'product_name'] + " " + volume[0].strip().upper()
                except:
                    continue
                # Waitrose volume price not always provided, so if it is not there,
                # we try using volume and item price instead.
                try:
                    item['volume_price'] = ''
                    vol_price = product.xpath(
                        self.settings.vol_price_xpath).extract()
                    if vol_price:
                        #Allow for e.g. "1.25 per litre" instead of "1.25/litre"
                        item['volume_price'] = (vol_price[0].strip()).replace(
                            "per", "/")
                    else:
                        item['volume_price'] = item[
                            'item_price_str'] + "/" + volume[0].strip()
                    # Add timestamp
                    item['timestamp'] = datetime.datetime.now()
                    # Get promotion text (if any) NOT YET IMPLEMENTED
                    item['promo'] = ''
                    if self.settings.promo_xpath:
                        promo = product.xpath(
                            self.settings.promo_xpath).extract()
                        #TODO
                        if promo:
                            item['promo'] = promo[0]
                    # Get short term offer (if any) NOT YET IMPLEMENTED
                    item['offer'] = ''
                    if self.settings.offer_xpath:
                        offer = product.xpath(
                            self.settings.offer_xpath).extract()
                        #TODO
                        if offer:
                            item['offer'] = offer[0]
                except:
                    continue
                #Pass the item back
                product_counter = product_counter - 1
                yield item

    def spider_closed(self, spider):
        # Tear down the Selenium browser and the virtual display.
        self.display.stop()
        self.driver.quit()
class SainsburySpider(CrawlSpider):
    """SainsburySpider
    ===========
    Main spider for crawling the Sainsbury's store website and searching
    for products.  (Original docstring said "Tecso store" -- copy/paste typo.)
    Settings for XPaths etc are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'sainsbury'
    store = "SAINSBURY"
    # Placeholder only; replaced with store-specific settings in __init__.
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
            scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
        Input CSV file should be in supermarket_scraper/input directory.
        If CSV file not specified, defaults to {name}_input.csv
        e.g. sainsbury_input.csv.
        Output files are written to: supermarket_scraper/output/[spider name]
        Output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.

        We don't need to nest searches here because Sainsbury website allows
        us to identify URLs directly, instead of having to navigate through
        several layers of menus.
        """
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results."""
        search_list = self.get_searches()
        a = str(search_list)  # debug leftover; not used below
        #print('gets urls and parses the repsonse objects to form a list: ', a)
        sb_cookies = self.settings.cookies
        # Build URLs based on base URL + sub-categories
        for s in search_list:
            search_meta = {}
            product_url = ''
            search_meta = s.get_meta_map()
            search_meta['cookiejar'] = 1
            # store_sub3 holds the full target URL for this search.
            product_url = s.store_sub3
            # print urls
            #print('product:' ,product_url)
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url,
                          cookies=sb_cookies,
                          meta=search_meta,
                          callback=self.parse_base)

    def parse_base(self, response):
        """Default callback to parse a product-listing response.

        Follows any 'next page' links (re-entering this callback) and then
        yields a ProductItem for each product line on the current page.
        (The original docstring here described Waitrose -- copy/paste.)
        """
        # Get details of current search (passed in via response meta data)
        metadata = response.meta
        #Find product lines
        sel = Selector(response)
        # rb test, only collecting three item (object) responses
        #print('test reponse objects from spider: ',sel)
        sb_cookies = self.settings.cookies
        #Find any "next" links for paging and yield Request to next page
        next_page = sel.xpath(self.settings.next_page_xpath)
        for page in next_page:
            #Check each nav link for the required sub-category
            next_link_ref = page.xpath('@href').extract()[0]
            log.msg("Spider: found NEXT page link: " + next_link_ref,
                    level=log.DEBUG)
            yield Request(next_link_ref,
                          cookies=sb_cookies,
                          meta=response.meta,
                          callback=self.parse_base)
        #Process each product line
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        products = sel.xpath(self.settings.products_xpath)
        for product in products:
            # Create an item for each entry
            item = ProductItem()
            item['store'] = self.store
            #print('store field of item object', item['store'])
            item['ons_item_no'] = metadata['ons_item_no']
            item['ons_item_name'] = metadata['ons_item_name']
            item['product_type'] = metadata['store_sub3']
            item['search_string'] = metadata['search_terms']
            #Default matches to 1.0 and modify later
            #item['search_matches'] = 1.0
            #UPPER case product name for storage to make searching easier
            prodname = product.xpath(
                self.settings.product_name_xpath).extract()
            if len(prodname) > 0:
                item['product_name'] = prodname[0].upper().strip()
            #print('individual item product names: ', item['product_name'])
            # WARNING: Prices format is much more complicated on Sainsburys
            # pages, so we have to do multiple layers of extraction here to
            # get the prices while we still have access to the XPaths etc.
            price_block = product.xpath(self.settings.raw_price_xpath)
            raw_price_block = price_block[0]
            vol_price_block = price_block[1]
            #price_block[0]
            #price_block[1]
            #print('individual item prices ', raw_price_block)
            #print('individual volume item prices ', vol_price_block)
            #Extract a raw price
            ppu_price = raw_price_block.xpath('text()')[0]
            ppu_unit = raw_price_block.xpath(
                '*/span[@class="pricePerUnitUnit"]/text()')[0]
            item['item_price_str'] = ppu_price.extract().strip(
            ) + '/' + ppu_unit.extract().strip()
            #print('individual item prices processed', item['item_price_str'])
            #Extract the components of the volume price e.g. 1.50 per 100g
            #THIS WILL BREAK IF PRICE FORMAT ON PAGE CHANGES!
            vol_abbr = vol_price_block.xpath('text()').extract()
            #print('volume_unit_raw', vol_abbr )
            if vol_abbr[0].strip():
                vol_price = vol_abbr[0].strip()
                if vol_abbr[1].strip():
                    vol_price = vol_price + ' / ' + vol_abbr[1]
                else:
                    #default std quantity to 1
                    vol_price = vol_price + ' / 1 '
            #Get the volume units as well
            # NOTE(review): if vol_abbr[0] is blank, vol_price below is
            # unbound (or stale from a previous product) -- confirm intent.
            #exception added as the last two unit_vol's were not collecting,
            #this adds an NA in when this is the case and parses to the next
            #product
            try:
                vol_unit = product.xpath(self.settings.vol_unit)[2]
                vol_price = vol_price + vol_unit.extract().strip()
            except:
                #default std quantity to 1
                vol_unit = "NA"
                vol_price = vol_price + vol_unit
            #Get the volume units as well
            #print('vol_unit', vol_unit)
            #print('vol _nunit', vol_unit)
            #vol_price_block.xpath("*/span[@class='pricePerMeasureMeasure']/text()")
            #Construct the vol price in known format and save it to the item
            item['volume_price'] = vol_price
            #print('vol _nunit', item['volume_price'])
            # Add timestamp
            item['timestamp'] = datetime.datetime.now()
            #Ignore promos/offers
            item['promo'] = product.xpath(
                self.settings.promo_xpath).extract()
            item['offer'] = product.xpath(
                self.settings.offer_xpath).extract()
            #Pass the item back
            yield item
class SainsburySpider(CrawlSpider):
    """SainsburySpider (Selenium variant)
    ===========
    Main spider for crawling the Sainsbury's store website and searching
    for products, driving a headless Firefox via Selenium to click through
    'next page' links.  (Original docstring said "Tecso store".)
    Settings for XPaths etc are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.

    NOTE(review): this class reuses the class name and spider `name` of the
    earlier SainsburySpider in this file; the later definition shadows the
    earlier one at module level.
    """
    name = 'sainsbury'
    store = "SAINSBURY"
    # Placeholder only; replaced with store-specific settings in __init__.
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
            scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
        Input CSV file should be in supermarket_scraper/input directory.
        If CSV file not specified, defaults to {name}_input.csv
        e.g. sainsbury_input.csv.
        Output files are written to: supermarket_scraper/output/[spider name]
        Output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)
        ## selenium
        # Headless X display + Firefox used to page through results.
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        #self.driver.maximize_window()
        self.driver.set_window_size(1920, 1080)
        time.sleep(20)  # generous pause for the browser to come up
        self.tb = 'tb none'  # most recent traceback text (debug aid)
        # Ensure browser/display are torn down when the spider closes.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        #i=0
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.

        We don't need to nest searches here because Sainsbury website allows
        us to identify URLs directly, instead of having to navigate through
        several layers of menus.
        """
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates a single seed request; the real per-search navigation
        is done with Selenium inside parse_base()."""
        sb_cookies = self.settings.cookies
        product_url = "http://www.sainsburys.co.uk"
        log.msg("Spider: start_requests() yielding URL: " + product_url,
                level=log.DEBUG)
        yield Request(url=product_url,
                      cookies=sb_cookies,
                      callback=self.parse_base)

    def parse_base(self, response):
        """Parse all configured searches using Selenium.

        For each search: load its target URL, parse the first page, then
        repeatedly click the 'next page' element and parse each subsequent
        page, yielding a ProductItem per product line.  The loop ends when
        the next-page element is missing or any extraction error occurs.
        """
        search_list = self.get_searches()
        for s in search_list:
            search_meta = {}
            product_url = ''
            search_meta = s.get_meta_map()
            # store_sub3 holds the full target URL for this search.
            product_url = s.store_sub3
            self.driver.get(product_url)
            sel = Selector(text=self.driver.page_source)
            # None for the first iteration so the landing page is parsed
            # before any next-page click.
            first_page_parse_finished = None
            log.msg("Spider: start_requests() yielding URL:" + product_url,
                    level=log.DEBUG)
            while True:
                try:
                    if first_page_parse_finished:
                        #Find any "next" links for paging
                        next_element = self.driver.find_element_by_xpath(
                            self.settings.next_page_xpath)
                        debug_text_class = next_element.get_attribute('href')
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                        time.sleep(3)
                    first_page_parse_finished = True
                    sel = Selector(text=self.driver.page_source)
                    products = sel.xpath(self.settings.products_xpath)
                    for product in products:
                        # Create an item for each entry
                        item = ProductItem()
                        item['store'] = self.store
                        #print('store field of item object', item['store'])
                        item['ons_item_no'] = search_meta['ons_item_no']
                        item['ons_item_name'] = search_meta['ons_item_name']
                        item['product_type'] = search_meta['store_sub3']
                        item['search_string'] = search_meta['search_terms']
                        #Default matches to 1.0 and modify later
                        #item['search_matches'] = 1.0
                        #UPPER case product name for storage to make searching easier
                        prodname = product.xpath(
                            self.settings.product_name_xpath).extract()
                        if len(prodname) > 0:
                            item['product_name'] = prodname[0].upper().strip()
                        #print 'SPIDER :: sainsbury :: product_name',format(item['product_name'].encode('utf-8'))
                        # WARNING: Prices format is much more complicated on Sainsburys
                        # pages, so we have to do multiple layers of extraction here to
                        # get the prices while we still have access to the XPaths etc.
                        price_block = product.xpath(
                            self.settings.raw_price_xpath)
                        raw_price_block = price_block[0]
                        vol_price_block = price_block[1]
                        #price_block[0]
                        #price_block[1]
                        #print('individual item prices ', raw_price_block)
                        #print('individual volume item prices ', vol_price_block)
                        #Extract a raw price
                        ppu_price = raw_price_block.xpath('text()')[0]
                        ppu_unit = raw_price_block.xpath(
                            '*/span[@class="pricePerUnitUnit"]/text()')[0]
                        item['item_price_str'] = ppu_price.extract().strip(
                        ) + '/' + ppu_unit.extract().strip()
                        #print('individual item prices processed', item['item_price_str'])
                        #Extract the components of the volume price e.g. 1.50 per 100g
                        #THIS WILL BREAK IF PRICE FORMAT ON PAGE CHANGES!
                        vol_abbr = vol_price_block.xpath(
                            'text()').extract()
                        #print('volume_unit_raw', vol_abbr )
                        if vol_abbr[0].strip():
                            vol_price = vol_abbr[0].strip()
                            if vol_abbr[1].strip():
                                vol_price = vol_price + ' / ' + vol_abbr[1]
                            else:
                                #default std quantity to 1
                                vol_price = vol_price + ' / 1 '
                        #Get the volume units as well
                        # NOTE(review): if vol_abbr[0] is blank, vol_price
                        # below is unbound (or stale) -- confirm intent.
                        #exception added as the last two unit_vol's were not
                        #collecting, this adds an NA in when this is the case
                        #and parses to the next product
                        try:
                            vol_unit = product.xpath(
                                self.settings.vol_unit)[2]
                            vol_price = vol_price + vol_unit.extract(
                            ).strip()
                        except:
                            #default std quantity to 1
                            vol_unit = "NA"
                            vol_price = vol_price + vol_unit
                        #Get the volume units as well
                        #print('vol_unit', vol_unit)
                        #print('vol _nunit', vol_unit)
                        #vol_price_block.xpath("*/span[@class='pricePerMeasureMeasure']/text()")
                        #Construct the vol price in known format and save it to the item
                        item['volume_price'] = vol_price
                        #print('vol _nunit', item['volume_price'])
                        # Add timestamp
                        item['timestamp'] = datetime.datetime.now()
                        #Ignore promos/offers
                        item['promo'] = product.xpath(
                            self.settings.promo_xpath).extract()
                        item['offer'] = product.xpath(
                            self.settings.offer_xpath).extract()
                        #Pass the item back
                        yield item
                except NoSuchElementException:
                    # No next-page element: finished with this search.
                    #print 'Inside NoSuchElementException handling::: '
                    break
                except:
                    self.tb = traceback.format_exc()
                    log.msg(
                        "Spider: parse request :Inside Exception handling:::" +
                        self.tb, level=log.DEBUG)
                    #print 'Inside Exception handling::: ',self.tb
                    break

    def spider_closed(self, spider):
        #print "--- %s seconds ---" % (time.time() - start_time))
        # Tear down the Selenium browser and the virtual display.
        self.display.stop()
        self.driver.quit()
class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
    ===========
    Main spider for crawling Waitrose website and searching for products.
    Settings for XPaths etc are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.

    NOTE(review): this class reuses the class name and spider `name` of the
    Selenium-based WaitroseSpider earlier in this file; the later definition
    shadows the earlier one at module level.
    """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    # Placeholder only; replaced with store-specific settings in __init__.
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
            scrapy crawl waitrose -a csv_file=waitrose_input.csv
        Input CSV file should be in data directory.  If not specified,
        defaults to {name}_input.csv e.g. waitrose_input.csv.
        Output files are written to: supermarket_scraper/output/[spider name]
        Output directory MUST EXIST!

        :param csv_file: optional name of the input CSV search file.
        :raises WaitroseSpiderError: if the output directory does not exist.
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.

        We don't need to nest searches here because Waitrose website allows
        us to construct URLs directly, instead of having to navigate through
        several layers of menus.

        :raises WaitroseSpiderError: if no input file name is configured.
        """
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            # Could use some other source for target URLs here - database?
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates one crawler Request per configured search.

        The search metadata travels in the request meta so that
        parse_start_url() can attach it to every scraped item.
        """
        search_list = self.get_searches()
        # Build URLs based on base URL + sub-categories.
        for s in search_list:
            search_meta = s.get_meta_map()
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url, meta=search_meta)

    def parse_start_url(self, response):
        """Default function to parse responses from the base URL.

        Waitrose serves products in a single list, but we cannot scroll
        through them and there is no 'Next page' link, so we just extract
        the first set of up to 24 product items and yield them for
        processing.  Products whose name or price cannot be extracted are
        skipped rather than aborting the whole page.
        """
        # Details of the current search (passed in via response meta data).
        metadata = response.meta
        # Find product lines.
        sel = Selector(response)
        products = sel.xpath(self.settings.products_xpath)
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        for product in products:
            # Create an item for each entry.
            item = ProductItem()
            # UPPER-case product name for storage to make searching easier.
            try:
                item['product_name'] = (product.xpath(
                    self.settings.product_name_xpath).extract()[0]).upper()
            except Exception:
                continue  # no name -> not a usable product entry
            # BUG FIX: was .encode('utf-16'), which injects UTF-16 bytes into
            # a text log line; utf-8 matches the rest of this module.
            log.msg("Spider: Response for URL: " + response.url + " found " +
                    item['product_name'].encode('utf-8'), level=log.DEBUG)
            try:
                item['store'] = self.store
                item['ons_item_no'] = metadata['ons_item_no']
                item['ons_item_name'] = metadata['ons_item_name']
                item['product_type'] = metadata['store_sub3']
                item['search_string'] = metadata['search_terms']
            except Exception:
                continue  # incomplete search metadata
            try:
                # Default matches to 1.0 and modify later.
                item['search_matches'] = 1.0
                # Save price string; converted to a number downstream.
                item['item_price_str'] = product.xpath(
                    self.settings.raw_price_xpath).extract()[0].strip()
                # A leading 'N' appears to mark a promotional prefix
                # (presumably "NOW") -- first 3 chars dropped; TODO confirm.
                # Indexing [0] on an empty price string raises IndexError
                # here, which deliberately skips the product.
                if item['item_price_str'][0] == 'N':
                    item['item_price_str'] = item['item_price_str'][3:].strip()
                # Try getting the volume and appending it to the name.
                volume = product.xpath(self.settings.volume_xpath).extract()
                if volume:
                    item['product_name'] = (item['product_name'] + " " +
                                            volume[0].strip().upper())
            except Exception:
                continue  # unparseable price -> skip this product
            # Waitrose volume price not always provided, so if it is not
            # there, we try using volume and item price instead.
            try:
                item['volume_price'] = ''
                vol_price = product.xpath(
                    self.settings.vol_price_xpath).extract()
                if vol_price:
                    # Allow for e.g. "1.25 per litre" instead of "1.25/litre".
                    item['volume_price'] = (vol_price[0].strip()).replace(
                        "per", "/")
                else:
                    # NOTE(review): when neither vol_price nor volume exists
                    # this raises IndexError and the item is skipped.
                    item['volume_price'] = item[
                        'item_price_str'] + "/" + volume[0].strip()
                # Add timestamp.
                item['timestamp'] = datetime.datetime.now()
                # Promotion text (if any) - NOT YET IMPLEMENTED.
                item['promo'] = ''
                if self.settings.promo_xpath:
                    promo = product.xpath(self.settings.promo_xpath).extract()
                    if promo:
                        item['promo'] = promo[0]
                # Short-term offer (if any) - NOT YET IMPLEMENTED.
                item['offer'] = ''
                if self.settings.offer_xpath:
                    offer = product.xpath(self.settings.offer_xpath).extract()
                    if offer:
                        item['offer'] = offer[0]
            except Exception:
                continue
            # Pass the completed item back to the pipelines.
            yield item