def test_replace_tags(self):
    """Check that ``replace_tags`` removes or substitutes markup tags."""
    # The result must always be a unicode string, even when the input
    # contains no tags at all.  assertIsInstance is used instead of a bare
    # ``assert`` so the check survives ``python -O`` and reports a useful
    # failure message.
    self.assertIsInstance(replace_tags('no entities'), unicode)

    # Without a replacement token the tags simply disappear.
    self.assertEqual(
        replace_tags(u'This text contains <a>some tag</a>'),
        u'This text contains some tag')

    # An explicit token takes the place of every tag.
    self.assertEqual(
        replace_tags('This text is very im<b>port</b>ant', ' '),
        u'This text is very im port ant')

    # A tag whose attributes span several lines is still handled as one tag.
    self.assertEqual(
        replace_tags('Click <a class="one"\r\n href="url">here</a>'),
        u'Click here')
def test_replace_tags(self):
    """Verify tag removal and tag substitution performed by ``replace_tags``."""
    # Output type: always unicode.  A unittest assertion is used rather than
    # a bare ``assert`` so the check is not stripped under ``python -O``.
    self.assertIsInstance(replace_tags('no entities'), unicode)

    # Default call: tags are dropped, the text between them is kept.
    self.assertEqual(
        replace_tags(u'This text contains <a>some tag</a>'),
        u'This text contains some tag')

    # Second argument: the token inserted where each tag used to be.
    self.assertEqual(
        replace_tags('This text is very im<b>port</b>ant', ' '),
        u'This text is very im port ant')

    # Multiline tags: a line break inside the tag must not prevent matching.
    self.assertEqual(
        replace_tags('Click <a class="one"\r\n href="url">here</a>'),
        u'Click here')
def parse(self, response):
    """Follow category links and yield one product per listing row.

    A ``Request`` with this same method as callback is yielded for every
    link found inside the ``dvWrapControl732`` block.  Each ``ProductGroup``
    table row (regular or alternating) is then loaded into a ``Product``.
    Rows without a name or a price are reported with ``print`` and skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select("//div[@id='dvWrapControl732']//a/@href").extract()
    for link in categories:
        yield Request(urljoin_rfc(base_url, link), callback=self.parse)

    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the
    # XPath expression.
    rows = hxs.select(
        "//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] | "
        "//table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']")
    for row in rows:
        name = row.select(
            "td[@id='tdProductGroupDisplayDescription']/div/font | "
            "td[@id='tdProductGroupDisplayAltDescription']/div/font").extract()
        if not name:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("%s - ERROR! NO NAME!" % response.url)
            continue
        name = replace_tags(name[0])

        price = row.select(
            "td[@id='tdProductGroupDisplayPricing']//text() | "
            "td[@id='tdProductGroupDisplayAltPricing']//text()").extract()
        if not price:
            print("%s - ERROR! NO PRICE!" % response.url)
            continue
        # Only the text before the first comma is kept.
        # NOTE(review): a value such as "1,299.00" would be cut to "1" --
        # confirm the price format used by the site.
        price = price[0].split(',')[0]

        loader = ProductLoader(item=Product(), response=response)
        # The identifier is the product name with non-ASCII characters dropped.
        loader.add_value('identifier', unicode(name).encode('ascii', 'ignore'))
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        yield loader.load_item()
def process_item(self, item, spider):
    """Reduce ``item['body']`` from HTML markup to plain text.

    The ``<body>`` element is selected, ``style`` and ``script`` elements
    are removed together with their content, the remaining tags are replaced
    by a space, and escape characters are replaced by a space.  The cleaned
    text is written back to ``item['body']``.

    Args:
        item: mapping providing at least the ``body`` and ``title`` keys.
        spider: not used by this method.

    Returns:
        The same ``item`` object, with ``body`` overwritten.
    """
    # ``get()`` returns None when nothing matches (e.g. an empty document);
    # fall back to an empty string so the cleaners below do not raise.
    body_only = Selector(text=item['body']).css('body').get() or ''
    script_removed = remove_tags_with_content(
        body_only, which_ones=('style', 'script'))
    tags_replaced = replace_tags(script_removed, ' ')
    item['body'] = replace_escape_chars(tags_replaced, ' ')
    # Lazy %-formatting: the message is only built if the record is emitted.
    logging.info('Item cleaned up: %s', item['title'])
    return item
def parse(self, response):
    """Yield a ``ProductItem`` for every node matched by the article XPath.

    Name, description, price and URL are read from fixed relative XPaths
    under each node.  The description markup has its tags replaced by a
    space; when the description node is missing, an empty string is stored.
    Every item is also stamped with ``self.task_id``.
    """
    article_nodes = scrapy.Selector(response).xpath(
        '/html/body/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[3]'
        '/div/div/div/div[2]/div[1]/div/section[2]/section/article/*')

    for node in article_nodes:
        title = node.xpath('./div/div[2]/div[1]/h2/a/text()').get()
        raw_description = node.xpath('./div/div[2]/div[1]/div[2]/dl').get()
        if raw_description is None:
            summary = ""
        else:
            summary = replace_tags(raw_description, " ")
        amount = node.xpath(
            './div/div[2]/div[2]/div[1]/div/span/span/text()').get()
        link = node.xpath('./div/div[2]/div[1]/h2/a/@href').get()

        product = ProductItem()
        product['name'] = title
        product['price'] = amount
        product['description'] = summary
        product['url'] = link
        product['task_id'] = self.task_id
        yield product
def parse(self, response):
    """Follow category links and yield one product per listing row.

    A ``Request`` with this same method as callback is yielded for every
    link found inside the ``dvWrapControl732`` block.  Each ``ProductGroup``
    table row (regular or alternating) is then loaded into a ``Product``.
    Rows without a name or a price are reported with ``print`` and skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select(
        "//div[@id='dvWrapControl732']//a/@href").extract()
    for link in categories:
        yield Request(urljoin_rfc(base_url, link), callback=self.parse)

    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the
    # XPath expression.
    rows = hxs.select(
        "//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] | "
        "//table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']"
    )
    for row in rows:
        name = row.select(
            "td[@id='tdProductGroupDisplayDescription']/div/font | "
            "td[@id='tdProductGroupDisplayAltDescription']/div/font"
        ).extract()
        if not name:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("%s - ERROR! NO NAME!" % response.url)
            continue
        name = replace_tags(name[0])

        price = row.select(
            "td[@id='tdProductGroupDisplayPricing']//text() | "
            "td[@id='tdProductGroupDisplayAltPricing']//text()"
        ).extract()
        if not price:
            print("%s - ERROR! NO PRICE!" % response.url)
            continue
        # Only the text before the first comma is kept.
        # NOTE(review): a value such as "1,299.00" would be cut to "1" --
        # confirm the price format used by the site.
        price = price[0].split(',')[0]

        loader = ProductLoader(item=Product(), response=response)
        # The identifier is the product name with non-ASCII characters dropped.
        loader.add_value('identifier', unicode(name).encode('ascii', 'ignore'))
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        yield loader.load_item()
def parse(self, response):
    """Follow category links and yield one product per listing row.

    Requests with this same method as callback are yielded for every link
    in the ``dvWrapControl732`` block and for every
    ``CategoryProductNameLink`` link.  Each ``ProductGroup`` table row is
    then loaded into a ``Product``.  Rows lacking a name, a price or an
    item-number identifier are reported with ``print`` and skipped.
    ``sku``, ``image_url`` and ``category`` are only added when the page
    provides them, and ``stock`` is set to 0 only when the availability text
    does not contain "IN STOCK".
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select(
        "//div[@id='dvWrapControl732']//a/@href").extract()
    for link in categories:
        yield Request(urljoin_rfc(base_url, link), callback=self.parse)

    sub_categories = hxs.select(
        '//span[@class="CategoryProductNameLink"]/a/@href').extract()
    for link in sub_categories:
        yield Request(urljoin_rfc(base_url, link), callback=self.parse)

    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the
    # XPath expression.
    rows = hxs.select(
        "//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] | "
        "//table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']"
    )

    # These lookups run against the whole page, not the row, so they are
    # evaluated once instead of once per row.
    image_url = hxs.select(
        '//div[@class="ProductDetailsPhoto"]/a/img/@src').extract()
    category_trail = hxs.select(
        '//span[@id="lblCategoryTrail"]/a/text()').extract()

    for row in rows:
        name = row.select(
            "td[@id='tdProductGroupDisplayDescription']/div/font | "
            "td[@id='tdProductGroupDisplayAltDescription']/div/font"
        ).extract()
        if not name:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("%s - ERROR! NO NAME!" % response.url)
            continue
        name = replace_tags(name[0])

        price = row.select(
            "td[@id='tdProductGroupDisplayPricing']//text() | "
            "td[@id='tdProductGroupDisplayAltPricing']//text()"
        ).extract()
        if not price:
            print("%s - ERROR! NO PRICE!" % response.url)
            continue
        # Only the text before the first comma is kept.
        # NOTE(review): a value such as "1,299.00" would be cut to "1" --
        # confirm the price format used by the site.
        price = price[0].split(',')[0]

        # A missing identifier is handled like a missing name or price
        # instead of raising IndexError and aborting the rest of the page.
        identifier = row.select(
            'td[contains(@id, "ItemNumber")]/input/@value').extract()
        if not identifier:
            print("%s - ERROR! NO IDENTIFIER!" % response.url)
            continue
        sku = row.select(
            'td[contains(@id, "ItemNumber")]/span/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier[0])
        if sku:
            loader.add_value('sku', sku[0])
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_xpath(
            'brand',
            '//div[@class="ProductDetailsManufacturerName"]/a/img/@alt')
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        if category_trail:
            # The last breadcrumb entry is used as the category.
            loader.add_value('category', category_trail[-1])
        loader.add_value('price', price)

        availability = ''.join(row.select(
            'td[contains(@id, "Availability")]/span/text()').extract())
        if 'IN STOCK' not in availability.upper():
            loader.add_value('stock', 0)
        yield loader.load_item()