Beispiel #1
0
    def test_replace_tags(self):
        """Check replace_tags: result type, default and custom replacement."""
        # Make sure it always returns unicode.  assertIsInstance is used
        # instead of a bare ``assert`` because bare asserts are compiled away
        # under ``python -O`` and give no detail about the offending value.
        self.assertIsInstance(replace_tags('no entities'), unicode)

        # Default replacement: the tags vanish, the enclosed text stays.
        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')

        # Explicit replacement token: each tag is swapped for one space.
        self.assertEqual(replace_tags('This text is very im<b>port</b>ant', ' '),
                         u'This text is very im port ant')

        # Multiline tags: a tag containing a CRLF is still removed as a whole.
        self.assertEqual(replace_tags('Click <a class="one"\r\n href="url">here</a>'),
                         u'Click here')
Beispiel #2
0
    def test_replace_tags(self):
        """Verify that replace_tags strips markup and returns unicode text."""
        # The return value must always be unicode.  Use the TestCase helper
        # rather than a bare ``assert``: the latter is removed when Python
        # runs with -O, which would silently disable this check.
        self.assertIsInstance(replace_tags('no entities'), unicode)

        # With no replacement token given, tags are simply dropped.
        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')

        # With ' ' as the token, every opening/closing tag becomes a space.
        self.assertEqual(
            replace_tags('This text is very im<b>port</b>ant', ' '),
            u'This text is very im port ant')

        # Multiline tags: attributes split across a CRLF do not break removal.
        self.assertEqual(
            replace_tags('Click <a class="one"\r\n href="url">here</a>'),
            u'Click here')
Beispiel #3
0
    def parse(self, response):
        """Follow category links and yield one product per listing row.

        Each link found under the category container is re-queued with this
        same callback.  Each product-group table row (regular or alternating)
        is loaded into a ``Product`` via ``ProductLoader``.  Rows without a
        name or without a price are reported with ``print`` and skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # The page URL is the same for every row, so read it once up front.
        page_url = response.url

        categories = hxs.select("//div[@id='dvWrapControl732']//a/@href").extract()
        for link in categories:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        items = hxs.select("//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] |\
                            //table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']")
        for item in items:
            name = item.select("td[@id='tdProductGroupDisplayDescription']/div/font | \
                                td[@id='tdProductGroupDisplayAltDescription']/div/font").extract()
            if not name:
                # Parenthesised single-argument print is valid syntax (and
                # prints the same text) on both Python 2 and Python 3.
                print("%s - ERROR! NO NAME!" % page_url)
                continue
            # The <font> node is extracted with its markup; strip the tags.
            name = replace_tags(name[0])
            price = item.select("td[@id='tdProductGroupDisplayPricing']//text() | \
                                 td[@id='tdProductGroupDisplayAltPricing']//text()").extract()
            if not price:
                print("%s - ERROR! NO PRICE!" % page_url)
                continue
            # Keep only what precedes the first comma of the first text node.
            price = price[0].split(',')[0]
            loader = ProductLoader(item=Product(), response=response)
            # The ASCII-only form of the name doubles as the identifier.
            loader.add_value('identifier', unicode(name).encode('ascii', 'ignore'))
            loader.add_value('name', name)
            loader.add_value('url', page_url)
            loader.add_value('price', price)
            yield loader.load_item()
Beispiel #4
0
    def process_item(self, item, spider):
        """Reduce ``item['body']`` from HTML markup to plain text.

        The body is narrowed to its ``<body>`` element, ``style`` and
        ``script`` elements are removed together with their content, every
        remaining tag is replaced by a space, and the result is passed through
        ``replace_escape_chars`` with a space as the replacement.

        Args:
            item: mapping-like item; reads ``item['body']`` and
                ``item['title']``, overwrites ``item['body']``.
            spider: unused here.

        Returns:
            The same ``item`` object, with ``body`` cleaned.
        """
        # ``get()`` yields None when no <body> node matches (for example when
        # the body string is empty); fall back to '' so the cleaners below
        # always receive text instead of None.
        body_only = Selector(text=item['body']).css('body').get(default='')
        script_removed = remove_tags_with_content(body_only, which_ones=('style', 'script'))
        tags_replaced = replace_tags(script_removed, ' ')
        item['body'] = replace_escape_chars(tags_replaced, ' ')

        # Lazy %-style argument: same message text, but the formatting is left
        # to the logging module instead of being done eagerly by an f-string.
        logging.info('Item cleaned up: %s', item["title"])
        return item
Beispiel #5
0
    def parse(self, response):
        """Yield a ``ProductItem`` for each child node of the listing article.

        For every node the name, description markup, price and link are read
        with relative XPaths; the description has its tags replaced by spaces
        and is left as an empty string when the description node is absent.
        """
        root = scrapy.Selector(response)

        # Absolute path to the children of the <article> that holds the list.
        entries = root.xpath(
            '/html/body/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[3]'
            '/div/div/div/div[2]/div[1]/div/section[2]/section/article/*')

        for entry in entries:
            title = entry.xpath('./div/div[2]/div[1]/h2/a/text()').get()
            raw_description = entry.xpath('./div/div[2]/div[1]/div[2]/dl').get()
            # Only run the tag replacement when a description node was found.
            cleaned_description = (
                "" if raw_description is None
                else replace_tags(raw_description, " ")
            )
            amount = entry.xpath('./div/div[2]/div[2]/div[1]/div/span/span/text()').get()
            link = entry.xpath('./div/div[2]/div[1]/h2/a/@href').get()

            product = ProductItem()
            product['name'] = title
            product['price'] = amount
            product['description'] = cleaned_description
            product['url'] = link
            product['task_id'] = self.task_id
            yield product
Beispiel #6
0
    def parse(self, response):
        """Re-queue category pages and emit a product for every listing row.

        Category links are scheduled with this method as their callback.
        Rows of the product-group table (regular and alternating) are loaded
        into ``Product`` items; a row with no name or no price is reported
        with ``print`` and skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Identical for every row on the page, so it is read a single time.
        page_url = response.url

        categories = hxs.select(
            "//div[@id='dvWrapControl732']//a/@href").extract()
        for link in categories:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        items = hxs.select(
            "//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] |\
                            //table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']"
        )
        for item in items:
            name = item.select(
                "td[@id='tdProductGroupDisplayDescription']/div/font | \
                                td[@id='tdProductGroupDisplayAltDescription']/div/font"
            ).extract()
            if not name:
                # print(...) with one parenthesised argument behaves the same
                # as the old print statement and also parses on Python 3.
                print("%s - ERROR! NO NAME!" % page_url)
                continue
            # Extracted value still contains the <font> markup; strip it.
            name = replace_tags(name[0])
            price = item.select(
                "td[@id='tdProductGroupDisplayPricing']//text() | \
                                 td[@id='tdProductGroupDisplayAltPricing']//text()"
            ).extract()
            if not price:
                print("%s - ERROR! NO PRICE!" % page_url)
                continue
            # Use the first text node, cut at its first comma.
            price = price[0].split(',')[0]
            loader = ProductLoader(item=Product(), response=response)
            # Identifier is the product name with non-ASCII characters dropped.
            loader.add_value('identifier',
                             unicode(name).encode('ascii', 'ignore'))
            loader.add_value('name', name)
            loader.add_value('url', page_url)
            loader.add_value('price', price)
            yield loader.load_item()
    def parse(self, response):
        """Follow category/sub-category links and yield products from rows.

        Category and sub-category links are re-queued with this same callback.
        Every product-group table row (regular or alternating) becomes a
        ``Product`` carrying identifier, sku, name, url, brand, image_url,
        category, price and - only when the availability text lacks
        "IN STOCK" - a stock value of 0.  Rows without a name, price or
        identifier are reported with ``print`` and skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Same for every row on the page, so read it once.
        page_url = response.url

        categories = hxs.select(
            "//div[@id='dvWrapControl732']//a/@href").extract()
        for link in categories:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        sub_categories = hxs.select(
            '//span[@class="CategoryProductNameLink"]/a/@href').extract()
        for link in sub_categories:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        items = hxs.select(
            "//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] |\
                            //table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']"
        )

        # These two lookups query the whole page, not the current row, so
        # their results cannot differ between rows: compute them once instead
        # of once per row.
        image_url = hxs.select(
            '//div[@class="ProductDetailsPhoto"]/a/img/@src').extract()
        category_trail = hxs.select(
            '//span[@id="lblCategoryTrail"]/a/text()').extract()

        for item in items:
            name = item.select(
                "td[@id='tdProductGroupDisplayDescription']/div/font | \
                                td[@id='tdProductGroupDisplayAltDescription']/div/font"
            ).extract()
            if not name:
                # Parenthesised print parses on both Python 2 and Python 3.
                print("%s - ERROR! NO NAME!" % page_url)
                continue
            name = replace_tags(name[0])
            price = item.select(
                "td[@id='tdProductGroupDisplayPricing']//text() | \
                                 td[@id='tdProductGroupDisplayAltPricing']//text()"
            ).extract()
            if not price:
                print("%s - ERROR! NO PRICE!" % page_url)
                continue
            # Use the first text node, cut at its first comma.
            price = price[0].split(',')[0]

            # Indexing an empty extract() result raises IndexError, which
            # would abort the whole callback and lose every remaining row.
            # Treat a missing identifier like a missing name/price instead.
            identifier = item.select(
                'td[contains(@id, "ItemNumber")]/input/@value').extract()
            if not identifier:
                print("%s - ERROR! NO IDENTIFIER!" % page_url)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier[0])
            sku = item.select(
                'td[contains(@id, "ItemNumber")]/span/text()').extract()
            if sku:
                loader.add_value('sku', sku[0])
            loader.add_value('name', name)
            loader.add_value('url', page_url)
            loader.add_xpath(
                'brand',
                '//div[@class="ProductDetailsManufacturerName"]/a/img/@alt')

            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))

            # The last breadcrumb entry is used as the category; skip the
            # field when the page has no breadcrumb trail at all.
            if category_trail:
                loader.add_value('category', category_trail[-1])
            loader.add_value('price', price)

            availability = ''.join(
                item.select(
                    'td[contains(@id, "Availability")]/span/text()'
                ).extract())
            # Stock is only set (to 0) when the row is not marked in stock.
            if 'IN STOCK' not in availability.upper():
                loader.add_value('stock', 0)
            yield loader.load_item()