Ejemplo n.º 1
0
    def parse_item(self, response):
        """Build news items (topic type "101") from an article page.

        Every ``div`` whose ``id`` contains ``container`` is scanned for a
        headline, body paragraphs and images placed inside table cells.  A
        container produces an item only when it has both a headline and at
        least one image URL.

        Args:
            response: Page response; ``response.meta["origin_url"]`` is copied
                into every item.

        Yields:
            YdNewsCrawlerItem: A freshly created item per qualifying
            container.
        """
        # Fields that are identical for every item built from this response.
        common = {
            "create_at": get_time_now(),
            "update_at": get_time_now(),
            "topic_id": get_id("100"),
            "topic_type": "101",
            "origin_url": response.meta["origin_url"],
        }
        # Last path segment of an image URL, e.g. "photo.jpg".  The extension
        # must be a group: "[jpg|png|gif]" is a character class that matches a
        # single character, and an unescaped "." matches anything.
        image_name_re = r"[^/]*\.(?:jpg|png|gif)$"

        for sel in response.xpath("//div[contains(@id, 'container')]"):
            # The leading "." keeps each query relative to the current
            # container; a bare "//" would search the whole document again.
            titles = sel.xpath(".//h1/text()").extract()
            images = sel.xpath(".//td//img/@src")
            image_urls = images.extract()
            # Skip containers without a headline (instead of failing on
            # ``[0]``) and containers without images.
            if not titles or not image_urls:
                continue

            # A new item per container, so an item that was already yielded
            # is never modified by a later iteration.
            item = YdNewsCrawlerItem()
            for field, value in common.items():
                item[field] = value
            item["topic_title"] = titles[0]
            item["content"] = sel.xpath(
                ".//div[contains(@id, 'contentText')]//div//p/text()"
            ).extract()
            item["image_urls"] = image_urls
            item["images"] = images.re(image_name_re)
            yield item
    def parse_item(self, response):
        """Build news items (topic type "102") from an article page.

        Every ``div`` whose ``class`` contains ``end-text`` is scanned for
        paragraph text and for images nested in paragraphs.  A block produces
        an item only when it has at least one image URL.

        Args:
            response: Page response; ``response.meta["topic_title"]`` and
                ``response.meta["origin_url"]`` are copied into every item.

        Yields:
            YdNewsCrawlerItem: A freshly created item per qualifying block.
        """
        # Fields that are identical for every item built from this response.
        common = {
            "create_at": get_time_now(),
            "update_at": get_time_now(),
            "topic_id": get_id("100"),
            "topic_type": "102",
            "topic_title": response.meta["topic_title"],
            "origin_url": response.meta["origin_url"],
        }
        # Last path segment of an image URL, e.g. "photo.jpg".  The extension
        # must be a group: "[jpg|png|gif]" is a character class that matches a
        # single character, and an unescaped "." matches anything.
        image_name_re = r"[^/]*\.(?:jpg|png|gif)$"

        # 'end-text' has to be a quoted string literal; unquoted, XPath reads
        # it as a child element name.  The loop must also iterate the <div>
        # elements themselves (not their text nodes) so that the relative
        # ".//p..." queries below have something to match.
        for sel in response.xpath("//div[contains(@class, 'end-text')]"):
            images = sel.xpath(".//p//img/@src")
            image_urls = images.extract()
            if not image_urls:
                continue

            # A new item per block, so an item that was already yielded is
            # never modified by a later iteration.
            item = YdNewsCrawlerItem()
            for field, value in common.items():
                item[field] = value
            item["content"] = sel.xpath(".//p/text()").extract()
            item["image_urls"] = image_urls
            item["images"] = images.re(image_name_re)
            yield item
    def parse_item(self, response):
        """Build news items (topic type '103') from an article page.

        Every ``div`` whose ``class`` contains ``end-text`` is scanned for
        paragraph text and for images nested in paragraphs.  A block produces
        an item only when it has at least one image URL.

        Args:
            response: Page response; ``response.meta['topic_title']`` and
                ``response.meta['origin_url']`` are copied into every item.

        Yields:
            YdNewsCrawlerItem: A freshly created item per qualifying block.
        """
        # Fields that are identical for every item built from this response.
        common = {
            'create_at': get_time_now(),
            'update_at': get_time_now(),
            'topic_id': get_id('100'),
            'topic_type': '103',
            'topic_title': response.meta['topic_title'],
            'origin_url': response.meta['origin_url'],
        }
        # Last path segment of an image URL, e.g. 'photo.jpg'.  The extension
        # must be a group: '[jpg|png|gif]' is a character class that matches a
        # single character, and an unescaped '.' matches anything.
        image_name_re = r'[^/]*\.(?:jpg|png|gif)$'

        for sel in response.xpath("//div[contains(@class, 'end-text')]"):
            images = sel.xpath(".//p//img/@src")
            image_urls = images.extract()
            if not image_urls:
                continue

            # A new item per block, so an item that was already yielded is
            # never modified by a later iteration.
            item = YdNewsCrawlerItem()
            for field, value in common.items():
                item[field] = value
            item['content'] = sel.xpath(".//p/text()").extract()
            item['image_urls'] = image_urls
            item['images'] = images.re(image_name_re)
            yield item