Esempio n. 1
0
    def parse_itme(self, response):
        """Split an article page into one item per bold-headed section.

        Every ``<p>`` under ``div.content`` that has a ``<strong>`` child is
        treated as the start of a section.  A section runs from its heading
        paragraph up to (but not including) the next heading paragraph, or to
        the last paragraph for the final section.

        For each section an item is yielded with:

        * ``url``      - the response URL,
        * ``category`` - the first ``<h1>`` text node on the page,
        * ``title``    - the first direct text node of the heading's
          ``<strong>``,
        * ``content``  - the stripped *direct* text nodes of the section's
          paragraphs, concatenated (text nested in child elements such as the
          ``<strong>`` heading itself is not included).

        Nothing is yielded when no paragraph contains a ``<strong>``.

        Raises:
            IndexError: if a section exists but the page has no ``<h1>`` text,
                or a heading's ``<strong>`` has no direct text node.
        """
        # NOTE(review): the method name looks like a typo of ``parse_item``;
        # it is kept as-is because callers may reference it by name.
        paragraphs = response.xpath('//div[@class="content"]//p')

        # Positions of the paragraphs that carry a <strong> heading.
        heading_positions = [
            pos
            for pos, para in enumerate(paragraphs)
            if para.xpath("./strong").extract()
        ]
        if not heading_positions:
            return

        # The page-level category is identical for every section, so look it
        # up once instead of once per section.
        category = response.xpath("//h1/text()").extract()[0]

        # Pair each heading with the following one; ``None`` as the end bound
        # makes the last slice run to the end of the paragraph list.  This
        # replaces a broad ``except Exception`` that doubled as the
        # "last section" branch and could mask real extraction errors.
        section_ends = heading_positions[1:] + [None]
        for start, end in zip(heading_positions, section_ends):
            title = paragraphs[start].xpath("./strong/text()").extract()[0]
            chunks = [
                "".join(para.xpath("./text()").extract())
                for para in paragraphs[start:end]
            ]

            item = TangspiderframeItem()
            item['url'] = response.url
            item['category'] = category
            item['content'] = "".join(chunk.strip() for chunk in chunks)
            item['title'] = title
            yield item
 def parse(self, response):
     """Build a single item from an article page.

     The item carries:

     * ``url``      - the response URL,
     * ``category`` - text of the second link inside ``div#place``,
     * ``title``    - text of the link in ``div#info/dl/h1``,
     * ``content``  - every text node found under ``<p>`` elements inside
       ``dl#zi``, each stripped and concatenated without a separator.

     Raises:
         IndexError: if the category or title node is missing from the page.
     """
     category = response.xpath('//div[@id="place"]/a[2]/text()').extract()[0]
     title = response.xpath('//div[@id="info"]/dl/h1/a/text()').extract()[0]
     text_nodes = response.xpath('//dl[@id="zi"]//p//text()').extract()

     item = TangspiderframeItem()
     item['url'] = response.url
     item['category'] = category
     # A distinct loop name keeps ``item`` bound to the item object; reusing
     # ``item`` inside the comprehension rebinds it on Python 2.
     item['content'] = "".join(node.strip() for node in text_nodes)
     item['title'] = title
     yield item
 def parse(self, response):
     """Build a single item from an article page.

     The item carries:

     * ``url``      - the response URL,
     * ``category`` - text of the second link inside ``div.position``,
     * ``title``    - first text node of the ``<h1>`` directly under
       ``div.article``,
     * ``content``  - every text node found under the ``<p>`` children of
       ``div.article``, each stripped and concatenated without a separator.

     Raises:
         IndexError: if the category or title node is missing from the page.
     """
     category = response.xpath(
         '//div[@class="position"]/a[2]/text()').extract()[0]
     title = response.xpath(
         '//div[@class="article"]/h1/text()').extract()[0]
     text_nodes = response.xpath(
         '//div[@class="article"]/p//text()').extract()

     item = TangspiderframeItem()
     item['url'] = response.url
     item['category'] = category
     # A distinct loop name keeps ``item`` bound to the item object; reusing
     # ``item`` inside the comprehension rebinds it on Python 2.
     item['content'] = "".join(node.strip() for node in text_nodes)
     item['title'] = title
     yield item
    def parse(self, response):
        """Build a single item from an article page with a variable layout.

        Paragraphs are collected from any ``<div>`` whose ``class`` attribute
        contains one of ``article``, ``zhengwen``, ``c_body`` or ``cnt_bd``.

        The item carries:

        * ``url``      - the response URL,
        * ``category`` - text of the second breadcrumb link inside
          ``div.dingtou/div.da-bre``, with ``" > "`` removed and surrounding
          whitespace stripped (empty string when absent),
        * ``title``    - all ``<h1>`` text nodes joined and stripped (empty
          string when absent),
        * ``content``  - every text node under the matched paragraphs, except
          text inside ``<script>`` elements, each stripped and concatenated.

        The text is gathered with a single XPath query so that each text node
        appears exactly once and in document order.  Walking every descendant
        element and taking ``.//text()`` from each repeats the text of nested
        inline elements, moves inline text after the paragraph's direct text,
        and lets script text through when the script sits inside another
        inline element.
        """
        category = response.xpath('//div[@class="dingtou"]/div[@class="da-bre"]/a[2]/text()').extract()
        title = response.xpath("//h1/text()").extract()  # article title

        container_classes = ("article", "zhengwen", "c_body", "cnt_bd")
        class_predicate = " or ".join(
            'contains(@class, "%s")' % name for name in container_classes
        )
        # One node-set query: XPath node-sets hold each node once, so a
        # paragraph that sits inside several matching containers is not
        # counted twice.
        text_nodes = response.xpath(
            '//div[%s]//p//text()[not(ancestor::script)]' % class_predicate
        ).extract()

        item = TangspiderframeItem()
        item['url'] = response.url
        item['category'] = "".join(category).replace(" > ", '').strip()
        item['content'] = "".join(node.strip() for node in text_nodes)
        item['title'] = "".join(title).strip()
        yield item