Example no. 1
    def parse_item(self, response):
        item = NewdongguanItem()

        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return item
Example no. 2
    def parse_item(self, response):
        item = NewdongguanItem()
        # Title
        # item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        item['title'] = response.xpath(
            '//div[@id = "news_main"]//div[@class = "news_content"]/h2[@class = "newsTitle"]/text()'
        ).extract()[0]
        # Number
        item['number'] = response.xpath(
            '//div[@id = "news_main"]//div[@class = "news_content"]//div[@class = "message"]/text()'
        ).extract()[0]

        # Content: extract() returns a list of all the matched paragraph texts
        contents = response.xpath(
            '//div[@id = "news_main"]//div[@class = "news_content"]//p/text()'
        ).extract()
        content = ""
        for cont in contents:
            content = content + cont
        item['content'] = content

        item['url'] = response.url

        # Hand the item off to the pipeline
        yield item
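
Example no. 2 builds the content string with an explicit concatenation loop. As a side note, the same result is normally produced in one step with str.join; the small, self-contained sketch below (the sample strings are made up) shows the equivalence:

# Self-contained sketch: "".join() replaces the manual concatenation loop
contents = ["first paragraph ", "second paragraph ", "third paragraph"]

# the loop used in Example no. 2
content = ""
for cont in contents:
    content = content + cont

# the one-line equivalent
assert content == "".join(contents)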
Example no. 3
    def parse_item(self, response):
        item = NewdongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number (taken from the title)
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: first try the matching rule for posts with images;
        # if it matches, extract() returns a list of all the text nodes
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If the list is empty, fall back to the rule for posts without images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url

        yield item
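
Every one of these callbacks fills a NewdongguanItem imported from the project's newdongguan.items module; the item class itself is not part of this listing. A minimal items.py sketch consistent with the fields assigned in the examples (title, number, content, url, plus num in Example no. 4 and time in Example no. 7) could look like this; only the field names are taken from the code above, the rest is standard Scrapy boilerplate:

# items.py -- illustrative guess, not the original project file
import scrapy


class NewdongguanItem(scrapy.Item):
    title = scrapy.Field()    # post title
    number = scrapy.Field()   # post number
    num = scrapy.Field()      # alternative field name used in Example no. 4
    content = scrapy.Field()  # post body text
    url = scrapy.Field()      # source URL
    time = scrapy.Field()     # publication time (Example no. 7 only)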
  







Example no. 4
    def parse_item(self, response):
        item = NewdongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number (taken from the title)
        item['num'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: first try the matching rule for posts with images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If the list is empty, fall back to the rule for posts without images
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = ''.join(content).strip()
        # Link
        item['url'] = response.url
        yield item
Example no. 5
    def parse_item(self, response):
        item = NewdongguanItem()

        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number (taken from the title)
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content
        item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
        # Link
        item['url'] = response.url

        yield item
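
Several examples end with a comment about handing the item off to the pipeline. The pipeline is not shown anywhere in this listing, so the sketch below is purely illustrative (class name and output file are made up); it follows the standard Scrapy pattern of a process_item hook that serializes each received item as one JSON line:

# pipelines.py -- illustrative only; the real project's pipeline is not shown
import json


class NewdongguanJsonPipeline(object):
    def open_spider(self, spider):
        # one output file per crawl; the file name is a placeholder
        self.file = open('dongguan_items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # dict(item) works for both scrapy.Item instances and plain dicts
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

A pipeline like this would also have to be enabled in the project's settings.py via ITEM_PIPELINES before Scrapy will call it.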
Example no. 6
    def parse_item(self, response):
        print(response.url)
        item = NewdongguanItem()
        item['title'] = response.xpath('//div[contains(@class,"pagecenter p3")]//strong/text()').extract()[0]
        # Number (taken from the title)
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: first try the matching rule for posts with images;
        # if it matches, extract() returns a list of all the text nodes
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If the list is empty, fall back to the rule for posts without images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url
        # Hand the item off to the pipeline
        yield item
Example no. 7
    def parse_item(self, response):
        # try:
        item = NewdongguanItem()

        # Title
        # item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        item['title'] = response.xpath(
            '//div[@class = "article_cent"]//h1/text()').extract()[0]
        # Time
        item['time'] = response.xpath(
            '//div[@class = "article_cent"]//span/text()').extract()[0]
        # time2= response.xpath('//div[@class="content"]//div[@class="zx_all"]/text()').extract()[0]
        # item['time'] = time1 =time2
        #  = str(time).replace("&nbsp","")
        # Content: first use the matching rule for posts with images;
        # extract() returns a list of all the matched text nodes
        contents = response.xpath(
            '//div[@class = "article_cent"]//p/text()').extract()
        # contentsp = response.xpath('//div[@class="content"]//div[@class="font14"]//p/text()').extract()
        # contentsdiv = response.xpath('//div[@class="content"]//div[@class="font14"]/text()').extract()
        # contentsstrong= response.xpath('//div[@class="content"]//div[@class="font14"]//strong/text()').extract()

        content = ""
        # for condiv in contentsdiv:
        #     content = content + condiv
        # for constp in contentsp:
        #     content = content + constp
        # for constrong in contentsstrong:
        #     content = content + constrong
        for cont in contents:
            print(cont)
            content = content + "\r\n" + cont

        # item['content']= str(content).replace(" ", "")
        item['content'] = content
        # item['content']= str(content).replace("&nbsp","")

        item['url'] = response.url
        # except Exception as e:
        #     print("parse_item failure")

        # Hand the item off to the pipeline
        yield item
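
None of the examples include the spider class that drives parse_item. A common way to wire up this kind of per-post callback is a CrawlSpider whose Rule routes every matched post URL to parse_item; the skeleton below is only an assumption (spider name, domain, start URL, and link patterns are placeholders, and the stub callback yields a plain dict instead of the project's NewdongguanItem):

# Illustrative CrawlSpider skeleton -- not the original project's spider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PostsSpider(CrawlSpider):
    name = 'posts'                                    # placeholder
    allowed_domains = ['example.com']                 # placeholder
    start_urls = ['http://example.com/posts?page=1']  # placeholder

    rules = (
        # follow pagination links without parsing them
        Rule(LinkExtractor(allow=r'page=\d+')),
        # send each post page to parse_item, as in the examples above
        Rule(LinkExtractor(allow=r'/post/\d+\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        # in the real project this would build a NewdongguanItem as shown above
        yield {
            'title': response.xpath('//strong/text()').get(),
            'url': response.url,
        }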