def parse_item(self, response):
    item = NewdongguanItem()
    #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
    #i['name'] = response.xpath('//div[@id="name"]').extract()
    #i['description'] = response.xpath('//div[@id="description"]').extract()
    return item
def parse_item(self, response):
    item = NewdongguanItem()
    # Title
    # item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
    item['title'] = response.xpath(
        '//div[@id = "news_main"]//div[@class = "news_content"]/h2[@class = "newsTitle"]/text()'
    ).extract()[0]
    # Number
    item['number'] = response.xpath(
        '//div[@id = "news_main"]//div[@class = "news_content"]//div[@class = "message"]/text()'
    ).extract()[0]
    # Content: extract every paragraph text node in the post body and concatenate them
    contents = response.xpath(
        '//div[@id = "news_main"]//div[@class = "news_content"]//p/text()'
    ).extract()
    content = ""
    for cont in contents:
        content = content + cont
    item['content'] = content
    item['url'] = response.url
    # hand the item off to the pipeline
    yield item
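The closing comment above ("hand the item off to the pipeline") refers to an item pipeline registered in settings.py; none is included in these snippets. Below is a minimal sketch of one that appends each yielded item to a JSON-lines file. The class name NewdongguanPipeline and the output path are assumptions for illustration, not the original project's pipeline.

# newdongguan/pipelines.py -- a minimal sketch only; the class name and the
# output file below are assumptions, not the original project's pipeline.
import json


class NewdongguanPipeline(object):
    def open_spider(self, spider):
        # open the output file once per crawl
        self.f = open("dongguan.json", "a", encoding="utf-8")

    def process_item(self, item, spider):
        # serialize each item as one JSON line and pass it on unchanged
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()

It would be enabled with ITEM_PIPELINES = {'newdongguan.pipelines.NewdongguanPipeline': 300} in the project's settings.py.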
def parse_item(self, response):
    item = NewdongguanItem()
    # Title
    item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
    # Number: the last colon-separated field at the end of the title
    item['number'] = item['title'].split(' ')[-1].split(":")[-1]
    # Content: first try the rule for posts that contain images; if it matches,
    # it returns a list of all the text nodes
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # If the list is empty, fall back to the rule for posts without images
    if len(content) == 0:
        content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = "".join(content).strip()
    # Link
    item['url'] = response.url
    # hand the item off to the pipeline
    yield item
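Every variant in this section fills a NewdongguanItem, but the item class itself is never shown. A minimal newdongguan/items.py that would satisfy these callbacks is sketched below; the field names are collected from the snippets (title, number, num, content, url, time), while the plain Field() declarations are an assumption about how the original project defines them.

# newdongguan/items.py -- a minimal sketch; field names are taken from the
# parse_item variants in this section, everything else is assumed.
import scrapy


class NewdongguanItem(scrapy.Item):
    title = scrapy.Field()    # post or article title
    number = scrapy.Field()   # post number parsed from the title or the page
    num = scrapy.Field()      # alternative field name used by one variant
    content = scrapy.Field()  # concatenated body text
    url = scrapy.Field()      # detail-page URL (response.url)
    time = scrapy.Field()     # publication time (used by the article_cent variant)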
def parse_item(self, response):
    item = NewdongguanItem()
    item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
    item['num'] = item['title'].split(' ')[-1].split(":")[-1]
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    if len(content) == 0:
        content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = ''.join(content).strip()
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = NewdongguanItem()
    #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
    #i['name'] = response.xpath('//div[@id="name"]').extract()
    #i['description'] = response.xpath('//div[@id="description"]').extract()
    item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
    # Number
    item['number'] = item['title'].split(' ')[-1].split(":")[-1]
    # Content
    item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
    # Link
    item['url'] = response.url
    yield item
def parse_item(self, response):
    print(response.url)
    item = NewdongguanItem()
    item['title'] = response.xpath('//div[contains(@class,"pagecenter p3")]//strong/text()').extract()[0]
    # Number
    item['number'] = item['title'].split(' ')[-1].split(":")[-1]
    # Content: first try the rule for posts that contain images; if it matches,
    # it returns a list of all the text nodes
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # If the list is empty, fall back to the rule for posts without images
    if len(content) == 0:
        content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = "".join(content).strip()
    # Link
    item['url'] = response.url
    # hand the item off to the pipeline
    yield item
def parse_item(self, response):
    item = NewdongguanItem()
    # Title
    # item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
    item['title'] = response.xpath(
        '//div[@class = "article_cent"]//h1/text()').extract()[0]
    # Publication time
    item['time'] = response.xpath(
        '//div[@class = "article_cent"]//span/text()').extract()[0]
    # Content: collect every paragraph text node in the article body
    contents = response.xpath(
        '//div[@class = "article_cent"]//p/text()').extract()
    content = ""
    for cont in contents:
        print(cont)
        content = content + "\r\n" + cont
    item['content'] = content
    item['url'] = response.url
    # hand the item off to the pipeline
    yield item
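A parse_item method only runs when a spider routes detail-page responses to it. The sketch below shows one minimal way to wire up the DongdongSpider and NewdongguanItem names used above with a plain scrapy.Spider; the spider name, start URL, and link selector are placeholder assumptions, since the original spider's crawling rules are not part of these snippets.

# spider scaffold -- a minimal sketch only. DongdongSpider and NewdongguanItem
# follow the snippets above; the name, start URL, and link XPath are placeholders.
import scrapy
from newdongguan.items import NewdongguanItem


class DongdongSpider(scrapy.Spider):
    name = "dongdong"                                  # placeholder spider name
    start_urls = ["http://example.com/posts/page1"]    # placeholder listing page

    def parse(self, response):
        # follow every detail link on the listing page to parse_item
        for href in response.xpath('//a/@href').extract():  # placeholder selector
            yield response.follow(href, callback=self.parse_item)

    def parse_item(self, response):
        # any of the parse_item variants above can be dropped in here
        ...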