def parse(self, response): res=response.xpath('//div[@class="result"]/div[@class="box-result clearfix"]') if len(res)==0: self.end=True return for each in res: title=each.xpath('div[@class="r-info r-info2"]/h2/a//text()').extract() title=''.join(title) if title=='': title = each.xpath('h2/a//text()').extract() title = ''.join(title) # print(title) if title in self.flag: self.end=True return self.flag.add(title) author='新华网' excerpt=each.xpath('div[@class="r-info r-info2"]/p//text()').extract() excerpt=''.join(excerpt) if excerpt=='': excerpt = each.xpath('div[@class="r-info r-info2"]/p//text()').extract() excerpt = ''.join(excerpt) #print(excerpt) release_time=each.xpath('div[@class="r-info r-info2"]/h2//text()').extract() if len(release_time)==0: release_time = each.xpath('h2//text()').extract() temp=release_time[-1] release_time=temp[-20:-9] # print(release_time) url=each.xpath('div[@class="r-info r-info2"]/h2/a/@href').extract() url=''.join(url) if url=='': url = each.xpath('h2/a/@href').extract() url = ''.join(url) html = requests.get(url).content selector = lxml.html.document_fromstring(html) content = selector.xpath('//p//text()') content = ''.join(content).replace('\'', '') img_url = 'http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg' item = NewsspiderItem() item['title'] = title item['author'] = author item['release_time'] = release_time item['excerpt'] = excerpt item['content'] = content item['img_url'] = img_url yield item if not self.end: self.page = self.page + 1 # print(URL.format(day1=beginTime,day2=endTime,start=self.page*20)) yield Request(URL.format(q=keyword, stime=self.beginTime, etime=self.endTime, page=self.page), self.parse, dont_filter=True)
def parse_photonews(self, response):
    item = NewsspiderItem()
    item['title'] = response.xpath("//h1/text()").extract()
    item['date'] = response.xpath("//div[@class='post_time_source']/text()").re(
        r'[0-9]*-[0-9]*-[0-9]* [0-9]*:[0-9]*:[0-9]*')
    item['source'] = response.xpath("//a[@id='ne_article_source']/text()").extract()
    item['content'] = ''.join(response.xpath(
        "//div[@class='picinfo-text']/p[not(@class)]/span/text()").extract()).replace('\n', '')
    item['url'] = response.url
    yield item
def page_parser(self, response):
    data = self.parser.extract_news(response.text)
    if data:
        item = NewsspiderItem()
        item['keyword'] = self.keyword
        item['news_url'] = response.meta['url']
        item['news_time'] = data['news_pubtime']
        item['news_date'] = data['news_date']
        item['news_title'] = data['news_title']
        item['news_content'] = data['news_content']
        yield item
    return
def parse(self, response): res=response.xpath('//div[@id="news_list"]/table') if len(res)==0: self.end=True return for each in res: title=each.xpath('tr/td/ul/li[@class="news_title"]/a//text()').extract() title=''.join(title) if title in self.flag: self.end=True return self.flag.add(title) author='中新网' ans=each.xpath('tr/td/ul/li[@class="news_other"]/text()').extract_first().split() release_time=ans[1]+" "+ans[2] excerpt=each.xpath('tr/td/ul/li[@class="news_content"]//text()').extract() excerpt=''.join(excerpt).replace(' ','').replace('\r','').replace('\t','') excerpt=excerpt.lstrip().replace(' ','') #排除发现中新网的搜索结果显示界面和新闻显示界面编码不同 url=ans[0] html=requests.get(url).content.decode('GBK') selector = lxml.html.document_fromstring(html) content=selector.xpath('//p//text()') content=''.join(content).replace('\'','') content.encode('utf-8') img_url=each.xpath('tr/td/a/img[@class="rsimg"]/@src').extract() img_url=''.join(img_url) #设置默认图片 if img_url=="": img_url='http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg' else: img_url=img_url item=NewsspiderItem() item['title'] = title item['author'] = author item['release_time'] = release_time item['excerpt'] = excerpt item['content']=content item['img_url']=img_url yield item if not self.end: self.page=self.page+1 # print(URL.format(day1=beginTime,day2=endTime,start=self.page*20)) yield Request(URL.format(q=keyword,day1=self.beginTime,day2=self.endTime,start=self.page*20),self.parse,dont_filter=True)
def parse_news_list(self, response):
    # Crawl every article URL on the list page.
    # Strip the leading "data_callback(" wrapper and the trailing ")" to get plain JSON.
    json_array = "".join(response.text[14:-1].split())
    news_array = json.loads(json_array)
    category = response.meta['category']
    for row_data in news_array:
        news_item = NewsspiderItem()
        news_item["url"] = row_data["tlink"]
        yield scrapy.Request(news_item["url"],
                             meta={"news_item": news_item},
                             callback=self.parse_news_content)
def parse_item(self, response):
    item = NewsspiderItem()
    if 'special' not in response.url:  # skip topic ("special") list pages, keep only articles
        item['title'] = response.xpath("//h1/text()").extract()
        item['date'] = response.xpath(
            "//div[@class='post_time_source']/text()").re(
            r'[0-9]*-[0-9]*-[0-9]* [0-9]*:[0-9]*:[0-9]*')
        item['source'] = response.xpath(
            "//a[@id='ne_article_source']/text()").extract()
        # Alternative: xpath('string(.)') would also keep text nested inside child tags.
        item['content'] = ''.join(
            response.xpath("//div[@id='endText']/p[not(@class)]/text()").
            extract()).replace('\n', '')
        item['url'] = response.url
        yield item
def parse(self, response):
    # Strip the JSONP callback wrapper (9 leading characters and the trailing ")")
    # to get plain JSON.
    json_raw = response.text[9:-1]
    json_dic = json.loads(json_raw)
    for key in json_dic.keys():
        # Each channel carries a list of entries; look at the first 20 at most.
        for entry in json_dic[key][:20]:
            newsUrl = entry["url"]
            if re.match(r"http://3g.163.com", newsUrl):
                item = NewsspiderItem()
                item['digest'] = entry["digest"]              # str
                item['title'] = entry["title"]                # str
                item['time'] = entry["ptime"]                 # str
                item['commentCount'] = entry["commentCount"]  # int
                item['source'] = entry["source"]              # str
                yield Request(newsUrl,
                              callback=self.parse_content,
                              meta={'item': item})
def parse(self, response):
    # Parse the crawled content.
    sel = Selector(response)
    item = NewsspiderItem()
    # for data in response.body[15:-13].split('","'):
    #     tmp = data.split(',')
    #     item = NewsspiderItem()
    #     # news title
    #     item['news_Title'] = 9
    #     # news publish time
    #     item['news_PublishDate'] = tmp[1]
    #     # news link (used internally by the system)
    #     item['news_Url'] = tmp[2]
    # news source link
    item['news_FromUrl'] = str(response.url)
    # time the record is written to the database
    # item['news_CreateDate'] = ''
    return item
def parse(self, response):
    ret_data = json.loads(response.text)
    for i in ret_data['data']:
        item = NewsspiderItem()
        item['author'] = i['author']
        # Convert the epoch timestamp to a readable time string.
        item['fpTime'] = datetime.datetime.utcfromtimestamp(
            int(i['fpTime'])).strftime('%Y.%m.%d %H:%M:%S')
        item['title'] = i['title']
        item['tags'] = str({'tags': i['tags']})
        item['url'] = i['url_https']
        yield scrapy.Request(
            item['url'],
            callback=self.parse_content,
            meta={'item': item}
        )
    # Request the next page of the feed.
    self.page += 1
    next_url = 'https://cre.mix.sina.com.cn/api/v3/get?cre=tianyi&mod=pctech&offset={page}'.format(
        page=str(self.page))
    yield scrapy.Request(
        next_url,
        callback=self.parse
    )
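# The parse_content callback used above (and in the 3g.163.com spider) is not shown in
# this section. Below is a minimal sketch of the receiving end of the meta-passing
# pattern only; the 'content' field name and the '//p//text()' selector are assumptions
# for illustration, not the repository's actual callback.
def parse_content(self, response):
    item = response.meta['item']                  # item started in parse()
    item['content'] = ''.join(
        response.xpath('//p//text()').extract())  # join all paragraph text
    yield item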
def parse(self, response): res=response.xpath('//div[@class="mainM"]/div[@class="searchResults"]') # print(res) if len(res)==0: self.end=True return for each in res: title=each.xpath('p[@class="fz16 line24"]//text()').extract() title=''.join(title) # print(title) # if title in self.flag: # self.end=True # return self.flag.add(title) author='凤凰资讯' excerpt=each.xpath('p//text()').extract() release_time = excerpt[-1].replace('\r','').replace('\t','').replace('\n','').split() release_time = release_time[1] excerpt=excerpt[0:-1] excerpt=''.join(excerpt) # print(excerpt) # print(release_time) if release_time[0]!=2: release_time= time.strftime("%Y-%m-%d", time.localtime()) # 进行时间的判断 rtime=int(time.mktime(time.strptime(release_time, "%Y-%m-%d"))) url=each.xpath('p[@class="fz16 line24"]/a/@href').extract() url=''.join(url) # print(url) html = requests.get(url).content selector = lxml.html.document_fromstring(html) content = selector.xpath('//div[@id="main_content"]//text()') content = ''.join(content).replace('\'', '').replace('\r','').replace('\t','').replace('\n','') if content=='': content = selector.xpath('//div[@class="article"]/p//text()') content=''.join(content) if content=='': content=excerpt # print(content) img_url = 'http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg' item = NewsspiderItem() #如果在此时间范围内,存取这个item if rtime>=self.bt and rtime<=self.et: item['title'] = title item['author'] = author item['release_time'] = release_time item['excerpt'] = excerpt item['content'] = content item['img_url'] = img_url yield item # print(self.end) if self.page!=12: self.page=self.page+1 yield Request(URL.format(q=keyword,p=self.page),self.parse,dont_filter=True)