def parse_news(self, response):
    item = response.meta['item']
    pageindex = response.meta['pageindex']
    soup = BeautifulSoup(response.body, "lxml")
    # e.g. "2016年07月13日 21:50:42"
    news_date = soup.find("span", class_="item time").string if soup.find("span", class_="item time") else None
    if news_date:
        struct_date = datetime.datetime.strptime(news_date.encode('utf-8'), "%Y年%m月%d日 %H:%M:%S")
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")  # normalize the datetime
    title = soup.find("h1", class_="article-title").string if soup.find("h1", class_="article-title") else None
    author = soup.find("span", class_="item author").a.string if soup.find("span", class_="item author") else None
    comment_num = soup.find("span", class_="wscn-cm-counter").string if soup.find("span", class_="wscn-cm-counter") else None
    content = soup.find("div", class_="article-content").text if soup.find("div", class_="article-content") else None
    news_no = response.url.rsplit("/", 1)[-1]
    item["title"] = title
    item["author"] = author
    item["comment_num"] = comment_num
    item["content"] = content
    item["news_date"] = news_date
    item["crawl_date"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    item["news_no"] = news_no
    item = judge_news_crawl(item)  # drop the item if it falls outside the crawl window
    if item:
        yield item
    else:
        self.flag = pageindex
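# Note: the repeated `x.find(...).text if x.find(...) else None` pattern used
# throughout these spiders runs every selector twice. A small helper would halve
# the lookups -- this is a sketch, not part of the original codebase, and the
# name `find_text` is hypothetical:
def find_text(parent, name, **attrs):
    """Return the stripped text of the first matching tag, or None if absent."""
    tag = parent.find(name, **attrs)
    return tag.get_text(strip=True) if tag else None
# e.g. title = find_text(soup, "h1", class_="article-title")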
def parse(self, response):
    origin_url = response.url
    # e.g. http://money.163.com/special/002526O5/transport_02.html
    search_result = re.search(r"_(\d+)\.", origin_url)  # extract the page number
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        news_date = news.find("span", class_="time").text if news.find("span", class_="time") else None
        title = news.find("h2").text if news.find("h2") else None
        news_url = news.find("h2").a.get("href", None) if news.find("h2") else None
        abstract = news.find("p").contents[0] if news.find("p") else None
        item = NewsItem(title=title, news_url=news_url, abstract=abstract, news_date=news_date)
        item = judge_news_crawl(item)  # drop the item if it falls outside the crawl window
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news, meta={"item": item})
            yield request
        else:
            self.flag = int(pageindex)
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
def parse(self, response):
    origin_url = response.url
    pageindex = origin_url.rsplit("/")[-1]
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find_all("li", class_="mt24 pr")
    for news in news_list:
        news_date = news.find("a", href="javascript:;").text if news.find("a", href="javascript:;") else None
        if news_date:
            # e.g. http://www.nbd.com.cn/articles/2016-07-25/1025147.html
            news_url = news.find("p", class_="h1").a.get("href", None) if news.find("p", class_="h1") else None
            news_no = news_url.rsplit("/")[-1].split(".")[0]
            # the list page truncates the title and abstract; the full text is fetched on the article page
            title = news.find("p", class_="h1").text.strip() if news.find("p", class_="h1") else None
            referer_web = news.find("div", class_="messge").contents[-2].a.text if news.find("div", class_="messge") else None
            referer_web = referer_web if referer_web != '' else None
            comment_num = news.find("span", class_="fr").a.text if news.find("span", class_="fr") else None
            item = NewsItem(news_date=news_date,
                            title=title,
                            referer_web=referer_web,
                            comment_num=comment_num,
                            news_no=news_no,
                            news_url=news_url)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date")
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    topic_url = response.meta.get("topic_url", None)
    origin_url = response.url
    news_no_res = re.search(r"news/(\d+)\.html", origin_url)
    news_no = news_no_res.group(1) if news_no_res else None
    soup = BeautifulSoup(response.body, "lxml")
    ff3 = soup.find("h2", class_="f-ff3 f-fwn")
    referer_web = ff3.i.text if ff3 else None
    # publish date, e.g. "2016-07-25 10:30"
    origin_date = ff3.contents[-1].text if ff3 else None
    news_date = None
    if origin_date:
        struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    content = soup.find("div", class_="m-text").text.strip() if soup.find("div", class_="m-text") else None
    author = soup.find("h3", class_="f-ff3 f-fwn").span.text if soup.find("h3", class_="f-ff3 f-fwn") else None
    item["referer_web"] = referer_web
    item["crawl_date"] = NOW
    item["author"] = author
    item["content"] = content
    item["news_no"] = news_no
    item["news_date"] = news_date
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag[topic_url] = pageindex
def parse_topic(self, response):
    origin_url = response.url
    if "_" not in origin_url:
        pageindex = 0
        topic_url = origin_url.rsplit(".", 1)[0]
    else:
        temp = origin_url.rsplit("_", 1)
        pageindex = temp[-1].split(".", 1)[0]
        topic_url = temp[0]
    soup = BeautifulSoup(response.body, "lxml")
    catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
    news_list = soup.find("div", class_="lie_main_m").find_all("li")
    for news in news_list:
        news_date = news.find("span").text.strip() + " 00:00:00"
        title = news.find("a").text.strip()[10:]  # drop the leading 10 characters of the anchor text
        news_url = topic_url.rsplit("/", 1)[0] + news.find("a").get("href")[1:]
        news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
        item = NewsItem(news_date=news_date,
                        news_url=news_url,
                        title=title,
                        news_no=news_no,
                        catalogue=catalogue)
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
        else:
            self.flag[topic_url] = pageindex
    if not self.flag[topic_url]:
        next_url = topic_url + "_" + str(int(pageindex) + 1) + ".shtml"
        yield scrapy.Request(next_url, callback=self.parse_topic)
def parse(self, response):
    origin_url = response.url
    if 'index' not in origin_url:
        pageindex = 0
    else:
        pageindex = int(origin_url.rsplit('index_', 1)[-1].replace('.html', ''))
    soup = BeautifulSoup(response.body.decode('utf8'), "lxml")
    news_list = soup.find_all('li', style='overflow:hidden;')
    for news in news_list:
        news_date = news.find('span').text if news.find('span') else None
        if news_date:
            # e.g. http://www.caac.gov.cn/XWZX/MHYW/201607/t20160726_39146.html
            news_url = news.find('a').get('href')
            news_no = news_url.rsplit('/', 1)[-1].replace('.html', '')
            link = news.find('a', href=re.compile('http://www.caac.gov.cn/XWZX/MHYW/'))
            title = link.text.strip() if link else None
            item = NewsItem(news_date=news_date + ' 00:00:00',
                            title=title,
                            news_url=news_url,
                            news_no=news_no)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
            else:
                self.flag = pageindex
        else:
            logger.warning("mhyw can't find news_date")
    if not self.flag:
        next_url = self.next_url % (str(pageindex + 1))
        yield scrapy.Request(next_url)
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", None)
    soup = BeautifulSoup(response.body, "lxml")
    # TODO: the news list also contains topic/special pages that carry no
    # article body; for now those pages are discarded.
    news_txt = soup.find("div", class_="news_txt")
    if news_txt:
        content = news_txt.text
        news_about = soup.find("div", class_="news_about")  # holds referer_web and news_date
        if news_about:
            referer_web = news_about.p.string
            news_date = news_about.p.next_sibling.next_sibling.text[0:16]
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            item["referer_web"] = referer_web
            item["news_date"] = news_date
        item["content"] = content
        item["crawl_date"] = NOW
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = pageindex
    else:
        logger.info("news page can't find news_txt; it may be a topic page")
def parse(self, response):
    keyword = response.meta.get("keyword", None)
    soup = BeautifulSoup(response.body, "lxml")
    for t in soup.find_all("div", attrs={"class": "result title"}):
        item = NewsItem()
        url = t.find("a").get("href")    # news URL
        title = t.find("a").text         # news title
        temp_list = t.find("div", attrs={"class": "c-title-author"}).text.split(u"\xa0\xa0")
        website_name = temp_list[0]      # source site name
        news_time = temp_list[1]
        # TODO: relative times other than "X分钟前"/"X小时前" may still fail to parse
        now = datetime.datetime.now()
        if u"分钟前" in news_time:       # "X minutes ago"
            struct_date = now - datetime.timedelta(minutes=int(news_time[:-3]))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        elif u"小时前" in news_time:     # "X hours ago"
            struct_date = now - datetime.timedelta(hours=int(news_time[:-3]))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        else:
            news_date = "%s-%s-%s %s:00" % (news_time[:4], news_time[5:7], news_time[8:10], news_time[12:])
        item['news_url'] = url
        item['title'] = title
        item['news_date'] = news_date
        item['referer_web'] = website_name
        item["crawl_date"] = NOW
        item["keywords"] = [keyword]
        item = judge_news_crawl(item)
        if item:
            yield item
def parse(self, response): origin_url = response.url pageindex = origin_url.rsplit("=", 1)[-1] soup = BeautifulSoup(response.body, "lxml") news_temp = soup.find("table", class_="list").find("table", border="0").find("tbody") if not news_temp: return news_list = news_temp.find_all("tr")[1:] for news in news_list: temp = news.find("a") news_url = temp.get("href") title = temp.text.strip() temp = news.find_all("span") referer_web = temp[0].text.strip() news_date = temp[1].text.strip() news_no = news_url.rsplit("=", 1)[-1] item = NewsItem(news_date=news_date, title=title, referer_web=referer_web, news_url=news_url, news_no=news_no) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex if not self.flag: next_url = self.next_url % (int(pageindex) + 1) yield scrapy.Request(next_url)
def parse(self, response):
    origin_url = response.url
    if '_' not in origin_url:
        pageindex = '1'
        origin_url = origin_url.rsplit('.', 1)[0] + '_0' + '.html'
    else:
        pageindex = origin_url.rsplit('_', 1)[-1].replace('.html', '')
    soup = BeautifulSoup(response.body, "lxml")
    catalogue = soup.find('div', class_='nav_cur_index').find('span').text
    news_list = soup.find_all('div', class_='item_top')
    for news in news_list:
        news_date = news.find('span', class_='time').text.strip() if news.find('span', class_='time') else None
        if news_date:
            news_url = news.find('h2').find('a', href=re.compile('http://money.163.com/')).get('href')
            title = news.find('h2').find('a', href=re.compile('http://money.163.com/')).text
            item = NewsItem(news_date=news_date,
                            news_url=news_url,
                            title=title,
                            catalogue=catalogue)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
            else:
                self.flag = pageindex
        else:
            logger.warning("%s can't find news_date" % origin_url)
    if not self.flag:
        # page numbers are zero-padded to two digits: _02 ... _09, _10, ...
        pageindex = str(int(pageindex) + 1).zfill(2)
        next_url = self.next_url % (pageindex)
        yield scrapy.Request(next_url)
def parse(self, response):
    origin_url = response.url
    if origin_url == _36krSpider.start_urls[0]:
        # the home page embeds its news lists as JSON inside the page source
        temp = re.search('"highProjects\|focus":([\w\W]+?),"editorChoice\|focus"', response.body).group(1)
        news_list = json.loads(temp)
        temp = re.search('"editorChoice\|focus":([\w\W]+?),"feedHeaders\|column"', response.body).group(1)
        news_list.extend(json.loads(temp))
        for news in news_list:
            title = news["title"]
            news_url = news["url"]
            if "36kr.com" not in news_url or ".html" not in news_url:
                continue
            news_date = news["created_at"]
            pic = news["cover"]
            news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
            item = NewsItem(news_date=news_date,
                            title=title,
                            news_no=news_no,
                            news_url=news_url,
                            pic=pic)
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
        next_url = _36krSpider.next_url % ("", str(time.time()).split(".")[0])
        yield scrapy.Request(next_url)
    else:
        # subsequent pages come from a JSON API
        news_list = json.loads(response.body)["data"]["items"]
        next_id = news_list[-1]["id"]
        for news in news_list:
            title = news["title"]
            news_no = news["id"]
            news_url = "http://36kr.com/p/%s.html" % (news_no)
            if "36kr.com" not in news_url or ".html" not in news_url:
                continue
            news_date = news["created_at"]
            pic = news["cover"]
            abstract = news["summary"]
            topic = ",".join(re.findall('\["([\w\W]+?)",', news["extraction_tags"]))
            catalogue = news["column"]["name"]
            author = news["user"]["name"]
            item = NewsItem(news_date=news_date,
                            title=title,
                            news_no=news_no,
                            news_url=news_url,
                            pic=pic,
                            abstract=abstract,
                            topic=topic,
                            author=author,
                            catalogue=catalogue)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
            else:
                self.flag = 1
        if not self.flag:
            next_url = self.next_url % (next_id, str(time.time()).split(".")[0])
            yield scrapy.Request(next_url)
def parse_quick(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    news_list_inner = soup.find("div", class_="list-inner")
    next_timestamp = None
    news_list = news_list_inner.find_all("div", class_=re.compile(r"bulletin-item.*")) if news_list_inner else None
    if not news_list:
        # JSON-rendered page: the items sit at the top level
        news_list = soup.find_all("div", class_=re.compile(r"bulletin-item.*"))
    for index, news in enumerate(news_list):
        origin_date = news.find("div", class_="news-time").get("data-time", None) if news.find("div", class_="news-time") else None
        # the last article's timestamp becomes the cursor for the next page
        next_timestamp = origin_date if index == len(news_list) - 1 else None
        struct_date = datetime.datetime.fromtimestamp(int(origin_date))
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        title = news.find("a", class_="item-title").text if news.find("a", class_="item-title") else None
        news_url = news.find("a", class_="item-title").get("href", None) if news.find("a", class_="item-title") else None
        pic = news.find("img").get("src", None) if news.find("img") else None
        content = news.find("div", class_="item-desc").text if news.find("div", class_="item-desc") else None
        id_result = re.search(r"/(\d+)\.html", news_url)
        news_no = id_result.group(1) if id_result else None
        item = NewsItem(content=content,
                        news_url=news_url,
                        pic=pic,
                        title=title,
                        news_no=news_no,
                        news_date=news_date,
                        catalogue=u"快报")
        item = judge_news_crawl(item)
        if item:
            request = scrapy.Request(news_url, meta={"item": item}, callback=self.parse_quick_news)
            yield request
        else:
            self.quick_flag = int(self.quick_page)
    if not self.quick_flag:
        if next_timestamp:
            next_quick_url = self.quick_json_url % next_timestamp
            yield scrapy.Request(next_quick_url, callback=self.parse_quick)
        else:
            logger.warning("can't find next_timestamp, url is %s" % response)
def parse(self, response): soup = BeautifulSoup(response.body,"lxml") newslist = soup.find(name="div", attrs={"data-lastkey": True}) lastkey = newslist.get("data-lastkey",None) logger.info(lastkey) if not lastkey: logger.warning("can't find next page") else: if newslist: for i in newslist.children: #文章中间有其余无关信息 if i != u' ': news_url = self.domain+i.a.get('href',None) pic = i.find("img").get('data-src') if i.find("img") else None title = i.find("h3").string if i.find("h3") else None comment_num = i.find(class_="iconfont icon-message").string if i.find(class_="iconfont icon-message") else 0 heart = i.find(class_="iconfont icon-heart").string if i.find(class_="iconfont icon-heart") else 0 topic = i.find(class_="category").span.string if i.find(class_="category") else 0 news_date =None if i.find(name="span", attrs={"data-origindate": True}): news_date= i.find(name="span", attrs={"data-origindate": True}).get("data-origindate",None) if news_date: news_date = news_date[:-6] #no content and have heart&conment but not add item = NewsItem(title=title,news_url=news_url,pic=pic,topic=topic,news_date=news_date,comment_num=comment_num) # 所属目录 item['catalogue'] = "Top 15" if "tags" in response.url else u"商业" #判断是否结束 item = judge_news_crawl(item) if item : request = scrapy.Request(news_url,callback=self.parse_article) request.meta["item"] = item yield request else: if "tags" in response.url: self.top_flag = lastkey else: self.com_flag = lastkey next_url = None #判断各个类别是否需要爬取下一页 if "tags" in response.url: if not self.top_flag: next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % lastkey else: if not self.com_flag: next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % lastkey # logger.info(next_url) if next_url: yield scrapy.Request(next_url,callback=self.parse_next_page) else: logger.warning("can't find newslit")
def parse_topic(self, response):
    origin_url = response.url
    topic_url = origin_url.split("_", 1)[1].rsplit("_", 1)[0]
    pageindex = int(origin_url.rsplit("_", 1)[1].replace('.html', ''))
    catalogue = re.search('</a> -> ([\w\W]+?) </i></h3>', response.body).group(1).decode("gb2312")
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find_all('li')
    for news in news_list:
        news_date = news.find('i').text.split(' ')[1].replace(']', '') if news.find('i') else None
        if news_date:
            news_url = news.find('a', href=re.compile('http://news.carnoc.com/list/*.?')).get('href')
            news_no = news_url.rsplit('/', 1)[1].replace('.html', '')
            title = news.find('a', href=re.compile('http://news.carnoc.com/list/*.?')).text.strip()
            abstract = news.find('div').text.strip()
            pic = news.find('div').find('img', src=re.compile('http://pic.carnoc.com/file/*.?')).get('src') if news.find('div').find('img', src=re.compile('http://pic.carnoc.com/file/*.?')) else None
            tags = news.find('div', class_='keywordslist').text.strip() if news.find('div', class_='keywordslist') else None
            item = NewsItem(news_url=news_url,
                            news_date=news_date + ' 00:00:00',
                            title=title,
                            abstract=abstract,
                            news_no=news_no,
                            catalogue=catalogue,
                            pic=pic,
                            tags=tags)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
            else:
                self.flag[topic_url] = pageindex
        else:
            logger.warning("carnoc:%s can't find news_date" % origin_url)
    if not self.flag[topic_url]:
        next_url = origin_url.rsplit("_", 1)[0] + '_' + str(pageindex + 1) + '.html'
        yield scrapy.Request(next_url, callback=self.parse_topic)
def parse_topic(self, response):
    topic_url = response.url
    body = json.loads(response.body)
    news_list = body["data"]
    page = response.meta.get("page", "1")
    topic_name = response.meta.get("topic_name", None)
    # some categories (e.g. http://m.iwshang.com/category/20) carry no news
    if not news_list:
        self.flag[topic_url] = page
    for news in news_list:
        news_date_timestamp = news.get("published", None)
        struct_date = datetime.datetime.fromtimestamp(int(news_date_timestamp))
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        title = news.get("title", None)
        news_no = news.get("contentid", None)
        abstract = news.get("description", None)
        pic = news.get("thumb", None)
        news_url = news.get("url", None)              # mobile article URL
        referenceid = news.get("referenceid", None)   # the PC site uses a different id than mobile
        pc_news_url = self.pc_news_url % referenceid  # PC article URL
        item = NewsItem(news_date=news_date,
                        title=title,
                        news_no=news_no,
                        abstract=abstract,
                        pic=pic,
                        news_url=pc_news_url,
                        topic=topic_name)
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(pc_news_url, callback=self.parse_news, meta={"item": item})
        else:
            self.flag[topic_url] = page
    if not self.flag[topic_url]:
        page = str(int(page) + 1)
        post_data = {
            "inslider": "0",
            "page": page,
            "pagesize": "10",
        }
        yield scrapy.FormRequest(url=topic_url,
                                 formdata=post_data,
                                 callback=self.parse_topic,
                                 meta={"page": page})
def parse_topic(self, response):
    origin_url = response.url
    temp = origin_url.rsplit("/", 1)
    topic_url = temp[0]
    if temp[1] == "":
        pageindex = 1
    else:
        pageindex = temp[1].split("_", 1)[-1].split(".", 1)[0]
    soup = BeautifulSoup(response.body, "lxml")
    catalogue = soup.find("div", class_="arttitle").text.strip()
    news_list = soup.find("ul", class_="art_list mt11").find_all("li")
    for news in news_list:
        title_info = news.find("h5", class_="title")
        text_info = news.find("div", class_="text")
        # the list only shows a short month-day date; prepend the current year
        news_date = text_info.find("span", class_="time").text
        news_date = "%s-%s-%s %s:00" % (time.strftime("%Y"), int(news_date[0:2]), int(news_date[3:5]), news_date[7:])
        author = text_info.find("span", class_="place").text.strip()
        if author == "":
            author = None
        abstract = text_info.find("p", class_="info").text.strip()
        pic = text_info.find("img").get("src") if text_info.find("img") else None
        title = title_info.find("a").text.strip()
        news_url = title_info.find("a").get("href")
        temp = news_url.split("/")
        news_no = temp[-2] + "_" + temp[-1].split(".")[0]
        item = NewsItem(news_url=news_url,
                        news_date=news_date,
                        title=title,
                        abstract=abstract,
                        author=author,
                        news_no=news_no,
                        catalogue=catalogue,
                        pic=pic)
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
        else:
            self.flag[topic_url] = pageindex
    if not self.flag[topic_url]:
        next_url = "%s/index_%s.html" % (topic_url, int(pageindex) + 1)
        yield scrapy.Request(next_url, callback=self.parse_topic)
def parse(self, response): url = response.url pageindex = url.rsplit("/", 1)[-1] soup = BeautifulSoup(response.body) wrap = soup.find("div", class_="wrap") news_list = wrap.find_all("li", class_="pbox clr") for news in news_list: origin_date = news.find("div", class_="time").text.strip() struct_date = datetime.datetime.strptime(origin_date, "%Y / %m / %d\n%H:%M") news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") # if not self.flag or self.flag == pageindex: topic = news.find("div", class_="img").a.string.strip() if news.find( "div", class_="img") else None pic = news.find("img").get("src", None) if news.find("img") else None title = news.find("div", class_="tit").string if news.find( "div", class_="tit") else None abstract = news.find("div", class_="des").string if news.find( "div", class_="des") else None author = news.find("div", class_="aut").text.strip() if news.find( "div", class_="aut") else None news_url = news.find("div", class_="tit").parent.get("href") if news.find( "div", class_="tit") else None comment_num = news.find("a", class_="cmt").text if news.find( "a", class_="cmt") else None item = NewsItem(topic=topic, news_url=news_url, pic=pic, title=title, abstract=abstract, author=author, comment_num=comment_num, news_date=news_date) item = judge_news_crawl(item) if item: request = scrapy.Request(news_url, meta={"item": item}, callback=self.parse_news) yield request else: self.flag = int(pageindex) if not self.flag: pageindex = int(pageindex) + 1 next_url = self.next_url % pageindex yield scrapy.Request(next_url)
def parse_newest(self, response): soup = BeautifulSoup(response.body, "lxml") page = response.request.body.split('=')[-1] li = soup.find_all('li') if li: for news in li: news_date = news.find(class_="time").string[2:] if news.find( class_="time") else None struct_date = datetime.datetime.strptime( news_date, "%Y-%m-%d %H:%M") news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") title = news.find(class_="title").string if news.find( class_="title") else None news_url = self.domain + news.find(class_="title").a.get( "href", None) if news.find(class_="title") else None abstract = news.find(class_="info").string if news.find( class_="info") else None pic = self.domain + news.find('img').get( 'src', None) if news.find('img') else None topic = news.find(class_="type").string if news.find( class_="type") else None item = NewsItem(catalogue=u"最新内容", title=title, news_url=news_url, abstract=abstract, pic=pic, topic=topic, news_date=news_date) item = judge_news_crawl(item) if item: request = scrapy.Request(news_url, callback=self.parse_news, dont_filter=True) request.meta["item"] = item yield request else: self.flag = page else: logger.info("can't find news list") #下一页 if not self.flag: new_request = scrapy.FormRequest( self.start_url, formdata={'page': str(int(page) + 1)}, callback=self.parse_newest) yield new_request
def parse(self, response): origin_url = response.url result = re.search(r"page=(\d+)", origin_url) # import pdb;pdb.set_trace() pageindex = result.group(1) if result else None soup = BeautifulSoup(response.body) news_list = soup.find_all("div", class_="article-item clearfix") for news in news_list: info = news.find("div", class_="item-push-info") author = info.text[:-3] if info else None news_date = info.span.get("data-time") if info.span else None #时间戳 struct_date = datetime.datetime.fromtimestamp(int(news_date)) news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") title = news.find("a", class_="item-title").text if news.find( "a", class_="item-title") else None news_url = news.find("a", class_="item-title").get( "href", None) if news.find("a", class_="item-title") else None abstract = news.find("p", class_="item-desc").text if news.find( "p", class_="item-desc") else None pic = news.find("img").get("src", None) if news.find("img") else None id_result = re.search(r"/(\d+)\.html", news_url) news_no = id_result.group(1) if id_result else None item = NewsItem(abstract=abstract, news_url=news_url, pic=pic, title=title, author=author, news_no=news_no, news_date=news_date, catalogue=u"中间推荐模板") item = judge_news_crawl(item) if item: request = scrapy.Request(news_url, meta={"item": item}, callback=self.parse_news) yield request else: self.mid_flag = int(pageindex) if not self.mid_flag: pageindex = int(pageindex) + 1 next_url = self.middle_next_url % pageindex yield scrapy.Request(next_url)
def parse_news(self,response): item = response.meta.get("item",NewsItem()) pageindex = response.meta.get("pageindex",1) soup = BeautifulSoup(response.body, 'lxml') origin_date = soup.find("td", class_="time").text.strip() struct_date= datetime.datetime.strptime(origin_date,"%Y-%m-%d %H:%M") news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") content = soup.find("div", class_= "lph-article-comView").text.strip() if soup.find("div", class_= "lph-article-comView").text.strip() else None item["news_date"]= news_date item["crawl_date"]= NOW item["content"] = content item["catalogue"] = u"最新资讯" item = judge_news_crawl(item) if item: yield item else: self.flag = int(pageindex)
def parse_next_page(self, response):
    data = json.loads(response.body)
    newslist = data['data']["feeds"]
    last_key = data['data']['last_key'] if data['data']['has_more'] else None
    for news in newslist:
        post = news.get("post", None)
        if post:
            pic = post.get("image", None)
            title = post.get("title", None)
            comment_num = post.get("comment_count", None)
            praise_count = post.get("praise_count", None)  # "heart" count
            topic = post['category']['title']
            post_id = post.get("id", None)
            datatype = news.get("datatype", None)
            news_date = post.get("publish_time", None)
            if news_date:
                news_date = news_date[:-6]  # drop the timezone suffix
            # only regular articles carry both an id and a datatype
            if post_id and datatype:
                news_url = self.domain + "%s/%s" % (datatype + "s", post_id)
                item = NewsItem(title=title, news_url=news_url, pic=pic, topic=topic, news_date=news_date, comment_num=comment_num)
                item['catalogue'] = "Top 15" if "tags" in response.url else u"商业"
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url, callback=self.parse_article)
                    request.meta["item"] = item
                    yield request
                else:
                    if "tags" in response.url:
                        self.top_flag = last_key
                    else:
                        self.com_flag = last_key
    # next page
    next_url = None
    if "tags" in response.url:
        if not self.top_flag:
            next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % last_key
    else:
        if not self.com_flag:
            next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % last_key
    if next_url:
        yield scrapy.Request(next_url, callback=self.parse_next_page)
def parse(self, response): origin_url = response.url if "page" not in origin_url: pageindex = 1 else: pageindex = origin_url.split("&", 1)[0].rsplit("=", 1)[-1] soup = BeautifulSoup(response.body, "lxml") news_list = soup.find_all("article", class_="item-wrap cf") for news in news_list: news_date = news.find("span", class_="timeago").text.strip() if news.find( "span", class_="timeago") else None if news_date: title = news.find("a", class_="title").text.strip() news_url = news.find("a", class_="title").get("href") abstract = news.find("div", class_="brief").text.strip() author = news.find("span", class_="name").text.strip() news_no = news_url.rsplit("/", 1)[-1].split(".")[0] item = NewsItem(news_date=news_date + ":00", title=title, abstract=abstract, news_no=news_no, news_url=news_url, author=author) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex else: logger.warning("can't find news_date") if not self.flag: if pageindex == 1: next_url = self.next_url + soup.find("a", class_="more").get("href") else: next_url = self.next_url + "/?page=" + str( int(pageindex) + 1) + "&" + origin_url.split("&", 1)[-1] headers = {"X-Requested-With": "XMLHttpRequest"} yield scrapy.Request( next_url, headers=headers, )
def parse(self, response): origin_url = response.url pageindex = origin_url.rsplit("/")[-1] soup = BeautifulSoup(response.body, "lxml") news_list = soup.find_all("li", class_="mt24 pr") for news in news_list: news_date = news.find("a", href="javascript:;").text if news.find( "a", href="javascript:;") else None if news_date: news_url = news.find("p", class_="h1").a.get( "href", None) if news.find("p", class_="h1") else None news_no = news_url.rsplit("/")[-1].split(".")[ 0] #http://www.nbd.com.cn/articles/2016-07-25/1025147.html title = news.find("p", class_="h1").text.strip() if news.find( "p", class_="h1") else None #显示不全,在新闻具体页拿 # abstract = news.find("p",class_="news-p").text.strip() if news.find("p",class_="news-p") else None referer_web = news.find( "div", class_="messge").contents[-2].a.text if news.find( "div", class_="messge") else None referer_web = referer_web if referer_web != '' else None comment_num = soup.find("span", class_="fr").a.text if soup.find( "span", class_="fr") else None item = NewsItem( news_date=news_date, title=title, # abstract=abstract, referer_web=referer_web, comment_num=comment_num, news_no=news_no, news_url=news_url) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex else: logger.warning("can't find news_date") if not self.flag: next_url = self.next_url % (int(pageindex) + 1) yield scrapy.Request(next_url)
def parse_topic(self,response): origin_url = response.url temp = origin_url.rsplit("/",1) topic_url = temp[0] if temp[1] == "": pageindex = 1 else: pageindex = temp[1].split("_",1)[-1].split(".",1)[0] soup = BeautifulSoup(response.body,"lxml") catalogue = soup.find("div",class_ ="arttitle").text.strip() news_list = soup.find("ul",class_ = "art_list mt11").find_all("li") for news in news_list: title_info = news.find("h5",class_= "title") text_info = news.find("div",class_ = "text") news_date = text_info.find("span",class_ = "time").text news_date = "%s-%s-%s %s:00" % (time.strftime("%Y"),int(news_date[0:2]),int(news_date[3:5]),news_date[7:]) author = text_info.find("span",class_ = "place").text.strip() if author == "": author = None abstract = text_info.find("p",class_ = "info").text.strip() pic = text_info.find("img").get("src") if text_info.find("img") else None title = title_info.find("a").text.strip() news_url = title_info.find("a").get("href") temp = news_url.split("/") news_no = temp[-2] + "_" + temp[-1].split(".")[0] item = NewsItem( news_url =news_url, news_date = news_date, title = title, abstract = abstract, author = author, news_no = news_no, catalogue = catalogue, pic = pic, ) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item}) else: self.flag[topic_url] = pageindex if not self.flag[topic_url]: next_url = "%s/index_%s.html" % (topic_url,int(pageindex) + 1) yield scrapy.Request(next_url,callback=self.parse_topic)
def parse(self,response): origin_url = response.url temp = origin_url.rsplit("/",2) year_month = temp[-2] day = temp[-1][7:9] pageindex = datetime.datetime(int(year_month[:4]), int(year_month[4:]), int(day)) - datetime.timedelta(days=1) pageindex = pageindex.strftime('%Y%m%d') soup = BeautifulSoup(response.body,"lxml") temp = soup.find("div",class_="mod newslist") if soup.find("div",class_="mod newslist") else None if temp: news_list = temp.find_all("li") for news in news_list: news_url = news.find("a").get("href") title = news.find("a").text.strip() news_no = news_url.rsplit("/",1)[-1].replace(".htm","") temp = news.find("span").text news_date = "%s-%s-%s %s:%s" % (time.strftime("%Y"),temp[0:2],temp[3:5],temp[7:],"00") item = NewsItem(news_date=news_date, title=title, news_no = news_no, news_url=news_url ) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={"item":item}) else: self.flag = pageindex else: logger.warning("can't find news_list") temp = soup.find_all("a",class_ = "f12") if soup.find("a",class_ = "f12") else None today_text_url = None if temp: for t in temp: if u"下一页" in t: today_text_url = t.get("href") if today_text_url: yield scrapy.Request(today_text_url) else: if not self.flag: next_url = self.next_date_url % (pageindex[0:6],pageindex[6:]) yield scrapy.Request(next_url)
def parse(self, response):
    origin_url = response.url
    if origin_url == Tech163Spider.start_urls[0]:
        pageindex = 1
    else:
        pageindex = int(origin_url.split("_")[2][0:-1])
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find("ul", class_="newsList").find_all("li")
    for news in news_list:
        temp = news.find("p", class_="sourceDate").text.strip()
        news_date = temp[-19:]  # trailing "YYYY-MM-DD HH:MM:SS"
        if news_date:
            referer_web = temp[:-19]
            # the comment count is rendered client-side and may need a headless browser
            comment_num = news.find("a", class_="commentCount ").text.strip()
            temp = news.find("div", class_="titleBar clearfix").find("a")
            news_url = temp.get("href")
            title = temp.text.strip()
            news_no = news_url.rsplit("/", 1)[-1][:-5]
            item = NewsItem(news_date=news_date,
                            title=title,
                            referer_web=referer_web,
                            comment_num=comment_num,
                            news_no=news_no,
                            news_url=news_url)
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date")
    if not self.flag:
        pageindex = str(pageindex + 1).zfill(2)  # pages are zero-padded: _02 ... _09, _10
        next_url = self.next_url + "_%s/" % (pageindex)
        yield scrapy.Request(next_url)
def parse(self, response): origin_url = response.url pageindex = origin_url.rsplit("&", 1)[0].split("=", 1)[-1] dejson = json.loads(response.body) news_list = dejson["data"] for news in news_list: news_date = time.strftime( "%Y-%m-%d %H:%M:%S", time.localtime(float(news["time_published"]))) title = news["title"] abstract = news["summary"] read_num = news["number_of_reads"] comment_num = news["number_of_comments"] if len(news["hero_image"]["original"]) > 0: pic = news["hero_image"]["original"][0]["url"] else: pic = None news_url = news["short_url"] news_no = news_url.rsplit("/", 1)[-1].split(".", 1)[0] author = ",".join([t["username"] for t in news["authors"]]) topic = ",".join([t["tag"] for t in news["tags"]]) item = NewsItem(news_date=news_date, title=title, abstract=abstract, comment_num=comment_num, news_no=news_no, news_url=news_url, read_num=read_num, pic=pic, author=author, topic=topic) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex if not self.flag: next_url = self.next_url % (int(pageindex) + 30) yield scrapy.Request(next_url)
def parse(self, response): origin_url = response.url result = re.search(r"page=(\d+)",origin_url) # import pdb;pdb.set_trace() pageindex = result.group(1) if result else None soup = BeautifulSoup(response.body) news_list = soup.find_all("div",class_="article-item clearfix") for news in news_list: info = news.find("div",class_="item-push-info") author = info.text[:-3] if info else None news_date = info.span.get("data-time") if info.span else None #时间戳 struct_date = datetime.datetime.fromtimestamp(int(news_date)) news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") title =news.find("a",class_="item-title").text if news.find("a",class_="item-title") else None news_url =news.find("a",class_="item-title").get("href",None) if news.find("a",class_="item-title") else None abstract =news.find("p",class_="item-desc").text if news.find("p",class_="item-desc") else None pic = news.find("img").get("src",None) if news.find("img") else None id_result = re.search(r"/(\d+)\.html",news_url) news_no = id_result.group(1) if id_result else None item = NewsItem(abstract=abstract, news_url=news_url, pic=pic, title=title, author=author, news_no=news_no, news_date=news_date, catalogue=u"中间推荐模板") item = judge_news_crawl(item) if item: request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_news) yield request else: self.mid_flag =int(pageindex) if not self.mid_flag: pageindex = int(pageindex)+1 next_url = self.middle_next_url % pageindex yield scrapy.Request(next_url)
def parse_quick(self,response): soup = BeautifulSoup(response.body) news_list_inner = soup.find("div",class_="list-inner") next_timestamp=None news_list = news_list_inner.find_all("div",class_=re.compile(r"bulletin-item.*")) if news_list_inner else None #json 页面 if not news_list: news_list = soup.find_all("div",class_=re.compile(r"bulletin-item.*")) for index,news in enumerate(news_list): origin_date = news.find("div",class_="news-time").get("data-time",None) if news.find("div",class_="news-time") else None next_timestamp = origin_date if index == len(news_list)-1 else None #取最后一篇文章的时间戳作下一页的时间戳 struct_date = datetime.datetime.fromtimestamp(int(origin_date)) news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") title =news.find("a",class_="item-title").text if news.find("a",class_="item-title") else None news_url =news.find("a",class_="item-title").get("href",None) if news.find("a",class_="item-title") else None pic = news.find("img").get("src",None) if news.find("img") else None content =news.find("div",class_="item-desc").text if news.find("div",class_="item-desc") else None id_result = re.search(r"/(\d+)\.html",news_url) news_no = id_result.group(1) if id_result else None item = NewsItem(content=content, news_url=news_url, pic=pic, title=title, news_no=news_no, news_date=news_date, catalogue=u"快报") item = judge_news_crawl(item) if item: request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_quick_news) yield request else: self.quick_flag =int(self.quick_page) if not self.quick_flag: if next_timestamp: next_quick_url = self.quick_json_url % next_timestamp yield scrapy.Request(next_quick_url,callback=self.parse_quick) else: logger.warning("can't find next_timestamp,url is %s " % response)
def parse(self, response):
    pageindex = response.meta.get('pageindex', 1)
    data = json.loads(response.body)
    news_list = data['posts']
    articleCursor = data['articleCursor']
    for news in news_list:
        item = NewsItem()
        news_data = news.get("resource", None)
        if news_data:
            createdAt = news_data.get("createdAt", None)
            # the API returns UTC epoch seconds; shift to UTC+8 and normalize
            struct_date = datetime.datetime.utcfromtimestamp(int(createdAt)) + datetime.timedelta(hours=8)
            item["news_date"] = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            item["title"] = news_data.get("title", None)
            item["comment_num"] = news_data.get("commentCount", None)
            item["pic"] = news_data.get("imageUrl", None)
            item["news_no"] = news_data.get("id", None)
            news_url = news_data.get("url", None)
            item["news_url"] = news_url
            item["abstract"] = news_data.get("summary", None)
            item["author"] = news_data.get("user", None).get("screenName", None) if news_data.get("user", None) else None
            item = judge_news_crawl(item)  # check the crawl time window
            if item:
                request = scrapy.Request(news_url, callback=self.parse_news)
                request.meta['item'] = item
                yield request
            else:
                self.flag = pageindex
        else:
            logger.info("can't find search_result")
    # next page
    if not self.flag:
        next_url = self.page_url % str(articleCursor)
        yield scrapy.Request(next_url, callback=self.parse)
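# Two timestamp conversions appear in these spiders: datetime.fromtimestamp()
# interprets epoch seconds in the host's local timezone, while
# utcfromtimestamp() + timedelta(hours=8), as above, pins the result to UTC+8
# regardless of where the crawler runs. The two only agree on a UTC+8 host.
# A small helper making that explicit -- a sketch, not part of the original
# codebase, and the name `to_beijing_time` is hypothetical:
def to_beijing_time(epoch_seconds):
    """Convert epoch seconds to a UTC+8 datetime, independent of host timezone."""
    return datetime.datetime.utcfromtimestamp(int(epoch_seconds)) + datetime.timedelta(hours=8)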
def parse_news(self, response):
    # fills in content, news_date, news_no, crawl_date, referer_web
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body, "lxml")
    # the exact publish time is only available on the article page; some pages
    # (e.g. http://info.meadin.com/PictureNews/2938_1.shtml) carry no date
    news_date = soup.find("span", class_="arial").text if soup.find("span", class_="arial") else None
    if news_date:
        referer_web = list(soup.find("p", class_="source").strings)[-1] if soup.find("p", class_="source") else None
        # extract the article body
        content = None
        art = soup.find("div", class_="article js-article")
        if art:
            art.find("div", class_="intro").replace_with("")  # strip the abstract block
            content = art.text.strip()
        news_no = response.url.split("/")[-1].split("_")[0]
        item["news_date"] = news_date
        item["content"] = content
        item["referer_web"] = referer_web
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = pageindex
    else:
        logger.warning("can't find news_date. the url is %s" % response.url)
def parse(self,response): origin_url = response.url if "page" not in origin_url: pageindex = 1 else: pageindex = origin_url.split("&",1)[0].rsplit("=",1)[-1] soup = BeautifulSoup(response.body,"lxml") news_list = soup.find_all("article", class_="item-wrap cf") for news in news_list: news_date = news.find("span" , class_ = "timeago").text.strip() if news.find("span" , class_ = "timeago") else None if news_date: title = news.find("a" ,class_ = "title").text.strip() news_url = news.find("a" ,class_ = "title").get("href") abstract = news.find("div",class_ = "brief").text.strip() author = news.find("span" , class_ = "name").text.strip() news_no = news_url.rsplit("/",1)[-1].split(".")[0] item = NewsItem(news_date=news_date + ":00", title=title, abstract=abstract, news_no=news_no, news_url=news_url, author = author ) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex else: logger.warning("can't find news_date") if not self.flag: if pageindex == 1: next_url = self.next_url + soup.find("a" , class_ = "more").get("href") else: next_url = self.next_url + "/?page=" + str(int(pageindex) + 1) + "&" + origin_url.split("&",1)[-1] headers = { "X-Requested-With":"XMLHttpRequest" } yield scrapy.Request(next_url,headers=headers,)
def parse_news(self, response):
    # the article page is rendered client-side, so fetch it with a real browser
    driver = webdriver.Firefox()
    item = response.meta.get("item", NewsItem())
    page = response.meta.get("page", 1)
    index = response.meta.get("index", 0)
    origin_url = response.url
    no_res = re.search(r"/(\d+)\.html", origin_url)
    news_no = no_res.group(1) if no_res else None
    driver.get(origin_url)
    time.sleep(3)  # wait for the page scripts to finish
    code = driver.page_source
    driver.quit()
    soup = BeautifulSoup(code, "lxml")
    authors = soup("span", class_="author-name")
    referer_web = None
    for a in authors:
        if "来源".decode("utf-8") in a.text:  # "来源" = source
            referer_web = a.text[3:]
    news_date = soup.find("span", class_="article-time").text if soup.find("span", class_="article-time") else None
    content = soup.find("div", id="article_content").get_text(strip=True) if soup.find("div", id="article_content") else None
    item["content"] = content
    item["news_date"] = news_date
    item["referer_web"] = referer_web
    item["crawl_date"] = NOW
    item["news_no"] = news_no
    item = judge_news_crawl(item, end_day=2)
    if item:
        yield item
    else:
        self.flag = page
    # the next-page decision is deferred to the last article (index 19) of each page
    if index == 19 and not self.flag:
        next_page = page + 1
        next_page_url = self.next_page_url % next_page
        yield scrapy.Request(next_page_url, meta={"page": next_page})
def parse(self,response): origin_url = response.url pageindex = origin_url.rsplit("/",3)[-3] soup = BeautifulSoup(response.body,"lxml") news_list = soup.find_all("div",class_=re.compile("zheng_list")) for news in news_list: news_date = news.find("div" ,class_ = "Function").text.strip() if news.find("div" ,class_ = "Function") else None if news_date: temp = news.find("a" , class_ = "t_css") if news.find("a" , class_ = "t_css") else None if not temp: continue news_url = temp.get("href") title = temp.get("title") news_no = news_url.rsplit("/",1)[-1].split(".")[0] abstract = news.find("p").text.strip() if news.find("p") else None if len(news_date) == 10: news_date = news_date + " 00:00:00" else: news_date=news_date + ":00" item = NewsItem(news_date=news_date, title=title, abstract=abstract, news_no=news_no, news_url=news_url ) item = judge_news_crawl(item) if item: yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item}) else: self.flag = pageindex else: logger.warning("can't find news_date") nextDate = datetime.datetime(int(pageindex[:4]), int(pageindex[4:6]), int(pageindex[6:8])) - datetime.timedelta(days=1) if not self.flag: next_url = self.next_url % (nextDate.strftime('%Y%m%d')) yield scrapy.Request(next_url)
def parse_news(self,response): #content,news_date,news_no,crawl_date,referer_web item = response.meta.get("item",NewsItem()) pageindex = response.meta.get("pageindex",1) soup = BeautifulSoup(response.body) # news_date = item.get("news_date",None) #需要爬取具体的时间 news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None #http://info.meadin.com/PictureNews/2938_1.shtml Exception if news_date: # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S") # delta = self.end_now-struct_date # if delta.days == self.end_day: # raise CloseSpider('today scrapy end') referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None #爬取正文 art,content = None,None art = soup.find("div",class_="article js-article") if art: #剔除摘要! art.find("div",class_="intro").replace_with("") content =art.text.strip() news_no =response.url.split("/")[-1].split("_")[0] item["news_date"]=news_date item["content"]=content item["referer_web"]=referer_web item["crawl_date"]=NOW item["news_no"]=news_no item = judge_news_crawl(item) if item: yield item else: self.flag = pageindex else: logger.warning("can't find news_date.the url is %s" % response.url)
def parse(self,response): pageindex = response.meta.get('pageindex', 1) data = json.loads(response.body) news_list = data['posts'] articleCursor = data['articleCursor'] for news in news_list: item = NewsItem() news_data = news.get("resource",None) if news_data: createdAt = news_data.get("createdAt", None) struct_date = datetime.datetime.utcfromtimestamp(int(createdAt))+ datetime.timedelta(hours=8) news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") #规范日期时间 item["news_date"]=news_date item["title"]=news_data.get("title", None) item["comment_num"]=news_data.get("commentCount", None) item["pic"]=news_data.get("imageUrl", None) item["news_no"]=news_data.get("id", None) item["title"]=news_data.get("title", None) news_url = news_data.get("url", None) item["news_url"] = news_url item["abstract"] = news_data.get("summary", None) item["author"] = news_data.get("user", None).get("screenName", None) if news_data.get("user", None) else None item = judge_news_crawl(item) #判断是否符合爬取时间 if item: request = scrapy.Request(news_url,callback=self.parse_news) request.meta['item'] = item yield request else: self.flag= pageindex else: logger.info("can't find search_result") #下一页 # if int(pageindex)<self.crawl_page: if not self.flag: next_url = self.page_url % str(articleCursor) yield scrapy.Request(next_url,callback=self.parse)
def parse(self, response): url = response.url pageindex = url.rsplit("/",1)[-1] soup = BeautifulSoup(response.body) wrap = soup.find("div",class_="wrap") news_list = wrap.find_all("li",class_="pbox clr") for news in news_list: origin_date =news.find("div",class_="time").text.strip() struct_date= datetime.datetime.strptime(origin_date,"%Y / %m / %d\n%H:%M") news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") # if not self.flag or self.flag == pageindex: topic = news.find("div",class_="img").a.string.strip() if news.find("div",class_="img") else None pic = news.find("img").get("src",None) if news.find("img") else None title = news.find("div",class_="tit").string if news.find("div",class_="tit") else None abstract = news.find("div",class_="des").string if news.find("div",class_="des") else None author = news.find("div",class_="aut").text.strip() if news.find("div",class_="aut") else None news_url = news.find("div",class_="tit").parent.get("href") if news.find("div",class_="tit") else None comment_num = news.find("a",class_="cmt").text if news.find("a",class_="cmt") else None item = NewsItem(topic=topic, news_url=news_url, pic=pic, title=title, abstract=abstract, author=author, comment_num=comment_num, news_date=news_date) item = judge_news_crawl(item) if item: request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_news) yield request else: self.flag =int(pageindex) if not self.flag: pageindex = int(pageindex)+1 next_url = self.next_url % pageindex yield scrapy.Request(next_url)
def parse_topic(self, response):
    origin_url = response.url
    topic_url = origin_url.split("_", 1)[1].rsplit("_", 1)[0]
    pageindex = int(origin_url.rsplit("_", 1)[1].replace('.html', ''))
    catalogue = re.search(r'</a> -> ([\w\W]+?) </i></h3>',
                          response.body).group(1).decode("gb2312")
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find_all('li')
    for news in news_list:
        news_date = news.find('i').text.split(' ')[1].replace(']', '') if news.find('i') else None
        if news_date:
            news_url = news.find('a', href=re.compile(r'http://news.carnoc.com/list/*.?')).get('href')
            news_no = news_url.rsplit('/', 1)[1].replace('.html', '')
            title = news.find('a', href=re.compile(r'http://news.carnoc.com/list/*.?')).text.strip()
            abstract = news.find('div').text.strip()
            pic = news.find('div').find('img', src=re.compile(r'http://pic.carnoc.com/file/*.?')).get('src') if news.find(
                'div').find('img', src=re.compile(r'http://pic.carnoc.com/file/*.?')) else None
            tags = news.find('div', class_='keywordslist').text.strip() if news.find(
                'div', class_='keywordslist') else None
            item = NewsItem(
                news_url=news_url,
                news_date=news_date + ' 00:00:00',
                title=title,
                abstract=abstract,
                news_no=news_no,
                catalogue=catalogue,
                pic=pic,
                tags=tags,
            )
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news,
                                     meta={'item': item})
            else:
                self.flag[topic_url] = pageindex
        else:
            logger.warning("carnoc:%s can't find news_date" % origin_url)
    if not self.flag[topic_url]:
        next_url = origin_url.rsplit("_", 1)[0] + '_' + str(pageindex + 1) + '.html'
        yield scrapy.Request(next_url, callback=self.parse_topic)
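The href=re.compile(...) filters above work because BeautifulSoup matches a compiled pattern against the attribute value with re.search, so even a loose prefix pattern selects the right links. A standalone check of that behaviour on a toy snippet:

import re
from bs4 import BeautifulSoup

html = '<li><a href="http://news.carnoc.com/list/123/456.html">t</a></li>'
soup = BeautifulSoup(html, "lxml")
# find() accepts a compiled regex as an attribute filter and matches
# via re.search, so a prefix pattern is enough here.
link = soup.find("a", href=re.compile(r"^http://news\.carnoc\.com/list/"))
print(link.get("href"))  # -> http://news.carnoc.com/list/123/456.html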
def parse_newest(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    page = response.request.body.split('=')[-1]  #the POST body looks like "page=N"
    li = soup.find_all('li')
    if li:
        for news in li:
            news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
            if not news_date:
                continue
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find(class_="title").string if news.find(class_="title") else None
            news_url = self.domain + news.find(class_="title").a.get("href", None) if news.find(class_="title") else None
            abstract = news.find(class_="info").string if news.find(class_="info") else None
            pic = self.domain + news.find('img').get('src', None) if news.find('img') else None
            topic = news.find(class_="type").string if news.find(class_="type") else None
            item = NewsItem(catalogue=u"最新内容",
                            title=title,
                            news_url=news_url,
                            abstract=abstract,
                            pic=pic,
                            topic=topic,
                            news_date=news_date)
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url, callback=self.parse_news,
                                         dont_filter=True)
                request.meta["item"] = item
                yield request
            else:
                self.flag = page
    else:
        logger.info("can't find news list")
    #next page
    if not self.flag:
        new_request = scrapy.FormRequest(self.start_url,
                                         formdata={'page': str(int(page) + 1)},
                                         callback=self.parse_newest)
        yield new_request
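This spider pages by POST rather than by URL, so the current page number has to be read back out of the form body that Scrapy keeps on response.request. A quick check of that round trip, with the "page" field name taken from the spider above:

# What response.request.body would look like after the FormRequest above.
body = "page=3"  # hypothetical form body
page = body.split('=')[-1]
assert page == "3"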
def parse(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    #work out the next-page number from the current url
    origin_url = response.url
    next_page_number = 2
    if "page" in origin_url:
        next_page_number = int(origin_url.rsplit('/')[-2]) + 1
    search = soup.find("section", id="omc-main")
    if search:
        news_list = search.find_all("article")
        if news_list:
            for news in news_list:
                abstract, author, news_date = None, None, None
                #find date and author
                if news.find("p", class_="omc-date-time-one"):
                    date_aut = list(
                        news.find("p", class_="omc-date-time-one").strings)
                    author = date_aut[1]
                    news_date = date_aut[2][5:]
                    struct_date = datetime.datetime.strptime(
                        news_date, "%Y-%m-%d")
                    news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                title = news.h2.text if news.h2 else None
                news_url = news.h2.a.get("href", None) if news.h2.a else None
                news_no = news_url.rsplit("/")[-2]
                topic_group = news.find("h3", class_="omc-blog-one-cat")
                topics = []
                if topic_group:
                    for topic in topic_group.find_all("a"):
                        topics.append(topic.string)
                #the strings iterator leaves whitespace gaps between topics
                # topic = list(news.find("h3",class_="omc-blog-one-cat").strings) if news.find("h3",class_="omc-blog-one-cat") else None
                if news.find("p", class_="omc-blog-one-exceprt"):
                    abstract = news.find(
                        "p", class_="omc-blog-one-exceprt").text.strip()
                pic = news.img.get("src", None) if news.img else None
                #build the news item and hand it off to the content parser
                item = NewsItem(news_url=news_url,
                                title=title,
                                abstract=abstract,
                                pic=pic,
                                author=author,
                                news_date=news_date,
                                crawl_date=NOW,
                                news_no=news_no,
                                topic=topics)
                item = judge_news_crawl(item, end_day=END_DAY + 1)
                if item:
                    request = scrapy.Request(news_url, callback=self.parse_news)
                    request.meta["item"] = item
                    if news_url:
                        yield request
                    else:
                        logger.warning("can't find news url")
                else:
                    self.flag = next_page_number - 1
        else:
            logger.info("can't find news list")
    else:
        logger.info("can't find main container")
    if not self.flag:
        next_url = self.page_url % next_page_number
        yield scrapy.Request(next_url, callback=self.parse)
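The next-page arithmetic above relies on the trailing slash of WordPress-style /page/N/ URLs, which puts the page number in the second-to-last slot after a split. A quick check on a hypothetical URL:

# Hypothetical paginated url of the /page/N/ form assumed by the spider.
origin_url = "http://example.com/page/3/"
next_page_number = int(origin_url.rsplit('/')[-2]) + 1
assert next_page_number == 4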
def parse_news(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    origin_url = response.url
    item = response.meta.get("item", NewsItem())
    news_index = response.meta.get("news_index", 1)  #current page of the article body
    pageindex = response.meta.get("pageindex", 1)  #list page the article was found on
    #first page of the article; there may be more pages
    if news_index == 1:
        news_date = soup.find("span", class_="date").get_text(strip=True) if soup.find("span", class_="date") else None
        #e.g. 2016.07.28 22:12:52
        if news_date:
            struct_date = datetime.datetime.strptime(news_date, "%Y.%m.%d %H:%M:%S")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        comment_text = soup.find("span", id="comment_num").text if soup.find("span", id="comment_num") else None
        if comment_text == u'':
            comment_num = 0
        else:
            comment_num = int(comment_text)
        #the body may span several pages  TODO: the body contains js that needs stripping out
        content_txt = soup.find("div", class_="content_txt")
        content = content_txt.get_text(strip=True) if content_txt else None
        referer_web = soup.find("span", id="source_baidu").a.get_text(strip=True) if soup.find("span", id="source_baidu") else None
        referer_url = soup.find("span", id="source_baidu").a.get("href") if soup.find("span", id="source_baidu") else None
        item["content"] = content
        item["news_date"] = news_date
        item["comment_num"] = comment_num
        item["crawl_date"] = NOW
        item["referer_web"] = referer_web
        item["referer_url"] = referer_url
        catalogue = item["catalogue"]
        item = judge_news_crawl(item)
        if item:
            if content_txt and u"下一页" in content_txt.find("div", class_="page").text:
                #rewrite the url to point at the next page,
                #e.g. http://www.techweb.com.cn/world/2016-07-26/2365804_2.shtml
                news_next_page = str(int(news_index) + 1)
                news_next_url = re.sub(r'\.shtml', '_%s.shtml' % news_next_page, origin_url)
                yield scrapy.Request(news_next_url, callback=self.parse_news,
                                     meta={"news_index": news_next_page, "item": item})
            else:
                yield item
        else:
            #"原创" is the original-content catalogue
            if "原创".decode("utf-8") == catalogue:
                self.yuanchuang_flag = pageindex
            else:
                self.news_flag = pageindex
    #following pages of the article body
    else:
        content_txt = soup.find("div", class_="content_txt")
        content = content_txt.get_text(strip=True)
        item["content"] += u"\n第%s页\n%s" % (news_index, content)
        if item:
            #a disabled "next" control means this is the last page
            if not content_txt.find("div", class_="page").find("span", class_="disabled"):
                #rewrite the url to point at the next page,
                #e.g. http://www.techweb.com.cn/world/2016-07-26/2365804_2.shtml
                news_next_url = re.sub(r'_.+?\.shtml', '_%s.shtml' % str(int(news_index) + 1), origin_url)
                yield scrapy.Request(news_next_url, callback=self.parse_news,
                                     meta={"news_index": str(int(news_index) + 1), "item": item})
            else:
                yield item
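The two re.sub patterns above first turn a first-page URL into its _N.shtml continuation, then step N forward on subsequent pages. A worked example using the URL format quoted in the comments:

import re

url = "http://www.techweb.com.cn/world/2016-07-26/2365804.shtml"
page2 = re.sub(r"\.shtml", "_2.shtml", url)
# -> http://www.techweb.com.cn/world/2016-07-26/2365804_2.shtml
page3 = re.sub(r"_.+?\.shtml", "_3.shtml", page2)
# -> http://www.techweb.com.cn/world/2016-07-26/2365804_3.shtml
print(page2)
print(page3)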
def parse_index(self, response):
    """
    Fetch the article list of a WeChat public account.
    :param response: the account's profile page
    :return:
    """
    weixin_id = response.meta.get("weixin_id", None)
    #the article list is embedded in the page as "var msgList = {...};"
    msg = re.search(r"var msgList =([\W\w]+?)seajs.use",
                    response.body).group(1).strip()[:-1]
    msg_dict = json.loads(msg)
    weixin_name = response.meta.get("name", None)
    article_list = []
    for u in msg_dict["list"]:
        news_date = u["comm_msg_info"]["datetime"]  #timestamp shared by all articles pushed that day
        news_date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(news_date)))  #convert the timestamp
        title = u["app_msg_ext_info"]["title"]  #title of the day's headline article
        news_url = "http://mp.weixin.qq.com" + u["app_msg_ext_info"]["content_url"]  #url of the day's headline article
        pic = u["app_msg_ext_info"]["cover"]  #cover image
        abstract = u["app_msg_ext_info"]["digest"]  #abstract
        author = u["app_msg_ext_info"]["author"]  #author
        fileid = u["app_msg_ext_info"]["fileid"]  #fileid
        source_url = u["app_msg_ext_info"]["source_url"]  #source url
        article = {"weixin_id": weixin_id, "weixin_name": weixin_name,
                   "news_date": news_date, "title": title,
                   "news_url": news_url.replace("&amp;", "&"), "pic": pic,
                   "abstract": abstract, "author": author,
                   "fileid": fileid, "source_url": source_url}
        article_list.append(article)
        item = WechatItem(
            weixin_id=weixin_id,
            weixin_name=weixin_name,
            news_date=news_date,
            title=title,
            news_url=news_url.replace("&amp;", "&"),  #unescape the html-encoded url
            pic=pic,
            abstract=abstract,
            author=author,
            fileid=fileid,
            source_url=source_url
        )
        item = judge_news_crawl(item)
        if item:
            time.sleep(random.randint(2, 5))  #throttle to dodge the anti-crawler
            print item['news_url']
            yield scrapy.Request(item['news_url'], callback=self.parse_news,
                                 meta={"item": item})
        #the rest of that day's push sits in multi_app_msg_item_list
        for c in u["app_msg_ext_info"]["multi_app_msg_item_list"]:
            title = c["title"]
            news_url = "http://mp.weixin.qq.com" + c["content_url"]
            pic = c["cover"]
            abstract = c["digest"]
            author = c["author"]
            fileid = c["fileid"]
            source_url = c["source_url"]
            article = {"weixin_id": weixin_id, "weixin_name": weixin_name,
                       "news_date": news_date, "title": title,
                       "news_url": news_url.replace("&amp;", "&"), "pic": pic,
                       "abstract": abstract, "author": author,
                       "fileid": fileid, "source_url": source_url}
            article_list.append(article)
            item = WechatItem(
                weixin_id=weixin_id,
                weixin_name=weixin_name,
                news_date=news_date,
                title=title,
                news_url=news_url.replace("&amp;", "&"),
                pic=pic,
                abstract=abstract,
                author=author,
                fileid=fileid,
                source_url=source_url
            )
            item = judge_news_crawl(item)
            if item:
                print item['news_url']
                time.sleep(random.randint(2, 5))
                yield scrapy.Request(item['news_url'], callback=self.parse_news,
                                     meta={"item": item})
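The msgList blob is embedded in the profile page as a JavaScript assignment, so the regex captures everything between the assignment and the following seajs.use call and then trims the trailing semicolon before json.loads. A standalone check on a toy page body:

import json
import re

# Toy stand-in for response.body, shaped like the real profile page.
body = 'var msgList = {"list": []};\nseajs.use("sougou/profile.js");'
raw = re.search(r"var msgList =([\W\w]+?)seajs.use", body).group(1).strip()[:-1]
msg_dict = json.loads(raw)
print(msg_dict["list"])  # -> []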