def start_crawl(self):
    '''Start crawling the news data.'''
    # Page and item counts of the target site.
    fileName = ''
    stopFlag = False
    CrawlData = {}
    # FilterData = {}
    pages, indexcount = 2275, 22757
    fields = [
        "reporttime", "reporter", "title", "sitename", "keyword", "content",
        "imgUrl"
    ]
    # Initialize the LTP-based event extractor (results can optionally be written to a log file).
    EventInfoExtract = fact_triple_extraction1chen.EventInfoExtract(
        r"3.3.0\ltp_data", 'out123.txt')
    # index = 0
    # Walk through every news listing page.
    for i in range(1, pages + 1):
        # Build the URL of the current listing page.
        urls = self.starturl.replace("page=1", "page=%d" % i)
        # Returns a list; each element holds the summary info of one news item.
        infodexs = self.index_info(urls, self.my_headers, i)
        if len(infodexs) > 0:
            csv_data = []
            for infodex in infodexs:
                # Stop once the cut-off date has been reached.
                if self.deadlineTime != 0 and infodex["reporttime"][0:10] < self.deadlineTime:
                    stopFlag = True
                    if EventInfoExtract.segmentor is not None:
                        EventInfoExtract.release_module()
                    break
                # if infodex["keyword"] == "视频":
                #     continue
                print "===================== News info =========================="
                print infodex["url"]
                # body = self.get_news_body(infodex["url"], self.my_headers)
                # if body != None:
                #     infodex["content"] = body["content"]
                # infodex["reporter"] = u"新华社"
                # index += 1
                # Restore the original paragraphs.
                # datas = infodex["content"].split(u" ")
                # EventInfoExtract.InitModule()
                #
                # # print 'data-----------------', datas
                # for data in datas:
                #     print data.encode("utf-8")
                #
                #     if len(data.encode("utf-8")) < 30 or data.encode("utf-8") == None:
                #         continue
                #     TimeAndAddress = EventInfoExtract.addresssTime_extract(data.encode("utf-8"))
                #     # print TimeAndAddress
                #     fact_attribute = EventInfoExtract.fact_attribute_from_text(data.encode("utf-8"))
                #     orgnization = EventInfoExtract.organization_from_text(data.encode("utf-8"))
                #     death_num, hurt_num, total_num = EventInfoExtract.death_num_from_text(data.encode("utf-8"))
                #     if TimeAndAddress[0]["date"] == "" and TimeAndAddress[0]["address"] == "":
                #         continue
                #     print '''
                #     time\taddress\ttype\torganization\ttotal casualties\tdead\thurt
                #     %s--%s--%s--%s--%s--%s--%s''' % (TimeAndAddress[0]['date'], TimeAndAddress[0]['address'],
                #                                      fact_attribute, orgnization, total_num, death_num, hurt_num)
                #
                # print("start to release")
                # Also save the original news content.
                # imgUrl = infodex["imgUrl"]
                # imgName = ""
                # if imgUrl != None and imgUrl != "":
                #     imgName = imgUrl.split("/")[-1]
                #     urlretrieve("http://tpic.home.news.cn/xhCloudNewsPic/" + imgUrl, "./imgs/" + imgName)
                print infodex
                news = {}
                news["title"] = infodex["title"]
                news["des"] = infodex["des"]
                news["pubtime"] = infodex["reporttime"]
                news["content"] = infodex["content"]
                # news["img"] = imgName
                news["url"] = infodex["url"]
                csv_data.append(news)
                # news['time'] = TimeAndAddress[0]['date']
                # news['address'] = TimeAndAddress[0]['address']
                # news['type'] = fact_attribute
                # if total_num != None:
                #     news['total'] = "casualties:" + total_num
                # else:
                #     if death_num == None:
                #         death_num = "0"
                #     if hurt_num == None:
                #         hurt_num = "0"
                #     # print death_num, hurt_num
                #     news['total'] = "dead:" + death_num + ", hurt:" + hurt_num
                # news["gname"] = orgnization
                # news['nwound'] = hurt_num
                # news['nkill'] = death_num
                # insertSql = db_connect.generateSQL(news)
                # print insertSql
                # db_connect.insertOneData(insertSql)
                # PostData(data, hosturl)
                # EventInfoExtract.release_module()
            if stopFlag:
                print "Stopping crawl"
                break
            self.save_to_file(csv_data, 'cctv.csv', i)
    if EventInfoExtract.segmentor is not None:
        EventInfoExtract.release_module()
    print "Here Release ==========="
    sys.exit(0)
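
# --- Hedged sketch (not part of the original source) -------------------------
# start_crawl() above calls self.save_to_file(csv_data, 'cctv.csv', i), but the
# method body is not shown in this section. The signature, the field names and
# the append-per-page behaviour below are assumptions inferred from that call
# site; treat this as one possible implementation, not the project's actual one.
def save_to_file(self, csv_data, file_name, page_index):
    import csv
    # Write the header only for the first page, then append subsequent pages.
    mode = 'wb' if page_index == 1 else 'ab'
    field_names = ["title", "des", "pubtime", "content", "url"]
    with open(file_name, mode) as f:
        writer = csv.DictWriter(f, fieldnames=field_names)
        if page_index == 1:
            writer.writeheader()
        for row in csv_data:
            # Encode unicode values so the Python 2 csv module can write them.
            writer.writerow({k: (v.encode("utf-8") if isinstance(v, unicode) else v)
                             for k, v in row.items()})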
def craw(self, root_url):
    count = 1
    # Collect the listing pages first (offset paging, 20 items per page).
    for i in range(30):  # 21
        start_url = root_url.replace("offset=20", "offset=%d" % count)
        html = self.downloader.download(start_url)
        # print html
        new_urls = self.parser.parse(start_url, html)
        self.urls.add_new_urls(new_urls)
        print 'craw %d %s' % (count, start_url)
        count = count + 20
    count = 1
    # Visit every collected article URL and run the LTP-based event extraction.
    while self.urls.has_new_url():
        try:
            new_url = self.urls.get_new_url()
            print('id %d' % count)
            html_cont = self.downloader.download(new_url)
            self.content = []
            title, time, des, content, img_url = self.parser.get_data(new_url, html_cont)
            print(title, time, des, self.content)
            print('==========================================')
        except Exception as e:
            print(e)
            continue
        EventInfoExtract = fact_triple_extraction1chen.EventInfoExtract(r"3.3.0\ltp_data", 'out123.txt')
        EventInfoExtract.InitModule()
        TimeAndAddress = EventInfoExtract.addresssTime_extract(content.encode("utf-8"))
        # print TimeAndAddress
        fact_attribute = EventInfoExtract.fact_attribute_from_text(content.encode("utf-8"))
        orgnization = EventInfoExtract.organization_from_text(content.encode("utf-8"))
        death_num, hurt_num, total_num = EventInfoExtract.death_num_from_text(content.encode("utf-8"))
        # if TimeAndAddress[0]["date"] == "" and TimeAndAddress[0]["address"] == "":
        #     continue
        print '''
        time\taddress\ttype\torganization\ttotal\tdead\thurt
        %s--%s--%s--%s--%s--%s--%s''' % (
            TimeAndAddress[0]['date'], TimeAndAddress[0]['address'], fact_attribute,
            orgnization, total_num, death_num, hurt_num)
        # Basic fields taken directly from the page.
        news = {}
        news['id'] = count
        news['pubtime'] = time
        news['title'] = title
        news['des'] = des
        news['content'] = content
        news['url'] = new_url
        news['img'] = img_url
        # Fields extracted from the article text.
        news['time'] = TimeAndAddress[0]['date']
        news['address'] = TimeAndAddress[0]['address']
        news['type'] = fact_attribute
        if total_num is not None:
            news['total'] = "total:" + total_num
        else:
            if death_num is None:
                death_num = "0"
            if hurt_num is None:
                hurt_num = "0"
            # print death_num, hurt_num
            news['total'] = "dead:" + death_num + ", hurt:" + hurt_num
        news['gname'] = orgnization
        news['nwound'] = hurt_num
        news['nkill'] = death_num
        datas = []
        datas.append(news)
        self.outputer.output_execl(datas, "SputnikNews.csv", count)
        count = count + 1
        EventInfoExtract.release_module()
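
# --- Hedged sketch (not part of the original source) -------------------------
# craw() above relies on self.urls exposing add_new_urls(), has_new_url() and
# get_new_url(), but that URL manager is defined elsewhere in the project. The
# class below (the name UrlManager is hypothetical) is an assumption-based
# sketch of the minimal interface those calls imply: a de-duplicated queue of
# unvisited URLs. The real implementation may differ.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already handed out

    def add_new_urls(self, urls):
        # Accept an iterable of URLs and keep only the ones not seen before.
        if urls is None:
            return
        for url in urls:
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Hand out one pending URL and remember it so it is not crawled twice.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url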