def replace_words_by_num(whole_words, file_dir, save_dir):
    """Rewrite tokenized files as sequences of word ids using the
    whole_words mapping, writing one output file per input file.
    Words missing from the mapping are silently dropped."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filename = []

    # Only process files whose names contain "all".
    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    content = {}
    for file in filename:
        lines = tools.read_lines(file)
        string = ""
        for line in lines:
            for word in line.split(" "):
                if len(word) > 0 and word in whole_words:
                    string += str(whole_words[word]) + " "
            string = string.strip()
            string += "\n"
        content[tools.get_name(file)] = string
    for name in content:
        savepath = save_dir + name + ".txt"
        tools.write(savepath, content[name])

def save_data(data, save_root):
    """Write each (abstract, news) pair to parallel files. The
    "trainning_" prefix is kept as-is to match existing dataset names."""
    news_root = save_root + "/news/"
    abst_root = save_root + "/abstract/"
    for i in range(len(data)):
        fname = "trainning_" + str(i) + ".txt"
        ftools.write(abst_root + fname, data[i][0])  # data[i][0]: abstract text
        ftools.write(news_root + fname, data[i][1])  # data[i][1]: news text

def get_result(dataname="cleandata_highquality_3500"):
    """Collect the bracketed scores from each model's eval_res.txt under
    the result directory and merge them into a single result.txt."""
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:  # skip plain files; only model sub-directories matter
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        # The second line holds the scores in the form "...[s1, s2, ...]".
        content += name + ", " + lines[1][lines[1].index("[") + 1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(root + "result.txt", content)

def get_clue_words(path=Dir.res + "/extradata/",
                   savepath=Dir.res + "/parameter/summarization_parameter/clue_words",
                   word_index=3):
    """Count how often each word appears among the first `word_index`
    tokens of non-leading sentences, and save the counts as clue words."""
    _, res_sen = filter(path)  # `filter` is this module's own helper, not the builtin
    words = {}
    for var in res_sen:
        for sen in var[1:]:  # skip the first sentence of each sample
            ws = tools.seperate(sen)
            for w in ws[:word_index]:
                if w not in words:
                    words[w] = 0
                words[w] += 1
    content = ""
    for w in words:
        content += w + "," + str(words[w]) + "\n"
    ftools.write(savepath + str(word_index), content)

def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    """Recompute per-file ROUGE-1/ROUGE-2 between processed references and
    a model's processed abstracts, and rewrite the details file."""
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"  # (sic)
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"
    ftools.write(detail_path, content)

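# A minimal sketch of a recall-style ROUGE-1 over the list-of-token-lists
# inputs used above, for reference only; the repo's rouge_1_simple and
# rouge_2_simple are defined elsewhere and may weight things differently.
from collections import Counter

def rouge_1_sketch(refs, cands):
    # Clipped unigram overlap divided by the reference unigram count.
    ref_counts = Counter(w for sen in refs for w in sen if w)
    cand_counts = Counter(w for sen in cands for w in sen if w)
    overlap = sum(min(c, cand_counts[w]) for w, c in ref_counts.items())
    total = sum(ref_counts.values())
    return overlap / total if total else 0.0
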
def craw_urls(self):
    # Resume crawling from page 372 (earlier pages were already fetched).
    start = 372
    for i in range(start, self.page_nums):
        request = Request.Request(self.url + str(i))
        for key in self.params.keys():
            request.add_header(key, self.params[key])
        response = Request.urlopen(request)
        html = response.read().decode('utf-8')
        infos = re.findall(self.url_regex, html)
        save_content = ""
        for info in infos:
            new_url = self.url_unqoate(info[-1])  # (sic) decodes the crawled href
            new_infor = [info[0], info[1], info[-1], new_url]
            save_content += self.seperator.join(new_infor) + "\n"
        tools.check_build_file(self.url_file)
        tools.write(self.url_file, content=save_content, mode="a")
        print(i, len(infos))

def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    """Strip crawler residue from news files: a "your browser does not
    support the video tag" placeholder, and the trailing nav-bar block
    starting with "新闻 专题 微博" ("News / Topics / Weibo")."""
    flist = tools.get_files(data_dir)
    for fname in flist:
        flag = False
        content = tools.read(data_dir + fname)
        if "您的浏览器不支持video标签\n" in content:
            content = content.replace("您的浏览器不支持video标签\n", "")
            flag = True
        if "新闻 专题 微博" in content:
            flag = True
            content = content[:content.index("新闻 专题 微博")]
        if flag:
            print(fname)
            tools.write(data_dir + fname, content)

def build_word_index(file_dir, words_path):
    """Build a word -> id mapping from every file whose name contains
    "all", save it as "word:id" lines, and return the mapping."""
    filename = []

    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    whole_words = {}
    for file in filename:
        lines = tools.read_lines(file)
        for line in lines:
            for word in jieba.cut(line):
                if len(word) > 0 and word not in whole_words:
                    # Ids are assigned in order of first appearance.
                    whole_words[word] = len(whole_words)
    word_index = ""
    for word in whole_words:
        word_index += word + ":" + str(whole_words[word]) + "\n"
    tools.write(words_path, word_index)
    return whole_words

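# Hypothetical helper (not in the original repo): load_word_index reverses
# the "word:id" format written above, and the commented lines below sketch
# the intended pipeline with replace_words_by_num; the paths are illustrative.
def load_word_index(words_path):
    whole_words = {}
    for line in tools.read_lines(words_path):
        if ":" not in line:
            continue
        # rsplit guards against words that themselves contain ":".
        word, idx = line.rsplit(":", 1)
        whole_words[word] = int(idx)
    return whole_words

# whole_words = build_word_index(Dir.res + "/data/", Dir.res + "/word_index.txt")
# replace_words_by_num(whole_words, Dir.res + "/data/", Dir.res + "/data_processed/")
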
def craw_url(self, page_index, save_path):
    """Crawl one page of weibo.cn breaking news, resolve each link's
    redirect target, and append "title<TAB>text<TAB>url" lines."""
    url = "http://weibo.cn/breakingnews?page=" + str(page_index)
    # Flat [key, value, key, value, ...] list of request headers,
    # including the logged-in session cookie.
    header = ['Host', 'weibo.cn',
              'User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
              'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
              'Cookie', 'SCF=Ah6oK9ne4mmUNoYw4kUuNRmslSDJZqMC8SFA5i4tUHBOxdAcSzsIBEEOZfx3fQNj0BgpLdQSDXoBtnymKFxl8KA.; SUHB=0z1B6sSFzJ07wI; _T_WM=7fe561e14961c07e54388eb18a1b0902; SUB=_2A2502RrGDeRhGedG6loS-SbLzzuIHXVUJaaOrDV6PUJbkdANLVmtkW0WkE6llUm_KXMeRq22wEZ0nvVBRQ..; SSOLoginState=1507682966',
              'DNT', '1',
              'Connection', 'keep-alive',
              'Upgrade-Insecure-Requests', '1']
    request = Request.Request(url)
    for i in range(0, len(header) - 1, 2):
        request.add_header(header[i], header[i + 1])
    response = Request.urlopen(request)
    html = response.read().decode('utf-8')
    # Capture 【title】text <a href="url">, trimming trailing Chinese characters.
    regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
    infos = re.findall(regex, html)
    save_content = ""
    for info in infos:
        new_url = self.url_unqoate(info[-1])
        # Follow one redirect hop via a HEAD request to get the final URL.
        reheader = requests.head(new_url).headers
        reurl = reheader["Location"] if "Location" in reheader else new_url
        # Skip picture and video links ("vedio" (sic) matches the site's URLs).
        if "pic" in reurl or "vedio" in reurl:
            continue
        save_content += '\t'.join([info[0], info[1], reurl]) + "\n"
    tools.check_build_file(save_path)
    tools.write(save_path, content=save_content, mode="a")
    return len(infos)

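# Hedged sketch of url_unqoate (defined elsewhere in the crawler; the real
# method may differ): undo HTML entity escaping and percent-encoding so the
# crawled href can be requested directly.
from urllib.parse import unquote

def url_unqoate_sketch(raw_url):
    return unquote(raw_url.replace("amp;", ""))
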
def extract_info(self, string, index):
    """Extract 【title】text/url triples from a crawled page, fetch each
    article body, and persist results plus any failed URLs for re-crawling."""
    regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
    infos = re.findall(regex, string)
    result = []
    un_crawe_info = []
    try_count = 3
    for info in infos:
        if len(info) != 3:
            continue
        all_url = info[-1].replace("amp;", "")
        # Fetch the page, retrying up to try_count times on empty responses.
        content = self.url(all_url)
        try_time = 0
        while (content is None or len(content) == 0) and try_time < try_count:
            time.sleep(0.1)
            content = self.url(all_url)
            try_time += 1
        if content is None or len(content) == 0:
            continue
        content = content.replace("\n", "")
        origin = self.get_origin_text(content).replace("\n", "")
        # Treat very short bodies as extraction failures and record them
        # for a later re-crawl of this page.
        if len(origin) < 100:
            if index not in self.fail_pages:
                un_crawe_info.append([info[0], info[1], info[-1]])
            continue
        result.append([info[0], info[1], origin])
    # Persist the URLs that could not be extracted, once per page.
    if len(un_crawe_info) > 0 and index not in self.fail_pages:
        print("fail", len(un_crawe_info))
        fail_info = ""
        for tmp_fail in un_crawe_info:
            fail_info += str(tmp_fail) + "\n"
        tools.write(self.canot_crawe_info + "page" + str(index), fail_info, mode="w")
        self.fail_pages.add(index)
    string = ""
    count = 0
    for info in result:
        count += 1
        string += str(info) + "\n"
    if len(string) > 0:
        tools.write(self.craw_result + "page" + str(index), string)
    return count, len(infos)

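# The retry pattern above, factored into a reusable helper as a hedged
# sketch (hypothetical; not part of the original class). `fetch` stands in
# for any callable like self.url that returns page text or None.
def fetch_with_retry(fetch, url, tries=3, delay=0.1):
    for _ in range(tries):
        content = fetch(url)
        if content:
            return content.replace("\n", "")
        time.sleep(delay)
    return None
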
def evaluator_rouge(self, model, result_dir, num):
    summarize_result = {}
    astart = time.time()
    ### Save the model's summarization results
    abstract = result_dir + "/abstract/"
    keys = sorted(self.data.keys())
    if self.parall:
        # Shard the documents evenly across self.cpu worker processes;
        # each worker writes its own abstracts under `abstract`.
        p = multiprocessing.Pool(self.cpu)
        inter = int(len(keys) / self.cpu) + 1
        args = []
        for i in range(self.cpu):
            tmp = {}
            if i == 0:
                key = keys[:inter]
            elif i == self.cpu - 1:
                key = keys[i * inter:]
            else:
                key = keys[i * inter:(i + 1) * inter]
            for k in key:
                tmp[k] = self.data[k]
            args.append({"n": "work" + str(i), "d": tmp, "m": model, "a": abstract})
        p.map(workers, args)
        ### Post-process the abstracts (map words to numeric ids)
        abstract_processed = result_dir + "/abstract_processed/"
        abstract_seperate = result_dir + "/abstract_seperate/"
        RP.result_process(abstract, abstract_seperate)
        print("abstract separate done")
        RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
        print("abstract replace done")
        self.rouge_detail(abstract_processed, result_dir)
        ### Compute ROUGE
        result = self.rouge.eval(abstract_processed, self.ref_processed, num)
        eval_result = result_dir + "/eval_res.txt"
        print(result)
        tools.write(eval_result, model.info + "\n" + result, mode="a")
        aend = time.time()
        print(aend - astart)
    else:
        count = 0
        for text in keys:
            if text not in summarize_result:
                start = time.time()
                count += 1
                summarize_result[text] = model.summarize(self.data[text], num, fname=text)
                end = time.time()
                print(text, count, "/", len(keys), end - start)
            tools.write_list(abstract + text + ".txt", summarize_result[text])
        ### Post-process the abstracts (map words to numeric ids)
        abstract_processed = result_dir + "/abstract_processed/"
        abstract_seperate = result_dir + "/abstract_seperate/"
        RP.result_process(abstract, abstract_seperate)
        RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
        self.rouge_detail(abstract_processed, result_dir)
        ### Compute ROUGE
        result = self.rouge.eval(abstract_processed, self.ref_processed, num)
        eval_result = result_dir + "/eval_res.txt"
        print(result)
        tools.write(eval_result, model.info + "\n" + result, mode="a")
        aend = time.time()
        print(aend - astart)

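# Hedged sketch of the `workers` pool function referenced above (the real
# one is defined elsewhere): each worker summarizes its shard "d" with model
# "m" and writes abstracts under "a". The summary-length parameter is
# presumably carried by the model or fixed elsewhere, so it is omitted here.
def workers_sketch(arg):
    out = {}
    for fname in sorted(arg["d"].keys()):
        summary = arg["m"].summarize(arg["d"][fname], fname=fname)
        tools.write_list(arg["a"] + fname + ".txt", summary)
        out[fname] = summary
    return out
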
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/", save_dir=Dir.res + "/cleandata_none"):
    """Filter crawled "title,abstract,news" lines into abstract/news file
    pairs, keeping only samples that align well and are long enough."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        lines = tools.read_lines(data_dir + fname)
        for line in lines:
            line = line.strip()
            try:
                # Split on the first and last comma: [title, abstract, news].
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])
                abst_index = get_abstract_index(news, abstracts)
                count += 1
                # Every abstract sentence must align to some news sentence.
                if len(abst_index) != len(abstracts):
                    continue
                # Drop short articles (fewer than 520 tokens).
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:
                    continue
                # Drop trivial samples whose first abstract sentences all map
                # to the very beginning of the article.
                if sum(abst_index[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(save_dir + "/abstract/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(abstracts))
                tools.write(save_dir + "/news/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(news))
            except Exception as e:
                print("error", str(e), line)
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))

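# Hedged sketch of get_abstract_index (defined elsewhere): for each abstract
# sentence, find the news sentence with the largest token overlap and record
# its index; sentences with no overlap are dropped, which is why the caller
# compares len(abst_index) with len(abstracts). The real alignment may differ.
def get_abstract_index_sketch(news, abstracts):
    indices = []
    news_tokens = [set(tls.seperate(sen)) for sen in news]
    for abst in abstracts:
        a_words = set(tls.seperate(abst))
        best_i, best_overlap = -1, 0
        for j, tokens in enumerate(news_tokens):
            overlap = len(a_words & tokens)
            if overlap > best_overlap:
                best_i, best_overlap = j, overlap
        if best_i >= 0:
            indices.append(best_i)
    return indices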