def craw_result_process(root=Dir.res + "/data/"):
    # Split every crawled line into its abstract (field 1) and news (field 2),
    # sentence-split both, and keep only the pairs that pass data_filter().
    files = ftools.get_files(root)
    data = []
    for i in range(len(files)):
        filename = files[i]
        if len(data) > 10:
            # Stop once enough samples have been collected.
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            print(abstract)
            print(news)
            jude = data_filter(news, abstract)
            if jude > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
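# Hedged usage sketch (not part of the original source): persist the pairs
# returned by craw_result_process() in the abstract/ + news/ layout used by the
# other clean datasets in this repo. The folder name "cleandata_filtered" and
# the helper name save_craw_result are hypothetical.
def save_craw_result():
    pairs = craw_result_process()
    for idx, (abstract, news) in enumerate(pairs):
        ftools.write(Dir.res + "/cleandata_filtered/abstract/trainning_" + str(idx) + ".txt", abstract)
        ftools.write(Dir.res + "/cleandata_filtered/news/trainning_" + str(idx) + ".txt", news)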
def __init__(self):
    self.auto = AutoCoder()
    self.name = "fast encoder"
    self.data = {}
    path = Dir.res + "/encoder/cleandata_8700/"
    fllist = ftools.get_files(path)
    for name in fllist:
        self.data[name] = tools.load_object(path + name)
def load(self):
    path = Dir.res + "/cleandata_highquality_1640/abstract/"
    for name in ftools.get_files(path):
        tmp = ftools.read_lines(path + name)
        self.answer[name] = []
        for var in tmp:
            if len(var.strip()) <= 5:
                continue
            self.answer[name].append(var)
def rouge_detail(self, abstract_processed, save_dir):
    # Compute per-file ROUGE-1/ROUGE-2 against the processed references and
    # write one "fname,rouge1,rouge2" row per file into detials.txt.
    flist = tools.get_files(abstract_processed)
    save_content = []
    for fname in flist:
        content = tools.read_lines(abstract_processed + fname)
        refence = tools.read_lines(self.ref_processed + fname)
        lines = [line.split(" ") for line in content]
        refen = [line.split(" ") for line in refence]
        rouge1 = self.rouge_1_simple(refen, lines)
        rouge2 = self.rouge_2_simple(refen, lines)
        save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
    tools.write_list(save_dir + "/detials.txt", save_content)
def analyze(main_name, compare_index, name="cleandata_small"):
    # Compare the per-file ROUGE details of main_name against the EntryBigraph
    # and TextRank baselines, keep only files whose cached quality score (jude)
    # exceeds 0.5, and sort them by the column selected via compare_index.
    save_path = Dir.res + "/result/judge.txt"
    jude_dict = tools.load_object(save_path)
    print(len(jude_dict))

    entry_path = Dir.res + "/result/" + name + "/EntryBigraph/detials.txt"
    entry_data = load_data(entry_path)
    first_path = Dir.res + "/result/" + name + "/" + main_name + "/detials.txt"
    first_data = load_data(first_path)
    textrank_path = Dir.res + "/result/" + name + "/TextRank/detials.txt"
    tr_data = load_data(textrank_path)

    result = {}
    for key in first_data.keys():
        # ROUGE-1/ROUGE-2 differences against each baseline, plus two combined
        # scores of main_name and EntryBigraph against TextRank.
        a = first_data[key][0] - entry_data[key][0]
        b = first_data[key][1] - entry_data[key][1]
        c = first_data[key][0] - tr_data[key][0]
        d = first_data[key][1] - tr_data[key][1]
        e = first_data[key][0] - tr_data[key][0] + entry_data[key][0] - tr_data[key][0]
        f = first_data[key][1] - tr_data[key][1] + entry_data[key][1] - tr_data[key][1]
        result[key] = [a, b, c, d, e, f]

    count = 0
    news_root = Dir.res + "/" + name + "/news/"
    abst_root = Dir.res + "/" + name + "/abstract/"
    fname = ftools.get_files(news_root)
    new_result = {}
    for filename in fname:
        # jude was originally computed with data_filter(news, weibo) and is now
        # cached in judge.txt:
        # news = ftools.read_lines(news_root + filename)
        # weibo = ftools.read_lines(abst_root + filename)
        # jude_dict[filename] = data_filter(news, weibo)
        jude = jude_dict[filename]
        if jude > 0.5:
            new_result[filename] = result[filename]
            new_result[filename].append(jude)
        count += 1
    tools.save_object(jude_dict, Dir.res + "/result/judge.txt")

    tmp = dict(sorted(new_result.items(), key=lambda d: d[1][compare_index], reverse=True))
    save_dict = {}
    names = []
    for key in tmp.keys():
        save_dict[key] = tmp[key]
        names.append(key)
    save_path = Dir.res + "/result/" + name + "/" + main_name + ".txt"
    ftools.write_com_dict(save_path, save_dict)
    return names
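# load_data() used by analyze() above is not defined in this excerpt. Based on
# how detials.txt is written by rouge_detail()/update_rouge_details()
# ("fname,rouge1,rouge2" per row), a minimal sketch of what it presumably does;
# the name load_data_sketch is hypothetical.
def load_data_sketch(path):
    data = {}
    for line in ftools.read_lines(path):
        parts = line.strip().split(",")
        if len(parts) < 3:
            continue
        data[parts[0]] = [float(parts[1]), float(parts[2])]
    return data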
def get_result(dataname="cleandata_highquality_3500"):
    # Collect, for every model folder, the text between "[" and "]" on the
    # second line of its eval_res.txt into a single result.txt summary.
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        content += name + ", " + lines[1][lines[1].index("[") + 1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    # Build one TaggedDocument per sentence and remember in self.sen_dict which
    # tag belongs to which (whitespace-free) sentence.
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        for line in lines:
            words = tools.seperate(line)
            data.append(TaggedDocument(words, ["sen_" + str(count)]))
            self.sen_dict[''.join(words)] = "sen_" + str(count)
            count += 1
    return data
def indexlize_data(self, reprocess):
    # Build the word-to-index mapping and numericalise the reference abstracts.
    print("start")
    word_index_path = self.dir_path + "/words_index.txt"
    if not tools.isexists(word_index_path) or \
            not tools.isexists(self.ref_processed) or \
            not tools.isexists(self.ref_seperate) or \
            len(tools.get_files(self.ref_seperate)) == 0 or \
            len(tools.get_files(self.ref_processed)) == 0:
        reprocess = True
    if reprocess:
        # Map every word to a numeric index.
        self.word_index = RP.build_word_index(self.file, word_index_path)
        # Numericalise the reference abstracts.
        print(self.file_ref, self.ref_seperate, self.ref_processed)
        RP.result_process(self.file_ref, self.ref_seperate)
        RP.replace_words_by_num(self.word_index, self.ref_seperate, self.ref_processed)
        print("references process done")
    else:
        self.load_word_index(word_index_path)
        print("word index loaded")
def generate_new_data():
    # Re-summarize every news file with Uper (top 3 sentences) and store the
    # result as the new abstract while copying the news file unchanged.
    npath = Dir.res + "/cleandata_highquality_3500/news/"
    # apath = Dir.res + "/cleandata_highquality_3500/abstract/"
    new_npath = Dir.res + "/cleandata_highquality_3500_new/news/"
    new_apath = Dir.res + "/cleandata_highquality_3500_new/abstract/"
    uper = Uper()
    for name in ftools.get_files(npath):
        path = npath + name
        content = ftools.read_lines(path)
        new_abstract = uper.summarize(content, num=3, fname=name[:-4])
        ftools.copy(path, new_npath + name)
        ftools.write_list(new_apath + name, new_abstract)
def get_small_data():
    root = Dir.res + "/cleandata_8700/"
    saveroot = Dir.res + "/cleandata_small/"
    flist = ftools.get_files(root + "news/")
    count = 0
    for i in range(len(flist)):
        name = flist[i]
        content = ftools.read_lines(root + "news/" + name)
        if len(content) < 80:
            print(count, i, len(flist))
            ftools.copy(root + "news/" + name, saveroot + "news/" + name)
            ftools.copy(root + "abstract/" + name, saveroot + "abstract/" + name)
            count += 1
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    # Build one TaggedDocument per article (all sentences concatenated) and
    # remember in self.sen_dict which tag belongs to which article text.
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        essay = ""
        tmp = []
        for line in lines:
            words = tools.seperate(line)
            tmp.extend(words)
            essay += ''.join(words)
        data.append(TaggedDocument(tmp, ["text_" + str(count)]))
        self.sen_dict[essay] = "text_" + str(count)
        count += 1
    return data
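# Hedged usage sketch (assumption, not in the original source): the
# TaggedDocument lists produced by either load_data() variant above can feed a
# gensim Doc2Vec model. Parameter names depend on the installed gensim version
# (older releases use "size", 4.x uses "vector_size"); the save path and the
# helper name train_doc2vec are hypothetical.
from gensim.models.doc2vec import Doc2Vec

def train_doc2vec(documents, savepath=Dir.res + "/parameter/doc2vec/d2v.model"):
    # "documents" is the return value of load_data().
    model = Doc2Vec(documents, vector_size=100, window=5, min_count=2, workers=4)
    ftools.check_filename(savepath)
    model.save(savepath)
    return model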
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    # Recompute ROUGE-1/ROUGE-2 for every processed abstract of a model and
    # rewrite its detials.txt ("fname,rouge1,rouge2" per line).
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        refence = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in refence]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"
    ftools.write(detail_path, content)
def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    # Strip crawler boilerplate from the news files: the "your browser does not
    # support the video tag" notice and everything after the
    # "News / Topics / Weibo" navigation footer.
    flist = tools.get_files(data_dir)
    for fname in flist:
        flag = False
        content = tools.read(data_dir + fname)
        if "3805" in fname:
            # Debug probe for one specific file; blocks until Enter is pressed.
            print(content)
            input()
        if "您的浏览器不支持video标签\n" in content:
            content = content.replace("您的浏览器不支持video标签\n", "")
            flag = True
        if "新闻 专题 微博" in content:
            flag = True
            content = content[:content.index("新闻 专题 微博")]
        if flag:
            print(fname)
            tools.write(data_dir + fname, content)
def fill_all(path=Dir.res + "/craw_data/original/",
             save_path=Dir.res + "/craw_data/data/",
             fail_save_path=Dir.res + "/craw_data/fail/"):
    # Re-crawl the article body for every comma-separated line (the last field
    # is passed to get_article as the URL); successful rows go to save_path,
    # failed rows keep their original third field and go to fail_save_path.
    crawer = Crawer()
    files = tools.get_files(path)
    for name in files:
        content = tools.read_lines(path + name)
        fail_content = ""
        save_content = ""
        crawer.writeIntofile(save_path + name, "")
        crawer.writeIntofile(fail_save_path + name, "")
        succ_count, fail_count = 0, 0
        for line in content:
            tmp = line.split(",")
            article = crawer.get_article(tmp[-1]).strip().replace("\n", "")
            if len(article) > 0:
                save_content += tmp[0] + "," + tmp[1] + "," + article + '\n'
                succ_count += 1
            else:
                fail_content += tmp[0] + "," + tmp[1] + "," + tmp[2] + '\n'
                fail_count += 1
        crawer.writeIntofile(save_path + name, save_content)
        crawer.writeIntofile(fail_save_path + name, fail_content)
        print(name, succ_count, fail_count)
def loaddata(path):
    # NOTE: the head of this function was missing in the excerpt; reconstructed
    # minimally so the fragment below parses, assuming each file line is split
    # into sentences and each sentence tokenised with tools.seperate().
    trainformat_sentences = []
    for line in ftools.read_lines(path):
        for sen in tools.seperate_sentences(line):
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences


def train(traindata, savepath=Dir.res + "/parameter/words_vector/w2v.model"):
    ftools.check_filename(savepath)
    # gensim < 4.0 parameter names ("size"); newer releases use "vector_size".
    model = Word2Vec(sentences=traindata, size=200, window=5, min_count=3, workers=4)
    model.save(savepath)


def load(path=Dir.res + "/parameter/words_vector/w2v.model"):
    model = Word2Vec.load(path)
    return model


if __name__ == "__main__":
    root = Dir.res + "/data/"
    flist = ftools.get_files(root)
    data = []
    count = 0
    for name in flist:
        path = root + name
        print(" %04d" % count, len(flist))
        count += 1
        data.extend(loaddata(path))
    train(data)
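# Hedged usage sketch (not in the original source): query the trained word
# vectors. The helper name nearest_words is hypothetical; any token that
# survived min_count=3 during training can be passed in.
def nearest_words(word, topn=5):
    model = load()
    # model.wv holds the word vectors; most_similar() returns (word, score) pairs.
    return model.wv.most_similar(word, topn=topn)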
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/", save_dir=Dir.res + "/cleandata_none"):
    # Rebuild the clean dataset from the crawled lines: keep a sample only if
    # every abstract sentence can be aligned to the news, the news has at least
    # 520 tokens, and the first three alignment indexes do not sum to 3 or less.
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()
            if 1:  # the try/except around this block was disabled during debugging
                # Split on the first and the last comma to recover the three
                # fields (id, abstract, news).
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])
                tmp = get_abstract_index(news, abstracts)
                count += 1
                if len(tmp) != len(abstracts):
                    continue
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:
                    continue
                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(save_dir + "/abstract/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(abstracts))
                tools.write(save_dir + "/news/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(news))
    print(count, len(bad_sample), len(cleandata))