def filter(path=Dir.res + "/extradata/"):
    """For each document, map every abstract sentence to the news sentence
    sharing the longest common subsequence with it."""
    news_path = path + "news/"
    abstract_path = path + "abstract/"
    news_file_list = os.listdir(news_path)
    abst_file_list = os.listdir(abstract_path)

    news = [ftools.read_lines(news_path + name) for name in news_file_list]
    abstracts = [ftools.read_lines(abstract_path + name) for name in abst_file_list]

    bad_sample = []
    res = []      # per document: [filename, index of best-matching news sentence per abstract sentence]
    res_sen = []  # per document: [filename, best-matching news sentence per abstract sentence]
    for i in range(len(news)):
        matrix = [[0] * len(news[i]) for _ in range(len(abstracts[i]))]
        tmp = []
        tmp_sen = []
        try:
            for k in range(len(abstracts[i])):
                for j in range(len(news[i])):
                    matrix[k][j] = len(crpss.longest_common_subsequence(news[i][j], abstracts[i][k]))
                # pick the news sentence with the longest common subsequence
                max_index = matrix[k].index(max(matrix[k]))
                tmp.append(max_index)
                tmp_sen.append(news[i][max_index])
        except Exception:
            bad_sample.append(news_file_list[i])
        res.append([news_file_list[i]] + tmp)
        res_sen.append([news_file_list[i]] + tmp_sen)
    return res, res_sen

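# crpss.longest_common_subsequence is defined elsewhere in the repository; the
# alignment above only uses the length of whatever it returns. A minimal
# standalone sketch with the same contract (return one longest common
# subsequence of two strings) -- an assumption, not the repository's
# implementation:
def longest_common_subsequence(a, b):
    """Classic O(len(a)*len(b)) dynamic program; returns one LCS as a string."""
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            dp[i + 1][j + 1] = dp[i][j] + 1 if a[i] == b[j] else max(dp[i][j + 1], dp[i + 1][j])
    # walk back through the table to recover one optimal subsequence
    out = []
    i, j = m, n
    while i > 0 and j > 0:
        if a[i - 1] == b[j - 1]:
            out.append(a[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return ''.join(reversed(out))
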
def rouge_detail(self, abstract_processed, save_dir):
    """Write per-file ROUGE-1/ROUGE-2 scores to <save_dir>/detials.txt."""
    flist = tools.get_files(abstract_processed)
    save_content = []
    for fname in flist:
        content = tools.read_lines(abstract_processed + fname)
        reference = tools.read_lines(self.ref_processed + fname)
        lines = [line.split(" ") for line in content]
        refen = [line.split(" ") for line in reference]
        rouge1 = self.rouge_1_simple(refen, lines)
        rouge2 = self.rouge_2_simple(refen, lines)
        save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
    tools.write_list(save_dir + "/detials.txt", save_content)

def rouge_detail():
    """Spot-check ROUGE scores for a single file."""
    fname = "trainning_3570.txt"
    content = ftools.read_lines(
        Dir.res + "/result/cleandata_small/Second Version/abstract_processed/" + fname)
    reference = ftools.read_lines(Dir.res + "/cleandata_small/ref_processed/" + fname)
    lines = [line.split(" ") for line in content]
    refen = [line.split(" ") for line in reference]
    rouge1 = rouge_1_simple(refen, lines)
    rouge2 = rouge_2_simple(refen, lines)
    print(rouge1, rouge2)

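# rouge_1_simple / rouge_2_simple are not shown in this file. A minimal sketch
# of what the callers above appear to expect (n-gram recall of the candidate
# against the reference) follows; the recall-only formulation is an assumption.
# Note that full ROUGE also clips matches by candidate n-gram counts, which
# this simplified version does not do.
def _ngrams(token_lines, n):
    """Collect n-grams over a list of tokenized sentences."""
    grams = []
    for tokens in token_lines:
        grams.extend(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return grams

def rouge_n_recall(refen, lines, n):
    """ROUGE-N as recall: matched reference n-grams / total reference n-grams."""
    ref_grams = _ngrams(refen, n)
    cand_grams = set(_ngrams(lines, n))
    if not ref_grams:
        return 0.0
    matched = sum(1 for g in ref_grams if g in cand_grams)
    return matched / len(ref_grams)

# Example: rouge_n_recall([["the", "cat", "sat"]], [["the", "cat"]], 1) == 2/3
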
def train(self, dimension=200, iter=10,
          trainfile=Dir.res + "/WikiCorpus/wiki.jian.seperate.txt",
          load_model_if_exists=True):
    model_path = Dir.res + "/W2V/w2v_" + str(dimension) + ".model"
    if os.path.exists(model_path) and load_model_if_exists:
        self.model = Word2Vec.load(model_path)
        return self.model
    # one pre-segmented sentence per line, tokens separated by spaces
    for string in tools.read_lines(trainfile):
        self.corpus.append(string.split(" "))
    # gensim < 4.0 keyword names (size / iter); 4.x renamed them to vector_size / epochs
    self.model = Word2Vec(self.corpus, size=dimension, iter=iter, min_count=5)
    if not os.path.lexists(Dir.res + "/W2V/"):
        os.makedirs(Dir.res + "/W2V/")
    self.model.save(model_path)
    return self.model

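# Usage sketch. The wrapper class name W2V below is an assumption for
# illustration; on gensim >= 4.0 the constructor call above becomes
# Word2Vec(corpus, vector_size=dimension, epochs=iter, min_count=5) and
# lookups go through model.wv.
# model = W2V().train(dimension=200)
# print(model.wv.most_similar("中国", topn=5))  # nearest neighbours in embedding space
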
def craw_result_process(root=Dir.res + "/data/"):
    """Collect (abstract, news) pairs that pass the data_filter threshold."""
    files = ftools.get_files(root)
    data = []
    for filename in files:
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")  # expected record format: id,abstract,article
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data

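# tools.seperate_sentences is a repository helper used throughout these
# functions. A minimal sketch of the behaviour the callers rely on (split
# running Chinese text into sentences on end punctuation); the regex and body
# here are assumptions, not the repository's implementation:
import re

def seperate_sentences(text):
    """Split on Chinese/Western sentence-final punctuation, keep non-empty pieces."""
    parts = re.split(r"[。！？!?]", text)
    return [p.strip() for p in parts if p.strip()]
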
def load_data(path):
    lines = ftools.read_lines(path)
    data = {}
    for line in lines:
        tmp = line.split(",")
        data[tmp[0]] = [float(tmp[1]), float(tmp[2])]  # fname -> [rouge1, rouge2]
    return data

def load_word_index(self, path):
    lines = tools.read_lines(path)
    for line in lines:
        # split on the last ':' so words containing ':' survive
        index = line.rindex(":")
        self.word_index[line[:index]] = int(line[index + 1:])

def replace_words_by_num(whole_words, file_dir, save_dir):
    """Rewrite every word in the 'all' files as its index in whole_words."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filename = []

    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    content = {}
    for file in filename:
        lines = tools.read_lines(file)
        string = ""
        for line in lines:
            for word in line.split(" "):
                if len(word) > 0 and word in whole_words.keys():
                    string += str(whole_words[word]) + " "
            string = string.strip()
            string += "\n"
        content[tools.get_name(file)] = string
    for name in content:
        savepath = save_dir + name + ".txt"
        tools.write(savepath, content[name])

def load(self):
    path = Dir.res + "/cleandata_highquality_1640/abstract/"
    for name in ftools.get_files(path):
        tmp = ftools.read_lines(path + name)
        self.answer[name] = []
        for var in tmp:
            if len(var.strip()) <= 5:  # drop near-empty reference lines
                continue
            self.answer[name].append(var)

def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"
    ftools.write(detail_path, content)

def build_w2v_train_data():
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)

def load_vectorize_files(vectorize_path):
    lines = ftools.read_lines(vectorize_path)
    res = {}
    for line in lines:
        seperate_point = line.rindex("\t")
        key = line[:seperate_point]
        # everything after the tab is the vector, serialized like ['0.1','0.2',...]
        content = line[seperate_point + 1:][2:-2]
        vectors = [float(var) for var in content.split("','")]
        if key not in res.keys():
            res[key] = vectors
    return res

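# The on-disk format assumed by load_vectorize_files is one record per line:
# key + "\t" + "['v1','v2',...]". A matching writer sketch to document the
# round-trip; the helper name save_vectorize_files is hypothetical:
def save_vectorize_files(vectorize_path, res):
    with open(vectorize_path, "w", encoding="utf-8") as f:
        for key, vectors in res.items():
            serialized = "['" + "','".join(str(v) for v in vectors) + "']"
            f.write(key + "\t" + serialized + "\n")
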
def get_result(dataname="cleandata_highquality_3500"):
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:  # skip plain files; only model result directories
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        content += name + ", " + lines[1][lines[1].index("[") + 1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)

def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        for line in lines:
            # one TaggedDocument per sentence, keyed by its concatenated tokens
            words = tools.seperate(line)
            data.append(TaggedDocument(words, ["sen_" + str(count)]))
            self.sen_dict[''.join(words)] = "sen_" + str(count)
            count += 1
    return data

def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/",
             save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root + name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"  # collapse digit tokens into a single symbol
            lines.append(' '.join(words))
    ftools.write_list(save_path, lines)

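# The corpus file written above (one space-separated, digit-normalized sentence
# per line) is directly consumable by gensim's LineSentence. A usage sketch,
# assuming gensim >= 4.0 keyword names:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def train_on_corpus(corpus_path, dimension=200):
    sentences = LineSentence(corpus_path)  # streams the file line by line
    return Word2Vec(sentences, vector_size=dimension, min_count=5)
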
def get_small_data():
    root = Dir.res + "/cleandata_8700/"
    saveroot = Dir.res + "/cleandata_small/"
    flist = ftools.get_files(root + "news/")
    count = 0
    for i in range(len(flist)):
        name = flist[i]
        content = ftools.read_lines(root + "news/" + name)
        if len(content) < 80:  # keep only short articles
            print(count, i, len(flist))
            ftools.copy(root + "news/" + name, saveroot + "news/" + name)
            ftools.copy(root + "abstract/" + name, saveroot + "abstract/" + name)
            count += 1

def generate_new_data():
    npath = Dir.res + "/cleandata_highquality_3500/news/"
    new_npath = Dir.res + "/cleandata_highquality_3500_new/news/"
    new_apath = Dir.res + "/cleandata_highquality_3500_new/abstract/"
    uper = Uper()
    for name in ftools.get_files(npath):
        path = npath + name
        content = ftools.read_lines(path)
        new_abstract = uper.summarize(content, num=3, fname=name[:-4])
        ftools.copy(path, new_npath + name)
        ftools.write_list(new_apath + name, new_abstract)

def loaddata(path):
    """Tokenize every sentence of every article in a crawl file (one record per line)."""
    trainformat_sentences = []
    content = ftools.read_lines(path)
    for line in content:
        article = line[line.rindex(",") + 1:]  # article text follows the last comma
        sentences = tools.seperate_sentences(article)
        for sen in sentences:
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences

def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        essay = ""
        tmp = []
        for line in lines:
            words = tools.seperate(line)
            tmp.extend(words)
            essay += ''.join(words)
        # one TaggedDocument per article, keyed by its concatenated text
        data.append(TaggedDocument(tmp, ["text_" + str(count)]))
        self.sen_dict[essay] = "text_" + str(count)
        count += 1
    return data

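# The TaggedDocument lists produced by the two load_data variants above feed
# straight into gensim's Doc2Vec. A usage sketch; `loader` is whichever
# instance owns load_data, and the parameter values are illustrative
# assumptions:
from gensim.models.doc2vec import Doc2Vec

def train_doc2vec(loader, dimension=200):
    documents = loader.load_data()
    model = Doc2Vec(documents, vector_size=dimension, min_count=2, epochs=20)
    return model  # model.dv["text_0"] is the vector of the first article
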
def generate_data(file=Dir.res + "/extract_data_process/data_processed_9.9.txt",
                  savePath=Dir.res + "/extract_data_process/data"):
    content = tools.read_lines(file)[1:-1]
    data = {}
    for line in content:
        line = line.replace(" ", "")
        # records look like ('id','abstract','article'); spaces were stripped
        # above, so the tuple separator is "','"
        tmp = line[1:-1].split("','")
        if tmp[1] not in data.keys():
            data[tmp[1]] = tmp[2]
    index = 0
    for key in sorted(data.keys()):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
        index += 1

def result_process(file_dir, save_dir):
    """Re-segment every result file with jieba, one space-joined sentence per line."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []
    tools.get_filelist(file_dir, filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        result = [" ".join(jieba.cut(line)) for line in content]
        save_path = save_dir + "/" + name + ".txt"
        tools.write_list(save_path, result)

def build_word_index(file_dir, words_path):
    """Assign each distinct word a running integer id and persist word:id pairs."""
    filename = []

    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    whole_words = {}
    for file in filename:
        lines = tools.read_lines(file)
        for line in lines:
            for word in jieba.cut(line):
                if len(word) > 0 and word not in whole_words.keys():
                    whole_words[word] = len(whole_words)
    word_index = ""
    for word in whole_words.keys():
        word_index += word + ":" + str(whole_words[word]) + "\n"
    tools.write(words_path, word_index)
    return whole_words

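# Usage sketch tying the two halves of the pipeline together: build the
# vocabulary index once, then rewrite the same files as index sequences.
# The directory arguments are illustrative assumptions.
# whole_words = build_word_index(Dir.res + "/data/news.sentences/",
#                                Dir.res + "/data/word_index.txt")
# replace_words_by_num(whole_words, Dir.res + "/data/news.sentences/",
#                      Dir.res + "/data/num_sentences/")
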
def fill_all(path=Dir.res + "/craw_data/original/",
             save_path=Dir.res + "/craw_data/data/",
             fail_save_path=Dir.res + "/craw_data/fail/"):
    """Re-crawl article bodies for every record; split results into data/ and fail/."""
    crawer = Crawer()
    files = tools.get_files(path)
    for name in files:
        content = tools.read_lines(path + name)
        fail_content = ""
        save_content = ""
        crawer.writeIntofile(save_path + name, "")  # truncate previous output
        crawer.writeIntofile(fail_save_path + name, "")
        succ_count, fail_count = 0, 0
        for line in content:
            tmp = line.split(",")
            article = crawer.get_article(tmp[-1]).strip().replace("\n", "")
            if len(article) > 0:
                save_content += tmp[0] + "," + tmp[1] + "," + article + '\n'
                succ_count += 1
            else:
                fail_content += tmp[0] + "," + tmp[1] + "," + tmp[2] + '\n'
                fail_count += 1
        crawer.writeIntofile(save_path + name, save_content)
        crawer.writeIntofile(fail_save_path + name, fail_content)
        print(name, succ_count, fail_count)

def load_clue_words(self, path=Dir.res + "/parameter/summarization_parameter/clue_words"):
    list1 = ftools.read_lines(path)
    for var in list1:
        self.cluewords.add(var.strip())

                feed_dict={self.xl: [self.words2worvect(sens_words[i], words_bag)]})
            sens_vec.append(list(sens_i_vec)[0])
        # an all-ones bag-of-words vector stands in for the whole essay
        essay_vec = list(sess.run(self.encoder_op,
                                  feed_dict={self.xl: [[1] * len(words_bag)]})[0])
        return sens_vec, essay_vec


if __name__ == "__main__":
    path = Dir.res + "/cleandata_small/news/trainning_2788.txt"
    text = ftools.read_lines(path)
    text = '。'.join(text)
    asv = Auto_Simple_Vec()
    sens, sens_words, sens_tags = asv.preprocess(text)
    print("sens_words length", len(sens_words))
    sen_vec, essay_vec = asv.vectorize(sens_words, sens_tags)
    print(sens[0], sens[1])
    print(asv.dist.sim(sen_vec[0], sen_vec[1]))
    print(asv.dist.sim(sen_vec[0], sen_vec[-1]))
    print(asv.dist.sim(sen_vec[2], sen_vec[3]))

def filter_craw_data(data_dir=Dir.res + "/craw_data/data/", save_dir=Dir.res + "/cleandata_none"):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()
            try:
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:  # need at least id,abstract,article
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])
                tmp = get_abstract_index(news, abstracts)
                count += 1
                if len(tmp) != len(abstracts):  # every abstract sentence must align
                    continue
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:  # drop short articles
                    continue
                if sum(tmp[:3]) <= 3:  # skip trivial head-of-article summaries
                    continue
                cleandata.append([abstracts, news])
                tools.write(save_dir + "/abstract/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(abstracts))
                tools.write(save_dir + "/news/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(news))
            except Exception:
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))

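# get_abstract_index is defined elsewhere in the repository; the filter above
# only needs, for each abstract sentence, the index of its best-matching news
# sentence. A minimal sketch with that contract, reusing the LCS-alignment idea
# from the extradata filter -- an assumption, not the actual implementation
# (the real version apparently drops low-quality matches, since the caller
# compares the result length against len(abstracts)):
def get_abstract_index(news, abstracts):
    res = []
    for abst in abstracts:
        scores = [len(crpss.longest_common_subsequence(sen, abst)) for sen in news]
        res.append(scores.index(max(scores)))
    return res
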
def load_file(filepath):
    """Join the stripped lines of a file into one '。'-separated string."""
    tmp = ftools.read_lines(filepath)
    return "。".join(line.strip() for line in tmp)

    sens_vect = []
    essay_key = []
    for sen in sens:
        essay_key.extend(sen)
        vec = self.sen2v.get_sen_vec(sen)
        sens_vect.append(vec)
    essay_vector = self.doc2v.get_sen_vec(essay_key)
    return sens_vect, essay_vector


if __name__ == "__main__":
    # one-time training, if the models do not exist yet:
    # sen2v = Sen2Vec()
    # sen2v.train()
    # doc2v = Doc2Vec()
    # doc2v.train()
    sens = ftools.read_lines(Dir.res + "/cleandata_604/news/training_4.txt")
    pvdm_v = pvdm_vectorize()
    text = []
    for line in sens:
        text.append(tools.seperate(line))
    sens, essay = pvdm_v.vectorize(text)
    for ss in sens:
        print(ss)

    for var in num:
        em_vec[num[var]] = mau[var]
    for var in other:
        eo_vec[other[var]] = oau[var]
    essay_vector = en_vec + ev_vec + em_vec + eo_vec
    return sens_vecs, essay_vector


import Dir

if __name__ == "__main__":
    name = "training_4.txt"
    text_path = Dir.res + "/cleandata_604/news/" + name
    abstract_path = Dir.res + "/cleandata_604/abstract/" + name
    lines = ftools.read_lines(text_path)
    absts = ftools.read_lines(abstract_path)
    res = []
    for i in range(len(absts)):
        # index of the news sentence most similar to this abstract sentence
        max_v, max_index = 0, 0
        for j in range(len(lines)):
            v = tools.sim(absts[i], lines[j])
            if v > max_v:
                max_v = v
                max_index = j
        res.append(max_index)
    print(res)
    sens, tags = [], []
    for line in lines:

                option_score, tmp = self.summ.score_option(
                    option, coverage_list, relative_matrix, clues_list, entities_list)
                if option_score > max_value:  # keep the best-scoring sentence subset
                    best_option = option
                    max_value = option_score
            abstract = [sens[var] for var in best_option]
            return abstract
        else:
            print("using original summarizer")
            return self.summ.summarize(text, num)


if __name__ == "__main__":
    from src.tools import FileTools as ftools

    for fname in ("trainning_31.txt", "trainning_32.txt"):
        test_file = Dir.res + "/cleandata_highquality_100/news/" + fname
        text = ftools.read_lines(test_file)
        summ = FastSummarize(ASVec)
        print(summ.info)
        res = summ.summarize(text)
        for line in res:
            print(line)
