def craw_result_process(root=Dir.res + "/data/"):
    # Split each crawled (abstract, news) record into sentences and keep only
    # the pairs whose data_filter score is above 0.5.
    files = ftools.get_files(root)
    data = []
    for i in range(len(files)):
        filename = files[i]
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            print(abstract)
            print(news)
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
def wiki_preprocess(self, save_path=Dir.res + "/WikiCorpus/wiki.jian.seperate.txt"):
    # Sentence-split the raw wiki corpus, segment each sentence with jieba and
    # append the space-joined tokens to save_path in batches of 5000 lines.
    tmp_result = []
    index = 0
    regex = r"。。。。。。|?|。|!|;|\.\.\.\.\.\."
    with open(self.train_file, "r") as train_corpus:
        for line in train_corpus:
            sentences = re.split(regex, line)
            for sen in sentences:
                words = list(jieba.cut(sen.strip()))
                new_line = ' '.join(words)
                tmp_result.append(new_line)
                if len(tmp_result) == 5000:
                    tools.write_list(save_path, tmp_result, mode="a")
                    index += 1
                    tmp_result = []
    # flush the remaining lines
    tools.write_list(save_path, tmp_result, mode="a")
def save_value(self, path, key, coverage_list, relative_matrix, clues_list, entities_list):
    ftools.check_filename(path)
    save_dict = {key: [coverage_list, relative_matrix, clues_list, entities_list]}
    tools.save_object(save_dict, path)
def save_value(self, path, text, coverage_list, relative_matrix, clues_list, entities_list):
    ftools.check_filename(path)
    save_dict = {'#$#'.join(text): [coverage_list, relative_matrix, clues_list, entities_list]}
    tools.save_object(save_dict, path)
def save_data(data, save_root):
    # Write each (abstract, news) pair into parallel files under save_root.
    news_root = save_root + "/news/"
    abst_root = save_root + "/abstract/"
    for i in range(len(data)):
        fname = "trainning_" + str(i) + ".txt"
        ftools.write(abst_root + fname, data[i][0])
        ftools.write(news_root + fname, data[i][1])
def train(traindata, savepath=Dir.res + "/parameter/words_vector/w2v.model"):
    # Train a 200-dimensional Word2Vec model on the tokenised sentences.
    ftools.check_filename(savepath)
    model = Word2Vec(sentences=traindata, size=200, window=5, min_count=3, workers=4)
    model.save(savepath)
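# Usage sketch for train() above (an illustration, not part of the original
# pipeline). It assumes the corpus file holds one pre-segmented, space-joined
# sentence per line (e.g. the output of transfer() below) and that a gensim
# version accepting the `size` keyword is installed.
def _demo_train_w2v(corpus_path=Dir.res + "/sen_data/1189_corpus.txt"):
    from gensim.models import Word2Vec  # same dependency as train() above
    sentences = [line.split(" ") for line in ftools.read_lines(corpus_path)]
    train(sentences)
    model = Word2Vec.load(Dir.res + "/parameter/words_vector/w2v.model")
    # nearest neighbours of a sample word; any in-vocabulary word works here
    print(model.wv.most_similar("新闻", topn=5))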
def load(self):
    # Load the reference abstracts, skipping sentences of five characters or fewer.
    path = Dir.res + "/cleandata_highquality_1640/abstract/"
    for name in ftools.get_files(path):
        tmp = ftools.read_lines(path + name)
        self.answer[name] = []
        for var in tmp:
            if len(var.strip()) <= 5:
                continue
            self.answer[name].append(var)
def build_w2v_train_data():
    # Collect every sentence file under news.sentences into one corpus file.
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)
def filter(path=Dir.res + "/extradata/"):
    # For every news/abstract pair, align each abstract sentence with the news
    # sentence sharing the longest common subsequence with it. Returns the
    # aligned sentence indices and the aligned sentences, each list prefixed
    # with the file name. Files that fail are recorded in bad_sample.
    news_path = path + "news/"
    abstract_path = path + "abstract/"
    news_file_list = os.listdir(news_path)
    abst_file_list = os.listdir(abstract_path)
    bad_sample = []
    news = []
    for name in news_file_list:
        news.append(ftools.read_lines(news_path + name))
    abstracts = []
    for name in abst_file_list:
        abstracts.append(ftools.read_lines(abstract_path + name))
    res = []
    res_sen = []
    for i in range(len(news)):
        matrix = [[0 for var in range(len(news[i]))] for var in range(len(abstracts[i]))]
        tmp = []
        tmp_sen = []
        try:
            for k in range(len(abstracts[i])):
                for j in range(len(news[i])):
                    matrix[k][j] = len(crpss.longest_common_subsequence(news[i][j], abstracts[i][k]))
                max_index = matrix[k].index(max(matrix[k]))
                tmp.append(max_index)
                tmp_sen.append(news[i][max_index])
        except:
            bad_sample.append(news_file_list[i])
        res.append([news_file_list[i]] + tmp)
        res_sen.append([news_file_list[i]] + tmp_sen)
    return res, res_sen
def rouge_detail(self, abstract_processed, save_dir):
    # Compute per-file ROUGE-1/ROUGE-2 scores and write them to detials.txt.
    flist = tools.get_files(abstract_processed)
    save_content = []
    for fname in flist:
        content = tools.read_lines(abstract_processed + fname)
        reference = tools.read_lines(self.ref_processed + fname)
        lines = [line.split(" ") for line in content]
        refen = [line.split(" ") for line in reference]
        rouge1 = self.rouge_1_simple(refen, lines)
        rouge2 = self.rouge_2_simple(refen, lines)
        save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
    tools.write_list(save_dir + "/detials.txt", save_content)
def get_result(dataname="cleandata_highquality_3500"):
    # Collect the evaluation score of every model directory into result.txt.
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        content += name + ", " + lines[1][lines[1].index("[") + 1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)
def analyze(main_name, compare_index, name="cleandata_small"):
    # Compare the per-file ROUGE details of main_name against the EntryBigraph
    # and TextRank baselines, keep only files whose cached judgement is above
    # 0.5 and write the differences sorted by compare_index.
    save_path = Dir.res + "/result/judge.txt"
    judge_dict = tools.load_object(save_path)
    print(len(judge_dict))
    entry_path = Dir.res + "/result/" + name + "/EntryBigraph/detials.txt"
    entry_data = load_data(entry_path)
    first_path = Dir.res + "/result/" + name + "/" + main_name + "/detials.txt"
    first_data = load_data(first_path)
    textrank_path = Dir.res + "/result/" + name + "/TextRank/detials.txt"
    tr_data = load_data(textrank_path)
    result = {}
    for key in first_data.keys():
        a = first_data[key][0] - entry_data[key][0]
        b = first_data[key][1] - entry_data[key][1]
        c = first_data[key][0] - tr_data[key][0]
        d = first_data[key][1] - tr_data[key][1]
        e = first_data[key][0] - tr_data[key][0] + entry_data[key][0] - tr_data[key][0]
        f = first_data[key][1] - tr_data[key][1] + entry_data[key][1] - tr_data[key][1]
        result[key] = [a, b, c, d, e, f]
    count = 0
    news_root = Dir.res + "/" + name + "/news/"
    abst_root = Dir.res + "/" + name + "/abstract/"
    fname = ftools.get_files(news_root)
    new_result = {}
    for filename in fname:
        # judge values were originally computed with data_filter(news, weibo)
        # and cached in judge.txt
        judge = judge_dict[filename]
        if judge > 0.5:
            new_result[filename] = result[filename]
            new_result[filename].append(judge)
        count += 1
    tools.save_object(judge_dict, Dir.res + "/result/judge.txt")
    tmp = dict(sorted(new_result.items(), key=lambda d: d[1][compare_index], reverse=True))
    save_dict = {}
    names = []
    for key in tmp.keys():
        save_dict[key] = tmp[key]
        names.append(key)
    save_path = Dir.res + "/result/" + name + "/" + main_name + ".txt"
    ftools.write_com_dict(save_path, save_dict)
    return names
def get_dirfiles_into_list_luhn(file_dir, replace_dir):
    # Group files by their name with the first eight characters stripped;
    # optionally map them to paths under replace_dir instead.
    file_list, result = [], {}
    tools.get_filelist(file_dir, file_list)
    for listfile in file_list:
        filename = tools.get_name(listfile)
        filename = filename[8:]
        if filename not in result.keys():
            result[filename] = []
        if replace_dir == "":
            result[filename].append(listfile)
        else:
            result[filename].append(str(replace_dir + "/" + tools.get_name(listfile) + ".txt"))
    return result
def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/", save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    # Segment every news sentence, replace digit tokens with "num" and write
    # one space-joined sentence per line as a word-vector training corpus.
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root + name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"
            lines.append(' '.join(words))
    ftools.write_list(save_path, lines)
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    # Build one TaggedDocument per sentence; sen_dict maps the joined words
    # of a sentence back to its "sen_<i>" tag.
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        for line in lines:
            words = tools.seperate(line)
            data.append(TaggedDocument(words, ["sen_" + str(count)]))
            self.sen_dict[''.join(words)] = "sen_" + str(count)
            count += 1
    return data
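# Illustrative sketch (not part of the original pipeline) of how the
# TaggedDocument list returned by load_data() could feed a gensim Doc2Vec
# model. `vector_size`/`epochs` are gensim 3.x-style keyword names and the
# docvecs lookup may need adapting to the gensim version used elsewhere.
def _demo_train_sentence_vectors(loader):
    from gensim.models.doc2vec import Doc2Vec  # assumed available alongside TaggedDocument
    documents = loader.load_data()
    model = Doc2Vec(documents, vector_size=200, min_count=2, epochs=20, workers=4)
    # sen_dict maps a sentence (its joined words) to the "sen_<i>" tag, so a
    # trained sentence vector can be looked up through the document vectors.
    tag = loader.sen_dict[''.join(documents[0].words)]
    return model.docvecs[tag]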
def rouge_detail():
    # Spot-check the ROUGE-1/ROUGE-2 scores of a single processed file.
    fname = "trainning_3570.txt"
    content = ftools.read_lines(
        Dir.res + "/result/cleandata_small/Second Version/abstract_processed/" + fname)
    reference = ftools.read_lines(Dir.res + "/cleandata_small/ref_processed/" + fname)
    lines = [line.split(" ") for line in content]
    refen = [line.split(" ") for line in reference]
    rouge1 = rouge_1_simple(refen, lines)
    rouge2 = rouge_2_simple(refen, lines)
    print(rouge1, rouge2)
def load_word_index(self, path):
    # Each line is "<word>:<index>"; split on the last colon so words that
    # themselves contain a colon are handled correctly.
    lines = tools.read_lines(path)
    for line in lines:
        index = line.rindex(":")
        self.word_index[line[:index]] = int(line[index + 1:])
def vectorize_files(fileroot, savepath):
    # Encode every document with the AutoCoder and cache the sentence and
    # essay vectors on disk, keyed by the md5 of the segmented text.
    # Note: the cache directory is hard-coded; the savepath argument is unused.
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    count = 0
    print(len(data.keys()))
    for key in data.keys():
        text = '。'.join(data[key])
        sens, sens_words, sens_tags = auto.preprocess(text)
        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()
        key_text = ''.join([''.join(var) for var in sens_words])
        save_key = tools.md5(key_text)
        tmp = [list(var) for var in sens_vector]
        save_object = [tmp, list(essay_vector)]
        tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + save_key)
        count += 1
        print(count, len(data.keys()), end - start)
def train(self, dimension=200, iter=10, trainfile=Dir.res + "WikiCorpus/wiki.jian.seperate.txt",
          load_model_if_exits=True):
    # Train (or reload) a Word2Vec model on the pre-segmented wiki corpus.
    model_path = Dir.res + "/W2V/w2v_" + str(dimension) + ".model"
    if os.path.exists(model_path) and load_model_if_exits:
        self.model = Word2Vec.load(model_path)
        return self.model
    tmp = tools.read_lines(trainfile)
    for string in tmp:
        words = string.split(" ")
        self.corpus.append(words)
    self.model = Word2Vec(self.corpus, size=dimension, iter=iter, min_count=5)
    if not os.path.lexists(Dir.res + "W2V/"):
        os.makedirs(Dir.res + "W2V/")
    # save to the same path that is checked above
    self.model.save(model_path)
    return self.model
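# Illustrative usage of the train() method above (not part of the original
# code): `trainer` is assumed to be an instance of the class that defines
# train() and whose __init__ creates an empty self.corpus list.
def _demo_query_wiki_w2v(trainer):
    model = trainer.train(dimension=200, iter=10)
    # nearest neighbours and pairwise similarity for sample in-vocabulary words
    print(model.wv.most_similar("新闻", topn=5))
    print(model.wv.similarity("新闻", "消息"))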
def load_data(path):
    # Parse a detials.txt file into {filename: [rouge1, rouge2]}.
    lines = ftools.read_lines(path)
    data = {}
    for line in lines:
        tmp = line.split(",")
        data[tmp[0]] = [float(tmp[1]), float(tmp[2])]
    return data
def clean(path=Dir.res + "/extradata/", save=Dir.res + "/cleandata_1073/"):
    # Keep the samples whose alignment indices do not include both sentence 0
    # and sentence 1, then copy them into the clean-data directory.
    res, _ = filter(path)  # filter() returns (indices, sentences); only the indices are needed
    clean_data = []
    for tmp in res:
        if 0 not in tmp or 1 not in tmp:
            clean_data.append(tmp)
    for cd in clean_data:
        if cd[0] == "training_288.txt":
            print("skip------------------")
            continue
        print(cd[0])
        news_path = save + "news/" + cd[0]
        abstract_path = save + "abstract/" + cd[0]
        ftools.copy(path + "news/" + cd[0], news_path)
        ftools.copy(path + "abstract/" + cd[0], abstract_path)
def __init__(self):
    self.auto = AutoCoder()
    self.name = "fast encoder"
    self.data = {}
    # preload every cached vector produced by vectorize_files()
    path = Dir.res + "/encoder/cleandata_8700/"
    fllist = ftools.get_files(path)
    for name in fllist:
        self.data[name] = tools.load_object(path + name)
def generate_data(file=Dir.res + "/extract_data_process/data_processed_9.9.txt",
                  savePath=Dir.res + "/extract_data_process/data"):
    # Each line is a list-like record whose second field is the abstract and
    # third field is the news text; de-duplicate on the abstract and write
    # sentence-split news/abstract files.
    content = tools.read_lines(file)[1:-1]
    data = {}
    for record in content:
        record = record.replace(" ", "")
        tmp = str(record[1:-1]).split("', '")
        if tmp[1] not in data.keys():
            data[tmp[1]] = tmp[2]
    index = 0
    for key in sorted(data.keys()):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
        index += 1
def demo():
    summarizor = Summarizor_luhn()
    essay = tools.read(Dir.resource + "\\extradata\\luhn\\training20.txt")
    result = summarizor.summarize(essay=essay)
    print("========================")
    for line in result:
        print(line)
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    # Build one TaggedDocument per document (all its words concatenated);
    # sen_dict maps the raw essay text to its "text_<i>" tag.
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        essay = ""
        tmp = []
        for line in lines:
            words = tools.seperate(line)
            tmp.extend(words)
            essay += ''.join(words)
        data.append(TaggedDocument(tmp, ["text_" + str(count)]))
        self.sen_dict[essay] = "text_" + str(count)
        count += 1
    return data
def get_clue_words(path=Dir.res + "/extradata/",
                   savepath=Dir.res + "/parameter/summarization_parameter/clue_words",
                   word_index=3):
    # Count how often each of the first word_index words of an aligned news
    # sentence occurs, and save the counts as candidate clue words.
    _, res_sen = filter(path)
    words = {}
    for var in res_sen:
        for sen in var[1:]:
            ws = tools.seperate(sen)
            for w in ws[:word_index]:
                if w not in words.keys():
                    words[w] = 0
                words[w] += 1
    content = ""
    for w in words.keys():
        content += w + "," + str(words[w]) + "\n"
    ftools.write(savepath + str(word_index), content)
def read_file(self, dir):
    # Read every file under dir, split it into sentences, and build both the
    # file -> sentences mapping and the sentence -> file-names reverse index.
    filelist = []
    tools.get_filelist(dir, filelist)
    data = {}
    reverse_data = {}
    filelist = sorted(filelist)
    for filename in filelist:
        with open(filename, mode="r", encoding="utf-8") as file:
            content = file.read()
        sentences = self.seperate_sentences(content)
        data[filename] = sentences
        for sen in sentences:
            if sen not in reverse_data.keys():
                reverse_data[sen] = [tools.get_name(filename)]
            else:
                reverse_data[sen].append(tools.get_name(filename))
    return data, reverse_data
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    # Recompute the per-file ROUGE-1/ROUGE-2 scores for one model and rewrite
    # its detials.txt.
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"
    ftools.write(detail_path, content)
def load_vectorize_files(vectorize_path):
    # Each line is "<key>\t<vector string>"; strip the wrapping characters
    # from the vector part and parse it back into a list of floats.
    lines = ftools.read_lines(vectorize_path)
    res = {}
    for line in lines:
        seperate_point = line.rindex("\t")
        key = line[:seperate_point]
        content = line[seperate_point + 1:][2:-2]
        vectors = [float(var) for var in content.split("','")]
        if key not in res.keys():
            res[key] = vectors
    return res
def craw_urls(self):
    # Crawl the listing pages starting from page 372, extract article info
    # with url_regex and append the decoded URLs to the url file.
    start = 372
    for i in range(start, self.page_nums):
        request = Request.Request(self.url + str(i))
        for key in self.params.keys():
            request.add_header(key, self.params[key])
        response = Request.urlopen(request)
        html = response.read()
        html = html.decode('utf-8')
        infos = re.findall(self.url_regex, html)
        save_content = ""
        for info in infos:
            new_url = self.url_unqoate(info[-1])
            new_infor = [info[0], info[1], info[-1], new_url]
            save_content += self.seperator.join(new_infor) + "\n"
        tools.check_build_file(self.url_file)
        tools.write(self.url_file, content=save_content, mode="a")
        print(i, len(infos))