def vectorize_files(fileroot, savepath):
    """Encode every document under ``fileroot`` and persist its vectors.

    Each file's lines are joined into one text (with the Chinese full
    stop '。'), preprocessed into sentences, and run through the
    AutoCoder.  The per-sentence vectors plus the essay-level vector are
    pickled under a filename derived from the md5 of the concatenated
    sentence tokens.

    NOTE(review): ``savepath`` is never used — output always goes to
    ``Dir.res + "/encoder/cleandata_8700/"``.  Confirm whether callers
    expect ``savepath`` to be honored before changing the target.
    """
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    print(len(data))
    # enumerate replaces the original hand-rolled `count` counter
    for count, key in enumerate(data, start=1):
        text = '。'.join(data[key])
        sens, sens_words, sens_tags = auto.preprocess(text)
        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()
        # Cache key: md5 of all sentence tokens concatenated.
        key_text = ''.join([''.join(var) for var in sens_words])
        save_key = tools.md5(key_text)
        save_object = [[list(var) for var in sens_vector], list(essay_vector)]
        tools.save_object(save_object,
                          Dir.res + "/encoder/cleandata_8700/" + save_key)
        print(count, len(data), end - start)
def save_value(self, path, key, coverage_list, relative_matrix, clues_list, entities_list):
    """Serialize one record to ``path``: ``key`` mapped to its four value lists."""
    ftools.check_filename(path)
    payload = {
        key: [coverage_list, relative_matrix, clues_list, entities_list],
    }
    tools.save_object(payload, path)
def save_value(self, path, text, coverage_list, relative_matrix, clues_list, entities_list):
    """Serialize one record to ``path``, keyed by the sentences of ``text``
    joined with the '#$#' separator."""
    ftools.check_filename(path)
    joined_key = '#$#'.join(text)
    payload = {
        joined_key: [coverage_list, relative_matrix, clues_list, entities_list],
    }
    tools.save_object(payload, path)
def analyze(main_name, compare_index, name="cleandata_small"):
    """Compare one method's per-file scores against EntryBigraph and TextRank,
    keep only files whose cached "judge" value exceeds 0.5, sort the surviving
    files by ``compare_index``, write the ranked table to disk, and return the
    ordered filenames.
    """
    judge_path = Dir.res + "/result/judge.txt"
    jude_dict = tools.load_object(judge_path)
    # print(list(jude_dict.keys())[0])
    print(len(jude_dict))

    entry_data = load_data(Dir.res + "/result/" + name + "/EntryBigraph/detials.txt")
    first_data = load_data(Dir.res + "/result/" + name + "/" + main_name + "/detials.txt")
    tr_data = load_data(Dir.res + "/result/" + name + "/TextRank/detials.txt")

    # Six pairwise score differences per file; index 0/1 are the two
    # score components stored in each detials.txt record.
    result = {}
    for key in first_data.keys():
        diff_entry_0 = first_data[key][0] - entry_data[key][0]
        diff_entry_1 = first_data[key][1] - entry_data[key][1]
        diff_tr_0 = first_data[key][0] - tr_data[key][0]
        diff_tr_1 = first_data[key][1] - tr_data[key][1]
        combined_0 = (first_data[key][0] - tr_data[key][0]
                      + entry_data[key][0] - tr_data[key][0])
        combined_1 = (first_data[key][1] - tr_data[key][1]
                      + entry_data[key][1] - tr_data[key][1])
        result[key] = [diff_entry_0, diff_entry_1, diff_tr_0, diff_tr_1,
                       combined_0, combined_1]

    news_root = Dir.res + "/" + name + "/news/"
    abst_root = Dir.res + "/" + name + "/abstract/"
    fname = ftools.get_files(news_root)

    # Keep only files whose cached judge score passes the 0.5 threshold.
    new_result = {}
    count = 0
    for filename in fname:
        # print(filename,count,len(fname))
        # news = ftools.read_lines(news_root+filename)
        # weibo = ftools.read_lines(abst_root+filename)
        # jude = data_filter(news,weibo)
        # jude_dict[filename] = jude
        jude = jude_dict[filename]
        if jude > 0.5:
            new_result[filename] = result[filename]
            new_result[filename].append(jude)
        count += 1

    # NOTE(review): jude_dict is not modified in the active code path, so
    # re-saving it is a no-op refresh of the cache file.
    tools.save_object(jude_dict, Dir.res + "/result/judge.txt")

    # Rank surviving files by the requested difference column, descending.
    ranked = dict(sorted(new_result.items(),
                         key=lambda d: d[1][compare_index],
                         reverse=True))
    save_dict = {}
    names = []
    for key in ranked.keys():
        save_dict[key] = ranked[key]
        names.append(key)

    out_path = Dir.res + "/result/" + name + "/" + main_name + ".txt"
    ftools.write_com_dict(out_path, save_dict)
    return names
def vectorize(self, sens_words, sens_tags):
    """Return ``(sentence_vectors, essay_vector)`` for the given tokens.

    Results are memoized two ways, keyed by the md5 of the concatenated
    sentence tokens: looked up in the in-memory ``self.data`` cache
    first, and on a miss the freshly computed vectors are written to the
    on-disk cache under ``Dir.res + "/encoder/cleandata_8700/"``.

    Fixes over the original:
    - the md5 was computed twice (``key`` and then an identical
      ``save_key``); it is now computed once and reused;
    - a freshly trained result is now also inserted into ``self.data``,
      so repeated calls in the same session no longer retrain.
    """
    key_text = ''.join([''.join(var) for var in sens_words])
    key = tools.md5(key_text)
    # print(key)
    if key in self.data:
        tmp = self.data[key]
    else:
        print("trainning")
        tmp0, tmp1 = self.auto.vectorize(sens_words, sens_tags)
        tmp = [tmp0, tmp1]
        # Memoize in memory so the next call with the same text is a hit.
        self.data[key] = tmp
        # Persist a plain-list copy to the on-disk cache.
        save_object = [[list(var) for var in tmp0], list(tmp1)]
        tools.save_object(save_object,
                          Dir.res + "/encoder/cleandata_8700/" + key)
    return tmp[0], tmp[1]