def get_clean_data(filenames, newname, name="cleandata_small"):
    """Copy the news/abstract files listed in *filenames* from the
    source dataset directory *name* into a fresh dataset *newname*.

    Any pre-existing data or result directories for *newname* are
    removed first so the copy starts from a clean slate.

    Parameters:
        filenames: iterable of file names to copy.
        newname:   name of the destination dataset directory.
        name:      name of the source dataset directory.
    """
    import shutil

    nroot = Dir.res + "/" + name + "/news/"
    aroot = Dir.res + "/" + name + "/abstract/"
    # Drop any previous output for `newname` before copying.
    if ftools.isexists(Dir.res + "/" + newname + "/"):
        shutil.rmtree(Dir.res + "/" + newname + "/")
    if ftools.isexists(Dir.res + "/result/" + newname + "/"):
        shutil.rmtree(Dir.res + "/result/" + newname + "/")
    snroot = Dir.res + "/" + newname + "/news/"
    saroot = Dir.res + "/" + newname + "/abstract/"
    # NOTE: the original loop variable shadowed the `name` parameter;
    # renamed to `filename` to avoid the latent bug.
    for count, filename in enumerate(filenames, start=1):
        print(count, len(filenames))  # simple progress indicator
        ftools.copy(nroot + filename, snroot + filename)
        ftools.copy(aroot + filename, saroot + filename)
def indexlize_data(self, reprocess):
    """Build or load the word-to-number index.

    When rebuilding, also re-run the numeric conversion of the
    reference abstracts; otherwise load the saved index from disk.
    """
    print("start")
    word_index_path = self.dir_path + "/words_index.txt"
    # Force a rebuild when any required artifact is missing or empty.
    artifacts_missing = (
        not tools.isexists(word_index_path)
        or not tools.isexists(self.ref_processed)
        or not tools.isexists(self.ref_seperate)
        or len(tools.get_files(self.ref_seperate)) == 0
        or len(tools.get_files(self.ref_processed)) == 0
    )
    if artifacts_missing:
        reprocess = True
    if reprocess:
        self.word_index = RP.build_word_index(self.file, word_index_path)
        # Convert the reference abstracts to their numeric form.
        print(self.file_ref, self.ref_seperate, self.ref_processed)
        RP.result_process(self.file_ref, self.ref_seperate)
        RP.replace_words_by_num(self.word_index, self.ref_seperate, self.ref_processed)
        print("references process done")
    else:
        self.load_word_index(word_index_path)
        print("word index loaded")