def vectorize_files(fileroot, savepath):
    """Vectorize every document under *fileroot* and persist the encoder output.

    For each file read by ``ftools.read_dir_lines_dict`` the text is joined
    with the Chinese full stop '。', preprocessed and vectorized by
    ``AutoCoder``, and the (sentence vectors, essay vector) pair is saved
    under *savepath* keyed by the MD5 of the document's words.

    Args:
        fileroot: Directory read via ``ftools.read_dir_lines_dict`` —
            presumably maps a file key to its list of lines (TODO confirm
            against ``ftools``).
        savepath: Directory where the serialized vectors are written,
            one object file per document, named by MD5 key.
    """
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    count = 0
    total = len(data)  # hoisted: invariant across the loop
    print(total)
    for key in data:
        text = '。'.join(data[key])
        sens, sens_words, sens_tags = auto.preprocess(text)
        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()
        # Stable identity for the document: MD5 over its concatenated words.
        key_text = ''.join(''.join(var) for var in sens_words)
        save_key = tools.md5(key_text)
        tmp = [list(var) for var in sens_vector]
        save_object = [tmp, list(essay_vector)]
        # BUG FIX: the original ignored `savepath` and hard-coded the output
        # directory to Dir.res + "/encoder/cleandata_8700/". Use the parameter.
        tools.save_object(save_object, savepath + "/" + save_key)
        count += 1
        print(count, total, end - start)
def __init__(self, corpus, corpus_ref, reprocess=True, parall=False, cpu=4):
    """Initialize the corpus wrapper and build its word index.

    Args:
        corpus: Path to the corpus directory/file; its parent directory is
            used to derive the reference sub-directories.
        corpus_ref: Path to the reference corpus.
        reprocess: Forwarded to ``indexlize_data`` — presumably forces
            re-indexing when True (TODO confirm).
        parall: Whether to run in parallel mode.
        cpu: Number of worker CPUs to use when ``parall`` is enabled.
    """
    self.file = corpus
    self.file_ref = corpus_ref
    self.data = tools.read_dir_lines_dict(self.file)
    self.rouge = src.evaluation.PythonROUGE
    self.word_index = {}
    # Parent directory of the corpus; assumes a '/'-separated path
    # (raises ValueError if no '/' is present).
    self.dir_path = corpus[:corpus.rindex("/")]
    self.ref_processed = self.dir_path + "/ref_processed/"
    self.ref_seperate = self.dir_path + "/ref_seperate/"
    self.indexlize_data(reprocess=reprocess)
    self.start = 0
    self.parall = parall
    # BUG FIX: was hard-coded `self.cpu = 4`, silently ignoring the `cpu`
    # argument; honor the parameter (default stays 4, so callers are safe).
    self.cpu = cpu