Ejemplo n.º 1
0
def vectorize_files(fileroot, savepath):
    """Vectorize every document under *fileroot* and save one object per document.

    Each document's lines are joined into a single text, preprocessed and
    vectorized by :class:`AutoCoder`, then persisted under a content-derived
    md5 key.

    Parameters
    ----------
    fileroot : str
        Directory read by ``ftools.read_dir_lines_dict`` into a
        ``{key: list-of-lines}`` mapping.
    savepath : str
        Currently unused — output goes to a hard-coded directory below.
        NOTE(review): consider writing to ``savepath`` instead; kept as-is
        to preserve behavior for existing callers.
    """
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    total = len(data)
    print(total)

    # enumerate replaces the hand-rolled `count` counter; .items() avoids
    # a second dict lookup per key.
    for count, (key, lines) in enumerate(data.items(), start=1):
        # Re-join the stored lines with the Chinese full stop so the
        # preprocessor sees one continuous text.
        text = '。'.join(lines)

        sens, sens_words, sens_tags = auto.preprocess(text)

        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()

        # Save key is the md5 of the concatenated word tokens, so identical
        # content always maps to the same file name.
        key_text = ''.join(''.join(var) for var in sens_words)
        save_key = tools.md5(key_text)

        # Store per-sentence vectors plus the whole-essay vector as plain lists.
        save_object = [[list(var) for var in sens_vector], list(essay_vector)]
        tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + save_key)

        print(count, total, end - start)
Ejemplo n.º 2
0
 def __init__(self, corpus, corpus_ref, reprocess=True, parall=False, cpu=4):
     """Load the corpus, set up reference directories, and index the data.

     Parameters
     ----------
     corpus : str
         Path to the corpus file/directory; its parent directory is used to
         derive the reference sub-directories below.
     corpus_ref : str
         Path to the reference corpus.
     reprocess : bool
         Forwarded to ``indexlize_data`` to force re-indexing.
     parall : bool
         Whether downstream processing runs in parallel.
     cpu : int
         Worker count used when ``parall`` is enabled.
     """
     self.file = corpus
     self.file_ref = corpus_ref
     self.data = tools.read_dir_lines_dict(self.file)
     self.rouge = src.evaluation.PythonROUGE
     self.word_index = {}
     # Parent directory of the corpus path (everything before the last '/').
     self.dir_path = corpus[:corpus.rindex("/")]
     self.ref_processed = self.dir_path + "/ref_processed/"
     # NOTE: "seperate" spelling kept — the attribute name (and likely the
     # on-disk directory) is part of the existing interface.
     self.ref_seperate = self.dir_path + "/ref_seperate/"
     self.indexlize_data(reprocess=reprocess)
     self.start = 0
     self.parall = parall
     # BUG FIX: was hard-coded `self.cpu = 4`, silently ignoring the `cpu`
     # argument passed by callers.
     self.cpu = cpu