Example 1
# time is from the standard library; ftools, tools, AutoCoder and Dir are project-local helpers.
import time

def vectorize_files(fileroot, savepath):
    # Read every file under fileroot into a dict: {filename: list of lines}.
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    count = 0
    print(len(data))

    for key in data:
        # Join the lines of one document back into a single text.
        text = '。'.join(data[key])

        # Split into sentences, words and tags, then encode them.
        sens, sens_words, sens_tags = auto.preprocess(text)
        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()

        # Name the cache file after the MD5 hash of the raw token text.
        key_text = ''.join([''.join(var) for var in sens_words])
        save_key = tools.md5(key_text)

        # Persist [sentence vectors, essay vector] for this document.
        # NOTE: the output directory is hard-coded, so savepath is unused here.
        sens_list = [list(var) for var in sens_vector]
        save_object = [sens_list, list(essay_vector)]
        tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + save_key)

        count += 1
        print(count, len(data), end - start)
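A minimal call sketch for the helper above (the argument paths are hypothetical; note that the function writes into the fixed encoder/cleandata_8700 directory rather than into savepath):

# Hypothetical invocation; the cleandata_8700 input folder is an assumption.
vectorize_files(Dir.res + "/cleandata_8700", Dir.res + "/encoder/cleandata_8700")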
Example 2
    def vectorize(self, sens_words, sens_tags):
        # Cache key: MD5 hash of the concatenated token text.
        key_text = ''.join([''.join(var) for var in sens_words])
        key = tools.md5(key_text)

        if key in self.data:
            # Cache hit: reuse the vectors loaded from disk earlier.
            tmp = self.data[key]
        else:
            # Cache miss: run the encoder and persist the result.
            print("training")
            tmp0, tmp1 = self.auto.vectorize(sens_words, sens_tags)
            tmp = [tmp0, tmp1]

            tmpsens = [list(var) for var in tmp0]
            save_object = [tmpsens, list(tmp1)]
            tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + key)

        # Return (sentence vectors, essay vector).
        return tmp[0], tmp[1]
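The method above is effectively disk-backed memoization of AutoCoder.vectorize, keyed by the MD5 of the token text. Below is a stripped-down sketch of the same pattern using only the standard library (hashlib and pickle stand in for the project's tools helpers; cached_vectorize, compute and cache_dir are made-up names):

import hashlib
import os
import pickle

def cached_vectorize(compute, sens_words, sens_tags, cache_dir):
    # Build the cache key from the concatenated token text, as in vectorize().
    key_text = ''.join(''.join(words) for words in sens_words)
    key = hashlib.md5(key_text.encode('utf-8')).hexdigest()
    path = os.path.join(cache_dir, key)

    if os.path.exists(path):
        # Cache hit: load the previously saved [sentence_vectors, essay_vector].
        with open(path, 'rb') as f:
            sens_vec, essay_vec = pickle.load(f)
    else:
        # Cache miss: compute the vectors and persist them for next time.
        sens_vec, essay_vec = compute(sens_words, sens_tags)
        with open(path, 'wb') as f:
            pickle.dump([[list(v) for v in sens_vec], list(essay_vec)], f)

    return sens_vec, essay_vec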