Esempio n. 1
0
def get_clean_data(filenames, newname, name="cleandata_small"):
    """Copy selected news/abstract file pairs into a fresh dataset directory.

    Any existing ``Dir.res/<newname>/`` and ``Dir.res/result/<newname>/``
    directories are removed first. Each entry of ``filenames`` is then copied
    from ``Dir.res/<name>/news/`` and ``Dir.res/<name>/abstract/`` into the
    matching ``news``/``abstract`` sub-directories of ``Dir.res/<newname>/``.

    Args:
        filenames: File names, relative to the ``news``/``abstract`` folders,
            to copy. ``len(filenames)`` is used for the progress output.
        newname: Name of the destination dataset directory under ``Dir.res``.
        name: Name of the source dataset directory under ``Dir.res``.
    """
    import shutil

    src_news = Dir.res + "/" + name + "/news/"
    src_abstract = Dir.res + "/" + name + "/abstract/"

    # Start from a clean slate: drop any previous copy of the target dataset
    # and its result directory.
    # NOTE(review): if ``newname`` equals ``name`` this removes the source
    # directory before anything is copied -- confirm callers never do that.
    for stale_dir in (
        Dir.res + "/" + newname + "/",
        Dir.res + "/result/" + newname + "/",
    ):
        if ftools.isexists(stale_dir):
            shutil.rmtree(stale_dir)

    dst_news = Dir.res + "/" + newname + "/news/"
    dst_abstract = Dir.res + "/" + newname + "/abstract/"

    # A dedicated loop variable keeps the ``name`` parameter intact.
    for count, filename in enumerate(filenames, start=1):
        # Progress output: "<files handled so far> <total files>".
        print(count, len(filenames))
        ftools.copy(src_news + filename, dst_news + filename)
        ftools.copy(src_abstract + filename, dst_abstract + filename)
Esempio n. 2
0
    def indexlize_data(self, reprocess):
        """Build or load the word index and numericalize the reference summaries.

        Reprocessing is forced when the word-index file, the ``ref_processed``
        path or the ``ref_seperate`` path is missing, or when either of the
        two reference locations contains no files.

        Args:
            reprocess: When truthy, rebuild everything even if the cached
                artifacts are present.
        """
        # Build the mapping from words to numeric values.
        print("start")
        index_file = self.dir_path + "/words_index.txt"

        # Checked in order with short-circuiting, so the file listings are
        # only requested once the existence checks have passed.
        cache_ready = (
            tools.isexists(index_file)
            and tools.isexists(self.ref_processed)
            and tools.isexists(self.ref_seperate)
            and len(tools.get_files(self.ref_seperate)) != 0
            and len(tools.get_files(self.ref_processed)) != 0
        )
        if not cache_ready:
            reprocess = True

        if not reprocess:
            # Cached artifacts are usable: just load the stored word index.
            self.load_word_index(index_file)
            print("word index loaded")
            return

        self.word_index = RP.build_word_index(self.file, index_file)

        # Numericalize the reference summaries.
        print(self.file_ref, self.ref_seperate, self.ref_processed)
        RP.result_process(self.file_ref, self.ref_seperate)
        RP.replace_words_by_num(
            self.word_index, self.ref_seperate, self.ref_processed
        )
        print("references process done")