Code Example #1
    def wiki_preprocess(self,
                        save_path=Dir.res +
                        "/WikiCorpus/wiki.jian.seperate.txt"):
        """Split the wiki corpus into sentences, segment each sentence with
        jieba, and append the space-joined tokens to save_path in batches."""
        tmp_result = []
        with open(self.train_file, "r") as train_corpus:
            for line in train_corpus:
                # Split a raw line into sentences on Chinese/Western
                # sentence-ending punctuation.
                regex = r"。。。。。。|?|。|!|;|\.\.\.\.\.\."
                sentences = re.split(regex, line)
                for sen in sentences:
                    sen = sen.strip()
                    if not sen:
                        continue
                    words = list(jieba.cut(sen))
                    tmp_result.append(' '.join(words))
                # Flush to disk every 5000 segmented sentences.
                if len(tmp_result) >= 5000:
                    tools.write_list(save_path, tmp_result, mode="a")
                    tmp_result = []
            # Write the final partial batch.
            tools.write_list(save_path, tmp_result, mode="a")
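
To make the splitting rule above concrete, here is a minimal standalone sketch of the same regex-plus-jieba step on a made-up line (only the third-party jieba package and the standard re module are needed; the file handling and tools helpers are omitted):

import re
import jieba

line = "今天天气很好!我们去公园散步。你想一起去吗?"
regex = r"。。。。。。|?|。|!|;|\.\.\.\.\.\."

# Split on sentence-ending punctuation, then print each sentence's
# jieba tokens joined by single spaces.
for sen in re.split(regex, line):
    sen = sen.strip()
    if sen:
        print(" ".join(jieba.cut(sen)))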
Code Example #2
def build_w2v_train_data():
    """Merge every sentence file under news.sentences/ into one text file
    used to train word2vec."""
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)
Code Example #3
    def rouge_detail(self, abstract_processed, save_dir):
        """Write per-document ROUGE-1 and ROUGE-2 scores, one CSV line per
        file, to <save_dir>/detials.txt."""
        flist = tools.get_files(abstract_processed)
        save_content = []
        for fname in flist:
            content = tools.read_lines(abstract_processed + fname)
            reference = tools.read_lines(self.ref_processed + fname)
            hyp_tokens = [line.split(" ") for line in content]
            ref_tokens = [line.split(" ") for line in reference]
            rouge1 = self.rouge_1_simple(ref_tokens, hyp_tokens)
            rouge2 = self.rouge_2_simple(ref_tokens, hyp_tokens)
            save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
        tools.write_list(save_dir + "/detials.txt", save_content)
Code Example #4
def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/",
             save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    """Segment every news file, replace purely numeric tokens with "num",
    and write the normalized sentences into a single corpus file."""
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root + name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            # Map digit tokens to a shared placeholder to shrink the vocabulary.
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"
            lines.append(' '.join(words))

    ftools.write_list(save_path, lines)
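
The digit normalization above is easy to check in isolation; a small sketch with a made-up token list standing in for the output of tools.seperate (the list comprehension is equivalent to the index loop in transfer):

words = ["股价", "上涨", "3", "个", "百分点"]   # pretend tools.seperate returned this

# Replace purely numeric tokens with the shared placeholder "num".
words = ["num" if w.isdigit() else w for w in words]
print(" ".join(words))   # 股价 上涨 num 个 百分点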
Code Example #5
def generate_new_data():
    """Re-summarize each news article with the Uper model and store the
    article/abstract pair in the new dataset directory."""
    npath = Dir.res + "/cleandata_highquality_3500/news/"

    new_npath = Dir.res + "/cleandata_highquality_3500_new/news/"
    new_apath = Dir.res + "/cleandata_highquality_3500_new/abstract/"

    uper = Uper()

    for name in ftools.get_files(npath):
        path = npath + name
        content = ftools.read_lines(path)
        # name[:-4] drops the ".txt" extension to form the document id.
        new_abstract = uper.summarize(content, num=3, fname=name[:-4])
        ftools.copy(path, new_npath + name)
        ftools.write_list(new_apath + name, new_abstract)
Code Example #6
def generate_data(file=Dir.res +
                  "/extract_data_process/data_processed_9.9.txt",
                  savePath=Dir.res + "/extract_data_process/data"):
    """Split the processed extraction file into per-document news/abstract
    files, de-duplicating documents by their abstract field."""
    content = tools.read_lines(file)[1:-1]
    data = {}
    for line in content:
        line = line.replace(" ", "")
        # Each line is a list-style string like "['id','abstract','body']"
        # once the spaces are stripped; drop the brackets and split on the
        # quote-comma-quote delimiter.
        tmp = line[1:-1].split("','")
        if tmp[1] not in data:
            data[tmp[1]] = tmp[2]
    index = 0
    for key in sorted(data.keys()):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
        index += 1
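
As a quick illustration of the line format this parser assumes, here is a small sketch with a made-up line (the id/abstract/body field order is inferred from how generate_data uses the fields, so treat it as an assumption; note the stray quote that survives on the first and last field, exactly as in the function above):

raw = "['training_0', '这是摘要。', '这是正文内容。']"   # made-up input line

line = raw.replace(" ", "")          # strip stray spaces inside the line
fields = line[1:-1].split("','")     # ["'training_0", '这是摘要。', "这是正文内容。'"]
abstract, body = fields[1], fields[2]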
Code Example #7
def result_process(file_dir, save_dir):
    """Re-segment every summary file with jieba and write the space-joined
    tokens to save_dir, one output file per input file."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []
    tools.get_filelist(file_dir, filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        result = []
        for line in content:
            # Join the jieba tokens of each line with single spaces.
            result.append(" ".join(jieba.cut(line)))
        save_path = save_dir + "/" + name + ".txt"
        tools.write_list(save_path, result)
Code Example #8
def check_extract(file_dir, save_path):
    """Collect the lines judged to be extractive, and separately record the
    ones whose scores suggest the abstract is not taken from the leading
    sentences."""
    files = []
    tools.get_filelist(file_dir, files, filter)
    extract_result = set()
    un_first_result = set()
    analysis_result = {}
    for file in files:
        content = tools.read(file)
        # Strip the list brackets so each line is just the quoted fields.
        content = re.sub(r"\[|\]", "", content)
        lines = content.split("\n")
        for line in lines:
            tmp = line.split("', '")
            if len(tmp) == 3:
                extract = check_if_extract(tmp[1], tmp[2])
                if extract[0]:
                    extract_result.add(line)
                    analysis_result[tmp[0]] = extract[1]

                    all_value = sum(extract[1][:-2])
                    # Baselines taken from the last two entries of the
                    # analysis vector.
                    low_ = get_sum(extract[1][-2])
                    high_ = get_sum(extract[1][-1])
                    # Lines whose aggregate score exceeds the lower baseline
                    # by more than 2 are flagged as not lead-based.
                    if all_value > low_ + 2:
                        un_first_result.add(line)

                    print(len(extract_result), len(un_first_result))
            else:
                # Malformed line (wrong number of fields): skip it.
                pass
    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
Code Example #9
def workers(args):
    """Pool worker: summarize every document in args["d"] with the model
    args["m"] and write each summary under the directory args["a"]."""
    wname = args["n"]
    data = args["d"]
    model = args["m"]
    abstract = args["a"]
    count = 0
    summarize_result = {}
    print(wname, "start")
    for text in data:
        start = time.time()
        summarize_result[text] = model.summarize(data[text], num=3, fname=text)
        count += 1
        end = time.time()
        tools.write_list(abstract + text + ".txt", summarize_result[text])
        # Uncomment for per-document progress:
        # print(wname, text, count, "/", len(data), end - start)
    print(wname, "done")
    return summarize_result
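
For readers skimming this example in isolation, here is a minimal sketch of the argument contract workers expects (the stub model and output path are made up; the real wiring through multiprocessing.Pool is shown in Code Example #11):

# Hypothetical stand-in for the project's summarizer; only summarize() matters.
class StubModel:
    def summarize(self, sentences, num=3, fname=None):
        return sentences[:num]   # pretend the first `num` sentences are the summary

args = {
    "n": "work0",                              # worker name, used only in log prints
    "d": {"doc_0": ["第一句。", "第二句。"]},    # fname -> list of sentences
    "m": StubModel(),                          # any object exposing summarize(...)
    "a": "/tmp/abstract/",                     # output dir; workers writes <fname>.txt here via tools.write_list
}
result = workers(args)                         # returns {fname: summary_sentences}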
Code Example #10
def preprocess_file(file, savepath):
    content = tools.read(file)
    result = preprocess(content)
    tools.write_list(savepath, result)
Code Example #11
    def evaluator_rouge(self, model, result_dir, num):
        """Summarize every document with `model`, post-process the output,
        and score it against the reference abstracts with ROUGE."""
        summarize_result = {}
        astart = time.time()

        ### Save the model's summaries
        abstract = result_dir + "/abstract/"
        keys = sorted(self.data.keys())
        if self.parall:
            # Split the documents into one chunk per CPU and let the pool
            # workers (see `workers` above) summarize and save each chunk.
            p = multiprocessing.Pool(self.cpu)
            inter = int(len(keys) / self.cpu) + 1
            args = []
            for i in range(self.cpu):
                if i == self.cpu - 1:
                    key = keys[i * inter:]
                else:
                    key = keys[i * inter:(i + 1) * inter]
                tmp = {k: self.data[k] for k in key}
                args.append({"n": "work" + str(i),
                             "d": tmp,
                             "m": model,
                             "a": abstract})
            # Each worker writes its summaries to disk itself, so the returned
            # dictionaries are not needed here.
            p.map(workers, args)
            p.close()
            p.join()
        else:
            count = 0
            for text in keys:
                if text not in summarize_result:
                    start = time.time()
                    count += 1
                    summarize_result[text] = model.summarize(self.data[text], num, fname=text)
                    end = time.time()
                    print(text, count, "/", len(keys), end - start)
                    tools.write_list(abstract + text + ".txt", summarize_result[text])

        ### Post-process the summaries (re-segment and map words to indices)
        abstract_processed = result_dir + "/abstract_processed/"
        abstract_seperate = result_dir + "/abstract_seperate/"
        RP.result_process(abstract, abstract_seperate)
        print("abstract separate done")
        RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
        print("abstract replace done")
        self.rouge_detail(abstract_processed, result_dir)

        ### Compute ROUGE
        result = self.rouge.eval(abstract_processed, self.ref_processed, num)
        eval_result = result_dir + "/eval_res.txt"
        print(result)
        tools.write(eval_result, model.info + "\n" + result, mode="a")
        aend = time.time()
        print(aend - astart)