Beispiel #1
0
def replace_words_by_num(whole_words,file_dir,save_dir):
    """Rewrite the selected files under ``file_dir`` as sequences of word indices.

    Files are collected with ``tools.get_filelist`` using a filter that
    accepts entries containing ``"all"``. Each line is split on single
    spaces; every non-empty token present in ``whole_words`` is replaced by
    ``str(whole_words[token])`` and tokens that are not in ``whole_words``
    are dropped. The converted text of each file is written to
    ``<save_dir>/<name>.txt`` where ``<name>`` is ``tools.get_name(file)``.

    Args:
        whole_words: mapping from word to the value written in its place.
        file_dir: directory handed to ``tools.get_filelist``.
        save_dir: output directory; it is removed first if it already exists.
    """
    # Start from a clean output directory.
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    def contains_all(s):
        return "all" in s

    filename = []
    tools.get_filelist(file_dir, filename, contains_all)

    content = {}
    for file in filename:
        text = ""
        for line in tools.read_lines(file):
            indices = [
                str(whole_words[word])
                for word in line.split(" ")
                if word and word in whole_words
            ]
            # strip() is applied to the whole accumulated text, so a line
            # that yields no indices does not leave a blank line behind
            # (same behaviour as before this rewrite).
            text = (text + " ".join(indices)).strip() + "\n"
        content[tools.get_name(file)] = text

    for name, text in content.items():
        # os.path.join inserts the separator only when save_dir lacks one,
        # so callers passing "out/" and callers passing "out" both end up
        # writing inside the output directory.
        tools.write(os.path.join(save_dir, name + ".txt"), text)
Beispiel #2
0
def build_w2v_train_data():
    """Concatenate every file under ``data/news.sentences/`` into ``data/all.txt``.

    Both locations are relative to ``Dir.res``. Files are listed with
    ``tools.get_filelist``, read with ``tools.read_lines`` and the combined
    list of lines is written with ``tools.write_list``.
    """
    source_dir = Dir.res + "data/news.sentences/"
    target_path = Dir.res + "data/all.txt"

    paths = []
    tools.get_filelist(source_dir, paths)

    # Keep the lines in listing order, one file after another.
    corpus = []
    for path in paths:
        corpus += tools.read_lines(path)

    tools.write_list(target_path, corpus)
Beispiel #3
0
def get_dirfiles_into_list_luhn(file_dir,replace_dir):
    """Group the files under ``file_dir`` by their name minus its first 8 characters.

    Args:
        file_dir: directory handed to ``tools.get_filelist``.
        replace_dir: when non-empty, each entry is rewritten to
            ``replace_dir + "/" + <name> + ".txt"``; when ``""`` the listed
            path itself is stored.

    Returns:
        dict keyed by ``tools.get_name(path)[8:]``. With a non-empty
        ``replace_dir`` each value is a list of rewritten paths. With
        ``replace_dir == ""`` each value is a single path string (the last
        one listed for that key), not a list.
    """
    paths = []
    tools.get_filelist(file_dir, paths)

    grouped = {}
    for path in paths:
        base = tools.get_name(path)
        key = base[8:]
        if replace_dir == "":
            # NOTE(review): this stores a bare string while the other branch
            # builds a list; looks unintentional, but callers may rely on
            # it, so it is kept as is - confirm before changing.
            grouped[key] = path
            continue
        grouped.setdefault(key, []).append(str(replace_dir + "/" + base + ".txt"))
    return grouped
Beispiel #4
0
def result_process(file_dir,save_dir):
    """Segment every file under ``file_dir`` with jieba and save the result.

    Each line of each file is passed to ``jieba.cut`` and its tokens are
    joined with single spaces. The segmented lines of a file are written to
    ``save_dir + "/" + <name> + ".txt"`` where ``<name>`` is
    ``tools.get_name(file)``.

    Args:
        file_dir: directory handed to ``tools.get_filelist``.
        save_dir: output directory; it is removed first if it already exists.
    """
    # Start from a clean output directory.
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    filenames = []
    tools.get_filelist(file_dir,filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        segmented = [" ".join(jieba.cut(line)) for line in content]
        # Write once per file instead of rewriting the file after every
        # line. Assumes tools.write_list overwrites its target - TODO
        # confirm. Files without any lines still produce no output file,
        # as before.
        if segmented:
            save_path = save_dir+"/"+name+".txt"
            tools.write_list(save_path,segmented)
Beispiel #5
0
    def read_file(self, dir):
        """Read every file under ``dir`` and index its sentences both ways.

        Files are listed with ``tools.get_filelist``, processed in sorted
        path order, read as UTF-8 and split with ``self.seperate_sentences``.

        Returns:
            A ``(data, reverse_data)`` tuple. ``data`` maps each file path
            to its sentences. ``reverse_data`` maps each sentence to the
            list of ``tools.get_name(path)`` values of the files it was seen
            in, one entry per occurrence.
        """
        paths = []
        tools.get_filelist(dir, paths)

        data, reverse_data = {}, {}
        for path in sorted(paths):
            with open(path, mode="r", encoding="utf-8") as handle:
                text = handle.read()
            sentences = self.seperate_sentences(text)
            data[path] = sentences
            for sentence in sentences:
                # Create the bucket on first sight, then record the source.
                reverse_data.setdefault(sentence, []).append(tools.get_name(path))
        return data, reverse_data
Beispiel #6
0
def check_extract(file_dir, save_path):
    """Collect the lines that ``check_if_extract`` accepts and write them out.

    Every file listed by ``tools.get_filelist`` is read, square brackets are
    removed, and the text is split into lines. A line is examined only when
    splitting it on ``', '`` (quote, comma, space, quote) yields exactly
    three fields; the second and third fields are passed to
    ``check_if_extract``. Lines whose result has a truthy first element are
    added to the "extract" set. Of those, lines where the sum of all but the
    last two entries of the result's second element exceeds
    ``get_sum(second_to_last_entry) + 2`` are also added to a second set.

    A running count of both sets is printed after each accepted line.

    Args:
        file_dir: directory handed to ``tools.get_filelist``.
        save_path: target for the accepted lines; the second set is written
            to ``save_path + ".txt"``.
    """
    files = []
    # NOTE(review): `filter` is not defined inside this function, so it
    # resolves to a module-level name if one exists, otherwise to the
    # builtin - confirm which is intended.
    tools.get_filelist(file_dir, files, filter)

    extract_result = set()
    un_first_result = set()
    for file in files:
        # Raw-string character class: removes "[" and "]" without relying
        # on invalid "\[" escapes in a normal string literal.
        content = re.sub(r"[\[\]]", "", tools.read(file))
        for line in content.split("\n"):
            tmp = line.split("', '")
            if len(tmp) != 3:
                continue
            extract = check_if_extract(tmp[1], tmp[2])
            if not extract[0]:
                continue
            extract_result.add(line)

            all_value = sum(extract[1][:-2])
            low_ = get_sum(extract[1][-2])
            # Not used below; kept so get_sum still receives the last entry
            # exactly as before (get_sum is defined elsewhere).
            hight_ = get_sum(extract[1][-1])
            if all_value > low_ + 2:
                un_first_result.add(line)

            print(len(extract_result), len(un_first_result))

    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
Beispiel #7
0
def build_word_index(file_dir,words_path):
    """Assign a running integer index to every distinct jieba token.

    Files are collected with ``tools.get_filelist`` using a filter that
    accepts entries containing ``"all"``. Each line is segmented with
    ``jieba.cut``; the first time a non-empty token is seen it receives the
    current size of the index as its number. The index is written to
    ``words_path`` as ``token:number`` lines and also returned.

    Args:
        file_dir: directory handed to ``tools.get_filelist``.
        words_path: file the ``token:number`` listing is written to.

    Returns:
        dict mapping each token to its index.
    """
    def accepts(entry):
        return "all" in entry

    paths = []
    tools.get_filelist(file_dir, paths, accepts)

    index_of = {}
    for path in paths:
        for line in tools.read_lines(path):
            for token in jieba.cut(line):
                if token and token not in index_of:
                    # Next free number equals the count of tokens so far.
                    index_of[token] = len(index_of)

    listing = "".join(
        token + ":" + str(number) + "\n" for token, number in index_of.items()
    )
    tools.write(words_path, listing)
    return index_of
Beispiel #8
0
def load_files(self ,filedir):
    """Return the contents of every file under ``filedir``.

    Files are listed with ``tools.get_filelist`` and each one is read with
    ``tools.read``; the results are returned in listing order.

    Args:
        filedir: directory handed to ``tools.get_filelist``.

    Returns:
        list with one ``tools.read`` result per listed file.
    """
    # The previous `filenames, data = []` raised ValueError on every call
    # because an empty list cannot be unpacked into two names.
    filenames = []
    tools.get_filelist(filedir, filenames)
    return [tools.read(file) for file in filenames]