import os
import re
import shutil

import jieba

# `tools` and `Dir` are project-local helpers (file listing, I/O, resource paths)
# assumed to be importable from this package.


def replace_words_by_num(whole_words, file_dir, save_dir):
    """Rewrite every selected file in file_dir as a sequence of word ids taken from whole_words."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []

    def name_filter(s):
        # only keep files whose name contains "all"
        return "all" in s

    tools.get_filelist(file_dir, filenames, name_filter)
    content = {}
    for file in filenames:
        lines = tools.read_lines(file)
        string = ""
        for line in lines:
            for word in line.split(" "):
                if len(word) > 0 and word in whole_words:
                    string += str(whole_words[word]) + " "
            string = string.strip()
            string += "\n"
        content[tools.get_name(file)] = string
    for name in content:
        savepath = save_dir + name + ".txt"
        tools.write(savepath, content[name])
def build_w2v_train_data():
    """Collect every sentence file under news.sentences/ into a single all.txt for word2vec training."""
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)
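# A minimal sketch of how the all.txt produced above could be fed to gensim's
# Word2Vec. gensim, the hyperparameters, and the model save path are assumptions,
# not part of this project; gensim 4.x names the dimension "vector_size" (3.x
# used "size"). LineSentence assumes each line is already whitespace-tokenized.
def train_w2v_sketch():
    from gensim.models import Word2Vec
    from gensim.models.word2vec import LineSentence

    build_w2v_train_data()  # writes Dir.res + "data/all.txt"
    corpus = LineSentence(Dir.res + "data/all.txt")
    model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)
    model.save(Dir.res + "data/w2v.model")  # hypothetical output path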
def get_dirfiles_into_list_luhn(file_dir, replace_dir):
    """Map each (prefix-stripped) file name to its path(s), optionally redirected into replace_dir."""
    files, result = [], {}
    tools.get_filelist(file_dir, files)
    for listfile in files:
        filename = tools.get_name(listfile)
        filename = filename[8:]  # drop the leading 8-character prefix of the file name
        if filename not in result.keys():
            result[filename] = []
        if replace_dir == "":
            # the original assigned the bare path here, overwriting the list; append keeps the value types consistent
            result[filename].append(listfile)
        else:
            result[filename].append(str(replace_dir + "/" + tools.get_name(listfile) + ".txt"))
    return result
def result_process(file_dir, save_dir):
    """Segment every line of every file with jieba and write the space-joined tokens to save_dir."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []
    tools.get_filelist(file_dir, filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        result = []
        for line in content:
            result.append(" ".join(jieba.cut(line)))
        save_path = save_dir + "/" + name + ".txt"
        tools.write_list(save_path, result)
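# A small usage sketch for result_process(): jieba.cut() yields tokens which the
# function joins with single spaces, one output file per input file. The sample
# sentence and directory names are hypothetical, and the exact token boundaries
# depend on jieba's dictionary.
def result_process_demo():
    print(" ".join(jieba.cut("自然语言处理很有趣")))  # e.g. "自然语言 处理 很 有趣"
    result_process(Dir.res + "data/raw/", Dir.res + "data/segmented")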
def read_file(self, dir):
    """Read every file in dir; return {file: sentences} and {sentence: [names of files containing it]}."""
    filelist = []
    tools.get_filelist(dir, filelist)
    data = {}
    reverse_data = {}
    filelist = sorted(filelist)
    for filename in filelist:
        with open(filename, mode="r", encoding="utf-8") as file:
            content = file.read()
        sentences = self.seperate_sentences(content)
        data[filename] = sentences
        for sen in sentences:
            if sen not in reverse_data.keys():
                reverse_data[sen] = [tools.get_name(filename)]
            else:
                reverse_data[sen].append(tools.get_name(filename))
    return data, reverse_data
def check_extract(file_dir, save_path):
    """Scan extraction result files, collecting extracted lines and lines whose summed values look too high."""
    files = []
    # NOTE: the original passed the built-in `filter` as a third argument here,
    # which would fail if get_filelist ever called it as a one-argument predicate; it is dropped.
    tools.get_filelist(file_dir, files)
    extract_result = set()
    un_first_result = set()
    analysis_result = {}
    for file in files:
        content = tools.read(file)
        content = re.sub(r"\[|\]", "", content)  # strip square brackets
        lines = content.split("\n")
        for line in lines:
            tmp = line.split("', '")
            if len(tmp) == 3:
                extract = check_if_extract(tmp[1], tmp[2])
                if extract[0]:
                    extract_result.add(line)
                    analysis_result[tmp[0]] = extract[1]
                    all_value = sum(extract[1][:-2])
                    low_ = get_sum(extract[1][-2])
                    high_ = get_sum(extract[1][-1])
                    if all_value > low_ + 2:
                        un_first_result.add(line)
                        print(len(extract_result), len(un_first_result))
            else:
                # malformed line (not three "', '"-separated fields): skip it
                pass
    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
def build_word_index(file_dir, words_path):
    """Assign every jieba token a numeric id, write the "word:id" vocabulary, and return the mapping."""
    filenames = []

    def name_filter(s):
        # only keep files whose name contains "all"
        return "all" in s

    tools.get_filelist(file_dir, filenames, name_filter)
    whole_words = {}
    for file in filenames:
        lines = tools.read_lines(file)
        for line in lines:
            for word in jieba.cut(line):
                if len(word) > 0 and word not in whole_words:
                    whole_words[word] = len(whole_words)
    word_index = ""
    for word in whole_words.keys():
        word_index += word + ":" + str(whole_words[word]) + "\n"
    tools.write(words_path, word_index)
    return whole_words
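# A hedged end-to-end sketch of the indexing pipeline defined above:
# build_word_index() assigns every token a numeric id and writes the vocabulary,
# then replace_words_by_num() rewrites the same files as sequences of those ids.
# All paths below are hypothetical placeholders; both functions only look at
# files whose names contain "all".
def build_index_pipeline_demo():
    corpus_dir = Dir.res + "data/news.sentences/"
    vocab_path = Dir.res + "data/word_index.txt"
    ids_dir = Dir.res + "data/ids/"
    whole_words = build_word_index(corpus_dir, vocab_path)
    replace_words_by_num(whole_words, corpus_dir, ids_dir)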
def load_files(self, filedir):
    """Read every file under filedir and return their contents as a list of strings."""
    # the original wrote "filenames, data = []", which raises ValueError; each name needs its own list
    filenames, data = [], []
    tools.get_filelist(filedir, filenames)
    for file in filenames:
        data.append(tools.read(file))
    return data