def gen_zhengzhou_tree(dirname=myconfig.ZZ_STD_ADD,
                       sav_file=myconfig.zhengzhou_std_word,
                       sav_file_2=myconfig.zhengzhou_std_tree):
    """Build two tries from the comma-separated files under ``dirname``.

    Every file found by walking ``dirname`` is read, its lines are shuffled,
    and for each line containing a comma the second comma-separated field is
    taken as the raw address.  The raw address goes into the "word" trie and
    the result of ``utils.pre_trans`` on it goes into the "tree" trie.  Each
    (transformed, raw) pair is also written as a tab-separated line to
    ``./addr_match.txt``.

    Args:
        dirname: Directory that is walked for input files.
        sav_file: Destination passed to ``utils.save_var`` for the raw trie.
        sav_file_2: Destination passed to ``utils.save_var`` for the
            transformed trie.
    """
    # The context manager guarantees the record file is flushed and closed
    # even if reading or inserting fails part-way through.
    with open("./addr_match.txt", 'w+') as addr_kv_rec:
        print('\n>gen_zhengzhou_tree start')
        my_tree = trie_tree.Trie()
        my_word = trie_tree.Trie()
        cnt = 0
        for root, _, fs in os.walk(dirname):
            for f in fs:
                # Join with the directory currently being walked; joining with
                # ``dirname`` produced wrong paths for files in subdirectories.
                pth = os.path.join(root, str(f))
                with open(pth, 'r') as src:
                    lines = src.readlines()
                np.random.shuffle(lines)
                for line in lines:
                    if ',' not in line:
                        continue
                    _line = line.split(',')[1]
                    line = utils.pre_trans(_line)
                    addr_kv_rec.write('%s\t%s\n' % (str(line), str(_line)))
                    cnt += 1
                    # Progress indicator: one print per 10000 accepted lines.
                    if cnt % 10000 == 1:
                        print(cnt)
                    my_tree.insert(line)
                    my_word.insert(_line)
        utils.save_var(my_word, sav_file)
        utils.save_var(my_tree, sav_file_2)
        print('\n>my address tree save ok')
def gen_address_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_TREE):
    """Build a trie with one entry per line of ``filename`` and save it.

    Args:
        filename: Text file whose lines are inserted into the trie.
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        The populated ``trie_tree.Trie``.
    """
    print('\n>gen_address_tree start')
    my_tree = trie_tree.Trie()
    # Read inside a context manager so the handle is always closed.
    with open(filename, 'r') as df:
        lines = df.readlines()
    # NOTE(review): lines are inserted with their trailing newline intact,
    # exactly as before -- confirm that is what trie lookups expect.
    for sent in lines:
        my_tree.insert(sent)
    utils.save_var(my_tree, sav_file)
    print('\n>my address tree save ok')
    return my_tree
def search(self, words_lst):
    """Look up addresses matching a space-separated list of tagged words.

    ``words_lst`` is a string of ``word/tag`` tokens separated by spaces.
    Words sharing a tag are merged with ``&``; the merged words are then
    ordered by ``myconfig.COLUMNS``, inserted into a temporary trie, and
    every resulting path is resolved through ``self.search_one``.  The
    resolved paths are scanned against ``self.full_my_tree`` and the
    matching entries are returned.

    Args:
        words_lst: Space-separated ``word/tag`` tokens.

    Returns:
        The matched entries joined by ``","`` (empty string if none).

    Raises:
        ValueError: If a non-empty token does not contain exactly one ``/``.
    """
    # Drop *all* empty tokens.  ``list.remove("")`` only dropped the first
    # one, so repeated spaces crashed the ``word.split("/")`` unpacking below.
    tokens = [tok for tok in words_lst.split(" ") if tok]
    print(tokens)

    # Group words by tag; several words with the same tag are joined by "&".
    words_dct = {}
    for token in tokens:
        key, value = token.split("/")
        if value in words_dct:
            words_dct[value] += "&" + key
        else:
            words_dct[value] = key

    # Re-emit the groups in the column order given by the configuration.
    words = []
    for key in myconfig.COLUMNS:
        value = words_dct.get(key, 'nan')
        if value == 'nan':
            continue
        words.append("%s/%s" % (value, key))

    tree = trie_tree.Trie()
    tree.part_insert(tree.root, words)
    scanned = []
    tree.scan_child_word(tree.root, scanned)
    word_nodes = [node for node in scanned if node.is_word]
    parents = tree.get_all_parent_tree(word_nodes)

    resolved = []
    for path in parents:
        resolved.append(self.search_one(path.split(' ')))
    print(resolved)

    # De-duplicate while keeping first-seen order so the output is
    # reproducible (a plain ``set`` gave an arbitrary order).
    seen = set()
    unique = []
    for item in resolved:
        if item not in seen:
            seen.add(item)
            unique.append(item)

    formula_result = []
    final_result = []
    for entry in unique:
        parts = [part for part in entry.split(' ') if part]
        # NOTE(review): the scan starts from the temporary trie's root rather
        # than the full tree's root, and ``formula_result`` is carried over
        # between iterations -- both kept as in the original; confirm intent.
        formula_result = self.full_my_tree.scan_nodes(
            [tree.root], parts, formula_result)
        print(formula_result)
        result_child = []
        for node in formula_result:
            self.full_my_tree.scan_child_word(node, result_child)
        final_result.extend(
            self.full_my_tree.get_all_parent_tree(result_child))
    print(final_result)
    return ",".join(final_result)
def gen_std_tree_from_dataframe(data_src, sav_file=myconfig.MY_TREE):
    """Build the standard address tree from a dataframe and save it.

    Each item of ``data_src`` is cleaned with ``remove_nan`` and inserted
    into a new trie via ``part_insert`` starting at the trie root.

    Args:
        data_src: Iterable of items to insert.
            NOTE(review): presumably rows of a dataframe -- confirm with
            the caller.
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        ``myconfig.SUCCESS``.
    """
    print('\n>gen_std_tree_from_dataframe start')
    my_tree = trie_tree.Trie()
    for item in data_src:
        clritem = remove_nan(item)
        print(clritem)
        # The leftover ``pdb.set_trace()`` breakpoint that used to sit here
        # halted the build on every item; it has been removed.
        my_tree.part_insert(my_tree.root, clritem)
    utils.save_var(my_tree, sav_file)
    print('\n>gen_std_tree_from_dataframe ready and save finish')
    return myconfig.SUCCESS
def gen_word_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_WORD):
    """Build a trie of the ``/``-separated words in ``filename`` and save it.

    Args:
        filename: Text file whose lines are split on ``/``.
        sav_file: Destination passed to ``utils.save_var``.

    Returns:
        The populated ``trie_tree.Trie``.
    """
    # The start message previously named gen_address_tree (copy-paste slip).
    print('\n>gen_word_tree start')
    my_tree = trie_tree.Trie()
    # Read inside a context manager so the handle is always closed.
    with open(filename, 'r') as df:
        lines = df.readlines()
    print(len(lines))
    for sent in lines:
        # NOTE(review): the line's trailing newline stays attached to its
        # last word, exactly as before -- confirm that is intended.
        for word in sent.split('/'):
            my_tree.insert(word)
    utils.save_var(my_tree, sav_file)
    print('\n>my address tree save ok')
    return my_tree
def gen_std_tree(filename=myconfig.STDTXTPATH, sav_file=myconfig.MY_TREE, delimeter='/'):
    """Build a trie from delimiter-separated lines of ``filename`` and save it.

    Each line is split on ``delimeter`` and the resulting list is inserted
    into the trie as a single entry.

    Args:
        filename: Text file to read.
        sav_file: Destination passed to ``utils.save_var``.
        delimeter: Separator used to split each line (the spelling is part
            of the public signature and is kept for keyword callers).

    Returns:
        The populated ``trie_tree.Trie``.
    """
    print('\n>gen_std_tree start')
    my_tree = trie_tree.Trie()
    # Read inside a context manager so the handle is always closed.
    with open(filename, 'r') as df:
        lines = df.readlines()
    for sent in lines:
        # NOTE(review): the trailing newline stays attached to the last
        # element of the split, exactly as before -- confirm that is intended.
        my_tree.insert(sent.split(delimeter))
    utils.save_var(my_tree, sav_file)
    print('\n>my std tree save ok')
    return my_tree