def new_cut(line):
    """Segment ``line`` with jieba and build its rule dictionary.

    Increments the module-level ``global_cnt`` call counter, normalises the
    input with ``utils.pre_trans``, tokenises it with ``jieba.cut`` and hands
    the token list to ``get_words_level``.  The normalised sentence is stored
    in the result under the ``'sent'`` key.

    Args:
        line: Raw address text.

    Returns:
        The mapping returned by ``get_words_level`` with ``'sent'`` added.
    """
    global global_cnt
    global_cnt += 1

    tokens = list(jieba.cut(utils.pre_trans(line)))
    rule = get_words_level(tokens)
    # The sentence is normalised a second time, from the raw input.
    rule['sent'] = utils.pre_trans(line)

    print('new_cut', rule)
    return rule
def gen_zhengzhou_tree(dirname=myconfig.ZZ_STD_ADD,
                       sav_file=myconfig.zhengzhou_std_word,
                       sav_file_2=myconfig.zhengzhou_std_tree):
    """Build two tries from the address files under ``dirname`` and save them.

    Every file found by walking ``dirname`` is read line by line (in shuffled
    order).  Lines without a comma are skipped; for the rest, the second
    comma-separated field is taken as the raw address.  The raw address is
    inserted into the "word" trie and its ``utils.pre_trans`` form into the
    "tree" trie.  Each (normalised, raw) pair is also written, tab-separated,
    to ``./addr_match.txt``.

    Args:
        dirname: Directory that is walked recursively for input files.
        sav_file: Destination passed to ``utils.save_var`` for the raw trie.
        sav_file_2: Destination passed to ``utils.save_var`` for the
            normalised trie.
    """
    print('\n>gen_zhengzhou_tree start')
    my_tree = trie_tree.Trie()
    my_word = trie_tree.Trie()
    cnt = 0

    # ``with`` guarantees the mapping file is closed even if a read or an
    # insert fails part-way through.
    with open("./addr_match.txt", 'w+') as addr_kv_rec:
        for root, _, fs in os.walk(dirname):
            for f in fs:
                # Join with the directory os.walk is currently visiting, not
                # the top-level one, so files in sub-directories resolve.
                pth = os.path.join(root, str(f))
                with open(pth, 'r') as src:
                    lines = src.readlines()
                np.random.shuffle(lines)
                for line in lines:
                    if ',' not in line:
                        continue
                    _line = line.split(',')[1]
                    line = utils.pre_trans(_line)
                    addr_kv_rec.write('%s\t%s\n' % (str(line), str(_line)))
                    cnt += 1
                    # Progress indicator: prints at 1, 10001, 20001, ...
                    if cnt % 10000 == 1:
                        print(cnt)
                    my_tree.insert(line)
                    my_word.insert(_line)

    utils.save_var(my_word, sav_file)
    utils.save_var(my_tree, sav_file_2)
    print('\n>my address tree save ok')
def base_address_formula(self, txt):
    """Parse an address string into a dict keyed by ``level_keys``.

    The text is normalised with ``utils.pre_trans``, segmented by
    ``new_cut``, then passed through ``self.complex`` and
    ``self.double_check``.  The result is projected onto ``level_keys``
    (missing keys become ``''``) and ``'楼号'`` is always blanked.

    Side effects: increments the module-level ``r_cnt`` counter and prints
    the intermediate and final results.

    Args:
        txt: Raw address text.

    Returns:
        dict with one entry per key in ``level_keys`` plus ``'楼号'`` set
        to ``''``.
    """
    global r_cnt

    txt = utils.pre_trans(txt)
    base_rule = new_cut(txt)
    # The first argument was always an empty dict in this code path.
    out = self.complex({}, base_rule)
    print('new_cut', out)

    outbase = self.double_check(out)
    # 'rw' is unconditionally forced to "0", so every call counts as
    # "right"; the former `rw == "1"` branch that bumped w_cnt could never
    # run and has been removed.
    # NOTE(review): if this override is ever dropped, restore the w_cnt
    # accounting for rw == "1".
    outbase['rw'] = "0"
    r_cnt += 1

    out_set = {k: outbase.get(k, '') for k in level_keys}
    out_set['楼号'] = ''
    print('out_set', out_set)
    return out_set
def judge(self, k, vin, vout):
    """Pick the value to keep for field ``k`` from two candidates.

    Both candidates are normalised with ``utils.pre_trans`` first.

    Args:
        k: Field name.
        vin: First candidate (labelled "xgb" by the original author).
        vout: Second candidate (labelled "rule" by the original author).

    Returns:
        ``vin`` when ``k`` is ``'sent'`` or the candidates agree; otherwise
        the cleaned-up ``vout``, or ``''`` if that value mentions
        郑州 / 河南.
    """
    vin = utils.pre_trans(vin)
    vout = utils.pre_trans(vout)

    if k in ['sent'] or vin == vout:
        return vin

    print('diff', k, '-', vin, '-', vout)

    # On disagreement vout is used whether or not it is empty.
    # NOTE(review): confirm an empty vout should not fall back to vin.
    chosen = utils.pre_trans(re.sub("栋栋", "栋", vout))

    # City/province names are not accepted as a field value here.
    rejected = ('郑州', '郑州市', '河南省', '河南')
    if any(name in chosen for name in rejected):
        return ''
    return chosen
def eval_file(self, f2):
    """Score the parser against ``address_formula.json`` and print accuracy.

    The JSON file (under ``CURPATH``) maps keys to labelled records.  Records
    are visited in shuffled order; a fixed list of known records is skipped.
    Each remaining record is normalised value-by-value with
    ``utils.pre_trans`` and checked with ``self.eval_one``.  For a failing
    record the original, the prediction from ``self.address_formula`` and
    their diff are logged via ``self.wr_lst`` / ``self.show_diff``.

    Prints ``right_count wrong_count accuracy``; accuracy is 0.0 when no
    record was scored.

    Args:
        f2: Passed through unchanged to ``self.eval_one``.
    """
    # Records excluded from scoring: sentence fragments and exact
    # (field, value) matches.
    skip_sent_fragments = ("城基路", "鼎益市场")
    skip_exact = (
        ('自然村组', "服务中心金关村"),
        ('社区', "电子商务园小区"),
        ('社区', "高天小区"),
        ('街路巷名', '百花大道5号'),
        ('社区', '沙河花园a'),
    )

    # Load everything up front so the file is not held open while scoring.
    with open(os.path.join(CURPATH, "address_formula.json"), "r") as f:
        cont = json.load(f)

    klst = list(cont)
    np.random.shuffle(klst)

    rcnt, wcnt = 0.0, 0.0
    for k in klst:
        item = cont[k]
        sent = item.get('sent', '')
        if any(frag in sent for frag in skip_sent_fragments):
            continue
        if any(item.get(field, '') == value for field, value in skip_exact):
            continue

        _item = {name: utils.pre_trans(value) for name, value in item.items()}
        if self.eval_one(_item, f2):
            rcnt += 1.0
            continue

        # Mismatch: record the labelled item, the prediction and their diff.
        wcnt += 1.0
        self.wr_lst(['\n', '原文', str(item), '\n'], "eval_score.txt")
        cut_result = self.address_formula(item['sent'])
        self.wr_lst(['\n', '预测', str(cut_result), '\n'], "eval_score.txt")
        self.show_diff(item, cut_result, item['sent'])

    # Guard the empty case explicitly instead of padding the denominator,
    # which understated the accuracy.
    total = rcnt + wcnt
    acc = rcnt / total if total else 0.0
    print(rcnt, wcnt, acc)
def cut(line): txt = utils.pre_trans(line) cuts = list(jieba.cut(txt))