Esempio n. 1
0
def new_cut(line):
    """Tokenise an address line and label the tokens.

    Bumps the module-level ``global_cnt`` call counter, normalises ``line``
    with ``utils.pre_trans``, segments the result with ``jieba.cut`` and
    hands the token list to ``get_words_level``. The normalised sentence is
    stored under the ``'sent'`` key of that result, which is then printed
    and returned.
    """
    global global_cnt
    global_cnt += 1

    tokens = list(jieba.cut(utils.pre_trans(line)))
    levels = get_words_level(tokens)
    # The sentence is normalised a second time here instead of reusing the
    # value computed above; kept that way in case pre_trans is not pure.
    levels['sent'] = utils.pre_trans(line)
    print('new_cut', levels)
    return levels
Esempio n. 2
0
def gen_zhengzhou_tree(dirname=myconfig.ZZ_STD_ADD,
                       sav_file=myconfig.zhengzhou_std_word,
                       sav_file_2=myconfig.zhengzhou_std_tree):
    """Build two tries from the address files under ``dirname`` and save them.

    Every file found by walking ``dirname`` is read, its lines are shuffled,
    and for each line containing a comma the second comma-separated field is
    taken as the raw address. The raw address is inserted into the "word"
    trie; its ``utils.pre_trans`` form is inserted into the "tree" trie. Each
    (normalised, raw) pair is also appended to ``./addr_match.txt``.

    Args:
        dirname: Root directory that is walked recursively for input files.
        sav_file: Destination passed to ``utils.save_var`` for the raw-address
            trie.
        sav_file_2: Destination passed to ``utils.save_var`` for the
            normalised-address trie.
    """
    # The record file is managed by ``with`` so it is closed even when
    # reading or normalising an input line raises.
    with open("./addr_match.txt", 'w+') as addr_kv_rec:
        print('\n>gen_zhengzhou_tree start')
        my_tree = trie_tree.Trie()
        my_word = trie_tree.Trie()
        cnt = 0
        for root, _, fs in os.walk(dirname):
            for f in fs:
                # Join with the directory currently being walked: os.walk
                # descends into subdirectories, so joining with ``dirname``
                # would produce wrong paths for nested files.
                pth = os.path.join(root, f)
                with open(pth, 'r') as src:
                    lines = src.readlines()
                np.random.shuffle(lines)
                for line in lines:
                    if ',' not in line:
                        continue
                    # NOTE(review): readlines() keeps the trailing newline, so
                    # when the address is the last field the raw value still
                    # carries it - confirm whether that is intended.
                    raw_addr = line.split(',')[1]
                    norm_addr = utils.pre_trans(raw_addr)
                    addr_kv_rec.write('%s\t%s\n' % (norm_addr, raw_addr))
                    cnt += 1
                    # Progress indicator: prints at 1, 10001, 20001, ...
                    if cnt % 10000 == 1:
                        print(cnt)
                    my_tree.insert(norm_addr)
                    my_word.insert(raw_addr)
        utils.save_var(my_word, sav_file)
        utils.save_var(my_tree, sav_file_2)
        print('\n>my address tree save ok')
Esempio n. 3
0
 def base_address_formula(self, txt):
     """Split an address string into its level fields using the rule cutter.

     ``txt`` is normalised with ``utils.pre_trans`` and cut by ``new_cut``;
     the cut is merged into an empty base through ``self.complex`` and then
     passed through ``self.double_check``. The returned dict has one entry
     per key in ``level_keys`` (missing keys default to ``''``), with
     ``'楼号'`` always forced to the empty string.

     Side effects: prints the intermediate and final dicts and increments
     the module-level ``r_cnt`` counter.
     """
     global r_cnt, w_cnt

     rule_cut = new_cut(utils.pre_trans(txt))
     merged = self.complex({}, rule_cut)
     print('new_cut', merged)

     checked = self.double_check(merged)
     checked['rw'] = "0"
     # NOTE(review): 'rw' has just been forced to "0", so the w_cnt branch
     # below cannot run at present; kept so the counting logic survives if
     # the override is ever removed.
     if checked['rw'] != "1":
         r_cnt += 1
     else:
         w_cnt += 1

     out_set = {key: checked.get(key, '') for key in level_keys}
     # The building-number field is always blanked, whatever was extracted.
     out_set['楼号'] = ''
     print('out_set', out_set)
     return out_set
Esempio n. 4
0
 def judge(self, k, vin, vout):
     '''
     Choose the value to keep for field ``k`` from two candidates.

     Per the original note, ``vin`` is the "xgb" value and ``vout`` is the
     "rule" value. Both are normalised with ``utils.pre_trans`` first.

     Returns ``vin`` when ``k`` is ``'sent'`` or when the two normalised
     values agree. Otherwise the difference is printed and the normalised
     ``vout`` is returned after collapsing "栋栋" to "栋" - unless it
     mentions Zhengzhou or Henan, in which case ``''`` is returned.
     '''
     vin = utils.pre_trans(vin)
     vout = utils.pre_trans(vout)
     if k in ('sent',) or vin == vout:
         return vin

     print('diff', k, '-', vin, '-', vout)
     # NOTE(review): the original branched on ``vout == ''`` but assigned
     # ``vout`` in both arms, so the rule value always wins here - confirm
     # whether an empty rule value was meant to fall back to ``vin``.
     chosen = utils.pre_trans(re.sub("栋栋", "栋", vout))
     if any(name in chosen for name in ('郑州', '郑州市', '河南省', '河南')):
         return ''
     return chosen
Esempio n. 5
0
 def eval_file(self, f2):
     """Score the address cutter against ``address_formula.json``.

     Loads the JSON mapping stored next to ``CURPATH``, visits its entries in
     random order, skips a fixed set of excluded samples, normalises every
     field with ``utils.pre_trans`` and asks ``self.eval_one`` whether the
     sample is handled correctly. For each failure the original item and the
     prediction from ``self.address_formula`` are appended to
     ``eval_score.txt`` via ``self.wr_lst`` and ``self.show_diff`` is called.
     After every evaluated sample the running right/wrong counts and the
     accuracy are printed.

     Args:
         f2: Forwarded unchanged to ``self.eval_one``.
     """
     right, wrong = 0.0, 0.0
     excluded_communities = ("电子商务园小区", "高天小区", '沙河花园a')

     with open(os.path.join(CURPATH, "address_formula.json"), "r") as f:
         samples = json.load(f)
         order = list(samples)
         np.random.shuffle(order)
         for key in order:
             item = samples[key]
             sentence = item.get('sent', '')
             # Hard-coded exclusions: these samples are never evaluated.
             if ("城基路" in sentence or "鼎益市场" in sentence
                     or item.get('自然村组', '') == "服务中心金关村"
                     or item.get('社区', '') in excluded_communities
                     or item.get('街路巷名', '') == '百花大道5号'):
                 continue

             normalised = {
                 field: utils.pre_trans(value)
                 for field, value in item.items()
             }
             if self.eval_one(normalised, f2):
                 right += 1.0
             else:
                 wrong += 1.0
                 self.wr_lst(['\n', '原文', str(item), '\n'],
                             "eval_score.txt")
                 # The prediction is recomputed from the un-normalised
                 # sentence so it can be logged and diffed.
                 cut_result = self.address_formula(item['sent'])
                 self.wr_lst(['\n', '预测', str(cut_result), '\n'],
                             "eval_score.txt")
                 self.show_diff(item, cut_result, item['sent'])

             # The 0.1 in the denominator is carried over from the original
             # formula.
             acc = right / (right + wrong + 0.1)
             print(right, wrong, acc)
Esempio n. 6
0
def cut(line):
    txt = utils.pre_trans(line)
    cuts = list(jieba.cut(txt))