def rule_check(src, tar, rule="louhao"):
    """Check whether two strings agree under the named matching rule.

    Both inputs are first normalised with ``utils.clr`` and logged at debug
    level.

    Args:
        src: First string to compare.
        tar: Second string to compare.
        rule: ``"louhao"`` compares the first match of
            ``myconfig.CHECK_RULE_LOUHAO`` in each string;
            ``"jieluxiang"`` compares the last match of
            ``myconfig.CHECK_RULE_JIEDAO`` in each string.

    Returns:
        ``True``/``False`` for the two known rules. For any other rule the
        rule name is printed and ``None`` is returned.
    """
    src = utils.clr(src)
    tar = utils.clr(tar)
    logger.debug("%s %s\n" % (src, tar))

    if rule == "louhao":
        pattern = myconfig.CHECK_RULE_LOUHAO
        src_hits = re.findall(pattern, src)
        tar_hits = re.findall(pattern, tar)
        # findall yields tuples when the pattern has several groups; joining
        # the first hit flattens it into a single comparable string.
        src_key = "".join(src_hits[0]) if src_hits else ""
        tar_key = "".join(tar_hits[0]) if tar_hits else ""
        print(src_key, tar_key, src, tar)
        # Two empty keys mean "nothing found", which must not count as a match.
        return src_key == tar_key and src_key != ""

    if rule == "jieluxiang":
        pattern = myconfig.CHECK_RULE_JIEDAO
        src_hits = re.findall(pattern, src)
        tar_hits = re.findall(pattern, tar)
        if not src_hits or not tar_hits:
            return False
        # Only the last occurrence in each string is compared.
        return src_hits[-1] == tar_hits[-1]

    # Unknown rule: report it and fall through with no verdict.
    print(rule)
    return None
def read_xlrd(filename):
    """Yield match data for every row of the first sheet of a workbook.

    For each row, the values of cell index 14 and cell index 10 are cleaned
    with ``utils.clr`` and passed through ``matrix_build`` and
    ``hugry_match``.

    Args:
        filename: Path of the workbook opened with ``xlrd.open_workbook``.

    Yields:
        ``(data, label, k, v)`` where ``data`` and ``label`` are the first two
        values returned by ``hugry_match`` and ``k``/``v`` are the cleaned
        cell values from indices 14 and 10 respectively.
    """
    workbook = xlrd.open_workbook(filename)
    first_sheet = workbook.sheets()[0]
    for row in first_sheet.get_rows():
        k = utils.clr(row[14].value)
        v = utils.clr(row[10].value)
        # hugry_match returns five values; only the first two are used here.
        data, label, _s, _r, _c = hugry_match(matrix_build(k, v), k, v)
        yield (data, label, k, v)
def read_txt(filename, shuffle):
    """Yield cleaned text lines from a UTF-8 file.

    Each yielded item is the part of a line before its first ``"&"``, passed
    through ``utils.clr``.

    Args:
        filename: Path of the UTF-8 encoded text file.
        shuffle: When truthy, every step yields a line picked at a random
            index instead of the current one. This is sampling with
            replacement, not a permutation: lines may repeat or never appear,
            but the number of yielded items still equals the line count.

    Yields:
        One cleaned string per line in the file.
    """
    # Read everything up front and close the handle immediately rather than
    # leaving it to the garbage collector.
    with codecs.open(filename, "r", "utf-8") as f:
        lines = f.readlines()
    for line in lines:
        if shuffle:
            line = lines[np.random.randint(len(lines))]
        line = line.split("&")[0]
        line = utils.clr(line)
        yield line
def init_ner_train_data(filename):
    """Append character-level, all-``O`` tagged sentences to *filename*.

    Sentences are obtained from ``read_txt(filename, shuffle=True)``. For each
    one, every character is written on its own line as ``"<char> O"``,
    followed by a blank line that separates sentences.

    Args:
        filename: Path that is both read (via ``read_txt``) and appended to.
    """
    gen = read_txt(filename, shuffle=True)
    # NOTE(review): input and output are the same path, so the tagged text is
    # appended after the raw text it was derived from - confirm this is intended.
    # NOTE(review): read_txt decodes UTF-8, while this handle uses the platform
    # default encoding - confirm they agree on the target system.
    with open(filename, "a+") as f:
        for sent in gen:
            sent = utils.clr(sent)
            for char in sent:
                f.write("%s O\n" % char)
            f.write("\n")
def xgboost_train_data_gen(cnt=myconfig.TRAIN_DATA, shuffle=True):
    """Build an xgboost ``DMatrix`` from the files of the dct_level directory.

    Walks ``../address_gy/source/dct_file/dct_level`` and skips every file
    named ``tokens.txt``. Each accepted line is cleaned with ``utils.clr`` and
    encoded with ``trans`` to form one feature row; its label is
    ``dct_trans(<file name up to the first dot>, global_dct)``.

    Args:
        cnt: Not used by this function.
        shuffle: Not used by this function.

    Returns:
        ``(d_train, y_train)``: the ``xgb.DMatrix`` built from the feature rows
        reshaped to ``(-1, myconfig.LENTH_PADDING)``, and the plain list of
        labels.
    """
    X_train, y_train = [], []
    for path, _, files in os.walk("../address_gy/source/dct_file/dct_level"):
        for filename in files:
            if filename == "tokens.txt":
                continue
            # Close each file as soon as it is read; the walk may visit many.
            with open(os.path.join(path, filename), "r") as fh:
                lines = fh.readlines()
            # The label key depends only on the file name, so compute it once.
            label_key = filename.split(".")[0]
            for line in lines:
                # NOTE(review): `k` is never bound in this function and
                # `dct_trans` is called as a function below, so this
                # membership test looks wrong - confirm the intended filter.
                if k not in dct_trans:
                    continue
                line = utils.clr(line)
                X_train.append(trans(line))
                y_train.append(dct_trans(label_key, global_dct))
    d_train = xgb.DMatrix(
        np.array(X_train).reshape(-1, myconfig.LENTH_PADDING),
        label=np.array(y_train).reshape(-1),
    )
    return d_train, y_train
def seperate_zhengz_address(filename):
    """Convert an address file into labelled pairs in the train file.

    Every input line has CR/LF characters, the text ``NONE`` and spaces
    removed, and is then cleaned with ``utils.clr``.

    * A line containing ``ROOT`` is split on it and written to
      ``/home/dell/data/zhengz_train.txt`` as ``"<left> <right> 0"``.
    * Other lines are buffered; once two are buffered, the next non-ROOT line
      causes them to be written as ``"<first> <second> 1"``.

    ``/home/dell/data/zhengz_dev.txt`` is created/truncated but nothing is
    written to it.

    Args:
        filename: Path of the input text file.

    Raises:
        ValueError: If a line does not split into exactly two parts on
            ``ROOT`` (i.e. ``ROOT`` occurs more than once).
    """
    pending = []
    # One `with` for all three handles so the outputs are closed even when
    # reading, cleaning or splitting raises part-way through.
    with open("/home/dell/data/zhengz_train.txt", "w+") as rt, \
            open("/home/dell/data/zhengz_dev.txt", "w+"), \
            open(filename) as f:
        for line in f:
            line = re.sub("[\r\n]", "", line)
            line = re.sub("NONE", "", line)
            line = re.sub(" ", "", line)
            line = utils.clr(line)
            if 'ROOT' in line:
                qua, ans = line.split('ROOT')
                rt.write("%s %s 0\n" % (qua, ans))
            elif len(pending) == 2:
                # NOTE(review): the line that triggers this flush is itself
                # discarded, and a pair still pending at end of file is never
                # written - confirm that the input format makes this correct.
                rt.write("%s %s 1\n" % (pending[0], pending[1]))
                pending = []
            else:
                pending.append(line)