def searchTable3():
    """Append character-level tagging samples built from HTML table cells.

    For every document returned by ``getDingZengUnion(dz_trainpath)`` the
    matching ``<id>.html`` file is parsed, each ``<table>`` is converted with
    ``table2array`` and every cell is checked against the ``duixiang`` value
    of each record of that document via ``mask_contract_field``.  When at
    least one record matches, the cell text (prefixed with its column header
    and/or row header) is written one character per line as
    ``"<char> <tag>"``, followed by a blank line.

    Documents whose numeric id is divisible by 6 go to ``dz_pk_table.dev``,
    all others to ``dz_pk_table.train`` (both opened in append mode under
    ``dataroot``).
    """
    rank = 6  # every 6th document id goes to the dev split
    # Context managers make sure both output files are flushed and closed,
    # even when parsing one of the documents raises.
    with open(dataroot + 'dz_pk_table.train', 'a+') as dz_train, \
            open(dataroot + 'dz_pk_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            htmlpath = dz_htmlpath + doc_id + '.html'
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train
            with open(htmlpath) as html_file:
                soup = BeautifulSoup(html_file, 'lxml')
            for table in soup.find_all('table'):  # walk every table
                # table2array turns the table into regular 2-D arrays
                for cut in table2array(table):
                    if len(cut) == 0:
                        continue  # nothing to index; cut[0] would fail
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            tag_arr = ['O'] * len(valuecell)
                            top_arr = ['O'] * len(topcell)
                            left_arr = ['O'] * len(leftcell)
                            isTrain = False
                            # No early exit: every record gets the chance to
                            # update tag_arr (presumably mutated in place by
                            # mask_contract_field -- TODO confirm).
                            for dz in dzs:
                                if mask_contract_field(valuecell, dz.duixiang,
                                                       tag_arr, 'DX', dz):
                                    isTrain = True
                            if not isTrain:
                                continue
                            # Prefix the headers as untagged ('O') context.
                            if row == 0 and col == 0:
                                pass
                            elif row == 0:
                                valuecell = leftcell + valuecell
                                tag_arr = left_arr + tag_arr
                            elif col == 0:
                                valuecell = topcell + valuecell
                                tag_arr = top_arr + tag_arr
                            else:
                                valuecell = topcell + leftcell + valuecell
                                tag_arr = top_arr + left_arr + tag_arr
                            makefile.writelines(
                                char + ' ' + tag + '\n'
                                for char, tag in zip(valuecell, tag_arr))
                            makefile.write('\n')
            # NOTE(review): the original file lost its indentation; the debug
            # dump is assumed to run once per document -- confirm.
            for dz in dzs:
                dz.desc()
def showText(before=1):
    """Print sentences that contain record fields and summarise co-occurring words.

    Each document's sentences come from ``levelText_without_table``.  A
    sentence is checked against the ``duixiang`` (DX), ``shuliang`` (SL) and
    ``jine`` (JE) value of every record with ``mask_contract_field``; on a
    hit, the token set of the sentence plus its ``before`` preceding
    sentences is passed to ``setCount`` for that field.  Matching sentences
    are printed, and at the end, for each field, the entries whose collected
    value has more than 9 elements are printed, largest first.

    :param before: number of preceding sentences used as token context.
    """
    dingzengs = getDingZengUnion(dz_trainpath)
    dx_dict = {}
    sl_dict = {}
    je_dict = {}
    # (record attribute, tag name, statistics dict) for each tracked field
    field_specs = (
        ('duixiang', 'DX', dx_dict),
        ('shuliang', 'SL', sl_dict),
        ('jine', 'JE', je_dict),
    )
    for doc_id, dzs in dingzengs.items():
        htmlpath = dz_htmlpath + doc_id + '.html'
        sentences = levelText_without_table(htmlpath)
        for sid, sentence in enumerate(sentences):
            # Concatenate up to `before` sentences preceding this one.
            beforetext = ''.join(sentences[max(0, sid - before):sid])
            ywset = ltp_tokenize_distinct(beforetext + sentence)
            tag_arr = ['O'] * len(sentence)
            isMask = False
            for dz in dzs:
                for attr, tag_name, stats in field_specs:
                    if mask_contract_field(sentence, getattr(dz, attr),
                                           tag_arr, tag_name, dz):
                        setCount(stats, ywset, doc_id)
                        isMask = True
            if isMask:
                print('sid=' + str(sid) + ' sentence:' + sentence)
                print('--------------------------------')

    for stats in (dx_dict, sl_dict, je_dict):
        frequent = {word: seen for word, seen in stats.items() if len(seen) > 9}
        ranked = sorted(frequent.items(),
                        key=lambda entry: len(entry[1]), reverse=True)
        print(
            '--------------------------------------------------------------------------------------------------------'
        )
        for word, seen in ranked:
            print(word + '\t' + str(seen))
def searchTable3(sample=1, enhance=1):
    """Append ``__label__``-prefixed classification samples for table cells.

    Only cells whose column header or row header satisfies ``matchDuixiang``
    are considered.  Such a cell is labelled ``__label__dzpk`` when
    ``hasPK`` matches the ``duixiang`` of any record of the document,
    otherwise ``__label__nothing``.  The sample text is the cell prefixed
    with its headers, tokenised by ``jieba_tokenize`` and space-joined.

    Documents whose numeric id is divisible by 6 are written to
    ``dz_pk_cls_table.dev``, the others to ``dz_pk_cls_table.train`` (append
    mode, current working directory).

    :param sample: how many times a positive sample is written per
        enhancement round (controls the positive/negative ratio).
    :param enhance: number of rounds; every sample is written this many
        times (positives ``enhance * sample`` times).
    """
    rank = 6  # every 6th document id goes to the dev split
    negative = '__label__nothing '
    # Context managers guarantee the output files are flushed and closed.
    with open('dz_pk_cls_table.train', 'a+') as dz_train, \
            open('dz_pk_cls_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            htmlpath = dz_htmlpath + doc_id + '.html'
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train
            with open(htmlpath) as html_file:
                soup = BeautifulSoup(html_file, 'lxml')
            for table in soup.find_all('table'):  # walk every table
                # table2array turns the table into regular 2-D arrays
                for cut in table2array(table):
                    if len(cut) == 0:
                        continue  # nothing to index; cut[0] would fail
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            if not (matchDuixiang(topcell)
                                    or matchDuixiang(leftcell)):
                                continue
                            label = negative
                            if any(hasPK(valuecell, dz.duixiang, 'DX', dz)
                                   for dz in dzs):
                                label = '__label__dzpk '
                            # Prefix the headers as context for the classifier.
                            if row == 0 and col == 0:
                                pass
                            elif row == 0:
                                valuecell = leftcell + valuecell
                            elif col == 0:
                                valuecell = topcell + valuecell
                            else:
                                valuecell = topcell + leftcell + valuecell
                            toline = (label
                                      + ' '.join(jieba_tokenize(valuecell))
                                      + '\n')
                            # Positives are oversampled `sample` times per
                            # round; negatives are written once per round.
                            repeats = sample if label != negative else 1
                            for _ in range(enhance):
                                for _ in range(repeats):
                                    makefile.write(toline)
            # NOTE(review): the original file lost its indentation; the debug
            # dump is assumed to run once per document -- confirm.
            for dz in dzs:
                dz.desc()
def showTable():
    """Print documents whose best table extraction disagrees with the truth.

    Every regular sub-table of every ``<table>`` of a document is run
    through ``table_tag_byrow``.  Results with a positive header weight, a
    positive ``effective`` count and a density
    (``effective / (5 * len(rows))``) above 0.2 are kept as candidates and
    ranked by ``dx_weight * density``.  For documents that have more than one
    ground-truth record and whose top candidate is missing or has a different
    number of rows, the ground-truth records and (if any) the top candidate
    are printed.
    """

    def density_of(effective, rows):
        # Effective-data density; the factor 5 comes from the original code
        # (presumably the number of fields per record -- TODO confirm).
        return float(effective / (5 * len(rows)))

    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        htmlpath = dz_htmlpath + doc_id + '.html'
        # Close the HTML file as soon as it has been parsed.
        with open(htmlpath) as html_file:
            soup = BeautifulSoup(html_file, 'lxml')
        dz_tmp_list_dict = {}
        for t1, table in enumerate(soup.find_all('table')):
            for t2, cut in enumerate(table2array(table)):
                dx_weight, effective, dz_tmp_list, tag = table_tag_byrow(
                    doc_id, cut)
                # An empty row list would make the density division fail.
                if dx_weight > 0 and effective > 0 and len(dz_tmp_list) > 0:
                    if density_of(effective, dz_tmp_list) > 0.2:
                        dz_tmp_list_dict[(t1, t2)] = (dx_weight, effective,
                                                      dz_tmp_list, tag)
        # Rank candidates by header weight x density, best first.
        paixu = sorted(
            dz_tmp_list_dict.items(),
            key=lambda d: d[1][0] * density_of(d[1][1], d[1][2]),
            reverse=True)
        # NOTE(review): the original file lost its indentation; all prints
        # below are assumed to belong to this mismatch branch -- confirm.
        if len(dzs) != 1 and (len(paixu) == 0
                              or len(dzs) != len(paixu[0][1][2])):
            for dz in dzs:
                print('HTMLID= {} |DX= {} |数量= {} |金额= {} |锁定= {} |认购= {} '.
                      format(dz.id, dz.duixiang, dz.shuliang, dz.jine,
                             dz.suoding, dz.rengou))
            print(
                '以上结果为真实------------------------------------------------------------------------------------------------'
            )
            if len(paixu) > 0:
                dx_weight, effective, dz_tmp_list, tag = paixu[0][1]
                for tmp in dz_tmp_list:
                    tmp.desc()
                print('tag=( {} ),dx_weight=( {} ),effective=( {} )'.format(
                    tag, dx_weight, str(effective)))
                print(
                    '以上结果为预测------------------------------------------------------------------------------------------------'
                )
            print('\n')
def maketrain(before=0):
    """Append character-level tagging samples built from document sentences.

    Each sentence from ``levelText_without_table`` (prefixed with up to
    ``before`` preceding sentences) is checked for the DX, SL, JE and SD
    fields of every record of the document.  A field is only tagged when the
    corresponding ``package.reg_*_table`` check accepts the text and
    ``mask_contract_field`` finds the value.  Texts with at least one hit
    are written as ``"<char> <tag>"`` lines followed by a blank line.

    Documents whose numeric id is divisible by 6 go to
    ``dz_all_text.dev``, the rest to ``dz_all_text.train`` (append mode,
    under ``dataroot``).

    :param before: number of preceding sentences prepended as context.
    """
    rank = 6  # every 6th document id goes to the dev split
    # Context managers guarantee the output files are flushed and closed.
    with open(dataroot + 'dz_all_text.train', 'a+') as dz_train, \
            open(dataroot + 'dz_all_text.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train
            htmlpath = dz_htmlpath + doc_id + '.html'
            sentences = levelText_without_table(htmlpath)
            for sid, sentence in enumerate(sentences):
                beforetext = ''.join(sentences[max(0, sid - before):sid])
                context = beforetext + sentence
                # Normalise spelled-out lock-up periods to digits.
                context = context.replace('十二个月', '12个月')
                # Fixed: 36 months used to be rewritten to '12个月'.
                context = context.replace('三十六个月', '36个月')
                tag_arr = ['O'] * len(context)
                isMask = False
                for dz in dzs:
                    if package.reg_dx_table(context) and mask_contract_field(
                            context, dz.duixiang, tag_arr, 'DX', dz, False):
                        isMask = True
                    if package.reg_sl_table(context) and mask_contract_field(
                            context, dz.shuliang, tag_arr, 'SL', dz, True):
                        isMask = True
                    if package.reg_je_table(context) and mask_contract_field(
                            context, dz.jine, tag_arr, 'JE', dz, True):
                        isMask = True
                    if package.reg_sd_table(context) and mask_contract_field(
                            context, dz.suoding, tag_arr, 'SD', dz, False):
                        isMask = True
                    # The RG (rengou) field is intentionally not tagged here.
                if isMask:
                    makefile.writelines(
                        char + ' ' + tag + '\n'
                        for char, tag in zip(context, tag_arr))
                    makefile.write('\n')
            # NOTE(review): the original file lost its indentation; the
            # separator and debug dump are assumed to run once per document
            # -- confirm.
            print('--------------------------------')
            for dz in dzs:
                dz.desc()
def catTable():
    """Count which table headers accompany cells containing a DX value.

    For every table cell for which ``hasAtt`` reports the ``duixiang`` of a
    record, the cell's column header and row header are counted once per
    matching record, provided their length is between 2 and 7 characters.
    Headers counted more than 9 times are printed, most frequent first, as
    ``"<header>\\t<count>"``.
    """
    trigger_dict = {}

    def count_header(header):
        # Only short, non-trivial headers are useful as trigger words.
        if 1 < len(header) < 8:
            trigger_dict[header] = trigger_dict.get(header, 0) + 1

    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        htmlpath = dz_htmlpath + doc_id + '.html'
        # Close the HTML file as soon as it has been parsed.
        with open(htmlpath) as html_file:
            soup = BeautifulSoup(html_file, 'lxml')
        for table in soup.find_all('table'):
            for cut in table2array(table):  # regular 2-D arrays
                if len(cut) == 0:
                    continue  # nothing to index; cut[0] would fail
                cols = len(cut[0])
                for row in range(len(cut)):
                    for col in range(cols):
                        valuecell = cut[row][col]
                        topcell = cut[0][col]
                        leftcell = cut[row][0]
                        for dz in dzs:
                            # Alternative attribute checks kept from the
                            # original for switching the analysed field:
                            # hasAtt(valuecell, dz.shuliang, 'SL', dz, True, topcell, leftcell, False)
                            # hasAtt(valuecell, dz.jine, 'JE', dz, True, topcell, leftcell, False)
                            # hasAtt(valuecell, dz.suoding, 'SD', dz, False, topcell, leftcell, True)
                            # hasAtt(valuecell, dz.rengou, 'RG', dz, False, topcell, leftcell, True)
                            if hasAtt(valuecell, dz.duixiang, 'DX', dz, False,
                                      None, None, False):
                                count_header(topcell)
                                count_header(leftcell)
    guolv = {k: v for k, v in trigger_dict.items() if v > 9}
    paixu = sorted(guolv.items(), key=lambda d: d[1], reverse=True)
    print(
        '--------------------------------------------------------------------------------------------------------'
    )
    for header, count in paixu:
        print(header + '\t' + str(count))
def catTable():
    """Count the headers returned by ``locateAll`` for each record's DX value.

    Every non-empty ``table2array`` result is split with ``cutTable``; for
    each piece and each record whose ``duixiang`` is not the
    ``'fddcUndefined'`` placeholder, ``locateAll`` yields triggers whose
    first two items are treated as the top and left header.  Headers with a
    length between 2 and 7 characters are counted; those counted more than 9
    times are printed (as a list, then one ``"<header>\\t<count>"`` per
    line), most frequent first.
    """
    trigger_dict = {}

    def count_header(header):
        # Only short, non-trivial headers are useful as trigger words.
        if 1 < len(header) < 8:
            trigger_dict[header] = trigger_dict.get(header, 0) + 1

    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        htmlpath = dz_htmlpath + doc_id + '.html'
        # Close the HTML file as soon as it has been parsed.
        with open(htmlpath) as html_file:
            soup = BeautifulSoup(html_file, 'lxml')
        for table in soup.find_all('table'):
            t2a = table2array(table)
            if len(t2a) == 0:
                continue
            for cut in cutTable(t2a):
                for dz in dzs:
                    val = dz.duixiang
                    if val == 'fddcUndefined':
                        continue
                    # An empty result simply yields no iterations.
                    for trigger in locateAll(cut, val):
                        count_header(trigger[0])  # top header
                        count_header(trigger[1])  # left header
    guolv = {k: v for k, v in trigger_dict.items() if v > 9}
    paixu = sorted(guolv.items(), key=lambda d: d[1], reverse=True)
    print(paixu)
    print('---------------------------------------------------------')
    for header, count in paixu:
        print(header + '\t' + str(count))
def searchTable3(sample=1, enhance=1):
    """Append multi-label ``__label__`` samples for every table cell.

    A cell receives ``__label__dzdx`` / ``__label__dzsl`` / ``__label__dzje``
    when ``hasAtt`` finds the corresponding record value (``duixiang``,
    ``shuliang``, ``jine``) in the cell but not in the concatenated headers
    ``topcell + leftcell``.  Cells without any label get
    ``__label__nothing``.  The sample text is the cell prefixed with its
    headers, tokenised by ``jieba_tokenize`` and space-joined.

    Documents whose numeric id is divisible by 6 are written to
    ``dz_att_cls_table.dev``, the others to ``dz_att_cls_table.train``
    (append mode, current working directory).

    :param sample: unused by this variant; kept for signature compatibility.
    :param enhance: unused by this variant; kept for signature compatibility.
    """
    rank = 6  # every 6th document id goes to the dev split
    # Context managers guarantee the output files are flushed and closed.
    with open('dz_att_cls_table.train', 'a+') as dz_train, \
            open('dz_att_cls_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            htmlpath = dz_htmlpath + doc_id + '.html'
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train
            with open(htmlpath) as html_file:
                soup = BeautifulSoup(html_file, 'lxml')
            for table in soup.find_all('table'):  # walk every table
                # table2array turns the table into regular 2-D arrays
                for cut in table2array(table):
                    if len(cut) == 0:
                        continue  # nothing to index; cut[0] would fail
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            headers = topcell + leftcell
                            labels = set()
                            for dz in dzs:
                                if hasAtt(valuecell, dz.duixiang, 'DX', dz,
                                          False, None, None, False) \
                                        and not hasAtt(headers, dz.duixiang,
                                                       'DX', dz, False, None,
                                                       None, False):
                                    labels.add('__label__dzdx')
                                if hasAtt(valuecell, dz.shuliang, 'SL', dz,
                                          True, topcell, leftcell, False) \
                                        and not hasAtt(headers, dz.shuliang,
                                                       'SL', dz, True, topcell,
                                                       leftcell, False):
                                    labels.add('__label__dzsl')
                                # Fixed: the header exclusion used to test
                                # dz.shuliang / 'SL' here (copy-paste slip).
                                if hasAtt(valuecell, dz.jine, 'JE', dz,
                                          True, topcell, leftcell, False) \
                                        and not hasAtt(headers, dz.jine,
                                                       'JE', dz, True, topcell,
                                                       leftcell, False):
                                    labels.add('__label__dzje')
                            if not labels:
                                labels.add('__label__nothing')
                            # Prefix the headers as context for the classifier.
                            if row == 0 and col == 0:
                                pass
                            elif row == 0:
                                valuecell = leftcell + valuecell
                            elif col == 0:
                                valuecell = topcell + valuecell
                            else:
                                valuecell = topcell + leftcell + valuecell
                            # Sorted so the label order is reproducible
                            # across runs (set order is not).
                            toline = (' '.join(sorted(labels)) + ' '
                                      + ' '.join(jieba_tokenize(valuecell))
                                      + '\n')
                            makefile.write(toline)
            # NOTE(review): the original file lost its indentation; the debug
            # dump is assumed to run once per document -- confirm.
            for dz in dzs:
                dz.desc()
def evaluate():
    """Run the tagging model over dev-split tables and print running counts.

    The model is restored with ``create_model`` inside a TensorFlow session.
    For every document whose numeric id is divisible by 6, each table cell
    (prefixed with its headers) accepted by ``clsDuixiang`` is tagged with
    ``model.evaluate_line``.  Predicted DX / SL / JE entities are collected
    into sets (``*_unit10k`` values are multiplied by 10000 when they match
    ``pattern_isnum``) and compared with the document's ground-truth sets by
    ``p_r``.  The three values returned by ``p_r`` are accumulated and
    printed after each document together with the truth and predicted sets.
    """
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # NOTE(review): pickle.load can execute arbitrary code; the map file must
    # come from a trusted source.
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    rank = 6  # documents with id % rank == 0 form the dev split
    undefined = 'fddcUndefined'  # placeholder for a missing field value
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        dingzengs = getDingZengUnion(dz_trainpath)
        TDX = T_PDX = T_RDX = 0
        TSL = T_PSL = T_RSL = 0
        TJE = T_PJE = T_RJE = 0
        for doc_id, dzs in dingzengs.items():
            if int(doc_id) % rank != 0:
                continue
            htmlpath = dz_htmlpath + doc_id + '.html'
            pre_dxs = set()
            pre_sls = set()
            pre_jes = set()
            # Close the HTML file as soon as it has been parsed.
            with open(htmlpath) as html_file:
                soup = BeautifulSoup(html_file, 'lxml')
            for table in soup.find_all('table'):  # walk every table
                # table2array turns the table into regular 2-D arrays
                for cut in table2array(table):
                    if len(cut) == 0:
                        continue  # nothing to index; cut[0] would fail
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            # Prefix the headers as context for the model.
                            if row == 0 and col == 0:
                                pass
                            elif row == 0:
                                valuecell = leftcell + valuecell
                            elif col == 0:
                                valuecell = topcell + valuecell
                            else:
                                valuecell = topcell + leftcell + valuecell
                            if not clsDuixiang(valuecell):
                                continue
                            result = model.evaluate_line(
                                sess, input_from_line(valuecell, char_to_id),
                                id_to_tag)
                            # Tolerate a missing 'entities' entry.
                            for ent in result.get('entities') or []:
                                ent_type = ent['type']
                                word = ent['word']
                                if ent_type == 'DX':
                                    pre_dxs.add(word)
                                elif ent_type == 'SL_unit10k':
                                    if pattern_isnum.match(word):
                                        pre_sls.add(
                                            str(Decimal(word) * 10000))
                                elif ent_type == 'SL_unit1':
                                    pre_sls.add(word)
                                elif ent_type == 'JE_unit10k':
                                    if pattern_isnum.match(word):
                                        pre_jes.add(
                                            str(Decimal(word) * 10000))
                                elif ent_type == 'JE_unit1':
                                    pre_jes.add(word)
            truthdxs = {dz.duixiang for dz in dzs
                        if dz.duixiang != undefined}
            truthsls = {dz.shuliang for dz in dzs
                        if dz.shuliang != undefined}
            truthjes = {dz.jine for dz in dzs if dz.jine != undefined}
            tdx, total_pdx, total_rdx = p_r(pre_dxs, truthdxs)
            tsl, total_psl, total_rsl = p_r(pre_sls, truthsls)
            tje, total_pje, total_rje = p_r(pre_jes, truthjes)
            TDX += tdx
            T_PDX += total_pdx
            T_RDX += total_rdx
            TSL += tsl
            T_PSL += total_psl
            T_RSL += total_rsl
            TJE += tje
            T_PJE += total_pje
            T_RJE += total_rje
            # NOTE(review): the original file lost its indentation; the
            # running totals are assumed to be printed per dev document
            # -- confirm.
            print('TDX={},T_PDX={},T_RDX={}'.format(TDX, T_PDX, T_RDX))
            print('TSL={},T_PSL={},T_RSL={}'.format(TSL, T_PSL, T_RSL))
            print('TJE={},T_PJE={},T_RJE={}'.format(TJE, T_PJE, T_RJE))
            print(truthdxs)
            print(pre_dxs)
            print(truthsls)
            print(pre_sls)
            print(truthjes)
            print(pre_jes)
            print('---------------------------------------------------')
def eval_dz():
    """Compare predicted records with the reference records and log scores.

    Reference records come from ``getDingZengUnion(dz_trainpath)``,
    predictions from ``getDingZengUnion(dz_testpath)``; both are keyed by
    document id.  For each reference document the reference and predicted
    rows are logged, ``pos``/``act`` counters are updated per field, and rows
    sharing a primary key (as grouped by ``dz_withpk``) are compared with
    ``cor`` for the SL, JE, SD and RG fields.  Finally the counters,
    the number of predicted rows for unknown document ids and the per-field
    ``f1`` results plus the overall ``score`` are logged.
    """
    dzlogger = get_logger(dzlog_path)
    trains = getDingZengUnion(dz_trainpath)
    tests = getDingZengUnion(dz_testpath)
    posid = posdx = possl = posje = possd = posrg = 0
    actid = actdx = actsl = actje = actsd = actrg = 0
    # corid / cordx are never updated below; they are only reported.
    corid = cordx = corsl = corje = corsd = corrg = 0

    for doc_id, gold_rows in trains.items():
        pred_rows = tests.get(doc_id)
        gold_by_pk = dz_withpk(gold_rows)
        pred_by_pk = dz_withpk(pred_rows)

        # Reference side: count and log every row.
        for rec in gold_rows:
            posid = pos(rec.id, posid)
            posdx = pos(rec.duixiang, posdx)
            possl = pos(rec.shuliang, possl)
            posje = pos(rec.jine, posje)
            possd = pos(rec.suoding, possd)
            posrg = pos(rec.rengou, posrg)
            dzlogger.info('id={},dx={},sl={},je={},sd={},rg={}'.format(
                rec.id, rec.duixiang, rec.shuliang, rec.jine, rec.suoding,
                rec.rengou))
        dzlogger.info(
            '------------------------------以上为训练数据------------------------------'
        )

        same_size = pred_rows is not None and len(gold_rows) == len(pred_rows)
        if same_size:
            dzlogger.info('id={},数量一致'.format(doc_id))
        else:
            dzlogger.info('id={},数量不一致'.format(doc_id))

        # Prediction side: count and log every row, if the document was found.
        if pred_rows is None:
            dzlogger.info('id={},未能识别'.format(doc_id))
        else:
            for rec in pred_rows:
                actid = act(rec.id, actid)
                actdx = act(rec.duixiang, actdx)
                actsl = act(rec.shuliang, actsl)
                actje = act(rec.jine, actje)
                actsd = act(rec.suoding, actsd)
                actrg = act(rec.rengou, actrg)
                dzlogger.info('id={},dx={},sl={},je={},sd={},rg={}'.format(
                    rec.id, rec.duixiang, rec.shuliang, rec.jine,
                    rec.suoding, rec.rengou))

        # Field-level comparison of rows that share a primary key.
        for pk, gold in gold_by_pk.items():
            pred = pred_by_pk.get(pk)
            if pred is None:
                dzlogger.info('pk={},主键未能识别'.format(pk))
                continue
            # The second value returned by cor() is not used here.
            corsl, _ = cor(gold.shuliang, pred.shuliang, corsl)
            corje, _ = cor(gold.jine, pred.jine, corje)
            corsd, _ = cor(gold.suoding, pred.suoding, corsd)
            corrg, _ = cor(gold.rengou, pred.rengou, corrg)
        dzlogger.info(
            '------------------------------以上为预测数据------------------------------\n\n\n'
        )

    # Predicted rows whose document id has no reference entry.
    disaccord = 0
    for doc_id, pred_rows in tests.items():
        if trains.get(doc_id) is None:
            disaccord += len(pred_rows)
            dzlogger.info('id={},ID识别不一致'.format(doc_id))

    dzlogger.info(
        'posid={}, posdx={}, possl={}, posje={}, possd={}, posrg={}'.format(
            posid, posdx, possl, posje, possd, posrg))
    dzlogger.info(
        'actid={}, actdx={}, actsl={}, actje={}, actsd={}, actrg={}'.format(
            actid, actdx, actsl, actje, actsd, actrg))
    dzlogger.info(
        'corid={}, cordx={}, corsl={}, corje={}, corsd={}, corrg={}'.format(
            corid, cordx, corsl, corje, corsd, corrg))
    dzlogger.info('disaccord={}'.format(disaccord))

    p_sl, r_sl, f_sl = f1(corsl, possl, actsl)
    p_je, r_je, f_je = f1(corje, posje, actje)
    p_sd, r_sd, f_sd = f1(corsd, possd, actsd)
    p_rg, r_rg, f_rg = f1(corrg, posrg, actrg)
    dzlogger.info('SL: p={},r={},f1={}'.format(p_sl, r_sl, f_sl))
    dzlogger.info('JE: p={},r={},f1={}'.format(p_je, r_je, f_je))
    dzlogger.info('SD: p={},r={},f1={}'.format(p_sd, r_sd, f_sd))
    dzlogger.info('RG: p={},r={},f1={}'.format(p_rg, r_rg, f_rg))
    dzlogger.info('score={}'.format(score([f_sl, f_je, f_sd, f_rg])))