def searchTable3():
    """Append character-level 'DX' tagging samples built from HTML tables.

    For every document returned by ``getDingZengUnion(dz_trainpath)`` each
    table cell is checked against the ``duixiang`` of every record of that
    document.  A cell with at least one hit is written as one
    ``<char> <tag>`` pair per line followed by a blank line, prefixed with
    its column header (first row of the cut) and/or row header (first
    column of the cut); header characters are always tagged ``'O'``.

    Documents whose numeric id is a multiple of 6 are appended to
    ``dz_pk_table.dev``, all others to ``dz_pk_table.train`` (both under
    ``dataroot``).  The output files and every HTML file are closed when
    the function returns or raises.
    """
    rank = 6  # ids divisible by this go to the dev split
    with open(dataroot + 'dz_pk_table.train', 'a+') as dz_train, \
            open(dataroot + 'dz_pk_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train

            # BeautifulSoup consumes the handle in its constructor, so the
            # file can be closed right after parsing.
            with open(dz_htmlpath + doc_id + '.html') as html:
                soup = BeautifulSoup(html, 'lxml')

            for table in soup.find_all('table'):  # iterate over all tables
                # table2array converts the table into 2-D arrays; each
                # "cut" is one regular row/column array.
                for cut in table2array(table):
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            tag_arr = ['O'] * len(valuecell)

                            # Every record is tried (no early exit):
                            # mask_contract_field receives tag_arr and
                            # presumably writes the 'DX' tags into it.
                            is_train = False
                            for dz in dzs:
                                if mask_contract_field(valuecell, dz.duixiang,
                                                       tag_arr, 'DX', dz):
                                    is_train = True
                            if not is_train:
                                continue

                            # Prefix the headers that are distinct from the
                            # cell itself.
                            if row != 0 and col != 0:
                                header = topcell + leftcell
                            elif row != 0:
                                header = topcell
                            elif col != 0:
                                header = leftcell
                            else:
                                header = None
                            if header is not None:
                                valuecell = header + valuecell
                                tag_arr = ['O'] * len(header) + tag_arr

                            for char, tag in zip(valuecell, tag_arr):
                                makefile.write(char + ' ' + tag + '\n')
                            makefile.write('\n')

            for dz in dzs:
                dz.desc()
def showText(before=1):
    """Print sentences that contain a known field and per-field token statistics.

    Each non-table sentence of every document is checked against the
    ``duixiang`` ('DX'), ``shuliang`` ('SL') and ``jine`` ('JE') of the
    document's records.  On a hit, ``setCount`` is called with the
    matching per-field dictionary, the token set of the sentence context
    and the document id.  Finally, for each of the three dictionaries, the
    entries whose value has more than 9 elements are printed in descending
    order of that length.

    :param before: number of preceding sentences prepended to the sentence
        when building the context that is tokenised.
    """
    separator = '--------------------------------------------------------------------------------------------------------'
    dingzengs = getDingZengUnion(dz_trainpath)
    dx_dict, sl_dict, je_dict = {}, {}, {}

    for id in dingzengs.keys():
        records = dingzengs[id]
        sentences = levelText_without_table(dz_htmlpath + id + '.html')
        for sid in range(len(sentences)):
            sentence = sentences[sid]
            # Up to `before` preceding sentences, never reaching before
            # the start of the document.
            beforetext = ''.join(sentences[max(sid - before, 0):sid])
            ywset = ltp_tokenize_distinct(beforetext + sentence)
            tag_arr = ['O'] * len(sentence)

            matched = False
            for dz in records:
                if mask_contract_field(sentence, dz.duixiang, tag_arr, 'DX',
                                       dz):
                    setCount(dx_dict, ywset, id)
                    matched = True
                if mask_contract_field(sentence, dz.shuliang, tag_arr, 'SL',
                                       dz):
                    setCount(sl_dict, ywset, id)
                    matched = True
                if mask_contract_field(sentence, dz.jine, tag_arr, 'JE', dz):
                    setCount(je_dict, ywset, id)
                    matched = True

            if matched:
                print('sid=' + str(sid) + ' sentence:' + sentence)
        print('--------------------------------')

    # One report per field dictionary, in the order DX, SL, JE.
    for counts in (dx_dict, sl_dict, je_dict):
        frequent = [(k, v) for k, v in counts.items() if len(v) > 9]
        frequent = sorted(frequent, key=lambda kv: len(kv[1]), reverse=True)
        print(separator)
        for key, value in frequent:
            print(key + '\t' + str(value))
def searchTable3(sample=1, enhance=1):
    """Append ``__label__``-prefixed classification samples built from table cells.

    Only cells whose column header or row header satisfies
    ``matchDuixiang`` are used.  Such a cell is labelled
    ``__label__dzpk`` when ``hasPK`` matches the ``duixiang`` of any
    record of the document, otherwise ``__label__nothing``.  The sample
    text is the cell prefixed with its headers, tokenised with
    ``jieba_tokenize`` and joined with single spaces.

    Documents whose numeric id is a multiple of 6 are appended to
    ``dz_pk_cls_table.dev``, all others to ``dz_pk_cls_table.train``
    (relative to the current working directory).  All files are closed
    when the function returns or raises.

    :param sample: how many times a positive line is written per
        ``enhance`` round (tunes the positive/negative ratio).
    :param enhance: how many rounds every line is written.
    """
    rank = 6  # ids divisible by this go to the dev split
    negative = '__label__nothing '
    positive = '__label__dzpk '

    with open('dz_pk_cls_table.train', 'a+') as dz_train, \
            open('dz_pk_cls_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train

            # BeautifulSoup consumes the handle in its constructor, so the
            # file can be closed right after parsing.
            with open(dz_htmlpath + doc_id + '.html') as html:
                soup = BeautifulSoup(html, 'lxml')

            for table in soup.find_all('table'):  # iterate over all tables
                # table2array converts the table into 2-D arrays; each
                # "cut" is one regular row/column array.
                for cut in table2array(table):
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]

                            if not (matchDuixiang(topcell)
                                    or matchDuixiang(leftcell)):
                                continue

                            # any() stops at the first matching record.
                            is_pk = any(
                                hasPK(valuecell, dz.duixiang, 'DX', dz)
                                for dz in dzs)
                            label = positive if is_pk else negative

                            # Prefix the headers that are distinct from the
                            # cell itself.
                            if row != 0 and col != 0:
                                valuecell = topcell + leftcell + valuecell
                            elif row != 0:
                                valuecell = topcell + valuecell
                            elif col != 0:
                                valuecell = leftcell + valuecell

                            toline = (label
                                      + ' '.join(jieba_tokenize(valuecell))
                                      + '\n')

                            # Positives: enhance * sample copies;
                            # negatives: enhance copies.
                            for _ in range(enhance):
                                if is_pk:
                                    for _ in range(sample):
                                        makefile.write(toline)
                                else:
                                    makefile.write(toline)

            for dz in dzs:
                dz.desc()
Exemple #4
0
def showTable():
    """Print ground truth next to the best table prediction for suspicious documents.

    Every cut of every table of a document is scored with
    ``table_tag_byrow``.  A cut is kept as a candidate when its header
    weight and effective count are positive and its density
    ``effective / (5 * len(dz_tmp_list))`` exceeds 0.2.  Candidates are
    ranked by ``dx_weight * density``.

    A report is printed only for documents that have more or fewer than
    one ground-truth record and for which either no candidate exists or
    the best candidate holds a different number of records than the
    ground truth.
    """
    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        # BeautifulSoup consumes the handle in its constructor, so the file
        # can be closed right after parsing.
        with open(dz_htmlpath + doc_id + '.html') as html:
            soup = BeautifulSoup(html, 'lxml')

        # Each candidate is (dx_weight, effective, dz_tmp_list, tag).
        candidates = []
        for table in soup.find_all('table'):
            for cut in table2array(table):
                dx_weight, effective, dz_tmp_list, tag = table_tag_byrow(
                    doc_id, cut)
                if dx_weight > 0 and effective > 0:
                    # density of effective data
                    density = float(effective / (5 * len(dz_tmp_list)))
                    if density > 0.2:
                        candidates.append(
                            (dx_weight, effective, dz_tmp_list, tag))

        # Rank by header weight x density, best first.
        paixu = sorted(
            candidates,
            key=lambda c: c[0] * float(c[1] / (5 * len(c[2]))),
            reverse=True)

        if len(dzs) == 1:
            continue
        if paixu and len(dzs) == len(paixu[0][2]):
            continue

        for dz in dzs:
            print('HTMLID= {} |DX= {} |数量= {} |金额= {} |锁定= {} |认购= {} '.
                  format(dz.id, dz.duixiang, dz.shuliang, dz.jine,
                         dz.suoding, dz.rengou))
        print(
            '以上结果为真实------------------------------------------------------------------------------------------------'
        )
        if paixu:
            dx_weight, effective, dz_tmp_list, tag = paixu[0]
            for tmp in dz_tmp_list:
                tmp.desc()
            print('tag=( {} ),dx_weight=( {} ),effective=( {} )'.format(
                tag, dx_weight, str(effective)))
        print(
            '以上结果为预测------------------------------------------------------------------------------------------------'
        )
        print('\n')
def maketrain(before=0):
    """Append character-level tagging samples built from non-table sentences.

    For each sentence a context is built from up to ``before`` preceding
    sentences plus the sentence itself.  The Chinese numerals for twelve
    and thirty-six months are normalised to ``12个月`` / ``36个月``.  A
    field is tagged only when the matching ``package.reg_*_table`` check
    accepts the context and ``mask_contract_field`` reports a hit.
    Contexts with at least one hit are written as one ``<char> <tag>``
    pair per line followed by a blank line.

    Documents whose numeric id is a multiple of 6 are appended to
    ``dz_all_text.dev``, all others to ``dz_all_text.train`` (both under
    ``dataroot``).  Both files are closed when the function returns or
    raises.

    :param before: number of preceding sentences to prepend to each
        sentence.
    """
    rank = 6  # ids divisible by this go to the dev split
    # (context pre-check, record attribute, tag, last positional flag passed
    # to mask_contract_field).  RG (rengou) tagging was disabled in the
    # original code and stays disabled.
    field_checks = (
        (package.reg_dx_table, 'duixiang', 'DX', False),
        (package.reg_sl_table, 'shuliang', 'SL', True),
        (package.reg_je_table, 'jine', 'JE', True),
        (package.reg_sd_table, 'suoding', 'SD', False),
    )

    with open(dataroot + 'dz_all_text.train', 'a+') as dz_train, \
            open(dataroot + 'dz_all_text.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train
            sentences = levelText_without_table(
                dz_htmlpath + doc_id + '.html')

            for sid, sentence in enumerate(sentences):
                # Up to `before` preceding sentences, never reaching before
                # the start of the document.
                beforetext = ''.join(sentences[max(sid - before, 0):sid])
                context = beforetext + sentence
                context = context.replace('十二个月', '12个月')
                # Fixed: thirty-six months was previously rewritten to
                # '12个月', which mislabels 36-month periods.
                context = context.replace('三十六个月', '36个月')

                tag_arr = ['O'] * len(context)
                is_mask = False
                for dz in dzs:
                    for accepts, attr, tag, flag in field_checks:
                        if accepts(context) and mask_contract_field(
                                context, getattr(dz, attr), tag_arr, tag, dz,
                                flag):
                            is_mask = True

                if is_mask:
                    for char, tag in zip(context, tag_arr):
                        makefile.write(char + ' ' + tag + '\n')
                    makefile.write('\n')

            print('--------------------------------')
            for dz in dzs:
                dz.desc()
Exemple #6
0
def catTable():
    """Count which table headers accompany cells that contain a known ``duixiang``.

    For every table cell and every record of the document, ``hasAtt`` is
    asked whether the cell holds the record's ``duixiang``.  On a hit the
    cell's column header (first row of the cut) and row header (first
    column of the cut) are each counted once, provided the header is 2 to
    7 characters long.  Headers counted more than 9 times are printed in
    descending order of count as ``<header>\\t<count>``.
    """
    trigger_dict = {}
    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        # BeautifulSoup consumes the handle in its constructor, so the file
        # can be closed right after parsing.
        with open(dz_htmlpath + doc_id + '.html') as html:
            soup = BeautifulSoup(html, 'lxml')

        for table in soup.find_all('table'):
            # each "cut" is one regular row/column array
            for cut in table2array(table):
                cols = len(cut[0])
                for row in range(len(cut)):
                    for col in range(cols):
                        valuecell = cut[row][col]
                        topcell = cut[0][col]
                        leftcell = cut[row][0]
                        for dz in dzs:
                            if not hasAtt(valuecell, dz.duixiang, 'DX', dz,
                                          False, None, None, False):
                                continue
                            # Counted once per matching record, so a header
                            # can gain several counts from a single cell.
                            for header in (topcell, leftcell):
                                if 1 < len(header) < 8:
                                    trigger_dict[header] = (
                                        trigger_dict.get(header, 0) + 1)

    guolv = {k: v for k, v in trigger_dict.items() if v > 9}
    paixu = sorted(guolv.items(), key=lambda d: d[1], reverse=True)
    print(
        '--------------------------------------------------------------------------------------------------------'
    )
    for header, count in paixu:
        print(header + '\t' + str(count))
def catTable():
    """Count the headers that ``locateAll`` reports for known ``duixiang`` values.

    Each non-empty ``table2array`` result is split with ``cutTable``.  For
    every cut and every record whose ``duixiang`` is not the placeholder
    ``'fddcUndefined'``, ``locateAll(cut, duixiang)`` is called; the first
    two items of each returned trigger are treated as top and left header
    and counted when they are 2 to 7 characters long.  Headers counted
    more than 9 times are printed, first as a list and then one per line
    as ``<header>\\t<count>``, in descending order of count.
    """
    trigger_dict = {}
    dingzengs = getDingZengUnion(dz_trainpath)
    for doc_id, dzs in dingzengs.items():
        # BeautifulSoup consumes the handle in its constructor, so the file
        # can be closed right after parsing.
        with open(dz_htmlpath + doc_id + '.html') as html:
            soup = BeautifulSoup(html, 'lxml')

        for table in soup.find_all('table'):
            t2a = table2array(table)
            if len(t2a) == 0:
                continue
            for cut in cutTable(t2a):
                for dz in dzs:
                    val = dz.duixiang
                    if val == 'fddcUndefined':
                        continue
                    for trigger in locateAll(cut, val):
                        # trigger[0] is the top header, trigger[1] the left
                        # header.
                        for header in (trigger[0], trigger[1]):
                            if 1 < len(header) < 8:
                                trigger_dict[header] = (
                                    trigger_dict.get(header, 0) + 1)

    guolv = {k: v for k, v in trigger_dict.items() if v > 9}
    paixu = sorted(guolv.items(), key=lambda d: d[1], reverse=True)
    print(paixu)
    print('---------------------------------------------------------')
    for header, count in paixu:
        print(header + '\t' + str(count))
def searchTable3(sample=1, enhance=1):
    """Append multi-label classification samples built from table cells.

    Every table cell receives the labels ``__label__dzdx``,
    ``__label__dzsl`` and/or ``__label__dzje`` when ``hasAtt`` finds the
    ``duixiang`` / ``shuliang`` / ``jine`` of some record in the cell but
    NOT in the cell's concatenated headers; a cell without any label gets
    ``__label__nothing``.  The sample text is the cell prefixed with its
    headers, tokenised with ``jieba_tokenize`` and joined with single
    spaces.  Labels are written in sorted order so that repeated runs
    produce identical files.

    Documents whose numeric id is a multiple of 6 are appended to
    ``dz_att_cls_table.dev``, all others to ``dz_att_cls_table.train``
    (relative to the current working directory).  All files are closed
    when the function returns or raises.

    :param sample: unused by this variant; kept for signature
        compatibility.
    :param enhance: unused by this variant; kept for signature
        compatibility.
    """
    rank = 6  # ids divisible by this go to the dev split
    with open('dz_att_cls_table.train', 'a+') as dz_train, \
            open('dz_att_cls_table.dev', 'a+') as dz_dev:
        dingzengs = getDingZengUnion(dz_trainpath)
        for doc_id, dzs in dingzengs.items():
            makefile = dz_dev if int(doc_id) % rank == 0 else dz_train

            # BeautifulSoup consumes the handle in its constructor, so the
            # file can be closed right after parsing.
            with open(dz_htmlpath + doc_id + '.html') as html:
                soup = BeautifulSoup(html, 'lxml')

            for table in soup.find_all('table'):  # iterate over all tables
                # table2array converts the table into 2-D arrays; each
                # "cut" is one regular row/column array.
                for cut in table2array(table):
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]
                            headers = topcell + leftcell

                            labels = set()
                            for dz in dzs:
                                if hasAtt(valuecell, dz.duixiang, 'DX', dz,
                                          False, None, None, False) \
                                        and not hasAtt(headers, dz.duixiang,
                                                       'DX', dz, False, None,
                                                       None, False):
                                    labels.add('__label__dzdx')
                                if hasAtt(valuecell, dz.shuliang, 'SL', dz,
                                          True, topcell, leftcell, False) \
                                        and not hasAtt(headers, dz.shuliang,
                                                       'SL', dz, True, topcell,
                                                       leftcell, False):
                                    labels.add('__label__dzsl')
                                # Fixed: the header guard used to test
                                # dz.shuliang / 'SL' here (copy-paste), so a
                                # JE value in the headers was never excluded.
                                if hasAtt(valuecell, dz.jine, 'JE', dz,
                                          True, topcell, leftcell, False) \
                                        and not hasAtt(headers, dz.jine,
                                                       'JE', dz, True, topcell,
                                                       leftcell, False):
                                    labels.add('__label__dzje')

                            if not labels:
                                labels.add('__label__nothing')

                            # Prefix the headers that are distinct from the
                            # cell itself.
                            if row != 0 and col != 0:
                                valuecell = topcell + leftcell + valuecell
                            elif row != 0:
                                valuecell = topcell + valuecell
                            elif col != 0:
                                valuecell = leftcell + valuecell

                            # sorted(): set iteration order is not stable
                            # across runs.
                            toline = (' '.join(sorted(labels)) + ' '
                                      + ' '.join(jieba_tokenize(valuecell))
                                      + '\n')
                            makefile.write(toline)

            for dz in dzs:
                dz.desc()
Exemple #9
0
def evaluate():
    """Run the tagging model over dev-split tables and print running match counts.

    Only documents whose numeric id is a multiple of 6 are evaluated.
    Every table cell is prefixed with its column/row headers; cells
    accepted by ``clsDuixiang`` are passed to ``model.evaluate_line`` and
    the returned entities are collected per document:

    * ``DX``: the word as is;
    * ``SL_unit1`` / ``JE_unit1``: the word as is;
    * ``SL_unit10k`` / ``JE_unit10k``: the word times 10000 (as a string),
      only when ``pattern_isnum`` matches it.

    The predicted sets are compared with the ground-truth sets (records
    with value ``'fddcUndefined'`` excluded) through ``p_r``; the
    cumulative totals and the per-document sets are printed after each
    document.
    """
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # NOTE(review): pickle.load can execute arbitrary code; the map file
    # must come from a trusted source.
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)

        dingzengs = getDingZengUnion(dz_trainpath)
        TDX = T_PDX = T_RDX = 0
        TSL = T_PSL = T_RSL = 0
        TJE = T_PJE = T_RJE = 0
        rank = 6  # ids divisible by this form the dev split
        for doc_id, dzs in dingzengs.items():
            if int(doc_id) % rank != 0:
                continue

            pre_dxs = set()
            pre_sls = set()
            pre_jes = set()

            # BeautifulSoup consumes the handle in its constructor, so the
            # file can be closed right after parsing.
            with open(dz_htmlpath + doc_id + '.html') as html:
                soup = BeautifulSoup(html, 'lxml')

            for table in soup.find_all('table'):  # iterate over all tables
                # table2array converts the table into 2-D arrays; each
                # "cut" is one regular row/column array.
                for cut in table2array(table):
                    cols = len(cut[0])
                    for row in range(len(cut)):
                        for col in range(cols):
                            valuecell = cut[row][col]
                            topcell = cut[0][col]
                            leftcell = cut[row][0]

                            # Prefix the headers that are distinct from the
                            # cell itself.
                            if row != 0 and col != 0:
                                valuecell = topcell + leftcell + valuecell
                            elif row != 0:
                                valuecell = topcell + valuecell
                            elif col != 0:
                                valuecell = leftcell + valuecell

                            if not clsDuixiang(valuecell):
                                continue

                            result = model.evaluate_line(
                                sess,
                                input_from_line(valuecell, char_to_id),
                                id_to_tag)
                            for ent in result.get('entities'):
                                ent_type = ent['type']
                                word = ent['word']
                                if ent_type == 'DX':
                                    pre_dxs.add(word)
                                elif ent_type == 'SL_unit10k':
                                    if pattern_isnum.match(word):
                                        pre_sls.add(
                                            str(Decimal(word) * 10000))
                                elif ent_type == 'SL_unit1':
                                    pre_sls.add(word)
                                elif ent_type == 'JE_unit10k':
                                    if pattern_isnum.match(word):
                                        pre_jes.add(
                                            str(Decimal(word) * 10000))
                                elif ent_type == 'JE_unit1':
                                    pre_jes.add(word)

            truthdxs = {
                dz.duixiang for dz in dzs if dz.duixiang != 'fddcUndefined'
            }
            truthsls = {
                dz.shuliang for dz in dzs if dz.shuliang != 'fddcUndefined'
            }
            truthjes = {dz.jine for dz in dzs if dz.jine != 'fddcUndefined'}
            tdx, total_pdx, total_rdx = p_r(pre_dxs, truthdxs)
            tsl, total_psl, total_rsl = p_r(pre_sls, truthsls)
            tje, total_pje, total_rje = p_r(pre_jes, truthjes)

            TDX += tdx
            T_PDX += total_pdx
            T_RDX += total_rdx

            TSL += tsl
            T_PSL += total_psl
            T_RSL += total_rsl

            TJE += tje
            T_PJE += total_pje
            T_RJE += total_rje

            print('TDX={},T_PDX={},T_RDX={}'.format(TDX, T_PDX, T_RDX))
            print('TSL={},T_PSL={},T_RSL={}'.format(TSL, T_PSL, T_RSL))
            print('TJE={},T_PJE={},T_RJE={}'.format(TJE, T_PJE, T_RJE))

            print(truthdxs)
            print(pre_dxs)
            print(truthsls)
            print(pre_sls)
            print(truthjes)
            print(pre_jes)
            print('---------------------------------------------------')
Exemple #10
0
def eval_dz():
    """Compare predicted records with the training records and log the scores.

    Records are loaded from ``dz_trainpath`` (reference) and
    ``dz_testpath`` (prediction), both grouped by document id.  For every
    reference document the ``pos*`` counters are updated from the
    reference records and the ``act*`` counters from the predicted
    records.  Records are then paired by the key produced by
    ``dz_withpk``; for each pair the ``cor*`` counters of shuliang, jine,
    suoding and rengou are updated through ``cor``.  Finally the number of
    predicted records whose document id is absent from the reference, the
    raw counters, the per-field ``f1`` results and the overall ``score``
    are logged.
    """
    dzlogger = get_logger(dzlog_path)
    trains = getDingZengUnion(dz_trainpath)
    tests = getDingZengUnion(dz_testpath)
    posid = posdx = possl = posje = possd = posrg = 0
    actid = actdx = actsl = actje = actsd = actrg = 0
    corid = cordx = corsl = corje = corsd = corrg = 0

    for doc_id in trains.keys():
        gold_items = trains.get(doc_id)
        pred_items = tests.get(doc_id)
        gold_by_pk = dz_withpk(gold_items)
        pred_by_pk = dz_withpk(pred_items)

        # Reference side: update the pos* counters and log every record.
        for rec in gold_items:
            posid = pos(rec.id, posid)
            posdx = pos(rec.duixiang, posdx)
            possl = pos(rec.shuliang, possl)
            posje = pos(rec.jine, posje)
            possd = pos(rec.suoding, possd)
            posrg = pos(rec.rengou, posrg)
            dzlogger.info('id={},dx={},sl={},je={},sd={},rg={}'.format(
                rec.id, rec.duixiang, rec.shuliang, rec.jine, rec.suoding,
                rec.rengou))
        dzlogger.info(
            '------------------------------以上为训练数据------------------------------'
        )

        same_count = (pred_items is not None
                      and len(gold_items) == len(pred_items))
        if same_count:
            dzlogger.info('id={},数量一致'.format(doc_id))
        else:
            dzlogger.info('id={},数量不一致'.format(doc_id))

        # Prediction side: update the act* counters and log every record.
        if pred_items is None:
            dzlogger.info('id={},未能识别'.format(doc_id))
        else:
            for rec in pred_items:
                actid = act(rec.id, actid)
                actdx = act(rec.duixiang, actdx)
                actsl = act(rec.shuliang, actsl)
                actje = act(rec.jine, actje)
                actsd = act(rec.suoding, actsd)
                actrg = act(rec.rengou, actrg)
                dzlogger.info('id={},dx={},sl={},je={},sd={},rg={}'.format(
                    rec.id, rec.duixiang, rec.shuliang, rec.jine,
                    rec.suoding, rec.rengou))

        # Pair records by primary key; only paired records feed cor*.
        for pk in gold_by_pk.keys():
            gold = gold_by_pk.get(pk)
            pred = pred_by_pk.get(pk)
            if pred is None:
                dzlogger.info('pk={},主键未能识别'.format(pk))
                continue
            corsl, _ = cor(gold.shuliang, pred.shuliang, corsl)
            corje, _ = cor(gold.jine, pred.jine, corje)
            corsd, _ = cor(gold.suoding, pred.suoding, corsd)
            corrg, _ = cor(gold.rengou, pred.rengou, corrg)
        dzlogger.info(
            '------------------------------以上为预测数据------------------------------\n\n\n'
        )

    # Predicted documents whose id does not exist in the reference data.
    disaccord = 0
    for doc_id in tests.keys():
        if trains.get(doc_id) is None:
            disaccord += len(tests.get(doc_id))
            dzlogger.info('id={},ID识别不一致'.format(doc_id))

    dzlogger.info(
        'posid={}, posdx={}, possl={}, posje={}, possd={}, posrg={}'.format(
            posid, posdx, possl, posje, possd, posrg))
    dzlogger.info(
        'actid={}, actdx={}, actsl={}, actje={}, actsd={}, actrg={}'.format(
            actid, actdx, actsl, actje, actsd, actrg))
    dzlogger.info(
        'corid={}, cordx={}, corsl={}, corje={}, corsd={}, corrg={}'.format(
            corid, cordx, corsl, corje, corsd, corrg))
    dzlogger.info('disaccord={}'.format(disaccord))

    # All four f1 results are computed before any of them is logged.
    sl_p, sl_r, sl_f = f1(corsl, possl, actsl)
    je_p, je_r, je_f = f1(corje, posje, actje)
    sd_p, sd_r, sd_f = f1(corsd, possd, actsd)
    rg_p, rg_r, rg_f = f1(corrg, posrg, actrg)
    dzlogger.info('SL: p={},r={},f1={}'.format(sl_p, sl_r, sl_f))
    dzlogger.info('JE: p={},r={},f1={}'.format(je_p, je_r, je_f))
    dzlogger.info('SD: p={},r={},f1={}'.format(sd_p, sd_r, sd_f))
    dzlogger.info('RG: p={},r={},f1={}'.format(rg_p, rg_r, rg_f))
    dzlogger.info('score={}'.format(score([sl_f, je_f, sd_f, rg_f])))