Esempio n. 1
0
def pred_input(str_input, path_hyper_parameter=path_hyper_parameters):
    """Run the saved model on one input string and print the outcome.

    Args:
        str_input: the text to predict on.
        path_hyper_parameter: path of the JSON file holding the
            hyper-parameters used to build the graph.

    Prints, in order: the raw prediction, ``str_input``, the first element
    of the ``prereocess_idx`` result, and the entries of that element whose
    second item is at least 0.73. Returns nothing.
    """
    # Load the hyper-parameters.
    config = load_json(path_hyper_parameter)
    preprocessor = PreprocessTextMulti()
    # Build the graph and load the saved model.
    model = Graph(config)
    model.load_model()
    embedder = model.word_embedding
    # str to token
    encoded = embedder.sentence2idx(str_input)
    if config['embedding_type'] != 'bert':
        model_input = encoded
    else:
        # For bert the encoding has two parts; each is wrapped in an array
        # with one extra leading axis before being passed to the model.
        first_part = np.array([encoded[0]])
        second_part = np.array([encoded[1]])
        model_input = [first_part, second_part]
    # Predict.
    prediction = model.predict(model_input)
    print(prediction)
    # Map the prediction back through the preprocessor
    # (presumably ids -> (label, score) pairs; verify against prereocess_idx).
    decoded = preprocessor.prereocess_idx(prediction[0])
    selected = [item for item in decoded[0] if item[1] >= 0.73]
    print(str_input)
    print(decoded[0])
    print(selected)
Esempio n. 2
0
def train(hyper_parameters=None, rate=1.0):
    """Build the graph, preprocess train/validation data and fit the model.

    Args:
        hyper_parameters: configuration dict. When falsy, the default
            configuration defined below is used.
        rate: forwarded as ``rate`` to the preprocessing of both the
            training and the validation data.

    Deletes whatever ``delete_file(path_model_dir)`` removes before training
    and prints progress messages plus the elapsed wall-clock time.
    Returns nothing.
    """
    if not hyper_parameters:
        model_config = {
            'label': 19,  # number of classes
            # batch size; in principle larger seems better, especially with
            # imbalanced samples, where it has a fairly large effect
            'batch_size': 100,
            'dropout': 0.5,  # dropout probability
            'decay_step': 100,  # learning-rate decay step: decay once every N steps
            'decay_rate': 0.9,  # learning-rate decay factor (multiplicative)
            'epochs': 50,  # maximum number of training epochs
            'patience': 3,  # early stopping; 2-3 is fine
            # learning rate; 5e-5 for bert, 1e-3 otherwise; it affects training
            # a lot, consider tuning it if accuracy does not improve
            'lr': 1e-3,
            'l2': 1e-9,  # l2 regularization
            'activate_classify': 'sigmoid',  # activation of the last (classification) layer
            'loss': 'binary_crossentropy',  # loss function; may be problematic, can be customised
            'metrics': 'accuracy',  # criterion for keeping the better model
            'is_training': True,  # training or testing the model
            # model path; saved when the loss decreases,
            # save_best_only=True, save_weights_only=True
            'model_path': path_model,
            # path of the hyper-parameters (model, including embedding)
            'path_hyper_parameters': path_hyper_parameters,
            # where the trainable embedding is saved, e.g. char, word or bert vectors
            'path_fineture': path_fineture,
        }
        embedding_config = {
            'layer_indexes': [12],  # bert layers to take
            # 'corpus_path' may be set to the pre-trained embedding data;
            # when absent the default location from conf is used.
        }
        data_config = {
            'train_data': "./data/train.csv",  # training data
            'val_data': "./data/val.csv",  # validation data
            'test_data': "./data/test.csv",
        }
        hyper_parameters = {
            # max sentence length; 20-50 recommended; with bert longer is slower
            # and uses more memory; 20 is fine on a local win10 4G machine;
            # beware of OOM when too large
            'len_max': 150,
            # char/word vector dimension; 768 for bert, 300 for word,
            # char can be smaller
            'embed_size': 200,
            'vocab_size': 21128,  # placeholder, adjusted by the code
            # whether the embedding is static or dynamic, i.e. can be fine-tuned
            'trainable': True,
            # granularity (smallest unit): 'char' or 'word'; note that in
            # word2vec mode the training corpus must be segmented beforehand
            'level_type': 'char',
            # embedding type; may also be 'xlnet', 'random', 'bert', 'albert'
            # or 'word2vec'
            'embedding_type': 'random',
            'gpu_memory_fraction': 0.66,  # GPU memory fraction
            'ifChangeOutput': True,
            'model': model_config,
            'embedding': embedding_config,
            'data': data_config,
        }

    # Remove previously saved models, fine-tuned embeddings, etc.
    delete_file(path_model_dir)
    started_at = time.time()

    # Graph initialisation.
    model = Graph(hyper_parameters)
    print("graph init ok!")
    embedder = model.word_embedding

    # Data preprocessing.
    preprocessor = PreprocessTextMulti()
    print(embedder, rate)

    def _encode_split(split_key):
        # Only the first two of the four returned values are needed here.
        features, targets, _, _ = preprocessor.preprocess_label_ques_to_idx(
            hyper_parameters['embedding_type'],
            hyper_parameters['data'][split_key],
            embedder, rate=rate, shuffle=True)
        return features, targets

    x_train, y_train = _encode_split('train_data')
    print('train data progress ok!')
    x_val, y_val = _encode_split('val_data')
    print("data progress ok!")
    print(len(y_train))

    # Training.
    model.fit(x_train, y_train, x_val, y_val)
    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")
Esempio n. 3
0
def pred_input(path_hyper_parameter=path_hyper_parameters):
    """Predict on a fixed sample sentence, then on sentences read from stdin.

    Args:
        path_hyper_parameter: path of the JSON file holding the
            hyper-parameters used to build the graph.

    After the sample sentence, the function loops forever: it prompts,
    reads a line with ``input()``, prints its encoding and then the
    prediction results. For every prediction it prints the first element of
    the ``prereocess_idx`` result and the entries of it whose second item is
    at least 0.5. The loop has no exit condition of its own.
    """
    # Load the hyper-parameters.
    config = load_json(path_hyper_parameter)
    preprocessor = PreprocessTextMulti(path_model_dir)
    # Build the graph and load the saved model.
    model = Graph(config)
    model.load_model()
    embedder = model.word_embedding

    def _as_model_input(encoded):
        # bert/albert encodings have two parts; each is wrapped in an array
        # with one extra leading axis. Other encodings are passed unchanged.
        if config['embedding_type'] in ['bert', 'albert']:
            first_part = np.array([encoded[0]])
            second_part = np.array([encoded[1]])
            return [first_part, second_part]
        return encoded

    def _report(prediction):
        # Map the prediction back through the preprocessor and print both
        # the full result and the entries scoring at least 0.5.
        decoded = preprocessor.prereocess_idx(prediction[0])
        selected = [item for item in decoded[0] if item[1] >= 0.5]
        print(decoded[0])
        print(selected)

    # One-off prediction on a built-in sample sentence.
    sample = '我要打王者荣耀'
    sample_prediction = model.predict(_as_model_input(embedder.sentence2idx(sample)))
    print(sample_prediction)
    _report(sample_prediction)

    # Interactive loop.
    while True:
        print("请输入: ")
        encoded = embedder.sentence2idx(input())
        print(encoded)
        _report(model.predict(_as_model_input(encoded)))
Esempio n. 4
0
def evaluate(path_hyper_parameter=path_hyper_parameters, rate=1.0):
    """Evaluate the saved model on the validation data and log the scores.

    Args:
        path_hyper_parameter: path of the JSON file holding the
            hyper-parameters used to build the graph.
        rate: forwarded as ``rate`` to the validation-data preprocessing.

    For every sample the model prediction is reduced to a label selection
    (see the loop below), which is compared with the comma-separated ground
    truth through ``cal_acc`` and ``cal_iou``. Misclassified samples are
    printed. Finally the overall accuracy, the accuracy restricted to
    samples whose truth is not "无", and the IoU ratio are printed and the
    two accuracies plus the threshold are passed to ``append_log``.

    Ratios whose denominator is zero (e.g. an empty data set) are reported
    as 0.0 instead of raising ZeroDivisionError.
    """
    # Load the hyper-parameters.
    hyper_parameters = load_json(path_hyper_parameter)
    pt = PreprocessTextMulti()
    # Build the graph and load the saved model.
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    # get validation data
    ques_list, _, que, val = pt.preprocess_label_ques_to_idx(
        hyper_parameters['embedding_type'],
        hyper_parameters['data']['val_data'],
        ra_ed,
        rate=rate,
        shuffle=True)
    print(len(ques_list))
    print("que:", len(que))

    # Bound before the loop so it is still defined for append_log when the
    # data set is empty.
    threshold = 0.44
    # bert and albert encodings both consist of two parts that must each be
    # wrapped in an array with one extra leading axis.
    two_part_input = hyper_parameters['embedding_type'] in ('bert', 'albert')

    count = 0
    acc_count = 0
    not_none_count = 0
    not_none_acc_count = 0
    sum_iou = 0
    sum_all_iou = 0
    # que and val are indexed rather than iterated directly so element
    # access stays exactly as the containers define it.
    for index, _ in enumerate(que):
        # str to token
        ques_embed = ra_ed.sentence2idx(que[index])
        if two_part_input:
            x_val = [np.array([ques_embed[0]]), np.array([ques_embed[1]])]
        else:
            x_val = ques_embed
        # Predict, then map the prediction back through the preprocessor.
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])

        # Selection rule: the first candidate and every candidate scoring
        # above `threshold` are kept, and `top_threshold` remembers the
        # score of the latest such candidate. Any other candidate is kept
        # when its score differs from `top_threshold` by less than a quarter
        # of `top_threshold`.
        # NOTE(review): presumably pre[0] is sorted by descending score --
        # confirm against prereocess_idx.
        ls_nulti = []
        top_threshold = 0
        for i, ls in enumerate(pre[0]):
            if i == 0 or ls[1] > threshold:
                ls_nulti.append(ls)
                top_threshold = ls[1]
            elif abs(ls[1] - top_threshold) < top_threshold / 4.0:
                ls_nulti.append(ls)

        res = cal_acc(ls_nulti, val[index].split(","))
        res_iou, res_all_iou = cal_iou(ls_nulti, val[index].split(","))
        sum_iou += res_iou
        sum_all_iou += res_all_iou
        is_not_none = val[index] != "无"
        if res:
            if is_not_none:
                not_none_acc_count += 1
            acc_count += 1
        else:
            # Show every miss for manual inspection.
            print("原句 ", index, que[index])
            print("真实分类 ", index, val[index])
            print("pre ", pre)
            print("iou ", res_iou)
        count += 1
        if is_not_none:
            not_none_count += 1

    # Guard every ratio against an empty denominator.
    acc = acc_count / count if count else 0.0
    not_none_acc = (not_none_acc_count / not_none_count
                    if not_none_count else 0.0)
    average_iou = sum_iou / sum_all_iou if sum_all_iou else 0.0
    print("acc: ", acc)
    print("not none acc: ", not_none_acc)
    print("average iou: ", average_iou)
    # log
    append_log(hyper_parameters, acc, not_none_acc, threshold)
Esempio n. 5
0
def _confusion_cell(predicted, actual):
    """Return the confusion-table key for one (predicted, actual) outcome."""
    if predicted:
        return 'TP' if actual else 'FP'
    return 'FN' if actual else 'TN'


def evaluate(path_hyper_parameter=path_hyper_parameters, rate=1.0):
    """Evaluate the saved model on the test data with a per-label confusion table.

    Args:
        path_hyper_parameter: path of the JSON file holding the
            hyper-parameters used to build the graph.
        rate: forwarded as ``rate`` to the test-data preprocessing.

    For every sample the prediction is reduced to a label selection (see the
    loop below), the per-label TP/FP/TN/FN counters obtained from
    ``initConfusion()`` are updated, and the selection is compared with the
    comma-separated ground truth through ``cal_acc`` and ``cal_iou``.
    Misclassified samples are printed. Finally the accuracy, the mean IoU
    and a recall/precision table are printed and the scores are passed to
    ``append_log``.

    Ratios whose denominator is zero (e.g. an empty data set) are reported
    as 0.0 instead of raising ZeroDivisionError.
    """
    # Load the hyper-parameters.
    hyper_parameters = load_json(path_hyper_parameter)
    pt = PreprocessTextMulti()
    # Build the graph and load the saved model.
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding

    # init confusion table
    dict_all = initConfusion()

    # get test data
    ques_list, _, que, val = pt.preprocess_label_ques_to_idx(
        hyper_parameters['embedding_type'],
        hyper_parameters['data']['test_data'],
        ra_ed,
        rate=rate,
        shuffle=True)
    print(len(ques_list))
    print("que:", len(que))

    # Bound before the loop so it is still defined for append_log when the
    # data set is empty.
    threshold = 0.65
    scope_labels = ('多发', '散发', '无')
    # bert and albert encodings both consist of two parts that must each be
    # wrapped in an array with one extra leading axis.
    two_part_input = hyper_parameters['embedding_type'] in ('bert', 'albert')

    count = 0
    acc_count = 0
    not_none_count = 0
    # NOTE(review): never incremented in this version, so the "not none"
    # accuracy handed to append_log is always 0 -- confirm this is intended.
    not_none_acc_count = 0
    sum_iou = 0
    # que and val are indexed rather than iterated directly so element
    # access stays exactly as the containers define it.
    for index, _ in enumerate(que):
        truth_labels = val[index].split(",")
        # str to token
        ques_embed = ra_ed.sentence2idx(que[index])
        if two_part_input:
            x_val = [np.array([ques_embed[0]]), np.array([ques_embed[1]])]
        else:
            x_val = ques_embed
        # Predict, then map the prediction back through the preprocessor.
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])

        # Selection rule:
        #   * scope labels: only the first one encountered is selected;
        #   * other labels: selected when scoring above `threshold`, and the
        #     first one encountered is always selected.
        # NOTE(review): presumably pre[0] is sorted by descending score --
        # confirm against prereocess_idx.
        ls_nulti = []
        has_scope = False
        has_dense = False
        for ls in pre[0]:
            label = ls[0]
            if label in scope_labels:
                predicted = not has_scope
                has_scope = True
            else:
                predicted = ls[1] > threshold or not has_dense
                if predicted:
                    has_dense = True
            if predicted:
                ls_nulti.append(ls)
            # A selected label missing from the truth is a false positive and
            # an unselected label present in the truth is a false negative,
            # for scope labels and other labels alike.
            dict_all[label][_confusion_cell(predicted,
                                            label in truth_labels)] += 1

        res = cal_acc(ls_nulti, val[index].split(","))
        res_iou = cal_iou(ls_nulti, val[index].split(","))
        sum_iou += res_iou
        if res:
            acc_count += 1
        else:
            # Show every miss for manual inspection.
            print("原句 ", index, que[index])
            print("真实分类 ", index, val[index])
            print("pre ", pre)
            print("iou ", res_iou)
            print(ls_nulti)
        count += 1
        if val[index] != "无":
            not_none_count += 1

    # Guard every ratio against an empty denominator.
    acc = acc_count / count if count else 0.0
    average_iou = sum_iou / count if count else 0.0
    not_none_acc = (not_none_acc_count / not_none_count
                    if not_none_count else 0.0)
    print("acc: ", acc)
    print("average iou: ", average_iou)

    # Imported under its own name so the `pt` preprocessor is not rebound.
    import prettytable
    tb = prettytable.PrettyTable()
    tb.field_names = [" ", "Recall", "Precision", "TP", "FP", "TN", "FN"]
    for item, cells in dict_all.items():
        actual_positive = cells['TP'] + cells['FN']
        predicted_positive = cells['TP'] + cells['FP']
        # A label with no positives is reported as a perfect score of 1.
        recall = cells['TP'] / actual_positive if actual_positive else 1
        precision = (cells['TP'] / predicted_positive
                     if predicted_positive else 1)
        tb.add_row([
            item, recall, precision, cells['TP'],
            cells['FP'], cells['TN'], cells['FN']
        ])
    print(tb)
    # log
    append_log(hyper_parameters, acc, not_none_acc, threshold)