def pred_tet(path_hyper_parameter=path_hyper_parameters,
             path_test=None,
             rate=1.0):
    """
        Evaluate the trained model on a test/validation set.
    :param path_hyper_parameter: str, path of the json hyper-parameter file
    :param path_test: str, path of test data; overrides the configured val_data
    :param rate: float, fraction of the corpus to evaluate on (0 < rate <= 1)
    :return: None
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # externally supplied test-data path
        hyper_parameters['data']['val_data'] = path_test
    time_start = time.time()
    # graph init and weight restore
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessText()
    y, x = read_and_process(hyper_parameters['data']['val_data'])
    # keep only `rate` of the corpus for evaluation
    len_rate = int(len(y) * rate)
    # BUGFIX: slice from 0 — the original [1:len_rate] silently dropped the
    # first sample (the sibling pred_tet below slices [0:len_rate]).
    x = x[0:len_rate]
    y = y[0:len_rate]
    y_pred = []
    count = 0
    for x_one in x:
        count += 1
        ques_embed = ra_ed.sentence2idx(x_one)
        if hyper_parameters['embedding_type'] == 'bert':  # bert token inputs
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
        else:
            x_val = ques_embed
        # predict
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])
        label_pred = pre[0][0][0]
        if count % 1000 == 0:  # periodic progress output
            print(label_pred)
        y_pred.append(label_pred)

    print("data pred ok!")
    # map string labels to int indices
    index_y = [pt.l2i_i2l['l2i'][i] for i in y]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [
        pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))
    ]
    # evaluation report
    report_predict = classification_report(index_y,
                                           index_pred,
                                           target_names=target_names,
                                           digits=9)
    print(report_predict)
    print("耗时:" + str(time.time() - time_start))
def pred_input():
    """Interactive prediction loop over user-typed questions (bert tokens)."""
    hyper_parameters = {
        'model': {
            'label': 17,
            'batch_size': 64,
            'embed_size': 30,
            'filters': [2, 3, 4],
            'filters_num': 30,
            'channel_size': 1,
            'dropout': 0.5,
            'decay_step': 100,
            'decay_rate': 0.9,
            'epochs': 20,
            'len_max': 50,
            'vocab_size': 20000,  # placeholder; overwritten by the code
            'lr': 1e-3,
            'l2': 1e-6,
            'activate_classify': 'softmax',
            'embedding_type': 'bert',  # alternatives: 'random', 'word2vec'
            'is_training': True,
            'model_path': path_model_fast_text_baiduqa_2019,  # configurable
            'rnn_type': 'GRU',
            # rnn cell: 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
            'rnn_units': 256,  # large 650, small is 300
        },
        'embedding': {
            'embedding_type': 'bert',
            'corpus_path': path_embedding_bert,
            'level_type': 'char',
            'embed_size': 30,
            'len_max': 50,
            'layer_indexes': [12]  # range 1 to 12
        },
    }

    processor = PreprocessText()
    net = Graph(hyper_parameters)
    net.load_model()
    embedder = net.word_embedding
    # warm-up prediction on a fixed sample question
    sample = '我要打王者荣耀'
    sample_embed = embedder.sentence2idx(sample)
    warmup_inputs = [np.array([sample_embed[0]]), np.array([sample_embed[1]])]
    print(processor.prereocess_idx(net.predict(warmup_inputs)[0]))
    while True:
        print("请输入: ")
        query_embed = embedder.sentence2idx(input())
        print(query_embed)
        token_ids = np.array([query_embed[0]])
        segment_ids = np.array([query_embed[1]])
        prediction = net.predict([token_ids, segment_ids])
        print(processor.prereocess_idx(prediction[0]))
def pred_input(path_hyper_parameter=path_hyper_parameters):
    """
       Interactive sentence-pair prediction.
    :param path_hyper_parameter: str, path of the stored hyper-parameters
    :return: None
    """
    # load hyper-parameters saved at training time
    hyper_parameters = load_json(path_hyper_parameter)
    processor = PreprocessSim(path_model_dir)
    # build the graph and restore trained weights
    net = Graph(hyper_parameters)
    net.load_model()
    embedder = net.word_embedding
    sen1 = '我要打王者荣耀'
    sen2 = '我要打梦幻西游'

    # sentence pair -> token ids
    pair_embed = embedder.sentence2idx(text=sen1, second_text=sen2)
    # guard clause: only bert/albert token inputs are supported
    if hyper_parameters['embedding_type'] not in ['bert', 'albert']:
        print("error, just support bert or albert")
        return
    token_ids = np.array([pair_embed[0]])
    segment_ids = np.array([pair_embed[1]])
    # warm-up predict, then map ids back to labels
    prediction = net.predict([token_ids, segment_ids])
    print(processor.prereocess_idx(prediction[0]))
    while True:
        print("请输入sen1: ")
        sen1 = input()
        print("请输入sen2: ")
        sen2 = input()

        pair_embed = embedder.sentence2idx(text=sen1, second_text=sen2)
        print(pair_embed)
        if hyper_parameters['embedding_type'] in ['bert', 'albert']:
            token_ids = np.array([pair_embed[0]])
            segment_ids = np.array([pair_embed[1]])
            prediction = net.predict([token_ids, segment_ids])
            print(processor.prereocess_idx(prediction[0]))
        else:
            print("error, just support bert or albert")
def pred_input(path_hyper_parameter=path_hyper_parameters):
    """
       Interactive single-sentence prediction.
    :param path_hyper_parameter: str, path of the stored hyper-parameters
    :return: None
    """
    # load hyper-parameters saved at training time
    hyper_parameters = load_json(path_hyper_parameter)
    processor = PreprocessText()
    # build the graph and restore trained weights
    net = Graph(hyper_parameters)
    net.load_model()
    embedder = net.word_embedding

    def _to_inputs(embed):
        # token ids -> model inputs; bert needs [token_ids, segment_ids]
        if hyper_parameters['embedding_type'] == 'bert':
            return [np.array([embed[0]]), np.array([embed[1]])]
        return embed

    # warm-up prediction on a fixed sample question
    ques = '我要打王者荣耀'
    pred = net.predict(_to_inputs(embedder.sentence2idx(ques)))
    # map predicted ids back to labels
    print(processor.prereocess_idx(pred[0]))
    while True:
        print("请输入: ")
        ques = input()
        ques_embed = embedder.sentence2idx(ques)
        print(ques_embed)
        pred = net.predict(_to_inputs(ques_embed))
        print(processor.prereocess_idx(pred[0]))
def train(hyper_parameters=None, rate=1.0):
    """
        Train the sentence-similarity model.
    :param hyper_parameters: json/dict, hyper-parameters; defaults used if empty
    :param rate: float, fraction of the corpus used for training
    :return: None
    """
    # fall back to the built-in configuration when none is given
    hyper_parameters = hyper_parameters or {
        'len_max': 32,  # max sentence length, fixed; 20-50 recommended
        'embed_size': 768,  # char/word vector dimension
        'vocab_size': 20000,  # placeholder; overwritten by the code
        'trainable': True,  # static vs dynamic embedding, i.e. fine-tunable
        'level_type': 'char',  # smallest unit: 'char' or 'word'
        'embedding_type':
        'bert',  # also 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
        'gpu_memory_fraction': 0.76,  # gpu usage fraction
        'model': {
            'label': 2,  # number of classes
            'batch_size':
            2,  # batch size; larger tends to help, esp. with class imbalance
            'filters': [2, 3, 4, 5],  # convolution kernel sizes
            'filters_num': 300,  # number of filters, text-cnn: 300-600
            'channel_size': 1,  # CNN channel count
            'dropout': 0.5,  # dropout probability
            'decay_step': 100,  # lr decays every N steps
            'decay_rate': 0.9,  # multiplicative lr decay factor
            'epochs': 20,  # max training epochs
            'patience': 3,  # early stopping, 2-3 is fine
            'lr': 5e-5,  # learning rate; tune this first if accuracy stalls
            'l2': 1e-9,  # l2 regularization
            'activate_classify': 'sigmoid',  # final-layer activation
            'loss': 'binary_crossentropy',  # loss function
            'metrics': 'accuracy',  # criterion for keeping the best model
            'is_training': True,  # training vs inference mode
            'model_path': path_model,
            # model path; saved when loss improves, save_best_only=True, save_weights_only=True
            'path_hyper_parameters':
            path_hyper_parameters,  # hyper-parameter (incl. embedding) path
            'path_fineture':
            path_fineture,  # path for trainable embeddings (char/word/bert vectors)
        },
        'embedding': {
            'layer_indexes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                              13],  # bert layers to take, incl. embedding layer
            # 'corpus_path': '',     # pretrained embedding path; defaults from conf. keras-bert loads google bert, baidu ernie (needs conversion, https://github.com/ArthurRizar/tensorflow_ernie), HIT bert-wwm (tf, https://github.com/ymcui/Chinese-BERT-wwm)
        },
        'data': {
            'train_data': path_sim_webank_train,  # training data
            'val_data': path_sim_webank_valid  # validation data
        },
    }

    # drop any previously saved model / fine-tuned embedding
    delete_file(path_model_dir)
    started = time.time()
    # graph init
    net = Graph(hyper_parameters)
    print("graph init ok!")
    embedder = net.word_embedding
    # data preprocessing
    processor = PreprocessSim()
    x_train, y_train = processor.preprocess_label_ques_to_idx(
        hyper_parameters['embedding_type'],
        hyper_parameters['data']['train_data'],
        embedder,
        rate=rate,
        shuffle=True)
    x_val, y_val = processor.preprocess_label_ques_to_idx(
        hyper_parameters['embedding_type'],
        hyper_parameters['data']['val_data'],
        embedder,
        rate=rate,
        shuffle=True)
    print("data propress ok!")
    print(len(y_train))
    # fit the model
    net.fit(x_train, y_train, x_val, y_val)
    print("耗时:" + str(time.time() - started))
def pred_tet(path_hyper_parameter=path_hyper_parameters,
             path_test=None,
             rate=1.0):
    """
        Evaluate the similarity model on a test set.
    :param path_hyper_parameter: str, path of the json hyper-parameter file
    :param path_test: str, path of test data; overrides the configured test_data
    :param rate: float, fraction of the corpus to evaluate on (0 < rate <= 1)
    :return: None
    :raises ValueError: if the configured embedding type is not bert/albert
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # externally supplied test-data path
        hyper_parameters['data']['test_data'] = path_test
    # BUGFIX: validate up-front. The original checked inside the loop and, for
    # unsupported embedding types, silently produced an empty y_pred which
    # later crashed classification_report with mismatched lengths.
    if hyper_parameters['embedding_type'] not in ['bert', 'albert']:
        raise ValueError("error, just support bert or albert")
    time_start = time.time()
    # graph init and weight restore
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessSim(path_model_dir)

    data = pd.read_csv(hyper_parameters['data']['test_data'])
    sentence_1 = data["sentence1"].values.tolist()
    sentence_2 = data["sentence2"].values.tolist()
    labels = data["label"].values.tolist()
    sentence_1 = [extract_chinese(str(line1).upper()) for line1 in sentence_1]
    sentence_2 = [extract_chinese(str(line2).upper()) for line2 in sentence_2]
    # NOTE(review): extract_chinese is also applied to labels — confirm it
    # preserves the label tokens used by the l2i lookup below.
    labels = [extract_chinese(str(line3).upper()) for line3 in labels]

    # keep only `rate` of the corpus for evaluation
    len_rate = int(len(labels) * rate)
    sentence_1 = sentence_1[0:len_rate]
    sentence_2 = sentence_2[0:len_rate]
    labels = labels[0:len_rate]
    y_pred = []
    for count, (sen1, sen2) in enumerate(zip(sentence_1, sentence_2), 1):
        ques_embed = ra_ed.sentence2idx(text=sen1, second_text=sen2)
        # bert/albert token inputs: [token_ids, segment_ids]
        x_val_1 = np.array([ques_embed[0]])
        x_val_2 = np.array([ques_embed[1]])
        x_val = [x_val_1, x_val_2]
        # predict
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])
        label_pred = pre[0][0][0]
        if count % 1000 == 0:  # periodic progress output
            print(label_pred)
        y_pred.append(label_pred)

    print("data pred ok!")
    # map string labels to int indices
    index_y = [pt.l2i_i2l['l2i'][i] for i in labels]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [
        pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))
    ]
    # evaluation report
    report_predict = classification_report(index_y,
                                           index_pred,
                                           target_names=target_names,
                                           digits=9)
    print(report_predict)
    print("耗时:" + str(time.time() - time_start))
            # NOTE(review): orphaned fragment — the enclosing def and the
            # opening of this hyper-parameter dict are missing from this file
            # (file appears to be a broken concatenation). Confirm against the
            # module this chunk was copied from before relying on it.
            'rnn_units': 256,  # large 650, small is 300
        },
        'embedding': {
            'embedding_type': 'bert',
            'corpus_path': path_embedding_bert,
            'level_type': 'char',
            'embed_size': 30,
            'len_max': 50,
            'layer_indexes': [12]  # range 1 to 12
        },
    }

    import time
    time_start = time.time()

    # graph init
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    ra_ed = graph.word_embedding
    pt = PreprocessText()
    # preprocess 1% of the corpus for a quick run
    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(
        path_baidu_qa_2019_train, ra_ed, rate=0.01)
    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid,
                                                   ra_ed,
                                                   rate=0.01)
    # shuffle training samples in unison via an index permutation
    indexs = [ids for ids in range(len(y_train))]
    random.shuffle(indexs)
    # NOTE(review): fancy-indexing with a list requires numpy arrays here —
    # presumably x_train/y_train are ndarrays; confirm upstream return types.
    x_train, y_train = x_train[indexs], y_train[indexs]
    print("data propress ok 1 !")
    print(len(y_train))
    x_train = x_train.tolist()
    x_val = x_val.tolist()
def pred_tet():
    """Evaluate top-1 accuracy on the full baidu_qa_2019 validation set."""
    from keras_textclassification.conf.path_config import path_baidu_qa_2019_valid
    hyper_parameters = {
        'model': {
            'label': 17,
            'batch_size': 64,
            'embed_size': 30,
            'filters': [2, 3, 4],
            'filters_num': 30,
            'channel_size': 1,
            'dropout': 0.5,
            'decay_step': 100,
            'decay_rate': 0.9,
            'epochs': 20,
            'len_max': 50,
            'vocab_size': 20000,  # placeholder; overwritten by the code
            'lr': 1e-3,
            'l2': 1e-6,
            'activate_classify': 'softmax',
            'embedding_type': 'bert',  # alternatives: 'random', 'word2vec'
            'is_training': True,
            'model_path': path_model_fast_text_baiduqa_2019,  # configurable
            'rnn_type': 'GRU',
            # rnn cell: 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
            'rnn_units': 256,  # large 650, small is 300
        },
        'embedding': {
            'embedding_type': 'bert',
            'corpus_path': path_embedding_bert,
            'level_type': 'char',
            'embed_size': 30,
            'len_max': 50,
            'layer_indexes': [12]  # range 1 to 12
        },
    }
    pt = PreprocessText()
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid,
                                                   ra_ed,
                                                   rate=1)
    x_val = x_val.tolist()
    y_val = y_val.tolist()
    y_pred = []
    for x_val_one in x_val:
        print(x_val_one)
        # bert inputs: [token_ids, segment_ids]
        ques_embed_1 = np.array([x_val_one[0]])
        ques_embed_2 = np.array([x_val_one[1]])
        pred = graph.predict([ques_embed_1, ques_embed_2])
        print(pred)
        pred_top1 = pt.prereocess_pred_id(pred[0])
        print(pred_top1)
        y_pred.append(pred_top1)

    # tally correct predictions (the original kept an unused `count` variable
    # and a manual accumulator loop; both replaced with an idiomatic sum)
    acc = sum(1 for i in range(len(y_val)) if y_val[i] == y_pred[i])
    print('true: {}  total: {}  acc: {}'.format(acc, len(y_val),
                                                acc / len(y_val)))