def read_data(head_path, vocab=1, label_to_ix=2, max_length=0):
    """Load the raw texts and their labels located under ``head_path``.

    The label/text paths are resolved with ``NER_pre_data.concat_path`` and
    then read with ``NER_pre_data.load_data``.

    :param head_path: location handed to ``NER_pre_data.concat_path``.
    :param vocab: not used by this function; kept so existing callers work.
    :param label_to_ix: not used by this function.
    :param max_length: not used by this function.
    :return: tuple ``(txts, labels)`` as produced by ``NER_pre_data.load_data``.
    """
    label_path, txt_paths = NER_pre_data.concat_path(head_path)
    texts, tags = NER_pre_data.load_data(label_path, txt_paths)
    return texts, tags
def prediction(path, mode="bert_bilstm", is_eval=False):
    """Load the model selected by ``mode`` and run inference on ``path``.

    :param path: input location forwarded to ``NERInference.NERInference``.
    :param mode: one of ``"lstm"``, ``"bilstm"``, ``"bert_bilstm"`` or
        ``"rnn"``; any other value selects the word2vec + BiLSTM model.
    :param is_eval: flag forwarded unchanged to ``predict_all``.
    :return: the first element of the pair returned by ``predict_all``.
    """
    labels_to_ix, ix_to_label = NER_pre_data.build_label(normal_param.labels)
    vocab = process_data_for_keras.read_vocab(normal_param.lstm_vocab)
    num_labels = len(labels_to_ix)

    if mode == "lstm":
        weights_path = normal_param.save_path_lstm
        model = keras_LSTM_CRF.load_embedding_bilstm2_crf_model(
            weights_path, len(vocab), num_labels, normal_param.max_length)
    elif mode == "bilstm":
        weights_path = normal_param.save_path_bilstm
        model = keras_BILSTM_CEF.load_embedding_bilstm2_crf_model(
            weights_path, len(vocab), num_labels, normal_param.max_length)
    elif mode == "bert_bilstm":
        weights_path = normal_param.save_path_bert_bilstm
        model = keras_Bert_bilstm_crf.load_embedding_bilstm2_crf_model(
            weights_path, num_labels)
    elif mode == "rnn":
        weights_path = normal_param.save_path_gru
        # NOTE(review): the last argument is a literal 0 here, whereas the
        # "lstm"/"bilstm" branches pass normal_param.max_length -- confirm
        # this is intended.
        model = keras_RNN_CRF.load_embedding_bilstm2_crf_model(
            weights_path, len(vocab), num_labels, 0)
    else:
        weights_path = normal_param.save_path_wordVEC_bilstm
        # The word2vec path supplies its own vocabulary, which replaces the
        # one read above for the rest of this function.
        embeddings_matrix, vocab = process_data_for_keras.txtpad_use_word2vec()
        model = keras_word2vec_bilstm_crf.load_embedding_bilstm2_crf_model(
            weights_path, num_labels, embeddings_matrix,
            normal_param.max_length)

    inferer = NERInference.NERInference(
        model, vocab, ix_to_label, len(vocab), path, mode=mode)
    predicted, _ = inferer.predict_all(is_eval)
    return predicted
def process_data(embeding=None, is_train=True, vocab2=None):
    """Read the corpus and convert it to arrays for the chosen embedding.

    :param embeding: embedding method name; forwarded to the conversion
        helpers. The value ``"wordvec"`` makes the training path encode with
        ``vocab2`` instead of the vocabulary read from
        ``normal_param.lstm_vocab``.
    :param is_train: when truthy, read ``normal_param.head_path`` and split it
        into train/test parts; otherwise read ``normal_param.head_test_path``.
    :param vocab2: vocabulary used only when ``embeding == "wordvec"``.
    :return: ``(x_train, y_train, x_test, y_test, len(vocab), len(labels_to_ix))``
        when ``is_train`` is truthy, else ``(x_test, y_test)``.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    vocab = read_vocab(normal_param.lstm_vocab)

    if not is_train:
        texts, tags = read_data(normal_param.head_test_path, vocab, labels_to_ix)
        longest = gain_max_length(texts, [])
        y_test, x_test = deal_txt_label_to_array(
            texts, tags, vocab, labels_to_ix, longest, mode=embeding)
        return x_test, y_test

    texts, tags = read_data(normal_param.head_path, vocab, labels_to_ix)
    # NOTE(review): the meaning of the literal 50 is defined by split_tst_trn,
    # which is not visible here -- confirm.
    x_train, y_train, x_test, y_test = split_tst_trn(texts, tags, 50)
    longest = gain_max_length(x_train, x_test)

    # Only the vocabulary differs between the word2vec and the default case.
    encoding_vocab = vocab2 if embeding == "wordvec" else vocab
    x_train, y_train, x_test, y_test = list_to_array(
        x_train, y_train, x_test, y_test, encoding_vocab, labels_to_ix,
        longest, wordembeding=embeding)

    # Append a trailing axis of size 1 to both label arrays.
    y_test = np.expand_dims(y_test, 2)
    y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))

    # NOTE(review): the vocabulary size returned is that of ``vocab`` even in
    # "wordvec" mode, where ``vocab2`` did the encoding -- confirm.
    return x_train, y_train, x_test, y_test, len(vocab), len(labels_to_ix)
def process_test_data():
    """Read the test set and convert it to arrays using the ``"bert"`` mode.

    Data comes from ``normal_param.head_test_path`` and is converted with
    ``normal_param.max_length`` as the length argument.

    :return: tuple ``(x_test, y_test)``.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    vocab = read_vocab(normal_param.lstm_vocab)
    texts, tags = read_data(normal_param.head_test_path, vocab, labels_to_ix)
    y_test, x_test = deal_txt_label_to_array(
        texts, tags, vocab, labels_to_ix, normal_param.max_length, mode="bert")
    return x_test, y_test
# Example no. 5
# 0
def pre_score(head_path, mode="rnn"):
    """Print a classification report for predictions over a labelled corpus.

    Text files are listed from ``<head_path>/txt`` and label files from
    ``<head_path>/label``; the i-th text file is scored against the i-th
    label file. Sequences whose predicted and reference lengths differ are
    skipped.

    :param head_path: directory containing the ``txt`` and ``label`` folders.
    :param mode: model selector forwarded to ``prediction_entity.prediction``;
        defaults to ``"rnn"``, the value that was previously hard-coded.
    :return: None; the report is printed.
    """
    paths = normal_util.concat_path(os.path.join(head_path, "txt"))
    label_paths = normal_util.concat_path(os.path.join(head_path, "label"))
    labels_all = []
    pre_labels_all = []

    for index, txt_path in enumerate(paths):
        predicted = prediction_entity.prediction(txt_path, mode=mode)
        expected = NER_pre_data.read_content(label_paths[index])

        for i, predicted_seq in enumerate(predicted):
            expected_seq = expected[i]
            # Sequences that cannot be compared position by position are skipped.
            if len(predicted_seq) != len(expected_seq):
                continue
            pre_labels_all += predicted_seq
            labels_all += expected_seq

    # Reference labels go first, predictions second; the previous order
    # swapped precision and recall in the report.
    # NOTE(review): assumes classification_report follows the scikit-learn
    # (y_true, y_pred) signature -- confirm against the file's import.
    print(classification_report(labels_all, pre_labels_all))
def process_data_gen(data, label, embeding=None):
    """Shuffle one batch of texts/labels and convert it to arrays.

    :param data: texts to convert.
    :param label: labels matching ``data``.
    :param embeding: embedding method name. NOTE(review): currently unused --
        the conversion is always called with ``mode=None``; confirm intent.
    :return: ``(x_train, y_train, len(vocab), len(labels_to_ix))`` where
        ``y_train`` has an extra axis inserted at position 2.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    vocab = read_vocab(normal_param.lstm_vocab)

    shuffled_data, shuffled_label = normal_util.shuffle(data, label)

    # NOTE(review): the result is unpacked as (x, y) here, while
    # process_data / process_test_data unpack the same helper as (y, x) --
    # verify which order deal_txt_label_to_array actually returns.
    x_train, y_train = deal_txt_label_to_array(
        shuffled_data, shuffled_label, vocab, labels_to_ix,
        normal_param.max_length, mode=None)

    y_train = np.expand_dims(y_train, 2)
    return x_train, y_train, len(vocab), len(labels_to_ix)