def read_data(head_path, vocab = 1, label_to_ix = 2, max_length = 0):
    """Read NER texts and labels from the directory rooted at ``head_path``.

    :param head_path: root directory holding the label/txt sub-paths that
        ``NER_pre_data.concat_path`` expands.
    :param vocab: unused here; kept for call-site compatibility.
    :param label_to_ix: unused here; kept for call-site compatibility.
    :param max_length: unused here; kept for call-site compatibility.
    :return: ``(txts, labels)`` exactly as ``NER_pre_data.load_data`` yields
        them (no padding or index conversion is done here).
    """
    new_label_path, new_txt_paths = NER_pre_data.concat_path(head_path)
    txts, labels = NER_pre_data.load_data(new_label_path, new_txt_paths)
    return txts, labels
def prediction(path, mode="bert_bilstm", is_eval=False):
    """Run NER inference over the file at ``path`` with the chosen model.

    :param path: text file to tag.
    :param mode: one of "lstm", "bilstm", "bert_bilstm", "rnn"; any other
        value selects the word2vec-embedding BiLSTM-CRF model.
    :param is_eval: forwarded to ``NERInference.predict_all``.
    :return: the predicted label sequences.
    """
    labels_to_ix, ix_to_label = NER_pre_data.build_label(normal_param.labels)
    vocab = process_data_for_keras.read_vocab(normal_param.lstm_vocab)
    n_labels = len(labels_to_ix)

    if mode == "lstm":
        model = keras_LSTM_CRF.load_embedding_bilstm2_crf_model(
            normal_param.save_path_lstm, len(vocab), n_labels,
            normal_param.max_length)
    elif mode == "bilstm":
        model = keras_BILSTM_CEF.load_embedding_bilstm2_crf_model(
            normal_param.save_path_bilstm, len(vocab), n_labels,
            normal_param.max_length)
    elif mode == "bert_bilstm":
        model = keras_Bert_bilstm_crf.load_embedding_bilstm2_crf_model(
            normal_param.save_path_bert_bilstm, n_labels)
    elif mode == "rnn":
        model = keras_RNN_CRF.load_embedding_bilstm2_crf_model(
            normal_param.save_path_gru, len(vocab), n_labels, 0)
    else:
        # The word2vec path also swaps in the embedding vocabulary.
        embeddings_matrix, vocab = process_data_for_keras.txtpad_use_word2vec()
        model = keras_word2vec_bilstm_crf.load_embedding_bilstm2_crf_model(
            normal_param.save_path_wordVEC_bilstm, n_labels,
            embeddings_matrix, normal_param.max_length)

    inferer = NERInference.NERInference(model, vocab, ix_to_label, len(vocab),
                                        path, mode=mode)
    tagged, _ = inferer.predict_all(is_eval)
    return tagged
def process_data(embeding = None, is_train = True, vocab2 = None):
    """Prepare model-ready arrays according to the embedding method.

    :param embeding: embedding method: "bert", "wordvec", or None.
    :param is_train: when True, build a train/test split from the training
        corpus; otherwise build test arrays from the test corpus only.
    :param vocab2: vocabulary used instead of the LSTM vocab when
        ``embeding == "wordvec"``.
    :return: ``(x_train, y_train, x_test, y_test, vocab_size, n_labels)``
        when ``is_train`` is True, else ``(x_test, y_test)``.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    vocab = read_vocab(normal_param.lstm_vocab)
    if is_train:
        x, y = read_data(normal_param.head_path, vocab, labels_to_ix)
        x_train, y_train, x_test, y_test = split_tst_trn(x, y, 50)
        length = gain_max_length(x_train, x_test)
        # "wordvec" indexes with the word2vec vocabulary; every other mode
        # uses the plain LSTM vocabulary.
        used_vocab = vocab2 if embeding == "wordvec" else vocab
        x_train, y_train, x_test, y_test = list_to_array(
            x_train, y_train, x_test, y_test, used_vocab, labels_to_ix,
            length, wordembeding=embeding)
        # The CRF layer expects labels shaped (samples, timesteps, 1).
        y_test = np.expand_dims(y_test, 2)
        y_train = np.expand_dims(y_train, 2)
        return x_train, y_train, x_test, y_test, len(vocab), len(labels_to_ix)
    else:
        x, y = read_data(normal_param.head_test_path, vocab, labels_to_ix)
        length = gain_max_length(x, [])
        y_test, x_test = deal_txt_label_to_array(x, y, vocab, labels_to_ix,
                                                 length, mode=embeding)
        return x_test, y_test
def process_test_data():
    """Build the test-set arrays with BERT-style preprocessing.

    :return: ``(x_test, y_test)`` padded to ``normal_param.max_length``.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    lstm_vocab = read_vocab(normal_param.lstm_vocab)
    texts, tags = read_data(normal_param.head_test_path, lstm_vocab,
                            labels_to_ix)
    y_test, x_test = deal_txt_label_to_array(
        texts, tags, lstm_vocab, labels_to_ix, normal_param.max_length,
        mode="bert")
    return x_test, y_test
def pre_score(head_path):
    """Score "rnn"-mode predictions under ``head_path`` and print a report.

    Expects ``head_path`` to contain parallel "txt" and "label"
    sub-directories whose files line up pairwise. Sentences whose predicted
    and gold label sequences differ in length are skipped.

    :param head_path: root directory of the evaluation corpus.
    """
    paths = normal_util.concat_path(os.path.join(head_path, "txt"))
    label_paths = normal_util.concat_path(os.path.join(head_path, "label"))
    labels_all = []
    pre_labels_all = []
    for txt_path, label_path in zip(paths, label_paths):
        predicted = prediction_entity.prediction(txt_path, mode="rnn")
        gold = NER_pre_data.read_content(label_path)
        for pred_seq, gold_seq in zip(predicted, gold):
            if len(pred_seq) != len(gold_seq):
                continue
            pre_labels_all += pred_seq
            labels_all += gold_seq
    # sklearn's signature is (y_true, y_pred): gold labels come first.
    # The original call had the arguments swapped, which transposes the
    # reported precision and recall.
    print(classification_report(labels_all, pre_labels_all))
def process_data_gen(data, label, embeding = None):
    """Prepare one generator batch of training arrays.

    :param data: list of token sequences.
    :param label: list of matching label sequences.
    :param embeding: currently unused — the arrays are always built with
        ``mode=None``; kept for call-site compatibility.
    :return: ``(x_train, y_train, vocab_size, n_labels)``; ``y_train`` gains
        a trailing axis of size 1 as expected by the CRF layer.
    """
    labels_to_ix, _ = NER_pre_data.build_label(normal_param.labels)
    vocab = read_vocab(normal_param.lstm_vocab)
    # Shuffle texts and labels together so pairs stay aligned.
    data, label = normal_util.shuffle(data, label)
    length = normal_param.max_length
    x_train, y_train = deal_txt_label_to_array(data, label, vocab,
                                               labels_to_ix, length,
                                               mode=None)
    y_train = np.expand_dims(y_train, 2)
    return x_train, y_train, len(vocab), len(labels_to_ix)