Example #1
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    # Load CoNLL-format sentences and their POS labels for each split.
    word_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, label_sentences_test = load_conll_data(test_path)
    # Pad every split to the longest sentence across train/dev/test.
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)
    # Build the label alphabet on the training split, persist it, then map
    # dev/test labels with the alphabet closed so no new labels are added.
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(
        label_sentences_train, 'pos')
    alphabet_label.save('pre-trained-model/pos', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(
        label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(
        label_sentences_test, alphabet_label)
    # Build padded word/label tensors and masks over real (non-pad) positions;
    # unknown_embedd presumably covers out-of-vocabulary words.
    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train, label_sentences_id_train, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev, label_sentences_id_dev, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test, label_sentences_id_test, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    # Character alphabet: populate it from the training words only, then close
    # it so dev/test characters unseen in training cannot grow the alphabet.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(
        word_sentences_train, alphabet_char)
    alphabet_char.close()
    # Character embedding table with one row per character in the alphabet.
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim,
                                                      alphabet_char)
    alphabet_char.save('pre-trained-model/pos', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(
        word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(
        word_sentences_test, alphabet_char)
    max_char_length = max(max_char_length_train, max_char_length_dev,
                          max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length,
                                             max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length,
                                           max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length,
                                            max_char_length, alphabet_char)
    # size() - 1: the alphabet size apparently includes one non-label entry.
    num_labels = alphabet_label.size() - 1
    num_data = word_train.shape[0]
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
           label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
           char_embedd_table, num_labels, num_data
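
A minimal usage sketch for this function, assuming the module-level globals it reads (unknown_embedd, embedding_words, embedding_vectors, embedd_dim, word_end) have already been initialized by the surrounding script; the file paths and char_embedd_dim below are placeholders, not values from the project:

# Hypothetical driver code; paths and char_embedd_dim are illustrative only.
(word_train, word_dev, word_test,
 char_train, char_dev, char_test,
 mask_train, mask_dev, mask_test,
 label_train, label_dev, label_test,
 alphabet_label, alphabet_char,
 max_length, max_char_length,
 char_embedd_table, num_labels, num_data) = create_data_2_train(
    'data/pos/train.txt', 'data/pos/dev.txt', 'data/pos/test.txt',
    char_embedd_dim=30)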
Example #2
def load_data_pos(sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim, max_sent_length, max_char_length,
                  alphabet_char):
    # Inference-time loader: tensorize raw tokenized sentences using the
    # alphabets and maximum lengths fixed at training time.
    words, masks = utils.construct_tensor_word(sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim,
                                               max_sent_length)
    index_chars = utils.get_character_indexes(sents, alphabet_char, max_char_length)
    chars = utils.construct_tensor_char(index_chars, max_sent_length, max_char_length, alphabet_char)
    return words, masks, chars
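
Example #2 is the prediction-time counterpart of Example #1: it builds the same word, mask, and character tensors for already-tokenized sentences. A hedged usage sketch follows; the sample sentences are invented, and the embedding globals, max_length, max_char_length, and alphabet_char are assumed to be the artifacts produced or loaded at training time:

# Hypothetical prediction-time call; sents is a list of tokenized sentences.
sents = [['The', 'cat', 'sat'], ['Hello', 'world']]
words, masks, chars = load_data_pos(
    sents, unknown_embedd, embedding_words, embedding_vectors, embedd_dim,
    max_sent_length=max_length, max_char_length=max_char_length,
    alphabet_char=alphabet_char)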
Example #3
def load_data_chunk(sents, pos_sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim, max_sent_length,
                    max_char_length, alphabet_char, alphabet_pos):
    words, masks = utils.construct_tensor_word(sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim,
                                               max_sent_length)
    # Map POS tags to ids with the closed alphabet, one-hot encode them, and
    # append them to the word embeddings along the feature axis.
    index_poss = utils.map_string_2_id_close(pos_sents, alphabet_pos)
    poss = utils.construct_tensor_onehot(index_poss, max_sent_length, alphabet_pos.size())
    words = np.concatenate((words, poss), axis=2)
    index_chars = utils.get_character_indexes(sents, alphabet_char, max_char_length)
    chars = utils.construct_tensor_char(index_chars, max_sent_length, max_char_length, alphabet_char)
    return words, masks, chars
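
The concatenation above means each token's input vector becomes its word embedding followed by a one-hot POS block. For intuition, here is a minimal sketch of what a construct_tensor_onehot-style helper could look like; this is an illustration under assumed padding conventions, not the project's actual implementation:

import numpy as np

def construct_tensor_onehot_sketch(index_sentences, max_length, num_classes):
    # One-hot tensor of shape (num_sentences, max_length, num_classes);
    # positions beyond a sentence's length stay all-zero (padding).
    onehot = np.zeros((len(index_sentences), max_length, num_classes),
                      dtype=np.float32)
    for i, tag_ids in enumerate(index_sentences):
        for j, tag_id in enumerate(tag_ids[:max_length]):
            onehot[i, j, tag_id] = 1.0
    return onehot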
Example #4
File: ner.py Project: Lanuet/NNVLP
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    # CoNLL NER data: each sentence provides word, POS, chunk, and NER label columns.
    word_sentences_train, pos_sentences_train, chunk_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, pos_sentences_dev, chunk_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, pos_sentences_test, chunk_sentences_test, label_sentences_test = load_conll_data(test_path)
    # Pad every split to the longest sentence across train/dev/test.
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)

    # Build POS, chunk, and NER label alphabets on the training split, persist
    # them, and map dev/test with each alphabet closed.
    pos_sentences_id_train, alphabet_pos = utils.map_string_2_id_open(pos_sentences_train, 'pos')
    alphabet_pos.save('pre-trained-model/ner', name='alphabet_pos')
    pos_sentences_id_dev = utils.map_string_2_id_close(pos_sentences_dev, alphabet_pos)
    pos_sentences_id_test = utils.map_string_2_id_close(pos_sentences_test, alphabet_pos)
    chunk_sentences_id_train, alphabet_chunk = utils.map_string_2_id_open(chunk_sentences_train, 'chunk')
    alphabet_chunk.save('pre-trained-model/ner', name='alphabet_chunk')
    chunk_sentences_id_dev = utils.map_string_2_id_close(chunk_sentences_dev, alphabet_chunk)
    chunk_sentences_id_test = utils.map_string_2_id_close(chunk_sentences_test, alphabet_chunk)
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(label_sentences_train, 'ner')
    alphabet_label.save('pre-trained-model/ner', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(label_sentences_test, alphabet_label)

    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train, label_sentences_id_train, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev, label_sentences_id_dev, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test, label_sentences_id_test, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)

    # One-hot encode POS and chunk ids, then concatenate both feature blocks
    # onto the word embeddings along the feature axis.
    pos_train = utils.construct_tensor_onehot(pos_sentences_id_train, max_length, alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_sentences_id_dev, max_length, alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_sentences_id_test, max_length, alphabet_pos.size())
    chunk_train = utils.construct_tensor_onehot(chunk_sentences_id_train, max_length, alphabet_chunk.size())
    chunk_dev = utils.construct_tensor_onehot(chunk_sentences_id_dev, max_length, alphabet_chunk.size())
    chunk_test = utils.construct_tensor_onehot(chunk_sentences_id_test, max_length, alphabet_chunk.size())
    word_train = np.concatenate((word_train, pos_train, chunk_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev, chunk_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test, chunk_test), axis=2)
    # Character alphabet: populate it from the training words only, then close
    # it so dev/test characters unseen in training cannot grow the alphabet.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(word_sentences_train, alphabet_char)
    alphabet_char.close()
    # Character embedding table with one row per character in the alphabet.
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/ner', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(word_sentences_test, alphabet_char)
    max_char_length = max(max_char_length_train, max_char_length_dev, max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length, max_char_length, alphabet_char)
    # size() - 1: the alphabet size apparently includes one non-label entry.
    num_labels = alphabet_label.size() - 1
    num_data, _, embedd_dim_concat = word_train.shape
    np.save("word_train.npy",word_train)
    np.save("label_train.npy", label_train)
    print("Done")
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
           label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
           char_embedd_table, num_labels, num_data, embedd_dim_concat
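
This NER variant extends Example #1 by consuming the POS and chunk columns and concatenating their one-hot encodings onto the word embeddings, which is why it additionally returns embedd_dim_concat. A usage sketch under the same assumptions as Example #1 (pre-initialized embedding globals; the paths are placeholders for CoNLL-format NER files):

# Hypothetical driver code; paths and char_embedd_dim are illustrative only.
(word_train, word_dev, word_test, char_train, char_dev, char_test,
 mask_train, mask_dev, mask_test, label_train, label_dev, label_test,
 alphabet_label, alphabet_char, max_length, max_char_length,
 char_embedd_table, num_labels, num_data, embedd_dim_concat) = \
    create_data_2_train('data/ner/train.txt', 'data/ner/dev.txt',
                        'data/ner/test.txt', char_embedd_dim=30)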