Ejemplo n.º 1
0
    # Derive the morph lookup tables and the word-to-morph mapping from
    # `whole_data_morphs`; `encode_data_morphs` returns two values, bound here
    # to `morph2idx` and `idx2morph`.
    # NOTE(review): presumably these are inverse mappings of each other, as the
    # names suggest -- confirm in prepare_data.
    morph2idx, idx2morph = prepare_data.encode_data_morphs(whole_data_morphs)
    word2morph = prepare_data.word_to_morph(whole_data_morphs)

    # Persist the character vocabulary. open(..., 'wb') does not create missing
    # directories, so `weights/` must already exist.
    with open('weights/char_dict_lower.pkl', 'wb') as f:
        pickle.dump(char2idx, f, pickle.HIGHEST_PROTOCOL)

    # Persist the morph vocabulary alongside it (only `morph2idx` is saved
    # here; `idx2morph` is not written by this fragment).
    with open('weights/morph_dict_lower.pkl', 'wb') as f:
        pickle.dump(morph2idx, f, pickle.HIGHEST_PROTOCOL)



    # Each split below goes through the same four conversions (word, tag,
    # character, morph) and the results are merged by `combine_data`, which
    # also receives MAX_SEQ_LENGTH.
    # NOTE(review): presumably MAX_SEQ_LENGTH is used to pad/truncate
    # sequences -- confirm in prepare_data.combine_data.

    # Training split.
    indexed_data_train = prepare_data.data_to_idx(train_data, word2idx, embeddings)
    indexed_tag_train = prepare_data.tag_to_idx(train_data, tag2idx)
    indexed_char_train = prepare_data.char_to_idx(train_data, char2idx)
    indexed_morph_train = prepare_data.morph_to_idx(train_data, morph2idx, word2morph)
    data_train = prepare_data.combine_data(indexed_data_train, indexed_tag_train, indexed_char_train, indexed_morph_train, MAX_SEQ_LENGTH)


    # Development split (same vocabularies as the training split).
    indexed_data_dev = prepare_data.data_to_idx(dev_data, word2idx, embeddings)
    indexed_tag_dev = prepare_data.tag_to_idx(dev_data, tag2idx)
    indexed_char_dev = prepare_data.char_to_idx(dev_data, char2idx)
    indexed_morph_dev = prepare_data.morph_to_idx(dev_data, morph2idx, word2morph)
    data_dev = prepare_data.combine_data(indexed_data_dev, indexed_tag_dev, indexed_char_dev, indexed_morph_dev, MAX_SEQ_LENGTH)


    # Test split (same vocabularies as the training split).
    indexed_data_test = prepare_data.data_to_idx(test_data, word2idx, embeddings)
    indexed_tag_test = prepare_data.tag_to_idx(test_data, tag2idx)
    indexed_char_test = prepare_data.char_to_idx(test_data, char2idx)
    indexed_morph_test = prepare_data.morph_to_idx(test_data, morph2idx, word2morph)
    data_test = prepare_data.combine_data(indexed_data_test, indexed_tag_test, indexed_char_test, indexed_morph_test, MAX_SEQ_LENGTH)
Ejemplo n.º 2
0
    # Tag <-> index lookup tables; the two dicts are hand-written inverses of
    # each other and must be kept in sync. Indices start at 1.
    # NOTE(review): index 0 is unassigned here, presumably reserved (e.g. for
    # padding) -- confirm.
    tag2idx = {'O': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}
    idx2tag = {1: 'O', 2: 'PER', 3: 'LOC', 4: 'ORG'}


    # Load the character vocabulary in both directions from pickled files.
    # pickle.load can execute arbitrary code, so these files must come from a
    # trusted source.
    with open('weights/char2idx_augmented.pkl', 'rb') as f:
        char2idx = pickle.load(f)
    with open('weights/idx2char_augmented.pkl', 'rb') as f:
        idx2char = pickle.load(f)


    # Convert the test targets to indices: once through the character
    # vocabulary and once through `embeddings`.
    indexed_target_test = prepare_data.label_to_idx(target_test, char2idx)
    indexed_target_word_test = prepare_data.word_to_idx(target_test, embeddings)

    # Pair the test features with the character-indexed targets; the
    # word-indexed targets are not passed to combine_data here.
    test_data = prepare_data.combine_data(features_test, indexed_target_test)


    # Initialize the Encoder. Its first argument is the size of dimension 1 of
    # the first test feature tensor.
    # NOTE(review): presumably that is the per-frame feature dimension --
    # confirm against the Encoder signature.
    encoder = Encoder(features_test[0].size(1), encoder_hidden_size, encoder_layers, batch_size, device).to(device)

    # Initialize the Decoder. The fifth argument is len(char2idx)+1, i.e. one
    # more than the number of entries in the character vocabulary.
    # NOTE(review): the extra slot is presumably for an index that is not a key
    # of char2idx (e.g. padding) -- confirm against the Decoder signature.
    decoder = Decoder(embedding_dim_chars, encoder_hidden_size, attention_hidden_size, num_filters, len(char2idx)+1, decoder_layers, encoder_layers, batch_size, attention_type, device).to(device)


    # Load the checkpoint with its tensors mapped to the CPU, then restore both
    # modules from its 'encoder' and 'decoder' entries.
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load('weights/parliament/state_dict_21.pt', map_location=torch.device('cpu'))
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

Ejemplo n.º 3
0
# Re-number every character whose original index is 4 or above into a dense,
# 0-based vocabulary, preserving the iteration order of `char2idx`.
# NOTE(review): indices 0-3 are presumably special symbols that the CTC
# vocabulary leaves out -- confirm where char2idx is built.
ctc_chars = [char for char, index in char2idx.items() if index >= 4]
idx2char_ctc = dict(enumerate(ctc_chars))
char2idx_ctc = {char: index for index, char in idx2char_ctc.items()}
# Number of characters kept, i.e. the next unused CTC index.
counter = len(ctc_chars)

# Convert the train and dev labels to indices using the full `char2idx`
# vocabulary.
indexed_target_train = prepare_data.label_to_idx(target_train, char2idx)
indexed_target_dev = prepare_data.label_to_idx(target_dev, char2idx)

# Combine features and indexed labels for each split.
train_data = prepare_data.combine_data(features_train, indexed_target_train)
dev_data = prepare_data.combine_data(features_dev, indexed_target_dev)

# Remove extra data that doesn't fit in a batch.
# NOTE(review): presumably this drops trailing samples so that every batch has
# exactly `batch_size` items -- confirm in prepare_data.remove_extra.
train_data = prepare_data.remove_extra(train_data, batch_size)
dev_data = prepare_data.remove_extra(dev_data, batch_size)

# Batched loader over the training data: reshuffled every epoch, batches
# assembled by the project's `prepare_data.collate`, and returned tensors
# placed in pinned memory.
pairs_batch_train = DataLoader(dataset=train_data,
                               batch_size=batch_size,
                               shuffle=True,
                               collate_fn=prepare_data.collate,
                               pin_memory=True)

pairs_batch_dev = DataLoader(dataset=dev_data,
                             batch_size=batch_size,
                             shuffle=True,