from data import read_csv, build_sentences


def get_shallow_stats(x_path):
    """Return (sentence count, token count, vocabulary size) for a data file."""
    X = read_csv(x_path)
    sentences = build_sentences(X, k=1)
    word_count = 0
    vocab = set()
    for sentence in sentences:
        for word in sentence:
            word_count += 1
            vocab.add(word)
    return len(sentences), word_count, len(vocab)
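# Usage sketch: the path below is the training file referenced elsewhere in
# this project; any file in the same CSV layout would work.
if __name__ == '__main__':
    n_sentences, n_tokens, vocab_size = get_shallow_stats("./data/train_x.csv")
    print(f"{n_sentences} sentences, {n_tokens} tokens, {vocab_size} unique words")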
from data import (read_csv, build_sentences_labels, handle_uncommon_words,
                  handle_unknown_words, build_sentences, handle_unknown_sentences)
from probs import build_emission_map, build_transition_map

if __name__ == '__main__':
    X_train = read_csv("./data/dev_x.csv")
    X_labels = read_csv("./data/dev_y.csv")
    sentences, labels = build_sentences_labels(X_train, X_labels, k=2)
    sentences = handle_uncommon_words(sentences)
    # transition_map = build_transition_map(labels)
    # emission_map = build_emission_map(sentences, labels)

    # Collect the training vocabulary.
    vocab = set()
    for sentence in sentences:
        for word in sentence:
            vocab.add(word)

    # List test tokens that are still out-of-vocabulary after unknown-word
    # handling, and write them out for inspection.
    not_found = []
    test_sentences = build_sentences(read_csv('./data/test_x.csv'), k=2)
    test_sentences = handle_unknown_sentences(test_sentences, vocab)
    for test_sentence in test_sentences:
        for test_word in test_sentence:
            if test_word not in vocab:
                not_found.append(test_word)

    with open('output.txt', 'w') as f:
        for word in not_found:
            f.write(f"{word}\n")
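# A minimal sketch of the replacement handle_unknown_sentences is assumed to
# perform: map each out-of-vocabulary token to a placeholder. The real code
# lives in data.py; the '<unk>' token here is an assumption for illustration.
def replace_oov(sentences, vocab, unk_token='<unk>'):
    """Return a copy of sentences with OOV words replaced by unk_token."""
    return [[word if word in vocab else unk_token for word in sentence]
            for sentence in sentences]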
import time

import torch
import torch.optim as optim

import config
import data

# Transformer, ScheduledOptim, prepare_dataloaders, eval_epoch, and train
# are defined elsewhere in this project.

# Inside the per-epoch loop of train():
    print(' - (Training) accuracy: {accu:3.3f} %, '
          'elapse: {elapse:3.3f} min'.format(
              accu=100 * train_accu,
              elapse=(time.time() - start) / 60))

    start = time.time()
    valid_loss, valid_accu = eval_epoch(model, validation_data, predicates)
    print(' - (Validation) accuracy: {accu:3.3f} %, '
          'elapse: {elapse:3.3f} min'.format(
              accu=100 * valid_accu,
              elapse=(time.time() - start) / 60))

    valid_accus += [valid_accu]

device = torch.device('cpu')
word2idx, ints, en1_pos, en2_pos, predicates, relation2idx = data.build_sentences()
training_data, validation_data = prepare_dataloaders(
    word2idx, ints, en1_pos, en2_pos, predicates)
model = Transformer(
    n_src_vocab=len(word2idx),
    len_max_seq=config.max_seq_len).to(device)
optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    512, 1000)
train(model, training_data, validation_data, optimizer, predicates)
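# A sketch of the warm-up schedule ScheduledOptim is assumed to apply (the
# learning-rate schedule from "Attention Is All You Need"); d_model=512 and
# n_warmup_steps=1000 mirror the 512 and 1000 passed to ScheduledOptim above.
def transformer_lr(step, d_model=512, n_warmup_steps=1000):
    """lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)"""
    return d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)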
# Tail of viterbi(): follow the backpointer table from the chosen end state
# to recover the remaining tags, then put the path in sentence order and
# append it to the running prediction list.
    for mp_idx in range(1, len(backpointers)):
        result.append(prev)
        mp = backpointers[mp_idx][prev]
        # print(f"{prev} -> {mp}")
        prev = mp
    result.reverse()
    final = final + result
    return final


from data import read_csv, build_sentences_labels, build_sentences
from probs import build_emission_map, build_transition_map

# get_vocab_sentences, generate_suffix_dict, viterbi, compAccu, and emission
# are defined in this file or its neighbours.

X_train = read_csv("./data/train_x.csv")
Y_train = read_csv("./data/train_y.csv")
sentences, labels = build_sentences_labels(X_train, Y_train, k=1)
# sentences = handle_uncommon_words(sentences, threshold=3)
vocab = get_vocab_sentences(sentences)
emission_map = build_emission_map(sentences, labels)
transition_map = build_transition_map(labels)

X_test = read_csv("./data/dev_x.csv")
y_dev = read_csv("./data/dev_y.csv")
sentences_test = build_sentences(X_test, k=1)
# sentences_test = handle_unknown_sentences(sentences_test, vocab)

suffix_dict = generate_suffix_dict(X_train, Y_train)
pred_dev = viterbi(sentences_test, emission_map, transition_map, suffix_dict)
print('Accuracy:', compAccu(X_test, y_dev, pred_dev, vocab))
print(emission("'s", emission_map))
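# A sketch of the structure generate_suffix_dict is assumed to build: a map
# from word-final suffixes to tag counts, used to score candidate tags for
# words with no emission entry. The suffix length of 3 is an assumption.
from collections import Counter, defaultdict

def build_suffix_counts(word_tag_pairs, suffix_len=3):
    """Map each suffix to a Counter of the tags observed with it."""
    suffix_counts = defaultdict(Counter)
    for word, tag in word_tag_pairs:
        suffix_counts[word[-suffix_len:]][tag] += 1
    return suffix_counts

# e.g. build_suffix_counts([("running", "VBG"), ("jumping", "VBG")])["ing"]
# -> Counter({'VBG': 2})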