def main():
    args = parse_arguments()
    print(args)

    ##################
    #  Data Loading  #
    ##################
    train_corpus, valid_corpus, test_corpus, dictionary = load_corpus(args)
    # print(len(train_corpus.src))
    # print(len(train_corpus.tgt))
    # print(train_corpus.src[2])
    # print(train_corpus.tgt[2])
    # for doc_idx, doc in enumerate(train_corpus.src, 1):
    #     print(doc_idx)
    #     print(doc)
    #     break
    # exit()
    vocab_size = len(dictionary)
    print("vocab_size", vocab_size)

    # For target vocab
    vocab = torch.load('./data/vocab.bin')
    vocab_tgt = vocab.tgt
    vocab_tgt_size = len(vocab_tgt)

    ##################
    #  Model Setup   #
    ##################
    model = build_model(vocab_size, args, dictionary)
    # decoder = Decoder(args.embed_dim, args.hidden_size, out_vocab_size)
    # if use_cuda:
    #     decoder.cuda()

    # train_losses, train_accuracies = run_corpus(train_corpus, model, 'train', train_mode=False)
    train_losses, train_accuracies = run_corpus(test_corpus, model, 'test', train_mode=False)
    )
    exit()

trg_sentences, sentence_embedding, trg_context, trg_target, trg_vocab, \
    context_embedding, dropout, batch, epoch, out_model = sys.argv[1:]
sentence_embedding = int(sentence_embedding)
context_embedding = int(context_embedding)
dropout = float(dropout)
batch = int(batch)
epoch = int(epoch)

print("Loading vocabulary")
trg_vocab, trg_max_features = data_utils.load_vocab(trg_vocab)
print("Loading sentences")
trg_sentences, sentence_max_length = data_utils.load_corpus(trg_sentences)
print("Loading contexts")
trg_context = data_utils.load_context(trg_context)
print("Loading targets")
trg_target = data_utils.load_target(trg_target)
context_max_length = trg_context.shape[1]
validation_size = 0.25
print("Data loaded")

nid_sent = Neural_information_density_sentence(
    trg_sentences, sentence_max_length, trg_context, trg_target,
    trg_max_features, context_max_length, batch, validation_size)
print("Data prepared")

print("Training")
nid_sent.train(sentence_embedding, context_embedding, dropout, epoch,
               out_model)
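# Example invocation (illustrative only): the positional arguments map onto the
# sys.argv unpacking above; every file name and hyperparameter value below is a
# hypothetical placeholder rather than something shipped with this script.
#
#   python <this_script>.py \
#       trg_sentences.pkl 128 trg_context.pkl trg_target.pkl trg_vocab.pkl \
#       64 0.3 32 10 model.nid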
    exit()

src_sentences, src_vocab, src_embedding, trg_context, trg_target, trg_vocab, \
    trg_embedding, dropout, batch, epoch, out_model = sys.argv[1:]
src_embedding = int(src_embedding)
trg_embedding = int(trg_embedding)
dropout = float(dropout)
batch = int(batch)
epoch = int(epoch)

print("Loading vocabulary")
src_vocab, src_max_features = data_utils.load_vocab(src_vocab)
trg_vocab, trg_max_features = data_utils.load_vocab(trg_vocab)
print("Loading source sentences")
src_sentences, src_max_length = data_utils.load_corpus(src_sentences)
print("Loading contexts")
trg_context = data_utils.load_context(trg_context)
print("Loading targets")
trg_target = data_utils.load_target(trg_target)
trg_max_length = trg_context.shape[1]
validation_size = 0.25
print("Data loaded")

nid = Bilingual_neural_information_density(
    src_sentences, src_max_features, src_max_length, trg_context, trg_target,
    trg_max_features, trg_max_length, batch, validation_size)
print("Data prepared")
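# Example invocation (illustrative only): the positional arguments follow the
# sys.argv unpacking above; the file names and hyperparameter values below are
# hypothetical placeholders.
#
#   python <this_script>.py \
#       src_sentences.pkl src_vocab.pkl 128 trg_context.pkl trg_target.pkl \
#       trg_vocab.pkl 128 0.3 32 10 model.nid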
def main(args):
    corpus = load_corpus(args.input)
    prepare_resources(corpus)
    profile_corpus(corpus)
def main():
    args = parse_arguments()
    print(args)

    ##################
    #  Data Loading  #
    ##################
    train_corpus, valid_corpus, test_corpus, dictionary = load_corpus(args)
    vocab_size = len(dictionary)
    print("vocab_size", vocab_size)

    ##################
    #  Model Setup   #
    ##################
    model = build_model(vocab_size, args, dictionary)
    criterion = nn.CrossEntropyLoss()
    if use_cuda:
        criterion = criterion.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    #####################
    #  Training Config  #
    #####################
    num_epochs = args.num_epochs
    config = {
        'ignore_x': args.ignore_x,
        'ignore_r': args.ignore_r,
        'ignore_l': args.ignore_l,
        'ignore_e': args.ignore_e,
        'skip_sentence': args.skip_sentence,
        'max_entity': args.max_entity
    }
    best_valid_loss = None
    early_stop_count = 0
    early_stop_threshold = args.early_stop

    model_name = build_model_name(args)
    model_path = build_model_path(args)
    tensorboard_dir = args.tensorboard
    print("Model will be saved to {}".format(model_path))

    train_writer = SummaryWriter('{}/{}/{}'.format(tensorboard_dir, model_name, 'train'))
    valid_writer = SummaryWriter('{}/{}/{}'.format(tensorboard_dir, model_name, 'valid'))
    test_writer = SummaryWriter('{}/{}/{}'.format(tensorboard_dir, model_name, 'test'))

    for epoch in range(1, num_epochs + 1):
        print("Epoch", epoch)

        # Run training
        random.shuffle(train_corpus.documents)
        train_losses, train_accuracies = run_corpus(
            train_corpus, model, optimizer, criterion, config, train_mode=True)
        train_loss, train_entity_acc = train_losses['loss'], train_accuracies['entity_acc']
        print("train_loss", train_loss, "train_entity_acc", train_entity_acc)
        record_to_writer(train_writer, epoch, train_losses, train_accuracies)

        # Run validation
        valid_losses, valid_accuracies = run_corpus(
            valid_corpus, model, optimizer, criterion, config, train_mode=False)
        valid_loss, valid_entity_acc = valid_losses['loss'], valid_accuracies['entity_acc']
        print("valid_loss", valid_loss, "valid_entity_acc", valid_entity_acc)
        record_to_writer(valid_writer, epoch, valid_losses, valid_accuracies)

        # Early stopping conditioned on validation set loss
        if best_valid_loss is None or valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= early_stop_threshold:
                print("Early stopping criteria met!")
                break

    print("Test set evaluation")
    model.load_state_dict(torch.load(model_path))
    test_losses, test_accuracies = run_corpus(
        test_corpus, model, optimizer, criterion, config, train_mode=False)
    test_loss, test_entity_acc = test_losses['loss'], test_accuracies['entity_acc']
    print("test_loss", test_loss, "test_entity_acc", test_entity_acc)
    record_to_writer(test_writer, epoch, test_losses, test_accuracies)

    train_writer.close()
    valid_writer.close()
    test_writer.close()
import part2_train_utils
import helpers

##############################################################################
# Settings
##############################################################################

CUDA = False

##############################################################################
# Load the dataset
##############################################################################

Data = namedtuple("Data", "corpus train dev test embeddings word_to_index")

data_utils.download_ask_ubuntu_dataset()
EMBEDDINGS, WORD_TO_INDEX = data_utils.load_part2_embeddings()
ASK_UBUNTU_CORPUS = data_utils.load_corpus(WORD_TO_INDEX)
ASK_UBUNTU_TRAIN_DATA = data_utils.load_train_data()
ASK_UBUNTU_DEV_DATA, ASK_UBUNTU_TEST_DATA = data_utils.load_eval_data()
ASK_UBUNTU_DATA = Data(ASK_UBUNTU_CORPUS, ASK_UBUNTU_TRAIN_DATA,
                       ASK_UBUNTU_DEV_DATA, ASK_UBUNTU_TEST_DATA,
                       EMBEDDINGS, WORD_TO_INDEX)

data_utils.download_android_dataset()
ANDROID_CORPUS = data_utils.load_android_corpus(WORD_TO_INDEX)
ANDROID_DEV_DATA, ANDROID_TEST_DATA = data_utils.load_android_eval_data()
ANDROID_DATA = Data(ANDROID_CORPUS, None,
                    ANDROID_DEV_DATA, ANDROID_TEST_DATA,
                    EMBEDDINGS, WORD_TO_INDEX)

##############################################################################
# Train and evaluate a baseline TFIDF model
import train_utils
import helpers

##############################################################################
# Settings
##############################################################################

CUDA = False

##############################################################################
# Load the dataset
##############################################################################

Data = namedtuple("Data", "corpus train dev test embeddings word_to_index")

data_utils.download_ask_ubuntu_dataset()
EMBEDDINGS, WORD_TO_INDEX = data_utils.load_embeddings()
CORPUS = data_utils.load_corpus(WORD_TO_INDEX)
TRAIN_DATA = data_utils.load_train_data()
DEV_DATA, TEST_DATA = data_utils.load_eval_data()
DATA = Data(CORPUS, TRAIN_DATA, DEV_DATA, TEST_DATA,
            EMBEDDINGS, WORD_TO_INDEX)

##############################################################################
# Train and evaluate the models for Part 1
##############################################################################

RESULTS = []
MARGINS = [0.2]
MAX_EPOCHS = 50
BATCH_SIZE = 32
FILTER_WIDTHS = [3]
POOL_METHOD = "average"
FEATURE_DIMS = [600]