def build_data_loader(raw_data, frequencies, algorithm, context_size=data_hyperparameters.CONTEXT_SIZE,
                      threshold=data_hyperparameters.SUBSAMPLE_THRESHOLD,
                      min_review_length=data_hyperparameters.MIN_REVIEW_LENGTH, sub_sample=False,
                      batch_size=data_hyperparameters.WORD_EMBEDDING_BATCH_SIZE, shuffle=False):
    """Turn raw reviews into a DataLoader of (context, target) pairs for embedding training.

    Each review is expanded into training pairs by pre_process_words; when
    sub_sample is True, pairs whose target word survives a frequency-based
    subsampling draw are kept and the rest are discarded.
    """
    contexts, targets = [], []
    for review in raw_data:
        for context, target in pre_process_words(review, algorithm, context_size, min_review_length):
            # Word2vec-style subsampling: probabilistically drop very frequent targets.
            if sub_sample and subsample_word(frequencies[target], threshold):
                continue
            contexts.append(context)
            targets.append(target)
    write_log('Size of data: {0}'.format(len(contexts)), logger)
    xs = torch.tensor(contexts, device=device)
    ys = torch.tensor(targets, device=device)
    dataset = torch.utils.data.TensorDataset(xs, ys)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
def run_experiment(env, agent, name, log_object, episodes, episodes_for_logging, lr=1e-4):
    """Train an actor-critic agent on env for the given number of episodes.

    Logs progress (elapsed time and recent mean reward) every
    episodes_for_logging episodes and returns the per-episode reward list.
    """
    actor_optimiser = torch.optim.Adam(agent.actor.parameters(), lr=lr)
    critic_optimiser = torch.optim.Adam(agent.critic.parameters(), lr=lr)
    rewards = []
    block_start = datetime.now()
    for episode in range(episodes):
        rewards.append(play_and_train_network(env, agent, actor_optimiser, critic_optimiser))
        # Skip the very first episode so the timing window covers a full block.
        if episode == 0 or episode % episodes_for_logging != 0:
            continue
        elapsed = (datetime.now() - block_start).total_seconds()
        write_log(
            'Computed {0} out of {1} episodes for {2} in {3} seconds'.format(episode, episodes, name, elapsed),
            log_object)
        block_start = datetime.now()
        write_log('Mean reward: {0}'.format(np.mean(rewards[-episodes_for_logging:])), log_object)
    return rewards
def loadModelState_w2v(model_name, algorithm_type,
                       unigram_distribution_power=data_hyperparameters.UNIGRAM_DISTRIBUTION_POWER):
    """Load a saved word2vec model plus its word frequencies and noise distribution.

    Reads the hyperparameter pickle written by save_model_state_w2v, rebuilds
    the matching model class (CBOW or SGNS), and restores its weights.
    Returns (frequencies, distribution, model) with the model in eval mode.
    """
    # Use context managers so the cache files are always closed.
    with open(FREQS_FILE, "rb") as freqs_file:
        frequencies = pickle.load(freqs_file)
    distribution = noise_distribution(frequencies, unigram_distribution_power)
    with open(model_name + '_' + algorithm_type + '_model_data', 'rb') as infile:
        model_data = pickle.load(infile)
    if algorithm_type.upper() == 'CBOW':
        model = ContinuousBagOfWords(data_hyperparameters.VOCAB_SIZE, model_data['embeddingDim'],
                                     model_data['contextSize'], model_name)
    else:
        model = SkipGramWithNegativeSampling(data_hyperparameters.VOCAB_SIZE, model_data['embeddingDim'],
                                             model_data['contextSize'], model_data['numNegativeSamples'],
                                             model_data['innerProductClamp'], model_name)
    # Bug fix: the weights file is written as '<name>_<algorithm>.pt' (see
    # save_model_state_w2v) but was previously loaded without the '_' separator.
    model.load_state_dict(torch.load(model_name + '_' + algorithm_type + '.pt'))
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    write_log('Loaded model {0}'.format(model_name), logger)
    model.eval()
    return frequencies, distribution, model
def split_data(fit_mapped_tokens, fit_labels):
    """Split the fit set into training and validation subsets.

    Returns (y_train, X_train, y_valid, X_valid) — note the labels-first
    ordering expected by callers.
    """
    start = datetime.now()
    write_log('Splitting fit data into training and validation sets', logger)
    X_train, X_valid, y_train, y_valid = train_test_split(
        fit_mapped_tokens, fit_labels, test_size=data_hyperparameters.TRAIN_VALID_SPLIT)
    elapsed = (datetime.now() - start).total_seconds()
    write_log('Splitting took {0} seconds'.format(elapsed), logger)
    return y_train, X_train, y_valid, X_valid
def load_model_state(model, model_name):
    """Restore a saved classifier's weights and training bookkeeping from saved_models/."""
    model.load_state_dict(torch.load('saved_models/{0}.pt'.format(model_name)))
    write_log('Loaded model {0} weights'.format(model_name), logger)
    with open('saved_models/{0}_model_data.pkl'.format(model_name), 'rb') as infile:
        model_data = load(infile)
    # Copy every persisted bookkeeping attribute back onto the model instance.
    for attribute in ('train_losses', 'valid_losses', 'num_epochs_trained', 'latest_scheduled_lr',
                      'train_time', 'num_trainable_params', 'instantiated', 'name', 'vocab_size',
                      'tokenizer', 'batch_size', 'train_accuracies', 'valid_accuracies'):
        setattr(model, attribute, model_data[attribute])
    write_log('Loaded model {0} state'.format(model_name), logger)
def get_vocab():
    """Build the IMDB vocabulary (capped at VOCAB_SIZE) or load it from the on-disk cache.

    On first use, downloads the raw IMDB datasets, builds a size-limited vocab
    from the training split and pickles it to VOCAB_FILE; later calls simply
    unpickle that file.
    """
    if not os.path.exists(VOCAB_FILE):
        write_log('Downloading raw data', logger)
        now = datetime.now()
        dataset_fit_raw, dataset_test_raw = torchtext.experimental.datasets.IMDB(tokenizer=tokenizer)
        write_log('Download took {0} seconds'.format((datetime.now() - now).total_seconds()), logger)
        # The standard IMDB splits are 25k reviews each; fail loudly if not.
        assert len(dataset_fit_raw) == 25000
        assert len(dataset_test_raw) == 25000
        write_log('Building vocab', logger)
        vocab = dataset_fit_raw.get_vocab()
        new_vocab = torchtext.vocab.Vocab(counter=vocab.freqs, max_size=VOCAB_SIZE)
        save_data(new_vocab, VOCAB_FILE)
        write_log('Vocab saved to {0}'.format(VOCAB_FILE), logger)
    else:
        write_log('Loading vocab from {0}'.format(VOCAB_FILE), logger)
        # Bug fix: pickle.load(open(...)) previously leaked the file handle.
        with open(VOCAB_FILE, "rb") as vocab_file:
            new_vocab = pickle.load(vocab_file)
    return new_vocab
def save_model_state_w2v(model):
    """Persist a word2vec model: weights to '<name>_<algorithm>.pt' and
    hyperparameters to a pickled '<name>_<algorithm>_model_data' dict.

    Bug fixes versus the original:
    - The original mixed `model.algorithmType` and `model.algorithm_type`;
      normalised to `algorithm_type`, the spelling used by this function's own
      CBOW check. (NOTE(review): confirm against the model classes.)
    - The pickled keys were snake_case but loadModelState_w2v reads camelCase
      ('embeddingDim', 'contextSize', ...), which made every load KeyError;
      keys now match the loader.
    - The output file is now closed via a context manager.
    """
    torch.save(model.state_dict(), model.name + '_' + model.algorithm_type + '.pt')
    model_data = {
        'embeddingDim': model.embedding_dim,
        'contextSize': model.context_size
    }
    if model.algorithm_type != 'CBOW':
        # SGNS carries two extra hyperparameters that CBOW does not have.
        model_data['numNegativeSamples'] = model.num_negative_samples
        model_data['innerProductClamp'] = model.inner_product_clamp
    with open(model.name + '_' + model.algorithm_type + '_model_data', 'wb') as outfile:
        pickle.dump(model_data, outfile)
    write_log('Saved model ' + model.name, logger)
def setup(algorithm, batch_size=data_hyperparameters.WORD_EMBEDDING_BATCH_SIZE,
          context_size=data_hyperparameters.CONTEXT_SIZE,
          threshold=data_hyperparameters.SUBSAMPLE_THRESHOLD,
          unigram_distribution_power=data_hyperparameters.UNIGRAM_DISTRIBUTION_POWER,
          min_review_length=data_hyperparameters.MIN_REVIEW_LENGTH):
    """Prepare everything needed to train a word-embedding model.

    Loads frequencies and tokenised data, derives the negative-sampling noise
    distribution, splits the data, and builds subsampled train/validation
    loaders (the validation loader uses a doubled batch size and no shuffle).
    Returns (frequencies, distribution, train_loader, valid_loader).
    """
    start = datetime.now()
    frequencies, data = get_data()
    distribution = noise_distribution(frequencies, unigram_distribution_power)
    train_data, valid_data = split_data(data)
    write_log('Train data', logger)
    train_loader = build_data_loader(train_data, frequencies, algorithm, context_size, threshold,
                                     min_review_length, sub_sample=True, batch_size=batch_size,
                                     shuffle=True)
    write_log('Validation data', logger)
    valid_loader = build_data_loader(valid_data, frequencies, algorithm, context_size, threshold,
                                     min_review_length, sub_sample=True, batch_size=2 * batch_size,
                                     shuffle=False)
    write_log('Setting up took: {0} seconds'.format((datetime.now() - start).total_seconds()), logger)
    return frequencies, distribution, train_loader, valid_loader
def load_model_state(model, model_name):
    """Restore a saved seq2seq model's weights and training bookkeeping from saved_models/."""
    model.load_state_dict(torch.load('saved_models/{0}.pt'.format(model_name)))
    write_log('Loaded model {0} weights'.format(model_name), logger)
    with open('saved_models/{0}_model_data.pkl'.format(model_name), 'rb') as infile:
        model_data = load(infile)
    # Copy every persisted bookkeeping attribute back onto the model instance.
    for attribute in ('train_losses', 'valid_losses', 'train_bleus', 'valid_bleus',
                      'num_epochs_trained', 'latest_scheduled_lr', 'lr_history', 'train_time',
                      'num_trainable_params', 'instantiated', 'name', 'batch_size',
                      'teacher_forcing_proportion', 'teacher_forcing_proportion_history'):
        setattr(model, attribute, model_data[attribute])
    write_log('Loaded model {0} state'.format(model_name), logger)
def get_model_performance_data():
    """Fit a naive Bayes sentiment classifier on IMDB and return its performance summary.

    Counts, per vocabulary word, how often it appears in any review and how
    often in a positive one, smooths the ratio, then scores the fit and test
    sets. Returns a dict in the common model-reporting schema (nan where a
    field does not apply to naive Bayes).
    """
    _, dataset_fit, dataset_test = data_downloader.get_data()
    now = datetime.now()
    write_log('Computing feature probabilities for naive Bayes', logger)
    # Column 0: appearances in positive reviews; column 1: appearances overall.
    scores = np.zeros((len(dataset_fit.get_vocab()), 2))
    for example in dataset_fit:
        is_positive = example[0].item() == 1
        # Deduplicate tokens so each word counts once per review.
        for word in set(example[1].tolist()):
            scores[word, 1] += 1
            if is_positive:
                scores[word, 0] += 1
    probs = (1 + scores[:, 0]) / (2 + scores[:, 1])  # Laplacian smoothing
    train_time = (datetime.now() - now).total_seconds()
    write_log('Computation took {0} seconds'.format(train_time), logger)
    fit_predictions = [NB_predict(example, probs) for example in dataset_fit]
    test_predictions = [NB_predict(example, probs) for example in dataset_test]
    fit_accuracy = compute_accuracy(fit_predictions, [example[0].item() for example in dataset_fit])
    test_accuracy = compute_accuracy(test_predictions, [example[0].item() for example in dataset_test])
    return {
        'name': 'NB',
        'train_accuracy': fit_accuracy,
        'valid_accuracy': nan,
        'test_accuracy': test_accuracy,
        'total_train_time': train_time,
        'num_epochs': 1,
        'trainable_params': len(probs),
        'final_train_loss': nan,
        'final_valid_loss': nan,
        'model_created': now,
        'average_time_per_epoch': train_time,
        'vocab_size': data_hyperparameters.VOCAB_SIZE,
        'tokenizer': data_hyperparameters.TOKENIZER,
        'batch_size': nan
    }
def get_data():
    """Return (freqs, all_mapped_tokens) for the unsupervised IMDB reviews.

    On first use, computes normalised word frequencies from the vocab, maps
    every unsupervised review to vocab indices, and caches both to disk;
    later calls load the caches.
    """
    if not os.path.exists(TOKENIZER_FILE):
        vocab = get_vocab(
        )  # Strictly speaking this is coming from the training rather than unsupervised data
        vocab_index = vocab.itos
        vocab_reverse_index = vocab.stoi
        freqs = list(map(lambda i: vocab.freqs[vocab_index[i]], range(len(vocab))))
        # Normalise raw counts into relative frequencies.
        total = sum(freqs)
        for i in range(len(freqs)):
            freqs[i] /= total
        save_data(freqs, FREQS_FILE)
        unsup_data = glob.iglob('.data/aclImdb/train/unsup/*')
        all_mapped_tokens = []
        write_log('Iterating through unsupervised data (this could take a while)', logger)
        now = datetime.now()
        for data in unsup_data:
            with open(data, 'r') as f:
                if data_hyperparameters.VERBOSE_LOGGING:
                    # Bug fix: log the file path rather than the repr of the handle.
                    write_log('Processing {0}'.format(data), logger)
                text = f.read()
                mapped_tokens = [vocab_reverse_index[token] for token in tokenizer(text)]
                all_mapped_tokens.append(mapped_tokens)
        write_log('Iteration took {0} seconds'.format((datetime.now() - now).total_seconds()), logger)
        # The IMDB unsupervised split contains exactly 50k reviews.
        assert len(all_mapped_tokens) == 50000
        save_data(all_mapped_tokens, TOKENIZER_FILE)
    else:
        write_log('Loading tokens from {0}'.format(TOKENIZER_FILE), logger)
        # Bug fix: close the cache files after unpickling (handles were leaked before).
        with open(TOKENIZER_FILE, "rb") as tokens_file:
            all_mapped_tokens = pickle.load(tokens_file)
        write_log('Loading frequencies from {0}'.format(FREQS_FILE), logger)
        with open(FREQS_FILE, "rb") as freqs_file:
            freqs = pickle.load(freqs_file)
    return freqs, all_mapped_tokens
def prepare_data(batch_size=data_hyperparameters.BATCH_SIZE):
    """Build or load the English/French translation corpora and return data loaders.

    On first use: reads tab-separated sentence pairs from DATA_FILE, tokenises
    and length-filters them, splits into fit/valid/test, builds both Language
    vocabularies from the fit split only, indexes every split and caches
    everything to disk. Later calls load the cached languages and indices.
    Returns (french, english, train_loader, valid_loader, test_loader).
    """
    english = Language('en')
    french = Language('fr')
    # Rebuild everything if any one of the cache files is missing.
    if not os.path.exists(EN_WORD_TO_INDEX_FILE) or not os.path.exists(EN_WORD_TO_COUNT_FILE) or not os.path.exists(EN_INDEX_TO_WORD_FILE) or not os.path.exists(FR_WORD_TO_INDEX_FILE) or not os.path.exists(FR_WORD_TO_COUNT_FILE) or not os.path.exists(FR_INDEX_TO_WORD_FILE) or not os.path.exists(EN_FIT_INDEX_FILE) or not os.path.exists(EN_VALID_INDEX_FILE) or not os.path.exists(EN_TEST_INDEX_FILE) or not os.path.exists(FR_FIT_INDEX_FILE) or not os.path.exists(FR_VALID_INDEX_FILE) or not os.path.exists(FR_TEST_INDEX_FILE):
        en_sentences = []
        fr_sentences = []
        write_log('Reading sentence pairs from file', logger)
        with open(DATA_FILE, mode='r', encoding='utf-8') as f:
            for line in f:
                # Each line is "english<TAB>french".
                en_sentence, fr_sentence = line.strip().split('\t')
                en_sentence = [t.text for t in english.tokenizer(normalize_string(en_sentence))]
                fr_sentence = [t.text for t in french.tokenizer(normalize_string(fr_sentence))]
                # TODO: remove this filtration — drops pairs where either side exceeds MAX_LENGTH.
                if len(en_sentence) > data_hyperparameters.MAX_LENGTH or len(fr_sentence) > data_hyperparameters.MAX_LENGTH:
                    continue
                en_sentences.append(en_sentence)
                fr_sentences.append(fr_sentence)
        write_log('Splitting data', logger)
        # Two-stage split: carve off the test set first, then valid from the remainder.
        en_sentences_train, en_sentences_test, fr_sentences_train, fr_sentences_test = train_test_split(en_sentences, fr_sentences, test_size=data_hyperparameters.TRAIN_TEST_SPLIT)
        en_sentences_fit, en_sentences_valid, fr_sentences_fit, fr_sentences_valid = train_test_split(en_sentences_train, fr_sentences_train, test_size=data_hyperparameters.TRAIN_VALID_SPLIT)
        write_log('Building languages', logger)
        # Vocabularies come from the fit split only, to avoid leakage into valid/test.
        english.read(en_sentences_fit)
        english.cache()
        french.read(fr_sentences_fit)
        french.cache()
        write_log('Indexing sentences', logger)
        en_sentences_fit_index = english.index_sentences(en_sentences_fit)
        save_data(en_sentences_fit_index, EN_FIT_INDEX_FILE)
        en_sentences_valid_index = english.index_sentences(en_sentences_valid)
        save_data(en_sentences_valid_index, EN_VALID_INDEX_FILE)
        en_sentences_test_index = english.index_sentences(en_sentences_test)
        save_data(en_sentences_test_index, EN_TEST_INDEX_FILE)
        fr_sentences_fit_index = french.index_sentences(fr_sentences_fit)
        save_data(fr_sentences_fit_index, FR_FIT_INDEX_FILE)
        fr_sentences_valid_index = french.index_sentences(fr_sentences_valid)
        save_data(fr_sentences_valid_index, FR_VALID_INDEX_FILE)
        fr_sentences_test_index = french.index_sentences(fr_sentences_test)
        save_data(fr_sentences_test_index, FR_TEST_INDEX_FILE)
    else:
        write_log('Loading languages from disk cache', logger)
        english.load()
        french.load()
        write_log('Loading indexed sentences from disk cache', logger)
        en_sentences_fit_index = pickle.load(open(EN_FIT_INDEX_FILE, 'rb'))
        en_sentences_valid_index = pickle.load(open(EN_VALID_INDEX_FILE, 'rb'))
        en_sentences_test_index = pickle.load(open(EN_TEST_INDEX_FILE, 'rb'))
        fr_sentences_fit_index = pickle.load(open(FR_FIT_INDEX_FILE, 'rb'))
        fr_sentences_valid_index = pickle.load(open(FR_VALID_INDEX_FILE, 'rb'))
        fr_sentences_test_index = pickle.load(open(FR_TEST_INDEX_FILE, 'rb'))
    # Loaders are built French-first: the model translates fr -> en.
    train_data_loader = get_dataloader(fr_sentences_fit_index, en_sentences_fit_index, batch_size=batch_size)
    write_log('{0} batches in training data'.format(len(train_data_loader)), logger)
    valid_data_loader = get_dataloader(fr_sentences_valid_index, en_sentences_valid_index, batch_size=batch_size)
    write_log('{0} batches in validation data'.format(len(valid_data_loader)), logger)
    test_data_loader = get_dataloader(fr_sentences_test_index, en_sentences_test_index, batch_size=batch_size)
    write_log('{0} batches in test data'.format(len(test_data_loader)), logger)
    return french, english, train_data_loader, valid_data_loader, test_data_loader
def train_agent(epsilon, discount_factor, env, agent_name, log_object, dqn_type='duelling', replay_type='prioritised'):
    """Train a DQN-family agent on env with experience replay.

    Builds either a duelling or double DQN and either a prioritised or uniform
    replay buffer, then runs NUM_STEPS environment steps, learning from
    replay minibatches once the buffer holds at least REPLAY_BATCH_SIZE
    transitions. Returns the list of per-episode total rewards.
    Assumes the classic gym step API returning (obs, reward, done, info).
    """
    write_log('Running agent {0}'.format(agent_name), log_object)
    now = datetime.now()
    n_actions = env.action_space.n
    state_dim = env.observation_space.shape[0]
    if dqn_type == 'duelling':
        agent = model_classes.DuellingDQN(epsilon, discount_factor, state_dim, n_actions)
        # The duelling agent has three separate sub-networks to optimise jointly.
        optimiser = torch.optim.Adam([{
            'params': agent.online_feature_network.parameters()
        }, {
            'params': agent.online_value_network.parameters()
        }, {
            'params': agent.online_advantage_network.parameters()
        }])
    else:
        agent = model_classes.DoubleDQN(epsilon, discount_factor, state_dim, n_actions)
        optimiser = torch.optim.Adam(agent.online_network.parameters())
    if replay_type == 'prioritised':
        replay = model_classes.PrioritisedReplay(data_hyperparameters.REPLAY_CAPACITY)
    else:
        replay = model_classes.Replay(data_hyperparameters.REPLAY_CAPACITY)
    s = env.reset()
    total_reward = 0.
    total_rewards = []
    for t in range(data_hyperparameters.NUM_STEPS):
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)
        total_reward += r
        replay.add(s, a, r, next_s, done)
        # Learn only once the buffer can supply a full minibatch.
        if len(replay) >= data_hyperparameters.REPLAY_BATCH_SIZE:
            s_batch, a_batch, r_batch, next_s_batch, done_batch, i_batch, w_batch = replay.sample(
                data_hyperparameters.REPLAY_BATCH_SIZE)
            batch_loss = agent.calculate_temporal_difference_loss(
                s_batch, a_batch, r_batch, next_s_batch, done_batch, w_batch)
            if replay_type == 'prioritised':
                # New priorities are the per-transition TD losses (plus epsilon so
                # no transition's sampling probability collapses to zero).
                with torch.no_grad():
                    new_priorities = batch_loss + data_hyperparameters.PRIORITY_ADJUSTMENT_EPSILON
                    if agent.use_cuda:
                        new_priorities = new_priorities.cpu()
                    new_priorities = new_priorities.numpy()
                replay.update_priorities(i_batch, new_priorities)
            optimiser.zero_grad()
            # Reduce the per-transition losses to a scalar before backprop.
            batch_loss = torch.mean(batch_loss)
            batch_loss.backward()
            optimiser.step()
        s = next_s
        agent.adjust_epsilon()
        replay.adjust_beta()
        if done:
            total_rewards.append(total_reward)
            total_reward = 0.
            s = env.reset()
        # Periodically copy online-network weights into the target network.
        if t % data_hyperparameters.NUM_STEPS_FOR_SYNCHRONISING == 0:
            agent.synchronise()
    write_log(
        'Agent {0} took {1} seconds total'.format(agent_name, (datetime.now() - now).total_seconds()),
        log_object)
    return total_rewards
def train_w2v(model_name, train_loader, valid_loader, vocab_size=data_hyperparameters.VOCAB_SIZE,
              distribution=None, epochs=data_hyperparameters.WORD_EMBEDDING_EPOCHS, lr=1e-2,
              embedding_dim=data_hyperparameters.WORD_EMBEDDING_DIMENSION,
              context_size=data_hyperparameters.CONTEXT_SIZE,
              inner_product_clamp=data_hyperparameters.INNER_PRODUCT_CLAMP,
              num_negative_samples=data_hyperparameters.NUM_NEGATIVE_SAMPLES, algorithm='SGNS'):
    """Train a CBOW or SGNS word-embedding model and save its learning-curve plot.

    For CBOW the loss is NLL over the predicted centre word; for SGNS the loss
    is the model's own negative-sampling objective, with negatives drawn from
    `distribution`. Validation loss drives a ReduceLROnPlateau scheduler.
    Returns the trained model.
    """
    train_losses = []
    valid_losses = []
    if algorithm.upper() == 'CBOW':
        model = ContinuousBagOfWords(vocab_size, embedding_dim, context_size, model_name)
        loss_function = torch.nn.NLLLoss()
    else:
        model = SkipGramWithNegativeSampling(vocab_size, embedding_dim, context_size,
                                             num_negative_samples, inner_product_clamp, model_name)
        distribution_tensor = torch.tensor(distribution, dtype=torch.float, device=device)
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1,
                                                           verbose=True)
    write_log(
        'Training on {0} batches and validating on {1} batches'.format(len(train_loader),
                                                                       len(valid_loader)), logger)
    for epoch in range(epochs):
        now = datetime.now()
        write_log('Epoch: {0}'.format(epoch), logger)
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                xb = xb.cuda()
                yb = yb.cuda()
            if algorithm.upper() == 'CBOW':
                predictions = model(xb)
                loss = loss_function(predictions, yb)
            else:
                negative_samples = produce_negative_samples(distribution_tensor,
                                                            num_negative_samples, len(yb))
                loss = torch.mean(model(yb, xb, negative_samples))
            # Bug fix: the original called optimizer.zero_grad() AFTER loss.backward()
            # and BEFORE optimizer.step(), which wiped the freshly computed gradients
            # so the optimizer always stepped on zeros. Correct order: zero, backward, step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        train_loss = total_loss / len(train_loader)
        write_log('Training loss: {0}'.format(train_loss), logger)
        train_losses.append(train_loss)
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            for xb, yb in valid_loader:
                if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                    xb = xb.cuda()
                    yb = yb.cuda()
                if algorithm.upper() == 'CBOW':
                    valid_loss += loss_function(model(xb), yb).item()
                elif algorithm.upper() == 'SGNS':
                    negative_samples = produce_negative_samples(distribution_tensor,
                                                                num_negative_samples, len(yb))
                    loss = model(yb, xb, negative_samples)
                    valid_loss += torch.mean(loss).item()
            valid_loss = valid_loss / len(valid_loader)
            valid_losses.append(valid_loss)
            write_log('Validation loss: {0}'.format(valid_loss), logger)
        seconds = (datetime.now() - now).total_seconds()
        write_log('Epoch took: {0} seconds'.format(seconds), logger)
        scheduler.step(valid_loss)
    # Persist the learning curve for later comparison across runs.
    fig, ax = plt.subplots()
    ax.plot(range(epochs), train_losses, label='Training')
    ax.plot(range(epochs), valid_losses, label='Validation')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Learning curve for model {0}'.format(model_name))
    ax.legend()
    plt.savefig('learning_curves/{0}_learning_curve_{1}_{2}_{3}.png'.format(
        model_name, embedding_dim, algorithm, context_size))
    return model
# Top-level driver script: sets up logging, the CartPole environment and
# hyperparameter logging for a batch of RL experiments.
import model_classes
import matplotlib.pyplot as plt
from model_pipeline import run_experiment, moving_average
import torch

LOG_FILE = 'run_experiment'
# NOTE(review): create_logger/write_log/gym/data_hyperparameters are not imported in
# this chunk — presumably imported elsewhere in the file; verify.
logger = create_logger(LOG_FILE)
cart_pole = gym.make('CartPole-v0')
n_actions = cart_pole.action_space.n
state_dim = cart_pole.observation_space.shape[0]
# Accumulators filled in by the experiment loop (not visible in this chunk).
all_experiment_rewards = []
all_experiment_names = []
# Record the full hyperparameter configuration up front for reproducibility.
write_log('Environment: {0}'.format('CartPole-v0'), logger)
write_log('Hyperparameters', logger)
write_log('num_experiments: {0}'.format(data_hyperparameters.NUM_EXPERIMENTS), logger)
write_log('epsilon: {0}'.format(data_hyperparameters.EPSILON), logger)
write_log('discount_factor: {0}'.format(data_hyperparameters.DISCOUNT_FACTOR), logger)
write_log('epsilon_decay_factor: {0}'.format(data_hyperparameters.EPSILON_DECAY_FACTOR), logger)
write_log('num_episodes: {0}'.format(data_hyperparameters.NUM_EPISODES), logger)
write_log('num_episodes_for_decay: {0}'.format(data_hyperparameters.NUM_EPISODES_FOR_DECAY), logger)
# Top-level driver script: trains and evaluates each listed sentiment classifier.
import model_classes
import model_pipeline
import data_hyperparameters
import data_downloader
from log_utils import create_logger, write_log

LOG_FILE = 'run_experiment'
logger = create_logger(LOG_FILE)
train_data, valid_data, test_data = data_downloader.get_dataloaders()
# Note: this will not work directly with LogisticRegressionBOW as this trains with a different dataset.
# Also, naive Bayes should be run separately (with naive_bayes.report_statistics()) since it is
# based on a different paradigm.
# Usage: add models to the list below.
models = [
    model_classes.TransformerEncoder(num_layers=2, max_len=500, name='TransformerEncoder_500')
]
for model in models:
    write_log('Running experiment for {0}'.format(model.name), logger)
    model_pipeline.train(model=model, train_data=train_data, valid_data=valid_data)
    model.plot_losses()
    model_pipeline.report_statistics(model=model, train_data=train_data, valid_data=valid_data,
                                     test_data=test_data)
def _read_reviews(pattern, label, vocab_reverse_index, mapped_tokens, labels):
    """Read every review file matching `pattern`, append its vocab-index tokens
    to `mapped_tokens` and append `label` to `labels` (in-place)."""
    for path in glob.iglob(pattern):
        with open(path, 'r') as f:
            if data_hyperparameters.VERBOSE_LOGGING:
                # Bug fix: log the file path rather than the repr of the handle.
                write_log('Processing {0}'.format(path), logger)
            text = f.read()
            mapped_tokens.append([vocab_reverse_index[token] for token in tokenizer(text)])
            labels.append(label)


def get_data():
    """Return (PAD_TOKEN, fit tokens, fit labels, test tokens, test labels).

    On first use, maps every IMDB train/test review (positive label 1,
    negative label 0) to vocabulary indices and caches the results to disk;
    later calls load the caches. The four near-identical directory scans of
    the original are factored into _read_reviews.
    """
    vocab = get_vocab()
    vocab_reverse_index = vocab.stoi
    PAD_TOKEN = vocab.stoi['<pad>']
    if not os.path.exists(FIT_FILE) or not os.path.exists(FIT_LABELS_FILE) or not os.path.exists(TEST_FILE) \
            or not os.path.exists(TEST_LABELS_FILE):
        write_log('Building fit and test data (this may take a while)', logger)
        now = datetime.now()
        fit_mapped_tokens = []
        fit_labels = []
        _read_reviews('.data/aclImdb/train/pos/*', 1, vocab_reverse_index, fit_mapped_tokens, fit_labels)
        _read_reviews('.data/aclImdb/train/neg/*', 0, vocab_reverse_index, fit_mapped_tokens, fit_labels)
        save_data(fit_mapped_tokens, FIT_FILE)
        save_data(fit_labels, FIT_LABELS_FILE)
        write_log('Processed fit data', logger)
        test_mapped_tokens = []
        test_labels = []
        _read_reviews('.data/aclImdb/test/pos/*', 1, vocab_reverse_index, test_mapped_tokens, test_labels)
        _read_reviews('.data/aclImdb/test/neg/*', 0, vocab_reverse_index, test_mapped_tokens, test_labels)
        save_data(test_mapped_tokens, TEST_FILE)
        save_data(test_labels, TEST_LABELS_FILE)
        write_log('Processed test data', logger)
        write_log(
            'Building fit and test data took {0} seconds'.format(
                (datetime.now() - now).total_seconds()), logger)
    else:
        # Bug fix: close the cache files after unpickling (handles were leaked before).
        write_log('Loading fit data from {0}'.format(FIT_FILE), logger)
        with open(FIT_FILE, "rb") as f:
            fit_mapped_tokens = pickle.load(f)
        write_log('Loading fit labels from {0}'.format(FIT_LABELS_FILE), logger)
        with open(FIT_LABELS_FILE, "rb") as f:
            fit_labels = pickle.load(f)
        write_log('Loading test data from {0}'.format(TEST_FILE), logger)
        with open(TEST_FILE, "rb") as f:
            test_mapped_tokens = pickle.load(f)
        write_log('Loading test labels from {0}'.format(TEST_LABELS_FILE), logger)
        with open(TEST_LABELS_FILE, "rb") as f:
            test_labels = pickle.load(f)
    return PAD_TOKEN, fit_mapped_tokens, fit_labels, test_mapped_tokens, test_labels
def train(model, train_data, valid_data, epochs=data_hyperparameters.EPOCHS,
          patience=data_hyperparameters.PATIENCE,
          teacher_forcing_scale_factor=data_hyperparameters.TEACHER_FORCING_SCALE_FACTOR):
    """Train a seq2seq model with scheduled teacher forcing.

    Runs `epochs` additional epochs on top of model.num_epochs_trained,
    tracking NLL loss (PAD tokens ignored) and BLEU on both splits. The LR
    scheduler steps on validation BLEU (hence mode='max'), and the teacher
    forcing proportion decays by `teacher_forcing_scale_factor` each epoch.
    Leaves the model on CPU when done.
    """
    loss_function = torch.nn.NLLLoss(ignore_index=data_hyperparameters.PAD_TOKEN)
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    # Resume from the last scheduled learning rate if we trained before.
    optimiser = torch.optim.Adam(model.parameters()) if model.latest_scheduled_lr is None else torch.optim.Adam(model.parameters(), lr=model.latest_scheduled_lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser, patience=patience, mode='max')
    now_begin_training = datetime.now()
    start_epoch = model.num_epochs_trained
    for epoch in range(start_epoch, epochs + start_epoch):
        now_begin_epoch = datetime.now()
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        model.teacher_forcing_proportion_history.append(model.teacher_forcing_proportion)
        model.lr_history.append(model.latest_scheduled_lr)
        write_log(
            'Running epoch {0} of {1} with learning rate {2} and teacher forcing rate {3}'.format(
                epoch + 1, epochs + start_epoch, model.latest_scheduled_lr,
                model.teacher_forcing_proportion), logger)
        model.train()
        loss = 0.
        for xb, yb in train_data:
            if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                xb = xb.cuda()
                yb = yb.cuda()
            # Decide per batch whether to feed ground-truth tokens to the decoder.
            teacher_force = random() < model.teacher_forcing_proportion
            # Drop the first decoder position (the start token) before scoring.
            batch_loss = loss_function(
                torch.flatten(model(xb, yb, teacher_force=teacher_force)[1:], start_dim=0, end_dim=1),
                torch.flatten(yb.transpose(0, 1)[1:]))
            loss += batch_loss.item() / len(train_data)
            optimiser.zero_grad()
            batch_loss.backward()
            optimiser.step()
        model.train_losses.append(loss)
        write_log('Training loss: {0}'.format(loss), logger)
        model.eval()
        train_bleu = average_bleu(train_data, model)
        write_log('Training BLEU: {0}'.format(train_bleu), logger)
        model.train_bleus.append(train_bleu)
        with torch.no_grad():
            # Validation always runs without teacher forcing.
            if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                loss = 0.
                for xb, yb in valid_data:
                    xb = xb.cuda()
                    yb = yb.cuda()
                    loss += loss_function(
                        torch.flatten(model(xb, yb, teacher_force=False)[1:], start_dim=0, end_dim=1),
                        torch.flatten(yb.transpose(0, 1)[1:])).item() / len(valid_data)
            else:
                loss = sum([
                    loss_function(
                        torch.flatten(model(xb, yb, teacher_force=False)[1:], start_dim=0, end_dim=1),
                        torch.flatten(yb.transpose(0, 1)[1:])).item() for xb, yb in valid_data
                ]) / len(valid_data)
            model.valid_losses.append(loss)
            write_log('Validation loss: {0}'.format(loss), logger)
            valid_bleu = average_bleu(valid_data, model)
            write_log('Validation BLEU: {0}'.format(valid_bleu), logger)
            model.valid_bleus.append(valid_bleu)
            # Scheduler watches validation BLEU (higher is better).
            scheduler.step(valid_bleu)
        model.num_epochs_trained += 1
        model.teacher_forcing_proportion *= teacher_forcing_scale_factor
        write_log(
            'Epoch took {0} seconds'.format((datetime.now() - now_begin_epoch).total_seconds()),
            logger)
    model.train_time += (datetime.now() - now_begin_training).total_seconds()
    if data_hyperparameters.USE_CUDA:
        model.cpu()
def train(model, train_data, valid_data, epochs=data_hyperparameters.EPOCHS,
          patience=data_hyperparameters.PATIENCE,
          report_accuracy_every=data_hyperparameters.REPORT_ACCURACY_EVERY):
    """Train a classifier with NLL loss and periodic accuracy/confidence reporting.

    Runs `epochs` additional epochs on top of model.num_epochs_trained. The LR
    scheduler steps on validation loss. Every `report_accuracy_every` epochs
    (if not None), accuracy and mean prediction confidence on correct and
    incorrect examples are recorded on both splits. Leaves the model on CPU.
    """
    loss_function = torch.nn.NLLLoss()
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    # Resume from the last scheduled learning rate if we trained before.
    optimiser = torch.optim.Adam(model.parameters()) if model.latest_scheduled_lr is None else torch.optim.Adam(model.parameters(), lr=model.latest_scheduled_lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser, patience=patience)
    now_begin_training = datetime.now()
    start_epoch = model.num_epochs_trained
    for epoch in range(start_epoch, epochs + start_epoch):
        now_begin_epoch = datetime.now()
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        model.lr_history.append(model.latest_scheduled_lr)
        write_log(
            'Running epoch {0} of {1} with learning rate {2}'.format(epoch + 1, epochs + start_epoch,
                                                                     model.latest_scheduled_lr), logger)
        model.train()
        loss = 0.
        for xb, yb in train_data:
            if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                xb = xb.cuda()
                yb = yb.cuda()
            batch_loss = loss_function(model(xb), yb)
            loss += batch_loss.item() / len(train_data)
            optimiser.zero_grad()
            batch_loss.backward()
            optimiser.step()
        model.train_losses.append(loss)
        write_log('Training loss: {0}'.format(loss), logger)
        model.eval()
        if report_accuracy_every is not None:
            if (epoch + 1) % report_accuracy_every == 0:
                accuracy, mean_correct_prediction_probs, mean_incorrect_prediction_probs = get_accuracy(
                    train_data, model, also_report_model_confidences=True)
                write_log('Training accuracy: {0}'.format(accuracy), logger)
                model.train_accuracies[epoch + 1] = accuracy
                write_log(
                    'Model confidence: {0} (correct predictions), {1} (incorrect predictions)'.format(
                        mean_correct_prediction_probs, mean_incorrect_prediction_probs), logger)
                model.train_correct_confidences[epoch + 1] = mean_correct_prediction_probs
                model.train_incorrect_confidences[epoch + 1] = mean_incorrect_prediction_probs
        with torch.no_grad():
            if data_hyperparameters.USE_CUDA and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE:
                loss = 0.
                for xb, yb in valid_data:
                    xb = xb.cuda()
                    yb = yb.cuda()
                    loss += loss_function(model(xb), yb).item() / len(valid_data)
            else:
                loss = sum([loss_function(model(xb), yb).item() for xb, yb in valid_data]) / len(valid_data)
            model.valid_losses.append(loss)
            # Scheduler watches validation loss (default mode='min').
            scheduler.step(loss)
            write_log('Validation loss: {0}'.format(loss), logger)
            if report_accuracy_every is not None:
                if (epoch + 1) % report_accuracy_every == 0:
                    accuracy, mean_correct_prediction_probs, mean_incorrect_prediction_probs = get_accuracy(
                        valid_data, model, also_report_model_confidences=True)
                    write_log('Validation accuracy: {0}'.format(accuracy), logger)
                    model.valid_accuracies[epoch + 1] = accuracy
                    write_log(
                        'Model confidence: {0} (correct predictions), {1} (incorrect predictions)'.format(
                            mean_correct_prediction_probs, mean_incorrect_prediction_probs), logger)
                    model.valid_correct_confidences[epoch + 1] = mean_correct_prediction_probs
                    model.valid_incorrect_confidences[epoch + 1] = mean_incorrect_prediction_probs
        model.num_epochs_trained += 1
        write_log(
            'Epoch took {0} seconds'.format((datetime.now() - now_begin_epoch).total_seconds()),
            logger)
    model.train_time += (datetime.now() - now_begin_training).total_seconds()
    if data_hyperparameters.USE_CUDA:
        model.cpu()