Beispiel #1
0
def build_data_loader(
        raw_data,
        frequencies,
        algorithm,
        context_size=data_hyperparameters.CONTEXT_SIZE,
        threshold=data_hyperparameters.SUBSAMPLE_THRESHOLD,
        min_review_length=data_hyperparameters.MIN_REVIEW_LENGTH,
        sub_sample=False,
        batch_size=data_hyperparameters.WORD_EMBEDDING_BATCH_SIZE,
        shuffle=False):
    """Turn raw reviews into a DataLoader of (x, y) training pairs.

    Every review is expanded into data points by ``pre_process_words``.
    When ``sub_sample`` is true, a data point is dropped whenever
    ``subsample_word`` returns a truthy value for the frequency of its
    target ``y`` (looked up in ``frequencies``).

    Args:
        raw_data: Iterable of reviews handed to ``pre_process_words``.
        frequencies: Indexable by a data point's ``y`` value.
        algorithm: Forwarded unchanged to ``pre_process_words``.
        context_size: Forwarded unchanged to ``pre_process_words``.
        threshold: Forwarded unchanged to ``subsample_word``.
        min_review_length: Forwarded unchanged to ``pre_process_words``.
        sub_sample: Enable the sub-sampling filter described above.
        batch_size: Batch size of the returned loader.
        shuffle: Whether the returned loader shuffles its data.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``TensorDataset`` of the
        retained inputs and targets, created on the module-level ``device``.
    """
    inputs, targets = [], []
    for review in raw_data:
        pairs = pre_process_words(review, algorithm, context_size,
                                  min_review_length)
        for x, y in pairs:
            # Short-circuit keeps subsample_word from being called when
            # sub-sampling is disabled.
            if sub_sample and subsample_word(frequencies[y], threshold):
                continue
            inputs.append(x)
            targets.append(y)
    write_log('Size of data: {0}'.format(len(inputs)), logger)
    input_tensor = torch.tensor(inputs, device=device)
    target_tensor = torch.tensor(targets, device=device)
    dataset = torch.utils.data.TensorDataset(input_tensor, target_tensor)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=shuffle)
Beispiel #2
0
def run_experiment(env,
                   agent,
                   name,
                   log_object,
                   episodes,
                   episodes_for_logging,
                   lr=1e-4):
    """Run ``episodes`` training episodes and collect their results.

    Separate Adam optimisers are created for ``agent.actor`` and
    ``agent.critic``. Each episode is delegated to
    ``play_and_train_network`` and its return value is recorded. Every
    ``episodes_for_logging`` episodes (never on episode 0) the elapsed time
    since the previous report and the mean of the most recent
    ``episodes_for_logging`` results are logged.

    Args:
        env: Forwarded to ``play_and_train_network``.
        agent: Object exposing ``actor`` and ``critic`` modules.
        name: Label used in the progress log line.
        log_object: Logger handed to ``write_log``.
        episodes: Number of episodes to run.
        episodes_for_logging: Reporting interval, in episodes.
        lr: Learning rate shared by both optimisers.

    Returns:
        List with one entry per episode, in order.
    """
    actor_optimiser = torch.optim.Adam(agent.actor.parameters(), lr=lr)
    critic_optimiser = torch.optim.Adam(agent.critic.parameters(), lr=lr)
    rewards = []
    interval_start = datetime.now()
    for episode in range(episodes):
        episode_result = play_and_train_network(env, agent, actor_optimiser,
                                                critic_optimiser)
        rewards.append(episode_result)
        if episode % episodes_for_logging != 0 or episode == 0:
            continue
        elapsed = (datetime.now() - interval_start).total_seconds()
        write_log(
            'Computed {0} out of {1} episodes for {2} in {3} seconds'.format(
                episode, episodes, name, elapsed), log_object)
        # Restart the interval clock before the second log line, as the
        # timing window covers only the episodes themselves.
        interval_start = datetime.now()
        recent_mean = np.mean(rewards[-episodes_for_logging:])
        write_log('Mean reward: {0}'.format(recent_mean), log_object)
    return rewards
Beispiel #3
0
def loadModelState_w2v(
    model_name,
    algorithm_type,
    unigram_distribution_power=data_hyperparameters.UNIGRAM_DISTRIBUTION_POWER
):
    """Rebuild a saved word-embedding model and its noise distribution.

    Reads the pickled frequencies from ``FREQS_FILE``, derives the noise
    distribution from them, reads the pickled constructor metadata for the
    model, instantiates the matching model class, loads its weights and
    puts it in evaluation mode.

    Args:
        model_name: Name the model was saved under; used to build the file
            paths and passed to the model constructor.
        algorithm_type: ``'CBOW'`` (case-insensitive) selects
            ``ContinuousBagOfWords``; anything else selects
            ``SkipGramWithNegativeSampling``.
        unigram_distribution_power: Forwarded to ``noise_distribution``.

    Returns:
        Tuple ``(frequencies, distribution, model)``.
    """
    # Context managers guarantee the handles are closed even when
    # unpickling raises.
    with open(FREQS_FILE, "rb") as freqs_file:
        frequencies = pickle.load(freqs_file)
    distribution = noise_distribution(frequencies, unigram_distribution_power)
    metadata_path = model_name + '_' + algorithm_type + '_model_data'
    with open(metadata_path, 'rb') as infile:
        model_data = pickle.load(infile)
    if algorithm_type.upper() == 'CBOW':
        model = ContinuousBagOfWords(data_hyperparameters.VOCAB_SIZE,
                                     model_data['embeddingDim'],
                                     model_data['contextSize'], model_name)
    else:
        model = SkipGramWithNegativeSampling(data_hyperparameters.VOCAB_SIZE,
                                             model_data['embeddingDim'],
                                             model_data['contextSize'],
                                             model_data['numNegativeSamples'],
                                             model_data['innerProductClamp'],
                                             model_name)
    # NOTE(review): unlike the metadata path above, this weights path has no
    # '_' between the model name and the algorithm type - confirm that it
    # matches the file name the saving code actually writes.
    model.load_state_dict(torch.load(model_name + algorithm_type + '.pt'))
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    write_log('Loaded model {0}'.format(model_name), logger)
    model.eval()
    return frequencies, distribution, model
def split_data(fit_mapped_tokens, fit_labels):
    """Split the fit data into training and validation parts.

    The held-out fraction is ``data_hyperparameters.TRAIN_VALID_SPLIT``.
    The duration of the split is logged.

    Args:
        fit_mapped_tokens: Samples to split.
        fit_labels: Labels aligned with ``fit_mapped_tokens``.

    Returns:
        Tuple ``(train_labels, train_tokens, valid_labels, valid_tokens)``.
        Note that labels come before tokens in each pair.
    """
    started = datetime.now()
    write_log('Splitting fit data into training and validation sets', logger)
    tokens_train, tokens_valid, labels_train, labels_valid = train_test_split(
        fit_mapped_tokens,
        fit_labels,
        test_size=data_hyperparameters.TRAIN_VALID_SPLIT)
    elapsed = (datetime.now() - started).total_seconds()
    write_log('Splitting took {0} seconds'.format(elapsed), logger)
    return labels_train, tokens_train, labels_valid, tokens_valid
Beispiel #5
0
def load_model_state(model, model_name):
    """Restore a model's weights and bookkeeping attributes from disk.

    Weights are read from ``saved_models/<model_name>.pt``. The training
    metadata is deserialised with ``load`` from
    ``saved_models/<model_name>_model_data.pkl`` and copied onto ``model``
    attribute by attribute.

    Args:
        model: Module to update in place.
        model_name: Base name of the saved files.

    Raises:
        KeyError: If the metadata lacks one of the expected entries.
    """
    model.load_state_dict(torch.load('saved_models/{0}.pt'.format(model_name)))
    write_log('Loaded model {0} weights'.format(model_name), logger)
    # The context manager closes the file even if deserialisation fails.
    with open('saved_models/{0}_model_data.pkl'.format(model_name),
              'rb') as infile:
        model_data = load(infile)
    restored_attributes = (
        'train_losses',
        'valid_losses',
        'num_epochs_trained',
        'latest_scheduled_lr',
        'train_time',
        'num_trainable_params',
        'instantiated',
        'name',
        'vocab_size',
        'tokenizer',
        'batch_size',
        'train_accuracies',
        'valid_accuracies',
    )
    for attribute in restored_attributes:
        setattr(model, attribute, model_data[attribute])
    write_log('Loaded model {0} state'.format(model_name), logger)
def get_vocab():
    """Return the vocabulary, building and caching it on first use.

    If ``VOCAB_FILE`` exists the vocabulary is unpickled from it.
    Otherwise the IMDB dataset is fetched through torchtext, a vocabulary
    limited to ``VOCAB_SIZE`` entries is built from the fit split's token
    counts, and the result is stored via ``save_data``.

    Returns:
        The ``torchtext.vocab.Vocab`` built or loaded.
    """
    if os.path.exists(VOCAB_FILE):
        write_log('Loading vocab from {0}'.format(VOCAB_FILE), logger)
        # Close the cache file deterministically instead of relying on
        # garbage collection.
        with open(VOCAB_FILE, "rb") as vocab_file:
            return pickle.load(vocab_file)

    write_log('Downloading raw data', logger)
    now = datetime.now()
    dataset_fit_raw, dataset_test_raw = torchtext.experimental.datasets.IMDB(
        tokenizer=tokenizer)
    write_log(
        'Download took {0} seconds'.format(
            (datetime.now() - now).total_seconds()), logger)
    # Sanity checks on the expected dataset sizes.
    assert len(dataset_fit_raw) == 25000
    assert len(dataset_test_raw) == 25000
    write_log('Building vocab', logger)
    vocab = dataset_fit_raw.get_vocab()
    new_vocab = torchtext.vocab.Vocab(counter=vocab.freqs,
                                      max_size=VOCAB_SIZE)
    save_data(new_vocab, VOCAB_FILE)
    write_log('Vocab saved to {0}'.format(VOCAB_FILE), logger)
    return new_vocab
Beispiel #7
0
def save_model_state_w2v(model):
    """Persist a word-embedding model's weights and constructor metadata.

    Weights go to ``<name>_<algorithm_type>.pt`` and the pickled metadata
    dictionary to ``<name>_<algorithm_type>_model_data``.

    Args:
        model: Model exposing ``name``, ``algorithm_type``,
            ``embedding_dim`` and ``context_size``; models whose
            ``algorithm_type`` is not ``'CBOW'`` must also expose
            ``num_negative_samples`` and ``inner_product_clamp``.
    """
    # The original mixed `model.algorithmType` and `model.algorithm_type`;
    # the snake_case spelling matches every other attribute read here.
    file_prefix = model.name + '_' + model.algorithm_type
    torch.save(model.state_dict(), file_prefix + '.pt')
    model_data = {
        'embedding_dim': model.embedding_dim,
        'context_size': model.context_size
    }
    if model.algorithm_type != 'CBOW':
        model_data['num_negative_samples'] = model.num_negative_samples
        model_data['inner_product_clamp'] = model.inner_product_clamp
    # The context manager closes the file even if pickling raises.
    with open(file_prefix + '_model_data', 'wb') as outfile:
        pickle.dump(model_data, outfile)
    write_log('Saved model ' + model.name, logger)
Beispiel #8
0
def setup(algorithm,
          batch_size=data_hyperparameters.WORD_EMBEDDING_BATCH_SIZE,
          context_size=data_hyperparameters.CONTEXT_SIZE,
          threshold=data_hyperparameters.SUBSAMPLE_THRESHOLD,
          unigram_distribution_power=data_hyperparameters.
          UNIGRAM_DISTRIBUTION_POWER,
          min_review_length=data_hyperparameters.MIN_REVIEW_LENGTH):
    """Prepare frequencies, noise distribution and both data loaders.

    The data returned by ``get_data`` is split with ``split_data``; each
    part is converted with ``build_data_loader`` with sub-sampling
    enabled. The training loader shuffles and uses ``batch_size``; the
    validation loader does not shuffle and uses twice that batch size.
    The total set-up time is logged.

    Args:
        algorithm: Forwarded to ``build_data_loader``.
        batch_size: Training batch size.
        context_size: Forwarded to ``build_data_loader``.
        threshold: Forwarded to ``build_data_loader``.
        unigram_distribution_power: Forwarded to ``noise_distribution``.
        min_review_length: Forwarded to ``build_data_loader``.

    Returns:
        Tuple ``(frequencies, distribution, train_loader, valid_loader)``.
    """
    started = datetime.now()
    frequencies, data = get_data()
    distribution = noise_distribution(frequencies, unigram_distribution_power)
    train_data, valid_data = split_data(data)

    def make_loader(reviews, loader_batch_size, shuffle):
        # Both loaders share every setting except batch size and shuffling.
        return build_data_loader(reviews,
                                 frequencies,
                                 algorithm,
                                 context_size,
                                 threshold,
                                 min_review_length,
                                 sub_sample=True,
                                 batch_size=loader_batch_size,
                                 shuffle=shuffle)

    write_log('Train data', logger)
    train_loader = make_loader(train_data, batch_size, True)
    write_log('Validation data', logger)
    valid_loader = make_loader(valid_data, 2 * batch_size, False)
    elapsed = (datetime.now() - started).total_seconds()
    write_log('Setting up took: {0} seconds'.format(elapsed), logger)
    return frequencies, distribution, train_loader, valid_loader
Beispiel #9
0
def load_model_state(model, model_name):
    """Restore a model's weights and training history from disk.

    Weights are read from ``saved_models/<model_name>.pt``. The training
    metadata is deserialised with ``load`` from
    ``saved_models/<model_name>_model_data.pkl`` and copied onto ``model``
    attribute by attribute.

    Args:
        model: Module to update in place.
        model_name: Base name of the saved files.

    Raises:
        KeyError: If the metadata lacks one of the expected entries.
    """
    model.load_state_dict(torch.load('saved_models/{0}.pt'.format(model_name)))
    write_log('Loaded model {0} weights'.format(model_name), logger)
    # The context manager closes the file even if deserialisation fails.
    with open('saved_models/{0}_model_data.pkl'.format(model_name),
              'rb') as infile:
        model_data = load(infile)
    restored_attributes = (
        'train_losses',
        'valid_losses',
        'train_bleus',
        'valid_bleus',
        'num_epochs_trained',
        'latest_scheduled_lr',
        'lr_history',
        'train_time',
        'num_trainable_params',
        'instantiated',
        'name',
        'batch_size',
        'teacher_forcing_proportion',
        'teacher_forcing_proportion_history',
    )
    for attribute in restored_attributes:
        setattr(model, attribute, model_data[attribute])
    write_log('Loaded model {0} state'.format(model_name), logger)
Beispiel #10
0
def get_model_performance_data():
    """Fit a naive Bayes model on the fit data and summarise its results.

    For each vocabulary index the function counts in how many fit samples
    the word occurs (each word counted once per sample) and in how many of
    those the label equals 1, converts the counts to smoothed
    probabilities, and scores both the fit and the test data with
    ``NB_predict`` / ``compute_accuracy``.

    Returns:
        Dictionary of performance statistics; entries that do not apply to
        this model are filled with ``nan``.
    """
    _, dataset_fit, dataset_test = data_downloader.get_data()
    started = datetime.now()
    write_log('Computing feature probabilities for naive Bayes', logger)
    # Column 0: samples with label 1 containing the word.
    # Column 1: all samples containing the word.
    counts = np.zeros((len(dataset_fit.get_vocab()), 2))
    for sample in dataset_fit:
        is_positive = sample[0].item() == 1
        for word in set(sample[1].tolist()):
            counts[word, 1] += 1
            if is_positive:
                counts[word, 0] += 1
    positive_counts = counts[:, 0]
    total_counts = counts[:, 1]
    # Laplacian smoothing
    probs = (1 + positive_counts) / (2 + total_counts)
    train_time = (datetime.now() - started).total_seconds()
    write_log('Computation took {0} seconds'.format(train_time), logger)

    fit_predictions = [NB_predict(sample, probs) for sample in dataset_fit]
    test_predictions = [NB_predict(sample, probs) for sample in dataset_test]
    fit_labels = [sample[0].item() for sample in dataset_fit]
    fit_accuracy = compute_accuracy(fit_predictions, fit_labels)
    test_labels = [sample[0].item() for sample in dataset_test]
    test_accuracy = compute_accuracy(test_predictions, test_labels)

    performance = {
        'name': 'NB',
        'train_accuracy': fit_accuracy,
        'valid_accuracy': nan,
        'test_accuracy': test_accuracy,
        'total_train_time': train_time,
        'num_epochs': 1,
        'trainable_params': len(probs),
        'final_train_loss': nan,
        'final_valid_loss': nan,
        'model_created': started,
        'average_time_per_epoch': train_time,
        'vocab_size': data_hyperparameters.VOCAB_SIZE,
        'tokenizer': data_hyperparameters.TOKENIZER,
        'batch_size': nan
    }
    return performance
Beispiel #11
0
def get_data():
    """Return vocabulary frequencies and index-mapped unsupervised reviews.

    If ``TOKENIZER_FILE`` exists, both results are unpickled from
    ``TOKENIZER_FILE`` and ``FREQS_FILE``. Otherwise the relative
    frequency of every vocabulary entry is computed and saved, every file
    under ``.data/aclImdb/train/unsup/`` is tokenised and mapped to
    vocabulary indices, and the mapped reviews are saved.

    Returns:
        Tuple ``(freqs, all_mapped_tokens)``: a list of relative
        frequencies ordered by vocabulary index, and a list with one list
        of token indices per review.
    """
    if os.path.exists(TOKENIZER_FILE):
        write_log('Loading tokens from {0}'.format(TOKENIZER_FILE), logger)
        # Context managers close the cache files deterministically.
        with open(TOKENIZER_FILE, "rb") as tokens_file:
            all_mapped_tokens = pickle.load(tokens_file)
        write_log('Loading frequencies from {0}'.format(FREQS_FILE), logger)
        with open(FREQS_FILE, "rb") as freqs_file:
            freqs = pickle.load(freqs_file)
        return freqs, all_mapped_tokens

    # Strictly speaking this is coming from the training rather than
    # unsupervised data
    vocab = get_vocab()
    vocab_index = vocab.itos
    vocab_reverse_index = vocab.stoi
    counts = [vocab.freqs[vocab_index[i]] for i in range(len(vocab))]
    total = sum(counts)
    freqs = [count / total for count in counts]
    save_data(freqs, FREQS_FILE)

    all_mapped_tokens = []
    write_log(
        'Iterating through unsupervised data (this could take a while)',
        logger)
    now = datetime.now()
    for data in glob.iglob('.data/aclImdb/train/unsup/*'):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale default.
        with open(data, 'r', encoding='utf-8') as f:
            if data_hyperparameters.VERBOSE_LOGGING:
                write_log('Processing {0}'.format(f), logger)
            text = f.read()
        all_mapped_tokens.append(
            [vocab_reverse_index[token] for token in tokenizer(text)])
    write_log(
        'Iteration took {0} seconds'.format(
            (datetime.now() - now).total_seconds()), logger)
    # Sanity check on the expected number of unsupervised reviews.
    assert len(all_mapped_tokens) == 50000
    save_data(all_mapped_tokens, TOKENIZER_FILE)
    return freqs, all_mapped_tokens
Beispiel #12
0
def prepare_data(batch_size=data_hyperparameters.BATCH_SIZE):
    """Build (or load from cache) the translation languages and loaders.

    When any of the cache files is missing, sentence pairs are read from
    ``DATA_FILE`` (one tab-separated English/French pair per line),
    tokenised, filtered by ``data_hyperparameters.MAX_LENGTH``, split into
    fit/validation/test parts, indexed and cached. Otherwise the
    languages and indexed sentences are loaded from the cache.

    Args:
        batch_size: Batch size passed to ``get_dataloader``.

    Returns:
        Tuple ``(french, english, train_data_loader, valid_data_loader,
        test_data_loader)``. Each loader is created by ``get_dataloader``
        with the French indices as first and the English indices as second
        argument.
    """
    english = Language('en')
    french = Language('fr')
    cache_files = (
        EN_WORD_TO_INDEX_FILE,
        EN_WORD_TO_COUNT_FILE,
        EN_INDEX_TO_WORD_FILE,
        FR_WORD_TO_INDEX_FILE,
        FR_WORD_TO_COUNT_FILE,
        FR_INDEX_TO_WORD_FILE,
        EN_FIT_INDEX_FILE,
        EN_VALID_INDEX_FILE,
        EN_TEST_INDEX_FILE,
        FR_FIT_INDEX_FILE,
        FR_VALID_INDEX_FILE,
        FR_TEST_INDEX_FILE,
    )

    def _load_cached(path):
        # Close the cache file deterministically after unpickling.
        with open(path, 'rb') as cache_file:
            return pickle.load(cache_file)

    if not all(os.path.exists(path) for path in cache_files):
        en_sentences = []
        fr_sentences = []
        write_log('Reading sentence pairs from file', logger)
        with open(DATA_FILE, mode='r', encoding='utf-8') as f:
            for line in f:
                en_sentence, fr_sentence = line.strip().split('\t')
                en_sentence = [
                    t.text
                    for t in english.tokenizer(normalize_string(en_sentence))
                ]
                fr_sentence = [
                    t.text
                    for t in french.tokenizer(normalize_string(fr_sentence))
                ]
                # TODO: remove this filtration
                if (len(en_sentence) > data_hyperparameters.MAX_LENGTH
                        or len(fr_sentence) > data_hyperparameters.MAX_LENGTH):
                    continue
                en_sentences.append(en_sentence)
                fr_sentences.append(fr_sentence)
        write_log('Splitting data', logger)
        (en_sentences_train, en_sentences_test, fr_sentences_train,
         fr_sentences_test) = train_test_split(
             en_sentences,
             fr_sentences,
             test_size=data_hyperparameters.TRAIN_TEST_SPLIT)
        (en_sentences_fit, en_sentences_valid, fr_sentences_fit,
         fr_sentences_valid) = train_test_split(
             en_sentences_train,
             fr_sentences_train,
             test_size=data_hyperparameters.TRAIN_VALID_SPLIT)
        write_log('Building languages', logger)
        # Languages are built from the fit split only.
        english.read(en_sentences_fit)
        english.cache()
        french.read(fr_sentences_fit)
        french.cache()
        write_log('Indexing sentences', logger)
        en_sentences_fit_index = english.index_sentences(en_sentences_fit)
        save_data(en_sentences_fit_index, EN_FIT_INDEX_FILE)
        en_sentences_valid_index = english.index_sentences(en_sentences_valid)
        save_data(en_sentences_valid_index, EN_VALID_INDEX_FILE)
        en_sentences_test_index = english.index_sentences(en_sentences_test)
        save_data(en_sentences_test_index, EN_TEST_INDEX_FILE)
        fr_sentences_fit_index = french.index_sentences(fr_sentences_fit)
        save_data(fr_sentences_fit_index, FR_FIT_INDEX_FILE)
        fr_sentences_valid_index = french.index_sentences(fr_sentences_valid)
        save_data(fr_sentences_valid_index, FR_VALID_INDEX_FILE)
        fr_sentences_test_index = french.index_sentences(fr_sentences_test)
        save_data(fr_sentences_test_index, FR_TEST_INDEX_FILE)
    else:
        write_log('Loading languages from disk cache', logger)
        english.load()
        french.load()
        write_log('Loading indexed sentences from disk cache', logger)
        en_sentences_fit_index = _load_cached(EN_FIT_INDEX_FILE)
        en_sentences_valid_index = _load_cached(EN_VALID_INDEX_FILE)
        en_sentences_test_index = _load_cached(EN_TEST_INDEX_FILE)
        fr_sentences_fit_index = _load_cached(FR_FIT_INDEX_FILE)
        fr_sentences_valid_index = _load_cached(FR_VALID_INDEX_FILE)
        fr_sentences_test_index = _load_cached(FR_TEST_INDEX_FILE)
    train_data_loader = get_dataloader(fr_sentences_fit_index,
                                       en_sentences_fit_index,
                                       batch_size=batch_size)
    write_log('{0} batches in training data'.format(len(train_data_loader)),
              logger)
    valid_data_loader = get_dataloader(fr_sentences_valid_index,
                                       en_sentences_valid_index,
                                       batch_size=batch_size)
    write_log(
        '{0} batches in validation data'.format(len(valid_data_loader)),
        logger)
    test_data_loader = get_dataloader(fr_sentences_test_index,
                                      en_sentences_test_index,
                                      batch_size=batch_size)
    write_log('{0} batches in test data'.format(len(test_data_loader)), logger)
    return (french, english, train_data_loader, valid_data_loader,
            test_data_loader)
Beispiel #13
0
def train_agent(epsilon,
                discount_factor,
                env,
                agent_name,
                log_object,
                dqn_type='duelling',
                replay_type='prioritised'):
    """Train a DQN agent for ``data_hyperparameters.NUM_STEPS`` steps.

    Args:
        epsilon: Passed to the agent constructor.
        discount_factor: Passed to the agent constructor.
        env: Environment exposing ``action_space.n``,
            ``observation_space.shape``, ``reset`` and ``step``.
        agent_name: Label used in the log messages.
        log_object: Logger handed to ``write_log``.
        dqn_type: ``'duelling'`` builds a ``DuellingDQN``; any other value
            builds a ``DoubleDQN``.
        replay_type: ``'prioritised'`` builds a ``PrioritisedReplay`` and
            enables priority updates; any other value builds a ``Replay``.

    Returns:
        List with the accumulated reward of every finished episode.
    """
    write_log('Running agent {0}'.format(agent_name), log_object)
    started = datetime.now()
    n_actions = env.action_space.n
    state_dim = env.observation_space.shape[0]

    if dqn_type == 'duelling':
        agent = model_classes.DuellingDQN(epsilon, discount_factor, state_dim,
                                          n_actions)
        online_networks = (agent.online_feature_network,
                           agent.online_value_network,
                           agent.online_advantage_network)
        # One parameter group per online sub-network.
        optimiser = torch.optim.Adam([{
            'params': network.parameters()
        } for network in online_networks])
    else:
        agent = model_classes.DoubleDQN(epsilon, discount_factor, state_dim,
                                        n_actions)
        optimiser = torch.optim.Adam(agent.online_network.parameters())

    prioritised = replay_type == 'prioritised'
    replay_class = (model_classes.PrioritisedReplay
                    if prioritised else model_classes.Replay)
    replay = replay_class(data_hyperparameters.REPLAY_CAPACITY)

    state = env.reset()
    episode_reward = 0.
    finished_episode_rewards = []
    for step in range(data_hyperparameters.NUM_STEPS):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        replay.add(state, action, reward, next_state, done)

        # Learn only once the buffer can supply a full batch.
        if len(replay) >= data_hyperparameters.REPLAY_BATCH_SIZE:
            (states, actions, rewards, next_states, dones, indices,
             weights) = replay.sample(data_hyperparameters.REPLAY_BATCH_SIZE)
            per_sample_loss = agent.calculate_temporal_difference_loss(
                states, actions, rewards, next_states, dones, weights)
            if prioritised:
                # Priorities come from the per-sample loss, before it is
                # reduced to a scalar for the gradient step.
                with torch.no_grad():
                    priorities = (
                        per_sample_loss +
                        data_hyperparameters.PRIORITY_ADJUSTMENT_EPSILON)
                    if agent.use_cuda:
                        priorities = priorities.cpu()
                    priorities = priorities.numpy()
                replay.update_priorities(indices, priorities)
            optimiser.zero_grad()
            mean_loss = torch.mean(per_sample_loss)
            mean_loss.backward()
            optimiser.step()

        state = next_state
        agent.adjust_epsilon()
        replay.adjust_beta()
        if done:
            finished_episode_rewards.append(episode_reward)
            episode_reward = 0.
            state = env.reset()
        if step % data_hyperparameters.NUM_STEPS_FOR_SYNCHRONISING == 0:
            agent.synchronise()

    elapsed = (datetime.now() - started).total_seconds()
    write_log('Agent {0} took {1} seconds total'.format(agent_name, elapsed),
              log_object)
    return finished_episode_rewards
Beispiel #14
0
def train_w2v(model_name,
              train_loader,
              valid_loader,
              vocab_size=data_hyperparameters.VOCAB_SIZE,
              distribution=None,
              epochs=data_hyperparameters.WORD_EMBEDDING_EPOCHS,
              lr=1e-2,
              embedding_dim=data_hyperparameters.WORD_EMBEDDING_DIMENSION,
              context_size=data_hyperparameters.CONTEXT_SIZE,
              inner_product_clamp=data_hyperparameters.INNER_PRODUCT_CLAMP,
              num_negative_samples=data_hyperparameters.NUM_NEGATIVE_SAMPLES,
              algorithm='SGNS'):
    """Train a word-embedding model and save its learning curve.

    Args:
        model_name: Name given to the model and used in the plot title and
            file name.
        train_loader: Iterable of ``(xb, yb)`` training batches.
        valid_loader: Iterable of ``(xb, yb)`` validation batches.
        vocab_size: Passed to the model constructor.
        distribution: Noise distribution used for negative sampling.
            Required unless ``algorithm`` is ``'CBOW'``.
        epochs: Number of epochs to train.
        lr: Initial Adam learning rate.
        embedding_dim: Passed to the model constructor.
        context_size: Passed to the model constructor.
        inner_product_clamp: Passed to the skip-gram constructor.
        num_negative_samples: Negative samples drawn per batch element.
        algorithm: ``'CBOW'`` (case-insensitive) trains
            ``ContinuousBagOfWords`` with an NLL loss; any other value
            trains ``SkipGramWithNegativeSampling``.

    Returns:
        The trained model.

    Raises:
        ValueError: If negative sampling is selected but ``distribution``
            is ``None``.
    """
    train_losses = []
    valid_losses = []
    # Decide the algorithm once so training and validation always take the
    # same branch.
    use_cbow = algorithm.upper() == 'CBOW'
    if use_cbow:
        model = ContinuousBagOfWords(vocab_size, embedding_dim, context_size,
                                     model_name)
        loss_function = torch.nn.NLLLoss()
        distribution_tensor = None
    else:
        if distribution is None:
            raise ValueError(
                'distribution is required for negative sampling')
        model = SkipGramWithNegativeSampling(vocab_size, embedding_dim,
                                             context_size,
                                             num_negative_samples,
                                             inner_product_clamp, model_name)
        loss_function = None
        distribution_tensor = torch.tensor(distribution,
                                           dtype=torch.float,
                                           device=device)
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           patience=1,
                                                           verbose=True)
    # Batches need moving to the GPU only when they are not stored there.
    move_batches_to_gpu = (
        data_hyperparameters.USE_CUDA
        and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE)
    write_log(
        'Training on {0} batches and validating on {1} batches'.format(
            len(train_loader), len(valid_loader)), logger)
    for epoch in range(epochs):
        now = datetime.now()
        write_log('Epoch: {0}'.format(epoch), logger)
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            if move_batches_to_gpu:
                xb = xb.cuda()
                yb = yb.cuda()
            if use_cbow:
                predictions = model(xb)
                loss = loss_function(predictions, yb)
            else:
                negative_samples = produce_negative_samples(
                    distribution_tensor, num_negative_samples, len(yb))
                loss = torch.mean(model(yb, xb, negative_samples))
            # Clear stale gradients BEFORE the backward pass. Zeroing them
            # between backward() and step() discards the fresh gradients
            # and the parameters never change.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        train_loss = total_loss / len(train_loader)
        write_log('Training loss: {0}'.format(train_loss), logger)
        train_losses.append(train_loss)

        model.eval()
        with torch.no_grad():
            valid_loss = 0
            for xb, yb in valid_loader:
                if move_batches_to_gpu:
                    xb = xb.cuda()
                    yb = yb.cuda()
                if use_cbow:
                    valid_loss += loss_function(model(xb), yb).item()
                else:
                    negative_samples = produce_negative_samples(
                        distribution_tensor, num_negative_samples, len(yb))
                    loss = model(yb, xb, negative_samples)
                    valid_loss += torch.mean(loss).item()
        valid_loss = valid_loss / len(valid_loader)
        valid_losses.append(valid_loss)
        write_log('Validation loss: {0}'.format(valid_loss), logger)

        seconds = (datetime.now() - now).total_seconds()
        write_log('Epoch took: {0} seconds'.format(seconds), logger)
        # The scheduler reduces the learning rate when validation loss
        # stops improving.
        scheduler.step(valid_loss)

    fig, ax = plt.subplots()
    ax.plot(range(epochs), train_losses, label='Training')
    ax.plot(range(epochs), valid_losses, label='Validation')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Learning curve for model {0}'.format(model_name))
    ax.legend()
    plt.savefig('learning_curves/{0}_learning_curve_{1}_{2}_{3}.png'.format(
        model_name, embedding_dim, algorithm, context_size))

    return model
import model_classes
import matplotlib.pyplot as plt
from model_pipeline import run_experiment, moving_average
import torch

# Logger shared by the whole experiment script.
LOG_FILE = 'run_experiment'
logger = create_logger(LOG_FILE)

# Environment under test and its dimensions.
cart_pole = gym.make('CartPole-v0')
n_actions = cart_pole.action_space.n
state_dim = cart_pole.observation_space.shape[0]

# Accumulators filled by the experiments run later in the script.
all_experiment_rewards = []
all_experiment_names = []

# Record the run configuration at the top of the log.
write_log('Environment: {0}'.format('CartPole-v0'), logger)
write_log('Hyperparameters', logger)
for _label, _attribute in (
        ('num_experiments', 'NUM_EXPERIMENTS'),
        ('epsilon', 'EPSILON'),
        ('discount_factor', 'DISCOUNT_FACTOR'),
        ('epsilon_decay_factor', 'EPSILON_DECAY_FACTOR'),
        ('num_episodes', 'NUM_EPISODES'),
        ('num_episodes_for_decay', 'NUM_EPISODES_FOR_DECAY'),
):
    # getattr inside the loop reads each value just before it is logged.
    write_log(
        '{0}: {1}'.format(_label, getattr(data_hyperparameters, _attribute)),
        logger)
Beispiel #16
0
import model_classes
import model_pipeline
import data_hyperparameters
import data_downloader
from log_utils import create_logger, write_log
# Logger shared by the whole experiment script.
LOG_FILE = 'run_experiment'
logger = create_logger(LOG_FILE)

train_data, valid_data, test_data = data_downloader.get_dataloaders()

# Usage: add the models to run to the list below.
# Note: this will not work directly with LogisticRegressionBOW, as that
# model trains with a different dataset.
# Naive Bayes should also be run separately (with
# naive_bayes.report_statistics()), since it is based on a different paradigm.
models = [
    model_classes.TransformerEncoder(name='TransformerEncoder_500',
                                     num_layers=2,
                                     max_len=500),
]

# Train, plot and report each model in turn.
for model in models:
    write_log(f'Running experiment for {model.name}', logger)
    model_pipeline.train(
        model=model,
        train_data=train_data,
        valid_data=valid_data,
    )
    model.plot_losses()
    model_pipeline.report_statistics(
        model=model,
        train_data=train_data,
        valid_data=valid_data,
        test_data=test_data,
    )
def get_data():
    """Return the pad index plus index-mapped fit and test reviews.

    If any of ``FIT_FILE``, ``FIT_LABELS_FILE``, ``TEST_FILE`` or
    ``TEST_LABELS_FILE`` is missing, the reviews under
    ``.data/aclImdb/{train,test}/{pos,neg}/`` are tokenised, mapped to
    vocabulary indices and cached with ``save_data``. Otherwise the four
    cached objects are unpickled.

    Returns:
        Tuple ``(PAD_TOKEN, fit_mapped_tokens, fit_labels,
        test_mapped_tokens, test_labels)``. Within each split the ``pos``
        reviews (label 1) come first, followed by the ``neg`` reviews
        (label 0).
    """
    vocab = get_vocab()
    vocab_reverse_index = vocab.stoi
    PAD_TOKEN = vocab.stoi['<pad>']

    def _map_reviews(pattern):
        # Tokenise every file matching `pattern` into vocabulary indices.
        mapped_reviews = []
        for path in glob.iglob(pattern):
            # Decode explicitly as UTF-8 so the result does not depend on
            # the platform's locale default.
            with open(path, 'r', encoding='utf-8') as f:
                if data_hyperparameters.VERBOSE_LOGGING:
                    write_log('Processing {0}'.format(f), logger)
                text = f.read()
            mapped_reviews.append(
                [vocab_reverse_index[token] for token in tokenizer(text)])
        return mapped_reviews

    def _build_split(split):
        # Positive reviews first, then negative ones, with aligned labels.
        positive = _map_reviews('.data/aclImdb/{0}/pos/*'.format(split))
        negative = _map_reviews('.data/aclImdb/{0}/neg/*'.format(split))
        labels = [1] * len(positive) + [0] * len(negative)
        return positive + negative, labels

    def _load_cached(path):
        # Close the cache file deterministically after unpickling.
        with open(path, "rb") as cache_file:
            return pickle.load(cache_file)

    cache_files = (FIT_FILE, FIT_LABELS_FILE, TEST_FILE, TEST_LABELS_FILE)
    if not all(os.path.exists(path) for path in cache_files):
        write_log('Building fit and test data (this may take a while)', logger)
        now = datetime.now()
        fit_mapped_tokens, fit_labels = _build_split('train')
        save_data(fit_mapped_tokens, FIT_FILE)
        save_data(fit_labels, FIT_LABELS_FILE)
        write_log('Processed fit data', logger)
        test_mapped_tokens, test_labels = _build_split('test')
        save_data(test_mapped_tokens, TEST_FILE)
        save_data(test_labels, TEST_LABELS_FILE)
        write_log('Processed test data', logger)
        write_log(
            'Building fit and test data took {0} seconds'.format(
                (datetime.now() - now).total_seconds()), logger)
    else:
        write_log('Loading fit data from {0}'.format(FIT_FILE), logger)
        fit_mapped_tokens = _load_cached(FIT_FILE)
        write_log('Loading fit labels from {0}'.format(FIT_LABELS_FILE),
                  logger)
        fit_labels = _load_cached(FIT_LABELS_FILE)
        write_log('Loading test data from {0}'.format(TEST_FILE), logger)
        test_mapped_tokens = _load_cached(TEST_FILE)
        write_log('Loading test labels from {0}'.format(TEST_LABELS_FILE),
                  logger)
        test_labels = _load_cached(TEST_LABELS_FILE)
    return PAD_TOKEN, fit_mapped_tokens, fit_labels, test_mapped_tokens, test_labels
Beispiel #18
0
def train(model,
          train_data,
          valid_data,
          epochs=data_hyperparameters.EPOCHS,
          patience=data_hyperparameters.PATIENCE,
          teacher_forcing_scale_factor=data_hyperparameters.
          TEACHER_FORCING_SCALE_FACTOR):
    """Train ``model`` for ``epochs`` further epochs with teacher forcing.

    Every epoch performs one Adam optimisation pass over ``train_data`` using
    ``NLLLoss`` (targets equal to ``data_hyperparameters.PAD_TOKEN`` are
    ignored), then records the training BLEU, the mean validation batch loss
    and the validation BLEU on the model. ``ReduceLROnPlateau`` is stepped on
    the validation BLEU (``mode='max'``) and the model's teacher forcing
    proportion is multiplied by ``teacher_forcing_scale_factor``.

    All bookkeeping is stored on ``model`` (``num_epochs_trained``,
    ``latest_scheduled_lr``, ``lr_history``, ``train_losses``, ...), so a
    later call continues where the previous one stopped. Only the learning
    rate is carried over; the optimiser and scheduler are rebuilt on each
    call.

    Args:
        model: Module called as ``model(xb, yb, teacher_force=...)`` that also
            carries the bookkeeping attributes listed above.
        train_data: Sized iterable of ``(xb, yb)`` training batches.
        valid_data: Sized iterable of ``(xb, yb)`` validation batches.
        epochs: Number of additional epochs to run.
        patience: ``patience`` passed to ``ReduceLROnPlateau``.
        teacher_forcing_scale_factor: Per-epoch multiplier applied to
            ``model.teacher_forcing_proportion``.

    Returns:
        None. ``model`` is updated in place and moved back to the CPU when
        ``data_hyperparameters.USE_CUDA`` is set.
    """
    loss_function = torch.nn.NLLLoss(
        ignore_index=data_hyperparameters.PAD_TOKEN)
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    if model.latest_scheduled_lr is None:
        optimiser = torch.optim.Adam(model.parameters())
    else:
        # Resume with the learning rate reached by the previous call.
        optimiser = torch.optim.Adam(model.parameters(),
                                     lr=model.latest_scheduled_lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser,
                                                           patience=patience,
                                                           mode='max')
    # Batches are copied to the GPU here only when CUDA is in use and the
    # data was not already stored there.
    move_batches_to_gpu = (
        data_hyperparameters.USE_CUDA
        and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE)

    def compute_batch_loss(xb, yb, teacher_force):
        """Return the NLL loss tensor of the model on one batch."""
        if move_batches_to_gpu:
            xb = xb.cuda()
            yb = yb.cuda()
        # The first slice along dim 0 is dropped from both the model output
        # and the transposed targets (presumably the start-of-sequence
        # position -- TODO confirm against the model definition).
        predictions = model(xb, yb, teacher_force=teacher_force)[1:]
        targets = yb.transpose(0, 1)[1:]
        return loss_function(
            torch.flatten(predictions, start_dim=0, end_dim=1),
            torch.flatten(targets))

    now_begin_training = datetime.now()
    start_epoch = model.num_epochs_trained
    for epoch in range(start_epoch, epochs + start_epoch):
        now_begin_epoch = datetime.now()
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        model.teacher_forcing_proportion_history.append(
            model.teacher_forcing_proportion)
        model.lr_history.append(model.latest_scheduled_lr)
        write_log(
            'Running epoch {0} of {1} with learning rate {2} and teacher forcing rate {3}'
            .format(epoch + 1, epochs + start_epoch, model.latest_scheduled_lr,
                    model.teacher_forcing_proportion), logger)

        # Optimisation pass; teacher forcing is decided per batch.
        model.train()
        loss = 0.
        for xb, yb in train_data:
            teacher_force = random() < model.teacher_forcing_proportion
            batch_loss = compute_batch_loss(xb, yb, teacher_force)
            loss += batch_loss.item() / len(train_data)
            optimiser.zero_grad()
            batch_loss.backward()
            optimiser.step()
        model.train_losses.append(loss)
        write_log('Training loss: {0}'.format(loss), logger)

        # Evaluation pass.
        model.eval()
        train_bleu = average_bleu(train_data, model)
        write_log('Training BLEU: {0}'.format(train_bleu), logger)
        model.train_bleus.append(train_bleu)
        with torch.no_grad():
            # One loop for both device configurations; an empty valid_data
            # gives 0.0 instead of a ZeroDivisionError.
            loss = 0.
            for xb, yb in valid_data:
                loss += compute_batch_loss(
                    xb, yb, False).item() / len(valid_data)
        model.valid_losses.append(loss)
        write_log('Validation loss: {0}'.format(loss), logger)
        valid_bleu = average_bleu(valid_data, model)
        write_log('Validation BLEU: {0}'.format(valid_bleu), logger)
        model.valid_bleus.append(valid_bleu)
        scheduler.step(valid_bleu)
        # Persist any reduction the scheduler just made; otherwise a
        # reduction triggered by the final epoch is lost when training is
        # resumed in a later call.
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        model.num_epochs_trained += 1
        model.teacher_forcing_proportion *= teacher_forcing_scale_factor
        write_log(
            'Epoch took {0} seconds'.format(
                (datetime.now() - now_begin_epoch).total_seconds()), logger)
    model.train_time += (datetime.now() - now_begin_training).total_seconds()
    if data_hyperparameters.USE_CUDA:
        model.cpu()
Beispiel #19
0
def train(model,
          train_data,
          valid_data,
          epochs=data_hyperparameters.EPOCHS,
          patience=data_hyperparameters.PATIENCE,
          report_accuracy_every=data_hyperparameters.REPORT_ACCURACY_EVERY):
    """Train ``model`` for ``epochs`` further epochs with ``NLLLoss``.

    Every epoch performs one Adam optimisation pass over ``train_data``,
    computes the mean validation batch loss and steps ``ReduceLROnPlateau``
    on that loss. Every ``report_accuracy_every`` epochs the accuracy and
    mean model confidences returned by ``get_accuracy`` are logged and stored
    on the model for both data sets.

    All bookkeeping is stored on ``model`` (``num_epochs_trained``,
    ``latest_scheduled_lr``, ``lr_history``, ``train_losses``, ...), so a
    later call continues where the previous one stopped. Only the learning
    rate is carried over; the optimiser and scheduler are rebuilt on each
    call.

    Args:
        model: Module called as ``model(xb)`` that also carries the
            bookkeeping attributes listed above.
        train_data: Sized iterable of ``(xb, yb)`` training batches.
        valid_data: Sized iterable of ``(xb, yb)`` validation batches.
        epochs: Number of additional epochs to run.
        patience: ``patience`` passed to ``ReduceLROnPlateau``.
        report_accuracy_every: Report accuracy whenever the 1-based epoch
            number is a multiple of this value; ``None`` disables reporting.

    Returns:
        None. ``model`` is updated in place and moved back to the CPU when
        ``data_hyperparameters.USE_CUDA`` is set.
    """
    loss_function = torch.nn.NLLLoss()
    if data_hyperparameters.USE_CUDA:
        model.cuda()
    if model.latest_scheduled_lr is None:
        optimiser = torch.optim.Adam(model.parameters())
    else:
        # Resume with the learning rate reached by the previous call.
        optimiser = torch.optim.Adam(model.parameters(),
                                     lr=model.latest_scheduled_lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser,
                                                           patience=patience)
    # Batches are copied to the GPU here only when CUDA is in use and the
    # data was not already stored there.
    move_batches_to_gpu = (
        data_hyperparameters.USE_CUDA
        and not data_hyperparameters.STORE_DATA_ON_GPU_IF_AVAILABLE)

    def compute_batch_loss(xb, yb):
        """Return the NLL loss tensor of the model on one batch."""
        if move_batches_to_gpu:
            xb = xb.cuda()
            yb = yb.cuda()
        return loss_function(model(xb), yb)

    def report_accuracy(data, split_name, accuracies, correct_confidences,
                        incorrect_confidences, epoch_number):
        """Log accuracy/confidences for ``data`` and store them by epoch."""
        accuracy, mean_correct_prediction_probs, mean_incorrect_prediction_probs = get_accuracy(
            data, model, also_report_model_confidences=True)
        write_log('{0} accuracy: {1}'.format(split_name, accuracy), logger)
        accuracies[epoch_number] = accuracy
        write_log(
            'Model confidence: {0} (correct predictions), {1} (incorrect predictions)'
            .format(mean_correct_prediction_probs,
                    mean_incorrect_prediction_probs), logger)
        correct_confidences[epoch_number] = mean_correct_prediction_probs
        incorrect_confidences[epoch_number] = mean_incorrect_prediction_probs

    now_begin_training = datetime.now()
    start_epoch = model.num_epochs_trained
    for epoch in range(start_epoch, epochs + start_epoch):
        now_begin_epoch = datetime.now()
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        model.lr_history.append(model.latest_scheduled_lr)
        write_log(
            'Running epoch {0} of {1} with learning rate {2}'.format(
                epoch + 1, epochs + start_epoch, model.latest_scheduled_lr),
            logger)
        should_report_accuracy = (report_accuracy_every is not None and
                                  (epoch + 1) % report_accuracy_every == 0)

        # Optimisation pass.
        model.train()
        loss = 0.
        for xb, yb in train_data:
            batch_loss = compute_batch_loss(xb, yb)
            loss += batch_loss.item() / len(train_data)
            optimiser.zero_grad()
            batch_loss.backward()
            optimiser.step()
        model.train_losses.append(loss)
        write_log('Training loss: {0}'.format(loss), logger)

        # Evaluation pass.
        model.eval()
        if should_report_accuracy:
            report_accuracy(train_data, 'Training', model.train_accuracies,
                            model.train_correct_confidences,
                            model.train_incorrect_confidences, epoch + 1)
        with torch.no_grad():
            # One loop for both device configurations; an empty valid_data
            # gives 0.0 instead of a ZeroDivisionError.
            loss = 0.
            for xb, yb in valid_data:
                loss += compute_batch_loss(xb, yb).item() / len(valid_data)
        model.valid_losses.append(loss)
        scheduler.step(loss)
        # Persist any reduction the scheduler just made; otherwise a
        # reduction triggered by the final epoch is lost when training is
        # resumed in a later call.
        model.latest_scheduled_lr = optimiser.param_groups[0]['lr']
        write_log('Validation loss: {0}'.format(loss), logger)
        if should_report_accuracy:
            report_accuracy(valid_data, 'Validation', model.valid_accuracies,
                            model.valid_correct_confidences,
                            model.valid_incorrect_confidences, epoch + 1)
        model.num_epochs_trained += 1
        write_log(
            'Epoch took {0} seconds'.format(
                (datetime.now() - now_begin_epoch).total_seconds()), logger)
    model.train_time += (datetime.now() - now_begin_training).total_seconds()
    if data_hyperparameters.USE_CUDA:
        model.cpu()