# Third-party imports used below; the project-specific helpers (cuda, variable,
# argmax, get_pretrained_embeddings, get_sentence_from_indices, save_checkpoint,
# SentenceDataset, Seq2SeqModel) are assumed to come from the surrounding codebase.
import numpy as np
import torch
import torch.utils.data


def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models' \
                     '/baseline_frozen_pretrained'

    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]
    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=weight_decay)
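    # Padding positions are excluded from the loss via ignore_index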
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

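                # Flatten targets to (batch * seq_len,) and logits to
                # (batch * seq_len, vocab_size) for CrossEntropyLoss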
                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
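                    # Clip gradients to max_grad_norm before the optimizer update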
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                save_checkpoint(model, epoch_loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss),
                      end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                outputs_var = model(inputs_var.unsqueeze(
                    0))  # unsqueeze to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(
                    get_sentence_from_indices(inputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(
                    get_sentence_from_indices(targets, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(
                    get_sentence_from_indices(outputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print()


# Example 2
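# Third-party imports for this example; project helpers (cuda, variable, argmax,
# get_pretrained_embeddings, get_sentence_from_indices, load_checkpoint,
# save_checkpoint, encoder_accuracy, SentenceDataset, Seq2SeqModel) are assumed
# to be provided by the surrounding codebase.
import time

import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
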
def main():
    nb_epochs = 50
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = "/embeddings_min2_max15.npy"
    max_grad_norm = 5
    max_len = 15
    min_count = 2
    weight_decay = 0.00001
    learning_rate = 0.001
    model_group = "/auto_encoder"
    autoencoder_name = "/auto_encoder_3"
    autoencoder_version = 1
    project_file = "/home/mattd/PycharmProjects/reddit"
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"

    string = 'nb_epochs: {}\nbatch_size: {}\nhidden_size: {}\nembedding_dim: ' \
             '{}\npretrained_embeddings: {}\nmax_len: {}\nmin_count: '\
             '{}\nweight_decay: {}\nlearning_rate: {}\nmodel_group: ' \
             '{}\nautoencoder_name: {}\nautoencoder_version: {}\n'.format(
                nb_epochs, batch_size, hidden_size, embedding_dim,
                pretrained_embeddings, max_len, min_count, weight_decay,
                learning_rate, model_group, autoencoder_name, autoencoder_version)
    print(string)
    output = string + '\n'

    # embedding_filename = 'embeddings_20_1.npy'

    model_filename = '{}{}s{}'.format(
        project_file, model_group, autoencoder_name)

    new_model_filename = '{}_{}'.format(model_filename, autoencoder_version)

    output_file = '{}{}_outputs{}_{}'.format(
        project_file, model_group, autoencoder_name, autoencoder_version)

    description_filename = \
        '{}/description/description_1.txt'.format(project_file)

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)

    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    output += string + '\n'

    # getting pretrained embeddings
    if pretrained_embeddings is not None:
        embeddings_dir = '{}{}'.format(project_file, pretrained_embeddings)
        pretrained_embeddings = cuda(
            get_pretrained_embeddings(embeddings_dir))
        embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset_val.vocab)
    padding_idx = dataset_val.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset_val.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(hidden_size, padding_idx, init_idx,
        max_len, vocab_size, embedding_dim, pretrained_embeddings)

    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(
        parameters, amsgrad=True, weight_decay=weight_decay, lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset_val.vocab[
        SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss, val_loss, found_model = load_checkpoint(model_filename, model, optimizer)

    if found_model:
        string = 'Loaded Model:\nlowest_validation_loss: {}\ndescription: {}' \
                 '\nlast_epoch:{}\n'.format(lowest_loss, description,
                                            last_epoch)
    else:
        string = 'No model found at {}\n'.format(model_filename)

    print(string)
    output = output + string + '\n'

    outfile = open(output_file, 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    intervals = 6

    for epoch in range(last_epoch, last_epoch+nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentenence_accuracy = []
            epoch_token_accuracy = []
            j = 1

            for i, inputs in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                inputs = variable(inputs)
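                # Autoencoder objective: the targets are a copy of the inputs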
                targets = variable(inputs)

                outputs = model.auto_encoder(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()
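                    # Log the running average loss `intervals` times per epoch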
                    if (len(data_loader) / intervals)*j <= i+1:
                        train_loss.append(average_epoch_loss)
                        string = (
                            'Epoch {:03d} Example {:03d} | {} loss: {:.3f}'.format(
                             epoch, i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1

                else:
                    predicted = torch.argmax(
                        outputs.view(-1, max_len, vocab_size), -1)
                    batch_sentence_accuracy, batch_token_accuracy = encoder_accuracy(
                        targets.view(-1, max_len), predicted)
                    epoch_sentenence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

            if phase == 'val':
                averege_epoch_sentenence_accuracy = np.mean(epoch_sentenence_accuracy)
                averege_epoch_token_accuracy = np.mean(epoch_token_accuracy)

                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                print(string, end='')
                output = output + '\n' + string + '\n'

                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    averege_epoch_sentenence_accuracy, averege_epoch_token_accuracy)
                print(string, end='\n')
                output = output + string + '\n'

                if average_epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, average_epoch_loss, optimizer, new_model_filename,
                        description_filename, epoch, train_loss, val_loss)
                    lowest_loss = average_epoch_loss

                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model.auto_encoder(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    inputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))

                string = string + u'= {}\n'.format(get_sentence_from_indices(
                    targets, dataset_val.vocab, SentenceDataset.EOS_TOKEN))

                string = string + u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                output = output + string + '\n' + '\n'
        outfile = open(output_file, 'w')
        outfile.write(output)
        outfile.close()


# Example 3
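# Third-party imports for this example; project helpers (cuda, variable,
# get_pretrained_embeddings, get_sentence_from_indices, load_params,
# load_checkpoint, save_checkpoint, classifier_accuracy, PairsDataset,
# Seq2SeqModel) are assumed to come from the surrounding codebase.
import time

import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
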
def main():
    file = {
        "model_group": "/seq_len_exp",
        "model_name": "/generation_6",
        "old_model_name": None,
        "model_version": 0,
        "project_file": "/home/mattd/PycharmProjects/reddit/generation"
    }

    file["dataset_path"] = "{}/data/".format(file["project_file"])

    file["model_filename"] = '{}{}s{}_{}'.format(file["project_file"],
                                                 file["model_group"],
                                                 file["model_name"],
                                                 file["model_version"])

    file["output_file"] = '{}{}_outputs{}_{}'.format(file["project_file"],
                                                     file["model_group"],
                                                     file["model_name"],
                                                     file["model_version"])

    # check_files(file)

    use_old_model = file["old_model_name"] is not None
    params = {}

    if use_old_model:
        file["old_model_filename"] = '{}{}s{}'.format(file["project_file"],
                                                      file["model_group"],
                                                      file["old_model_name"])
        params, old_files = load_params(file["old_model_filename"])
        use_old_model = old_files != {}

    if not use_old_model:
        params = {
            "batch_size": 1000,
            "hidden_size": 256,
            "embedding_dim": 300,
            "pretrained_embeddings": True,
            "max_grad_norm": 5,
            "max_len": 30,
            "min_count": 2,
            "weight_decay": 0.00001,
            "learning_rate": 0.005,
        }

    params["num_training_examples"] = 78260
    params["num_val_examples"] = -1
    params["nb_epochs"] = 40

    if params["pretrained_embeddings"]:
        file["pretrained_embeddings_file"] = \
            "/embeddings/embeddings_min{}_max{}.npy".format(
            params["min_count"], params["max_len"])

    string = ""
    for k, v in file.items():
        string += "{}: {}\n".format(k, v)
    for k, v in params.items():
        string += "{}: {}\n".format(k, v)

    print(string)
    output = string + '\n'

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(file["dataset_path"])
    dataset_val_filename = "{}validation.csv".format(file["dataset_path"])

    dataset_train = PairsDataset(dataset_train_filename, params["max_len"],
                                 params["min_count"])
    dataset_val = PairsDataset(dataset_val_filename, params["max_len"],
                               params["min_count"], dataset_train.vocab)

    string = 'Vocab size {}\n'.format(len(dataset_train.vocab))
    string += 'Train {} '.format(len(dataset_train))

    if params["num_training_examples"] != -1:
        dataset_train.prune_examples(params["num_training_examples"])
        string += '> {}'.format(len(dataset_train))

    string += '\nVal: {}'.format(len(dataset_val))

    if params["num_val_examples"] != -1:
        dataset_val.prune_examples(params["num_val_examples"])
        string += '-> {}'.format(len(dataset_val))

    print(string)
    output += string + '\n'

    if params["pretrained_embeddings"]:
        embeddings_dir = '{}{}'.format(file["project_file"],
                                       file["pretrained_embeddings_file"])
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        params["embedding_dim"] = pretrained_embeddings.shape[1]
    else:
        pretrained_embeddings = None

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    params["batch_size"],
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  params["batch_size"],
                                                  shuffle=False)

    vocab_size = len(dataset_train.vocab)
    padding_idx = dataset_train.vocab[PairsDataset.PAD_TOKEN]
    init_idx = dataset_train.vocab[PairsDataset.INIT_TOKEN]

    model = Seq2SeqModel(params["hidden_size"], padding_idx, init_idx,
                         params["max_len"], vocab_size,
                         params["embedding_dim"], pretrained_embeddings)

    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=params["weight_decay"],
                                 lr=params["learning_rate"])
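    # Binary classification loss over the pair labels; no padding index is needed here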
    criterion = torch.nn.CrossEntropyLoss()

    if use_old_model:
        model, optimizer = load_checkpoint(file["old_model_filename"], model,
                                           optimizer)

    lowest_loss = 100
    train_loss = []
    val_loss = []
    best_model = model
    best_optimizer = optimizer
    average_epoch_loss = 0

    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    outfile = open(file["output_file"], 'w')
    outfile.write(output)
    outfile.close()

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    intervals = 2
    highest_acc = 0

    for epoch in range(0, params["nb_epochs"]):
        start = time.clock()
        string = 'Epoch: {}\n'.format(epoch)
        print(string, end='')
        output = output + '\n' + string

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
                string = 'Train: \n'
            else:
                model.eval()
                string = 'Validation \n'

            print(string, end='')
            output = output + '\n' + string

            epoch_loss = []
            epoch_accuracy = []
            epoch_precision = []
            epoch_recall = []
            epoch_f1 = []
            j = 1

            for i, (sentence_1, sentence_2,
                    labels) in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                sentence_1 = variable(sentence_1)
                sentence_2 = variable(sentence_2)
                targets = variable(labels)

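                # Forward pass over the sentence pair; outputs are scores for the
                # two classes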
                outputs = model(sentence_1, sentence_2, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters,
                                                   params["max_grad_norm"])
                    optimizer.step()
                    if (len(data_loader) / intervals) * j <= i + 1:
                        string = ('Example {:03d} | {} loss: {:.3f}'.format(
                            i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    # get result metrics
                    accuracy, precision, recall, f1 = classifier_accuracy(
                        targets.cpu().numpy(),
                        torch.argmax(outputs.view(-1, 2), -1).cpu().numpy())
                    #print('{},{},{},{}'.format(accuracy, precision, recall,
                    # f1))
                    epoch_accuracy.append(accuracy)
                    epoch_precision.append(precision)
                    epoch_recall.append(recall)
                    epoch_f1.append(f1)

            # print random sentence
            if phase == 'val':
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                string += ' | lowest loss: {:.3f} highest accuracy:' \
                    ' {:.3f}'.format(lowest_loss, highest_acc)
                print(string, end='\n')
                output = output + '\n' + string + '\n'

                average_epoch_accuracy = np.mean(epoch_accuracy)
                average_epoch_precision = np.mean(epoch_precision)
                average_epoch_recall = np.mean(epoch_recall)
                average_epoch_f1 = np.mean(epoch_f1)
                metrics["accuracy"].append(average_epoch_accuracy)
                metrics["precision"].append(average_epoch_precision)
                metrics["recall"].append(average_epoch_recall)
                metrics["f1"].append(average_epoch_f1)

                if average_epoch_loss < lowest_loss:
                    best_model = model
                    best_optimizer = optimizer
                    best_epoch = epoch
                    lowest_loss = average_epoch_loss

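                # Checkpoint both the best-so-far and the current model/optimizer
                # state every epoch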
                save_checkpoint(best_epoch, best_model, best_optimizer, epoch,
                                model, optimizer, train_loss, val_loss,
                                metrics, params, file)

                if average_epoch_accuracy > highest_acc:
                    highest_acc = average_epoch_accuracy

                string = "Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\n" \
                         "F1: {:.3f}\n".format(
                    average_epoch_accuracy, average_epoch_precision,
                    average_epoch_recall, average_epoch_f1)
                print(string, end='\n')
                output = output + string + '\n'

                random_idx = np.random.randint(len(dataset_val))
                sentence_1, sentence_2, labels = dataset_val[random_idx]
                targets = labels
                sentence_1_var = variable(sentence_1)
                sentence_2_var = variable(sentence_2)

                # unsqueeze to get the batch dimension
                outputs_var = model(sentence_1_var.unsqueeze(0),
                                    sentence_2_var.unsqueeze(0))
                outputs = outputs_var.squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(
                    get_sentence_from_indices(sentence_1, dataset_val.vocab,
                                              PairsDataset.EOS_TOKEN))

                string = string + u'> {}\n'.format(
                    get_sentence_from_indices(sentence_2, dataset_val.vocab,
                                              PairsDataset.EOS_TOKEN))

                string = string + u'target:{}|  P false:{:.3f}, P true:' \
                    u' {:.3f}'.format(targets, float(outputs[0]), float(outputs[1]))
                print(string, end='\n\n')
                output = output + string + '\n' + '\n'
            else:
                train_loss.append(average_epoch_loss)
        outfile = open(file["output_file"], 'w')
        outfile.write(output)
        outfile.close()


# Example 4
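# Third-party imports for this example; project helpers (cuda, variable, argmax,
# accuracy, get_pretrained_embeddings, get_sentence_from_indices, load_checkpoint,
# save_checkpoint, SentenceDataset, Seq2SeqModel) are assumed to come from the
# surrounding codebase.
import time

import numpy as np
import torch
import torch.utils.data
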
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3' \
                     '/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'

    outfile = open(output_file, 'w')

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string+'\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(dataset, [train_len, val_len])
    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string+'\n')

    embeddings_dir = '/home/mattd/pycharm/encoder' \
                     '/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(
        pretrained_embeddings, hidden_size, padding_idx, init_idx,
        max_len, vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss, val_loss = load_checkpoint(model_filename, model, optimizer)

    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]


    for epoch in range(last_epoch, last_epoch+nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentenence_accuracy = []
            epoch_token_accuracy = []

            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

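                # On validation, compare greedy (argmax) predictions with the
                # targets for sentence- and token-level accuracy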
                if phase == 'val':
                    predicted = torch.argmax(outputs.view(batch_size, max_len,
                                                          -1), -1)
                    batch_sentenence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentenence_accuracy.append(batch_sentenence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)
                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = ('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss))

                print(string, end='\n')
                outfile.write(string+'\n')
            else:
                averege_epoch_sentenence_accuracy = sum(epoch_sentenence_accuracy) / \
                    len(epoch_sentenence_accuracy)
                averege_epoch_token_accuracy = sum(epoch_token_accuracy) / \
                    len(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')

                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    averege_epoch_sentenence_accuracy, averege_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string+'\n')
                if epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, epoch_loss, optimizer, model_filename,
                        description_filename, epoch, train_loss, val_loss)
                    lowest_loss = epoch_loss

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                outputs_var = model(inputs_var.unsqueeze(0)) # unsqueeze to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')
                print()
    outfile.close()


# Example 5
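# Third-party imports for this example; project helpers (cuda, variable,
# get_sentence_from_indices, TSVSentencePairDataset, Seq2SeqModel) are assumed
# to come from the surrounding codebase.
import numpy as np
import torch
import torch.utils.data
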
def main():
    nb_epochs = 30
    #nb_epochs = 1
    batch_size = 64
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001

    eng_fr_filename = './data/eng-fra.txt'
    dataset = TSVSentencePairDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[TSVSentencePairDataset.PAD_TOKEN]
    init_idx = dataset.vocab[TSVSentencePairDataset.INIT_TOKEN]
    model = Seq2SeqModel(vocab_size, embedding_dim, hidden_size, padding_idx,
                         init_idx, max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[TSVSentencePairDataset.PAD_TOKEN])

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)
            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss),
                      end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)
                print("Test")

                outputs_var = model(inputs_var.unsqueeze(
                    0))  # unsqueeze to get the batch dimension

                #outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

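                # Sample each output token from the softmax distribution instead
                # of taking the argmax (commented out above)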
                softmax = torch.nn.Softmax(dim=1)
                outputs = softmax(outputs_var)
                outputs = torch.multinomial(outputs, 1).data.view(-1)

                print(u'> {}'.format(
                    get_sentence_from_indices(
                        inputs, dataset.vocab,
                        TSVSentencePairDataset.EOS_TOKEN)))
                print(u'= {}'.format(
                    get_sentence_from_indices(
                        targets, dataset.vocab,
                        TSVSentencePairDataset.EOS_TOKEN)))
                print(u'< {}'.format(
                    get_sentence_from_indices(
                        outputs, dataset.vocab,
                        TSVSentencePairDataset.EOS_TOKEN)))

                print()