Example #1
def get_training_setting():

    with open('vocab.pkl', 'rb') as file:
        vocab = pkl.load(file)
    
    reverse_vocab = dict((v, k) for k, v in vocab.items())

    emsize = 40 # embedding dimension
    nhid = 40 # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 3 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 4 # the number of heads in the multiheadattention models
    dropout = 0.2 # the dropout value
    model = TransformerModel(emsize, len(vocab), nhead, nhid, nlayers, dropout).to(device)

    lr = 1e-2 # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.5)
    dataloader = Data.DataLoader(
        dataset=SentenceDataset('train.csv', vocab), batch_size=64,
        shuffle=True, collate_fn=collate_fn, num_workers=4
    )
    val_data = Data.DataLoader(
        dataset=SentenceDataset('valid.csv', vocab), batch_size=64,
        shuffle=False, collate_fn=collate_fn, num_workers=4
    )
    return model, scheduler, optimizer, dataloader, val_data, reverse_vocab
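The DataLoaders above rely on a `collate_fn` that pads each batch of token sequences to a common length. A minimal sketch of such a function, assuming each dataset item is a 1-D LongTensor of indices and that index 0 is the padding token (the real `collate_fn` and `SentenceDataset` output format may differ):

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, pad_idx=0):
    # batch: list of 1-D LongTensors of token indices, one per sentence
    lengths = torch.tensor([len(seq) for seq in batch])
    # pad every sequence to the length of the longest one in the batch
    padded = pad_sequence(batch, batch_first=True, padding_value=pad_idx)
    return padded, lengths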
Example #2
def main():
    max_len = 15
    min_count = 2

    embeddings_dir = '/home/mattd/embeddings/reddit_2/'
    #dataset_path = '/home/mattd/datasets/AskReddit/'
    dataset_path = "/home/mattd/PycharmProjects/reddit/generation/data/"
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)
    save_dir = "/home/mattd/PycharmProjects/reddit/generation/embeddings/"

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)
    #dataset.add_file(eng_fr_filename2)

    vectors = embeddings.load_from_dir(embeddings_dir)

    #emb = embeddings.load_from_dir(embeddings_dir)

    embs_matrix = np.zeros((len(dataset_val.vocab), len(vectors.matrix[0])))

    for i, token in enumerate(dataset_val.vocab.token2id):
        if vectors.has_word(token):
            embs_matrix[i] = vectors.get_vector(token)
    np.save('{}embeddings_min{}_max{}'.format(save_dir, min_count, max_len),
            embs_matrix)
Example #3
def main() -> None:
    tokenizer = Tokenizer(args.vocab_file)
    vocabulary_size = len(tokenizer)
    dataset = SentenceDataset(args.input_file, tokenizer=tokenizer.encode)
    loader = DataLoader(dataset,
                        args.batch_size,
                        shuffle=False,
                        collate_fn=dataset.collate_fn,
                        drop_last=False)

    searcher = BeamSearch(tokenizer.eos_index, beam_size=args.search_width)

    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=0.,
        word_dropout=0.,
        dropped_index=tokenizer.unk_index,
    ).to(device)
    model.load_state_dict(torch.load(args.checkpoint_file,
                                     map_location=device))
    model.eval()

    print('Generating sentence...')
    all_hypotheses = []
    with torch.no_grad():
        for s in tqdm(loader):
            s = s.to(device)
            length = torch.sum(s != tokenizer.pad_index, dim=-1)
            bsz = s.shape[0]

            mean, logvar = model.encode(s, length)
            # z = model.reparameterize(mean, logvar)
            z = mean

            hidden = model.fc_hidden(z)
            hidden = hidden.view(bsz, -1,
                                 model.dim_hidden).transpose(0,
                                                             1).contiguous()

            start_predictions = torch.zeros(bsz, device=device).fill_(
                tokenizer.bos_index).long()
            start_state = {'hidden': hidden.permute(1, 0, 2)}
            predictions, log_probabilities = searcher.search(
                start_predictions, start_state, model.step)

            for preds in predictions:
                tokens = preds[0]
                tokens = tokens[tokens != tokenizer.eos_index].tolist()
                all_hypotheses.append(tokenizer.decode(tokens))
    print('Done')

    with open(args.output_file, 'w') as f:
        f.write('\n'.join(all_hypotheses))
Example #4
def main(_):

    train_path = FLAGS.train_path
    val_path = FLAGS.val_path
    vocab_path = FLAGS.vocab_path
    train_batch_size = FLAGS.train_batch_size
    val_batch_size = FLAGS.val_batch_size
    save_model = FLAGS.save_model

    embed_size = FLAGS.embed_size
    hidden_size = FLAGS.hidden_size
    lr = FLAGS.lr
    epochs = FLAGS.epochs

    save_every = FLAGS.save_every
    display_every = FLAGS.display_every 

    device = torch.device('cuda:0' if FLAGS.device == 'cuda' else 'cpu')
    vocab = Vocabulary.load(vocab_path)

    if FLAGS.load_model:
        load_model = FLAGS.load_model
        model = Paraphraser.load(load_model, device)

    else:
        model = Paraphraser(embed_size, hidden_size, vocab, device)
        # uniformly initialize the parameters
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)

    train_data_source, train_data_target = read(train_path)
    val_data_source, val_data_target = read(val_path)

    train_dataset = SentenceDataset(train_data_source, train_data_target, vocab)
    train_loader = torch.utils.data.DataLoader(train_dataset, train_batch_size, shuffle=True)

    val_dataset = SentenceDataset(val_data_source, val_data_target, vocab)
    val_loader = torch.utils.data.DataLoader(val_dataset, val_batch_size)

    optimizer = torch.optim.Adam(model.parameters(), lr)

    print('Started training... ')
    train(model, vocab, train_loader, val_loader, optimizer, embed_size,
          hidden_size, epochs, save_model, save_every, display_every, device)
    convert_onnx(model.load(save_model, device), val_loader, vocab) 
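`read(train_path)` above returns parallel source and target sentence lists. A hedged sketch of such a reader, assuming a tab-separated file with one source/target pair per line (the actual file format is not shown in this example):

def read(path):
    # hypothetical reader: each line holds "source<TAB>target"
    sources, targets = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            src, tgt = line.rstrip('\n').split('\t')
            sources.append(src)
            targets.append(tgt)
    return sources, targets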
Example #5
def main():
    embeddings_dir = '/mnt/data1/embeddings/crawl/'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, 20, 2)

    emb = embeddings.load_from_dir(embeddings_dir)
    vocab_embs = np.zeros((len(dataset.vocab), emb.matrix.shape[1]))

    for i, token in enumerate(dataset.vocab.token2id):
        if emb.has_word(token):
            vocab_embs[i] = emb.get_vector(token)
    np.save('embeddings', vocab_embs)
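The matrix saved above can later be loaded to initialize an embedding layer. A small illustrative snippet (the file name and the choice to freeze the weights are assumptions, not part of this example):

import numpy as np
import torch
import torch.nn as nn

vocab_embs = np.load('embeddings.npy')
embedding = nn.Embedding.from_pretrained(
    torch.tensor(vocab_embs, dtype=torch.float),
    freeze=True,  # keep the pretrained vectors fixed; set False to fine-tune
)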
Example #6
def main():
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001

    model_filename_1 = '/home/mattd/pycharm/encoder/models3/Baseline'
    model_filename_2 = '/home/mattd/pycharm/encoder/models3/Attention'

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(pretrained_embeddings, hidden_size, padding_idx,
                         init_idx, max_len, vocab_size, embedding_dim)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=weight_decay)

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss_1, val_loss_1 = load_checkpoint(model_filename_1, model,
                                               optimizer)

    model = Seq2SeqModelAttention(pretrained_embeddings, hidden_size,
                                  padding_idx, init_idx, max_len, vocab_size,
                                  embedding_dim)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=weight_decay)

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss_2, val_loss_2 = load_checkpoint(model_filename_2, model,
                                               optimizer)

    plot_data(train_loss_1, val_loss_1)
    plot_data(train_loss_2, val_loss_2)
Example #7
def do_test(model_ckpt_path, test_data_path, result_path, word_dict_path,
            emb_dim, hid_dim):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    word_dict = load_pkl(word_dict_path)
    PAD_IDX = word_dict.word2idx("<PAD>")
    print("load data...")
    testData = SentenceDataset(load_pkl(test_data_path),
                               word_dict,
                               PAD_IDX,
                               training=False)
    print("load model...")
    model = get_model(word_dict.get_len(), word_dict.get_len(), emb_dim,
                      hid_dim, device)
    model.load_state_dict(torch.load(model_ckpt_path))
    model.to(device)
    print("predicting...")
    make_prediction(model, testData, word_dict, result_path, device)
Example #8
def main():
    embeddings_dir = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/w2vec.pkl'
    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train.txt'
    eng_fr_filename2 = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/test.txt'

    dataset = SentenceDataset(eng_fr_filename, 20, 2)
    #dataset.add_file(eng_fr_filename2)
    dataset.vocab.prune_vocab(2)

    vectors = get_vectors(embeddings_dir)

    #emb = embeddings.load_from_dir(embeddings_dir)

    embs_matrix = np.zeros((len(dataset.vocab), vectors['r'].size))

    for i, token in enumerate(dataset.vocab.token2id):
        if token in vectors:
            embs_matrix[i] = vectors[token]
    np.save('embeddings_2min', embs_matrix)
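`get_vectors` is not defined in this example; given that the path points at `w2vec.pkl` and tokens are looked up with `token in vectors` and `vectors['r'].size`, it plausibly returns a pickled dict of token to vector. A hedged sketch under that assumption:

import pickle

def get_vectors(path):
    # assumed format: a pickled dict mapping token -> 1-D numpy vector
    with open(path, 'rb') as f:
        return pickle.load(f)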
Example #9
def __init__(self):
    self._dataset = SentenceDataset()
    self.sentence_dataset_names = ["MSRvid", "SmartTextile", "MTurk", "environment"]
Example #10
class SentenceSimEvaluation(WordSimEvaluation):

    def __init__(self):
        self._dataset = SentenceDataset()
        self.sentence_dataset_names = ["MSRvid", "SmartTextile", "MTurk", "environment"]

    def evaluate_all_sentence_datasets(self, display_table=True, **kwargs):
        cors = [self.evaluate_sentence_similarity(name, **kwargs) for name in self.sentence_dataset_names]
        if display_table:
            df_wpath = pd.DataFrame([cors], index=["wpath"], columns=self.sentence_dataset_names)
            return display(df_wpath)
        return cors

    def evaluate_sentence_similarity(self, dataset_name="MSRvid", metric="wpath_graph", relatedness=True, save_results=False, database="wikidata"):
        concepts, cc, texts = get_ideas_in_format(dataset_name, database=database)
        KG = DAC(concepts=concepts, dataset=dataset_name, relatedness=relatedness, database=database)
        if len(KG.graph) == 0:
            print("start building knowledge graph")
            KG.build_nx_graph()

        ConSim = ConceptSimilarity(KG)
        sim_M = ConSim.similarityMatrix(lcs_pref_value="freq1", metric=metric)
        WMD = WordMoversSimilarity(sim_M, KG._concepts)

        sen_pairs, human_sim = self._dataset.load_sentence_pairs_and_similarities(dataset_name)
        sim_values = []
        map_sen2bow = dict(zip(texts, [[c["id"] for c in bow] for bow in cc]))
        pg, total_len = 0, len(sen_pairs)
        remove_index = []
        for sen1, sen2 in sen_pairs:
            show_progression(pg, total_len)
            bow1, bow2 = list(set(map_sen2bow[sen1]) & set(KG._concepts)), list(set(map_sen2bow[sen2]) & set(KG._concepts))
            sim_value = WMD.word_mover_distance(bow1, bow2)
            if sim_value is None:
                print(sen1, sen2)
                remove_index.append(pg)
            else:
                sim_values.append(sim_value)
            pg = pg+1
        
        human_sim = np.delete(human_sim, remove_index)
        cor = pearsonr(sim_values, human_sim)[0]
        if save_results:
            results = list(zip([round(x, 3) for x in sim_values], sen_pairs))
            self._dataset.save_dataset(
                dict(zip(("correlation", "similarities"), (cor, results))),
                dataset_name + "_" + metric)
        return cor

    def compute_concept_sentence_M(self, dataset_name="gold", database="wikidata", metric="wpath", lcs_pref_value="freq1", relatedness=True):
        concepts, cc, texts = get_ideas_in_format(dataset_name, database=database)
        bows = [[c["id"] for c in bow] for bow in cc]
        KG = DAC(concepts=concepts, dataset=dataset_name, relatedness=relatedness, database=database)
        if len(KG.graph) == 0:
            print("start building knowledge graph")
            KG.build_nx_graph()

        ConSim = ConceptSimilarity(KG)
        sim_M = ConSim.similarityMatrix(lcs_pref_value="freq1", metric=metric)
        WMD = WordMoversSimilarity(sim_M, KG._concepts)
        ideaM = WMD.sentenceSimilarityMatrix(bows)
        con2ideaM = WMD.concepts2sentenceSIM(bows)
        SIM_data = {
            "concepts": KG._concepts,
            "ideas": texts,
            "conceptSIM": sim_M.tolist(),
            "ideaSIM": ideaM.tolist(),
            "concept2ideaSIM": con2ideaM.tolist()
        }
        print(len(SIM_data["concepts"]) == len(SIM_data["conceptSIM"]))
        self._dataset.save_dataset(SIM_data, dataset_name)
        return SIM_data
Example #11
def main():
    logger = logging.getLogger(__name__)
    handler1 = logging.StreamHandler()
    handler1.setLevel(logging.INFO)
    handler2 = logging.FileHandler(filename=args.log_file, mode='w')
    handler2.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)8s %(message)s"))
    handler2.setLevel(logging.INFO)
    logger.setLevel(logging.INFO)
    logger.addHandler(handler1)
    logger.addHandler(handler2)

    tokenizer = Tokenizer(args.vocab_file)
    train_dataset = SentenceDataset(args.train_file, tokenizer.encode)
    valid_dataset = SentenceDataset(args.valid_file, tokenizer.encode)
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              args.batch_size,
                              shuffle=False,
                              collate_fn=valid_dataset.collate_fn,
                              drop_last=True)

    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=args.dropout,
        word_dropout=args.word_dropout,
        dropped_index=tokenizer.unk_index,
    ).to(device)

    annealer = KLAnnealer(x0=args.x0, k=args.k)

    criterion = LmCrossEntropyLoss(tokenizer.pad_index, reduction='batchmean')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 betas=(0.9, 0.98),
                                 eps=1e-09)

    logger.info('Start training')
    for epoch in range(args.num_epochs):
        train_loss, train_ce_loss, train_kl_loss = 0., 0., 0.
        valid_loss, valid_ce_loss, valid_kl_loss = 0., 0., 0.
        pbar = tqdm(train_loader)
        pbar.set_description("[Epoch %d/%d]" % (epoch, args.num_epochs))

        # Train
        model.train()
        for itr, s in enumerate(pbar):
            beta = annealer()

            s = s.to(device)
            length = torch.sum(s != tokenizer.pad_index, dim=-1)
            output, mean, logvar, z = model(s, length)
            ce_loss = criterion(output[:, :-1, :], s[:, 1:])
            kl_loss = -0.5 * torch.mean(
                torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1))
            loss = ce_loss + beta * kl_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            annealer.step()

            train_loss += loss.item()
            train_ce_loss += ce_loss.item()
            train_kl_loss += kl_loss.item()
            if itr % args.print_every == 0:
                pbar.set_postfix(loss=train_loss / (itr + 1), beta=beta)
        train_loss /= len(train_loader)
        train_ce_loss /= len(train_loader)
        train_kl_loss /= len(train_loader)

        # Valid
        model.eval()
        with torch.no_grad():
            for s in valid_loader:
                beta = annealer()

                s = s.to(device)
                length = torch.sum(s != tokenizer.pad_index, dim=-1)
                output, mean, logvar, z = model(s, length)
                ce_loss = criterion(output[:, :-1, :], s[:, 1:])
                kl_loss = -0.5 * torch.mean(
                    torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1))
                loss = ce_loss + beta * kl_loss

                valid_loss += loss.item()
                valid_ce_loss += ce_loss.item()
                valid_kl_loss += kl_loss.item()
            valid_loss /= len(valid_loader)
            valid_ce_loss /= len(valid_loader)
            valid_kl_loss /= len(valid_loader)

        logger.info(
            '[Epoch %d/%d] Training loss: %.2f, CE loss: %.2f, KL loss: %.2f, Validation loss: %.2f, CE loss: %.2f, KL loss: %.2f'
            % (
                epoch,
                args.num_epochs,
                train_loss,
                train_ce_loss,
                train_kl_loss,
                valid_loss,
                valid_ce_loss,
                valid_kl_loss,
            ))

        torch.save(model.state_dict(), args.checkpoint_file)
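`KLAnnealer` is instantiated above but not shown. A common choice is a logistic (sigmoid) schedule for the KL weight, as in Bowman et al.'s sentence-VAE work; a minimal sketch under the assumption that `x0` is the midpoint step and `k` the steepness (the actual class may be implemented differently):

import math

class KLAnnealer:
    """Logistic schedule for the KL weight; step() advances one training batch."""

    def __init__(self, x0, k):
        self.x0 = x0  # step at which the weight reaches 0.5
        self.k = k    # steepness of the sigmoid
        self.t = 0    # current step

    def __call__(self):
        return 1.0 / (1.0 + math.exp(-self.k * (self.t - self.x0)))

    def step(self):
        self.t += 1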
Example #12
def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models' \
                     '/baseline_frozen_pretrained'

    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]
    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                save_checkpoint(model, epoch_loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss),
                      end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                outputs_var = model(inputs_var.unsqueeze(
                    0))  # unsqueeze to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(
                    get_sentence_from_indices(inputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(
                    get_sentence_from_indices(targets, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(
                    get_sentence_from_indices(outputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print()
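`get_sentence_from_indices` is used above (and in the next example) to turn a sequence of indices back into text up to the EOS token. A hedged sketch, assuming the vocab exposes a `token2id` mapping as in the earlier embedding examples (the real helper may differ):

def get_sentence_from_indices(indices, vocab, eos_token):
    # hypothetical inverse lookup; built once per call for simplicity
    id2token = {idx: tok for tok, idx in vocab.token2id.items()}
    tokens = []
    for idx in indices:
        token = id2token.get(int(idx), '<unk>')
        if token == eos_token:
            break
        tokens.append(token)
    return ' '.join(tokens)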
Example #13
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3' \
                     '/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'

    outfile = open(output_file, 'w')

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string+'\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(dataset, [train_len, val_len])
    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string+'\n')

    embeddings_dir = '/home/mattd/pycharm/encoder' \
                     '/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(pretrained_embeddings, hidden_size, padding_idx,
                         init_idx, max_len, vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss, val_loss = load_checkpoint(model_filename, model, optimizer)

    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]


    for epoch in range(last_epoch, last_epoch+nb_epochs):
        start = time.perf_counter()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []

            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                if phase == 'val':
                    predicted = torch.argmax(outputs.view(batch_size, max_len,
                                                          -1), -1)
                    batch_sentence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)
                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = ('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss))

                print(string, end='\n')
                outfile.write(string+'\n')
            else:
                average_epoch_sentence_accuracy = sum(epoch_sentence_accuracy) / \
                    len(epoch_sentence_accuracy)
                average_epoch_token_accuracy = sum(epoch_token_accuracy) / \
                    len(epoch_token_accuracy)
                time_taken = time.perf_counter() - start

                val_loss.append(epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')

                string = '| sentence accuracy: {:.3f} | token accuracy: {:.3f}'.format(
                    average_epoch_sentence_accuracy, average_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string+'\n')
                if epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, epoch_loss, optimizer, model_filename,
                        description_filename, epoch, train_loss, val_loss)
                    lowest_loss = epoch_loss



            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                outputs_var = model(inputs_var.unsqueeze(0)) # unsqueeze to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')
                print()
    outfile.close()
Example #14
def predict(path):


    with open('vocab.pkl', 'rb') as file:
        vocab = pkl.load(file)
    reverse_vocab = {v: k for k, v in vocab.items()}
    emsize = 40 # embedding dimension
    nhid = 40 # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 3 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 4 # the number of heads in the multiheadattention models
    dropout = 0.2 # the dropout value
    model = TransformerModel(emsize, len(vocab), nhead, nhid, nlayers, dropout).to(device)
    model.load_state_dict(torch.load(path)['model'])

    test_data = Data.DataLoader(
        dataset=SentenceDataset('test.csv', vocab, True), batch_size=64,
        shuffle=False, collate_fn=pred_collate_fn, num_workers=4
    )
    model.eval()
    rewrite_indexes = []
    cut_length = []
    for batch in test_data:
        data, masks, length = batch['fail_sent'], batch['mask'], batch['length']
        if cuda:
            data, masks = data.to(device), masks.to(device)
        output = model(data).squeeze(2)
        probs = (torch.sigmoid(output) > 0.6)
        for i in range(data.shape[0]):
            check = ','.join([reverse_vocab[idx.item()] for idx in data[i]])
            if check == 'Nb,P,Neu,Nf,Na,VG,Neu,Nf,Ng':
                pdb.set_trace()
            d, l, p = data[i], length[i], probs[i]
            one_hot = p.tolist()[:l]
            index = [i for i in range(len(one_hot)) if one_hot[i] == 1]
            cut_length.append(len(one_hot)-len(index))
            rewrite_indexes.append(index)
    
    output = {}
    answers = []
    with open('answer.txt', 'r') as file:
        for line in file:
            answers.append(line.split('\n')[0])

    plt.hist(cut_length, histtype='stepfilled', alpha=0.3, bins=list(set(cut_length)))
    plt.savefig('cut_length_rouge_transformer.png')
    df = pd.read_csv('test.csv')
    for i, row in df.iterrows():
        index = rewrite_indexes[i]
        word_list = row['Original'].split(',')
        mapping = row['Mapping']
        sent = [word_list[ind] for ind in index]
        if mapping not in output:
            output[mapping] = [sent]
        else:
            output[mapping].append(sent)
    
    with open('rewrite_index.txt', 'w') as file:
        for key, value in output.items():
            out = ""
            for sent in value:
                out += ''.join(sent)+','
            try:
                out = out[:-1] + '?\t' + answers[key] + '\n'
            except:
                pdb.set_trace()
            file.write(out)
Example #15
def main():
    train_data = SentenceDataset(args.train_file,
                                 encoding_type=args.encoding_type,
                                 filter_threshold=args.filter_threshold)
    val_data = SentenceDataset(args.val_file,
                               encoding_type=args.encoding_type,
                               filter_threshold=args.filter_threshold)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               args.batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, args.batch_size)

    print(len(train_loader))

    input_dim = len(train_data.vocab.source_vocab)
    output_dim = len(train_data.vocab.target_vocab)
    static = args.embedding_type == 'static'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    enc_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len,
                               device, static)
    encoder_layer = EncoderLayer(args.hidden_dim, args.num_enc_heads,
                                 args.inner_dim, args.dropout)
    encoder = Encoder(enc_embedding, encoder_layer, args.num_enc_layers,
                      args.dropout)

    dec_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len,
                               device, static)
    decoder_layer = DecoderLayer(args.hidden_dim, args.num_dec_heads,
                                 args.inner_dim, args.dropout)
    decoder = Decoder(output_dim, args.hidden_dim, dec_embedding,
                      decoder_layer, args.num_dec_layers, args.dropout)

    pad_id = train_data.vocab.source_vocab['<pad>']

    model = Transformer(encoder, decoder, pad_id, device)

    print('Transformer has {:,} trainable parameters'.format(
        count_parames(model)))

    if args.load_model is not None:
        model.load(args.load_model)
    else:
        model.apply(init_weights)

    if args.mode == 'test':
        inferencer = Inferencer(model, train_data.vocab, device)
        greedy_out = inferencer.infer_greedy(
            'helo world, I m testin a typo corector')
        print(greedy_out)

    elif args.mode == 'train':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        loss_function = nn.NLLLoss(ignore_index=pad_id)

        print('Started training...')
        train(model, train_loader, val_loader, optimizer, loss_function,
              device)

    else:
        raise ValueError('Mode not recognized')