コード例 #1
0
ファイル: dataset.py プロジェクト: kevinb22/BinarizedNMT
def save_small_shard_vocab() -> None:
    '''
    Build the English and French vocabularies from the small training
    shard and write each one to its own vocab file.
    '''
    shard = ShardedCSVDataset(WMT14_EN_FR_SMALL_TRAIN_SHARD)

    # Items are indexed positionally: element 0 feeds the English vocab,
    # element 1 feeds the French vocab.
    english_vocab = build_vocab(shard, lambda pair: pair[0], unk_cutoff=2)

    # reset() is called between the two passes over the same dataset
    # (presumably to rewind it -- confirm against ShardedCSVDataset).
    shard.reset()

    french_vocab = build_vocab(shard, lambda pair: pair[1], unk_cutoff=2)

    save_vocab(english_vocab, SMALL_TRAIN_EN_VOCAB_FILE)
    save_vocab(french_vocab, SMALL_TRAIN_FR_VOCAB_FILE)
コード例 #2
0
def main(config_file='config/bert_config.json'):
    """Run training end to end from a JSON config file.

    Reads the config, calls ``get_path`` on the experiment and log paths,
    builds the data loaders and the model, runs the trainer and saves the
    returned state dict as ``model.bin`` under ``config.model_path``.

    Args:
        config_file: path to the JSON config file (in the config dir).
    """
    # 0. Load config and mkdir
    with open(config_file) as config_handle:
        config = json.load(config_handle,
                           object_hook=lambda d: SimpleNamespace(**d))
    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)
    vocab_path = os.path.join(config.model_path, 'vocab.txt')
    if config.model_type == 'rnn':  # build vocab for rnn
        build_vocab(file_in=config.all_train_file_path, file_out=vocab_path)

    # 1. Load data
    data = Data(vocab_file=vocab_path,
                max_seq_len=config.max_seq_len,
                model_type=config.model_type)
    train_set, valid_set_train, valid_set_valid = (
        data.load_train_and_valid_files(
            train_file=config.train_file_path,
            valid_file=config.valid_file_path))

    # With CUDA the run is distributed (NCCL); otherwise it is a plain
    # single-process CPU run with random sampling.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    if use_cuda:
        torch.distributed.init_process_group(backend="nccl")
        sampler_train = DistributedSampler(train_set)
    else:
        sampler_train = RandomSampler(train_set)

    batch_size = config.batch_size
    data_loader = {
        'train': DataLoader(train_set,
                            sampler=sampler_train,
                            batch_size=batch_size),
        'valid_train': DataLoader(valid_set_train,
                                  batch_size=batch_size,
                                  shuffle=False),
        'valid_valid': DataLoader(valid_set_valid,
                                  batch_size=batch_size,
                                  shuffle=False),
    }

    # 2. Build model
    model = MODEL_MAP[config.model_type](config)
    model.to(device)
    if use_cuda:
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)

    # 3. Train
    trainer = Trainer(model=model,
                      data_loader=data_loader,
                      device=device,
                      config=config)
    best_model_state_dict = trainer.train()

    # 4. Save model
    model_file = os.path.join(config.model_path, 'model.bin')
    torch.save(best_model_state_dict, model_file)
コード例 #3
0
ファイル: dataset.py プロジェクト: n0obcoder/paper
    def __init__(self, data_path, train=False, longest_sequence_length=None):
        """Load both style files, sort them by length and attach a vocabulary.

        Args:
            data_path: path prefix; sentences are read from
                ``data_path + '.0'`` and ``data_path + '.1'``.
            train: when True, the vocab file at ``cfg.vocab`` is built first
                if it does not exist yet.
            longest_sequence_length: stored on the instance as given; when
                it is None, ``self.update_the_max_length()`` is called.
        """
        banner_top = '\n------------------------ Building a Dataset ------------------------'
        banner_bottom = '--------------------------------------------------------------------'

        data0 = load_sent(data_path + '.0')
        data1 = load_sent(data_path + '.1')
        print(banner_top)
        # Each file yields a list of tokenized sentences (list of list of words).
        print(f'#sents of {data_path}.0 file 0: {len(data0)}')
        print(f'#sents of {data_path}.1 file 1: {len(data1)}')

        # data0 is all neg (label 0), data1 is all pos (label 1)
        self.data_all = data0 + data1
        self.style_list = [0] * len(data0) + [1] * len(data1)

        # Order sentences from longest to shortest, keeping every label
        # attached to its sentence.
        ordered_pairs = sorted(zip(self.data_all, self.style_list),
                               key=lambda pair: len(pair[0]),
                               reverse=True)
        self.data_all, self.style_list = [
            list(column) for column in zip(*ordered_pairs)
        ]

        print(f'len(self.data_all)  : {len(self.data_all)}')
        print(f'len(self.style_list): {len(self.style_list)}')

        if train:
            print('\ntrain: True')
            if os.path.isfile(cfg.vocab):
                print(f'{cfg.vocab} already exists')
            else:
                print(f'{cfg.vocab} does not exist')
                print('Building Vocab...')
                build_vocab(data0 + data1, cfg.vocab)

        self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
        print('\nvocabulary size:', self.vocab.size)
        embedding_shape = self.vocab.embedding.shape
        print(f'vocabulary embedding matrix shape: {embedding_shape}')

        self.longest_sequence_length = longest_sequence_length
        if longest_sequence_length is None:
            self.update_the_max_length()

        print(f'self.longest_sequence_length: {self.longest_sequence_length}')
        print(banner_bottom)
コード例 #4
0
    def _build_main_vocab(self, min_vocab_count):
        """Build the main vocabulary from every file in ``self._vocab_files``.

        Each file is read in full (decoding errors are ignored), tokenized
        with ``tokenizer.tokenize`` in ``self._mode``, and the combined token
        stream is passed to ``vocab.build_vocab`` with ``min_vocab_count``.
        """
        def iter_tokens():
            # Only the path of each (path, lang) pair is needed here.
            for source_path, _lang in self._vocab_files:
                with open(source_path, errors='ignore') as handle:
                    text = handle.read()
                yield from tokenizer.tokenize(text, self._mode)

        return vocab.build_vocab(iter_tokens(), min_vocab_count)
コード例 #5
0
    def __init__(self, arg, load_model=False):
        """Create the vocabulary, datasets, model and optimizer for a run.

        Args:
            arg: configuration object; stored as ``self.args`` and handed to
                every builder below.
            load_model: when True, ``self.load_model()`` is called after
                everything else has been set up.
        """
        # Progress counters and best-so-far accuracies start out empty.
        self._epoch = 0
        self._iter = 0
        self.max_val_acc = None
        self.max_test_acc = None

        self.args = arg
        settings = self.args
        self.vocab = build_vocab(settings)
        self.embedding_matrix = build_embedding_matrix(settings, self.vocab)

        # One dataset per split, all sharing the same embedding matrix.
        embeddings = self.embedding_matrix
        self.dataset_train = Dataset(settings, 'train', embeddings)
        self.dataset_val = Dataset(settings, 'val', embeddings)
        self.dataset_test = Dataset(settings, 'test', embeddings)

        self.model = Model(settings)
        self.optimizer = optim.Adam(self.model.parameters())

        if load_model:
            self.load_model()
コード例 #6
0
def create_datasets():
    """Parse the CoNLL-2003 train/valid files and build the train vocabularies.

    Returns:
        A ``(train_dataset, valid_dataset, train_vocab, output_categories)``
        tuple; both vocabularies are built from the training set only.
    """
    def parse_with_progress(dataset, split_name):
        # Announce, parse, then confirm completion for one split.
        print('processing {} dataset'.format(split_name))
        dataset.parse_file()
        print('finished processing {} dataset'.format(split_name))

    train_dataset = conlldataloader.ConllDataSet(constants.CONLL2003_TRAIN)
    valid_dataset = conlldataloader.ConllDataSet(constants.CONLL2003_VALID)

    parse_with_progress(train_dataset, 'train')

    print('build training vocab')
    train_vocab = vocab.build_vocab(train_dataset.word_list)
    print('done building vocab')

    print('build output vocab')
    output_categories = vocab.build_output_vocab(train_dataset.categories)
    print('done building output vocab')

    # The validation file is parsed only after the vocabularies exist.
    parse_with_progress(valid_dataset, 'valid')

    return train_dataset, valid_dataset, train_vocab, output_categories
コード例 #7
0
def create_datasets() -> CADECLoadedDatasetType:
    """Parse the train/valid files and build the vocabularies from train.

    The two file locations are the first two values of ``get_constants()``.

    Returns:
        A ``(train_dataset, valid_dataset, train_vocab, output_categories)``
        tuple; both vocabularies are built from the training set only.
    """
    def parse_with_progress(dataset, split_name):
        # Announce, parse, then confirm completion for one split.
        print('processing {} dataset'.format(split_name))
        dataset.parse_file()
        print('finished processing {} dataset'.format(split_name))

    train_conll, valid_conll, _, _, _, _ = get_constants()
    train_dataset = SCIERCDataset(train_conll)
    valid_dataset = SCIERCDataset(valid_conll)

    parse_with_progress(train_dataset, 'train')
    parse_with_progress(valid_dataset, 'valid')

    print('build training vocab')
    train_vocab = vocab.build_vocab(train_dataset.word_list)
    print('done building vocab')

    print('build output vocab')
    output_categories = vocab.build_output_vocab(train_dataset.categories)
    print('done building output vocab')

    return train_dataset, valid_dataset, train_vocab, output_categories
コード例 #8
0
ファイル: train.py プロジェクト: dakk/ml-experiments

def load_model():
    """Load and return the Keras model stored at ``BEST_MODEL_FILE``.

    The project's custom layer classes are supplied through
    ``custom_objects`` under their own class names.
    """
    custom_layers = {
        'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
        'TransformerBlock': TransformerBlock,
    }
    return keras.models.load_model(BEST_MODEL_FILE,
                                   custom_objects=custom_layers)


batch_size = 32

# build_vocab returns three values, unpacked here as the text dataset and the
# two lookups (index -> word, word -> index); word_to_index is what
# tokenizeString reads.
text_ds, index_to_word, word_to_index = build_vocab(directories, batch_size,
                                                    vocab_size, maxlen)


def tokenizeString(s):
    """Map every whitespace-separated word of ``s`` to its vocabulary index.

    Words that are missing from ``word_to_index`` are mapped to index 1.
    """
    return [word_to_index.get(word, 1) for word in s.split()]


class TextGenerator(keras.callbacks.Callback):
    def __init__(self, max_tokens, start_tokens):
        """Store the generation settings on the callback instance."""
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        # k passed to tf.math.top_k in sample_from.
        self.k = 10

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
コード例 #9
0
def load_words(dataset, cutoff=1):
    """Build and return the vocabulary for ``dataset``.

    Thin wrapper that forwards ``dataset`` and ``cutoff`` to ``build_vocab``.
    """
    return build_vocab(dataset, cutoff)
コード例 #10
0
from dataset import load_data, load_data_to_one_str
import torch
from torch.autograd import Variable
from utils import similarity, mean_vectors
from torch.nn import Embedding

# Locations of the data and the embedding files.
data = 'data'
emb_dir = 'embedding/'
emb_file = 'wiki.pl'

# Model dimensions.
embedding_dim = 300
hidden_dim = 50
sentence_length = 8

# Write the vocabulary file from the embedding's .vec file, then load it.
vocab_file = 'tmp/vocab.txt'
build_vocab([f'{emb_dir}{emb_file}.vec'], vocab_file)
vocab = Vocab(filename=vocab_file)

emb: Embedding = load_embedding_model(
    data=data,
    emb_dir=emb_dir,
    emb_file=emb_file,
    vocab=vocab,
    input_dim=embedding_dim,
)
model = LSTMEmbeddings(
    embedding_dim,
    hidden_dim,
    vocab.size(),
    sentence_length,
    emb,
)

# Encode the two text files with the loaded vocabulary.
encoded_file_animals = load_data(f'{data}/animals.txt', vocab, sentence_length)
encoded_file_buildings = load_data(f'{data}/buildings.txt', vocab, sentence_length)

# encoded_file_batman_beyond = \
#     load_data_to_one_str(data+"/shows/Batman Beyond/batman_beyond_-_1x01_-_rebirth_-_part_1_.vpc.txt", vocab, sentence_length)
#
# encoded_file_batman_animated = \
#     load_data_to_one_str(data + "/shows/Batman: The Animated Series/01. On Leather Wings.txt", vocab, sentence_length)

# encoded_file_dharma_greg = \
コード例 #11
0
        train3 = load_sent(args.train + '.3', args.max_train_size)
        train4 = load_sent(args.train + '.4', args.max_train_size)
        train5 = load_sent(args.train + '.5', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)
        print '#sents of training file 2:', len(train2)
        print '#sents of training file 3:', len(train3)
        print '#sents of training file 4:', len(train4)
        print '#sents of training file 5:', len(train5)

        # Six training files (train0..train5) are in scope here, not three. Train once with 0-1 and once with 0-2.

        print("=====got here training=====")
        # grand vocabulary
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1 + train2 + train3 + train4 + train5,
                        args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary 1 size:', vocab.size

    # introduce a second input argument, "vocab2"

    # vocab2 = Vocabulary(args.vocab2, args.embedding, args.dim_emb)
    # print 'vocabulary 2 size:', vocab2.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')
        dev2 = load_sent(args.dev + '.2')
        dev3 = load_sent(args.dev + '.3')
        dev4 = load_sent(args.dev + '.4')
コード例 #12
0
ファイル: baseline.py プロジェクト: ankurdhoot/CS224U
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import vocab
from dataset import SNLITrainDataset, SNLIDevDataset
import torch.optim as optim
from sklearn.metrics import classification_report
import numpy as np

# Integer class id assigned to each label string.
LABEL_MAP = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
# Built once at import time by the project's vocab module.
VOCAB = vocab.build_vocab()
# setting device on GPU if available, else CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)


class Baseline(nn.Module):
    def __init__(self,
                 embed_size=100,
                 hidden_size=100,
                 projection_dim=200,
                 dropout_rate=0.2):
        """
        Args:
            embed_size : int
                Size of input embeddings
            hidden_size : int
                Size of the LSTM hidden state
            projection_dim: int
                Size of the linear projection layers
            dropout_rate: float
コード例 #13
0
ファイル: train.py プロジェクト: awesome-archive/CAIL
def main(config_file='config/bert_config.json'):
    """Main method for training on an XLA device.

    Loads the JSON config, builds the data loaders, then runs one training
    pass followed by one validation pass per epoch for ``FLAGS.num_epoch``
    epochs.

    Args:
        config_file: path to the JSON config file (in the config dir).

    Returns:
        The validation accuracy (in percent) of the last epoch, or 0.0 when
        ``FLAGS.num_epoch`` is 0.
    """
    global datasets
    # 0. Load config and mkdir
    with open(config_file) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))

    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)
    vocab_file = os.path.join(config.model_path, 'vocab.txt')
    if config.model_type in ['rnn', 'lr', 'cnn']:  # build vocab for these models
        build_vocab(file_in=config.all_train_file_path, file_out=vocab_file)

    # 1. Load data
    data = Data(vocab_file=vocab_file,
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)

    def load_dataset():
        return data.load_train_and_valid_files(
            train_file=config.train_file_path,
            valid_file=config.valid_file_path)

    if config.serial_load:
        datasets = SERIAL_EXEC.run(load_dataset)
    else:
        datasets = load_dataset()

    train_set, valid_set_train, valid_set_valid = datasets

    # TPU: the device and the sampler always come from XLA. The former
    # CUDA/CPU assignments were overwritten unconditionally by these lines,
    # so they have been dropped.
    device = xm.xla_device()
    sampler_train = torch.utils.data.distributed.DistributedSampler(
        train_set,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)

    data_loader = {
        'train':
        DataLoader(train_set,
                   sampler=sampler_train,
                   batch_size=config.batch_size),
        'valid_train':
        DataLoader(valid_set_train,
                   batch_size=config.batch_size,
                   shuffle=False),
        'valid_valid':
        DataLoader(valid_set_valid,
                   batch_size=config.batch_size,
                   shuffle=False)
    }

    # 2. Build model
    model = WRAPPED_MODEL
    model.to(device)

    # 3. Train
    # The Trainer is still constructed (its constructor may have side
    # effects), but the training loop below is used instead of
    # trainer.train().
    trainer = Trainer(model=model,
                      data_loader=data_loader,
                      device=device,
                      config=config)

    if config.model_type == 'bert':
        no_decay = ['bias', 'gamma', 'beta']
        # The per-group key must be 'weight_decay'. The previous
        # 'weight_decay_rate' key is not read by the AdamW that accepts
        # correct_bias, so both groups silently fell back to the
        # optimizer-wide default below.
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_parameters,
                          lr=config.lr,
                          betas=(0.9, 0.999),
                          weight_decay=1e-8,
                          correct_bias=False)
    else:  # rnn
        optimizer = Adam(model.parameters(), lr=config.lr)

    # NOTE: no learning-rate scheduler is attached to the optimizer.

    criterion = nn.CrossEntropyLoss()

    def train_loop_fn(loader):
        """Run one training pass over ``loader``."""
        tracker = xm.RateTracker()
        model.train()
        for x, batch in enumerate(loader):
            output = model(*batch[:-1])  # the last one is label
            loss = criterion(output, batch[-1])
            loss.backward()

            # NOTE(review): the loaders use config.batch_size while the rate
            # tracker uses FLAGS.batch_size -- confirm the two agree.
            tracker.add(FLAGS.batch_size)
            if (x + 1) % config.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.max_grad_norm)
                # Gradient accumulation: several backward passes run before
                # the optimizer step, so the gradients add up in
                # parameter.grad and the parameters are updated once with the
                # accumulated gradient.
                xm.optimizer_step(optimizer)
                optimizer.zero_grad()

            if xm.get_ordinal() == 0 and x % FLAGS.log_steps == 0:
                print(
                    '[xla:{}]({}) Loss={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}'
                    .format(xm.get_ordinal(), x, loss.item(),
                            tracker.rate(), tracker.global_rate(),
                            time.asctime()),
                    flush=True)

    def test_loop_fn(loader):
        """Evaluate on ``loader``; return (accuracy %, data, pred, target)."""
        total_samples = 0
        correct = 0
        model.eval()
        data, pred, target = None, None, None
        tracker = xm.RateTracker()
        # Evaluation needs no gradients; skipping them avoids building the
        # autograd graph for every batch.
        with torch.no_grad():
            for x, batch in enumerate(loader):
                output = model(*batch[:-1])  # the last one is label
                target = batch[-1]
                for i, logits in enumerate(output):
                    pred = int(torch.argmax(logits, dim=-1))
                    if pred == target[i]:
                        correct += 1
                total_samples += len(output)

                if xm.get_ordinal() == 0 and x % FLAGS.log_steps == 0:
                    print(
                        '[xla:{}]({}) Acc={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}'
                        .format(xm.get_ordinal(), x,
                                correct * 1.0 / total_samples, tracker.rate(),
                                tracker.global_rate(), time.asctime()),
                        flush=True)

        # An empty loader would otherwise divide by zero.
        accuracy = 100.0 * correct / total_samples if total_samples else 0.0
        if xm.get_ordinal() == 0:
            print('[xla:{}] Accuracy={:.2f}%'.format(xm.get_ordinal(),
                                                     accuracy),
                  flush=True)
        return accuracy, data, pred, target

    # Train and eval loops
    accuracy_valid = 0.0  # returned unchanged when no epoch runs
    for epoch in range(FLAGS.num_epoch):
        # A shuffling DistributedSampler repeats the same order every epoch
        # unless it is told which epoch is starting.
        sampler_train.set_epoch(epoch)
        para_loader = pl.ParallelLoader(data_loader['train'], [device])
        train_loop_fn(para_loader.per_device_loader(device))
        xm.master_print("Finished training epoch {}".format(epoch))

        para_loader = pl.ParallelLoader(data_loader['valid_valid'], [device])
        accuracy_valid, _, _, _ = test_loop_fn(
            para_loader.per_device_loader(device))
        xm.master_print("Finished test epoch {}, valid={:.2f}".format(
            epoch, accuracy_valid))

        if FLAGS.metrics_debug:
            xm.master_print(met.metrics_report())

        # 4. Save model
        # NOTE: checkpointing is currently disabled; nothing is written here.

    return accuracy_valid
コード例 #14
0
        print('Loading model from', os.path.join(args.classifier_path,
                                                 'model'))
        model.saver.restore(sess, os.path.join(args.classifier_path, 'model'))
    else:
        print('Creating model with fresh parameters.')
        sess.run(tf.global_variables_initializer())
    if not os.path.exists(args.classifier_path):
        os.makedirs(args.classifier_path)
    return model


if __name__ == '__main__':
    args = load_arguments()

    # Build the vocab file only when it is not on disk yet.
    if not os.path.isfile(args.vocab):
        build_vocab(args.train_path, args.vocab, lang=args.lang)
    vocab = Vocabulary(args.vocab)
    print('vocabulary size', vocab.size)

    loader = ClassificationBatcher(args, vocab)

    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        model = create_model(sess, args, vocab)

        batches = loader.get_batches(mode='train')

        start_time = time.time()
        loss = 0.0
コード例 #15
0
ファイル: main.py プロジェクト: katha-shah/iksha
def main():
    """Optionally train the caption decoder, then caption one validation image.

    The three flags at the top select which stages run: rebuilding the
    vocabulary, re-encoding the training images, and training the decoder.
    The decoder weights are always loaded from a fixed checkpoint file first.
    """
    construct_vocab = False
    encode_images = False
    train = True

    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    # Construct vocabulary
    if construct_vocab:
        # get all caption to construct Vocab
        all_captions = data.get_all_captions()
        vocab = build_vocab(vocab_path=cfg.data_path,
                            vocab_name=cfg.vocab_name,
                            captions=all_captions,
                            threshold=2)
    else:
        vocab = load_vocab(vocab_path=cfg.data_path, vocab_name=cfg.vocab_name)
    inception_encoding = Encoder()

    # train data
    if train:
        train_images = data.get_train_images(cfg.train_image_files)
        train_pairs = [
            ImgCaptionPair(img_id, captions_dict[img_id])
            for img_id in train_images
        ]

        # Image Encoding: either recompute the encodings or load them from
        # the encoding file.
        if encode_images:
            train_img_encoding = inception_encoding.encode_images(
                file_path=cfg.images_path,
                image_list=train_images,
                encoding_file=cfg.train_img_encoding_file)
        else:
            train_img_encoding = inception_encoding.load_image_encoding(
                encoding_file=cfg.train_img_encoding_file)

        train_data_generator = data_generator(vocab,
                                              train_pairs,
                                              train_img_encoding,
                                              batch_size=1800,
                                              max_len=caption_maxlen)

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    decoder_model.load_weights('best_weights.97-0.95.hdf5')

    if train:
        decoder_model.compile(loss='categorical_crossentropy',
                              optimizer=RMSprop(),
                              metrics=['accuracy'])
        ckpt = ModelCheckpoint('weights.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=30)
        best_ckpt = ModelCheckpoint('best_weights.{epoch:02d}-{loss:.2f}.hdf5',
                                    monitor='loss',
                                    verbose=0,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    mode='auto',
                                    period=1)
        decoder_model.fit_generator(train_data_generator,
                                    steps_per_epoch=30,
                                    epochs=100,
                                    callbacks=[ckpt, best_ckpt])

    decoder_model.save('decoder_model.h5')

    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[9]

    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)

    # Greedy decoding: feed the caption so far, append the most likely next
    # word, stop on "<end>" or at the length limit.
    # The decoder and the training generator are both built with
    # caption_maxlen, so inference pads to that same length rather than to
    # a separate hard-coded value.
    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=caption_maxlen,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)

        if word_pred == "<end>" or len(caption) > caption_maxlen:
            break

    # Drop "<start>", and drop the final token only when it really is
    # "<end>"; when the loop stops on the length limit the last token is a
    # real word.
    words = caption[1:]
    if words and words[-1] == "<end>":
        words = words[:-1]

    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    print(' '.join(words))
コード例 #16
0
                       model.inputs: batch['inputs'],
                       model.targets: batch['targets'],
                       model.weights: batch['weights'],
                       model.dropout: 1})
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)

if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        # Build the vocab file only when it is not on disk yet.
        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size', vocab.size

    # Dev and test sentences are loaded only when their paths are given.
    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)

    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = create_model(sess, args, vocab)
        if args.train:
コード例 #17
0
import json
from dataset import CodeDataset
from vocab import build_vocab, write_vocab, Vocab
from nltk.stem import PorterStemmer
import pymongo

# NOTE(review): 'NOM_CLIENT' looks like a placeholder for the real MongoDB
# host/URI -- confirm before running.
client = pymongo.MongoClient('NOM_CLIENT')
db = client.codes 

# Module-level stemmer shared by add_voc.
stemmer = PorterStemmer()
def add_voc(vocab):
    """Return ``vocab`` extended with the stemmed keys of the two JSON files.

    The keys of ``data/cim.json`` come first, then those of
    ``data/ccam.json``. The argument itself is not modified; a new list is
    returned.
    """
    extra_terms = []
    for json_path in ('data/cim.json', 'data/ccam.json'):
        with open(json_path, 'r') as df:
            codes = json.load(df)
        extra_terms.extend(stemmer.stem(c) for c in codes.keys())
    return vocab + extra_terms

if __name__ == "__main__":
    # Build one vocabulary per code collection.
    dataset_cim = CodeDataset(db.cim)
    dataset_ccam = CodeDataset(db.ccam)
    vocab_cim = build_vocab(dataset_cim)
    vocab_ccam = build_vocab(dataset_ccam)

    # Combined vocabulary of both collections. It is bound to a name so it
    # can be reused below; the script previously passed an undefined `vocab`
    # to add_voc.
    vocab = vocab_cim + vocab_ccam
    write_vocab(vocab, "data/vocab.txt")

    # Same vocabulary extended by add_voc with the stemmed code keys.
    all_vocab = add_voc(vocab)
    write_vocab(all_vocab, "data/vocab_all.txt")
コード例 #18
0
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        # Two training files share the prefix args.train: '.0' and '.1'.
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        # Build the vocab file only when it is not on disk yet.
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    # Dev and test files follow the same '.0' / '.1' naming.
    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
        test0 = load_sent(args.test + '.0')
        test1 = load_sent(args.test + '.1')

    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
コード例 #19
0
ファイル: traint.py プロジェクト: ShenDezhou/AI_arch
def main(config_file='config/bert_config.json'):
    """Train the embedding model on an XLA device and score the embeddings.

    Loads the JSON config, rebuilds the vocabulary for the model types that
    need it, builds a distributed training loader and runs
    ``FLAGS.num_epoch`` epochs. On the master ordinal the first model
    parameter is exported to ``word2vec.txt`` (a "<vocab size> <dim>" header
    line followed by one "<word> <values...>" line per word) and scored with
    the module-level ``eval`` helper at the end of every epoch and after
    every 100000th batch.

    Args:
        config_file: Path to the JSON configuration file.

    Returns:
        Tuple ``(a_score, s_score)`` from the last epoch. Both are 0 when no
        evaluation ran (non-master ordinal, no epochs, or an empty loader).
    """
    # 0. Load config and mkdir
    with open(config_file) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))

    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)
    vocab_path = os.path.join(config.model_path, 'vocab.txt')
    if config.model_type in ['rnn', 'lr', 'cnn']:
        build_vocab(file_in=config.all_train_file_path, file_out=vocab_path)

    # 1. Load data
    data = Data(vocab_file=vocab_path,
                model_type=config.model_type,
                config=config)

    def load_dataset():
        train_dataset, collate_fn = data.load_train_and_valid_files(
            train_file=config.train_file_path)
        return train_dataset, collate_fn

    if config.serial_load:
        train_set, collate_fn = SERIAL_EXEC.run(load_dataset)
    else:
        train_set, collate_fn = load_dataset()

    # The original picked a CUDA/CPU device and a RandomSampler first and then
    # unconditionally overwrote both with the XLA setup; only the XLA path
    # ever took effect, so the dead branch is gone.
    device = xm.xla_device()
    sampler_train = torch.utils.data.distributed.DistributedSampler(
        train_set,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_loader = DataLoader(
        train_set,
        batch_size=config.batch_size,
        sampler=sampler_train,
        collate_fn=collate_fn,
        drop_last=True,
    )

    # 2. Build model
    # Keep the object returned by .to(): an nn.Module returns itself, while a
    # wrapper object may return the actual module placed on the device.
    # NOTE(review): confirm what WRAPPED_MODEL is.
    model = WRAPPED_MODEL.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.997)

    # Each loaded batch is processed in slices of this many rows.
    chunk_size = int(config.max_stem_size)

    def export_and_score():
        """Write the embeddings to word2vec.txt and return eval()'s scores."""
        # The model is expected to expose exactly two parameters; the first
        # one (transposed) is exported as the embedding matrix.
        W, _ = model.parameters()
        weights = W.T.detach().cpu().numpy()
        dic = data.tokenizer.dictionary
        words = [key for key, _ in sorted(dic.items(), key=lambda kv: kv[1])]
        # Column vector so it can be concatenated with the 2-D weights; the
        # original epoch-end copy of this code skipped the reshape.
        vocab = numpy.reshape(numpy.array(words), (-1, 1))
        w2v = numpy.concatenate((vocab, weights), axis=1)
        # Write the header first and stream the rows after it instead of
        # writing the file, reading it back and rewriting it.
        with open("word2vec.txt", 'w', encoding='utf-8', newline='') as file:
            file.write(
                str(len(vocab)) + " " + str(weights.shape[1]) + "\n")
            pandas.DataFrame(w2v).to_csv(file,
                                         sep=' ',
                                         header=False,
                                         index=False)
        return eval(config.analogy_valid_file_path,
                    config.similarity_valid_file_path)

    def checkpoint(epoch, step, loss, tracker):
        """Log the loss, export + score the embeddings, log the scores."""
        print('Epoch:', '%04d' % (epoch + 1), 'cost =',
              '{:.6f}'.format(loss))
        a_score, s_score = export_and_score()
        print(
            '[xla:{}]({}) anlogy:{:.6f},sim:{:.6f},Loss={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}'
            .format(xm.get_ordinal(), step, a_score, s_score,
                    loss.item(), tracker.rate(),
                    tracker.global_rate(), time.asctime()),
            flush=True)
        return a_score, s_score

    def train_loop_fn(loader, epoch):
        """Run one epoch over ``loader``; return the last (a_score, s_score)."""
        tracker = xm.RateTracker()
        model.train()
        is_master = xm.get_ordinal() == 0
        a_score, s_score = 0, 0
        step, loss = -1, None
        for step, batch in enumerate(loader):
            input_batch = batch['input_ids'].to(device)
            target_batch = batch['labels'].to(device)

            for start in range(0, input_batch.shape[0], chunk_size):
                chunk_ids = input_batch[start:start + chunk_size]
                chunk_targets = target_batch[start:start + chunk_size]

                # Multi-hot encode: row i gets 1.0 at every index listed in
                # chunk_ids[i].
                batch_input = torch.zeros(
                    (chunk_ids.shape[0], config.vocab_size),
                    dtype=torch.float32,
                    device=device)
                for row, ids in enumerate(chunk_ids):
                    batch_input[row, ids] = 1.0

                optimizer.zero_grad()
                output = model(batch_input)
                # output : [batch_size, voc_size], target : [batch_size]
                # (LongTensor, not one-hot)
                if config.hierarchical_softmax:
                    loss = torch.mean(model.hsoftmax(output, chunk_targets))
                else:
                    loss = criterion(output, chunk_targets)

                loss.backward()
                # NOTE(review): multi-core torch_xla training normally calls
                # xm.optimizer_step(optimizer) to all-reduce gradients; the
                # plain step() is kept from the original — confirm.
                optimizer.step()

            tracker.add(FLAGS.batch_size)
            # Drop the learning rate gradually (once per loaded batch).
            scheduler.step()

            # Periodic export, once per qualifying batch.
            if is_master and loss is not None and (step + 1) % 100000 == 0:
                a_score, s_score = checkpoint(epoch, step, loss, tracker)

        # End-of-epoch export. The original had this block, and the return,
        # inside the batch loop, so every epoch stopped after one batch.
        if is_master and loss is not None:
            a_score, s_score = checkpoint(epoch, step, loss, tracker)
        return a_score, s_score

    # Train and eval loops.
    # The original rebound `data` to None here, which broke the closure's
    # access to data.tokenizer; that assignment is removed.
    a_score, s_score = 0, 0
    for epoch in range(FLAGS.num_epoch):
        # Needed for the shuffling DistributedSampler to reshuffle each epoch.
        sampler_train.set_epoch(epoch)
        para_loader = pl.ParallelLoader(train_loader, [device])
        a_score, s_score = train_loop_fn(
            para_loader.per_device_loader(device), epoch)
        xm.master_print("Finished training epoch {}".format(epoch))

        if FLAGS.metrics_debug:
            xm.master_print(met.metrics_report())

    return a_score, s_score
コード例 #20
0
ファイル: style_transfer.py プロジェクト: webis-de/INLG-18
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer(), feed_dict={model.emb_init: vocab.embedding})
    return model

if __name__ == '__main__':
    args = load_arguments()

    #####   data preparation   #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab, args.min_count, args.source)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size
    
    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
        test0 = load_sent(args.test + '.0')
        test1 = load_sent(args.test + '.1')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
コード例 #21
0
def main(batch_size,
         embed_size,
         num_hiddens,
         num_layers,
         ln_hidden,
         ln_output,
         rec_unit,
         learning_rate=1e-4,
         log_step=10,
         num_epochs=50,
         save_step=100,
         ngpu=1):
    """Train the CNN -> fcNet -> RNN captioning pipeline.

    Builds the vocabulary and the train/validation loaders from
    ``relative_captions_shoes.json``, trains with Adam and cross-entropy,
    and every ``log_step`` steps runs the validation set and prints one
    sampled caption next to its target. Models and loss histories are saved
    every ``save_step`` steps and once more on exit (including Ctrl-C).

    Args:
        batch_size: Batch size for both loaders.
        embed_size: Size passed to ``CNN`` and as ``fcNet`` input size.
        num_hiddens: Hidden size passed to ``RNN``.
        num_layers: Number of layers passed to ``RNN``.
        ln_hidden: Hidden size passed to ``fcNet``.
        ln_output: ``fcNet`` output size, also the ``RNN`` input size.
        rec_unit: Recurrent unit selector forwarded to ``RNN``.
        learning_rate: Adam learning rate.
        log_step: Validate and print every this many steps.
        num_epochs: Number of epochs to train.
        save_step: Save a checkpoint every this many steps.
        ngpu: When > 1, encoder and fcNet run through
            ``nn.parallel.data_parallel`` on GPUs ``0..ngpu-1``.
    """
    checkpoint_dir = 'checkpoint'
    captions_path = 'relative_captions_shoes.json'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Image preprocessing (identical for training and validation).
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    transform = {'train': preprocess, 'val': preprocess}

    # Load data
    vocab = build_vocab(path=captions_path)
    _, train_loader = data_and_loader(path=captions_path,
                                      mode='train',
                                      vocab=vocab,
                                      transform=transform['train'],
                                      batch_size=batch_size)
    _, val_loader = data_and_loader(path=captions_path,
                                    mode='valid',
                                    vocab=vocab,
                                    transform=transform['val'],
                                    batch_size=batch_size)

    losses_val = []
    losses_train = []

    # Build the models
    initial_step = initial_epoch = 0

    # All three modules go to the same device. The original moved only the
    # encoder and decoder to CUDA and left `middle` on the CPU.
    encoder = CNN(embed_size).to(device)  ### embed_size: power of 2
    middle = fcNet(embed_size, ln_hidden, ln_output).to(device)
    decoder = RNN(ln_output,
                  num_hiddens,
                  len(vocab),
                  num_layers,
                  rec_unit=rec_unit,
                  drop_out=0.1).to(device)

    # Loss, parameters & optimizer.
    # `middle` is included: it sits between encoder and decoder and was
    # previously never updated although its gradients were computed.
    loss_fun = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(middle.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.batchnorm.parameters()))
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Pre-bind so the final save in `finally` cannot hit an unbound name when
    # training is interrupted (or has nothing to do) before the first step.
    step, epoch = initial_step, initial_epoch
    try:
        for epoch in range(initial_epoch, num_epochs):
            print('Epoch: {}'.format(epoch))
            for step, (images, captions,
                       lengths) in enumerate(train_loader, start=initial_step):

                # Set mini-batch dataset (inputs follow the models' device).
                images = images.to(device)
                captions = captions.to(device)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                decoder.zero_grad()
                middle.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # Run encoder and fcNet on multiple GPUs.
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    rnn_input = nn.parallel.data_parallel(
                        middle, features, range(ngpu))
                    # The decoder runs once on the gathered batch: the
                    # original passed `features` alone (wrong input, missing
                    # captions/lengths), and per-replica packed outputs would
                    # not line up with `targets`, which is packed over the
                    # whole batch.
                    outputs = decoder(rnn_input, captions, lengths)
                else:
                    # Run on a single device.
                    features = encoder(images)
                    rnn_input = middle(features)
                    outputs = decoder(rnn_input, captions, lengths)

                train_loss = loss_fun(outputs, targets)
                losses_train.append(train_loss.item())
                train_loss.backward()
                optimizer.step()

                # Run validation set and predict
                if step % log_step == 0:
                    encoder.batchnorm.eval()
                    batch_loss_val = []
                    # No gradients are needed for validation or sampling.
                    with torch.no_grad():
                        for images, captions, lengths in val_loader:
                            images = images.to(device)
                            captions = captions.to(device)
                            targets = pack_padded_sequence(
                                captions, lengths, batch_first=True)[0]
                            features = encoder(images)
                            rnn_input = middle(features)
                            outputs = decoder(rnn_input, captions, lengths)
                            val_loss = loss_fun(outputs, targets)
                            batch_loss_val.append(val_loss.item())

                        losses_val.append(np.mean(batch_loss_val))

                        # Predict from the most recent batch (the last
                        # validation batch when the loader is not empty).
                        sampled_ids = decoder.sample(rnn_input)
                    sampled_ids = sampled_ids.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(sampled_ids, vocab)
                    print('Sample:', sentence)

                    true_ids = captions.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(true_ids, vocab)
                    print('Target:', sentence)

                    print(
                        'Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}'
                        .format(epoch, step, losses_train[-1], losses_val[-1]))
                    encoder.batchnorm.train()

                # Save the models (same `utils` helpers as the final save).
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, middle, decoder, optimizer,
                                      step, epoch, losses_train, losses_val,
                                      checkpoint_dir)
                    utils.dump_losses(
                        losses_train, losses_val,
                        os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        # Ctrl-C ends training early; the final save below still runs.
        pass
    finally:
        # Do final save
        utils.save_models(encoder, middle, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
コード例 #22
0
        logger.info('-----Loading styler model from: %s.-----' %
                    os.path.join(args.styler_path, 'model'))
        model.saver.restore(sess, os.path.join(args.styler_path, 'model'))
    else:
        logger.info('-----Creating styler model with fresh parameters.-----')
        sess.run(tf.global_variables_initializer())
    if not os.path.exists(args.styler_path):
        os.makedirs(args.styler_path)
    return model


if __name__ == '__main__':
    args = load_arguments()

    if not os.path.isfile(args.vocab):
        build_vocab(args.train_path, args.vocab)
    vocab = Vocabulary(args.vocab)
    logger.info('vocabulary size: %d' % vocab.size)

    # use tensorboard
    if args.suffix:
        tensorboard_dir = os.path.join(args.logDir, 'tensorboard', args.suffix)
    else:
        tensorboard_dir = os.path.join(args.logDir, 'tensorboard')
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    write_dict = {
        'writer':
        tf.summary.FileWriter(logdir=tensorboard_dir,
                              filename_suffix=args.suffix),
        'step':
コード例 #23
0
 def _build_labels_vocab(self):
     """Build the label vocabulary from ``self._vocab_files``.

     Collects the distinct second elements of the ``(path, lang)`` pairs and
     hands them to ``vocab.build_vocab`` with ``min_count=0``.
     """
     distinct_langs = {entry_lang for _path, entry_lang in self._vocab_files}
     return vocab.build_vocab(distinct_langs, min_count=0)
コード例 #24
0
    data0 = load_sent(path + 'formal' + suffix)
    data1 = load_sent(path + 'informal' + suffix)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)


if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train_x, train_y = prepare(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train_x, args.vocab)

    # prepare vocabulary
    # we set the embeding dimension
    # we read a pickel file (presumably with the data?)
    # randomly initialize the vector
    # normalize the random vectors
    # embedings are normalized
    vocab = Vocabulary(args.vocab)
    print('vocabulary size', vocab.size)

    # prepare datasets:
    # read form file,
    # zip
    # order them
    if args.dev:
コード例 #25
0
def run_model(args):
    time = datetime.now().timestamp()

    #####   data preparation   #####
    if args.train:

        logger, saves_dir = utils.init_logging(args, time)

        print("args: ", args)
        logger.info("args: " + str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)
        train1 = load_sent(args.train + '.1', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))

        # build vocab for every run
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)

    dev0 = []
    dev1 = []

    if args.dev:
        dev0 = load_sent(args.dev + '.0', -1, args.max_seq_length,
                         args.sentence_flag)
        dev1 = load_sent(args.dev + '.1', -1, args.max_seq_length,
                         args.sentence_flag)

    if args.predict:
        if args.model_path:
            # logger.info("Predicting a sample input\n---------------------\n")
            device = torch.device(
                "cuda:" +
                str(args.cuda_device) if torch.cuda.is_available() else "cpu")
            model = torch.load(args.model_path, map_location=device)
            model.training = False
            output = utils.predict(model, args.predict, args.target_sentiment,
                                   args.beam)
            print(
                f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}"
            )
            # logger.info(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
    if args.test:
        logger, saves_dir = utils.init_logging(args, time)

        print("args: ", args)
        logger.info("args: " + str(args))
        device = torch.device(
            "cuda:" +
            str(args.cuda_device) if torch.cuda.is_available() else "cpu")
        file0 = open(args.test + ".0", "r")
        file1 = open(args.test + ".1", "r")
        saves_path = os.path.join(args.saves_path,
                                  utils.get_filename(args, time, ""))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        out_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"),
                          "w")
        out_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"),
                          "w")
        model = torch.load(args.model_path, map_location=device)
        model.training = False

        for line in file0:
            line = line.strip("\n")
            output = utils.predict(model, line, 1, args.beam)
            out_file_0.write(output + "\n")

        for line in file1:
            line = line.strip("\n")
            output = utils.predict(model, line, 0, args.beam)
            out_file_1.write(output + "\n")

    if args.train:
        summ_filename = 'runs/cross-alignment/' + utils.get_filename(
            args, time, "summary")
        writer = SummaryWriter(summ_filename)

        model = get_model(args, vocab, logger)
        model.train_max_epochs(saves_dir,
                               args,
                               train0,
                               train1,
                               dev0,
                               dev1,
                               vocab,
                               no_of_epochs,
                               writer,
                               time,
                               save_epochs_flag=True)
コード例 #26
0
    #####   data preparation   #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
          args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)

        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
          args.latent_dev
        dev0 = load_sent(chosen + 'formal')
        dev1 = load_sent(chosen + 'informal')

    if args.test or args.latent_test:
        chosen = args.test if len(args.test) > len(args.latent_test) else \
          args.latent_test
        test0 = load_sent(chosen + 'formal')
        test1 = load_sent(chosen + 'informal')
コード例 #27
0
def main(args):
    """Train the image-captioning encoder/decoder and checkpoint every epoch.

    Builds or loads the vocabulary, optionally initialises the embedding
    matrix from pickled GloVe vectors, builds the dataset and both models,
    optionally resumes from ``args.ckpt_path`` and then trains for
    ``args.num_epochs`` epochs, writing one checkpoint per epoch to
    ``args.models_dir``.

    Args:
        args: Parsed command-line namespace. ``args.embed_size`` is
            overwritten with 300 when GloVe embeddings are used.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on {device}")

    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(args.models_dir, exist_ok=True)

    # NOTE(security): pickle.load executes arbitrary code from the file; the
    # vocabulary and GloVe pickles below must come from a trusted source.
    if args.build_vocab:
        print(
            f"Building vocabulary from captions at {args.captions_json} and with count threshold={args.threshold}"
        )
        vocab_object = build_vocab(args.captions_json, args.threshold)
        with open(args.vocab_path, "wb") as vocab_f:
            pickle.dump(vocab_object, vocab_f)
        print(
            f"Saved the vocabulary object to {args.vocab_path}, total size={len(vocab_object)}"
        )
    else:
        with open(args.vocab_path, 'rb') as f:
            vocab_object = pickle.load(f)
        print(
            f"Loaded the vocabulary object from {args.vocab_path}, total size={len(vocab_object)}"
        )

    if args.glove_embed_path is not None:
        with open(args.glove_embed_path, 'rb') as f:
            glove_embeddings = pickle.load(f)
        print(
            f"Loaded the glove embeddings from {args.glove_embed_path}, total size={len(glove_embeddings)}"
        )

        # We are using 300d glove embeddings
        args.embed_size = 300

        weights_matrix = np.zeros((len(vocab_object), args.embed_size))

        # Words missing from GloVe get a random normal vector instead.
        for word, index in vocab_object.word2index.items():
            if word in glove_embeddings:
                weights_matrix[index] = glove_embeddings[word]
            else:
                weights_matrix[index] = np.random.normal(
                    scale=0.6, size=(args.embed_size, ))

        weights_matrix = torch.from_numpy(weights_matrix).float().to(device)

    else:
        weights_matrix = None

    img_transforms = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    train_dataset = cocoDataset(args.image_root, args.captions_json,
                                vocab_object, img_transforms)
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn)

    encoder = Encoder(args.resnet_size, (3, 224, 224),
                      args.embed_size).to(device)
    decoder = Decoder(args.rnn_type, weights_matrix, len(vocab_object),
                      args.embed_size, args.hidden_size).to(device)

    # Only the encoder's linear layer and the decoder's rnn/linear layers are
    # optimised; the decoder embedding is added only when it was not
    # initialised from GloVe.
    encoder_learnable = list(encoder.linear.parameters())
    decoder_learnable = list(decoder.rnn.parameters()) + list(
        decoder.linear.parameters())
    if args.glove_embed_path is None:
        decoder_learnable = decoder_learnable + list(
            decoder.embedding.parameters())

    criterion = nn.CrossEntropyLoss()
    params = encoder_learnable + decoder_learnable
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_epoch = 0

    if args.ckpt_path is not None:
        # map_location lets a checkpoint saved on GPU be resumed on a
        # CPU-only machine (and vice versa).
        model_ckpt = torch.load(args.ckpt_path, map_location=device)
        start_epoch = model_ckpt['epoch'] + 1
        prev_loss = model_ckpt['loss']
        encoder.load_state_dict(model_ckpt['encoder'])
        decoder.load_state_dict(model_ckpt['decoder'])
        optimizer.load_state_dict(model_ckpt['optimizer'])
        print(
            f"Loaded model and optimizer state from {args.ckpt_path}; start epoch at {start_epoch}; prev loss={prev_loss}"
        )

    # len() of a DataLoader is the number of batches per epoch.
    total_batches = len(train_dataloader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_dataloader):
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True).data

            image_embeddings = encoder(images)
            outputs = decoder(image_embeddings, captions, lengths)

            loss = criterion(outputs, targets)

            decoder.zero_grad()
            encoder.zero_grad()

            loss.backward()
            optimizer.step()

            if i % args.log_interval == 0:
                loss_val = "{:.4f}".format(loss.item())
                perplexity_val = "{:5.4f}".format(np.exp(loss.item()))
                print(
                    f"epoch=[{epoch}/{args.num_epochs}], iteration=[{i}/{total_batches}], loss={loss_val}, perplexity={perplexity_val}"
                )

        # One checkpoint per epoch; 'loss' is the last batch's loss tensor.
        torch.save(
            {
                'epoch': epoch,
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'optimizer': optimizer.state_dict(),
                'loss': loss
            },
            os.path.join(args.models_dir,
                         'model-after-epoch-{}.ckpt'.format(epoch)))