Example no. 1
def main():
    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    print("Done")

    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
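    # If resuming, reuse the dicts stored in the checkpoint so vocabulary indices stay consistent.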
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    dicts = dataset['dicts']
    nSets = dicts['nSets']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    trainSets = dict()
    validSets = dict()
    for i in xrange(nSets):
        trainSets[i] = onmt.Dataset(dataset['train']['src'][i],
                                    dataset['train']['tgt'][i], opt.batch_size,
                                    opt.gpus)

        validSets[i] = onmt.Dataset(dataset['valid']['src'][i],
                                    dataset['valid']['tgt'][i], opt.batch_size,
                                    opt.gpus)

        print(' * number of training sentences for set %d: %d' %
              (i, len(dataset['train']['src'][i])))

    print(' * maximum batch size. %d' % opt.batch_size)
    #~
    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], nSets)
    generator = onmt.Models.Generator(opt, dicts['tgt'])

    model = onmt.Models.NMTModel(encoder, decoder)
    #~
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = int(math.floor(checkpoint['epoch'] + 1))

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

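    # With multiple GPUs, wrap model and generator in DataParallel. The differing dims are
    # presumably due to tensor layout: the model takes time-major (length x batch) inputs,
    # so batches are split along dim 1, while the generator sees flattened 2-D input (dim 0).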
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if opt.share_embedding:
        model.shareEmbedding(dicts)

    if (not opt.train_from_state_dict
            and not opt.train_from) or checkpoint['optim'] is None:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)


        #~
        #~ encoder.load_pretrained_vectors(opt)
        #~ decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())
    optim.set_learning_rate(opt.learning_rate)

    #~ if opt.train_from or opt.train_from_state_dict:
    #~ optim.optimizer.load_state_dict(
    #~ checkpoint['optim'].optimizer.state_dict())

    if opt.train_from or opt.train_from_state_dict:
        del checkpoint  # to save memory

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainSets, validSets, dataset, optim)
Example no. 2
def main():
    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             min(opt.batch_size, len(dataset['valid']['src'])),
                             opt.gpus,
                             volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

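    # The generator maps decoder hidden states (rnn_size) to log-probabilities over the
    # target vocabulary.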
    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())

    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        #generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
Example no. 3
def main():
    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'],
                             dataset['train']['tgt'],
                             opt.batch_size,
                             opt.gpus,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['train'].get('src_features'),
                             tgtFeatures=dataset['train'].get('tgt_features'),
                             alignment=dataset['train'].get('alignments'))
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             opt.batch_size,
                             opt.gpus,
                             volatile=True,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['valid'].get('src_features'),
                             tgtFeatures=dataset['valid'].get('tgt_features'),
                             alignment=dataset['valid'].get('alignments'))

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    if 'src_features' in dicts:
        for j in range(len(dicts['src_features'])):
            print(' * src feature %d size = %d' %
                  (j, dicts['src_features'][j].size()))

    dicts = dataset['dicts']
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

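    # Pick the encoder according to opt.encoder_type: a text encoder (with optional source
    # features) or an image encoder for image-to-text data.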
    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert ("type" not in dataset or dataset["type"] == "img")
    else:
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

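    # Use a copy-attention generator when opt.copy_attn is set; otherwise a plain linear
    # projection + LogSoftmax whose weights can be tied to the decoder embeddings.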
    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            generator[0].weight = decoder.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        print('Multi gpu training ', opt.gpus)
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Initializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at,
                           opt=opt)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
Example no. 4
def main():
    
    dataset = dict()
    
    print("Loading dicts from '%s'" % opt.data + "/dicts_info.pt")
    dataset['dicts'] = torch.load(opt.data + "/dicts_info.pt")
    
    pairIDs = list()
    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:
    
        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)
    
        setIDs = dataset['dicts']['setIDs']
        
        # find the pair ID that we need to adapt
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairIDs.append(i)
                
        if len(pairIDs) == 0:
            pairIDs = None
    
    else:
        srcID = None
        tgtID = None
        pairIDs = None
    
    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID 
    opt.pairIDs = pairIDs

    dict_checkpoint = opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint, map_location=lambda storage, loc: storage)
        #~ dataset['dicts'] = checkpoint['dicts']
    else:
        checkpoint = None
    
    dicts = dataset['dicts']
    
    dataset['valid'] = torch.load(opt.data + "/valid.pt")
    valid_set = dataset['valid']
    
    #~ print("Loading training data from '%s'" % opt.data + "/train.pt.*")

    dataset['train'] = dict()
    
    #~ torch.load(opt.data + "/train.pt.0")
    
    print("Done")
    
    nSets = dicts['nSets']
    setIDs = dicts['setIDs']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())
        
    # A wrapper to manage data loading
    trainLoader = onmt.MultiShardLoader(opt, dicts)

    trainSets = dict()
    validSets = dict()
    for i in xrange(nSets):
        
        #~ trainSets[i] = onmt.Dataset(dataset['train']['src'][i], dataset['train']['tgt'][i],
                             #~ opt.batch_size, opt.gpus)
            
        validSets[i] = onmt.Dataset(valid_set['src'][i], valid_set['tgt'][i],
                             opt.batch_size, opt.gpus)

        #~ print(' * number of training sentences for set %d: %d' %
          #~ (i, len(dataset['train']['src'][i])))
        

    print('[INFO] * maximum batch size. %d' % opt.batch_size)

    print('[INFO] Building model...')
    
    model, generator = build_model(opt, dicts, nSets)
    
    if opt.train_from_state_dict:
        print('[INFO] Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
              
        model_state_dict = {k: v for k, v in checkpoint['model'].items() if 'critic' not in k}
        checkpoint['critic'] = {k: v for k, v in checkpoint['model'].items() if 'critic' in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(checkpoint['generator'])
        
    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()
    
    model.generator = generator

    if not opt.train_from_state_dict:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

    optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
    )

    nParams = sum([p.nelement() for p in model.parameters()])
    print('[INFO] * number of parameters: %d' % nParams)

    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))
    
    if opt.reinforce:
        if opt.critic == 'self':
            trainer = SCSTTrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
        else:
            from onmt.ModelConstructor import build_critic
            from onmt.trainer.ActorCriticTrainer import A2CTrainer
            critic = build_critic(opt, dicts)
            
            model.critic = critic
            
            trainer = A2CTrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
            #~ raise NotImplementedError
    else:
        trainer = XETrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
    
    trainer.run(checkpoint=checkpoint)
Example no. 5
def main():
    print("Loading data from '%s'" % opt.data)

    train = torch.load(opt.data + '.train.pt')
    valid = torch.load(opt.data + '.valid.pt')

    fields = onmt.IO.ONMTDataset.load_fields(
        torch.load(opt.data + '.vocab.pt'))
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in train.examples[0].__dict__])

    train.fields = fields
    valid.fields = fields
    # TODO: account for target features. Also, why does fields need to
    # have the structure it does?
    src_features = [fields["src_feat_"+str(j)]
                    for j in range(train.nfeatures)]
    model_opt = opt
    checkpoint = None

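    # When resuming, the vocabulary fields and the original model options come from the
    # checkpoint rather than from the freshly loaded data.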
    if opt.train_from:
        print('Loading dicts from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        fields = onmt.IO.ONMTDataset.load_fields(checkpoint['vocab'])
        model_opt = checkpoint["opt"]

    print(' * vocabulary size. source = %d; target = %d' %
          (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(feat.vocab)))

    print(' * number of training sentences. %d' % len(train))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    model = onmt.ModelConstructor.make_base_model(model_opt, fields,
                                                  use_gpu(opt), checkpoint)
    if len(opt.gpuid) > 1:
        print('Multi gpu training ', opt.gpuid)
        model = nn.DataParallel(model, device_ids=opt.gpuid, dim=1)
    print(model)

    # Load model from checkpoint or initialize, create optim
    if opt.train_from:
        print('Loading model from checkpoint at %s'
              % opt.train_from)
        # I don't like reassigning attributes of opt: it's not clear
        opt.start_epoch = checkpoint['epoch'] + 1

        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    else:
        if opt.param_init != 0.0:
            print('Initializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
        model.encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc,
                                                         opt.fix_word_vecs_enc)
        model.decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec,
                                                         opt.fix_word_vecs_dec)
        # what members of opt does Optim need?
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )

    optim.set_parameters(model.parameters())

    tally_parameters(model)

    check_model_path()

    train_model(model, train, valid, fields, optim)
Example no. 6
    def __init__(self, device, train_data, valid_data, dicts, opt, setup_optimizer=True):
        """
        :param model:
        :param device: int (GPU id)
        :param loss_function:
        :param train_data:
        :param valid_data:
        :param dicts:
        :param opt:
        """

        self.device = device
        opt.node_rank = 0
        opt.nodes = 1
        self.world_size = len(opt.gpus)

        # in the case of single node distributed, it should equal self.device
        self.rank = self.device

        # make a group to later use with self.all_reduce
        self.group = dist.group.WORLD

        self.print("[INFO] Training Options:", opt)
        if self.world_size > 1:
            dist.init_process_group(backend='nccl', init_method='env://', world_size=self.world_size, rank=self.rank)

        self.model = None

        if self.rank == 0:
            self.train_data = train_data
            self.valid_data = valid_data
        else:
            # Do we really need to deepcopy the data instances (which could easily cause a memory leak)?
            self.train_data = copy.deepcopy(train_data)
            self.valid_data = copy.deepcopy(valid_data)

        self.dicts = dicts
        self.opt = opt
        self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)

        assert self.cuda, "[ERROR] Training is only available on GPUs."

        self.start_time = 0

        # setting up models and others
        if opt.lfv_multilingual:
            from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
            lid_loss = CrossEntropyLIDLoss(opt.n_languages, opt.label_smoothing, opt.fast_xentropy)
            self.loss_function.add_loss_function(lid_loss, 'lid_loss')

        torch.manual_seed(self.opt.seed)

        # note: we must start creating models after creating the processes
        # for some reason passing a pre-created model to a process creates a "pickle" error
        if not opt.fusion:

            if self.is_main():
                print("[INFO] Building models .... ", flush=True)
            model = build_model(opt, dicts)

            """ Building the loss function """
            if opt.ctc_loss > 0.0:
                from onmt.speech.ctc_loss import CTC
                self.ctc_loss_function = CTC(0.0, reduce=True)

            if opt.nce:
                from onmt.modules.nce.nce_loss import NCELoss
                loss_function = NCELoss(opt.model_size, dicts['tgt'].size(), noise_ratio=opt.nce_noise,
                                        logz=9, label_smoothing=opt.label_smoothing)
            else:
                loss_function = NMTLossFunc(opt.model_size, dicts['tgt'].size(),
                                            label_smoothing=opt.label_smoothing,
                                            mirror=opt.mirror_loss,
                                            fast_xentropy=opt.fast_xentropy)

            # This function replaces modules with the more optimized counterparts so that it can run faster
            # Currently exp with LayerNorm
            if not opt.memory_profiling:
                # distributed is required to convert BatchNorm to SyncBatchNorm for DDP
                optimize_model(model, distributed=(self.world_size > 1))

        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function
        self.grad_scaler = torch.cuda.amp.GradScaler()

        if opt.load_from:
            checkpoint = torch.load(opt.load_from, map_location=lambda storage, loc: storage)
            self.model.load_state_dict(checkpoint['model'])
            if 'scaler' in checkpoint and checkpoint['scaler'] is not None:
                self.grad_scaler.load_state_dict(checkpoint['scaler'])

        if self.cuda:
            torch.cuda.set_device(self.device)

            self.loss_function = self.loss_function.cuda(device=self.device)
            self.model = self.model.cuda(device=self.device)
            if opt.ctc_loss > 0.0:
                self.ctc_loss_function = self.ctc_loss_function.cuda(device=self.device)

            # Ensure that the distributed copies have the same initial parameters
            # Manual seed may not work the same for different GPU models.
            # if self.world_size > 1:
            #     params = [p for p in self.model.parameters()]
            #
            #     with torch.no_grad():
            #         if not self.is_main():
            #             # zero everything except for the main model
            #             for p in params:
            #                 p.zero_()
            #         else:
            #             for p in params:
            #                 p.add_(0)
            #
            # # run all_reduce to ensure that all models have exactly the same parameters
            # if self.world_size > 1:
            #     params = [p for p in self.model.parameters()]
            #     all_reduce_and_rescale_tensors(params, 1)

        if setup_optimizer:

            self.optim = onmt.Optim(opt)
            self.optim.set_parameters(self.model.parameters())

            if self.is_main():
                print("[INFO] Optimizer: ", self.optim.optimizer)

            if opt.load_from:
                if 'optim' in checkpoint and checkpoint['optim'] is not None and not opt.reset_optim:
                    self.optim.load_state_dict(checkpoint['optim'])

        if self.world_size > 1:
            # find_unused_parameters may be required for dropped layer (parameters that are not connected to
            # any particular graph)
            find_unused_parameters = True

            self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.rank],
                                                                   output_device=self.rank,
                                                                   find_unused_parameters=find_unused_parameters)

        print("[INFO] Process %d ready." % self.rank, flush=True)
def build_optim(model, text_model, speech_model, checkpoint):
    if opt.train_from:
        print('Loading optimizer from checkpoint.')
        optim = checkpoint['optim']
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    else:
        # what members of opt does Optim need?
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)

    optim.set_parameters(model.parameters())
    optim.set_parameters(text_model.encoder.parameters())
    if speech_model:
        optim.set_parameters(speech_model.decoder.parameters())
        optim.set_parameters(speech_model.globalEncoder.parameters())

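    # Try to restore the speech optimizer from the checkpoint; on any failure, fall back
    # to building a fresh one from the command-line options.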
    try:
        print('Loading speech optimizer from checkpoint.')
        speech_optim = checkpoint['speech_optim']
        speech_optim.optimizer.load_state_dict(
            checkpoint['speech_optim'].optimizer.state_dict())
    except:
        # what members of opt does Optim need?
        speech_optim = onmt.Optim(
            opt.speech_optim, opt.speech_learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)

    speech_optim.set_parameters(speech_model.parameters())

    try:
        adv_optim = checkpoint['adv_optim']
        adv_optim.optimizer.load_state_dict(
            checkpoint['adv_optim'].optimizer.state_dict())
    except:
        # what members of opt does Optim need?
        adv_optim = onmt.Optim(
            opt.adv_optim, opt.adv_learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)

    if not opt.feature_match:
        adv_optim.set_parameters(model.encoder.parameters())
        adv_optim.set_parameters(text_model.encoder.parameters())
    else:
        if opt.gen_label == 0.1:
            # move text to match speech
            print('gen_label = 0.1: adv training only modifies text encodings')
            adv_optim.set_parameters(text_model.encoder.parameters())
        else:
            # move speech to match text
            adv_optim.set_parameters(text_model.encoder.parameters()) # get rid of this later
            adv_optim.set_parameters(model.encoder.parameters())
            
    return optim, adv_optim, speech_optim
Example no. 8
def main():

    print("Loading data from '%s'" % opt.data)
    
    dataset = torch.load(opt.data)

    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData, validData = init_dataloaders(dataset, opt)

    vocabulary_size = 0
  
    if "settings" in dataset:
        vocabulary_size = dataset['dicts']['src']['kwargs']['vocab_size']
    else:
        vocabulary_size = dataset['dicts']['src'].size()

    print(' * vocabulary size. source = %d;' % vocabulary_size)
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    model = onmt.CNNModels.ConvNet(opt, vocabulary_size)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        model_state_dict = {k: v for k, v in chk_model.state_dict().items() if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        model.load_pretrained_vectors(opt)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim, opt)
Example no. 9
def main():
    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    print("Done")

    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    dicts = dataset['dicts']
    nSets = dicts['nSets']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    trainSets = dict()
    validSets = dict()
    for i in xrange(nSets):
        trainSets[i] = onmt.Dataset(dataset['train']['src'][i],
                                    dataset['train']['tgt'][i], opt.batch_size,
                                    opt.gpus)

        validSets[i] = onmt.Dataset(dataset['valid']['src'][i],
                                    dataset['valid']['tgt'][i], opt.batch_size,
                                    opt.gpus)

        print(' * number of training sentences for set %d: %d' %
              (i, len(dataset['train']['src'][i])))

    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], nSets)
    generator = onmt.Models.Generator(opt, dicts['tgt'])

    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = int(math.floor(checkpoint['epoch'] + 1))

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if opt.share_embedding:
        model.shareEmbedding(dicts)

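    # Initialize parameters and build a fresh optimizer unless resuming; when resuming,
    # reuse the checkpointed optimizer unless opt.reset_optim is set.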
    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    elif not opt.reset_optim and 'optim' in checkpoint:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
    else:
        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)

    optim.set_parameters(model.parameters())
    optim.set_learning_rate(opt.learning_rate)

    #~ if opt.train_from or opt.train_from_state_dict:
    #~ optim.optimizer.load_state_dict(
    #~ checkpoint['optim'].optimizer.state_dict())

    if opt.train_from or opt.train_from_state_dict:
        del checkpoint  # to save memory

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:

        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)

        setIDs = dataset['dicts']['setIDs']

        # find the pair ID that we need to adapt
        pairID = -1
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairID = i
                break

        if pairID == -1:
            pairID = None

    else:
        srcID = None
        tgtID = None
        pairID = None

    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID
    opt.pairID = pairID

    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))

    if opt.reinforce:
        trainer = SCSTTrainer(model, trainSets, validSets, dataset, optim,
                              evaluator, opt)
    else:
        trainer = XETrainer(model, trainSets, validSets, dataset, optim,
                            evaluator, opt)

    trainer.run()
Example no. 10
def main():

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.cuda)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.cuda)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

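    # Build everything from scratch when no checkpoint is given; otherwise the entire
    # model object (not just a state dict) is restored from the checkpoint file.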
    if opt.train_from is None:
        encoder = onmt.Models.Encoder(opt, dicts['src'])
        decoder = onmt.Models.Decoder(opt, dicts['tgt'])
        decoderlatent = onmt.Models.DecoderLatent(opt)
        encoderlatent = onmt.Models.EncoderLatent(opt)
        lengthnet = onmt.Models.LengthNet(opt)
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.cuda > 1:
            generator = nn.DataParallel(generator, device_ids=opt.gpus)
        model = onmt.Models.NMTModel(encoder, lengthnet, decoderlatent,
                                     encoderlatent, decoder, generator, opt)
        if opt.cuda > 1:
            model = nn.DataParallel(model, device_ids=opt.gpus)
        if opt.cuda:
            model.cuda()
        else:
            model.cpu()

        #model.generator = generator

        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        optim = onmt.Optim(model.parameters(),
                           opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from)
        model = checkpoint['model']
        if opt.cuda:
            model.cuda()
        else:
            model.cpu()
        optim = checkpoint['optim']
        opt.start_epoch = checkpoint['epoch'] + 1

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
Example no. 11
    def __init__(self,
                 device,
                 train_data,
                 valid_data,
                 dicts,
                 opt,
                 setup_optimizer=True):
        """
        :param model:
        :param device: int (GPU id)
        :param loss_function:
        :param train_data:
        :param valid_data:
        :param dicts:
        :param opt:
        """

        self.device = device
        opt.node_rank = 0
        opt.nodes = 1
        self.world_size = len(opt.gpus)

        # in the case of single node distributed, it should equal self.device
        self.rank = self.device

        # make a group to later use with self.all_reduce
        self.group = dist.group.WORLD

        self.print("[INFO] Training Options:", opt)
        if self.world_size > 1:
            dist.init_process_group(backend='nccl',
                                    init_method='env://',
                                    world_size=self.world_size,
                                    rank=self.rank)

        self.model = None

        if self.rank == 0:
            self.train_data = train_data
            self.valid_data = valid_data
        else:
            # Do we really need to deepcopy the data instances (which could easily cause a memory leak)?
            self.train_data = copy.deepcopy(train_data)
            self.valid_data = copy.deepcopy(valid_data)

        self.dicts = dicts
        self.opt = opt
        self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)

        assert self.cuda, "[ERROR] Training is only available on GPUs."

        self.start_time = 0

        # setting up models and others
        if opt.lfv_multilingual:
            from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
            lid_loss = CrossEntropyLIDLoss(opt.n_languages,
                                           opt.label_smoothing,
                                           opt.fast_xentropy)
            self.loss_function.add_loss_function(lid_loss, 'lid_loss')

        torch.manual_seed(self.opt.seed)

        # note: we must start creating models after creating the processes
        # for some reason passing a pre-created model to a process creates a "pickle" error
        if not opt.fusion:

            if self.is_main():
                print("[INFO] Building models .... ", flush=True)
            model = build_model(opt, dicts)
            """ Building the loss function """
            if opt.ctc_loss > 0.0:
                from onmt.speech.ctc_loss import CTC
                self.ctc_loss_function = CTC(0.0, reduce=True)

            if opt.nce:
                from onmt.modules.nce.nce_loss import NCELoss
                loss_function = NCELoss(opt.model_size,
                                        dicts['tgt'].size(),
                                        noise_ratio=opt.nce_noise,
                                        logz=9,
                                        label_smoothing=opt.label_smoothing)
            else:
                loss_function = NMTLossFunc(
                    opt.model_size,
                    dicts['tgt'].size(),
                    label_smoothing=opt.label_smoothing,
                    mirror=opt.mirror_loss,
                    fast_xentropy=opt.fast_xentropy)

            # This function replaces modules with the more optimized counterparts so that it can run faster
            # Currently exp with LayerNorm
            if not opt.memory_profiling:
                # distributed is required to convert BatchNorm to SyncBatchNorm for DDP
                optimize_model(model, distributed=(self.world_size > 1))

        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function
        # self.grad_scaler = torch.cuda.amp.GradScaler()

        if self.cuda:
            torch.cuda.set_device(self.device)

            self.loss_function = self.loss_function.cuda(device=self.device)
            self.model = self.model.cuda(device=self.device)
            if opt.ctc_loss > 0.0:
                self.ctc_loss_function = self.ctc_loss_function.cuda(
                    device=self.device)

        if opt.load_from:
            checkpoint = torch.load(opt.load_from,
                                    map_location=lambda storage, loc: storage)

        if setup_optimizer:

            self.optim = onmt.Optim(opt)
            self.optim.set_parameters(self.model.parameters())

            if self.is_main():
                print("[INFO] Optimizer: ", self.optim.optimizer)

            if opt.load_from:
                if 'optim' in checkpoint and checkpoint[
                        'optim'] is not None and not opt.reset_optim:
                    self.optim.load_state_dict(checkpoint['optim'])

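        # Map the precision flags to apex/amp optimization levels:
        # O0 = full fp32, O1 = mixed precision, O2 = "almost fp16".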
        if not self.opt.fp16:
            opt_level = "O0"
            keep_batchnorm_fp32 = False
        elif self.opt.fp16_mixed:
            opt_level = "O1"
            keep_batchnorm_fp32 = None
        else:
            opt_level = "O2"
            keep_batchnorm_fp32 = False

        self.opt_level = opt_level

        if self.cuda:
            self.model, self.optim.optimizer = amp.initialize(
                self.model,
                self.optim.optimizer,
                opt_level=opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic",
                verbosity=1 if self.opt.verbose else 0)

        if opt.load_from:
            self.model.load_state_dict(checkpoint['model'])
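            # Note: prec_opt (the precision options stored in the checkpoint) is assumed
            # to be defined elsewhere in the original file; it is not set in this snippet.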
            if prec_opt is not None and hasattr(prec_opt, "fp16_mixed"):
                # Only load amp information if the mode is the same
                # Maybe its better to change between optimization mode?
                if opt.fp16_mixed == prec_opt.fp16_mixed and opt.fp16 == prec_opt.fp16:
                    if 'amp' in checkpoint:
                        try:
                            amp.load_state_dict(checkpoint['amp'])
                        except Exception:
                            # loading the amp state can fail
                            pass

        if self.world_size > 1:
            # find_unused_parameters may be required for dropped layer (parameters that are not connected to
            # any particular graph)
            # find_unused_parameters = True

            self.model = DDP(self.model,
                             delay_allreduce=True,
                             gradient_average=False)

        print("[INFO] Process %d ready." % self.rank, flush=True)
Example no. 12
def train(opt, dataset):

    if torch.cuda.is_available() and not opt.gpus:
        print("WARNING: You have a CUDA device, so you should probably run with -gpus 0")

    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        opt.cuda = True
    else:
        opt.cuda = False

    ckpt_path = opt.train_from
    if ckpt_path:
        print('Loading dicts from checkpoint at %s' % ckpt_path)
        checkpoint = torch.load(ckpt_path)
        opt = checkpoint['opt']

    print("Loading data from '%s'" % opt.data)

    if ckpt_path:
        dataset['dicts'] = checkpoint['dicts']
    model_dir = os.path.dirname(opt.save_model)
    if not os.path.isdir(model_dir):
        os.mkdir(model_dir)

    trainData = onmt.Dataset(dataset['train']['src'],
                             dataset['train']['tgt'], opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'], opt.batch_size, opt.gpus,
                             volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
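    # Encoder and decoder share a single embedding table (word_lut), so the source and
    # target vocabularies must have the same size.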
    assert dicts['src'].size() == dicts['tgt'].size()
    dict_size = dicts['src'].size()
    word_lut = nn.Embedding(dicts['src'].size(),
                            opt.word_vec_size,
                            padding_idx=onmt.Constants.PAD)
    generator = nn.Sequential(
        nn.Linear(opt.rnn_size, dicts['tgt'].size()),
        nn.LogSoftmax())
    encoder = onmt.Models.Encoder(opt, word_lut)
    decoder = onmt.Models.Decoder(opt, word_lut, generator)

    model = onmt.Models.NMTModel(encoder, decoder, opt)

    if ckpt_path:
        print('Loading model from checkpoint at %s' % ckpt_path)
        model.load_state_dict(checkpoint['model'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)

    if not ckpt_path:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
        )
        optim.set_parameters(model.parameters())
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        optim.set_parameters(model.parameters())
        optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())

    if ckpt_path:
        stats = checkpoint['stats']
    else:
        stats = {'train_loss': [], 'train_KLD': [], 'train_KLD_obj': [],
                 'train_accuracy': [], 'kl_rate': [], 'valid_loss': [],
                 'valid_KLD': [], 'valid_accuracy': [], 'valid_lm_nll': [],
                 'step': []}

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    best_valid_lm_nll = trainModel(model, trainData, validData, dataset, optim, stats, opt)
    return best_valid_lm_nll
Example no. 13
def main():

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)

    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.cuda)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             opt.batch_size,
                             opt.cuda,
                             volatile=True)

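    # When opt.raml_alpha is set, wrap the training data so that targets are resampled
    # with a Hamming-distance sampler (RAML-style training); ISDataset presumably applies
    # the corresponding importance weights.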
    if opt.raml_alpha:
        print("Use RAML(alpha) ...")
        print("tau: {}".format(opt.tau))
        print("alpha: {}".format(opt.alpha))
        sampler = onmt.HammingDistanceSampler(
            temperature=opt.tau,
            max_len=55,
            voc_min=4,
            voc_max=dataset['dicts']['tgt'].size() - 4)
        trainData = onmt.ISDataset(trainData, sampler)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())

    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        model_state_dict = {k: v for k, v in chk_model.items() if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.cuda:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        optim.lr = opt.learning_rate
        optim.start_decay_at = opt.start_decay_at
        optim.lr_decay = opt.learning_rate_decay
        optim.start_decay = False
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
    print("\nBest: Valid BLEU: {}, Test BLEU: {} @epoch {}\n".format(
        max_valid[0], max_test[0], max_epoch[0]))
    print("Epoch, Valid BLEU, Test BLEU")
    print("-" * 30)
    for score in scores:
        epoch, valid_bleu, test_bleu = score
        print("{}: {}, {}".format(epoch, valid_bleu, test_bleu))
Example no. 14
def main():

    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    dict_checkpoint = opt.train_from if opt.train_from else None

    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             dataset['train']['tgt_uni'],
                             dataset['train']['align'], opt.batch_size,
                             opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             dataset['valid']['tgt_uni'],
                             dataset['valid']['align'],
                             opt.batch_size,
                             opt.gpus,
                             volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt,
                                  dicts['src'],
                                  opt.fix_src_emb,
                                  use_cov=True)
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], opt.tie_emb)

    output_dim = opt.output_emb_size

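    # Instead of a softmax over the vocabulary, the generator regresses a continuous
    # target embedding of size output_emb_size (optionally through a non-linear layer).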
    if not opt.nonlin_gen:
        generator = nn.Sequential(nn.Linear(opt.rnn_size, output_dim))
    else:  #add a non-linear layer before generating the continuous vector
        generator = nn.Sequential(nn.Linear(opt.rnn_size, output_dim),
                                  nn.ReLU(), nn.Linear(output_dim, output_dim))

    #output is just an embedding
    target_embeddings = nn.Embedding(dicts['tgt'].size(), opt.output_emb_size)
    target_uni_embeddings = nn.Embedding(dicts['tgt'].size_uni(),
                                         opt.output_emb_size)
    target_ngram_embeddings = nn.Embedding(dicts['tgt'].size_ngram(),
                                           opt.output_emb_size)

    #normalize the embeddings
    norm = dicts['tgt'].embeddings.norm(p=2, dim=1,
                                        keepdim=True).clamp(min=1e-12)
    target_embeddings.weight.data.copy_(dicts['tgt'].embeddings.div(norm))

    norm = dicts['tgt'].unigram_embeddings.norm(p=2, dim=1,
                                                keepdim=True).clamp(min=1e-12)
    target_uni_embeddings.weight.data.copy_(
        dicts['tgt'].unigram_embeddings.div(norm))

    norm = dicts['tgt'].ngram_embeddings.norm(p=2, dim=1,
                                              keepdim=True).clamp(min=1e-12)
    target_ngram_embeddings.weight.data.copy_(
        dicts['tgt'].ngram_embeddings.div(norm))

    #target embeddings are fixed and not trained
    target_embeddings.weight.requires_grad = False
    target_uni_embeddings.weight.requires_grad = False
    target_ngram_embeddings.weight.requires_grad = False

    # elif opt.loss != "maxmargin": # with max-margin loss, the target embeddings can be fine-tuned as well.
    # target_embeddings.weight.requires_grad=False

    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        generator_state_dict = checkpoint['generator']
        encoder_state_dict = [('encoder.' + k, v)
                              for k, v in checkpoint['encoder'].items()]
        decoder_state_dict = [('decoder.' + k, v)
                              for k, v in checkpoint['decoder'].items()]
        model_state_dict = dict(encoder_state_dict + decoder_state_dict)

        model.load_state_dict(model_state_dict, strict=False)
        generator.load_state_dict(generator_state_dict)

        if not opt.train_anew:  #load from
            opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
        target_embeddings.cuda()
        target_uni_embeddings.cuda()
        target_ngram_embeddings.cuda()
    else:
        model.cpu()
        generator.cpu()
        target_embeddings.cpu()
        target_uni_embeddings.cpu()
        target_ngram_embeddings.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)

        if opt.tie_emb:
            decoder.tie_embeddings(target_embeddings)

        if opt.fix_src_emb:
            #fix and normalize the source embeddings
            source_embeddings = nn.Embedding(dicts['src'].size(),
                                             opt.output_emb_size)
            norm = dicts['src'].embeddings.norm(p=2, dim=1,
                                                keepdim=True).clamp(min=1e-12)
            source_embeddings.weight.data.copy_(
                dicts['src'].embeddings.div(norm))

            #turn this off to initialize embeddings as well as make them trainable
            source_embeddings.weight.requires_grad = False
            if len(opt.gpus) >= 1:
                source_embeddings.cuda()
            else:
                source_embeddings.cpu()
            encoder.fix_embeddings(source_embeddings)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    elif opt.train_anew:  #restart optimizer, sometimes useful for training with
        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from and not opt.train_anew:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum(
        [p.nelement() for p in model.parameters() if p.requires_grad])
    print('* number of trainable parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, target_embeddings,
               target_uni_embeddings, target_ngram_embeddings, optim)
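
The normalize-then-freeze pattern used for the three target embedding tables above can be factored into a small helper. The sketch below is illustrative only; the helper name and the pretrained weight tensor are assumptions, not part of the example.

import torch.nn as nn


def frozen_normalized_embedding(pretrained):
    """Build an nn.Embedding whose rows are the L2-normalized pretrained
    vectors and whose weights are excluded from gradient updates."""
    vocab_size, dim = pretrained.size()
    emb = nn.Embedding(vocab_size, dim)
    # clamp avoids division by zero for all-zero rows (e.g. padding)
    norm = pretrained.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
    emb.weight.data.copy_(pretrained.div(norm))
    emb.weight.requires_grad = False
    return emb
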
Example no. 15
0
def main():
    if torch.cuda.is_available() and not opt.gpus:
        print(
            "WARNING: You have a CUDA device, so you should probably run with -gpus 0"
        )

    if opt.gpus:
        cuda.set_device(opt.gpus[0])

    print(opt)

    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)

    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    if opt.keys or opt.acts:
        trainData = memories.Key_Dataset(dataset['train'], opt.batch_size,
                                         opt.gpus, opt.context_size)
        validData = memories.Key_Dataset(dataset['valid'],
                                         opt.batch_size,
                                         opt.gpus,
                                         opt.context_size,
                                         volatile=True)
        nr_train_points = len(dataset['train']['src_utts'])

    else:
        trainData = memories.Dataset(dataset['train']['src'],
                                     dataset['train']['tgt'], opt.batch_size,
                                     opt.gpus, opt.context_size)
        validData = memories.Dataset(dataset['valid']['src'],
                                     dataset['valid']['tgt'],
                                     opt.batch_size,
                                     opt.gpus,
                                     opt.context_size,
                                     volatile=True)
        nr_train_points = len(dataset['train']['src'])

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % nr_train_points)
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    model = memories.hier_model.HierModel(opt, dicts)

    generator = nn.Sequential(
        nn.Linear(opt.word_vec_size, dicts['tgt'].size()), nn.LogSoftmax())

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        # generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        # generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        # encoder.load_pretrained_vectors(opt)
        # decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    if opt.gather_net_data:
        # , opt.n_samples)
        return gather_data(model, validData, dataset['dicts'])

    low_ppl, best_e, trn_ppls, val_ppls, checkpoint = trainModel(
        model, trainData, validData, dataset, optim)
    return low_ppl, best_e, trn_ppls, val_ppls, checkpoint, opt, nParams
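
Several of these examples restore a checkpointed model while skipping its generator sub-module and loading the generator separately. A minimal, standalone sketch of that filtering pattern (the function name is illustrative):

def load_without_submodule(model, chk_model, skip='generator'):
    """Copy all parameters from chk_model into model except those belonging
    to the named sub-module, which is loaded separately."""
    state = {k: v for k, v in chk_model.state_dict().items() if skip not in k}
    # strict=False tolerates the keys that were intentionally filtered out
    model.load_state_dict(state, strict=False)
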
Example no. 16
0
def main():
    data = "../../data_%s/%s/%s-train.pt" % (opt.task, opt.data, opt.data)
    print("Loading data from '%s'" % data)
    if opt.label_smooth:
        assert opt.num_rb_bin == 2
    dataset = torch.load(data)
    if opt.separate_threshold:
        print dataset["src_threshold"]
        print dataset["tgt_threshold"]
        threshold = {
            "src": dataset["src_threshold"][opt.num_rb_bin],
            "tgt": dataset["tgt_threshold"][opt.num_rb_bin]
        }
    else:
        if opt.num_rb_bin > 0:
            single_threshold = dataset['all_threshold'][opt.num_rb_bin]
        else:
            single_threshold = [0]
        threshold = {"src": single_threshold, "tgt": single_threshold}
    print(threshold)
    dicts = dataset['dicts']
    ori_datasets = copy.deepcopy(dataset)
    if opt.parallel_ratio is not None:
        parallel_len = l = int(
            len(dataset['train']['src']) * opt.parallel_ratio)
        dataset['train']['src'] = dataset['train']['src'][:l]
        print(dataset['train']['src'][-1])
        dataset['train']['tgt'] = dataset['train']['tgt'][:l]
        dataset['train']['src_rb'] = dataset['train']['src_rb'][:l]
        dataset['train']['tgt_rb'] = dataset['train']['tgt_rb'][:l]
    else:
        parallel_len = None
    if opt.separate_encoder == 0:
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas = [valid_data]
        valid_weight = [1.]
        valid_probability = [1.]
        train_datas = [forward_data]
        probability = [1.]
        weights = [1.]
        print(len(forward_data))
    else:
        opt.filter_src_rb = 0
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        #print len(forward_data)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas = [valid_data]
        valid_weight = [1.]
        valid_probability = [1.]
        train_datas = [forward_data]
        probability = [1.]
        weights = [1.]

        opt.filter_src_rb = 1
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas += [valid_data]
        valid_weight += [1.]
        valid_probability += [1.]
        train_datas += [forward_data]
        probability += [1.]
        weights += [1.]
        opt.filter_src_rb = None

    if not opt.no_tgt_to_src:
        backwardData = onmt.BucketIterator(dataset['train_bi']['src'],
                                           dataset['train_bi']['tgt'],
                                           dataset['train_bi']['src_rb'],
                                           dataset['train_bi']['tgt_rb'], opt,
                                           threshold)
        train_datas.append(backwardData)
        weights.append(1.)
        probability = [0.5, 0.5]
    trainData = onmt.mixed_iterator(train_datas, probability)
    validData = onmt.mixed_iterator(valid_datas, valid_probability)

    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.train_from is None:
        decoder = onmt.Models.Decoder(opt,
                                      dicts['tgt'],
                                      attn_type=opt.attn_type)
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.cuda > 1:
            generator = nn.DataParallel(generator, device_ids=opt.gpus)
        discriminator = onmt.Models.Discriminator(opt)
        if not opt.separate_encoder:
            encoder = onmt.Models.Encoder(opt, dicts['src'])
            models = [
                onmt.Models.NMTModel(encoder, decoder, generator,
                                     discriminator, opt)
            ]
        else:
            models = []
            for i in range(opt.num_rb_bin):
                encoder = onmt.Models.Encoder(opt, dicts['src'])
                models += [
                    onmt.Models.NMTModel(encoder, decoder, generator,
                                         discriminator, opt)
                ]
        optims = []
        for model_single in models:
            if opt.cuda > 1:
                model_single = nn.DataParallel(model_single,
                                               device_ids=opt.gpus)
            if opt.cuda:
                model_single.cuda()
            else:
                model_single.cpu()
            model_single.generator = generator
            for p in model_single.get_seq2seq_parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
            for p in model_single.get_disc_parameters():
                if opt.non_linear == "relu":
                    opt.adv_param_init = 2. / opt.disc_size
                p.data.uniform_(-opt.adv_param_init, opt.adv_param_init)
            optim_single = onmt.Optim(
                model_single.parameters(),
                model_single.get_seq2seq_parameters(),
                model_single.get_disc_parameters(),
                model_single.get_encoder_parameters(),
                opt.optim,
                opt.learning_rate,
                opt.max_grad_norm,
                lr_decay=opt.learning_rate_decay,
                start_decay_at=opt.start_decay_at,
                adam_momentum=opt.adam_momentum,
            )
            optims += [optim_single]
    else:
        print('Loading from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from)
        model_single = checkpoint['model']
        if opt.cuda:
            model_single.cuda()
        else:
            model_single.cpu()
        optim_single = checkpoint['optim']
        opt.start_epoch = checkpoint['epoch'] + 1
        # wrap the restored model/optimizer in lists so the parameter count
        # and trainModel call below also work when resuming from a checkpoint
        models = [model_single]
        optims = [optim_single]

    nParams = sum([
        p.nelement() for model_single in models
        for p in model_single.parameters()
    ])
    print('* number of parameters: %d' % nParams)

    trainModel(models, trainData, validData, dataset, optims, dicts, weights,
               valid_weight, threshold)
    if opt.task == "MT":
        translate.main([
            "-task", opt.task, "-data", opt.data, "-model",
            "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
            str(opt.gpus[0]), "-output",
            "%s/test_no_unk.txt" % exp_path, "-verbose"
        ])
        evaluate_file.main([
            "-task", opt.task, "-data", opt.data, "-outputs",
            "%s/test_no_unk.txt" % exp_path
        ])
    elif opt.task == "Multi-MT":
        for test_set in ["test"]:
            for language_pair in dataset["language_pairs"]:
                line = language_pair.split("-")
                S_lang = line[0]
                T_lang = line[1]
                print "test_set", test_set + "_" + language_pair
                if opt.filter_src_rb is None or opt.filter_src_rb == dataset[
                        "src_language_mapping"][S_lang]:
                    translate.main([
                        "-task", opt.task, "-data", opt.data, "-model",
                        "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
                        str(opt.gpus[0]), "-output",
                        "%s/%s_%s_no_unk.txt" %
                        (exp_path, test_set, language_pair), "-verbose",
                        "-language_pair", language_pair, "-test_set", test_set,
                        "-bpe"
                    ])

                    evaluate_file.main([
                        "-task", opt.task, "-data", opt.data, "-outputs",
                        "%s/%s_%s_no_unk.txt" %
                        (exp_path, test_set, language_pair), "-language_pair",
                        language_pair, "-test_set", test_set
                    ])
                else:
                    print "BLEU  0.0, SARI   0.00, R1   0.00, R2   0.00, RL   0.00, FK_O   0.0, acc   0.00"
    else:
        for i in range(opt.num_rb_bin):
            translate.main([
                "-task", opt.task, "-data", opt.data, "-model",
                "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
                str(opt.gpus[0]), "-output",
                "%s/test_no_unk.txt" % exp_path, "-verbose", "-tgt_rb_all",
                str(i)
            ])
            evaluate_file.main([
                "-task", opt.task, "-data", opt.data, "-outputs",
                "%s/test_no_unk.txt" % exp_path, "-single_rb",
                str(i)
            ])
            print "all rb", i
        translate.main([
            "-task", opt.task, "-data", opt.data, "-model",
            "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
            str(opt.gpus[0]), "-output",
            "%s/test_no_unk.txt" % exp_path, "-verbose"
        ])
        evaluate_file.main([
            "-task", opt.task, "-data", opt.data, "-outputs",
            "%s/test_no_unk.txt" % exp_path
        ])
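
onmt.mixed_iterator above acts as a probabilistic switch over several bucket iterators (e.g. forward and backward data with probabilities [0.5, 0.5]). A rough, self-contained sketch of that idea, not the onmt implementation:

import random


class MixedIterator:
    """Yield batches by picking one of several iterators at random according
    to the given sampling probabilities, until all of them are exhausted."""

    def __init__(self, iterators, probabilities):
        assert len(iterators) == len(probabilities)
        self.iterators = iterators
        self.probabilities = probabilities

    def __iter__(self):
        active = [iter(it) for it in self.iterators]
        weights = list(self.probabilities)
        while active:
            idx = random.choices(range(len(active)), weights=weights)[0]
            try:
                yield next(active[idx])
            except StopIteration:
                # drop the exhausted iterator and its weight, keep sampling
                del active[idx]
                del weights[idx]
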
Example no. 17
0
def main():

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)

    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             opt.batch_size,
                             opt.gpus,
                             volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Loading Encoder Model ...')
    enc_check = torch.load(opt.encoder_model,
                           map_location=lambda storage, loc: storage)
    m_opt = enc_check['opt']
    src_dict = enc_check['dicts']['src']
    encoder = onmt.Models.Encoder(m_opt, src_dict)
    encoder.load_state_dict(enc_check['encoder'])

    print('Loading CNN Classifier Model ...')
    class_check = torch.load(opt.classifier_model,
                             map_location=lambda storage, loc: storage)
    class_opt = class_check['opt']
    class_dict = class_check['vocabulary']
    class_model = emoModel.EmoGRU(class_opt["vocab_inp_size"],
                                  class_opt["embedding_dim"],
                                  class_opt["units"], opt.batch_size,
                                  class_opt["target_size"])
    # class_model = onmt.CNNModels.ConvNet(class_opt, class_dict)
    class_model.load_state_dict(class_check['model'])

    print('Building model...')

    decoder = onmt.Models_decoder.Decoder(opt, dicts['tgt'])

    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())

    class_input = nn.Sequential(nn.Linear(opt.rnn_size, class_dict.size()))

    # build the decoder-only model first so the checkpoint branches below
    # have a target for load_state_dict
    model = onmt.Models_decoder.DecoderModel(decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        decoder.load_state_dict(checkpoint['decoder'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        encoder.cuda()
        model.cuda()
        class_model.cuda()
        generator.cuda()
        class_input.cuda()
    else:
        encoder.cpu()
        model.cpu()
        class_model.cpu()
        generator.cpu()
        class_input.cpu()

    if len(opt.gpus) > 1:
        encoder = nn.DataParallel(encoder, device_ids=opt.gpus, dim=1)
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
        class_input = nn.DataParallel(class_input, device_ids=opt.gpus, dim=0)

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        decoder.load_pretrained_vectors(opt)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    model.encoder = encoder
    model.generator = generator
    model.class_input = class_input
    model.class_model = class_model

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
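
Both pretrained checkpoints above are loaded with a map_location lambda so that tensors saved on GPU can be deserialized on a CPU-only machine; newer PyTorch accepts a plain string for the same purpose. The path below is illustrative.

import torch

# two equivalent ways to force checkpoint tensors onto the CPU at load time
checkpoint = torch.load('encoder_model.pt', map_location=lambda storage, loc: storage)
checkpoint = torch.load('encoder_model.pt', map_location='cpu')
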
Example no. 18
0
def main():

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)

    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']
        old_opt = checkpoint['opt']

    cur_opt = old_opt if dict_checkpoint else opt

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             opt.batch_size,
                             opt.gpus,
                             volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(cur_opt, dicts['src'])
    decoder = onmt.Models.DecoderWithMultiAttn(cur_opt, dicts['tgt'])
    # decoder = onmt.Models.Decoder(cur_opt, dicts['tgt'])

    generator = nn.Sequential(nn.Linear(cur_opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())

    if opt.sync_decode_emb:
        # tie the output projection (first layer of the Sequential) to the
        # decoder embedding weights
        generator[0].weight = decoder.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, cur_opt.attn_use_emb)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           momentum=opt.momentum,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)

    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim'][0]
        if opt.learning_rate != parser.get_default('learning_rate'):
            optim.setLearningRate(opt.learning_rate)
        if opt.start_decay_at > parser.get_default('start_decay_at'):
            optim.setStartDecay(opt.start_decay_at)
        if opt.optim != optim.method:
            print "Change optim method", optim.method, ' -> ', opt.optim
            optim.setMethod(opt.optim)
        print(optim)

    optim.set_parameters(model.parameters())

    if (opt.train_from or opt.train_from_state_dict) and \
            (opt.optim == old_opt.optim):
        # print old_opt.optim
        # print checkpoint['optim'][1]
        optim.optimizer.load_state_dict(checkpoint['optim'][1])

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
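
The checkpoint branch above only overrides the restored optimizer when a flag was actually passed on the command line, by comparing the parsed value with the argparse default. A minimal sketch of that check with a hypothetical parser and flag:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-learning_rate', type=float, default=1.0)
opt = parser.parse_args([])  # pretend no flags were given

# true only when the user supplied a value different from the default
if opt.learning_rate != parser.get_default('learning_rate'):
    print('overriding learning rate with', opt.learning_rate)
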
Example no. 19
0
    def _build_optimizer(self):

        optimizer = onmt.Optim(self.args)

        return optimizer
Example no. 20
0
def main():
    global opt
    print("Loading data from '%s'" % opt.data)

    train = torch.load(opt.data + '.train.pt')
    fields = onmt.IO.ONMTDataset.load_fields(
        torch.load(opt.data + '.vocab.pt'))
    valid = torch.load(opt.data + '.valid.pt')
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in train.examples[0].__dict__])
    train.fields = fields
    valid.fields = fields
    src_features = [fields["src_feat_"+str(j)]
                    for j in range(train.nfeatures)]
    model_opt = opt
    checkpoint = None
    dict_checkpoint = opt.train_from

    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        fields = onmt.IO.ONMTDataset.load_fields(checkpoint['vocab'])
        model_opt = checkpoint["opt"]

    print(' * vocabulary size. source = %d; target = %d' %
          (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' %
              (j, len(feat.vocab)))

    print(' * number of training sentences. %d' %
          len(train))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    model = onmt.Models.make_base_model(opt, model_opt, fields, checkpoint)
    print(model)

    if opt.train_from:
        print('Loading model from checkpoint at %s'
              % opt.train_from)
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpuid) > 1:
        print('Multi gpu training ', opt.gpuid)
        model = nn.DataParallel(model, device_ids=opt.gpuid, dim=1)
    #     generator = nn.DataParallel(generator, device_ids=opt.gpuid, dim=0)

    if not opt.train_from:
        if opt.param_init != 0.0:
            print('Initializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)

        model.encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        model.decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    if opt.train_from:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    optim.set_parameters(model.parameters())

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    check_model_path()

    train_model(model, train, valid, fields, optim)
Example no. 21
0
    def __init__(self,
                 model,
                 loss_function,
                 train_data,
                 valid_data,
                 dicts,
                 opt,
                 setup_optimizer=True):
        super().__init__(model, loss_function, train_data, valid_data, dicts,
                         opt)

        if opt.lfv_multilingual:
            from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
            lid_loss = CrossEntropyLIDLoss(opt.n_languages,
                                           opt.label_smoothing,
                                           opt.fast_xentropy)
            self.loss_function.add_loss_function(lid_loss, 'lid_loss')

        self.n_gpus = len(self.opt.gpus)

        if opt.ctc_loss != 0:
            from onmt.speech.ctc_loss import CTC
            self.ctc_loss_function = CTC(dicts['tgt'].size(),
                                         opt.model_size,
                                         0.0,
                                         reduce=True)

        if self.cuda:
            torch.cuda.set_device(self.opt.gpus[0])
            if self.opt.seed >= 0:
                torch.manual_seed(self.opt.seed)
            self.loss_function = self.loss_function.cuda()
            self.model = self.model.cuda()
            if opt.ctc_loss > 0.0:
                self.ctc_loss_function = self.ctc_loss_function.cuda()

        if setup_optimizer:

            self.optim = onmt.Optim(opt)
            self.optim.set_parameters(self.model.parameters())

            if not self.opt.fp16:
                opt_level = "O0"
                keep_batchnorm_fp32 = False
            elif self.opt.fp16_mixed:
                opt_level = "O1"
                keep_batchnorm_fp32 = None
            else:
                opt_level = "O2"
                keep_batchnorm_fp32 = False

            if self.cuda:
                self.model, self.optim.optimizer = amp.initialize(
                    self.model,
                    self.optim.optimizer,
                    opt_level=opt_level,
                    keep_batchnorm_fp32=keep_batchnorm_fp32,
                    loss_scale="dynamic",
                    verbosity=1 if self.opt.verbose else 0)
        # An ugly hack to switch between align right and align left
        if hasattr(self.model, 'relative'):
            if self.model.relative:
                self.train_data.src_align_right = True
                self.train_data.tgt_align_right = False
                self.valid_data.src_align_right = True
                self.valid_data.tgt_align_right = False
Example no. 22
0
    def __init__(self,
                 device,
                 train_data,
                 valid_data,
                 dicts,
                 opt,
                 setup_optimizer=True):
        """
        :param model:
        :param device: int (GPU id)
        :param loss_function:
        :param train_data:
        :param valid_data:
        :param dicts:
        :param opt:
        """

        # self.model = model
        # self.loss_function = loss_function
        self.device = device
        opt.node_rank = 0
        opt.nodes = 1
        self.world_size = len(opt.gpus)

        # in the case of single node distributed, it should equal self.device
        self.rank = self.device

        # make a group to later use with dist.all_reduce
        self.group = dist.group.WORLD

        self.print("[INFO] Training Options:", opt)
        dist.init_process_group(backend='nccl',
                                init_method='env://',
                                world_size=self.world_size,
                                rank=self.rank)

        self.model = None

        if self.rank == 0:
            self.train_data = train_data
            self.valid_data = valid_data
        else:
            self.train_data = copy.deepcopy(train_data)
            self.valid_data = copy.deepcopy(valid_data)

        self.dicts = dicts
        self.opt = opt
        self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)

        assert self.cuda, "[ERROR] Training is only available on GPUs."

        self.start_time = 0

        # setting up models and others
        if opt.lfv_multilingual:
            from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
            lid_loss = CrossEntropyLIDLoss(opt.n_languages,
                                           opt.label_smoothing,
                                           opt.fast_xentropy)
            self.loss_function.add_loss_function(lid_loss, 'lid_loss')

        torch.manual_seed(self.opt.seed)

        # note: we must start creating models after creating the processes;
        # for some reason passing a pre-created model to a process raises a "pickle" error
        if not opt.fusion:

            if self.is_main():
                print("BUILDING MODEL .... ", flush=True)
            model = build_model(opt, dicts)
            """ Building the loss function """
            if opt.ctc_loss != 0:
                loss_function = NMTAndCTCLossFunc(
                    dicts['tgt'].size(),
                    label_smoothing=opt.label_smoothing,
                    ctc_weight=opt.ctc_loss)
            elif opt.nce:
                from onmt.modules.nce.nce_loss import NCELoss
                loss_function = NCELoss(opt.model_size,
                                        dicts['tgt'].size(),
                                        noise_ratio=opt.nce_noise,
                                        logz=9,
                                        label_smoothing=opt.label_smoothing)
            else:
                loss_function = NMTLossFunc(
                    opt.model_size,
                    dicts['tgt'].size(),
                    label_smoothing=opt.label_smoothing,
                    mirror=opt.mirror_loss,
                    fast_xentropy=opt.fast_xentropy)

            # This function replaces modules with more optimized counterparts so that the model can run faster
            # Currently exp with LayerNorm
            if not opt.memory_profiling:
                # distributed is required to convert BatchNorm to SyncBatchNorm for DDP
                optimize_model(model, distributed=(self.world_size > 1))
                # optimize_model(model)

        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function

        if self.cuda:
            torch.cuda.set_device(self.device)

            self.loss_function = self.loss_function.cuda(device=self.device)
            self.model = self.model.cuda(device=self.device)

            # Ensure that the distributed copies have the same initial parameters
            # Manual seed may not work the same for different GPU models.
            if self.world_size > 1:
                params = [p for p in self.model.parameters()]

                with torch.no_grad():
                    if not self.is_main():
                        for p in params:
                            p.zero_()
                    else:
                        for p in params:
                            p.add_(0)

            if self.world_size > 1:
                params = [p for p in self.model.parameters()]
                all_reduce_and_rescale_tensors(params, 1)

        if setup_optimizer:

            self.optim = onmt.Optim(opt)
            self.optim.set_parameters(self.model.parameters())

            if self.is_main():
                print("[INFO] Optimizer: ", self.optim.optimizer)

            if not self.opt.fp16:
                opt_level = "O0"
                keep_batchnorm_fp32 = False
            elif self.opt.fp16_mixed:
                opt_level = "O1"
                keep_batchnorm_fp32 = None
            else:
                opt_level = "O2"
                keep_batchnorm_fp32 = False

            if self.cuda:
                self.model, self.optim.optimizer = amp.initialize(
                    self.model,
                    self.optim.optimizer,
                    opt_level=opt_level,
                    keep_batchnorm_fp32=keep_batchnorm_fp32,
                    loss_scale="dynamic",
                    verbosity=1 if self.opt.verbose else 0)

            # wrap the model into DDP after initializing by amp
            if self.world_size > 1:
                """
                delay_allreduce is required to avoid allreduce error during backward pass
                """
                self.model = DDP(self.model,
                                 delay_allreduce=True,
                                 gradient_average=False)

                # torch DDP is more likely to work with the official amp autocast
                # self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.rank],
                #                                                        output_device=self.rank,
                #                                                        find_unused_parameters=True)

        print("[INFO] Process %d ready." % self.rank, flush=True)
Example no. 23
0
def main():

	print("Loading data from '%s'" % opt.data)

	dataset = torch.load(opt.data)

	dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
	if dict_checkpoint:
		print('Loading dicts from checkpoint at %s' % dict_checkpoint)
		checkpoint = torch.load(dict_checkpoint, map_location=lambda storage, loc: storage)
		dataset['dicts'] = checkpoint['dicts']

	dicts = dataset['dicts']

	trainData = onmt.Dataset(dataset['train']['src'],
							 dataset['train']['tgt'], opt.batch_size, opt.gpus,
							 sample_size=opt.sample_vocab,
							 tgtVocab_size=dicts['tgt'].size())
	validData = onmt.Dataset(dataset['valid']['src'],
							 dataset['valid']['tgt'], opt.batch_size, opt.gpus,
							 volatile=True)


	print(' * vocabulary size. source = %d; target = %d' %
		  (dicts['src'].size(), dicts['tgt'].size()))
	print(' * number of training sentences. %d' %
		  len(dataset['train']['src']))
	print(' * maximum batch size. %d' % opt.batch_size)

	print('Building model...')

	encoder = onmt.Models.Encoder(opt, dicts['src'])
	decoder = onmt.Models.Decoder(opt, dicts['tgt'])

	generator = onmt.Generator(opt, dicts['tgt'].size(), decoder.word_lut, desc=opt.desc)

	model = onmt.Models.NMTModel(encoder, decoder)

	if opt.train_from:
		print('Loading model from checkpoint at %s' % opt.train_from)
		chk_model = checkpoint['model']
		generator_state_dict = chk_model.generator.state_dict()
		model_state_dict = {k: v for k, v in chk_model.state_dict().items() if 'generator' not in k}
		model.load_state_dict(model_state_dict)
		generator.load_state_dict(generator_state_dict)
		opt.start_epoch = checkpoint['epoch'] + 1

	if opt.train_from_state_dict:
		print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
		model.load_state_dict(checkpoint['model'])
		generator.load_state_dict(checkpoint['generator'])
		opt.start_epoch = checkpoint['epoch'] + 1

	if len(opt.gpus) >= 1:
		model.cuda()
		generator.cuda()
	else:
		model.cpu()
		generator.cpu()

	if len(opt.gpus) > 1:
		model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
		generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

	model.generator = generator

	if not opt.train_from_state_dict and not opt.train_from:
		for p in model.parameters():
			p.data.uniform_(-opt.param_init, opt.param_init)

		encoder.load_pretrained_vectors(opt)
		decoder.load_pretrained_vectors(opt)

		optim = onmt.Optim(
			opt.optim, opt.learning_rate, opt.max_grad_norm,
			lr_decay=opt.learning_rate_decay,
			start_decay_at=opt.start_decay_at
		)
	else:
		print('Loading optimizer from checkpoint:')
		optim = checkpoint['optim']
		print(optim)


	if opt.train_from or opt.train_from_state_dict:
		optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())

	optim.set_parameters(model.parameters())

	nParams = sum([p.nelement() for p in model.parameters()])
	if opt.generator in ['simple','tie']:
		print('* number of parameters: %d' % nParams)
	else:
		linshape = model.generator.linear.weight.shape
		not_used = linshape[0]*linshape[1]
		nParams = nParams - not_used
		print('* number of parameters: %d' % nParams)

	trainModel(model, trainData, validData, dataset, optim)
Example no. 24
0
if len(opt.gpus) >= 1:
    model.cuda()
    vocab_dist_gen.cuda()
    final_dist_gen.cuda()

model.vocab_dist_gen = vocab_dist_gen
model.final_dist_gen = final_dist_gen

if not opt.train_from_state_dict and not opt.train_from:
    for p in model.parameters():
        p.data.uniform_(-opt.param_init, opt.param_init)

    # load pretrained vectors and build the optimizer once, after the
    # parameter initialization loop (not once per parameter)
    encoder.load_pretrained_vectors(opt)
    decoder.load_pretrained_vectors(opt)
    optim = onmt.Optim(opt.optim,
                       opt.learning_rate,
                       opt.max_grad_norm,
                       lr_decay=opt.learning_rate_decay,
                       start_decay_at=opt.start_decay_at)

optim.set_parameters(model.parameters())
nParams = sum([p.nelement() for p in model.parameters()])
print('* number of parameters: %d' % nParams)

#trainModel(model, trainData, validData, dataset, optim)
criterion = NMTCriterion(dataset['dicts']['tgt'].size())

# shuffle mini batch order
#batchOrder = torch.randperm(len(trainData))
total_loss, total_words, total_num_correct = 0, 0, 0
report_loss, report_tgt_words, report_src_words, report_num_correct = 0, 0, 0, 0
batchIdx = 1500
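
NMTCriterion is not defined in this fragment; in the classic OpenNMT-py code it is essentially a summed NLLLoss whose weight vector zeroes out the padding token, roughly as sketched below (the PAD index here is an assumption).

import torch
import torch.nn as nn

PAD = 0  # assumed padding index (onmt.Constants.PAD in OpenNMT-py)


def NMTCriterion(vocab_size):
    """Summed negative log-likelihood over the target vocabulary that ignores
    padding tokens; in newer PyTorch use reduction='sum' instead."""
    weight = torch.ones(vocab_size)
    weight[PAD] = 0
    return nn.NLLLoss(weight, size_average=False)
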
Example no. 25
0
def main():
    # Set up the Crayon logging server.
    if opt.log_server != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.log_server)

        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.experiment_name in experiments:
            cc.remove_experiment(opt.experiment_name)
        opt.experiment_name = cc.create_experiment(opt.experiment_name)

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        #dataset['dicts'] = checkpoint['dicts']

    if opt.redis:
        trainData = onmt.RedisDataset("train", opt.batch_size, False, reverse=opt.reverse, port=opt.port, db=opt.db,
                                      r2l=opt.r2l)
        validData = onmt.RedisDataset('valid', opt.batch_size, False, volatile=True, reverse=opt.reverse, port=opt.port,
                                      r2l=opt.r2l, db=opt.db)
    else:
        trainData = onmt.Dataset(dataset['train']['src'],
                             dataset['train']['tgt'], opt.batch_size, False,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['train'].get('src_features'),
                             tgtFeatures=dataset['train'].get('tgt_features'),
                             alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'], opt.batch_size, False,
                             volatile=True,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['valid'].get('src_features'),
                             tgtFeatures=dataset['valid'].get('tgt_features'),
                             alignment=dataset['valid'].get('alignments'))

    dicts = dataset['dicts']
    if opt.reverse:
        dicts['src'], dicts['tgt'] = dicts['tgt'], dicts['src']
        dicts['src_features'], dicts['tgt_features'] = dicts['tgt_features'], dicts['src_features']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    #if 'src_features' in dicts:
    #    for j in range(len(dicts['src_features'])):
    #        print(' * src feature %d size = %d' %
    #              (j, dicts['src_features'][j].size()))

    #print(' * number of training sentences. %d' %
          #len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert("type" not in dataset or dataset["type"] == "img")
    else:
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(
            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
            nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    model.cpu()
    generator.cpu()

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Initializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)


    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    print('Multi gpu training ', opt.gpus)
    trainer = MultiprocessingTrainer(opt, model, optim, device_ids=opt.gpus)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    trainModel(trainer, trainData, validData, dataset)
Example no. 26
0
        report_stats.output(epoch, batch + 1, num_batches, start_time)
        report_stats = onmt.Statistics()
    return report_stats

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='default.cfg')
    args, extra_args = argparser.parse_known_args()
    opt = Configurable(args.config_file, extra_args)

    model = ADVModel(opt)
    optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)
    optim.set_parameters(model.named_parameters())
    tgt_vocab = Vocab(opt.tgt_vocab)
    loss_compute = onmt.Loss.NMTLossCompute(model.generator, tgt_vocab).cuda()
    trainer = onmt.Trainer(model, loss_compute, loss_compute, optim)
    train_set = Data_Loader(opt.train_file, opt.batch_size)
    valid_set = Data_Loader(opt.dev_file, opt.batch_size)
    for epoch in xrange(opt.max_epoch):
        train_stats = trainer.train(train_set, epoch, report_func)
        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())
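
train_stats here is an onmt.Statistics object; it derives perplexity and accuracy from accumulated loss and token counts, roughly as in the simplified stand-in below (not the library class).

import math


class SimpleStatistics:
    """Accumulate loss, token and correct-prediction counts, then report
    perplexity and accuracy the way onmt.Statistics does."""

    def __init__(self, loss=0.0, n_words=0, n_correct=0):
        self.loss = loss
        self.n_words = n_words
        self.n_correct = n_correct

    def update(self, other):
        self.loss += other.loss
        self.n_words += other.n_words
        self.n_correct += other.n_correct

    def ppl(self):
        # exp of the per-token loss, capped to avoid overflow early in training
        return math.exp(min(self.loss / max(self.n_words, 1), 100))

    def accuracy(self):
        return 100.0 * self.n_correct / max(self.n_words, 1)
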
Example no. 27
0
def build_optim(model, checkpoint):
    saved_optimizer_state_dict = None

    if opt.train_from and opt.train_part is None:  #!= "context":
        print('Loading optimizer from checkpoint.')
        optim = checkpoint['optim']
        # We need to save a copy of optim.optimizer.state_dict() for setting
        # the optimizer state later on in Stage 2 of this method, since
        # optim.set_parameters(model.parameters()) will overwrite
        # optim.optimizer, and with it the values stored in
        # optim.optimizer.state_dict().
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        print('Making optimizer for training.')
        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at,
                           beta1=opt.adam_beta1,
                           beta2=opt.adam_beta2,
                           adagrad_accum=opt.adagrad_accumulator_init,
                           decay_method=opt.decay_method,
                           warmup_steps=opt.warmup_steps,
                           model_size=opt.rnn_size)

    # Stage 1:
    # Essentially optim.set_parameters (re-)creates an optimizer using
    # model.parameters() as the parameters that will be stored in the
    # optim.optimizer.param_groups field of the torch optimizer class.
    # Importantly, this method does not yet load the optimizer state, as
    # essentially it builds a new optimizer with empty optimizer state and
    # parameters from the model.
    optim.set_parameters(model.named_parameters())
    print("Stage 1: Keys after executing optim.set_parameters" +
          "(model.parameters())")
    show_optimizer_state(optim)

    if opt.train_from and opt.train_part is None:  # != "context":
        # Stage 2: In this stage, which is only performed when loading an
        # optimizer from a checkpoint, we load the saved_optimizer_state_dict
        # into the re-created optimizer, to set the optim.optimizer.state
        # field, which was previously empty. For this, we use the optimizer
        # state saved in the "saved_optimizer_state_dict" variable for
        # this purpose.
        # See also: https://github.com/pytorch/pytorch/issues/2830
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Convert back the state values to cuda type if applicable
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        print(
            "Stage 2: Keys after executing  optim.optimizer.load_state_dict" +
            "(saved_optimizer_state_dict)")
        show_optimizer_state(optim)

        # We want to make sure that indeed we have a non-empty optimizer state
        # when we loaded an existing model. This should be at least the case
        # for Adam, which saves "exp_avg" and "exp_avg_sq" state
        # (Exponential moving average of gradient and squared gradient values)
        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
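
The two-stage comments above describe a pattern that applies equally to plain torch.optim optimizers: snapshot or obtain the saved state dict, rebuild the optimizer over the model's parameters, reload the state, and move its tensors next to the (possibly CUDA) parameters. A condensed sketch under those assumptions; the checkpoint key is illustrative.

import torch


def rebuild_optimizer(model, checkpoint=None, lr=1.0):
    """Create a fresh Adam optimizer over model.parameters(), then restore a
    previously saved optimizer state (e.g. Adam's exp_avg / exp_avg_sq)."""
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    if checkpoint is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        # relocate restored state tensors onto the parameters' device
        device = next(model.parameters()).device
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)
    return optimizer
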
Example no. 28
0
def main():
    print("Loading data from '%s'" % opt.data)
    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)

    if opt.data_type == 'h5':
        alignments = torch.load(opt.data_alignment)
        #alignments = None
        dicts = torch.load(opt.dict)['dicts']
        dataset = h5py.File(opt.data)
        trainData = onmt.Dataset_h5(
            dataset,
            'train',
            opt.batch_size,
            opt.gpus,
            data_type="text",
            srcFeatures=None,
            tgtFeatures=None,
            alignment=alignments['train'] if alignments else None)
        validData = onmt.Dataset_h5(
            dataset,
            'valid',
            opt.batch_size,
            opt.gpus,
            volatile=True,
            data_type="text",
            srcFeatures=None,
            tgtFeatures=None,
            alignment=alignments['valid'] if alignments else None)
        print(' ***************************************************')
        print(' *** vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' *** number of training sentences. %d' %
              dataset['train_src_label'].shape[0])
        print(' *** maximum batch size. %d' % opt.batch_size)
        print(' *** number of batches. %d' % len(trainData))
    else:
        dataset = torch.load(opt.data,
                             map_location=lambda storage, loc: storage)
        if dict_checkpoint:
            print('Loading dicts from checkpoint at %s' % dict_checkpoint)
            checkpoint = torch.load(dict_checkpoint,
                                    map_location=lambda storage, loc: storage)
            dataset['dicts'] = checkpoint['dicts']
        trainData = onmt.Dataset(
            dataset['train']['src'],
            dataset['train']['tgt'],
            opt.batch_size,
            opt.gpus,
            data_type=dataset.get("type", "text"),
            srcFeatures=dataset['train'].get('src_features'),
            tgtFeatures=dataset['train'].get('tgt_features'),
            alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(
            dataset['valid']['src'],
            dataset['valid']['tgt'],
            opt.batch_size,
            opt.gpus,
            volatile=True,
            data_type=dataset.get("type", "text"),
            srcFeatures=dataset['valid'].get('src_features'),
            tgtFeatures=dataset['valid'].get('tgt_features'),
            alignment=dataset['valid'].get('alignments'))
        dicts = dataset['dicts']
        print(' ***************************************************')
        print(' *** vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' *** number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' *** maximum batch size. %d' % opt.batch_size)

    if 'src_features' in dicts:
        for j in range(len(dicts['src_features'])):
            print(' * src feature %d size = %d' %
                  (j, dicts['src_features'][j].size()))

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert ("type" not in dataset or dataset["type"] == "img")
    else:
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        model_state_dict = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        print('Multi-GPU training on devices', opt.gpus)
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Initializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
                # if p.data.dim()>1:
                # init.xavier_uniform(p.data)
        # else:
        #     init.constant(p.data, 0.0)

        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(opt.optim,
                           opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at,
                           opt=opt)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.guided_fertility:
        print('Getting fertilities from external alignments...')
        fert_dict = evaluation.get_fert_dict(opt.guided_fertility,
                                             opt.guided_fertility_source_file,
                                             dicts["src"])
    else:
        fert_dict = None

    if opt.supervised_fertility:
        print("Retrieving fertilities for all training sentences...")
        fert_sents = evaluation.get_fertility(
            opt.supervised_fertility, opt.supervised_fertility_source_file,
            dicts["src"])
    else:
        fert_sents = None

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    trainModel(model, trainData, validData, dataset, dicts, optim, fert_dict,
               fert_sents)
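
The non-h5 branch above passes map_location=lambda storage, loc: storage to torch.load, which keeps every tensor on the CPU so that a checkpoint written on a GPU machine can be reloaded on a CPU-only host. A minimal standalone illustration follows; the file name is a hypothetical placeholder.

import torch

# 'model_checkpoint.pt' is a placeholder path used only for illustration.
# The lambda returns the deserialized storage unchanged, i.e. on the CPU.
checkpoint = torch.load('model_checkpoint.pt',
                        map_location=lambda storage, loc: storage)

# Equivalent, more readable spelling in current PyTorch:
# checkpoint = torch.load('model_checkpoint.pt', map_location='cpu')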
Example no. 29
    def __init__(self,
                 model,
                 lat_dis,
                 loss_function,
                 train_data,
                 valid_data,
                 dicts,
                 opt,
                 setup_optimizer=True):

        self.train_data = train_data
        self.valid_data = valid_data

        self.dicts = dicts
        self.opt = opt
        self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)

        self.start_time = 0
        self.n_gpus = len(self.opt.gpus)

        self.loss_function_ae, self.loss_lat_dis = loss_function
        self.model_ae = model
        self.lat_dis = lat_dis

        if self.cuda:
            torch.cuda.set_device(self.opt.gpus[0])
            if self.opt.seed >= 0:
                torch.manual_seed(self.opt.seed)
            self.loss_function_ae = self.loss_function_ae.cuda()
            self.model_ae = self.model_ae.cuda()
            self.lat_dis = self.lat_dis.cuda()
            self.loss_lat_dis = self.loss_lat_dis.cuda()
        if setup_optimizer:

            self.optim_ae = onmt.Optim(opt)
            self.optim_ae.set_parameters(self.model_ae.parameters())

            lat_opt = copy.deepcopy(opt)
            lat_opt.beta1 = 0.5
            # lat_opt.learning_rate = 0.0002
            # lat_opt.update_method = 'regular'
            self.optim_lat_dis = onmt.Optim(lat_opt)
            self.optim_lat_dis.set_parameters(self.lat_dis.parameters())

            if not self.opt.fp16:
                opt_level = "O0"
                keep_batchnorm_fp32 = False
            elif self.opt.fp16_mixed:
                opt_level = "O1"
                keep_batchnorm_fp32 = None
            else:
                opt_level = "O2"
                keep_batchnorm_fp32 = False

            if self.cuda:
                # print(234)
                self.model_ae, self.optim_ae.optimizer = amp.initialize(
                    self.model_ae,
                    self.optim_ae.optimizer,
                    opt_level=opt_level,
                    keep_batchnorm_fp32=keep_batchnorm_fp32,
                    loss_scale="dynamic",
                    verbosity=1 if self.opt.verbose else 0)

                self.lat_dis, self.optim_lat_dis.optimizer = amp.initialize(
                    self.lat_dis,
                    self.optim_lat_dis.optimizer,
                    opt_level=opt_level,
                    keep_batchnorm_fp32=keep_batchnorm_fp32,
                    loss_scale="dynamic",
                    verbosity=1 if self.opt.verbose else 0)
Example no. 30
args.cuda = not args.no_cuda and torch.cuda.is_available()
with open(args.vocab_file) as f:
    args.vocab_size = len(f.readlines())

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

model = MultiChoiceQAModel(args)

if args.cuda:
    model = model.cuda()
optimizer = onmt.Optim(args.optim,
                       args.lr,
                       args.max_grad_norm,
                       lr_decay=args.lr_decay,
                       start_decay_at=args.start_decay_at)
optimizer.set_parameters(model.parameters())

datasets = []
for f in glob.glob(
        "/data/users/iLikeNLP/AIContest/ChatBotCourse/subtitle/preprocess/chinese/*.srt"
):
    datasets.append(GenLCData(f, args.vocab_file))
train_loader = ConcatData(datasets)
#train_loader = LocallyShuffleData(train_loader, args.batch_size*100)
train_loader = BatchData(train_loader, args.batch_size)
train_data = train_loader.get_data()
valid_loader = GenLCData(args.valid_file, args.vocab_file)
valid_loader = BatchData(valid_loader, args.batch_size)