Example no. 1
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None
        self.beta = opt.beta
        self.alpha = opt.alpha
        self.start_with_bos = opt.start_with_bos
        self.fp16 = opt.fp16

        self.models = list()
        self.model_types = list()

        # opt.model is a '|'-delimited string of checkpoint paths
        models = opt.model.split("|")

        print(models)
        self.n_models = len(models)
        self._type = 'text'

        for i, model in enumerate(models):
            if opt.verbose:
                print('Loading model from %s' % model)
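            # map_location=lambda storage, loc: storage deserializes every tensor onto the CPU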
            checkpoint = torch.load(model,
                                    map_location=lambda storage, loc: storage)

            model_opt = checkpoint['opt']

            if i == 0:
                self.src_dict = checkpoint['dicts']['src']
                self.tgt_dict = checkpoint['dicts']['tgt']

            # Build model from the saved option
            model = build_model(model_opt, checkpoint['dicts'])

            model.load_state_dict(checkpoint['model'])

            if model_opt.model in model_list:
                if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
                    print("Not enough len to decode. Renewing .. ")
                    model.decoder.renew_buffer(self.opt.max_sent_length)

            if opt.fp16:
                model = model.half()

            if opt.cuda:
                model = model.cuda()
            else:
                model = model.cpu()

            model.eval()

            self.models.append(model)
            self.model_types.append(model_opt.model)

        self.cuda = opt.cuda
        self.ensemble_op = opt.ensemble_op

        if opt.verbose:
            print('Done')
Example no. 2
def custom_build_model(opt, dict, lm=False):

    if not lm:
        model = build_model(opt, dict)
    else:
        model = build_language_model(opt, dict)

    return model


# Variant that additionally registers a scalar mix over the BERT layers:
def custom_build_model(opt, dict, lm=False):

    if not lm:
        model = build_model(opt, dict)
        # blend the hidden states of all BERT layers with learned scalar weights
        scalar_mix = ScalarMix(
            onmt.Constants.BERT_LAYERS,
            do_layer_norm=False,
            initial_scalar_parameters=None,
            trainable=True,
        )
        model.add_module("scalar_mix", scalar_mix)
    else:
        model = build_language_model(opt, dict)

    return model
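
A quick usage sketch of the scalar-mix variant above (a hedged sketch assuming an AllenNLP-style ScalarMix and a 13-layer BERT stack; the layer count and tensor shapes are illustrative, not taken from this project):

import torch
from allennlp.modules.scalar_mix import ScalarMix

num_layers = 13  # assumption: BERT base = embedding output + 12 encoder layers
mix = ScalarMix(num_layers, do_layer_norm=False, trainable=True)
layers = [torch.randn(2, 5, 768) for _ in range(num_layers)]  # one (batch, seq, hidden) tensor per layer
mixed = mix(layers)  # gamma * sum_k softmax(w)_k * layers[k] -> shape (2, 5, 768)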
Example no. 4
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None
        self.beta = opt.beta
        self.alpha = opt.alpha
        self.start_with_bos = opt.start_with_bos
        
        if opt.verbose:
            print('Loading model from %s' % opt.model)
        checkpoint = torch.load(opt.model,
                               map_location=lambda storage, loc: storage)

        model_opt = checkpoint['opt']
        self.src_dict = checkpoint['dicts']['src']
        self.tgt_dict = checkpoint['dicts']['tgt']
        self._type = model_opt.encoder_type \
            if "encoder_type" in model_opt else "text"

        # Build model from the saved option
        model = build_model(model_opt, checkpoint['dicts'])
        
        model.load_state_dict(checkpoint['model'])
        
        model.eval()
        
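        # the positional encoder pre-computes encodings up to len_max; extend the buffer for longer decodes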
        if model_opt.model == 'transformer':
            if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
                print("Not enough len to decode. Renewing .. ")    
                model.decoder.renew_buffer(self.opt.max_sent_length)
           
        if opt.cuda:
            model.cuda()
        else:
            model.cpu()
        
        self.cuda = opt.cuda
            
        self.model_type = model_opt.model


        self.model = model
        self.model.eval()
        
        if opt.verbose:
            print('Done')
Example no. 5
def main():

    if opt.data_format == 'raw':
        start = time.time()
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse )

        train_data = onmt.Dataset(dataset['train']['src'],
                                  dataset['train']['tgt'], opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  reshape_speech=opt.reshape_speech,
                                  augment=opt.augment_speech)
        valid_data = onmt.Dataset(dataset['valid']['src'],
                                  dataset['valid']['tgt'], opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  reshape_speech=opt.reshape_speech)

        dicts = dataset['dicts']
        if "src" in dicts:
            print(' * vocabulary size. source = %d; target = %d' %
                  (dicts['src'].size(), dicts['tgt'].size()))
        else:
            print(' * vocabulary size. target = %d' %
                  (dicts['tgt'].size()))

        print(' * number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)

    elif opt.data_format == 'bin':

        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

        train_data = onmt.Dataset(train_src,
                                  train_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')

        valid_data = onmt.Dataset(valid_src,
                                  valid_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents)

    else:
        raise NotImplementedError

    print('Building model...')

    if not opt.fusion:
        model = build_model(opt, dicts)

        """ Building the loss function """
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                              label_smoothing=opt.label_smoothing,
                                              ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(), label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)

        loss_function = FusionLoss(dicts['tgt'].size(), label_smoothing=opt.label_smoothing)


    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested and potential bugs can happen.")
    else:
        if opt.fp16:
            trainer = FP16XETrainer(model, loss_function, train_data, valid_data, dicts, opt)
        else:
            trainer = XETrainer(model, loss_function, train_data, valid_data, dicts, opt)

    
    trainer.run(save_file=opt.load_from)
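
For reference, a minimal sketch of the label-smoothing idea behind the NMT loss above (standard formulation; the actual internals of NMTLossFunc are not shown in these examples):

import torch
import torch.nn.functional as F

def smoothed_nll_loss(logits, target, eps=0.1):
    # blend the one-hot target with a uniform distribution over the vocabulary
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)
    return ((1.0 - eps) * nll + eps * uniform).sum()

# example: 4 target positions over a 10-word vocabulary
loss = smoothed_nll_loss(torch.randn(4, 10), torch.tensor([1, 2, 3, 4]))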
Example no. 6
def main():
    if opt.data_format == 'raw':
        start = time.time()
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(
                opt.data)  # This requires a lot of cpu memory!
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        train_data = onmt.Dataset(dataset['train']['src'],
                                  dataset['train']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  reshape_speech=opt.reshape_speech,
                                  augment=opt.augment_speech)
        valid_data = onmt.Dataset(dataset['valid']['src'],
                                  dataset['valid']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  reshape_speech=opt.reshape_speech)

        dicts = dataset['dicts']

        print(' * number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' %
              opt.batch_size_words)

    elif opt.data_format == 'bin':

        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

        train_data = onmt.Dataset(train_src,
                                  train_tgt,
                                  opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')

        valid_data = onmt.Dataset(valid_src,
                                  valid_tgt,
                                  opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents)

    else:
        raise NotImplementedError

    additional_data = []
    if opt.additional_data != "none":
        add_data = opt.additional_data.split(";")
        add_format = opt.additional_data_format.split(";")
        assert len(add_data) == len(add_format)
        for i in range(len(add_data)):
            if add_format[i] == 'raw':
                if add_data[i].endswith(".train.pt"):
                    print("Loading data from '%s'" % add_data[i])
                    add_dataset = torch.load(add_data[i])
                else:
                    print("Loading data from %s" % add_data[i] + ".train.pt")
                    add_dataset = torch.load(add_data[i] + ".train.pt")

                additional_data.append(
                    onmt.Dataset(add_dataset['train']['src'],
                                 add_dataset['train']['tgt'],
                                 opt.batch_size_words,
                                 data_type=add_dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier,
                                 reshape_speech=opt.reshape_speech,
                                 augment=opt.augment_speech))
                add_dicts = add_dataset['dicts']

                for d in ['src', 'tgt']:
                    if d in dicts:
                        if d in add_dicts:
                            assert dicts[d].size() == add_dicts[d].size()
                    else:
                        if d in add_dicts:
                            dicts[d] = add_dicts[d]

            elif add_format[i] == 'bin':

                from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

                train_path = add_data[i] + '.train'
                train_src = IndexedInMemoryDataset(train_path + '.src')
                train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

                additional_data.append(
                    onmt.Dataset(train_src,
                                 train_tgt,
                                 opt.batch_size_words,
                                 data_type=opt.encoder_type,
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier))

    # Restore from checkpoint
    if opt.load_from:
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
        print("* Loading dictionaries from the checkpoint")
        dicts = checkpoint['dicts']
    else:
        dicts['tgt'].patch(opt.patch_vocab_multiplier)
        checkpoint = None

    if "src" in dicts:
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
    else:
        print(' * vocabulary size. target = %d' % (dicts['tgt'].size()))

    print('Building model...')

    if not opt.fusion:
        model = build_model(opt, dicts)
        """ Building the loss function """
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(
                dicts['tgt'].size(),
                label_smoothing=opt.label_smoothing,
                ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)

        loss_function = FusionLoss(dicts['tgt'].size(),
                                   label_smoothing=opt.label_smoothing)

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError(
            "Warning! Multi-GPU training is not fully tested and potential bugs can happen."
        )
    else:
        # if opt.fp16:
        #     trainer = FP16XETrainer(model, loss_function, train_data, valid_data, dicts, opt)
        # else:
        trainer = XETrainer(model, loss_function, train_data, valid_data,
                            dicts, opt)
        if len(additional_data) > 0:
            trainer.add_additional_data(additional_data, opt.data_ratio)

    trainer.run(checkpoint=checkpoint)
Example no. 7
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None
        self.beta = opt.beta
        self.alpha = opt.alpha
        self.start_with_bos = opt.start_with_bos
        self.fp16 = opt.fp16
        self.attributes = opt.attributes  # attributes split by |. for example: de|domain1
        self.bos_token = opt.bos_token
        self.sampling = opt.sampling

        if self.attributes:
            self.attributes = self.attributes.split("|")

        self.models = list()
        self.model_types = list()

        # opt.model is a '|'-delimited string of checkpoint paths
        models = opt.model.split("|")

        print(models)
        self.n_models = len(models)
        self._type = 'text'

        for i, model in enumerate(models):
            if opt.verbose:
                print('Loading model from %s' % model)
            checkpoint = torch.load(model,
                                    map_location=lambda storage, loc: storage)

            model_opt = checkpoint['opt']

            if i == 0:
                if "src" in checkpoint['dicts']:
                    self.src_dict = checkpoint['dicts']['src']
                else:
                    self._type = "audio"
                self.tgt_dict = checkpoint['dicts']['tgt']

                if "atb" in checkpoint["dicts"]:
                    self.atb_dict = checkpoint['dicts']['atb']

                else:
                    self.atb_dict = None

                self.bos_id = self.tgt_dict.labelToIdx[self.bos_token]

            # Build model from the saved option
            # if hasattr(model_opt, 'fusion') and model_opt.fusion == True:
            #     print("* Loading a FUSION model")
            #     model = build_fusion(model_opt, checkpoint['dicts'])
            # else:
            #     model = build_model(model_opt, checkpoint['dicts'])
            model = build_model(model_opt, checkpoint['dicts'])
            model.load_state_dict(checkpoint['model'])

            if model_opt.model in model_list:
                # if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
                #     print("Not enough len to decode. Renewing .. ")
                #     model.decoder.renew_buffer(self.opt.max_sent_length)
                model.renew_buffer(self.opt.max_sent_length)

            if opt.fp16:
                model = model.half()

            if opt.cuda:
                model = model.cuda()
            else:
                model = model.cpu()

            model.eval()

            self.models.append(model)
            self.model_types.append(model_opt.model)

        # language model
        if opt.lm is not None:
            if opt.verbose:
                print('Loading language model from %s' % opt.lm)

            lm_chkpoint = torch.load(opt.lm, map_location=lambda storage, loc: storage)

            lm_opt = lm_chkpoint['opt']

            lm_model = build_language_model(lm_opt, lm_chkpoint['dicts'])  # use the LM checkpoint's own dicts

            if opt.fp16:
                lm_model = lm_model.half()

            if opt.cuda:
                lm_model = lm_model.cuda()
            else:
                lm_model = lm_model.cpu()

            self.lm_model = lm_model

        self.cuda = opt.cuda
        self.ensemble_op = opt.ensemble_op

        if opt.autoencoder is not None:
            if opt.verbose:
                print('Loading autoencoder from %s' % opt.autoencoder)
            checkpoint = torch.load(opt.autoencoder,
                                    map_location=lambda storage, loc: storage)
            model_opt = checkpoint['opt']

            # posSize= checkpoint['autoencoder']['nmt.decoder.positional_encoder.pos_emb'].size(0)
            # self.models[0].decoder.renew_buffer(posSize)
            # self.models[0].decoder.renew_buffer(posSize)

            # Build model from the saved option
            self.autoencoder = Autoencoder(self.models[0], model_opt)

            self.autoencoder.load_state_dict(checkpoint['autoencoder'])

            if opt.cuda:
                self.autoencoder = self.autoencoder.cuda()
                self.models[0] = self.models[0].cuda()
            else:
                self.autoencoder = self.autoencoder.cpu()
                self.models[0] = self.models[0].cpu()

            self.models[0].autoencoder = self.autoencoder

        if opt.verbose:
            print('Done')
Example no. 8
def main():

    start = time.time()
    print("Loading data from '%s'" % opt.data)

    if opt.data_format == 'raw':
        dataset = torch.load(opt.data)
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        trainData = onmt.Dataset(dataset['train']['src'],
                                 dataset['train']['tgt'],
                                 opt.batch_size_words,
                                 opt.gpus,
                                 max_seq_num=opt.batch_size_sents,
                                 pad_count=opt.pad_count,
                                 multiplier=opt.batch_size_multiplier,
                                 sort_by_target=opt.sort_by_target)
        validData = onmt.Dataset(dataset['valid']['src'],
                                 dataset['valid']['tgt'],
                                 opt.batch_size_words,
                                 opt.gpus,
                                 max_seq_num=opt.batch_size_sents)

        dicts = dataset['dicts']
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' * number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' %
              opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

        trainData = onmt.Dataset(train_src,
                                 train_tgt,
                                 opt.batch_size_words,
                                 opt.gpus,
                                 max_seq_num=opt.batch_size_sents,
                                 pad_count=opt.pad_count,
                                 multiplier=opt.batch_size_multiplier,
                                 sort_by_target=opt.sort_by_target)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')

        validData = onmt.Dataset(valid_src,
                                 valid_tgt,
                                 opt.batch_size_words,
                                 opt.gpus,
                                 max_seq_num=opt.batch_size_sents)

    else:
        raise NotImplementedError

    print('Building model...')
    model = build_model(opt, dicts)
    """ Building the loss function """
    loss_function = NMTLossFunc(dicts['tgt'].size(),
                                label_smoothing=opt.label_smoothing)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    optim = None

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError(
            "Warning! Multi-GPU training is not fully tested and potential bugs can happen."
        )
    else:
        if opt.fp16:
            trainer = FP16XETrainer(model, loss_function, trainData, validData,
                                    dicts, opt)
        else:
            trainer = XETrainer(model, loss_function, trainData, validData,
                                dicts, opt)

    trainer.run(save_file=opt.load_from)
Example no. 9
def main():

    start = time.time()
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
    print("Done after %s" % elapse)

    trainData = onmt.Dataset(dataset['train']['src'],
                             dataset['train']['tgt'],
                             opt.batch_size_words,
                             opt.gpus,
                             data_type=dataset.get("type", "text"),
                             max_seq_num=opt.batch_size_sents)
    validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'],
                             opt.batch_size_words,
                             opt.gpus,
                             volatile=True,
                             data_type=dataset.get("type", "text"),
                             max_seq_num=opt.batch_size_sents)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)

    print('Building model...')
    model = build_model(opt, dicts)
    """ Building the loss function """
    loss_function = NMTLossFunc(dataset['dicts']['tgt'].size(),
                                label_smoothing=opt.label_smoothing,
                                shard_size=opt.max_generator_batches)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    optim = None

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        trainer = MultiGPUXETrainer(model, loss_function, trainData, validData,
                                    dataset, opt)
        print(
            "Warning! Multi-GPU training is used. Not fully tested and potential bugs can happen."
        )
    else:
        trainer = XETrainer(model, loss_function, trainData, validData,
                            dataset, opt)

    trainer.run(save_file=opt.load_from)
Example no. 10
def main():
    
    dataset = dict()
    
    print("Loading dicts from '%s'" % opt.data + "/dicts_info.pt")
    dataset['dicts'] = torch.load(opt.data + "/dicts_info.pt")
    
    pairIDs = list()
    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:
    
        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)
    
        setIDs = dataset['dicts']['setIDs']
        
        # find the pair ID that we need to adapt
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairIDs.append(i)
                
        if len(pairIDs) == 0:
            pairIDs = None
    
    else:
        srcID = None
        tgtID = None
        pairIDs = None
    
    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID 
    opt.pairIDs = pairIDs
        
    
    dict_checkpoint = opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint, map_location=lambda storage, loc: storage)
        #~ dataset['dicts'] = checkpoint['dicts']
    else:
        checkpoint = None
    
    dicts = dataset['dicts']
    
    dataset['valid'] = torch.load(opt.data + "/valid.pt")
    valid_set = dataset['valid']
    
    dataset['train'] = dict()

    print("Done")
    
    nSets = dicts['nSets']
    setIDs = dicts['setIDs']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())
        
    # A wrapper to manage data loading
    trainLoader = onmt.MultiShardLoader(opt, dicts)

    trainSets = dict()
    validSets = dict()
    for i in range(nSets):
        validSets[i] = onmt.Dataset(valid_set['src'][i], valid_set['tgt'][i],
                                    opt.batch_size, opt.gpus)
        

    print('[INFO] * maximum batch size. %d' % opt.batch_size)

    print('[INFO] Building model...')
    
    model, generator = build_model(opt, dicts, nSets)
    
    if opt.train_from_state_dict:
        print('[INFO] Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
              
        model_state_dict = {k: v for k, v in checkpoint['model'].items() if 'critic' not in k}
        checkpoint['critic'] = {k: v for k, v in checkpoint['model'].items() if 'critic' in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(checkpoint['generator'])
        
    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()
    
    model.generator = generator
      

    if not opt.train_from_state_dict:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

    optim = onmt.Optim(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at
    )

    nParams = sum([p.nelement() for p in model.parameters()])
    print('[INFO] * number of parameters: %d' % nParams)
    
    
    
    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))
    
    if opt.reinforce:
        if opt.critic == 'self':
            trainer = SCSTTrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
        else:
            from onmt.ModelConstructor import build_critic
            from onmt.trainer.ActorCriticTrainer import A2CTrainer
            critic = build_critic(opt, dicts)
            
            model.critic = critic
            
            trainer = A2CTrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
    else:
        trainer = XETrainer(model, trainLoader, validSets, dataset, optim, evaluator, opt)
    
    trainer.run(checkpoint=checkpoint)
Example no. 11
def main():

    if opt.data_format == 'raw':
        start = time.time()

        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse )

        # For backward compatibility
        train_dict = defaultdict(lambda: None, dataset['train'])
        valid_dict = defaultdict(lambda: None, dataset['valid'])
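        # defaultdict(lambda: None) lets older datasets without src_atbs/tgt_atbs fall back to None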

        train_data = onmt.Dataset(train_dict['src'], train_dict['tgt'],
                                  train_dict['src_atbs'], train_dict['tgt_atbs'],
                                  batch_size_words=opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  augment=opt.augment_speech,
                                  upsampling=opt.upsampling)
        valid_data = onmt.Dataset(valid_dict['src'], valid_dict['tgt'],
                                  valid_dict['src_atbs'], valid_dict['tgt_atbs'],
                                  batch_size_words=opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  upsampling=opt.upsampling)

        dicts = dataset['dicts']

        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)

    elif opt.data_format == 'bin':
        print("Loading memory binned data files ....")
        start = time.time()
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

        train_data = onmt.Dataset(train_src,
                                  train_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')

        valid_data = onmt.Dataset(valid_src,
                                  valid_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents)

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)
    elif opt.data_format == 'mmem':
        print("Loading memory mapped data files ....")
        start = time.time()
        from onmt.data_utils.MMapIndexedDataset import MMapIndexedDataset
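        # memory-mapped data is paged in from disk on demand rather than held fully in RAM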

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = MMapIndexedDataset(train_path + '.src')
        train_tgt = MMapIndexedDataset(train_path + '.tgt')

        train_data = onmt.Dataset(train_src,
                                  train_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = MMapIndexedDataset(valid_path + '.src')
        valid_tgt = MMapIndexedDataset(valid_path + '.tgt')

        valid_data = onmt.Dataset(valid_src,
                                  valid_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents)
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

    else:
        raise NotImplementedError

    additional_data = []
    if opt.additional_data != "none":
        add_data = opt.additional_data.split(";")
        add_format = opt.additional_data_format.split(";")
        assert(len(add_data) == len(add_format))
        for i in range(len(add_data)):
            if add_format[i] == 'raw':
                if add_data[i].endswith(".train.pt"):
                    print("Loading data from '%s'" % opt.data)
                    add_dataset = torch.load(add_data[i])
                else:
                    print("Loading data from %s" % opt.data + ".train.pt")
                    add_dataset = torch.load(add_data[i] + ".train.pt")

                additional_data.append(onmt.Dataset(add_dataset['train']['src'],
                                          add_dataset['train']['tgt'], batch_size_words=opt.batch_size_words,
                                          data_type=dataset.get("type", "text"),
                                          batch_size_sents=opt.batch_size_sents,
                                          multiplier=opt.batch_size_multiplier,
                                          reshape_speech=opt.reshape_speech,
                                          augment=opt.augment_speech))
            elif add_format[i] == 'bin':

                from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

                train_path = add_data[i] + '.train'
                train_src = IndexedInMemoryDataset(train_path + '.src')
                train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

                additional_data.append(onmt.Dataset(train_src,
                                       train_tgt,
                                       batch_size_words=opt.batch_size_words,
                                       data_type=opt.encoder_type,
                                       batch_size_sents=opt.batch_size_sents,
                                       multiplier=opt.batch_size_multiplier))

    if opt.load_from:
        checkpoint = torch.load(opt.load_from, map_location=lambda storage, loc: storage)
        print("* Loading dictionaries from the checkpoint")
        dicts = checkpoint['dicts']
    else:
        dicts['tgt'].patch(opt.patch_vocab_multiplier)
        checkpoint = None

    if "src" in dicts:
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
    else:
        print(' * vocabulary size. target = %d' %
              (dicts['tgt'].size()))

    print('Building model...')

    if not opt.fusion:
        if opt.bert_scalar and opt.finetune_bert:
            print("WARNING: we only fine tune bert, we don't finetune scalar parameters, please set opt.bert_scalar False")

        print("Using scalared bert vector: ", opt.bert_scalar)
        print("Using Bert+Transformer to finetuning : ", opt.finetune_bert)

        model = build_model(opt, dicts)

        if not opt.finetune_bert:
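            # freeze all BERT weights so that only the rest of the network receives gradients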
            for param in model.bert.parameters():
                param.requires_grad = False

        if not opt.finetune_bert and opt.bert_scalar:
            scalar_mix = ScalarMix(
                onmt.Constants.BERT_LAYERS,
                do_layer_norm=False,
                initial_scalar_parameters=None,
                trainable=True,
            )
            model.add_module("scalar_mix", scalar_mix)
        print(model)

        """ Building the loss function """
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                              label_smoothing=opt.label_smoothing,
                                              ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)

        loss_function = FusionLoss(dicts['tgt'].size(), label_smoothing=opt.label_smoothing)

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of all parameters: %d' % n_params)

    n_params_grad = sum([p.nelement() for p in model.parameters() if p.requires_grad])
    print('* number of all parameters that need gradient: %d' % n_params_grad)

    n_params_nograd = sum([p.nelement() for p in model.parameters() if not p.requires_grad])
    print('* number of all parameters that do not need gradient: %d' % n_params_nograd)

    assert n_params == (n_params_grad + n_params_nograd)
    # print(model)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested and potential bugs can happen.")
    else:
        trainer = XETrainer(model, loss_function, train_data, valid_data, dicts, opt, setup_optimizer=True)
        if len(additional_data) > 0:
            trainer.add_additional_data(additional_data, opt.data_ratio)

    trainer.run(checkpoint=checkpoint)
Example no. 12
def main():

    if opt.data_format == 'raw':
        start = time.time()
        print("Loading data from '%s'" % opt.data)

        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        trainData = onmt.Dataset(dataset['train']['src'],
                                 dataset['train']['tgt'], opt.batch_size_words,
                                 data_type=dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier)
        validData = onmt.Dataset(dataset['valid']['src'],
                                 dataset['valid']['tgt'], opt.batch_size_words,
                                 data_type=dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents)

        dicts = dataset['dicts']
        if ("src" in dicts):
            print(' * vocabulary size. source = %d; target = %d' %
                  (dicts['src'].size(), dicts['tgt'].size()))
        else:
            print(' * vocabulary size. target = %d' %
                  (dicts['tgt'].size()))

        print(' * number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')

        trainData = onmt.Dataset(train_src,
                                 train_tgt, opt.batch_size_words,
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')

        validData = onmt.Dataset(valid_src,
                                 valid_tgt, opt.batch_size_words,
                                 batch_size_sents=opt.batch_size_sents)

    else:
        raise NotImplementedError

    print('Building model...')
    model = build_model(opt, dicts)
    autoencoder = Autoencoder(model, opt)

    """ Building the loss function """
    loss_function = nn.MSELoss(reduction='sum')  # size_average=False is deprecated; reduction='sum' is equivalent

    nParams = sum([p.nelement() for p in autoencoder.parameters()])
    print('* number of parameters: %d' % nParams)

    # load nmt model
    checkpoint = None
    if opt.load_from:
        checkpoint = torch.load(opt.load_from, map_location=lambda storage, loc: storage)
    else:
        raise NotImplementedError

    if checkpoint is not None:
        print('Loading model from checkpoint at %s' % opt.load_from)
        model.load_state_dict(checkpoint['model'])

        del checkpoint['model']
        del checkpoint['optim']
        del checkpoint

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested and potential bugs can happen.")
    else:
        trainer = AETrainer(autoencoder, model, loss_function, trainData, validData, dicts, opt)

    trainer.run(save_file=False)
Example no. 13
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.start_with_bos = opt.start_with_bos
        self.fp16 = opt.fp16

        self.models = list()
        self.model_types = list()

        # opt.model is a '|'-delimited string of checkpoint paths
        models = opt.model.split("|")

        print(models)
        self.n_models = len(models)
        self._type = 'text'

        check_m = None

        for i, model in enumerate(models):
            if opt.verbose:
                print('Loading model from %s' % model)
            checkpoint = torch.load(model,
                                    map_location=lambda storage, loc: storage)

            model_opt = checkpoint['opt']

            if i == 0:
                if ("src" in checkpoint['dicts']):
                    self.src_dict = checkpoint['dicts']['src']
                else:
                    self._type = "audio"
                self.tgt_dict = checkpoint['dicts']['tgt']

            # Build model from the saved option
            model = build_model(model_opt, checkpoint['dicts'])

            model.load_state_dict(checkpoint['model'])

            check_m = checkpoint['model']

            if opt.cuda:
                model = model.cuda()
            else:
                model = model.cpu()

            if opt.fp16:
                model = model.half()

            model.eval()

            self.models.append(model)
            self.model_types.append(model_opt.model)

        self.cuda = opt.cuda

        ## Autoencoder

        if opt.verbose:
            print('Loading autoencoder from %s' % opt.autoencoder)
        checkpoint = torch.load(opt.autoencoder,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

        posSize = checkpoint['autoencoder'][
            'nmt.decoder.positional_encoder.pos_emb'].size(0)
        self.models[0].decoder.renew_buffer(posSize)

        # Build model from the saved option
        self.autoencoder = Autoencoder(self.models[0], model_opt)

        self.autoencoder.load_state_dict(checkpoint['autoencoder'])

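        # sanity check: the NMT weights embedded in the autoencoder should match the model loaded above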
        for k in checkpoint['autoencoder']:
            if k.startswith("nmt") and k[4:] in check_m:
                n = checkpoint['autoencoder'][k]
                o = check_m[k[4:]]
                if o.size() != n.size():
                    print("Different size:", k[4:])
                elif (n - o).sum() != 0:
                    print("Different weight:", k[4:])

        if self.autoencoder.nmt.decoder.positional_encoder.len_max < self.opt.max_sent_length:
            self.autoencoder.nmt.decoder.renew_buffer(self.opt.max_sent_length)

        if opt.cuda:
            self.autoencoder = self.autoencoder.cuda()
        else:
            self.autoencoder = self.autoencoder.cpu()

        if opt.fp16:
            self.autoencoder = self.autoencoder.half()

        self.autoencoder.eval()

        if opt.verbose:
            print('Done')
Example no. 14
def main():

    opt = parser.parse_args()

    opt.cuda = opt.gpu > -1

    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # opt.models should be a '|'-delimited string of checkpoint paths

    models = opt.models.split("|")
    # print(models)
    n_models = len(models)

    print("Loading main model from %s ..." % models[0])
    checkpoint = torch.load(models[0],
                            map_location=lambda storage, loc: storage)

    if 'optim' in checkpoint:
        del checkpoint['optim']

    main_checkpoint = checkpoint
    model_opt = checkpoint['opt']
    dicts = checkpoint['dicts']

    main_model = build_model(model_opt, checkpoint['dicts'])

    main_model.load_state_dict(checkpoint['model'])

    if opt.cuda:
        main_model = main_model.cuda()

    for i in range(1, len(models)):

        model = models[i]
        print("Loading model from %s ..." % models[i])
        checkpoint = torch.load(model,
                                map_location=lambda storage, loc: storage)

        model_opt = checkpoint['opt']

        # delete optim information to save GPU memory
        if 'optim' in checkpoint:
            del checkpoint['optim']

        current_model = build_model(model_opt, checkpoint['dicts'])

        current_model.load_state_dict(checkpoint['model'])

        if opt.cuda:
            current_model = current_model.cuda()

        if opt.method == 'mean':
            # Sum the parameter values
            for (main_param, param) in zip(main_model.parameters(),
                                           current_model.parameters()):
                main_param.data.add_(param.data)
        elif opt.method == 'gmean':
            # Multiply the parameter values; the 1/n power below completes the geometric mean
            for (main_param, param) in zip(main_model.parameters(),
                                           current_model.parameters()):
                main_param.data.mul_(param.data)
        else:
            raise NotImplementedError

    # Normalizing
    if opt.method == 'mean':
        for main_param in main_model.parameters():
            main_param.data.div_(n_models)
    elif opt.method == 'gmean':
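        # caveat: pow_(1/n) of a negative product gives NaN, so gmean assumes same-sign parameters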
        for main_param in main_model.parameters():
            main_param.data.pow_(1. / n_models)

    # Saving
    model_state_dict = main_model.state_dict()

    save_checkpoint = {
        'model': model_state_dict,
        'dicts': dicts,
        'opt': model_opt,
        'epoch': -1,
        'iteration': -1,
        'batchOrder': None,
        'optim': None
    }

    print("Saving averaged model to %s" % opt.output)

    torch.save(save_checkpoint, opt.output)
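
For reference, the two averaging methods above ('mean' and 'gmean') applied to plain tensors, as a minimal sketch with illustrative values:

import torch

params = [torch.tensor([1.0, 4.0]), torch.tensor([3.0, 16.0])]
mean = sum(params) / len(params)                        # tensor([ 2., 10.])
gmean = (params[0] * params[1]).pow(1.0 / len(params))  # tensor([1.7321, 8.0000])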
Example no. 15
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None
        self.beta = opt.beta
        self.alpha = opt.alpha
        self.start_with_bos = opt.start_with_bos
        self.fp16 = opt.fp16
        self.stop_early = True
        self.normalize_scores = opt.normalize
        self.len_penalty = opt.alpha

        self.models = list()
        self.model_types = list()

        # opt.model is a '|'-delimited string of checkpoint paths
        models = opt.model.split("|")

        print(models)
        self.n_models = len(models)
        self._type = 'text'

        for i, model in enumerate(models):
            if opt.verbose:
                print('Loading model from %s' % model)
            checkpoint = torch.load(model,
                                    map_location=lambda storage, loc: storage)

            model_opt = checkpoint['opt']

            if i == 0:
                self.src_dict = checkpoint['dicts']['src']
                self.tgt_dict = checkpoint['dicts']['tgt']

            # Build model from the saved option
            model = build_model(model_opt, checkpoint['dicts'])

            model.load_state_dict(checkpoint['model'])

            if model_opt.model in model_list:
                if model.decoder.positional_encoder.len_max < self.opt.max_sent_length + 1:
                    print("Not enough len to decode. Renewing .. ")
                    model.decoder.renew_buffer(self.opt.max_sent_length + 1)

            if opt.fp16:
                model = model.half()

            if opt.cuda:
                model = model.cuda()
            else:
                model = model.cpu()

            model.eval()

            self.models.append(model)
            self.model_types.append(model_opt.model)

        self.cuda = opt.cuda
        self.ensemble_op = opt.ensemble_op
        # with a single beam group this reduces to standard BeamSearch
        self.search = DiverseBeamSearch(self.tgt_dict, 1,
                                        self.opt.diverse_beam_strength)
        self.eos = onmt.Constants.EOS
        self.pad = onmt.Constants.PAD
        self.bos = onmt.Constants.BOS
        self.unk = onmt.Constants.UNK
        self.vocab_size = self.tgt_dict.size()
        self.minlen = 1

        if opt.verbose:
            print('Done')
Example no. 16
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None
        self._type = "text"
        self.ensemble_op = opt.ensemble_op

        # opt.model should be a string of models, split by |

        models = opt.model.split("|")
        print(models)
        self.n_models = len(models)

        # only one src and target language

        self.models = list()
        self.logSoftMax = torch.nn.LogSoftmax(dim=-1)
        nSets = 0

        for i, model in enumerate(models):
            if opt.verbose:
                print('Loading model from %s' % model)
            checkpoint = torch.load(model,
                                    map_location=lambda storage, loc: storage)

            if opt.verbose:
                print('Done')

            model_opt = update_opt(checkpoint['opt'])

            # delete optim information to save GPU memory
            if 'optim' in checkpoint:
                del checkpoint['optim']

            # assuming that all these models use the same dict
            # the first checkpoint's dict will be loaded
            if i == 0:
                self.dicts = checkpoint['dicts']
                self.src_dict = self.dicts['vocabs'][opt.src_lang]
                self.tgt_dict = self.dicts['vocabs'][opt.tgt_lang]
                nSets = self.dicts['nSets']

            # Build the model

            this_model, generator = build_model(model_opt, self.dicts, nSets)

            model_state_dict = {
                k: v
                for k, v in checkpoint['model'].items() if 'critic' not in k
            }
            this_model.load_state_dict(model_state_dict)
            generator.load_state_dict(checkpoint['generator'])

            if opt.cuda:
                this_model.cuda()
                generator.cuda()
            else:
                this_model.cpu()
                generator.cpu()

            this_model.generator = generator

            this_model.eval()

            # Need to find the src and tgt id
            srcID = self.dicts['srcLangs'].index(opt.src_lang)
            tgtID = self.dicts['tgtLangs'].index(opt.tgt_lang)

            # After that, look for the pairID

            setIDs = self.dicts['setIDs']

            pair = -1
            for j, sid in enumerate(setIDs):
                if sid[0] == srcID and sid[1] == tgtID:
                    pair = j
                    break

            assert pair >= 0, "Cannot find any language pair with your provided src and tgt id"
            print(" * Translating with pair %i " % pair)
            this_model.hardSwitchLangID(srcID, tgtID)
            this_model.switchPairID(pair)

            self.models.append(this_model)