def __init__(self, opt):
    """Load an ensemble of NMT checkpoints ('|'-separated in opt.model) for translation."""
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch
    self.beam_accum = None
    self.beta = opt.beta
    self.alpha = opt.alpha
    self.start_with_bos = opt.start_with_bos
    self.fp16 = opt.fp16

    self.models = list()
    self.model_types = list()

    # models are string with | as delimiter
    models = opt.model.split("|")
    print(models)
    self.n_models = len(models)
    self._type = 'text'

    for i, model_path in enumerate(models):
        if opt.verbose:
            print('Loading model from %s' % model_path)
        # map_location keeps tensors on CPU regardless of where they were saved
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

        # dictionaries are taken from the first checkpoint only
        if i == 0:
            self.src_dict = checkpoint['dicts']['src']
            self.tgt_dict = checkpoint['dicts']['tgt']

        # Build model from the saved option
        model = build_model(model_opt, checkpoint['dicts'])
        model.load_state_dict(checkpoint['model'])

        if model_opt.model in model_list:
            # grow the positional-encoding buffer when decoding longer sentences
            if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
                print("Not enough len to decode. Renewing .. ")
                model.decoder.renew_buffer(self.opt.max_sent_length)

        if opt.fp16:
            model = model.half()
        model = model.cuda() if opt.cuda else model.cpu()

        model.eval()
        self.models.append(model)
        self.model_types.append(model_opt.model)

    self.cuda = opt.cuda
    self.ensemble_op = opt.ensemble_op

    if opt.verbose:
        print('Done')
def custom_build_model(opt, dict, lm=False):
    """Build and return a language model when `lm` is True, else a seq2seq model."""
    builder = build_language_model if lm else build_model
    return builder(opt, dict)
def custom_build_model(opt, dict, lm=False):
    """Build a seq2seq model with a trainable BERT ScalarMix attached, or a language model."""
    if lm:
        return build_language_model(opt, dict)

    model = build_model(opt, dict)
    # by me: learn scalar weights to mix the BERT layer outputs
    scalar_mix = ScalarMix(
        onmt.Constants.BERT_LAYERS,
        do_layer_norm=False,
        initial_scalar_parameters=None,
        trainable=True,
    )
    model.add_module("scalar_mix", scalar_mix)
    return model
def __init__(self, opt):
    """Load a single NMT checkpoint (opt.model) and prepare it for translation."""
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch
    self.beam_accum = None
    self.beta = opt.beta
    self.alpha = opt.alpha
    self.start_with_bos = opt.start_with_bos

    if opt.verbose:
        print('Loading model from %s' % opt.model)
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)

    model_opt = checkpoint['opt']
    self.src_dict = checkpoint['dicts']['src']
    self.tgt_dict = checkpoint['dicts']['tgt']
    # older checkpoints may lack encoder_type; default to text
    self._type = model_opt.encoder_type \
        if "encoder_type" in model_opt else "text"

    # Build model from the saved option
    model = build_model(model_opt, checkpoint['dicts'])
    model.load_state_dict(checkpoint['model'])
    model.eval()

    if model_opt.model == 'transformer':
        # grow positional-encoding buffer if sentences can exceed it
        if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
            print("Not enough len to decode. Renewing .. ")
            model.decoder.renew_buffer(self.opt.max_sent_length)

    if opt.cuda:
        model.cuda()
    else:
        model.cpu()

    self.cuda = opt.cuda
    self.model_type = model_opt.model
    self.model = model
    self.model.eval()  # NOTE(review): eval() already called above; repeat is harmless

    if opt.verbose:
        print('Done')
def main():
    """Training entry point: load data (raw or binned), build model + loss, run trainer."""
    if opt.data_format == 'raw':
        start = time.time()
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        train_data = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  reshape_speech=opt.reshape_speech,
                                  augment=opt.augment_speech)
        valid_data = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  reshape_speech=opt.reshape_speech)

        dicts = dataset['dicts']
        # speech datasets have no source dictionary
        if "src" in dicts:
            print(' * vocabulary size. source = %d; target = %d' %
                  (dicts['src'].size(), dicts['tgt'].size()))
        else:
            print(' * vocabulary size. target = %d' % (dicts['tgt'].size()))
        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
        train_data = onmt.Dataset(train_src, train_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')
        valid_data = onmt.Dataset(valid_src, valid_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents)
    else:
        raise NotImplementedError

    print('Building model...')

    if not opt.fusion:
        model = build_model(opt, dicts)

        # Building the loss function
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                              label_smoothing=opt.label_smoothing,
                                              ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)
        loss_function = FusionLoss(dicts['tgt'].size(),
                                   label_smoothing=opt.label_smoothing)

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested "
                                  "and potential bugs can happen.")
    else:
        if opt.fp16:
            trainer = FP16XETrainer(model, loss_function, train_data, valid_data, dicts, opt)
        else:
            trainer = XETrainer(model, loss_function, train_data, valid_data, dicts, opt)

    trainer.run(save_file=opt.load_from)
def main():
    """Training entry point with optional extra corpora (opt.additional_data)."""
    if opt.data_format == 'raw':
        start = time.time()
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)  # This requires a lot of cpu memory!
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        train_data = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  reshape_speech=opt.reshape_speech,
                                  augment=opt.augment_speech)
        valid_data = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                                  opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  reshape_speech=opt.reshape_speech)

        dicts = dataset['dicts']
        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
        train_data = onmt.Dataset(train_src, train_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')
        valid_data = onmt.Dataset(valid_src, valid_tgt, opt.batch_size_words,
                                  data_type=opt.encoder_type,
                                  batch_size_sents=opt.batch_size_sents)
    else:
        raise NotImplementedError

    additional_data = []
    if (opt.additional_data != "none"):
        add_data = opt.additional_data.split(";")
        add_format = opt.additional_data_format.split(";")
        assert (len(add_data) == len(add_format))

        for i in range(len(add_data)):
            if add_format[i] == 'raw':
                if add_data[i].endswith(".train.pt"):
                    print("Loading data from '%s'" % add_data[i])
                    add_dataset = torch.load(add_data[i])
                else:
                    print("Loading data from %s" % add_data[i] + ".train.pt")
                    add_dataset = torch.load(add_data[i] + ".train.pt")

                additional_data.append(
                    onmt.Dataset(add_dataset['train']['src'],
                                 add_dataset['train']['tgt'],
                                 opt.batch_size_words,
                                 data_type=add_dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier,
                                 reshape_speech=opt.reshape_speech,
                                 augment=opt.augment_speech))

                # merge the extra dictionaries; sizes must agree with the main ones
                add_dicts = add_dataset['dicts']
                for d in ['src', 'tgt']:
                    if (d in dicts):
                        if (d in add_dicts):
                            assert (dicts[d].size() == add_dicts[d].size())
                    else:
                        if (d in add_dicts):
                            dicts[d] = add_dicts[d]
            elif add_format[i] == 'bin':
                from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

                train_path = add_data[i] + '.train'
                train_src = IndexedInMemoryDataset(train_path + '.src')
                train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
                additional_data.append(
                    onmt.Dataset(train_src, train_tgt,
                                 opt.batch_size_words,
                                 data_type=opt.encoder_type,
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier))

    # Restore from checkpoint
    if opt.load_from:
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
        print("* Loading dictionaries from the checkpoint")
        dicts = checkpoint['dicts']
    else:
        dicts['tgt'].patch(opt.patch_vocab_multiplier)
        checkpoint = None

    if "src" in dicts:
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
    else:
        print(' * vocabulary size. target = %d' % (dicts['tgt'].size()))

    print('Building model...')

    if not opt.fusion:
        model = build_model(opt, dicts)

        # Building the loss function
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(
                dicts['tgt'].size(),
                label_smoothing=opt.label_smoothing,
                ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)
        loss_function = FusionLoss(dicts['tgt'].size(),
                                   label_smoothing=opt.label_smoothing)

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError(
            "Warning! Multi-GPU training is not fully tested and potential bugs can happen."
        )
    else:
        # if opt.fp16:
        #     trainer = FP16XETrainer(model, loss_function, train_data, valid_data, dicts, opt)
        # else:
        trainer = XETrainer(model, loss_function, train_data, valid_data, dicts, opt)
        if (len(additional_data) > 0):
            trainer.add_additional_data(additional_data, opt.data_ratio)

    trainer.run(checkpoint=checkpoint)
def __init__(self, opt):
    """Load an ensemble of NMT models plus optional language model and autoencoder."""
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch
    self.beam_accum = None
    self.beta = opt.beta
    self.alpha = opt.alpha
    self.start_with_bos = opt.start_with_bos
    self.fp16 = opt.fp16
    self.attributes = opt.attributes  # attributes split by |. for example: de|domain1
    self.bos_token = opt.bos_token
    self.sampling = opt.sampling

    if self.attributes:
        self.attributes = self.attributes.split("|")

    self.models = list()
    self.model_types = list()

    # models are string with | as delimiter
    models = opt.model.split("|")
    print(models)
    self.n_models = len(models)
    self._type = 'text'

    for i, model_path in enumerate(models):
        if opt.verbose:
            print('Loading model from %s' % model_path)
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

        if i == 0:
            # speech checkpoints carry no source dictionary
            if "src" in checkpoint['dicts']:
                self.src_dict = checkpoint['dicts']['src']
            else:
                self._type = "audio"
            self.tgt_dict = checkpoint['dicts']['tgt']

            if "atb" in checkpoint["dicts"]:
                self.atb_dict = checkpoint['dicts']['atb']
            else:
                self.atb_dict = None

            self.bos_id = self.tgt_dict.labelToIdx[self.bos_token]

        # Build model from the saved option
        # if hasattr(model_opt, 'fusion') and model_opt.fusion == True:
        #     print("* Loading a FUSION model")
        #     model = build_fusion(model_opt, checkpoint['dicts'])
        # else:
        #     model = build_model(model_opt, checkpoint['dicts'])
        model = build_model(model_opt, checkpoint['dicts'])
        model.load_state_dict(checkpoint['model'])

        if model_opt.model in model_list:
            # if model.decoder.positional_encoder.len_max < self.opt.max_sent_length:
            #     print("Not enough len to decode. Renewing .. ")
            #     model.decoder.renew_buffer(self.opt.max_sent_length)
            model.renew_buffer(self.opt.max_sent_length)

        if opt.fp16:
            model = model.half()
        model = model.cuda() if opt.cuda else model.cpu()

        model.eval()
        self.models.append(model)
        self.model_types.append(model_opt.model)

    # language model
    if opt.lm is not None:
        if opt.verbose:
            print('Loading language model from %s' % opt.lm)

        lm_chkpoint = torch.load(opt.lm, map_location=lambda storage, loc: storage)
        lm_opt = lm_chkpoint['opt']
        # NOTE(review): uses the dicts of the LAST loaded NMT checkpoint, not
        # lm_chkpoint['dicts'] — presumably the vocabularies are shared; verify.
        lm_model = build_language_model(lm_opt, checkpoint['dicts'])

        if opt.fp16:
            lm_model = lm_model.half()
        lm_model = lm_model.cuda() if opt.cuda else lm_model.cpu()

        self.lm_model = lm_model

    self.cuda = opt.cuda
    self.ensemble_op = opt.ensemble_op

    if opt.autoencoder is not None:
        if opt.verbose:
            print('Loading autoencoder from %s' % opt.autoencoder)
        checkpoint = torch.load(opt.autoencoder,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

        # posSize = checkpoint['autoencoder']['nmt.decoder.positional_encoder.pos_emb'].size(0)
        # self.models[0].decoder.renew_buffer(posSize)
        # self.models[0].decoder.renew_buffer(posSize)

        # Build model from the saved option; the autoencoder wraps the first NMT model
        self.autoencoder = Autoencoder(self.models[0], model_opt)
        self.autoencoder.load_state_dict(checkpoint['autoencoder'])

        if opt.cuda:
            self.autoencoder = self.autoencoder.cuda()
            self.models[0] = self.models[0].cuda()
        else:
            self.autoencoder = self.autoencoder.cpu()
            self.models[0] = self.models[0].cpu()

        self.models[0].autoencoder = self.autoencoder

    if opt.verbose:
        print('Done')
def main():
    """Training entry point (pad_count / sort_by_target dataset variant)."""
    start = time.time()
    print("Loading data from '%s'" % opt.data)

    if opt.data_format == 'raw':
        dataset = torch.load(opt.data)
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                                 opt.batch_size_words, opt.gpus,
                                 max_seq_num=opt.batch_size_sents,
                                 pad_count=opt.pad_count,
                                 multiplier=opt.batch_size_multiplier,
                                 sort_by_target=opt.sort_by_target)
        validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                                 opt.batch_size_words, opt.gpus,
                                 max_seq_num=opt.batch_size_sents)

        dicts = dataset['dicts']
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
        trainData = onmt.Dataset(train_src, train_tgt,
                                 opt.batch_size_words, opt.gpus,
                                 max_seq_num=opt.batch_size_sents,
                                 pad_count=opt.pad_count,
                                 multiplier=opt.batch_size_multiplier,
                                 sort_by_target=opt.sort_by_target)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')
        validData = onmt.Dataset(valid_src, valid_tgt,
                                 opt.batch_size_words, opt.gpus,
                                 max_seq_num=opt.batch_size_sents)
    else:
        raise NotImplementedError

    print('Building model...')
    model = build_model(opt, dicts)

    # Building the loss function
    loss_function = NMTLossFunc(dicts['tgt'].size(),
                                label_smoothing=opt.label_smoothing)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    optim = None

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        # trainer = MultiGPUXETrainer(model, loss_function, trainData, validData, dataset, opt)
        raise NotImplementedError(
            "Warning! Multi-GPU training is not fully tested and potential bugs can happen."
        )
    else:
        if opt.fp16:
            trainer = FP16XETrainer(model, loss_function, trainData, validData, dicts, opt)
        else:
            trainer = XETrainer(model, loss_function, trainData, validData, dicts, opt)

    trainer.run(save_file=opt.load_from)
def main():
    """Training entry point (raw-only variant with sharded NMT loss)."""
    start = time.time()
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
    print("Done after %s" % elapse)

    # dict_checkpoint = opt.load_from
    # if dict_checkpoint:
    #     print('Loading dicts from checkpoint at %s' % dict_checkpoint)
    #     checkpoint = torch.load(dict_checkpoint, map_location=lambda storage, loc: storage)
    #     dataset['dicts'] = checkpoint['dicts']
    # else:
    #     checkpoint = None

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size_words, opt.gpus,
                             data_type=dataset.get("type", "text"),
                             max_seq_num=opt.batch_size_sents)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size_words, opt.gpus, volatile=True,
                             data_type=dataset.get("type", "text"),
                             max_seq_num=opt.batch_size_sents)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)

    print('Building model...')
    model = build_model(opt, dicts)

    # Building the loss function
    loss_function = NMTLossFunc(dataset['dicts']['tgt'].size(),
                                label_smoothing=opt.label_smoothing,
                                shard_size=opt.max_generator_batches)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    optim = None

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        trainer = MultiGPUXETrainer(model, loss_function, trainData, validData, dataset, opt)
        print(
            "Warning! Multi-GPU training is used. Not fully tested and potential bugs can happen."
        )
    else:
        trainer = XETrainer(model, loss_function, trainData, validData, dataset, opt)

    trainer.run(save_file=opt.load_from)
def main():
    """Multilingual training entry point.

    Loads the shared dictionaries and validation shards, optionally restricts
    training to one (adapt_src, adapt_tgt) language pair, builds the model,
    generator and optimizer, and dispatches to the XE / SCST / actor-critic
    trainer according to opt.reinforce / opt.critic.
    """
    dataset = dict()
    print("Loading dicts from '%s'" % opt.data + "/dicts_info.pt")
    dataset['dicts'] = torch.load(opt.data + "/dicts_info.pt")

    pairIDs = list()
    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:
        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)
        setIDs = dataset['dicts']['setIDs']
        # find the pair ID that we need to adapt
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairIDs.append(i)
        if len(pairIDs) == 0:
            pairIDs = None
    else:
        srcID = None
        tgtID = None
        pairIDs = None

    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID
    opt.pairIDs = pairIDs

    dict_checkpoint = opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
    else:
        checkpoint = None

    dicts = dataset['dicts']
    dataset['valid'] = torch.load(opt.data + "/valid.pt")
    valid_set = dataset['valid']

    # training shards are streamed by MultiShardLoader below, not preloaded
    dataset['train'] = dict()
    print("Done")

    nSets = dicts['nSets']
    setIDs = dicts['setIDs']

    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    # A wrapper to manage data loading
    trainLoader = onmt.MultiShardLoader(opt, dicts)

    trainSets = dict()
    validSets = dict()
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); `range`
    # behaves identically here.
    for i in range(nSets):
        validSets[i] = onmt.Dataset(valid_set['src'][i], valid_set['tgt'][i],
                                    opt.batch_size, opt.gpus)

    print('[INFO] * maximum batch size. %d' % opt.batch_size)
    print('[INFO] Building model...')
    model, generator = build_model(opt, dicts, nSets)

    if opt.train_from_state_dict:
        print('[INFO] Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
        # split critic weights out of the saved model state
        model_state_dict = {k: v for k, v in checkpoint['model'].items()
                            if 'critic' not in k}
        checkpoint['critic'] = {k: v for k, v in checkpoint['model'].items()
                                if 'critic' in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(checkpoint['generator'])

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    model.generator = generator

    # fresh models get uniform init; resumed models keep checkpoint weights
    if not opt.train_from_state_dict:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

    optim = onmt.Optim(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at
    )

    nParams = sum([p.nelement() for p in model.parameters()])
    print('[INFO] * number of parameters: %d' % nParams)

    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))

    if opt.reinforce:
        if opt.critic == 'self':
            trainer = SCSTTrainer(model, trainLoader, validSets, dataset,
                                  optim, evaluator, opt)
        else:
            from onmt.ModelConstructor import build_critic
            from onmt.trainer.ActorCriticTrainer import A2CTrainer
            critic = build_critic(opt, dicts)
            model.critic = critic
            trainer = A2CTrainer(model, trainLoader, validSets, dataset,
                                 optim, evaluator, opt)
    else:
        trainer = XETrainer(model, trainLoader, validSets, dataset,
                            optim, evaluator, opt)

    trainer.run(checkpoint=checkpoint)
def main():
    """Training entry point (BERT-encoder variant).

    Supports raw / bin / mmem data formats plus extra corpora via
    opt.additional_data, optional BERT fine-tuning and scalar layer mixing,
    then runs the cross-entropy trainer.
    """
    if opt.data_format == 'raw':
        start = time.time()
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        # For backward compatibility: old datasets lack the *_atbs keys
        train_dict = defaultdict(lambda: None, dataset['train'])
        valid_dict = defaultdict(lambda: None, dataset['valid'])

        train_data = onmt.Dataset(train_dict['src'], train_dict['tgt'],
                                  train_dict['src_atbs'], train_dict['tgt_atbs'],
                                  batch_size_words=opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier,
                                  augment=opt.augment_speech,
                                  upsampling=opt.upsampling)
        valid_data = onmt.Dataset(valid_dict['src'], valid_dict['tgt'],
                                  valid_dict['src_atbs'], valid_dict['tgt_atbs'],
                                  batch_size_words=opt.batch_size_words,
                                  data_type=dataset.get("type", "text"),
                                  batch_size_sents=opt.batch_size_sents,
                                  upsampling=opt.upsampling)

        dicts = dataset['dicts']
        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        print("Loading memory binned data files ....")
        start = time.time()
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
        train_data = onmt.Dataset(train_src, train_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')
        valid_data = onmt.Dataset(valid_src, valid_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents)

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)
    elif opt.data_format == 'mmem':
        print("Loading memory mapped data files ....")
        start = time.time()
        from onmt.data_utils.MMapIndexedDataset import MMapIndexedDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = MMapIndexedDataset(train_path + '.src')
        train_tgt = MMapIndexedDataset(train_path + '.tgt')
        train_data = onmt.Dataset(train_src, train_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents,
                                  multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = MMapIndexedDataset(valid_path + '.src')
        valid_tgt = MMapIndexedDataset(valid_path + '.tgt')
        valid_data = onmt.Dataset(valid_src, valid_tgt,
                                  batch_size_words=opt.batch_size_words,
                                  data_type="text",
                                  batch_size_sents=opt.batch_size_sents)

        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)
    else:
        raise NotImplementedError

    additional_data = []
    if opt.additional_data != "none":
        add_data = opt.additional_data.split(";")
        add_format = opt.additional_data_format.split(";")
        assert(len(add_data) == len(add_format))

        for i in range(len(add_data)):
            if add_format[i] == 'raw':
                # BUG FIX: report the file actually being loaded (was opt.data)
                if add_data[i].endswith(".train.pt"):
                    print("Loading data from '%s'" % add_data[i])
                    add_dataset = torch.load(add_data[i])
                else:
                    print("Loading data from %s" % add_data[i] + ".train.pt")
                    add_dataset = torch.load(add_data[i] + ".train.pt")

                # BUG FIX: the target side must come from the additional
                # dataset, not the main `dataset` (which is undefined for the
                # bin/mmem code paths and would mispair src/tgt anyway).
                additional_data.append(
                    onmt.Dataset(add_dataset['train']['src'],
                                 add_dataset['train']['tgt'],
                                 batch_size_words=opt.batch_size_words,
                                 data_type=add_dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier,
                                 reshape_speech=opt.reshape_speech,
                                 augment=opt.augment_speech))
            elif add_format[i] == 'bin':
                from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

                train_path = add_data[i] + '.train'
                train_src = IndexedInMemoryDataset(train_path + '.src')
                train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
                additional_data.append(
                    onmt.Dataset(train_src, train_tgt,
                                 batch_size_words=opt.batch_size_words,
                                 data_type=opt.encoder_type,
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier))

    if opt.load_from:
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
        print("* Loading dictionaries from the checkpoint")
        dicts = checkpoint['dicts']
    else:
        dicts['tgt'].patch(opt.patch_vocab_multiplier)
        checkpoint = None

    if "src" in dicts:
        print(' * vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
    else:
        print(' * vocabulary size. target = %d' % (dicts['tgt'].size()))

    print('Building model...')

    if not opt.fusion:
        if opt.bert_scalar and opt.finetune_bert:
            print("WARNING: we only fine tune bert, we don't finetune scalar parameters, please set opt.bert_scalar False")
        print("Using scalared bert vector: ", opt.bert_scalar)
        print("Using Bert+Transformer to finetuning : ", opt.finetune_bert)

        model = build_model(opt, dicts)

        # freeze BERT weights unless we are fine-tuning them
        if not opt.finetune_bert:
            for param in model.bert.parameters():
                param.requires_grad = False

        if not opt.finetune_bert and opt.bert_scalar:
            scalar_mix = ScalarMix(
                onmt.Constants.BERT_LAYERS,
                do_layer_norm=False,
                initial_scalar_parameters=None,
                trainable=True,
            )
            model.add_module("scalar_mix", scalar_mix)

        print(model)
        # for name, param in model.bert_model.named_parameters():
        #     print(name, param, param.requires_grad)
        # the params in bert_model which require gradient:
        # if param.requires_grad:
        #     print(name)

        # Building the loss function
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                              label_smoothing=opt.label_smoothing,
                                              ctc_weight=opt.ctc_loss)
        else:
            loss_function = NMTLossFunc(dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing)
    else:
        from onmt.ModelConstructor import build_fusion
        from onmt.modules.Loss import FusionLoss

        model = build_fusion(opt, dicts)
        loss_function = FusionLoss(dicts['tgt'].size(),
                                   label_smoothing=opt.label_smoothing)

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of all parameters: %d' % n_params)
    n_params_grad = sum([p.nelement() for p in model.parameters()
                         if p.requires_grad == True])
    print('* number of all parameters that need gradient: %d' % n_params_grad)
    n_params_nograd = sum([p.nelement() for p in model.parameters()
                           if p.requires_grad == False])
    print('* number of all parameters that do not need gradient: %d' % n_params_nograd)
    assert n_params == (n_params_grad + n_params_nograd)
    # print(model)

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested "
                                  "and potential bugs can happen.")
    else:
        trainer = XETrainer(model, loss_function, train_data, valid_data,
                            dicts, opt, setup_optimizer=True)
        if len(additional_data) > 0:
            trainer.add_additional_data(additional_data, opt.data_ratio)

    trainer.run(checkpoint=checkpoint)
def main():
    """Autoencoder training entry point: requires an existing NMT checkpoint (opt.load_from)."""
    if opt.data_format == 'raw':
        start = time.time()
        print("Loading data from '%s'" % opt.data)
        if opt.data.endswith(".train.pt"):
            print("Loading data from '%s'" % opt.data)
            dataset = torch.load(opt.data)
        else:
            print("Loading data from %s" % opt.data + ".train.pt")
            dataset = torch.load(opt.data + ".train.pt")
        elapse = str(datetime.timedelta(seconds=int(time.time() - start)))
        print("Done after %s" % elapse)

        trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                                 opt.batch_size_words,
                                 data_type=dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier)
        validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                                 opt.batch_size_words,
                                 data_type=dataset.get("type", "text"),
                                 batch_size_sents=opt.batch_size_sents)

        dicts = dataset['dicts']
        if ("src" in dicts):
            print(' * vocabulary size. source = %d; target = %d' %
                  (dicts['src'].size(), dicts['tgt'].size()))
        else:
            print(' * vocabulary size. target = %d' % (dicts['tgt'].size()))
        print(' * number of training sentences. %d' % len(dataset['train']['src']))
        print(' * maximum batch size (words per batch). %d' % opt.batch_size_words)
    elif opt.data_format == 'bin':
        from onmt.data_utils.IndexedDataset import IndexedInMemoryDataset

        dicts = torch.load(opt.data + ".dict.pt")

        train_path = opt.data + '.train'
        train_src = IndexedInMemoryDataset(train_path + '.src')
        train_tgt = IndexedInMemoryDataset(train_path + '.tgt')
        trainData = onmt.Dataset(train_src, train_tgt, opt.batch_size_words,
                                 batch_size_sents=opt.batch_size_sents,
                                 multiplier=opt.batch_size_multiplier)

        valid_path = opt.data + '.valid'
        valid_src = IndexedInMemoryDataset(valid_path + '.src')
        valid_tgt = IndexedInMemoryDataset(valid_path + '.tgt')
        validData = onmt.Dataset(valid_src, valid_tgt, opt.batch_size_words,
                                 batch_size_sents=opt.batch_size_sents)
    else:
        raise NotImplementedError

    print('Building model...')
    model = build_model(opt, dicts)
    autoencoder = Autoencoder(model, opt)

    # Building the loss function
    # NOTE(review): size_average=False is deprecated in newer torch
    # (equivalent to reduction='sum'); left as-is for compatibility — verify
    # against the installed torch version.
    loss_function = nn.MSELoss(size_average=False)

    nParams = sum([p.nelement() for p in autoencoder.parameters()])
    print('* number of parameters: %d' % nParams)

    # load nmt model — the autoencoder trains on top of a fixed NMT checkpoint
    checkpoint = None
    if opt.load_from:
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
    else:
        raise NotImplementedError

    if checkpoint is not None:
        print('Loading model from checkpoint at %s' % opt.load_from)
        model.load_state_dict(checkpoint['model'])
        # free the checkpoint memory once the weights are loaded
        del checkpoint['model']
        del checkpoint['optim']
        del checkpoint

    if len(opt.gpus) > 1 or opt.virtual_gpu > 1:
        # trainer = MultiGPUXETrainer(model, loss_function, trainData, validData, dataset, opt)
        raise NotImplementedError("Warning! Multi-GPU training is not fully tested "
                                  "and potential bugs can happen.")
    else:
        trainer = AETrainer(autoencoder, model, loss_function,
                            trainData, validData, dicts, opt)

    trainer.run(save_file=False)
def __init__(self, opt): self.opt = opt self.tt = torch.cuda if opt.cuda else torch self.start_with_bos = opt.start_with_bos self.fp16 = opt.fp16 self.models = list() self.model_types = list() # models are string with | as delimiter models = opt.model.split("|") print(models) self.n_models = len(models) self._type = 'text' check_m = None for i, model in enumerate(models): if opt.verbose: print('Loading model from %s' % model) checkpoint = torch.load(model, map_location=lambda storage, loc: storage) model_opt = checkpoint['opt'] if i == 0: if ("src" in checkpoint['dicts']): self.src_dict = checkpoint['dicts']['src'] else: self._type = "audio" self.tgt_dict = checkpoint['dicts']['tgt'] # Build model from the saved option model = build_model(model_opt, checkpoint['dicts']) model.load_state_dict(checkpoint['model']) check_m = checkpoint['model'] if opt.cuda: model = model.cuda() else: model = model.cpu() if opt.fp16: model = model.half() model.eval() self.models.append(model) self.model_types.append(model_opt.model) self.cuda = opt.cuda ## Autoencoder if opt.verbose: print('Loading autoencoder from %s' % opt.autoencoder) checkpoint = torch.load(opt.autoencoder, map_location=lambda storage, loc: storage) model_opt = checkpoint['opt'] posSize = checkpoint['autoencoder'][ 'nmt.decoder.positional_encoder.pos_emb'].size(0) self.models[0].decoder.renew_buffer(posSize) # Build model from the saved option self.autoencoder = Autoencoder(self.models[0], model_opt) self.autoencoder.load_state_dict(checkpoint['autoencoder']) for k in checkpoint['autoencoder']: if (k.startswith("nmt") and k[4:] in check_m): n = checkpoint['autoencoder'][k] o = check_m[k[4:]] if (o.size() != n.size()): print("Different size:", k[4:]) elif ((n - o).sum() != 0): print("Different weight:", k[4:]) if self.autoencoder.nmt.decoder.positional_encoder.len_max < self.opt.max_sent_length: self.autoencoder.nmt.decoder.renew_buffer(self.opt.max_sent_length) if opt.cuda: self.autoencoder = 
self.autoencoder.cuda() else: self.autoencoder = self.autoencoder.cpu() if opt.fp16: self.autoencoder = self.autoencoder.half() self.autoencoder.eval() if opt.verbose: print('Done')
def main():
    """Average the weights of several checkpoints into a single model.

    ``opt.models`` is a '|'-separated list of checkpoint paths.  The first
    checkpoint supplies the architecture and vocabularies; all models'
    parameters are combined with an arithmetic mean (``--method mean``) or
    a geometric mean (``--method gmean``) and the result is written to
    ``opt.output`` in the standard checkpoint layout.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # opt.models should be a string of models, split by |
    model_paths = opt.models.split("|")
    n_models = len(model_paths)

    print("Loading main model from %s ..." % model_paths[0])
    checkpoint = torch.load(model_paths[0], map_location=lambda storage, loc: storage)
    # Drop optimizer state to save memory.
    if 'optim' in checkpoint:
        del checkpoint['optim']
    model_opt = checkpoint['opt']
    dicts = checkpoint['dicts']
    main_model = build_model(model_opt, checkpoint['dicts'])
    main_model.load_state_dict(checkpoint['model'])
    if opt.cuda:
        main_model = main_model.cuda()

    # Fold every remaining model into main_model's parameters:
    # running sum for 'mean', running product for 'gmean'.
    for path in model_paths[1:]:
        print("Loading model from %s ..." % path)
        checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
        if 'optim' in checkpoint:
            del checkpoint['optim']

        current_model = build_model(model_opt, checkpoint['dicts'])
        current_model.load_state_dict(checkpoint['model'])
        if opt.cuda:
            current_model = current_model.cuda()

        if opt.method == 'mean':
            for main_param, param in zip(main_model.parameters(), current_model.parameters()):
                main_param.data.add_(param.data)
        elif opt.method == 'gmean':
            # NOTE(review): a geometric mean of parameters whose product is
            # negative yields NaN in the pow_ step below — use with care.
            for main_param, param in zip(main_model.parameters(), current_model.parameters()):
                main_param.data.mul_(param.data)
        else:
            raise NotImplementedError

    # Normalize the accumulated parameters.
    if opt.method == 'mean':
        for main_param in main_model.parameters():
            main_param.data.div_(n_models)
    elif opt.method == 'gmean':
        for main_param in main_model.parameters():
            main_param.data.pow_(1. / n_models)

    # Save in the same layout the trainer produces so the averaged model
    # can be loaded anywhere a regular checkpoint is accepted.
    save_checkpoint = {
        'model': main_model.state_dict(),
        'dicts': dicts,
        'opt': model_opt,
        'epoch': -1,
        'iteration': -1,
        'batchOrder': None,
        'optim': None
    }
    print("Saving averaged model to %s" % opt.output)
    torch.save(save_checkpoint, opt.output)
def __init__(self, opt):
    """Prepare an ensemble translator.

    Loads every checkpoint listed in ``opt.model`` (separated by '|'),
    sets up diverse beam search, and caches the special-token ids used
    during decoding.
    """
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch
    self.beam_accum = None
    self.beta = opt.beta
    self.alpha = opt.alpha
    self.start_with_bos = opt.start_with_bos
    self.fp16 = opt.fp16
    self.stop_early = True
    self.normalize_scores = opt.normalize
    self.len_penalty = opt.alpha
    self.models = []
    self.model_types = []
    self._type = 'text'

    # One checkpoint path per ensemble member, '|'-separated.
    checkpoint_paths = opt.model.split("|")
    print(checkpoint_paths)
    self.n_models = len(checkpoint_paths)

    for index, path in enumerate(checkpoint_paths):
        if opt.verbose:
            print('Loading model from %s' % path)
        checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

        # Vocabularies are shared across the ensemble: take them from
        # the first checkpoint only.
        if index == 0:
            self.src_dict = checkpoint['dicts']['src']
            self.tgt_dict = checkpoint['dicts']['tgt']

        # Rebuild the network from its saved options, then restore weights.
        member = build_model(model_opt, checkpoint['dicts'])
        member.load_state_dict(checkpoint['model'])

        # These model types need a positional buffer at least one step
        # longer than the longest sentence we intend to decode.
        if model_opt.model in model_list:
            if member.decoder.positional_encoder.len_max < self.opt.max_sent_length + 1:
                print("Not enough len to decode. Renewing .. ")
                member.decoder.renew_buffer(self.opt.max_sent_length + 1)

        if opt.fp16:
            member = member.half()
        member = member.cuda() if opt.cuda else member.cpu()
        member.eval()

        self.models.append(member)
        self.model_types.append(model_opt.model)

    self.cuda = opt.cuda
    self.ensemble_op = opt.ensemble_op

    # With one group, DiverseBeamSearch behaves exactly like plain
    # beam search.
    self.search = DiverseBeamSearch(self.tgt_dict, 1, self.opt.diverse_beam_strength)

    # Cache the special-token ids used by the decoding loop.
    self.eos = onmt.Constants.EOS
    self.pad = onmt.Constants.PAD
    self.bos = onmt.Constants.BOS
    self.unk = onmt.Constants.UNK
    self.vocab_size = self.tgt_dict.size()
    self.minlen = 1

    if opt.verbose:
        print('Done')
def __init__(self, opt):
    """Load an ensemble of multilingual NMT models for translation.

    ``opt.model`` is a '|'-separated list of checkpoint paths.  All
    checkpoints are assumed to share the dictionaries of the first one.
    Each model is switched to the (opt.src_lang, opt.tgt_lang) pair
    before being added to the ensemble.
    """
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch
    self.beam_accum = None
    self._type = "text"
    self.ensemble_op = opt.ensemble_op

    # opt.model should be a string of models, split by |
    models = opt.model.split("|")
    print(models)
    self.n_models = len(models)

    # only one src and target language
    self.models = list()
    self.logSoftMax = torch.nn.LogSoftmax()

    nSets = 0

    for i, model_path in enumerate(models):
        if opt.verbose:
            print('Loading model from %s' % model_path)
        # BUG FIX: load the individual checkpoint path, not the whole
        # '|'-joined opt.model string (which only worked for a single model).
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        if opt.verbose:
            print('Done')

        model_opt = update_opt(checkpoint['opt'])

        # delete optim information to save GPU memory
        if 'optim' in checkpoint:
            del checkpoint['optim']

        # assuming that all these models use the same dict
        # the first checkpoint's dict will be loaded
        if i == 0:
            self.dicts = checkpoint['dicts']
            self.src_dict = self.dicts['vocabs'][opt.src_lang]
            self.tgt_dict = self.dicts['vocabs'][opt.tgt_lang]
            nSets = self.dicts['nSets']

        # Build the model; critic weights belong to training only and are
        # dropped before loading.
        this_model, generator = build_model(model_opt, self.dicts, nSets)
        model_state_dict = {k: v for k, v in checkpoint['model'].items() if 'critic' not in k}
        this_model.load_state_dict(model_state_dict)
        generator.load_state_dict(checkpoint['generator'])

        if opt.cuda:
            this_model.cuda()
            generator.cuda()
        else:
            this_model.cpu()
            generator.cpu()

        this_model.generator = generator
        this_model.eval()

        # Need to find the src and tgt id
        srcID = self.dicts['srcLangs'].index(opt.src_lang)
        tgtID = self.dicts['tgtLangs'].index(opt.tgt_lang)

        # After that, look for the pairID.  A dedicated loop variable is
        # used so the outer enumerate index 'i' is not clobbered.
        setIDs = self.dicts['setIDs']
        pair = -1
        for pair_idx, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pair = pair_idx
                break

        assert pair >= 0, "Cannot find any language pair with your provided src and tgt id"
        print(" * Translating with pair %i " % pair)

        this_model.hardSwitchLangID(srcID, tgtID)
        this_model.switchPairID(pair)

        self.models.append(this_model)