def __init__(self, model, fields, model_options, translate_options):
    # Add in default model arguments, possibly added since training.
    # Note: the fields here should contain src_map, so this is not what the
    # translation_dataset loader produces, since that removes those fields. Be careful.
    self.opt = mhf.convertToNamedTuple(translate_options)
    self.fields = fields
    model_opt = model_options
    for arg in translate_options:
        if arg not in model_opt:
            model_opt[arg] = translate_options[arg]
    model_opt = mhf.convertToNamedTuple(model_opt)

    self._type = model_opt.model_type
    self.copy_attn = model_opt.copy_attn
    self.model = model
    self.model.eval()
    self.model.generator.eval()

    # for debugging
    self.beam_accum = None
def create_model(fields, options=None):
    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    model = onmt.ModelConstructor.make_base_model(options, fields, USE_CUDA, checkpoint=None)
    if len(options.gpuid) > 1:
        model = nn.DataParallel(model, device_ids=options.gpuid, dim=1)
    return model
def create_optimizer(model_or_iterable, options=None):
    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    optim = onmt.Optim(options.optim, options.learning_rate, options.max_grad_norm,
                       lr_decay=options.learning_rate_decay,
                       start_decay_at=options.start_decay_at, opt=options)
    try:
        # model_or_iterable may be a model exposing .parameters() ...
        optim.set_parameters(model_or_iterable.parameters())
    except AttributeError:
        # ... or a plain iterable of tensors/Variables.
        optim.set_parameters(model_or_iterable)
    return optim
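# Hypothetical usage sketch (not part of the original code): it only illustrates how
# create_model and create_optimizer above fit together. The vocab path is an illustrative
# placeholder; loading fields via onmt.IO.ONMTDataset.load_fields(torch.load(...)) mirrors
# what translate_sequences below does, and the module-level imports (onmt, torch, copy) are
# assumed to be present as used throughout this file.
def _example_create_model_and_optimizer(vocab_file='data/processed.vocab.pt'):
    # Rebuild the fields from a saved vocab file, then build model + optimizer with the
    # defaults in onmt.standard_options.stdOptions.
    fields = onmt.IO.ONMTDataset.load_fields(torch.load(vocab_file))
    model = create_model(fields)
    optim = create_optimizer(model)
    # create_optimizer also accepts a plain iterable of tensors (see the try/except on
    # .parameters()), which is how optimize_quantization_points uses it below.
    return model, optim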
def process_dataset(self):
    stdProcessOptions = onmt.standard_options.standardPreProcessingOptions
    stdProcessOptions = mhf.convertToNamedTuple(stdProcessOptions)

    print('Preparing training...')
    with codecs.open(self.trainFilesPath[0], "r", "utf-8") as src_file:
        src_line = src_file.readline().strip().split()
        _, _, nFeatures = onmt.IO.extract_features(src_line)
    fields = onmt.IO.ONMTDataset.get_fields(nFeatures)

    print("Building training set...")
    train = onmt.IO.ONMTDataset(self.trainFilesPath[0], self.trainFilesPath[1],
                                fields, stdProcessOptions)
    print("Building vocab...")
    onmt.IO.ONMTDataset.build_vocab(train, stdProcessOptions)
    print("Building test set...")
    test = onmt.IO.ONMTDataset(self.testFilesPath[0], self.testFilesPath[1],
                               fields, stdProcessOptions)

    print("Saving train/test/fields...")
    # Can't save fields, so remove them here and reconstruct them at training time.
    with open(self.processedFilesPath[0], 'wb') as processed_vocab, \
            open(self.processedFilesPath[1], 'wb') as processed_train, \
            open(self.processedFilesPath[2], 'wb') as processed_test:
        torch.save(onmt.IO.ONMTDataset.save_vocab(fields), processed_vocab)
        train.fields = []
        test.fields = []
        torch.save(train, processed_train)
        torch.save(test, processed_test)
    print('Saving done.')
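# Hypothetical loading sketch (not part of the original code): process_dataset above strips the
# fields before saving, so at training time they have to be reconstructed from the saved vocab
# and reattached, roughly as below. The argument names are placeholders mirroring
# self.processedFilesPath; adapt them to your own paths.
def _example_reload_processed(processed_vocab_path, processed_train_path, processed_test_path):
    fields = onmt.IO.ONMTDataset.load_fields(torch.load(processed_vocab_path))
    train = torch.load(processed_train_path)
    test = torch.load(processed_test_path)
    # Reattach the fields that were removed before saving.
    train.fields, test.fields = fields, fields
    return fields, train, test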
def optimize_quantization_points(modelToQuantize, train_loader, test_loader, options, optim=None,
                                 numPointsPerTensor=16, assignBitsAutomatically=False,
                                 use_distillation_loss=False, bucket_size=None):

    print('Preparing training - pre-processing tensors')

    if options is None:
        options = onmt.standard_options.stdOptions
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)

    fields = train_loader.dataset.fields
    train_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab, train_loader.dataset,
                                   options.copy_attn, options.copy_attn_force)
    valid_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab, test_loader.dataset,
                                   options.copy_attn, options.copy_attn_force)
    trunc_size = options.truncated_decoder  # badly named option
    shard_size = options.max_generator_batches

    numTensorsNetwork = sum(1 for _ in quantizedModel.parameters())
    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork
    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError('numPointsPerTensor must have one entry per tensor in the network')

    scalingFunction = quantization.ScalingFunction(type_scaling='linear', max_element=False,
                                                   subtract_mean=False, modify_in_place=False,
                                                   bucket_size=bucket_size)

    quantizedModel.zero_grad()
    dummy_optim = create_optimizer(quantizedModel, options)  # dummy optim, just to pass to the trainer

    if assignBitsAutomatically:
        trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader, train_loss,
                                valid_loss, dummy_optim, trunc_size, shard_size)
        batch = next(iter(train_loader))
        quantizedModel.zero_grad()
        trainer.forward_and_backward(0, batch, 0, onmt.Statistics(), None)
        fisherInformation = []
        for p in quantizedModel.parameters():
            fisherInformation.append(p.grad.data.norm())
        numPointsPerTensor = qhf.assign_bits_automatically(fisherInformation, numPointsPerTensor,
                                                           input_is_point=True)
        quantizedModel.zero_grad()
        del trainer
        del optim

    # Initialize the points using the percentile function so as to make them all usable.
    pointsPerTensor = []
    for idx, p in enumerate(quantizedModel.parameters()):
        initial_points = qhf.initialize_quantization_points(p.data, scalingFunction,
                                                            numPointsPerTensor[idx])
        initial_points = Variable(initial_points, requires_grad=True)
        # Do a dummy backprop so that the grad attribute is initialized. We need this because we
        # call .backward() manually later on (since pytorch can't assign Variables to model
        # parameters).
        initial_points.sum().backward()
        pointsPerTensor.append(initial_points)

    optionsOpt = copy.deepcopy(mhf.convertToDictionary(options))
    optimizer = create_optimizer(pointsPerTensor, mhf.convertToNamedTuple(optionsOpt))

    trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader, train_loss, valid_loss,
                            dummy_optim, trunc_size, shard_size)

    perplexity_epochs = []
    quantizationFunctions = []
    for idx, p in enumerate(modelToQuantize.parameters()):
        # efficient version of nonUniformQuantization
        quant_fun = quantization.nonUniformQuantization_variable(max_element=False,
                                                                 subtract_mean=False,
                                                                 modify_in_place=False,
                                                                 bucket_size=bucket_size,
                                                                 pre_process_tensors=True,
                                                                 tensor=p.data)
        quantizationFunctions.append(quant_fun)

    print('Pre-processing done, training started')

    for epoch in range(options.start_epoch, options.epochs + 1):
        train_stats = onmt.Statistics()
        quantizedModel.train()
        for idx_batch, batch in enumerate(train_loader):
            # zero the gradient
            quantizedModel.zero_grad()

            # Quantize the weights. We use the efficient version of nonUniformQuantization:
            # the tensors (which don't change across iterations) are stored inside the
            # quantization functions, so we only need to pass the quantization points.
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                p_quantized.data = quantizationFunctions[idx].forward(None, pointsPerTensor[idx].data)

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats, report_func,
                                         use_distillation_loss, modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for idx, p in enumerate(quantizedModel.parameters()):
                pointsPerTensor[idx].grad.data = quantizationFunctions[idx].backward(p.grad.data)[1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        # 2. Validate on the validation set.
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        perplexity_epochs.append(valid_stats.ppl())

        # 3. Update the learning rate.
        optimizer.updateLearningRate(valid_stats.ppl(), epoch)

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict['numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return pointsPerTensor, informationDict
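# Hypothetical usage sketch (not part of the original code): it shows how
# optimize_quantization_points above would be invoked on an already-trained model. The keyword
# values (point count, bucket size) are illustrative only, and the loaders are assumed to follow
# the same ONMTDataset/OrderedIterator pattern used elsewhere in this file.
def _example_optimize_points(trained_model, train_loader, test_loader):
    points, info = optimize_quantization_points(trained_model, train_loader, test_loader,
                                                options=None,           # falls back to stdOptions
                                                numPointsPerTensor=16,  # one int -> same for every tensor
                                                assignBitsAutomatically=True,
                                                use_distillation_loss=True,
                                                bucket_size=256)
    print('Best validation perplexity: %g' % min(info['perplexity']))
    return points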
def train_model(model, train_loader, test_loader, plot_path, optim=None, options=None,
                stochasticRounding=False, quantizeWeights=False, numBits=8,
                maxElementAllowedForQuantization=False, bucket_size=None,
                subtractMeanInQuantization=False,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none', num_estimate_quant_grad=1,
                use_distillation_loss=False, teacher_model=None,
                quantize_first_and_last_layer=True):

    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    if optim is None:
        optim = create_optimizer(model, options)

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError('If training with word-level distillation, a teacher_model must be passed')

    if teacher_model is not None:
        teacher_model.eval()

    step_since_last_grad_quant_estimation = 0
    num_param_model = sum(1 for _ in model.parameters())

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2 ** (numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2 ** numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError('The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in ('none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x, s, type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False, bucket_size=bucket_size)[0]
        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(
                s, type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False, bucket_size=bucket_size)
                for _ in model.parameters()]
        else:
            raise ValueError('The specified backprop_quantization_style was not recognized')

    fields = train_loader.dataset.fields

    # Collect features.
    src_features = collect_features(train_loader.dataset, fields)
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(fields[feat].vocab)))

    train_loss = make_loss_compute(model, fields["tgt"].vocab, train_loader.dataset,
                                   options.copy_attn, options.copy_attn_force,
                                   use_distillation_loss, teacher_model)
    # For validation we don't use the distillation loss; it would screw up the perplexity computation.
    valid_loss = make_loss_compute(model, fields["tgt"].vocab, test_loader.dataset,
                                   options.copy_attn, options.copy_attn_force)

    trunc_size = None  # options.truncated_decoder  # badly named option
    shard_size = options.max_generator_batches

    trn_writer = tbx.SummaryWriter(plot_path + '_output/train')
    tst_writer = tbx.SummaryWriter(plot_path + '_output/test')

    trainer = thf.MyTrainer(model, train_loader, test_loader, train_loss, valid_loss,
                            optim, trunc_size, shard_size)

    perplexity_epochs = []
    for epoch in range(options.start_epoch, options.epochs + 1):
        MAX_Memory = 0
        train_stats = onmt.Statistics()
        model.train()
        for idx_batch, batch in enumerate(train_loader):
            model.zero_grad()
            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    # We save the full-precision weights because we only want quantized weights
                    # when computing gradients, but keep using non-quantized weights during
                    # the rest of the algorithm.
                    model_state_dict = model.state_dict()
                    for idx, p in enumerate(model.parameters()):
                        if quantize_first_and_last_layer is False:
                            if idx == 0 or idx == num_param_model - 1:
                                continue
                        if backprop_quantization_style == 'truncated':
                            p.data.clamp_(-1, 1)  # TODO: is clamping the weights necessary here?
                        if backprop_quantization_style in ('none', 'truncated'):
                            p.data = quantizeFunctions(p.data)
                        elif backprop_quantization_style == 'complicated':
                            p.data = quantizeFunctions[idx].forward(p.data)
                        else:
                            raise ValueError

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats, report_func,
                                         use_distillation_loss, teacher_model)

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    model.load_state_dict(model_state_dict)
                    del model_state_dict  # free memory
                    if backprop_quantization_style in ('truncated', 'complicated'):
                        for idx, p in enumerate(model.parameters()):
                            if quantize_first_and_last_layer is False:
                                if idx == 0 or idx == num_param_model - 1:
                                    continue
                            # Now some sort of backward. For the 'none' style we don't do anything.
                            # For the 'truncated' style we just truncate the gradients of the weights,
                            # as per the paper: https://arxiv.org/pdf/1609.07061.pdf
                            # 'complicated' is my own derivation, but I am unsure whether to use it.
                            if backprop_quantization_style == 'truncated':
                                p.grad.data[p.data.abs() > 1] = 0
                            elif backprop_quantization_style == 'complicated':
                                p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

            # update parameters after every batch
            trainer.optim.step()

            if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                step_since_last_grad_quant_estimation = 0
            step_since_last_grad_quant_estimation += 1

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())
        trn_writer.add_scalar('ppl', train_stats.ppl(), epoch + 1)
        trn_writer.add_scalar('acc', train_stats.accuracy(), epoch + 1)

        # 2. Validate on the validation set.
        MAX_Memory = max(MAX_Memory, torch.cuda.max_memory_allocated())
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        print('Max allocated memory: {:.2f}MB'.format(MAX_Memory / (1024 ** 2)))
        perplexity_epochs.append(valid_stats.ppl())
        tst_writer.add_scalar('ppl', valid_stats.ppl(), epoch + 1)
        tst_writer.add_scalar('acc', valid_stats.accuracy(), epoch + 1)

        # 3. Update the learning rate.
        trainer.epoch_step(valid_stats.ppl(), epoch)

    if quantizeWeights:
        for idx, p in enumerate(model.parameters()):
            if backprop_quantization_style == 'truncated':
                p.data.clamp_(-1, 1)  # TODO: is clamping the weights necessary here?
            if backprop_quantization_style in ('none', 'truncated'):
                p.data = quantizeFunctions(p.data)
            elif backprop_quantization_style == 'complicated':
                p.data = quantizeFunctions[idx].forward(p.data)
                del quantizeFunctions[idx].saved_for_backward
                quantizeFunctions[idx].saved_for_backward = None  # free memory
            else:
                raise ValueError

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict['numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return model, informationDict
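# Hypothetical usage sketch (not part of the original code): a typical call of train_model above
# with uniform weight quantization and truncated gradient backprop. plot_path, the bit width and
# the bucket size are illustrative values only; the loaders are assumed to be OrderedIterators
# over the datasets produced by process_dataset.
def _example_train_quantized(model, train_loader, test_loader):
    model, info = train_model(model, train_loader, test_loader,
                              plot_path='runs/quantized_8bit',
                              quantizeWeights=True,
                              numBits=8,
                              bucket_size=256,
                              backprop_quantization_style='truncated',
                              quantize_first_and_last_layer=False)
    print('Trained for {} epochs'.format(info['numEpochsTrained']))
    return model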
def translate_sequences(model, vocab_file, model_options, translate_options, source_to_translate,
                        out_file_path=None, verbose=True, percentages_to_show=25,
                        replace_unk_hack=False, test_memory=False):
    """Translate phrases using the model passed as a parameter."""

    fields = onmt.IO.ONMTDataset.load_fields(torch.load(vocab_file))
    translator = Translator_modified(model, fields, model_options, translate_options)

    for key, val in onmt.standard_options.standardPreProcessingOptions.items():
        if key not in translate_options:
            translate_options[key] = val
    translate_options = mhf.convertToNamedTuple(translate_options)
    model_options = mhf.convertToNamedTuple(model_options)

    iterator_created = False
    dataset_created = False
    is_source_file = False

    # source_to_translate can be different things:
    #  - an ONMTDataset (must have been created with options=None, though!)
    #  - an OrderedIterator (must also have been created with options=None)
    #  - a path to a file
    #  - a string of phrases separated by \n
    #  - a list of strings
    if isinstance(source_to_translate, onmt.IO.ONMTDataset):
        dataset = source_to_translate
        dataset_created = True
    elif isinstance(source_to_translate, onmt.IO.OrderedIterator):
        data_generator = source_to_translate
        dataset = source_to_translate.dataset
        dataset_created = True
        iterator_created = True
    elif isinstance(source_to_translate, str):
        if os.path.exists(source_to_translate):
            dataset = onmt.IO.ONMTDataset(source_to_translate, source_to_translate,
                                          translator.fields, None)
            dataset_created = True
            is_source_file, source_file_path = True, source_to_translate
        else:
            source_to_translate = re.sub('\n+', '\n', source_to_translate)
            source_to_translate = [x for x in source_to_translate.split('\n') if x]

    if isinstance(source_to_translate, list):
        temp_file_path = os.path.abspath('temp_file_translate_pytorch_{}'.format(uuid.uuid4()))
        with open(temp_file_path, 'w') as temp_file:
            for line in source_to_translate:
                temp_file.write(line + '\n')
        is_source_file, source_file_path = True, temp_file_path
        dataset = onmt.IO.ONMTDataset(temp_file_path, temp_file_path, translator.fields, None)
        dataset_created = True

    if not dataset_created:
        raise ValueError('source_to_translate could not be interpreted correctly')

    if not iterator_created:
        data_generator = onmt.IO.OrderedIterator(dataset=dataset,
                                                 # device=model_options.gpu,
                                                 batch_size=translate_options.batch_size,
                                                 train=False, sort=False, shuffle=False)

    if out_file_path is None:
        res = ''
    else:
        out_file = open(out_file_path, 'w')

    next_percentage_to_show = percentages_to_show
    total_num_batches = len(data_generator)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    counter = count(1)

    if replace_unk_hack and is_source_file:
        if data_generator.batch_size != 1:
            raise ValueError('For now the replace_unk_hack only works with batch size 1')
        source_file = open(source_file_path, 'r')
        lines_iterator = (line for line in source_file)

    try:
        for idx, batch in enumerate(data_generator):
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src = \
                translator.translate(batch, dataset)
            if test_memory and idx == 10:
                return torch.cuda.max_memory_allocated()
            pred_score_total += sum(score[0] for score in pred_scores)
            pred_words_total += sum(len(x[0]) for x in pred_batch)
            if translate_options.tgt:
                gold_score_total += sum(gold_scores)
                gold_words_total += sum(len(x) for x in batch.tgt[1:])

            # z_batch: an iterator over the predictions, their scores, the gold sentence, its
            # score, and the source sentence for each sentence in the batch. It has to be
            # zip_longest instead of plain-old zip because gold_batch has length 0 if the
            # target is not included.
            z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores,
                                  (sent.squeeze(1) for sent in src.split(1, dim=1)))

            for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
                n_best_preds = [pred for pred in pred_sents[:translate_options.n_best]]
                if replace_unk_hack and is_source_file:
                    original_line = next(lines_iterator).split(' ')
                    for pred in n_best_preds:
                        for idx_tok, tok in enumerate(pred):
                            if tok == '<unk>':
                                _, maxIndex = attn[0][0][idx_tok].max(0)
                                pred[idx_tok] = original_line[maxIndex[0]]
                n_best_preds = [" ".join(pred) for pred in n_best_preds]
                strToWrite = '\n'.join(n_best_preds) + '\n'
                if out_file_path is None:
                    res += strToWrite
                else:
                    out_file.write(strToWrite)
                    out_file.flush()

                if translate_options.verbose:
                    sent_number = next(counter)
                    words = get_src_words(src_sent, translator.fields["src"].vocab.itos)
                    os.write(1, bytes('\nSENT %d: %s\n' % (sent_number, words), 'UTF-8'))
                    best_pred = n_best_preds[0]
                    best_score = pred_score[0]
                    os.write(1, bytes('PRED %d: %s\n' % (sent_number, best_pred), 'UTF-8'))
                    print("PRED SCORE: %.4f" % best_score)
                    if translate_options.tgt:
                        tgt_sent = ' '.join(gold_sent)
                        os.write(1, bytes('GOLD %d: %s\n' % (sent_number, tgt_sent), 'UTF-8'))
                        print("GOLD SCORE: %.4f" % gold_score)
                    if len(n_best_preds) > 1:
                        print('\nBEST HYP:')
                        for score, sent in zip(pred_score, n_best_preds):
                            os.write(1, bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))

            if idx / total_num_batches * 100 >= next_percentage_to_show:
                if verbose:
                    print('Total completed: {:.2f}%'.format(idx / total_num_batches * 100))
                next_percentage_to_show += percentages_to_show

        # report_score('PRED', pred_score_total, pred_words_total)
        if translate_options.tgt:
            report_score('GOLD', gold_score_total, gold_words_total)
    except Exception as e:
        print('An error occurred while translating sentences: {}'.format(e))

    try:
        source_file.close()
    except:
        pass
    try:
        os.remove(temp_file_path)
    except:
        pass

    if out_file_path is None:
        return res
    else:
        out_file.close()
        return out_file_path
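# Hypothetical usage sketch (not part of the original code): translate_sequences above accepts
# raw newline-separated text as well as datasets, iterators, or file paths. The vocab path is a
# placeholder, and the translate_options dict below only lists the keys this function reads
# directly (missing pre-processing keys are merged in from standardPreProcessingOptions by the
# function itself); the full set of keys required by Translator_modified depends on your setup.
def _example_translate(model, vocab_file='data/processed.vocab.pt'):
    model_options = copy.deepcopy(onmt.standard_options.stdOptions)
    translate_options = {'batch_size': 1, 'n_best': 1, 'tgt': None, 'verbose': False}
    phrases = 'first sentence to translate\nsecond sentence to translate'
    # With out_file_path=None the translations are returned as a single string.
    return translate_sequences(model, vocab_file, model_options, translate_options,
                               phrases, out_file_path=None, verbose=False)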