def build_trainer(args, device_id, model, optim): """ Simplify `Trainer` creation based on user `opt`s* Args: opt (:obj:`Namespace`): user options (usually from argument parsing) model (:obj:`onmt.models.NMTModel`): the model to train fields (dict): dict of fields optim (:obj:`onmt.utils.Optimizer`): optimizer used during training data_type (str): string describing the type of data e.g. "text", "img", "audio" model_saver(:obj:`onmt.models.ModelSaverBase`): the utility object used to save the model """ device = "cpu" if args.visible_gpus == '-1' else "cuda" grad_accum_count = args.accum_count n_gpu = args.world_size if device_id >= 0: gpu_rank = int(args.gpu_ranks[device_id]) else: gpu_rank = 0 n_gpu = 0 print('gpu_rank %d' % gpu_rank) tensorboard_log_dir = args.model_path writer = SummaryWriter(tensorboard_log_dir, comment="Unmt") report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer) trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager) # print(tr) if (model): n_params = _tally_parameters(model) logger.info('* number of parameters: %d' % n_params) return trainer
def _save(self, step): real_model = self.model # real_generator = (self.generator.module # if isinstance(self.generator, torch.nn.DataParallel) # else self.generator) model_state_dict = real_model.state_dict() # generator_state_dict = real_generator.state_dict() checkpoint = { 'model': model_state_dict, # 'generator': generator_state_dict, 'opt': self.args, 'optim': self.optim, } checkpoint_path = os.path.join(self.args.model_path, 'model_step_%d.pt' % step) logger.info("Saving checkpoint %s" % checkpoint_path) # checkpoint_path = '%s_step_%d.pt' % (FLAGS.model_path, step) if (not os.path.exists(checkpoint_path)): torch.save(checkpoint, checkpoint_path) return checkpoint, checkpoint_path
def _format_to_bert(params): json_file, args, save_file = params if (os.path.exists(save_file)): logger.info('Ignore %s' % save_file) return bert = BertData(args) logger.info('Processing %s' % json_file) jobs = json.load(open(json_file)) datasets = [] for d in jobs: source, tgt = d['src'], d['tgt'] if (args.oracle_mode == 'greedy'): oracle_ids = greedy_selection(source, tgt, 3) elif (args.oracle_mode == 'combination'): oracle_ids = combination_selection(source, tgt, 3) b_data = bert.preprocess(source, tgt, oracle_ids) if (b_data is None): continue indexed_tokens, labels, segments_ids, cls_ids, src_txt, tgt_txt = b_data b_data_dict = { "src": indexed_tokens, "labels": labels, "segs": segments_ids, 'clss': cls_ids, 'src_txt': src_txt, "tgt_txt": tgt_txt } datasets.append(b_data_dict) logger.info('Saving to %s' % save_file) torch.save(datasets, save_file) datasets = [] gc.collect()
def train(self, train_iter_fct, train_steps, valid_iter_fct=None, valid_steps=-1): """ The main training loops. by iterating over training data (i.e. `train_iter_fct`) and running validation (i.e. iterating over `valid_iter_fct` Args: train_iter_fct(function): a function that returns the train iterator. e.g. something like train_iter_fct = lambda: generator(*args, **kwargs) valid_iter_fct(function): same as train_iter_fct, for valid data train_steps(int): valid_steps(int): save_checkpoint_steps(int): Return: None """ logger.info('Start training...') # step = self.optim._step + 1 step = self.optim._step + 1 true_batchs = [] accum = 0 normalization = 0 train_iter = train_iter_fct() total_stats = Statistics() report_stats = Statistics() self._start_report_manager(start_time=total_stats.start_time) while step <= train_steps: reduce_counter = 0 for i, batch in enumerate(train_iter): if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank): true_batchs.append(batch) normalization += batch.batch_size accum += 1 if accum == self.grad_accum_count: reduce_counter += 1 if self.n_gpu > 1: normalization = sum(distributed .all_gather_list (normalization)) self._gradient_accumulation( true_batchs, normalization, total_stats, report_stats) report_stats = self._maybe_report_training( step, train_steps, self.optim.learning_rate, report_stats) true_batchs = [] accum = 0 normalization = 0 if (step % self.save_checkpoint_steps == 0 and self.gpu_rank == 0): self._save(step) step += 1 if step > train_steps: break train_iter = train_iter_fct() return total_stats
def _lazy_dataset_loader(pt_file, corpus_type): dataset = torch.load(pt_file) logger.info('Loading %s dataset from %s, number of examples: %d' % (corpus_type, pt_file, len(dataset))) return dataset
def log(self, *args, **kwargs): logger.info(*args, **kwargs)