def get_optimizer(model: Model) -> optim.Optimizer:
    """Obtain the optimizer used for training the model."""
    if Config().trainer.optimizer == 'SGD':
        return optim.SGD(model.parameters(),
                         lr=Config().trainer.learning_rate,
                         momentum=Config().trainer.momentum,
                         weight_decay=Config().trainer.weight_decay)
    elif Config().trainer.optimizer == 'Adam':
        return optim.Adam(model.parameters(),
                          lr=Config().trainer.learning_rate,
                          weight_decay=Config().trainer.weight_decay)
    elif Config().trainer.optimizer == 'FedProx':
        return FedProxOptimizer(model.parameters(),
                                lr=Config().trainer.learning_rate,
                                momentum=Config().trainer.momentum,
                                weight_decay=Config().trainer.weight_decay)
    elif Config().trainer.optimizer == 'Scaffold':
        return ScaffoldOptimizer(model.parameters(),
                                 lr=Config().trainer.learning_rate,
                                 momentum=Config().trainer.momentum,
                                 weight_decay=Config().trainer.weight_decay)
    elif Config().trainer.optimizer == 'FedSarah':
        return FedSarahOptimizer(model.parameters(),
                                 lr=Config().trainer.learning_rate,
                                 momentum=Config().trainer.momentum,
                                 weight_decay=Config().trainer.weight_decay)

    raise ValueError('No such optimizer: {}'.format(Config().trainer.optimizer))

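# The elif chain above dispatches on Config().trainer.optimizer. As an
# illustrative alternative only (a sketch, not the original code), the same
# dispatch can be written as a registry keyed by the configured name; it
# assumes the same optimizer classes and Config() singleton used above.
_OPTIMIZER_REGISTRY = {
    'SGD': optim.SGD,
    'Adam': optim.Adam,
    'FedProx': FedProxOptimizer,
    'Scaffold': ScaffoldOptimizer,
    'FedSarah': FedSarahOptimizer,
}

def get_optimizer_from_registry(model: Model) -> optim.Optimizer:
    """Sketch: registry-based variant of get_optimizer above."""
    name = Config().trainer.optimizer
    if name not in _OPTIMIZER_REGISTRY:
        raise ValueError('No such optimizer: {}'.format(name))
    kwargs = {'lr': Config().trainer.learning_rate,
              'weight_decay': Config().trainer.weight_decay}
    if name != 'Adam':
        # Every optimizer above except Adam is constructed with momentum.
        kwargs['momentum'] = Config().trainer.momentum
    return _OPTIMIZER_REGISTRY[name](model.parameters(), **kwargs)
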
def _eval_discriminative_model(self, model: Model, writer: SummaryWriter,
                               step, eval_title):
    training = model.training
    model.eval()
    K = 5
    totals = []
    corrects_1 = []
    corrects_k = []

    # Accuracy of each subset
    for subset_name, subset in self.subsets.items():
        data = DataLoader(
            subset,
            batch_size=self.config['eval_batch_size'],
            num_workers=self.config['eval_num_workers'],
            collate_fn=self.collate_fn,
        )
        total = 0.
        correct_1 = 0.
        correct_k = 0.
        for x, y in iter(data):
            b = x.size(0)
            with torch.no_grad():
                logits = model(x).view(b, -1)  # [B, num_classes]
            _, pred_topk = logits.topk(K, dim=1)  # [B, K]
            correct_topk = (pred_topk.cpu() == y.view(b, -1).expand_as(pred_topk)).float()
            correct_1 += correct_topk[:, :1].view(-1).cpu().sum()
            correct_k += correct_topk[:, :K].view(-1).cpu().sum()
            total += x.size(0)
        totals.append(total)
        corrects_1.append(correct_1)
        corrects_k.append(correct_k)
        accuracy_1 = correct_1 / total
        accuracy_k = correct_k / total
        writer.add_scalar(
            'accuracy_1/%s/%s/%s' % (eval_title, self.name, subset_name),
            accuracy_1, step)
        writer.add_scalar(
            'accuracy_%d/%s/%s/%s' % (K, eval_title, self.name, subset_name),
            accuracy_k, step)

    # Overall accuracy
    total = sum(totals)
    correct_1 = sum(corrects_1)
    correct_k = sum(corrects_k)
    accuracy_1 = correct_1 / total
    accuracy_k = correct_k / total
    writer.add_scalar('accuracy_1/%s/%s/overall' % (eval_title, self.name),
                      accuracy_1, step)
    writer.add_scalar('accuracy_%d/%s/%s/overall' % (K, eval_title, self.name),
                      accuracy_k, step)
    model.train(training)

def test_convert_onnx(self):
    model = Model()
    model.train(False)
    torch.onnx.export(
        model,
        self.org_dummy_input,
        self.model_onnx_path,
        verbose=True,
        operator_export_type=OPERATOR_EXPORT_TYPE,
    )
    print("Export of torch_model.onnx complete!")

def train_model(model: Model,
                epochs: int,
                batch_size: int,
                use_wandb: bool = False) -> Model:
    """Train model."""
    callbacks = []

    if EARLY_STOPPING:
        early_stopping = EarlyStopping(monitor="val_loss",
                                       min_delta=0.01,
                                       patience=3,
                                       verbose=1,
                                       mode="auto")
        callbacks.append(early_stopping)

    model.network.summary()

    t = time()
    _history = model.fit(dataset=dataset,
                         batch_size=batch_size,
                         epochs=epochs,
                         callbacks=callbacks)
    print("Training took {:.2f} s".format(time() - t))

    return model

def visualize(self, model: Model, writer: SummaryWriter, epoch, step):
    training = model.training
    model.eval()

    vis_indices = self.config['vis_indices']
    if isinstance(self.config['vis_indices'], int):
        # sample k data points from n data points with equal interval
        n = len(self)
        k = self.config['vis_indices']
        vis_indices = torch.linspace(0, n - 1, k) \
            .type(torch.IntTensor).tolist()

    self.visualize_data(model, writer, self, vis_indices, 'val_pc', step)

    model.train(training)

def get_optimizer(training_hparams: TrainingHparams, model: Model) -> torch.optim.Optimizer:
    if training_hparams.optimizer_name == 'sgd':
        return torch.optim.SGD(
            model.parameters(),
            lr=training_hparams.lr,
            momentum=training_hparams.momentum or training_hparams.nesterov_momentum or 0,
            weight_decay=training_hparams.weight_decay or 0,
            nesterov=training_hparams.nesterov_momentum is not None and training_hparams.nesterov_momentum > 0
        )
    elif training_hparams.optimizer_name == 'adam':
        return torch.optim.Adam(
            model.parameters(),
            lr=training_hparams.lr,
            weight_decay=training_hparams.weight_decay or 0
        )

    raise ValueError('No such optimizer: {}'.format(training_hparams.optimizer_name))

def accumulate(
    training_hparams: hparams.TrainingHparams,
    model: Model,
    train_loader: DataLoader,
    data_order_seed: int = None,
    suffix: str = ''
):
    """Accumulate the gradient for one training epoch.

    Args:
      * training_hparams: The training hyperparameters whose schema is specified in hparams.py.
      * model: The model to train. Must be a models.base.Model
      * train_loader: The training data. Must be a datasets.base.DataLoader
      * data_order_seed: The RNG seed for data shuffling.
    """
    # Adapt for FP16.
    if training_hparams.apex_fp16:
        if NO_APEX:
            raise ImportError('Must install nvidia apex to use this model.')
        model = apex.amp.initialize(model, loss_scale='dynamic', verbosity=0)

    # Handle parallelism if applicable.
    if get_platform().is_distributed:
        model = DistributedDataParallel(model, device_ids=[get_platform().rank])
    elif get_platform().is_parallel:
        model = DataParallel(model)

    train_loader.shuffle(data_order_seed)
    for it, (examples, labels) in enumerate(train_loader):
        examples = examples.to(device=get_platform().torch_device)
        labels = labels.to(device=get_platform().torch_device)

        model.eval()
        loss = model.loss_criterion(model(examples), labels)
        # No optimizer exists in this function, so apex.amp.scale_loss cannot be
        # used here; accumulate the unscaled gradients directly.
        loss.backward()

    get_platform().barrier()

def _eval_model(self, model: Model, writer: SummaryWriter,
                step, t, eval_title, results_dict=None):
    training = model.training
    model.eval()
    totals = []
    corrects = []

    # Accuracy of each subset
    for subset_name, subset in self.subsets.items():
        data = DataLoader(
            subset,
            batch_size=self.config['eval_batch_size'],
            num_workers=self.config['eval_num_workers'],
            collate_fn=self.collate_fn,
        )
        total = 0.
        correct = 0.
        for x, y in iter(data):
            with torch.no_grad():
                pred = model(x).view(x.size(0), -1).argmax(dim=1)
            total += x.size(0)
            correct += (pred.cpu() == y).float().sum()
        totals.append(total)
        corrects.append(correct)
        accuracy = correct / total
        writer.add_scalar(
            'accuracy/%s/%s/%s' % (eval_title, self.name, subset_name),
            accuracy, step)

    # Overall accuracy
    total = sum(totals)
    correct = sum(corrects)
    accuracy = correct / total
    writer.add_scalar('accuracy/%s/%s/overall' % (eval_title, self.name),
                      accuracy, step)
    model.train(training)

class Message(Model):
    class Status:
        READY = "READY"
        DONE = "DONE"
        ERROR = "ERROR"

    table_name = "messages"
    fields = {
        "recipient": Model.REQUIRED_FIELD,
        "body": Model.REQUIRED_FIELD,
        "last_update": Model.DEFAULT_VALUE(r.now()),
        "created": Model.DEFAULT_VALUE(r.now()),
        "status": Model.DEFAULT_VALUE(Status.READY),
        "error": Model.DEFAULT_VALUE("")
    }

    async def save(self, **kwargs):
        self._data["last_update"] = r.now()
        return await super().save(**kwargs)

def train_model(config, model: Model, scheduler: DataScheduler,
                writer: SummaryWriter):
    saved_model_path = os.path.join(config['log_dir'], 'ckpts')
    os.makedirs(saved_model_path, exist_ok=True)

    skip_batch = 0
    for step, (x, y, epoch) in enumerate(scheduler):
        x, y = x.to(config['device']), y.to(config['device'])

        # Since the number of points varies across the dataset,
        # skip the batch if a GPU overflow occurs.
        if config['skip_gpu_overflow']:
            try:
                train_loss = model.learn(x, y, step)
            except RuntimeError:
                skip_batch += 1
                continue
        else:
            train_loss = model.learn(x, y, step)  # model learns

        print('\r[Epoch {:4}, Step {:7}, Overflow: {:7}, Loss {:5}]'.format(
            epoch, step, skip_batch, '%.3f' % train_loss), end='')

        # evaluate
        if scheduler.check_eval_step(step):
            scheduler.eval(model, writer, step)

        if scheduler.check_vis_step(step):
            print("\nVisualizing...")
            scheduler.visualize(model, writer, step)

        writer.add_scalar('skip_batch', skip_batch, step)

        if (step + 1) % config['ckpt_step'] == 0:
            torch.save(
                model.state_dict(),
                os.path.join(saved_model_path,
                             'ckpt-step-{}'.format(str(step + 1).zfill(3))))

        model.lr_scheduler.step()

def predict(model: Model, dataset: DatasetBase, restore_path: Path):
    restore_path = restore_path.expanduser().absolute()

    model.build_graph()
    saver = tf.train.Saver(save_relative_paths=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, str(restore_path))

        sess.run(dataset.test_init_op)
        while True:
            try:
                pred, x, y = sess.run(
                    [model.predict(), dataset.x, dataset.y],
                    feed_dict={tf.keras.backend.learning_phase(): 0})
                pred = np.argmax(pred)
                print(pred)
                io.imshow(x[0, :, :, 0], cmap="gray")
                io.show()
            except tf.errors.OutOfRangeError:
                # Re-initialize the test iterator once it is exhausted.
                sess.run(dataset.test_init_op)
                continue

def test_from_orient(self):
    create_class(ClassToInsert, client=self.client)

    class_to_insert = ClassToInsert()
    class_to_insert.int_field = 10
    class_to_insert.str_field = 'foobar'
    class_to_insert.datetime_field = Arrow.utcnow()
    class_to_insert.float_field = 12345.547
    class_to_insert.bin_field = bytes('foo', 'utf-8')

    insert(class_to_insert, client=self.client)
    r = load(class_to_insert.rid, client=self.client)
    result = Model.from_orient(r)

    self.assertEqual(class_to_insert.__class__, result.__class__)
    self.assertEqual(result.rid, class_to_insert.rid)
    self.assertEqual(result.str_field, class_to_insert.str_field)
    self.assertEqual(result.int_field, class_to_insert.int_field)
    self.assertEqual(result.datetime_field, class_to_insert.datetime_field)
    self.assertEqual(result.float_field, class_to_insert.float_field)
    self.assertEqual(result.bin_field, class_to_insert.bin_field)

def train_model(model: Model,
                dataset: Dataset,
                epochs: int,
                batch_size: int,
                learning_rate: float,
                gpu_ind: Optional[int] = None,
                use_wandb=False) -> Model:
    callbacks = []
    callbacks.append(Metrics())

    if EARLY_STOPPING:
        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0.01,
                                       patience=3,
                                       verbose=1,
                                       mode='auto')
        callbacks.append(early_stopping)

    if GPU_UTIL_SAMPLER and gpu_ind is not None:
        gpu_utilization = GPUUtilizationSampler(gpu_ind)
        callbacks.append(gpu_utilization)

    if use_wandb:
        wandb = WandbCallback()
        callbacks.append(wandb)

    model.network.summary()

    t = time()
    history = model.fit(dataset, batch_size, epochs, learning_rate, callbacks)
    print('Training took {:.2f} s'.format(time() - t))

    if GPU_UTIL_SAMPLER and gpu_ind is not None:
        gpu_utilizations = gpu_utilization.samples
        print(
            f'GPU utilization: {round(np.mean(gpu_utilizations), 2)} +- {round(np.std(gpu_utilizations), 2)}'
        )

    return model

def _eval_generative_model(self, model: Model, writer: SummaryWriter,
                           step, eval_title):
    # switch the model to eval mode
    training = model.training
    z_samples = model.config['z_samples']
    model.eval()
    model.config['z_samples'] = 16

    # evaluate the generative model on each subset
    subset_counts = []
    subset_cumulative_bpds = []
    for subset_name, subset in self.subsets.items():
        data = DataLoader(
            subset,
            batch_size=self.config['eval_batch_size'],
            num_workers=self.config['eval_num_workers'],
            collate_fn=self.collate_fn,
        )
        subset_count = 0
        subset_cumulative_bpd = 0

        # evaluate on a subset
        for x, _ in iter(data):
            dim = reduce(lambda a, b: a * b, x.size()[1:])
            with torch.no_grad():
                ll = model(x)
                bpd = -ll / math.log(2) / dim
            subset_count += x.size(0)
            subset_cumulative_bpd += bpd.sum()

        # append the subset evaluation result
        subset_counts.append(subset_count)
        subset_cumulative_bpds.append(subset_cumulative_bpd)
        subset_bpd = subset_cumulative_bpd / subset_count
        writer.add_scalar(
            'bpd/%s/%s/%s' % (eval_title, self.name, subset_name),
            subset_bpd, step)

    # overall bits-per-dimension
    overall_bpd = sum(subset_cumulative_bpds) / sum(subset_counts)
    writer.add_scalar('bpd/%s/%s/overall' % (eval_title, self.name),
                      overall_bpd, step)

    # roll back the mode
    model.train(training)
    model.config['z_samples'] = z_samples

def _eval_model(self, model: Model, writer: SummaryWriter,
                step, t, eval_title, results_dict):
    training = model.training
    model.eval()

    if t in self.config['schedule_simple']:
        t_idx = self.config['schedule_simple'].index(t)
    else:
        t_idx = len(self.config['schedule_simple']) - 1

    # for calculating total performance
    targets_total = []
    probs_total = []

    # Accuracy of each subset
    for order_i, t_i in enumerate(self.config['schedule_simple'][:t_idx + 1]):
        subset_name = t_i
        last_id = self.config['schedule_simple'][-1]  # XXX should be -1. -2 for debugging.
        subset = self.subsets[t_i]
        data = DataLoader(
            subset,
            batch_size=self.config['eval_batch_size'],
            num_workers=self.config['eval_num_workers'],
            collate_fn=self.collate_fn,
        )
        # results is a dict: {method: group_averagemeter_object}
        results, targets, probs = validate(subset_name, model, data,
                                           self.category_map, results_dict,
                                           last_id, self.split_cats_dict)
        targets_total.append(targets)
        probs_total.append(probs)
        if subset_name in results_dict:
            results_dict[subset_name].append(results)
        else:
            results_dict[subset_name] = [results]

        for metric in results.keys():
            results[metric].write_to_excel(
                os.path.join(writer.logdir, 'results_{}.xlsx'.format(metric)),
                sheet_name='task {}'.format(subset_name),
                column_name='task {}'.format(self.config['schedule_simple'][t_idx]),
                info='avg')

    # =========================================================================
    # calculate scores for trained tasks.
    prefix = 'tally_'  # prefix for tensorboard plotting and csv filename
    targets_total = torch.cat(targets_total, axis=0)
    probs_total = torch.cat(probs_total, axis=0)
    predicts_total = probs_total > 0.5  # BCE-style predictions

    total_metric = ['CP', 'CR', 'CF1', 'OP', 'OR', 'OF1', 'mAP']
    results = dict()  # reset results
    CP, CR, CF1, OP, OR, OF1, mAP = (AverageMeter() for _ in range(len(total_metric)))

    # ignore classes belonging to future tasks
    ncats = targets_total.sum(axis=0)
    cats_in_task_idx = ncats > 0
    cats_in_task_name = self.category_map[cats_in_task_idx].tolist()

    # calculate scores
    precision_pc = torch.mean(
        precision_score_per_class(targets_total[:, cats_in_task_idx],
                                  predicts_total[:, cats_in_task_idx],
                                  zero_division=0))
    recall_pc = torch.mean(
        recall_score_per_class(targets_total[:, cats_in_task_idx],
                               predicts_total[:, cats_in_task_idx],
                               zero_division=0))
    # CF1. note that CF1 is not a mean value of the categories' f1_scores
    f1_pc = ((2 * precision_pc * recall_pc) / (precision_pc + recall_pc)) \
        if (precision_pc + recall_pc) > 0 else torch.tensor([0.])
    precision_oa = precision_score_overall(targets_total[:, cats_in_task_idx],
                                           predicts_total[:, cats_in_task_idx],
                                           zero_division=0)
    recall_oa = recall_score_overall(targets_total[:, cats_in_task_idx],
                                     predicts_total[:, cats_in_task_idx],
                                     zero_division=0)
    f1_oa = f1_score_overall(targets_total[:, cats_in_task_idx],
                             predicts_total[:, cats_in_task_idx],
                             zero_division=0)
    map_ = mean_average_precision(targets_total[:, cats_in_task_idx],
                                  probs_total[:, cats_in_task_idx])

    # save to AverageMeters
    CP.update(precision_pc.item())
    CR.update(recall_pc.item())
    CF1.update(f1_pc.item())
    OP.update(precision_oa.item())
    OR.update(recall_oa.item())
    OF1.update(f1_oa.item())
    mAP.update(map_.item())

    results[prefix + 'CP'] = CP
    results[prefix + 'CR'] = CR
    results[prefix + 'CF1'] = CF1
    results[prefix + 'OP'] = OP
    results[prefix + 'OR'] = OR
    results[prefix + 'OF1'] = OF1
    results[prefix + 'mAP'] = mAP

    # report major, moderate, and minor category performances
    for report_name in self.split_cats_dict.keys():
        reporter = Group_AverageMeter()

        # get report category indices
        all_cats = self.category_map.tolist()
        task_cats = set(cats_in_task_name)
        report_cats = task_cats & set(self.split_cats_dict[report_name])
        report_cats_idx = torch.tensor(
            [all_cats.index(cat) for cat in report_cats], dtype=torch.long)

        # CP, CR, CF1 performance of the report categories.
        _class_precision = precision_score_per_class(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _class_recall = recall_score_per_class(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _class_precision = torch.mean(_class_precision)
        _class_recall = torch.mean(_class_recall)
        # CF1 bias. note that CF1 is not a mean value of the categories' f1_scores
        _class_f1 = ((2 * _class_precision * _class_recall) /
                     (_class_precision + _class_recall)) \
            if (_class_precision + _class_recall) > 0 else torch.tensor([0.])

        # OP, OR, OF1 performance of the report categories.
        _overall_precision = precision_score_overall(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _overall_recall = recall_score_overall(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _overall_f1 = f1_score_overall(targets_total[:, report_cats_idx],
                                       predicts_total[:, report_cats_idx],
                                       zero_division=0)

        # mAP performance of the report categories.
        _mAP = mean_average_precision(targets_total[:, report_cats_idx],
                                      probs_total[:, report_cats_idx])

        reporter.update(['CP'], [_class_precision.item()], [1])
        reporter.update(['CR'], [_class_recall.item()], [1])
        reporter.update(['CF1'], [_class_f1.item()], [1])
        reporter.update(['OP'], [_overall_precision.item()], [1])
        reporter.update(['OR'], [_overall_recall.item()], [1])
        reporter.update(['OF1'], [_overall_f1.item()], [1])
        reporter.update(['mAP'], [_mAP.item()], [1])
        reporter.total.reset()
        results[prefix + report_name] = reporter

    # write to tensorboard and csv.
    task_len = t_idx + 1
    for metric in results.keys():
        if metric not in [prefix + 'CP', prefix + 'CR', prefix + 'OP', prefix + 'OR']:
            results[metric].write(
                writer,
                '%s/%s/%s/task_len(%d)' % (metric, eval_title, self.name, task_len),
                step,
                info='avg')
        results[metric].write_to_excel(
            os.path.join(writer.logdir, 'results_{}.xlsx'.format(metric)),
            sheet_name=prefix,
            column_name='task {}'.format(self.config['schedule_simple'][t_idx]),
            info='avg')

    # =========================================================================
    # print performances at the end
    if t_idx == len(self.config['schedule_simple']) - 1:
        src = writer.logdir
        csv_files = ['major', 'moderate', 'minor', 'OF1', 'CF1', 'mAP',
                     prefix + 'major', prefix + 'moderate', prefix + 'minor',
                     prefix + 'CF1', prefix + 'OF1', prefix + 'mAP',
                     'forget']
        for csv_file in csv_files:
            try:
                csv = pd.read_csv(
                    os.path.join(src, 'results_{}.csv'.format(csv_file)),
                    index_col=0)

                # print performance after training the last task
                pd.set_option('display.max_rows', None)
                print(colorful.bold_green(
                    '\n{:10} result'.format(csv_file)).styled_string)
                print(csv.round(4).iloc[:, -1])

                # save as txt
                with open(os.path.join(src, 'summary.txt'), 'a') as summary_txt:
                    summary_txt.write('\n')
                    summary_txt.write('{:10} result\n'.format(csv_file))
                    summary_txt.write(csv.round(4).iloc[:, -1].to_string())
                    summary_txt.write('\n')
            except FileNotFoundError:
                print("This experiment doesn't have a {} file; continuing.".format(csv_file))
                continue

    model.train(training)
    return results_dict

def visualize(self, options, model: Model, writer: SummaryWriter, step):
    training = model.training
    model.eval()

    vis_config = self.config['vis']
    if vis_config.get('num_scene_samples'):
        # sample k data points from n data points with equal interval
        n = len(self)
        k = vis_config.get('num_scene_samples')
        vis_indices = torch.linspace(0, n - 1, k) \
            .type(torch.IntTensor).tolist()
    else:
        vis_indices = [self.dir2idx[i] for i in vis_config.get('scene_names')]

    if self.config['overfit_one_ex']:
        vis_scene = self.config['overfit_one_ex']
        vis_indices = [self.dir2idx[vis_scene]]
    vis_indices = list(set(vis_indices))

    for i in vis_indices:
        coords, feats, labels, _ = self[i]
        coords, feats = sparse_collate([coords], [feats])
        x = SparseTensor(feats, coords)
        x = x.to(model.device)

        with torch.no_grad():
            y_hat = model(x)

        embs = y_hat
        insts = labels[:, 1]

        for option in options:
            # visualize tsne
            if option == 'tsne':
                tsne_img = visualization.visualize_tsne(
                    embs.cpu(), insts.cpu(),
                    config=self.config['vis']['tsne']
                )
                writer.add_image('tsne/{}'.format(self.idx2dir[i]), tsne_img, step)
            elif option == 'embs':
                vis_config = self.config['vis']['embs']

                # visualize embs with background
                emb_imgs, axis_range = visualization.visualize_embs(
                    embs.cpu(), insts.cpu(),
                    remove_bg=False,
                    max_sample=vis_config['max_sample'],
                    num_view=vis_config['num_view']
                )
                for view_num, img in enumerate(emb_imgs):
                    writer.add_image(
                        'emb/with_bg/{}_{}'.format(self.idx2dir[i], view_num),
                        img, step
                    )

                # visualize embs without background
                not_bg_emb_imgs, _ = visualization.visualize_embs(
                    embs.cpu(), insts.cpu(),
                    remove_bg=True,
                    max_sample=vis_config['max_sample'],
                    num_view=vis_config['num_view'],
                    axis_range=axis_range
                )
                for view_num, img in enumerate(not_bg_emb_imgs):
                    writer.add_image(
                        'emb/no_bg/{}_{}'.format(self.idx2dir[i], view_num),
                        img, step
                    )

    model.train(training)

def eval(self, model: Model, writer: SummaryWriter, step):
    training = model.training
    model.eval()

    scalar_summaries = defaultdict(list)
    list_summaries = defaultdict(list)
    data_loader = DataLoader(
        self,
        batch_size=self.config['eval_batch_size'],
        num_workers=self.config['num_workers'],
        collate_fn=self.collate_fn,
        drop_last=True,
    )

    print('')
    for eval_step, data in enumerate(data_loader):
        x, y = data[0], data[1]
        x, y = x.to(self.config['device']), y.to(self.config['device'])
        with torch.no_grad():
            y_hat = model(x)
            loss, scalar_summary, list_summary = model.compute_loss(x, y, y_hat, step)
        print('\r[Evaluating, Step {:7}, Loss {:5}]'.format(
            eval_step, '%.3f' % loss), end='')

        for (k, v) in scalar_summary.items():
            scalar_summaries[k].append(v)
        for (k, v) in list_summary.items():
            list_summaries[k] += v

    # write summaries
    for (k, v) in scalar_summaries.items():
        v = np.array(v).mean().item()
        writer.add_scalar(k, v, step)

    for (k, v) in list_summaries.items():
        v = np.array(v)
        if k[:4] == 'mIoU':
            num_classes = self.config['y_c']
            confusion_matrix = v.reshape(-1, num_classes ** 2)
            confusion_matrix = confusion_matrix.sum(axis=0) \
                .reshape(num_classes, num_classes)

            mious = []
            for i in range(num_classes):
                true_positive = confusion_matrix[i, i].item()
                false_positive = (confusion_matrix[i, :].sum() - true_positive).item()
                false_negative = (confusion_matrix[:, i].sum() - true_positive).item()
                denom = true_positive + false_positive + false_negative
                mious.append(0 if denom == 0 else float(true_positive) / denom)
                if hasattr(self, 'class_id2label'):
                    writer.add_scalar(k + self.class_id2label[i], mious[-1], step)
            writer.add_scalar(k + 'mIoU/overall', sum(mious) / len(mious), step)
        else:
            bins = np.linspace(0., 1.1, num=12)
            counts, limits = np.histogram(v, bins=bins)
            sum_sq = v.dot(v)
            writer.add_histogram_raw(
                tag=k,
                min=v.min(), max=v.max(),
                num=len(v), sum=v.sum(),
                sum_squares=sum_sq,
                bucket_limits=limits[1:].tolist(),
                bucket_counts=counts.tolist(),
                global_step=step
            )

    model.train(training)

def distill(
    training_hparams: hparams.TrainingHparams,
    distill_hparams: hparams.DistillHparams,
    student: Model,
    teacher: Model,
    train_loader: DataLoader,
    output_location: str,
    callbacks: typing.List[typing.Callable] = [],
    start_step: Step = None,
    end_step: Step = None
):
    """The main training loop for this framework.

    Args:
      * training_hparams: The training hyperparameters whose schema is specified in hparams.py.
      * distill_hparams: The knowledge distillation hyperparameters whose schema is specified in hparams.py.
      * student: The student model to train. Must be a models.base.Model.
      * teacher: The teacher model from which to distill the knowledge. Must be a models.base.Model.
      * train_loader: The training data. Must be a datasets.base.DataLoader.
      * output_location: The string path where all outputs should be stored.
      * callbacks: A list of functions that are called before each training step and once more
        after the last training step. Each function takes five arguments: the output location,
        the current step, the model, the optimizer, and the logger. Callbacks are used for
        running the test set, saving the logger, saving the state of the model, etc. They
        provide hooks into the training loop for customization so that the training loop
        itself can remain simple.
      * start_step: The step at which the training data and learning rate schedule should begin.
        Defaults to step 0.
      * end_step: The step at which training should cease. Otherwise, training will go for the
        full `training_hparams.training_steps` steps.
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    # Create the output location if it doesn't already exist.
    if not get_platform().exists(output_location) and get_platform().is_primary_process:
        get_platform().makedirs(output_location)

    # Get the optimizer and learning rate schedule.
    student.to(get_platform().torch_device)
    teacher.to(get_platform().torch_device)
    optimizer = optimizers.get_optimizer(training_hparams, student)
    step_optimizer = optimizer
    lr_schedule = optimizers.get_lr_schedule(training_hparams, optimizer,
                                             train_loader.iterations_per_epoch)

    ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    if distill_hparams.alpha_mse > 0.0:
        mse_loss_fct = nn.MSELoss(reduction='sum')
    if distill_hparams.alpha_cos > 0.0:
        cos_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')

    # Adapt for FP16.
    if training_hparams.apex_fp16:
        if NO_APEX:
            raise ImportError('Must install nvidia apex to use this model.')
        (student, teacher), step_optimizer = apex.amp.initialize(
            [student, teacher], optimizer, loss_scale='dynamic', verbosity=0
        )

    # Handle parallelism if applicable.
    if get_platform().is_distributed:
        student = DistributedDataParallel(student, device_ids=[get_platform().rank])
        teacher = DistributedDataParallel(teacher, device_ids=[get_platform().rank])
    elif get_platform().is_parallel:
        student = DataParallel(student)
        teacher = DataParallel(teacher)

    # Get the random seed for the data order.
    data_order_seed = training_hparams.data_order_seed

    # Restore the model from a saved checkpoint if the checkpoint exists.
    cp_step, cp_logger = restore_checkpoint(output_location, student, optimizer,
                                            train_loader.iterations_per_epoch)
    start_step = cp_step or start_step or Step.zero(train_loader.iterations_per_epoch)
    logger = cp_logger or MetricLogger()
    with warnings.catch_warnings():  # Filter unnecessary warning.
        warnings.filterwarnings("ignore", category=UserWarning)
        for _ in range(start_step.iteration):
            lr_schedule.step()

    # Determine when to end training.
    end_step = end_step or Step.from_str(training_hparams.training_steps,
                                         train_loader.iterations_per_epoch)
    if end_step <= start_step:
        return

    # The training loop.
    for ep in range(start_step.ep, end_step.ep + 1):

        # Ensure the data order is different for each epoch.
        train_loader.shuffle(None if data_order_seed is None else (data_order_seed + ep))

        for it, (examples, labels) in enumerate(train_loader):

            # Advance the data loader until the start epoch and iteration.
            if ep == start_step.ep and it < start_step.it:
                continue

            # Run the callbacks.
            step = Step.from_epoch(ep, it, train_loader.iterations_per_epoch)
            for callback in callbacks:
                callback(output_location, step, student, optimizer, logger)

            # Exit at the end step.
            if ep == end_step.ep and it == end_step.it:
                return

            # Otherwise, train.
            examples = examples.to(device=get_platform().torch_device)
            labels = labels.to(device=get_platform().torch_device)

            loss = 0.0
            step_optimizer.zero_grad()
            student.train()
            teacher.eval()

            student_outputs = student(examples)
            with torch.no_grad():
                teacher_outputs = teacher(examples)
            s_logits = student_outputs
            t_logits = teacher_outputs

            # KL divergence loss for the knowledge distillation.
            loss_ce = ce_loss_fct(
                F.log_softmax(s_logits / distill_hparams.temperature, dim=-1),
                F.softmax(t_logits / distill_hparams.temperature, dim=-1),
            ) * distill_hparams.temperature**2
            loss += distill_hparams.alpha_ce * loss_ce

            if distill_hparams.alpha_cls > 0.0:
                loss_cls = student.loss_criterion(student_outputs, labels)
                loss += distill_hparams.alpha_cls * loss_cls

            if distill_hparams.alpha_mse > 0.0:
                loss_mse = mse_loss_fct(s_logits, t_logits) / s_logits.size(0)
                loss += distill_hparams.alpha_mse * loss_mse

            if training_hparams.apex_fp16:
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Step forward. Ignore extraneous warnings that the lr_schedule generates.
            step_optimizer.step()
            with warnings.catch_warnings():  # Filter unnecessary warning.
                warnings.filterwarnings("ignore", category=UserWarning)
                lr_schedule.step()

    get_platform().barrier()

def grasp(
    training_hparams: hparams.TrainingHparams,
    model: Model,
    parameter_list: list,
    train_loader: DataLoader,
    data_order_seed: int = None,
    suffix: str = ''
):
    """Implementation of GraSP.

    Args:
      * training_hparams: The training hyperparameters whose schema is specified in hparams.py.
      * model: The model to train. Must be a models.base.Model
      * train_loader: The training data. Must be a datasets.base.DataLoader
      * data_order_seed: The RNG seed for data shuffling.
    """
    # Adapt for FP16.
    if training_hparams.apex_fp16:
        if NO_APEX:
            raise ImportError('Must install nvidia apex to use this model.')
        model = apex.amp.initialize(model, loss_scale='dynamic', verbosity=0)

    # Handle parallelism if applicable.
    if get_platform().is_distributed:
        model = DistributedDataParallel(model, device_ids=[get_platform().rank])
    elif get_platform().is_parallel:
        model = DataParallel(model)

    train_loader.shuffle(data_order_seed)

    # First gradient vector, without the computational graph.
    stopped_grads = 0
    for it, (examples, labels) in enumerate(train_loader):
        examples = examples.to(device=get_platform().torch_device)
        labels = labels.to(device=get_platform().torch_device)

        model.eval()
        output = model(examples) / 200.0  # temp = 200
        loss = model.loss_criterion(output, labels)
        grads = torch.autograd.grad(loss, parameter_list, create_graph=False)
        flatten_grads = torch.cat([g.reshape(-1) for g in grads if g is not None])
        stopped_grads += flatten_grads

    train_loader.shuffle(None if data_order_seed is None else (data_order_seed + 1))

    # Second gradient vector, with the computational graph.
    for it, (examples, labels) in enumerate(train_loader):
        examples = examples.to(device=get_platform().torch_device)
        labels = labels.to(device=get_platform().torch_device)

        model.eval()
        output = model(examples) / 200.0  # temp = 200
        loss = model.loss_criterion(output, labels)
        grads = torch.autograd.grad(loss, parameter_list, create_graph=True)
        flatten_grads = torch.cat([g.reshape(-1) for g in grads if g is not None])
        gnorm = (stopped_grads * flatten_grads).sum()
        gnorm.backward()

    get_platform().barrier()

def ones_like(model: base.Model) -> 'Mask':
    mask = Mask()
    for name in model.prunable_layer_names:
        mask[name] = torch.ones(list(model.state_dict()[name].shape))
    return mask

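# A minimal usage sketch (not part of the original code): applying a mask such
# as the one returned by ones_like above. It assumes Mask behaves like a dict
# of tensors keyed by the prunable layer names, with shapes matching the
# model's state_dict entries, as ones_like implies; apply_mask_ is a
# hypothetical helper name.
def apply_mask_(model: base.Model, mask: 'Mask') -> None:
    """Zero out masked-off weights in place; a ones_like mask leaves the model unchanged."""
    state_dict = model.state_dict()
    with torch.no_grad():
        for name, layer_mask in mask.items():
            state_dict[name].mul_(layer_mask.to(state_dict[name].device))
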
def train(training_hparams: hparams.TrainingHparams,
          model: Model,
          train_loader: DataLoader,
          output_location: str,
          callbacks: typing.List[typing.Callable] = [],
          start_step: Step = None,
          end_step: Step = None):
    """The main training loop for this framework.

    Args:
      * training_hparams: The training hyperparameters whose schema is specified in hparams.py.
      * model: The model to train. Must be a models.base.Model.
      * train_loader: The training data. Must be a datasets.base.DataLoader.
      * output_location: The string path where all outputs should be stored.
      * callbacks: A list of functions that are called before each training step and once more
        after the last training step. Each function takes five arguments: the output location,
        the current step, the model, the optimizer, and the logger. Callbacks are used for
        running the test set, saving the logger, saving the state of the model, etc. They
        provide hooks into the training loop for customization so that the training loop
        itself can remain simple.
      * start_step: The step at which the training data and learning rate schedule should begin.
        Defaults to step 0.
      * end_step: The step at which training should cease. Otherwise, training will go for the
        full `training_hparams.training_steps` steps.
    """
    # Create the output location if it doesn't already exist.
    if not get_platform().exists(output_location) and get_platform().is_primary_process:
        get_platform().makedirs(output_location)

    # Get the optimizer and learning rate schedule.
    model.to(get_platform().torch_device)
    optimizer = optimizers.get_optimizer(training_hparams, model)
    step_optimizer = optimizer
    lr_schedule = optimizers.get_lr_schedule(training_hparams, optimizer,
                                             train_loader.iterations_per_epoch)

    # Adapt for FP16.
    if training_hparams.apex_fp16:
        if NO_APEX:
            raise ImportError('Must install nvidia apex to use this model.')
        model, step_optimizer = apex.amp.initialize(model, optimizer,
                                                    loss_scale='dynamic',
                                                    verbosity=0)

    # Handle parallelism if applicable.
    if get_platform().is_distributed:
        model = DistributedDataParallel(model, device_ids=[get_platform().rank])
    elif get_platform().is_parallel:
        model = DataParallel(model)

    # Get the random seed for the data order.
    data_order_seed = training_hparams.data_order_seed

    # Restore the model from a saved checkpoint if the checkpoint exists.
    cp_step, cp_logger = restore_checkpoint(output_location, model, optimizer,
                                            train_loader.iterations_per_epoch)
    start_step = cp_step or start_step or Step.zero(train_loader.iterations_per_epoch)
    logger = cp_logger or MetricLogger()
    with warnings.catch_warnings():  # Filter unnecessary warning.
        warnings.filterwarnings("ignore", category=UserWarning)
        for _ in range(start_step.iteration):
            lr_schedule.step()

    # Determine when to end training.
    end_step = end_step or Step.from_str(training_hparams.training_steps,
                                         train_loader.iterations_per_epoch)
    if end_step <= start_step:
        return

    # The training loop.
    for ep in range(start_step.ep, end_step.ep + 1):

        # Ensure the data order is different for each epoch.
        train_loader.shuffle(None if data_order_seed is None else (data_order_seed + ep))

        for it, (examples, labels) in enumerate(train_loader):

            # Advance the data loader until the start epoch and iteration.
            if ep == start_step.ep and it < start_step.it:
                continue

            # Run the callbacks.
            step = Step.from_epoch(ep, it, train_loader.iterations_per_epoch)
            for callback in callbacks:
                callback(output_location, step, model, optimizer, logger)

            # Exit at the end step.
            if ep == end_step.ep and it == end_step.it:
                return

            # Otherwise, train.
            examples = examples.to(device=get_platform().torch_device)
            labels = labels.to(device=get_platform().torch_device)
            step_optimizer.zero_grad()
            model.train()

            loss = model.loss_criterion(model(examples), labels)
            if training_hparams.apex_fp16:
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Step forward. Ignore extraneous warnings that the lr_schedule generates.
            step_optimizer.step()
            with warnings.catch_warnings():  # Filter unnecessary warning.
                warnings.filterwarnings("ignore", category=UserWarning)
                lr_schedule.step()

    get_platform().barrier()

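# A hypothetical callback sketch matching the hook described in the docstring
# above: the loop calls callback(output_location, step, model, optimizer,
# logger). The body here is illustrative only and not part of the framework.
def print_lr_callback(output_location, step, model, optimizer, logger):
    # At the first iteration of each epoch, report the current learning rate.
    if step.it == 0:
        current_lr = optimizer.param_groups[0]['lr']
        print('epoch {}: lr = {}'.format(step.ep, current_lr))
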
def __init__(self, vocab_size=1000, batch_size=100, rnn_size=1024, layer_depth=2,
             num_units=100, rnn_type="GRU", seq_length=50, keep_prob=0.9, grad_clip=5.0):

    Model.__init__(self)

    # RNN hyperparameters
    self._layer_depth = layer_depth
    self._keep_prob = keep_prob
    self._batch_size = batch_size
    self._num_units = num_units
    self._seq_length = seq_length
    self._rnn_size = rnn_size
    self._vocab_size = vocab_size

    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length], name="inputs")
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length], name="targets")
    self.is_training = tf.placeholder('bool', None, name="is_training")

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [rnn_size, vocab_size],
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        def create_cell(device):
            if rnn_type == "GRU":
                cell = rnn.GRUCell(rnn_size)
            elif rnn_type == "LSTM":
                if 'reuse' in inspect.signature(tf.contrib.rnn.BasicLSTMCell.__init__).parameters:
                    cell = rnn.LayerNormBasicLSTMCell(rnn_size, forget_bias=0.0,
                                                      reuse=tf.get_variable_scope().reuse)
                else:
                    cell = rnn.LayerNormBasicLSTMCell(rnn_size, forget_bias=0.0)
            elif rnn_type == "RWA":
                cell = RWACell(rnn_size)
            elif rnn_type == "RAN":
                cell = RANCell(rnn_size, normalize=self.is_training)

            cell = SwitchableDropoutWrapper(
                rnn.DeviceWrapper(cell, device="/gpu:{}".format(device)),
                is_train=self.is_training)
            return cell

        self.cell = cell = rnn.MultiRNNCell(
            [create_cell(i) for i in range(layer_depth)], state_is_tuple=True)

        with tf.device("/cpu:0"):
            self.embedding = tf.get_variable("embedding", [vocab_size, num_units])
            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
            inputs = tf.contrib.layers.dropout(inputs, keep_prob,
                                               is_training=self.is_training)

    with tf.variable_scope("output"):
        self.initial_state = cell.zero_state(batch_size, tf.float32)
        outputs, last_state = tf.nn.dynamic_rnn(cell, inputs,
                                                time_major=False,
                                                swap_memory=True,
                                                initial_state=self.initial_state,
                                                dtype=tf.float32)
        output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])

    with tf.variable_scope("loss"):
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        flat_targets = tf.reshape(tf.concat(self.targets, 1), [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                              labels=flat_targets)
        self.loss = tf.reduce_mean(loss)

    self.final_state = last_state
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.lr = tf.Variable(0.0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                              global_step=self.global_step)

def fit(model: Model, dataset: DatasetBase, save_path: Path,
        save_interval_minute: int = 15, epochs: int = 1):
    save_path = save_path.expanduser().absolute()

    model.build_graph()
    train = True
    test = not train
    switch = True
    sum_loss = 0
    sum_acc = 0
    saver = tf.train.Saver(save_relative_paths=True)
    epoch = -1
    now = datetime.now().minute

    logging.info(f"Number of trainable parameters: {get_num_of_parameters()}")

    with tf.Session() as sess, tqdm() as progress:
        sess.run(tf.global_variables_initializer())
        sum_writer = tf.summary.FileWriter(save_path / str(model) / "logdir", sess.graph)

        while epoch <= epochs:
            # Switch between the train and test datasets.
            if switch:
                switch = False
                if train:
                    sess.run(dataset.train_init_op)
                    progress.total = dataset.train_size
                    epoch += 1
                elif test:
                    sess.run(dataset.test_init_op)
                    progress.total = dataset.test_size

            try:
                phase = 'train' if train else 'test'
                loss = 0
                acc = 0
                if phase == 'train':
                    loss, acc, _ = sess.run(
                        [model.loss(), model.accuracy(), model.optimize()],
                        feed_dict={tf.keras.backend.learning_phase(): 1})
                elif phase == 'test':
                    loss, acc = sess.run(
                        [model.loss(), model.accuracy()],
                        feed_dict={tf.keras.backend.learning_phase(): 0})

                sum_loss += loss
                sum_acc += acc
                batches = (progress.n / dataset.batch_size + 1)
                desc = f"Epoch: {epoch:<5}| Phase: {phase :<10}| " \
                       f"loss: {sum_loss / batches :<25}| " \
                       f"acc: {sum_acc / batches :<25}| "
                progress.set_description(desc=desc)
                progress.update(dataset.batch_size)
            except tf.errors.OutOfRangeError:
                progress.write("")
                train = not train
                test = not test
                switch = True
                progress.n = 0
                sum_loss = 0
                sum_acc = 0

                # Save a checkpoint once the configured interval has elapsed
                # (minute-of-hour arithmetic, wrapping at 60).
                if (datetime.now().minute - now) % 60 >= save_interval_minute:
                    now = datetime.now().minute
                    saver.save(sess, str(save_path / str(model) / str(model)))
                    sum_writer.flush()
                    sum_writer.close()
                    sum_writer.reopen()
                continue

        sum_writer.flush()
        sum_writer.close()
        saver.save(sess, str(save_path / str(model) / str(model)))

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from models.base import Model

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('image_dir')
    parser.add_argument('model')
    args = parser.parse_args()

    model_path = args.model
    data_path = args.image_dir

    print('Loading model from ', model_path)
    model = Model().load(model_path)
    print('Model loaded')
    print(model._config)

    print('## Evaluating on test data##')
    prediction, labels = model.predict(data_path, return_labels=True)
    prediction_class = prediction.argmax(axis=-1)

    correct = (prediction_class == labels).sum()
    total = len(labels)
    print('Percentage correct (manual): {:.2f}, {}/{}'.format(
        correct / total * 100, correct, total))

    # np.save('predictions.npy', {'prediction': prediction, 'true': y_test, 'labels': labels})