def run(self):
    tq = tqdm(range(self.epochs))
    for epoch in tq:
        # for recordio data
        if hasattr(self.train_data, 'reset'):
            self.train_data.reset()
        tbar = tqdm(self.train_data)
        idx = 0
        for batch in tbar:
            # sample network configuration
            config = self.controller.pre_sample()[0]
            self.supernet.sample(**config)
            # self.train_fn(self.supernet, batch, **self.train_args)
            self.train_fn(epoch, self.epochs, self.supernet, batch,
                          **self.train_args)
            mx.nd.waitall()
            if epoch >= self.warmup_epochs and (idx % self.update_arch_frequency) == 0:
                self.train_controller()
            if self.plot_frequency > 0 and idx % self.plot_frequency == 0 and in_ipynb():
                graph = self.supernet.graph
                graph.attr(rankdir='LR', size='8,3')
                tbar.set_svg(graph._repr_svg_())
            if self.baseline:
                tbar.set_description('avg reward: {:.2f}'.format(self.baseline))
            idx += 1
        self.validation()
        self.save()
        msg = 'epoch {}, val_acc: {:.2f}'.format(epoch, self.val_acc)
        if self.baseline:
            msg += ', avg reward: {:.2f}'.format(self.baseline)
        tq.set_description(msg)
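# --- Illustrative sketch (not part of the original trainer) ---
# self.baseline is displayed above as "avg reward", but its update rule is not
# shown in this listing. ENAS-style controllers typically maintain an
# exponential moving average of the validation reward as a REINFORCE baseline;
# the helper below sketches that update under the assumption that this trainer
# does the same. The function name and decay value are illustrative only.
def update_baseline_sketch(baseline, reward, decay=0.95):
    """Exponential-moving-average reward baseline for the controller."""
    if baseline is None:
        return reward
    return decay * baseline + (1.0 - decay) * reward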
def predict_imgs(X):
    if isinstance(X, list):
        different_dataset = []
        for i, x in enumerate(X):
            proba_all_one_dataset = []
            tbar = tqdm(range(len(x.items)))
            for j, x_item in enumerate(x):
                tbar.update(1)
                proba_all = predict_img(x_item[0], ensemble=True)
                tbar.set_description('ratio [%d], input picture [%d]' % (i, j))
                proba_all_one_dataset.append(proba_all)
            different_dataset.append(proba_all_one_dataset)
        inds, probas, probals_all = avg_prediction(different_dataset,
                                                   threshold=set_prob_thresh)
    else:
        inds, probas, probals_all = [], [], []
        tbar = tqdm(range(len(X.items)))
        for i, x in enumerate(X):
            tbar.update(1)
            ind, proba, proba_all = predict_img(x[0])
            tbar.set_description(
                'The input picture [%d] is classified as [%d], with probability %.2f'
                % (i, ind.asscalar(), proba.asscalar()))
            inds.append(ind.asscalar())
            probas.append(proba.asnumpy())
            probals_all.append(proba_all.asnumpy().flatten())
    return inds, probas, probals_all
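# --- Illustrative sketch (not part of the original script) ---
# avg_prediction is called above but not defined in this listing. The sketch
# below shows one plausible implementation, assuming it averages the per-image
# class probabilities over the ensemble members and marks predictions whose
# averaged top probability falls below the threshold. The name
# avg_prediction_sketch, the -1 marker, and the plain-numpy input layout are
# assumptions for illustration only.
import numpy as np

def avg_prediction_sketch(different_dataset, threshold=0.5):
    # different_dataset: list (one entry per ensemble member) of lists of
    # per-image probability vectors, all of the same length.
    proba_all = np.mean(np.asarray(different_dataset, dtype=float), axis=0)
    inds = [int(np.argmax(p)) for p in proba_all]
    probas = [float(np.max(p)) for p in proba_all]
    # mark low-confidence predictions with -1
    inds = [i if p >= threshold else -1 for i, p in zip(inds, probas)]
    return inds, probas, proba_all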
def evaluate(loader_dev, metric, segment):
    """Evaluate the model on the validation dataset."""
    metric.reset()
    step_loss = 0
    tbar = tqdm(loader_dev)
    for batch_id, seqs in enumerate(tbar):
        input_ids, valid_length, segment_ids, label = seqs
        input_ids = input_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx).astype('float32')
        label = label.as_in_context(ctx)
        if use_roberta:
            out = model(input_ids, valid_length)
        else:
            out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
        ls = loss_function(out, label).mean()
        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (args.log_interval) == 0:
            log_eval(batch_id, len(loader_dev), metric, step_loss,
                     args.log_interval, tbar)
            step_loss = 0
    metric_nm, metric_val = metric.get()
    if not isinstance(metric_nm, list):
        metric_nm, metric_val = [metric_nm], [metric_val]
    metric_str = 'validation metrics:' + ','.join(
        [i + ':%.4f' for i in metric_nm])
    logger.info(metric_str, *metric_val)
    mx.nd.waitall()
    return metric_nm, metric_val
def validation(self):
    if hasattr(self.val_data, 'reset'):
        self.val_data.reset()
    # data iter, avoid memory leak
    it = iter(self.val_data)
    if hasattr(it, 'reset_sample_times'):
        it.reset_sample_times()
    tbar = tqdm(it)
    # update network architecture
    config = self.controller.inference()
    self.supernet.sample(**config)
    metric = mx.metric.Accuracy()
    for batch in tbar:
        self.eval_fn(self.supernet, batch, metric=metric, **self.val_args)
        reward = metric.get()[1]
        tbar.set_description('Val Acc: {}'.format(reward))
    self.val_acc = reward
    self.training_history.append(reward)
def evaluate(self, dataset, input_size=224, ctx=[mx.cpu()]):
    """Evaluate predictive performance of trained image classifier using given test data.

    Parameters
    ----------
    dataset : :class:`autogluon.task.ImagePredictor.Dataset`
        The dataset containing test images (must be in same format as the training dataset).
    input_size : int
        Size of the images (in pixels).
    ctx : list of mxnet.context elements
        Determines whether to use CPU or GPU(s); options include `[mx.cpu()]` or `[mx.gpu()]`.

    Examples
    --------
    >>> import autogluon.core as ag
    >>> from autogluon.vision import ImagePredictor as task
    >>> train_data = task.Dataset(train_path='~/data/train')
    >>> classifier = task.fit(train_data,
    >>>                       nets=ag.space.Categorical('resnet18_v1', 'resnet34_v1'),
    >>>                       time_limits=600, ngpus_per_trial=1, num_trials=4)
    >>> test_data = task.Dataset('~/data/test', train=False)
    >>> test_acc = classifier.evaluate(test_data)
    """
    args = self.args
    net = self.model
    batch_size = args.batch_size * max(len(ctx), 1)
    metric = get_metric_instance(args.metric)
    input_size = net.input_size if hasattr(net, 'input_size') else input_size
    test_data, _, batch_fn, _ = get_data_loader(dataset, input_size, batch_size,
                                                args.num_workers, True, None)
    tbar = tqdm(test_data)
    for batch in tbar:
        self.eval_func(net, batch, batch_fn, metric, ctx)
        _, test_reward = metric.get()
        tbar.set_description('{}: {}'.format(args.metric, test_reward))
    _, test_reward = metric.get()
    return test_reward
def train_text_classification(args, reporter=None):
    # Step 1: add every function and Python object from the original training
    # script, except the training function itself, at the beginning of the
    # decorated function.
    nlp = try_import_gluonnlp()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.INFO)
        logger.info(args)
    batch_size = args.batch_size
    dev_batch_size = args.dev_batch_size
    lr = args.lr
    epsilon = args.epsilon
    accumulate = args.accumulate
    log_interval = args.log_interval * accumulate if accumulate else args.log_interval
    if accumulate:
        logger.info('Using gradient accumulation. Effective batch size = '
                    'batch_size * accumulate = %d', accumulate * batch_size)

    # random seed
    np.random.seed(args.seed)
    random.seed(args.seed)
    mx.random.seed(args.seed)

    # TODO: support for multi-GPU
    ctx = [mx.gpu(i) for i in range(args.num_gpus)][0] if args.num_gpus > 0 else [mx.cpu()][0]

    task = args.dataset

    # data type with mixed precision training
    if args.dtype == 'float16':
        try:
            from mxnet.contrib import amp  # pylint: disable=ungrouped-imports
            # monkey patch amp list since topk does not support fp16
            amp.lists.symbol.FP32_FUNCS.append('topk')
            amp.lists.symbol.FP16_FP32_FUNCS.remove('topk')
            amp.init()
        except ValueError:
            # topk is already in the FP32_FUNCS list
            amp.init()
        except ImportError:
            # amp is not available
            logger.info('Mixed precision training with float16 requires MXNet >= '
                        '1.5.0b20190627. Please consider upgrading your MXNet version.')
            exit()

    # model and loss
    model_name = args.net
    dataset = args.pretrained_dataset
    use_roberta = 'roberta' in model_name
    get_model_params = {
        'name': model_name,
        'dataset_name': dataset,
        'pretrained': True,
        'ctx': ctx,
        'use_decoder': False,
        'use_classifier': False,
    }
    # RoBERTa does not contain parameters for sentence pair classification
    if not use_roberta:
        get_model_params['use_pooler'] = True
    bert, vocabulary = nlp.model.get_model(**get_model_params)
    model = get_network(bert, task.class_labels, use_roberta)
    #do_regression = not task.class_labels
    #if do_regression:
    #    num_classes = 1
    #    loss_function = gluon.loss.L2Loss()
    #else:
    #    num_classes = len(task.class_labels)
    #    loss_function = gluon.loss.SoftmaxCELoss()
    ## reuse the BERTClassifier class with num_classes=1 for regression
    #if use_roberta:
    #    model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
    #else:
    #    model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
    # initialize classifier
    loss_function = gluon.loss.SoftmaxCELoss() if task.class_labels else gluon.loss.L2Loss()
    initializer = mx.init.Normal(0.02)
    model.classifier.initialize(init=initializer, ctx=ctx)
    model.hybridize(static_alloc=True)
    loss_function.hybridize(static_alloc=True)

    # data processing
    do_lower_case = 'uncased' in dataset
    if use_roberta:
        bert_tokenizer = nlp.data.GPT2BPETokenizer()
    else:
        bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)

    # Get the loader.
    train_data, dev_data_list, num_train_examples, trans, test_trans = preprocess_data(
        bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary,
        True, args.num_workers)

    def log_train(batch_id, batch_num, metric, step_loss, log_interval, epoch_id,
                  learning_rate, tbar):
        """Generate and print out the log message for training.
""" metric_nm, metric_val = metric.get() if not isinstance(metric_nm, list): metric_nm, metric_val = [metric_nm], [metric_val] train_str = '[Epoch %d] loss=%.4f, lr=%.7f, metrics:' + \ ','.join([i + ':%.4f' for i in metric_nm]) tbar.set_description( train_str % (epoch_id, step_loss / log_interval, learning_rate, *metric_val)) def log_eval(batch_id, batch_num, metric, step_loss, log_interval, tbar): """Generate and print out the log message for inference. """ metric_nm, metric_val = metric.get() if not isinstance(metric_nm, list): metric_nm, metric_val = [metric_nm], [metric_val] eval_str = 'loss=%.4f, metrics:' + \ ','.join([i + ':%.4f' for i in metric_nm]) tbar.set_description(eval_str % (step_loss / log_interval, *metric_val)) def evaluate(loader_dev, metric, segment): """Evaluate the model on validation dataset.""" metric.reset() step_loss = 0 tbar = tqdm(loader_dev) for batch_id, seqs in enumerate(tbar): input_ids, valid_length, segment_ids, label = seqs input_ids = input_ids.as_in_context(ctx) valid_length = valid_length.as_in_context(ctx).astype('float32') label = label.as_in_context(ctx) if use_roberta: out = model(input_ids, valid_length) else: out = model(input_ids, segment_ids.as_in_context(ctx), valid_length) ls = loss_function(out, label).mean() step_loss += ls.asscalar() metric.update([label], [out]) if (batch_id + 1) % (args.log_interval) == 0: log_eval(batch_id, len(loader_dev), metric, step_loss, args.log_interval, tbar) step_loss = 0 metric_nm, metric_val = metric.get() if not isinstance(metric_nm, list): metric_nm, metric_val = [metric_nm], [metric_val] metric_str = 'validation metrics:' + ','.join( [i + ':%.4f' for i in metric_nm]) logger.info(metric_str, *metric_val) mx.nd.waitall() return metric_nm, metric_val # Step 2: the training function in the original training script is added in the decorated function in autogluon for training. 
"""Training function.""" all_model_params = model.collect_params() optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01} trainer = gluon.Trainer(all_model_params, 'bertadam', optimizer_params, update_on_kvstore=False) if args.dtype == 'float16': amp.init_trainer(trainer) step_size = batch_size * accumulate if accumulate else batch_size num_train_steps = int(num_train_examples / step_size * args.epochs) warmup_ratio = args.warmup_ratio num_warmup_steps = int(num_train_steps * warmup_ratio) step_num = 0 # Do not apply weight decay on LayerNorm and bias terms for _, v in model.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 # Collect differentiable parameters params = [p for p in all_model_params.values() if p.grad_req != 'null'] # Set grad_req if gradient accumulation is required if accumulate and accumulate > 1: for p in params: p.grad_req = 'add' # track best eval score metric_history = [] best_metric = None patience = args.early_stop tic = time.time() for epoch_id in range(args.epochs): if args.early_stop and patience == 0: logger.info('Early stopping at epoch %d', epoch_id) break task.metric.reset() step_loss = 0 tic = time.time() all_model_params.zero_grad() tbar = tqdm(train_data) for batch_id, seqs in enumerate(tbar): # learning rate schedule if step_num < num_warmup_steps: new_lr = lr * step_num / num_warmup_steps else: non_warmup_steps = step_num - num_warmup_steps offset = non_warmup_steps / (num_train_steps - num_warmup_steps) new_lr = lr - offset * lr trainer.set_learning_rate(new_lr) # forward and backward with mx.autograd.record(): input_ids, valid_length, segment_ids, label = seqs input_ids = input_ids.as_in_context(ctx) valid_length = valid_length.as_in_context(ctx).astype( 'float32') label = label.as_in_context(ctx) if use_roberta: out = model(input_ids, valid_length) else: out = model(input_ids, segment_ids.as_in_context(ctx), valid_length) ls = loss_function(out, label).mean() if args.dtype == 'float16': with amp.scale_loss(ls, trainer) as scaled_loss: mx.autograd.backward(scaled_loss) else: ls.backward() # update if not accumulate or (batch_id + 1) % accumulate == 0: trainer.allreduce_grads() nlp.utils.clip_grad_global_norm(params, 1) trainer.update(accumulate if accumulate else 1) step_num += 1 if accumulate and accumulate > 1: # set grad to zero for gradient accumulation all_model_params.zero_grad() step_loss += ls.asscalar() task.metric.update([label], [out]) if (batch_id + 1) % (args.log_interval) == 0: log_train(batch_id, len(train_data), task.metric, step_loss, args.log_interval, epoch_id, trainer.learning_rate, tbar) step_loss = 0 mx.nd.waitall() # inference on dev data for segment, dev_data in dev_data_list: metric_nm, metric_val = evaluate(dev_data, task.metric, segment) if best_metric is None or metric_val >= best_metric: best_metric = metric_val patience = args.early_stop else: if args.early_stop is not None: patience -= 1 metric_history.append((epoch_id, metric_nm, metric_val)) if reporter is not None: # Note: epoch reported back must start with 1, not with 0 reporter(epoch=epoch_id + 1, accuracy=metric_val[0]) if args.final_fit: get_model_params.pop('ctx') return { 'model_params': collect_params(model), 'get_model_args': get_model_params, 'class_labels': task.class_labels, 'transform': trans, 'test_transform': test_trans }
def train_image_classification(args, reporter):
    logging.basicConfig()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.INFO)
        logger.info(args)

    target_params = Sample_params(args.batch_size, args.num_gpus, args.num_workers)
    batch_size = target_params.get_batchsize
    ctx = target_params.get_context
    classes = args.dataset.num_classes if hasattr(args.dataset, 'num_classes') else None

    target_kwargs = Getmodel_kwargs(ctx, classes, args.net, args.tricks.teacher_name,
                                    args.tricks.hard_weight, args.hybridize,
                                    args.optimizer.multi_precision,
                                    args.tricks.use_pretrained, args.tricks.use_gn,
                                    args.tricks.last_gamma, args.tricks.batch_norm,
                                    args.tricks.use_se)
    distillation = target_kwargs.distillation
    net = target_kwargs.get_net
    input_size = net.input_size if hasattr(net, 'input_size') else args.input_size

    if args.tricks.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.tricks.label_smoothing or args.tricks.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True

    if distillation:
        teacher = target_kwargs.get_teacher

        def teacher_prob(data):
            return [
                nd.softmax(teacher(X.astype(target_kwargs.dtype, copy=False))
                           / args.tricks.temperature)
                for X in data
            ]

        L = DistillationSoftmaxCrossEntropyLoss(
            temperature=args.tricks.temperature,
            hard_weight=args.tricks.hard_weight,
            sparse_label=sparse_label_loss)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
        teacher_prob = None

    if args.tricks.mixup:
        metric = get_metric_instance('rmse')
    else:
        metric = get_metric_instance(args.metric)

    train_data, val_data, batch_fn, num_batches = get_data_loader(
        args.dataset, input_size, batch_size, args.num_workers, args.final_fit,
        args.split_ratio)

    if isinstance(args.lr_config.lr_mode, str):  # fix
        target_lr = LR_params(args.optimizer.lr, args.lr_config.lr_mode, args.epochs,
                              num_batches, args.lr_config.lr_decay_epoch,
                              args.lr_config.lr_decay, args.lr_config.lr_decay_period,
                              args.lr_config.warmup_epochs, args.lr_config.warmup_lr)
        lr_scheduler = target_lr.get_lr_scheduler
    else:
        lr_scheduler = args.lr_config.lr_mode
    args.optimizer.lr_scheduler = lr_scheduler

    trainer = gluon.Trainer(net.collect_params(), args.optimizer)

    def train(epoch, num_epochs, metric):
        for i, batch in enumerate(train_data):
            metric = default_train_fn(epoch, num_epochs, net, batch, batch_size, L,
                                      trainer, batch_fn, ctx, args.tricks.mixup,
                                      args.tricks.label_smoothing, distillation,
                                      args.tricks.mixup_alpha,
                                      args.tricks.mixup_off_epoch, classes,
                                      target_kwargs.dtype, metric, teacher_prob)
            mx.nd.waitall()
        return metric

    def test(epoch):
        metric.reset()
        for i, batch in enumerate(val_data):
            default_val_fn(net, batch, batch_fn, metric, ctx, target_kwargs.dtype)
        _, reward = metric.get()
        reporter(epoch=epoch, classification_reward=reward)
        return reward

    # Note: epoch must start with 1, not 0
    tbar = tqdm(range(1, args.epochs + 1))
    for epoch in tbar:
        metric = train(epoch, args.epochs, metric)
        train_metric_name, train_metric_score = metric.get()
        tbar.set_description(
            f'[Epoch {epoch}] training: {train_metric_name}={train_metric_score:.3f}')
        if not args.final_fit:
            reward = test(epoch)
            tbar.set_description(f'[Epoch {epoch}] validation: {reward:.3f}')

    if args.final_fit:
        return {'model_params': collect_params(net), 'num_classes': classes}
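# --- Illustrative sketch (not part of the original script) ---
# train_image_classification switches the training metric to RMSE when mixup is
# enabled, presumably because mixup produces soft (non one-hot) targets. The
# actual mixing in this codebase happens inside default_train_fn, which is not
# shown in this listing; the numpy sketch below shows the standard mixup
# formulation (Zhang et al., ICLR 2018). The function name, argument layout,
# and default alpha are assumptions for illustration only.
import numpy as np

def mixup_batch_sketch(inputs, one_hot_labels, alpha=0.2):
    """Mix each sample with a randomly chosen partner from the same batch."""
    lam = np.random.beta(alpha, alpha)
    index = np.random.permutation(inputs.shape[0])
    mixed_inputs = lam * inputs + (1.0 - lam) * inputs[index]
    mixed_labels = lam * one_hot_labels + (1.0 - lam) * one_hot_labels[index]
    return mixed_inputs, mixed_labels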