def get_recommended_resource(nthreads_per_trial=None, ngpus_per_trial=None):
    """Get the recommended resource allocation for a single trial.

    Values that are not given are derived from ``get_cpu_count()`` and
    ``get_gpu_count()``; the resolved values are capped at those counts.

    Parameters
    ----------
    nthreads_per_trial
        The number of threads per trial. If None:

        - all CPUs are used when ``ngpus_per_trial`` is also None;
        - at most 4 CPUs are used when ``ngpus_per_trial`` is 0;
        - otherwise the CPUs are divided evenly among the number of trials
          that fit on the available GPUs (at least 1 thread).
    ngpus_per_trial
        The number of GPUs per trial. If None, all available GPUs are used.

    Returns
    -------
    resource
        Dict with the keys ``'num_cpus'`` and ``'num_gpus'``.

    Raises
    ------
    AssertionError
        If the resolved number of threads is not positive or the resolved
        number of GPUs is negative.
    """
    num_cpus = get_cpu_count()
    num_gpus = get_gpu_count()

    if ngpus_per_trial is None:
        if nthreads_per_trial is None:
            nthreads_per_trial = num_cpus
        ngpus_per_trial = num_gpus
    elif nthreads_per_trial is None:
        if ngpus_per_trial != 0:
            # When more GPUs are requested than are available the quotient is
            # 0; clamp to one job so the division below cannot raise
            # ZeroDivisionError (the GPU request itself is capped further down).
            num_parallel_jobs = max(num_gpus // ngpus_per_trial, 1)
            nthreads_per_trial = max(num_cpus // num_parallel_jobs, 1)
        else:
            nthreads_per_trial = min(num_cpus, 4)

    # Never hand out more than the machine reports.
    nthreads_per_trial = min(nthreads_per_trial, num_cpus)
    ngpus_per_trial = min(ngpus_per_trial, num_gpus)
    assert nthreads_per_trial > 0 and ngpus_per_trial >= 0,\
        'Invalid number of threads and number of GPUs.'
    return {'num_cpus': nthreads_per_trial, 'num_gpus': ngpus_per_trial}
def fit(dataset='voc',
        net=Categorical('mobilenet1.0'),
        meta_arch='yolo3',
        lr=Categorical(5e-4, 1e-4),
        loss=gluon.loss.SoftmaxCrossEntropyLoss(),
        split_ratio=0.8,
        batch_size=16,
        epochs=50,
        num_trials=None,
        time_limits=None,
        nthreads_per_trial=12,
        num_workers=32,
        ngpus_per_trial=1,
        hybridize=True,
        scheduler_options=None,
        search_strategy='random',
        search_options=None,
        verbose=False,
        transfer='coco',
        resume='',
        checkpoint='checkpoint/exp1.ag',
        visualizer='none',
        dist_ip_addrs=None,
        auto_search=True,
        seed=223,
        data_shape=416,
        start_epoch=0,
        lr_mode='step',
        lr_decay=0.1,
        lr_decay_period=0,
        lr_decay_epoch='160,180',
        warmup_lr=0.0,
        warmup_epochs=2,
        warmup_iters=1000,
        warmup_factor=1. / 3.,
        momentum=0.9,
        wd=0.0005,
        log_interval=100,
        save_prefix='',
        save_interval=10,
        val_interval=1,
        num_samples=-1,
        no_random_shape=False,
        no_wd=False,
        mixup=False,
        no_mixup_epochs=20,
        label_smooth=False,
        syncbn=False,
        reuse_pred_weights=True,
        **kwargs):
    """Fit object detection models.

    Parameters
    ----------
    dataset : str or :class:`autogluon.task.ObjectDetection.Dataset`
        Training dataset containing images and corresponding object bounding boxes.
    net : str, :class:`autogluon.space.AutoGluonObject`
        Which existing neural network base models to consider as candidates.
    meta_arch : str
        Meta architecture of the model. Currently support YoloV3 (Default) and FasterRCNN.
        YoloV3 is faster, while FasterRCNN is more accurate.
    lr : float or :class:`autogluon.space`
        The learning rate to use in each update of the neural network weights during training.
    loss : mxnet.gluon.loss
        Loss function used during training of the neural network weights.
    split_ratio : float
        Fraction of dataset to hold-out during training in order to tune hyperparameters
        (i.e. validation data). The final returned model may be fit to all of the data
        (after hyperparameters have been selected).
    batch_size : int
        How many images to group in each mini-batch during gradient computations in training.
    epochs : int
        How many epochs to train the neural networks for at most.
    num_trials : int
        Maximal number of hyperparameter configurations to try out.
        If both `num_trials` and `time_limits` are None, 2 trials are run.
    time_limits : int
        Approximately how long `fit()` should run for (wallclock time in seconds).
        `fit()` will stop training new models after this amount of time has elapsed
        (but models which have already started training will continue to completion).
    nthreads_per_trial : int
        How many CPUs to use in each trial (ie. single training run of a model).
        Capped at the number of available CPUs.
    num_workers : int
        How many CPUs to use for data loading during training of a model.
        NOTE: this value is currently not forwarded; the training function receives
        `nthreads_per_trial` as its `num_workers`.
    ngpus_per_trial : int
        How many GPUs to use in each trial (ie. single training run of a model).
        Capped (with a warning) at the number of available GPUs.
    hybridize : bool
        Whether or not the MXNet neural network should be hybridized (for increased efficiency).
    scheduler_options : dict
        Extra arguments passed to __init__ of scheduler, to configure the
        orchestration of training jobs during hyperparameter-tuning.
    search_strategy : str
        Which hyperparameter search algorithm to use.
        Options include: 'random' (random search), 'skopt' (SKopt Bayesian optimization),
        'grid' (grid search), 'hyperband' (Hyperband random), 'rl' (reinforcement learner).
    search_options : dict
        Auxiliary keyword arguments to pass to the searcher that performs
        hyperparameter optimization.
    verbose : bool
        Whether or not to print out intermediate information during training.
    transfer : str
        Forwarded to the training function and to `get_network` for the final model.
        Presumably names the dataset whose pretrained weights are used for
        transfer learning -- TODO confirm.
    resume : str
        Path to checkpoint file of existing model, from which model training should resume.
        If not empty, we also start the hyperparameter search from the state loaded
        from checkpoint.
    checkpoint : str or None
        State of hyperparameter search is stored to this local file
    visualizer : str
        Describes method to visualize training progress during `fit()`.
        Options: ['mxboard', 'tensorboard', 'none'].
    dist_ip_addrs : list
        List of IP addresses corresponding to remote workers, in order to leverage
        distributed computation.
    auto_search : bool
        If True, enables automatic suggestion of network types and hyper-parameter ranges
        adaptively based on provided dataset.
    seed : int
        Random seed to set for reproducibility.
    data_shape : int
        Shape of the image data.
    start_epoch : int
        Which epoch we begin training from
        (eg. if we resume training of an existing model, then this
        argument may be set to the number of epochs the model has already been
        trained for previously).
    lr_mode : str
        What sort of learning rate schedule should be followed during training.
    lr_decay : float
        How much learning rate should be decayed during training.
    lr_decay_period : int
        How often learning rate should be decayed during training.
    lr_decay_epoch : str
        Forwarded to the training function. Presumably a comma-separated list of
        epochs at which the learning rate is decayed -- TODO confirm.
    warmup_lr : float
        Learning rate to use during warm up period at the start of training.
    warmup_epochs : int
        How many initial epochs constitute the "warm up" period of model training.
    warmup_iters : int
        How many initial iterations constitute the "warm up" period of model training.
        This is used by R-CNNs
    warmup_factor : float
        warmup factor of target lr. initial lr starts from target lr * warmup_factor
    momentum : float or :class:`autogluon.space`
        Momentum to use in optimization of neural network weights during training.
    wd : float or :class:`autogluon.space`
        Weight decay to use in optimization of neural network weights during training.
    log_interval : int
        Log results every so many epochs during training.
    save_prefix : str
        Prefix to append to file name for saved model.
    save_interval : int
        Save a copy of model every so many epochs during training.
    val_interval : int
        Evaluate performance on held-out validation data every so many epochs during training.
    num_samples : int
        Forwarded to the training function (semantics not visible here).
    no_random_shape : bool
        Whether random shapes should not be used.
    no_wd : bool
        Whether weight decay should be turned off.
    mixup : bool
        Whether or not to utilize mixup data augmentation strategy.
    no_mixup_epochs : int
        If using mixup, we first train model for this many epochs without mixup
        data augmentation.
    label_smooth : bool
        Whether or not to utilize label smoothing.
    syncbn : bool
        Whether or not to utilize synchronized batch normalization.
    reuse_pred_weights : bool
        Forwarded to the training function (semantics not visible here).
    **kwargs
        Only the deprecated `grace_period` key is read; use
        `scheduler_options={'grace_period': ...}` instead.

    Returns
    -------
    :class:`autogluon.task.object_detection.Detector` object which can make predictions
    on new data and summarize what happened during `fit()`.

    Examples
    --------
    >>> from autogluon.vision.object_detection import ObjectDetection as task
    >>> detector = task.fit(dataset = 'voc', net = 'mobilenet1.0',
    ...                     time_limits = 600, ngpus_per_trial = 1, num_trials = 1)
    """
    assert search_strategy not in {'bayesopt', 'bayesopt_hyperband'}, \
        "search_strategy == 'bayesopt' or 'bayesopt_hyperband' not yet supported"
    if auto_search:
        # The strategies can be injected here, for example: automatic suggest some hps
        # based on the dataset statistics
        pass

    # Cap the requested resources at what the machine reports.
    nthreads_per_trial = min(nthreads_per_trial, get_cpu_count())
    available_gpus = get_gpu_count()
    if ngpus_per_trial > available_gpus:
        logger.warning(
            "The number of requested GPUs is greater than the number of available GPUs."
        )
        ngpus_per_trial = available_gpus

    # If only time_limits is given, the scheduler starts trials until the
    # time limit is reached
    if num_trials is None and time_limits is None:
        num_trials = 2

    train_object_detection.register_args(
        meta_arch=meta_arch,
        dataset=dataset,
        net=net,
        lr=lr,
        loss=loss,
        num_gpus=ngpus_per_trial,
        batch_size=batch_size,
        split_ratio=split_ratio,
        epochs=epochs,
        # NOTE(review): the `num_workers` argument of fit() is not used here.
        num_workers=nthreads_per_trial,
        hybridize=hybridize,
        verbose=verbose,
        final_fit=False,
        seed=seed,
        data_shape=data_shape,
        # Forward the caller's value; it used to be hard-coded to 0, which
        # silently ignored the documented `start_epoch` parameter.
        start_epoch=start_epoch,
        transfer=transfer,
        lr_mode=lr_mode,
        lr_decay=lr_decay,
        lr_decay_period=lr_decay_period,
        lr_decay_epoch=lr_decay_epoch,
        warmup_lr=warmup_lr,
        warmup_epochs=warmup_epochs,
        warmup_iters=warmup_iters,
        warmup_factor=warmup_factor,
        momentum=momentum,
        wd=wd,
        log_interval=log_interval,
        save_prefix=save_prefix,
        save_interval=save_interval,
        val_interval=val_interval,
        num_samples=num_samples,
        no_random_shape=no_random_shape,
        no_wd=no_wd,
        mixup=mixup,
        no_mixup_epochs=no_mixup_epochs,
        label_smooth=label_smooth,
        resume=resume,
        syncbn=syncbn,
        reuse_pred_weights=reuse_pred_weights)

    # Backward compatibility:
    grace_period = kwargs.get('grace_period')
    if grace_period is not None:
        if scheduler_options is None:
            scheduler_options = {'grace_period': grace_period}
        else:
            assert 'grace_period' not in scheduler_options, \
                "grace_period appears both in scheduler_options and as direct argument"
            # Shallow copy so the caller's dict is not modified.
            scheduler_options = copy.copy(scheduler_options)
            scheduler_options['grace_period'] = grace_period
        logger.warning("grace_period is deprecated, use "
                       "scheduler_options={'grace_period': ...} instead")

    scheduler_options = compile_scheduler_options(
        scheduler_options=scheduler_options,
        search_strategy=search_strategy,
        search_options=search_options,
        nthreads_per_trial=nthreads_per_trial,
        ngpus_per_trial=ngpus_per_trial,
        checkpoint=checkpoint,
        num_trials=num_trials,
        time_out=time_limits,
        # `resume` is a path here; any non-empty path means "resume the search".
        resume=(len(resume) > 0),
        visualizer=visualizer,
        time_attr='epoch',
        reward_attr='map_reward',
        dist_ip_addrs=dist_ip_addrs,
        epochs=epochs)
    results = BaseTask.run_fit(train_object_detection, search_strategy,
                               scheduler_options)
    logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> finish model fitting")
    args = sample_config(train_object_detection.args, results['best_config'])
    logger.info('The best config: {}'.format(results['best_config']))

    # The returned model is placed on every available GPU, or on CPU if none.
    ctx = ([mx.gpu(i) for i in range(available_gpus)]
           if available_gpus > 0 else [mx.cpu()])
    # NOTE(review): `dataset.init()` requires a dataset object; a plain str
    # (such as the default 'voc') has no `init` method -- confirm callers
    # always pass a Dataset instance.
    model = get_network(args.meta_arch, args.net, dataset.init().get_classes(),
                        transfer, ctx, syncbn=args.syncbn)
    update_params(model, results.pop('model_params'))
    return Detector(model, results, checkpoint, args)
def fit(dataset='SST',
        net=ag.Categorical('bert_12_768_12'),
        pretrained_dataset=ag.Categorical(
            'book_corpus_wiki_en_uncased',
            'openwebtext_book_corpus_wiki_en_uncased'),
        lr=ag.space.Real(2e-05, 2e-04, log=True),
        warmup_ratio=0.01,
        lr_scheduler='cosine',
        log_interval=100,
        seed=0,
        batch_size=32,
        dev_batch_size=32,
        max_len=128,
        dtype='float32',
        epochs=3,
        epsilon=1e-6,
        accumulate=1,
        early_stop=False,
        nthreads_per_trial=4,
        ngpus_per_trial=1,
        hybridize=True,
        scheduler_options=None,
        search_strategy='random',
        search_options=None,
        num_trials=None,
        time_limits=None,
        resume=False,
        checkpoint='checkpoint/exp1.ag',
        visualizer='none',
        dist_ip_addrs=None,
        auto_search=True,
        verbose=False,
        **kwargs):
    """Fit neural networks on text dataset.

    Parameters
    ----------
    dataset : str or :class:`autogluon.task.TextClassification.Dataset`
        The Training dataset. You can specify a string to use a popular built-in text dataset.
    net : str or :class:`autogluon.space.Categorical`
        Which existing neural network models to consider as candidates.
    pretrained_dataset : str, :class:`autogluon.space.Categorical`
        Which existing datasets to consider as candidates for transfer learning from.
    lr : float or :class:`autogluon.space`
        The learning rate to use in each update of the neural network weights during training.
    warmup_ratio : float
        Ratio of overall training period considered as "warm up".
    lr_scheduler : str
        Describes how learning rate should be adjusted over the course of training.
        Options include: 'cosine', 'poly'.
    log_interval : int
        Log results every so many epochs during training.
    seed : int
        Random seed to set for reproducibility.
    batch_size : int
        How many examples to group in each mini-batch during gradient computations in training.
    dev_batch_size : int
        How many examples to group in each mini-batch during performance evaluation
        over validation dataset.
    max_len : int
        Maximum number of words in a single training example (i.e. one text snippet).
    dtype : str
        Dtype used to represent data fed to neural networks.
    epochs : int
        How many epochs to train the neural networks for at most.
    epsilon : float
        Small number.
    accumulate : int
        How often to accumulate losses.
    early_stop : bool
        Whether to utilize early stopping during training to avoid overfitting.
    num_trials : int
        Maximal number of hyperparameter configurations to try out.
        If both `num_trials` and `time_limits` are None, 2 trials are run.
    time_limits : int
        Approximately how long `fit()` should run for (wallclock time in seconds).
        `fit()` will stop training new models after this amount of time has elapsed
        (but models which have already started training will continue to completion).
    nthreads_per_trial : int
        How many CPUs to use in each trial (ie. single training run of a model).
        Capped at the number of available CPUs.
    ngpus_per_trial : int
        How many GPUs to use in each trial (ie. single training run of a model).
        Capped at the number of available GPUs.
    hybridize : bool
        Whether or not the MXNet neural network should be hybridized (for increased efficiency).
    scheduler_options : dict
        Extra arguments passed to __init__ of scheduler, to configure the
        orchestration of training jobs during hyperparameter-tuning.
    search_strategy : str
        Which hyperparameter search algorithm to use.
        Options include: 'random' (random search), 'skopt' (SKopt Bayesian optimization),
        'grid' (grid search), 'hyperband' (Hyperband random), 'rl' (reinforcement learner).
    search_options : dict
        Auxiliary keyword arguments to pass to the searcher that performs
        hyperparameter optimization.
    verbose : bool
        Whether or not to print out intermediate information during training.
    checkpoint : str or None
        State of hyperparameter search is stored to this local file
    resume : bool
        If True, the hyperparameter search is started from state loaded from checkpoint
    visualizer : str
        Describes method to visualize training progress during `fit()`.
        Options: ['mxboard', 'tensorboard', 'none'].
    dist_ip_addrs : list
        List of IP addresses corresponding to remote workers, in order to leverage
        distributed computation.
    auto_search : bool
        If True, enables automatic suggestion of network types and hyper-parameter ranges
        adaptively based on provided dataset.
    **kwargs
        Forwarded unchanged to the training function's registered arguments.
        The deprecated `grace_period` key is additionally copied into
        `scheduler_options`.

    Returns
    -------
    :class:`autogluon.task.text_classification.TextClassificationPredictor` object
    which can make predictions on new data and summarize what happened during `fit()`.

    Examples
    --------
    >>> from autogluon.text import TextClassification as task
    >>> dataset = task.Dataset(name='ToySST')
    >>> predictor = task.fit(dataset)
    """
    assert search_strategy not in {'bayesopt', 'bayesopt_hyperband'}, \
        "search_strategy == 'bayesopt' or 'bayesopt_hyperband' not yet supported"

    # Adjacent string literals are concatenated verbatim, so each sentence
    # needs its own trailing space to keep the message readable.
    logger.warning(
        '`TextClassification` will soon be deprecated. '
        'Please use `TextPrediction` module instead. '
        'If your text dataset is in tabular format, you may also try the `TabularPrediction` module.'
    )

    if auto_search:
        # The strategies can be injected here, for example: automatic suggest some hps
        # based on the dataset statistics
        pass

    # Cap the requested resources at what the machine reports.
    nthreads_per_trial = min(nthreads_per_trial, get_cpu_count())
    ngpus_per_trial = min(ngpus_per_trial, get_gpu_count())

    # If only time_limits is given, the scheduler starts trials until the
    # time limit is reached
    if num_trials is None and time_limits is None:
        num_trials = 2

    train_text_classification.register_args(
        dataset=dataset,
        pretrained_dataset=pretrained_dataset,
        net=net,
        lr=lr,
        warmup_ratio=warmup_ratio,
        early_stop=early_stop,
        dtype=dtype,
        max_len=max_len,
        log_interval=log_interval,
        epsilon=epsilon,
        accumulate=accumulate,
        seed=seed,
        lr_scheduler=lr_scheduler,
        num_gpus=ngpus_per_trial,
        batch_size=batch_size,
        dev_batch_size=dev_batch_size,
        epochs=epochs,
        num_workers=nthreads_per_trial,
        hybridize=hybridize,
        verbose=verbose,
        final_fit=False,
        **kwargs)

    # Backward compatibility:
    grace_period = kwargs.get('grace_period')
    if grace_period is not None:
        if scheduler_options is None:
            scheduler_options = {'grace_period': grace_period}
        else:
            assert 'grace_period' not in scheduler_options, \
                "grace_period appears both in scheduler_options and as direct argument"
            # Shallow copy so the caller's dict is not modified.
            scheduler_options = copy.copy(scheduler_options)
            scheduler_options['grace_period'] = grace_period
        logger.warning("grace_period is deprecated, use "
                       "scheduler_options={'grace_period': ...} instead")

    scheduler_options = compile_scheduler_options(
        scheduler_options=scheduler_options,
        search_strategy=search_strategy,
        search_options=search_options,
        nthreads_per_trial=nthreads_per_trial,
        ngpus_per_trial=ngpus_per_trial,
        checkpoint=checkpoint,
        num_trials=num_trials,
        time_out=time_limits,
        resume=resume,
        visualizer=visualizer,
        time_attr='epoch',
        reward_attr='accuracy',
        dist_ip_addrs=dist_ip_addrs,
        epochs=epochs)
    results = BaseTask.run_fit(train_text_classification, search_strategy,
                               scheduler_options)
    args = sample_config(train_text_classification.args, results['best_config'])

    # Rebuild the best model on CPU and load the trained parameters into it.
    get_model_params = results.pop('get_model_args')
    get_model_params['ctx'] = mx.cpu(0)
    nlp = try_import_gluonnlp()
    bert, _ = nlp.model.get_model(**get_model_params)
    model = get_network(bert, results.pop('class_labels'), 'roberta' in args.net)
    update_params(model, results.pop('model_params'))
    transform = results.pop('transform')
    test_transform = results.pop('test_transform')
    return TextClassificationPredictor(model, transform, test_transform,
                                       results, checkpoint, args)
16, 'ngpus_per_trial': 8, 'batch_size': 16, 'lr_decay_epoch': ag.Categorical('24,28', '35', '50,55', '40', '45', '55', '30, 35', '20'), 'warmup_iters': ag.Int(5, 500), 'wd': ag.Categorical(1e-4, 5e-4, 2.5e-4), 'syncbn': True, 'label_smooth': False, 'time_limits': time_limits, 'dist_ip_addrs': [] } else: raise NotImplementedError('%s is not implemented.', args.meta_arch) detector = task.fit(dataset_train, **kwargs) ctx = [mx.gpu(i) for i in range(get_gpu_count())] if not ctx: ctx = [mx.cpu()] test_map = detector.evaluate(dataset_test, ctx=ctx) print("mAP on test dataset: {}".format(test_map[-1][-1])) print(test_map) detector.save('final_model.model')
def fit(dataset,
        net=ag.Categorical('ResNet50_v1b', 'ResNet18_v1b'),
        optimizer=NAG(learning_rate=ag.Real(1e-3, 1e-2, log=True),
                      wd=ag.Real(1e-4, 1e-3, log=True),
                      multi_precision=False),
        loss=SoftmaxCrossEntropyLoss(),
        split_ratio=0.8,
        batch_size=64,
        input_size=224,
        epochs=20,
        final_fit_epochs=None,
        ensemble=1,
        metric='accuracy',
        nthreads_per_trial=60,
        ngpus_per_trial=1,
        hybridize=True,
        scheduler_options=None,
        search_strategy='random',
        search_options=None,
        plot_results=False,
        verbose=False,
        num_trials=None,
        time_limits=None,
        resume=False,
        output_directory='checkpoint/',
        visualizer='none',
        dist_ip_addrs=None,
        auto_search=True,
        lr_config=ag.Dict(lr_mode='cosine',
                          lr_decay=0.1,
                          lr_decay_period=0,
                          lr_decay_epoch='40,80',
                          warmup_lr=0.0,
                          warmup_epochs=0),
        tricks=ag.Dict(last_gamma=False,
                       use_pretrained=True,
                       use_se=False,
                       mixup=False,
                       mixup_alpha=0.2,
                       mixup_off_epoch=0,
                       label_smoothing=False,
                       no_wd=False,
                       teacher_name=None,
                       temperature=20.0,
                       hard_weight=0.5,
                       batch_norm=False,
                       use_gn=False),
        **kwargs):
    """Fit image classification models to a given dataset.

    Parameters
    ----------
    dataset : str or :meth:`autogluon.task.ImageClassification.Dataset`
        Training dataset containing images and their associated class labels.
        Popular image datasets built into AutoGluon can be used by specifying their name
        as a string (options: 'mnist', 'fashionmnist', 'cifar', 'cifar10', 'cifar100',
        'imagenet').
    input_size : int
        Size of images in the dataset (pixels).
    net : str or :class:`autogluon.space.Categorical`
        Which existing neural network models to consider as candidates.
    optimizer : str or :class:`autogluon.space.AutoGluonObject`
        Which optimizers to consider as candidates for learning the neural network weights.
    batch_size : int
        How many images to group in each mini-batch during gradient computations in training.
    epochs : int
        How many epochs to train the neural networks for at most.
    final_fit_epochs : int, default None
        Final fit epochs, the same number of epochs will be used as during the HPO
        if not specified.
    ensemble : int, default = 1
        If greater than 1, the best configuration is run `ensemble - 1` additional
        times and all resulting models are combined into an `Ensemble`.
    metric : str or callable object
        Evaluation metric by which predictions will be ultimately evaluated on test data.
    loss : `mxnet.gluon.loss`
        Loss function used during training of the neural network weights.
    num_trials : int
        Maximal number of hyperparameter configurations to try out.
        If both `num_trials` and `time_limits` are None, 2 trials are run.
    time_limits : int
        Approximately how long `fit()` should run for (wallclock time in seconds).
        `fit()` will stop training new models after this amount of time has elapsed
        (but models which have already started training will continue to completion).
    split_ratio : float, default = 0.8
        Fraction of dataset to use for training (rest of data is held-out for tuning
        hyperparameters). The final returned model may be fit to all of the data
        (after hyperparameters have been selected).
    nthreads_per_trial : int
        How many CPUs to use in each trial (ie. single training run of a model).
        Capped at the number of available CPUs.
    ngpus_per_trial : int
        How many GPUs to use in each trial (ie. single training run of a model).
        Capped at the number of available GPUs.
    hybridize : bool
        Whether or not the MXNet neural network should be hybridized (for increased efficiency).
    output_directory : str
        Checkpoints of the search state are written to
        os.path.join(output_directory, 'exp1.ag')
    scheduler_options : dict
        Extra arguments passed to __init__ of scheduler, to configure the
        orchestration of training jobs during hyperparameter-tuning.
    search_strategy : str, default = 'random'
        Which hyperparameter search algorithm to use.
        Options include: 'random' (random search), 'bayesopt' (Gaussian process Bayesian
        optimization), 'skopt' (SKopt Bayesian optimization), 'grid' (grid search),
        'hyperband' (Hyperband scheduling with random search), 'bayesopt_hyperband'
        (Hyperband scheduling with GP-BO search).
    search_options : dict
        Auxiliary keyword arguments to pass to the searcher that performs
        hyperparameter optimization.
    resume : bool
        If True, the hyperparameter search is started from state loaded
        from os.path.join(output_directory, 'exp1.ag')
    dist_ip_addrs : list
        List of IP addresses corresponding to remote workers, in order to leverage
        distributed computation.
    verbose : bool
        Whether or not to print out intermediate information during training.
    plot_results : bool
        Whether or not to generate plots summarizing training process.
    visualizer : str
        Describes method to visualize training progress during `fit()`.
        Options: ['mxboard', 'tensorboard', 'none'].
    auto_search : bool
        If True, enables automatic suggestion of network types and hyper-parameter ranges
        adaptively based on provided dataset.
    lr_config : :class:`autogluon.space.Dict`
        Learning rate schedule settings:

        - lr_mode : str, default 'cosine'. Describes how learning rate should be
          adjusted over the course of training. Options include: 'cosine', 'poly'.
        - lr_decay : float, default 0.1. Decay rate of learning rate.
        - lr_decay_period : int, default 0. Interval for periodic learning rate decays;
          0 disables.
        - lr_decay_epoch : str, default '40,80'. Epochs at which learning rate decays.
        - warmup_lr : float, default 0.0. Starting warmup learning rate.
        - warmup_epochs : int, default 0. Number of warmup epochs.
    tricks : :class:`autogluon.space.Dict`
        Bag of tricks used on image classification datasets:

        - last_gamma : default False. Whether to init gamma of the last BN layer in
          each bottleneck to 0.
        - use_pretrained : default True. Enable using pretrained model from gluon.
        - use_se : default False. Use SE layers or not in resnext.
        - mixup : default False. Whether to train the model with mix-up.
        - mixup_alpha : float, default 0.2. Beta distribution parameter for mixup sampling.
        - mixup_off_epoch : int, default 0. How many last epochs to train without mixup.
        - label_smoothing : default False. Use label smoothing or not in training.
        - no_wd : default False. Whether to remove weight decay on bias, and beta/gamma
          for batchnorm layers.
        - teacher_name : str, default None. Teacher model for distillation training.
        - temperature : float, default 20.0. Temperature parameter for distillation
          teacher model.
        - hard_weight : float, default 0.5. Weight for the loss of one-hot label for
          distillation training.
        - batch_norm : default False. Enable batch normalization or not in vgg.
        - use_gn : default False. Whether to use group norm.
    **kwargs
        Only the deprecated `grace_period` key is read; use
        `scheduler_options={'grace_period': ...}` instead.

    Returns
    -------
    :class:`autogluon.task.image_classification.Classifier` object which can make
    predictions on new data and summarize what happened during `fit()`.

    Examples
    --------
    >>> from autogluon.vision import ImageClassification as task
    >>> dataset = task.Dataset(train_path='data/train',
    ...                        test_path='data/test')
    >>> classifier = task.fit(dataset,
    ...                       net=ag.space.Categorical('resnet18_v1', 'resnet34_v1'),
    ...                       time_limits=time_limits,
    ...                       ngpus_per_trial=1,
    ...                       num_trials=4)
    >>> test_data = task.Dataset('~/data/test', train=False)
    >>> test_acc = classifier.evaluate(test_data)
    """
    checkpoint = os.path.join(output_directory, 'exp1.ag')
    if auto_search:
        # The strategies can be injected here, for example: automatic suggest some hps
        # based on the dataset statistics
        net = auto_suggest_network(dataset, net)

    # Cap the requested resources at what the machine reports.
    nthreads_per_trial = min(nthreads_per_trial, get_cpu_count())
    ngpus_per_trial = min(ngpus_per_trial, get_gpu_count())

    # If only time_limits is given, the scheduler starts trials until the
    # time limit is reached
    if num_trials is None and time_limits is None:
        num_trials = 2

    # A falsy value (None or 0) falls back to the HPO epoch count.
    final_fit_epochs = final_fit_epochs if final_fit_epochs else epochs

    train_image_classification.register_args(
        dataset=dataset,
        net=net,
        optimizer=optimizer,
        loss=loss,
        metric=metric,
        num_gpus=ngpus_per_trial,
        split_ratio=split_ratio,
        batch_size=batch_size,
        input_size=input_size,
        epochs=epochs,
        final_fit_epochs=final_fit_epochs,
        verbose=verbose,
        num_workers=nthreads_per_trial,
        hybridize=hybridize,
        final_fit=False,
        tricks=tricks,
        lr_config=lr_config)

    # Backward compatibility:
    grace_period = kwargs.get('grace_period')
    if grace_period is not None:
        if scheduler_options is None:
            scheduler_options = {'grace_period': grace_period}
        else:
            assert 'grace_period' not in scheduler_options, \
                "grace_period appears both in scheduler_options and as direct argument"
            # Shallow copy so the caller's dict is not modified.
            scheduler_options = copy.copy(scheduler_options)
            scheduler_options['grace_period'] = grace_period
        logger.warning("grace_period is deprecated, use "
                       "scheduler_options={'grace_period': ...} instead")

    scheduler_options = compile_scheduler_options(
        scheduler_options=scheduler_options,
        search_strategy=search_strategy,
        search_options=search_options,
        nthreads_per_trial=nthreads_per_trial,
        ngpus_per_trial=ngpus_per_trial,
        checkpoint=checkpoint,
        num_trials=num_trials,
        time_out=time_limits,
        resume=resume,
        visualizer=visualizer,
        time_attr='epoch',
        reward_attr='classification_reward',
        dist_ip_addrs=dist_ip_addrs,
        epochs=epochs)
    results = BaseTask.run_fit(train_image_classification, search_strategy,
                               scheduler_options, plot_results=plot_results)
    args = sample_config(train_image_classification.args, results['best_config'])

    # Rebuild the best model on CPU and load the trained parameters into it.
    # (A separate name is used so the `**kwargs` parameter is not shadowed.)
    net_kwargs = {'num_classes': results['num_classes'], 'ctx': mx.cpu(0)}
    model = get_network(args.net, **net_kwargs)
    # `optimizer` may be a plain str, which has no `kwvars`; treat that the
    # same as "multi_precision not specified".
    multi_precision = getattr(optimizer, 'kwvars', {}).get('multi_precision', False)
    update_params(model, results.pop('model_params'), multi_precision)

    if ensemble > 1:
        # Re-run the best configuration to obtain the remaining ensemble members.
        models = [model]
        scheduler = create_scheduler(train_image_classification, search_strategy,
                                     scheduler_options)
        for _ in range(1, ensemble):
            resultsi = scheduler.run_with_config(results['best_config'])
            net_kwargs = {
                'num_classes': resultsi['num_classes'],
                'ctx': mx.cpu(0)
            }
            model = get_network(args.net, **net_kwargs)
            update_params(model, resultsi.pop('model_params'), multi_precision)
            models.append(model)
        model = Ensemble(models)

    # Drop entries that are not handed to the returned Classifier.
    results.pop('args')
    args.pop('optimizer')
    args.pop('dataset')
    args.pop('loss')
    return Classifier(model, results, default_val_fn, checkpoint, args)
def __init__(self, supernet, train_set='imagenet', val_set=None,
             train_fn=default_train_fn, eval_fn=default_val_fn,
             train_args=None, val_args=None, reward_fn=default_reward_fn,
             num_gpus=0, num_cpus=4, batch_size=256, epochs=120,
             warmup_epochs=5, controller_lr=1e-3, controller_type='lstm',
             controller_batch_size=10, ema_baseline_decay=0.95,
             update_arch_frequency=20, checkname='./enas/checkpoint.ag',
             plot_frequency=0, **kwargs):
    """Set up training state, the RL searcher/controller and its optimizer.

    Parameters
    ----------
    supernet
        Network to search over; its ``kwspaces`` attribute is handed to the
        ``RLSearcher``.
    train_set, val_set
        Forwarded to ``self.initialize_miscs``.
    train_fn, eval_fn, reward_fn
        Callables stored on the instance as ``self.train_fn``,
        ``self.eval_fn`` and ``self.reward_fn``.
    train_args, val_args : dict or None
        Forwarded to ``self.initialize_miscs``. ``None`` (the default) is
        replaced by a fresh empty dict for each instance.
    num_gpus, num_cpus : int
        Requested resources; each is capped at the count reported by
        ``get_gpu_count()`` / ``get_cpu_count()`` before being forwarded to
        ``self.initialize_miscs``.
    batch_size : int
        Forwarded to ``self.initialize_miscs``.
    epochs, warmup_epochs : int
        Stored on the instance as ``self.epochs`` / ``self.warmup_epochs``.
    controller_lr : float
        Learning rate of the 'adam' trainer created for the controller.
    controller_type : str
        Forwarded to ``RLSearcher``.
    controller_batch_size : int
        Stored as ``self.controller_batch_size``.
    ema_baseline_decay : float
        Stored as ``self.ema_decay``.
    update_arch_frequency : int
        Stored as ``self.update_arch_frequency``.
    checkname : str
        Stored as ``self.checkname``.
    plot_frequency : int
        Stored as ``self.plot_frequency``.
    **kwargs
        Accepted but not used in this method.
    """
    # Fresh dicts per instance: a `{}` default would be one object shared by
    # every instance created without these arguments.
    train_args = {} if train_args is None else train_args
    val_args = {} if val_args is None else val_args

    # Cap the requested resources at what the machine reports.
    num_cpus = min(num_cpus, get_cpu_count())
    num_gpus = min(num_gpus, get_gpu_count())

    self.supernet = supernet
    self.train_fn = train_fn
    self.eval_fn = eval_fn
    self.reward_fn = reward_fn
    self.checkname = checkname
    self.plot_frequency = plot_frequency
    self.epochs = epochs
    self.warmup_epochs = warmup_epochs
    self.controller_batch_size = controller_batch_size
    kwspaces = self.supernet.kwspaces

    # Called after the attributes above are set, as in the original ordering.
    self.initialize_miscs(train_set, val_set, batch_size, num_cpus, num_gpus,
                          train_args, val_args)

    # create RL searcher/controller
    self.baseline = None
    self.ema_decay = ema_baseline_decay
    self.searcher = RLSearcher(kwspaces, controller_type=controller_type,
                               prefetch=4, num_workers=4)

    # controller setup
    self.controller = self.searcher.controller
    self.controller_optimizer = mx.gluon.Trainer(
        self.controller.collect_params(), 'adam',
        optimizer_params={'learning_rate': controller_lr})
    self.update_arch_frequency = update_arch_frequency
    self.val_acc = 0

    # async controller sample
    self._worker_pool = ThreadPool(2)
    self._data_buffer = {}
    self._rcvd_idx = 0
    self._sent_idx = 0
    self._timeout = 20

    # logging history
    self.training_history = []
    self._prefetch_controller()