def eval_arch(conf_eval: Config, cell_builder: Optional[CellBuilder]): logger.pushd('eval_arch') # region conf vars conf_loader = conf_eval['loader'] model_filename = conf_eval['model_filename'] metric_filename = conf_eval['metric_filename'] conf_checkpoint = conf_eval['checkpoint'] resume = conf_eval['resume'] conf_train = conf_eval['trainer'] # endregion if cell_builder: cell_builder.register_ops() model = create_model(conf_eval) # get data train_dl, _, test_dl = data.get_data(conf_loader) assert train_dl is not None and test_dl is not None checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume) trainer = Trainer(conf_train, model, checkpoint) train_metrics = trainer.fit(train_dl, test_dl) train_metrics.save(metric_filename) # save model if model_filename: model_filename = utils.full_path(model_filename) ml_utils.save_model(model, model_filename) logger.info({'model_save_path': model_filename}) logger.popd()
def train_test()->Metrics:
    """Run a one-epoch resnet34 smoke-training on the eval config and return its metrics."""
    conf = common.get_conf()
    eval_conf = conf['nas']['eval']

    # region conf vars
    loader_conf = eval_conf['loader']
    trainer_conf = eval_conf['trainer']
    # endregion

    # shrink the run to a single fast epoch with all regularization disabled
    trainer_conf['validation']['freq'] = 1
    trainer_conf['epochs'] = 1
    loader_conf['train_batch'] = 128
    loader_conf['test_batch'] = 4096
    loader_conf['cutout'] = 0
    trainer_conf['drop_path_prob'] = 0.0
    trainer_conf['grad_clip'] = 0.0
    trainer_conf['aux_weight'] = 0.0

    model = cifar10_models.resnet34().to(torch.device('cuda'))

    # get data
    data_loaders = data.get_data(loader_conf)
    assert data_loaders.train_dl is not None and data_loaders.test_dl is not None

    trainer = Trainer(trainer_conf, model, None)
    trainer.fit(data_loaders)
    return trainer.get_metrics()
def get_data(self, conf_loader:Config)->Tuple[Optional[DataLoader], Optional[DataLoader]]:
    """Return (train, val) data loaders for conf_loader, memoized per config object.

    Results are cached keyed by the config object's identity so repeated calls
    with the same Config instance don't rebuild the datasets.
    """
    # first get from cache
    # NOTE(review): keying by id() assumes each conf_loader object outlives its
    # cache entry; a garbage-collected object's id can be reused by a new one —
    # confirm callers hold on to their Config instances
    train_ds, val_ds = self._data_cache.get(id(conf_loader), (None, None))
    # if not found in cache then create
    if train_ds is None:
        train_ds, val_ds, _ = data.get_data(conf_loader)
        self._data_cache[id(conf_loader)] = (train_ds, val_ds)
    return train_ds, val_ds
def train_test(conf_eval: Config):
    """Train a resnet34 using the loader/trainer settings found in conf_eval."""
    loader_conf = conf_eval['loader']
    trainer_conf = conf_eval['trainer']

    # create model on the first CUDA device
    model = cifar10_models.resnet34().to(torch.device('cuda', 0))

    # get data
    train_dl, _, test_dl = data.get_data(loader_conf)

    # train!
    Trainer(trainer_conf, model).fit(train_dl, test_dl)
def get_data(self, conf_loader:Config)->Tuple[Optional[DataLoader], Optional[DataLoader]]:
    """Return (train, val) data loaders for conf_loader, memoized per config object."""
    # this dict caches the dataset objects per dataset config so we don't have to reload
    # the reason we do dynamic attribute is so that any dependent methods
    # can do ray.remote
    if not hasattr(self, '_data_cache'):
        self._data_cache = {}
    # first get from cache
    # NOTE(review): id()-based keys assume the Config object stays alive for the
    # cache's lifetime; a collected object's id may be reused — confirm
    train_ds, val_ds = self._data_cache.get(id(conf_loader), (None, None))
    # if not found in cache then create
    if train_ds is None:
        train_ds, val_ds, _ = data.get_data(conf_loader)
        self._data_cache[id(conf_loader)] = (train_ds, val_ds)
    return train_ds, val_ds
def get_data(self, conf_loader: Config) -> data.DataLoaders:
    """Return the DataLoaders for conf_loader, building them at most once per config object."""
    # this dict caches the dataset objects per dataset config so we don't have to reload
    # the reason we do dynamic attribute is so that any dependent methods
    # can do ray.remote
    if not hasattr(self, '_data_cache'):
        self._data_cache: Dict[int, data.DataLoaders] = {}

    cache_key = id(conf_loader)
    try:
        # cache hit: reuse loaders already built for this exact config object
        return self._data_cache[cache_key]
    except KeyError:
        data_loaders = self._data_cache[cache_key] = data.get_data(conf_loader)
        return data_loaders
def main():
    """Train a single Nasbench-101 model end to end using the nasbench101 eval config."""
    # candidate model indices noted previously: 6, 7, 9, 10, 16
    #model = model_builder.build(model_builder.EXAMPLE_DESC_MATRIX, model_builder.EXAMPLE_VERTEX_OPS)

    # load the full pickled Nasbench-101 dataset
    nsds = Nasbench101Dataset('~/dataroot/nasbench_ds/nasbench_full.pkl')

    conf = common_init(config_filepath='confs/algos/nasbench101.yaml')
    conf_eval = conf['nas']['eval']
    conf_loader = conf_eval['loader']
    conf_trainer = conf_eval['trainer']

    # NOTE(review): the comment below claims index 401277 matches the example
    # model, but index 5 is passed — confirm which index is intended
    model = nsds.create_model(5)  # 401277 is same model as example

    data_loaders = data.get_data(conf_loader)
    trainer = Trainer(conf_trainer, model)
    trainer.fit(data_loaders)
def finalize_model(self, model: Model, to_cpu=True, restore_device=True) -> ModelDesc:
    """Collect DivOp activation statistics over one pass of the search train set,
    update every cell's node covariances, then delegate to the base finalizer.

    Args:
        model: model whose cells contain DivOp edges to analyze.
        to_cpu: forwarded unchanged to the base class finalizer.
        restore_device: forwarded unchanged to the base class finalizer.

    Returns:
        The finalized ModelDesc from the base class.
    """
    logger.pushd('finalize')

    # get config and train data loader
    # TODO: confirm this is correct in case you get silent bugs
    conf = get_conf()
    conf_loader = conf['nas']['search']['loader']
    # val/test loaders are not needed for activation collection
    train_dl, _, _ = get_data(conf_loader)

    # wrap all cells in the model
    self._divnas_cells: Dict[int, Divnas_Cell] = {}
    for cell in model.cells:
        self._divnas_cells[id(cell)] = Divnas_Cell(cell)

    # go through all edges in the DAG and if they are of divop
    # type then set them to collect activations
    sigma = conf['nas']['search']['divnas']['sigma']
    for dcell in self._divnas_cells.values():
        dcell.collect_activations(DivOp, sigma)

    # now we need to run one evaluation epoch to collect activations
    # we do it on cpu otherwise we might run into memory issues
    # later we can redo the whole logic in pytorch itself
    # at the end of this each node in a cell will have the covariance
    # matrix of all incoming edges' ops
    model = model.cpu()
    model.eval()
    with torch.no_grad():
        for x, _ in train_dl:
            # forward pass only for its side effect of recording activations;
            # the output itself is discarded
            model(x)

    # now you can go through and update the
    # node covariances in every cell
    for dcell in self._divnas_cells.values():
        dcell.update_covs()

    logger.popd()
    return super().finalize_model(model, to_cpu, restore_device)
def train_test(conf_eval:Config):
    """Run a quick 10-epoch resnet34 training driven by the eval section of the config."""
    # region conf vars
    loader_conf = conf_eval['loader']
    trainer_conf = conf_eval['trainer']
    # endregion

    # fast-run overrides: few epochs, large test batches, regularization disabled
    trainer_conf['validation']['freq'] = 1
    trainer_conf['epochs'] = 10
    loader_conf['train_batch'] = 128
    loader_conf['test_batch'] = 4096
    loader_conf['cutout'] = 0
    trainer_conf['drop_path_prob'] = 0.0
    trainer_conf['grad_clip'] = 0.0
    trainer_conf['aux_weight'] = 0.0

    model = cifar10_models.resnet34().to(torch.device('cuda', 0))

    # get data
    train_dl, _, test_dl = data.get_data(loader_conf)
    assert train_dl is not None and test_dl is not None

    Trainer(trainer_conf, model, None).fit(train_dl, test_dl)
def imagenet_test():
    """Build just the imagenet train loader from the combined darts + imagenet configs."""
    conf = Config('confs/algos/darts.yaml;confs/datasets/imagenet.yaml')
    loader_conf = conf['nas']['eval']['loader']
    # only the first (train) loader is of interest here
    train_loader, *_ = data.get_data(loader_conf)
import logging

from archai.datasets import data
from archai.common import utils
from archai.common.timing import MeasureTime, print_all_timings, print_timing, get_timing
from archai.common.common import logger, common_init

# build the darts eval loader with large batches and no cutout for a throughput test
conf = common_init(config_filepath='confs/algos/darts.yaml',
                   param_args=['--common.experiment_name', 'restnet_test'])
conf_eval = conf['nas']['eval']
conf_loader = conf_eval['loader']
conf_loader['train_batch'] = 512
conf_loader['test_batch'] = 4096
conf_loader['cutout'] = 0
train_dl, _, test_dl = data.get_data(conf_loader)

@MeasureTime
def iter_dl(dl):
    # walk the given loader once, moving each batch to GPU; returns total samples seen
    dummy = 0.0
    # BUG FIX: iterate the dl argument — the original looped over the global
    # train_dl, so timing any other loader silently measured the wrong one
    for x, y in dl:
        x = x.cuda()
        y = y.cuda()
        dummy += len(x)
        # dummy += len(y)
    return dummy

# BUG FIX: log message typo 'batch_cout' corrected to 'batch_count'
logging.info(f'batch_count={len(train_dl)}')
import logging

from archai.datasets import data
from archai.common import utils
from archai.common.timing import MeasureTime, print_all_timings, print_timing, get_timing
from archai.common.common import logger, common_init

# build the darts eval loader with large batches and no cutout for a throughput test
conf = common_init(config_filepath='confs/algos/darts.yaml',
                   param_args=['--common.experiment_name', 'restnet_test'])
conf_eval = conf['nas']['eval']
conf_loader = conf_eval['loader']
conf_loader['train_batch'] = 512
conf_loader['test_batch'] = 4096
conf_loader['cutout'] = 0
data_loaders = data.get_data(conf_loader)
assert data_loaders.train_dl is not None

@MeasureTime
def iter_dl(dl):
    # walk the given loader once, moving each batch to GPU; returns total samples seen
    dummy = 0.0
    for x, y in dl:
        x = x.cuda()
        y = y.cuda()
        dummy += len(x)
        # dummy += len(y)
    return dummy

logging.info(f'batch_count={len(data_loaders.train_dl)}')