def download():
    DOWNLOAD_URL = 'http://vision.stanford.edu/lijiali/event_dataset/event_dataset.rar'

    # make sport8 directory
    sport8 = utils.full_path(os.path.join(dataroot, 'sport8'))
    meta = utils.full_path(os.path.join(sport8, 'meta'))
    os.makedirs(sport8, exist_ok=True)
    os.makedirs(meta, exist_ok=True)

    dir_downloads = utils.dir_downloads()
    filename = os.path.basename(DOWNLOAD_URL)
    archive = os.path.join(dir_downloads, filename)
    if not os.path.isfile(archive):
        tvutils.download_url(DOWNLOAD_URL, dir_downloads, filename)

    print(f"Extracting {archive} to {sport8}")
    pyunpack.Archive(archive).extractall(sport8)

    # download the csv files for the train and test split
    # from the 'NAS Evaluation is Frustrating' repo
    # note that download_url doesn't work in vscode debug mode
    test_file_url = 'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/Sport8_test.csv'
    train_file_url = 'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/Sport8_train.csv'
    tvutils.download_url(test_file_url, meta, filename=None, md5=None)
    tvutils.download_url(train_file_url, meta, filename=None, md5=None)

    return sport8, meta
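# Minimal usage sketch for download() above (assumptions: `dataroot` is set at module
# level and the rar extraction backend used by pyunpack is installed). It only calls
# the function and lists the split CSVs that land in the meta folder; the prints are
# for illustration, not part of the original script.
if __name__ == '__main__':
    sport8_dir, meta_dir = download()
    print('images extracted to:', sport8_dir)
    print('split files:', os.listdir(meta_dir))  # expect Sport8_train.csv, Sport8_test.csv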
def main():
    in_dataset_file = utils.full_path(
        '~/dataroot/nasbench_ds/nasbench_full.tfrecord.pkl')
    out_dataset_file = utils.full_path(
        '~/dataroot/nasbench_ds/nasbench_full.pkl')

    stats: Dict[str, dict] = {}
    with open(in_dataset_file, 'rb') as f:
        records = pickle.load(f)

    for module_hash, epochs, raw_adjacency, raw_operations, raw_metrics in records:
        dim = int(np.sqrt(len(raw_adjacency)))
        adjacency = np.array([int(e) for e in list(raw_adjacency)], dtype=np.int8)
        adjacency = np.reshape(adjacency, (dim, dim))
        operations = raw_operations.split(',')
        metrics = model_metrics_pb2.ModelMetrics.FromString(
            base64.b64decode(raw_metrics))

        if module_hash not in stats:
            stats[module_hash] = {
                'module_hash': module_hash,
                'module_adjacency': adjacency,
                'module_operations': operations,
                'trainable_parameters': metrics.trainable_parameters,
                'total_time': metrics.total_time,
                'metrics': {}
            }
        entry = stats[module_hash]

        assert entry['module_hash'] == module_hash
        #assert entry['module_adjacency'] == adjacency
        assert entry['module_operations'] == operations
        assert entry['trainable_parameters'] == metrics.trainable_parameters

        if epochs not in entry['metrics']:
            entry['metrics'][epochs] = []
        entry['metrics'][epochs].append(
            [eval_to_dict(e) for e in metrics.evaluation_data])

    dataset = sorted(stats.values(), key=lambda d: np.mean(
        [r[-1]['test_accuracy'] for r in d['metrics'][108]]))
    for i, d in enumerate(dataset):
        d['rank'] = i

    odict = OrderedDict((d['module_hash'], d) for d in dataset)
    with open(out_dataset_file, 'wb') as f:
        pickle.dump(odict, f)
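# Hedged consumer sketch for the pickle written by main() above: entries are keyed by
# module hash and carry adjacency, operations and per-epoch-budget metrics, per the
# conversion loop. The path and field names mirror the code above; the helper name is
# hypothetical and only meant as illustration.
def peek_converted_dataset(path='~/dataroot/nasbench_ds/nasbench_full.pkl'):
    with open(utils.full_path(path), 'rb') as f:
        data = pickle.load(f)  # OrderedDict[module_hash, dict], sorted by mean test accuracy
    module_hash, entry = next(iter(data.items()))
    print(module_hash, entry['rank'], entry['module_operations'])
    # entry['metrics'][108] is a list of runs, each a list of per-evaluation dicts
    print(entry['metrics'][108][0][-1]['test_accuracy'])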
def main():
    default_dir = r'D:\GitHubSrc\archaiphilly\phillytools\nasbench_darts_lr0.025_wd3_b128'

    parser = argparse.ArgumentParser(description='Pytorch cifar training')
    parser.add_argument('--in-dir', default=default_dir)
    parser.add_argument('--out-dir', default=default_dir)
    args = parser.parse_args()

    parsed_metrics = delimited_text.DelimitedText()

    in_dir = pathlib.Path(utils.full_path(args.in_dir))
    assert in_dir.exists(), f'Does not exist: {in_dir}'
    metrics_filepaths = in_dir.rglob('metrics*.tsv')
    for metrics_filepath in metrics_filepaths:
        text = metrics_filepath.read_text()
        parsed_metrics.add_from_text(text, has_header=True)
    assert len(parsed_metrics) >= 1

    model_nums = [int(r) for r in parsed_metrics.get_col('model_name')]
    nasbench_acc = [
        statistics.mean(literal_eval(r))
        for r in parsed_metrics.get_col('nasbenc101_test_acc')
    ]
    retrain_acc = [float(r) for r in parsed_metrics.get_col('test_acc')]

    stats = list(zip(model_nums, nasbench_acc, retrain_acc))
    stats.sort(key=lambda t: t[0])
    retrain_ranks = utils.get_ranks(stats, key=lambda t: t[2])
    stats = list(
        (i, rr, *t) for i, (t, rr) in enumerate(zip(stats, retrain_ranks)))
    corr = scipy.stats.pearsonr([t[0] for t in stats], [t[1] for t in stats])

    out_metrics = delimited_text.DelimitedText()
    out_metrics.add_from_cols_list(stats, header=[
        'nasbench_rank', 'rerank', 'model_num', 'nasbench_acc', 'retrain_acc'
    ])

    rerank_filepath = os.path.join(utils.full_path(args.out_dir), 'reranking.tsv')
    out_metrics.save(rerank_filepath)
    corr_filepath = os.path.join(utils.full_path(args.out_dir), 'corr.txt')
    utils.write_string(corr_filepath, str(corr))
def _copy_final_desc(self, search_conf) -> Tuple[Config, Config]:
    # get desc file path that search has produced
    search_desc_filename = search_conf['nas']['search']['final_desc_filename']
    search_desc_filepath = utils.full_path(search_desc_filename)
    assert search_desc_filepath and os.path.exists(search_desc_filepath)

    # get file path that eval would need
    eval_conf = self._init('eval')
    eval_desc_filename = eval_conf['nas']['eval']['final_desc_filename']
    eval_desc_filepath = utils.full_path(eval_desc_filename)
    assert eval_desc_filepath
    shutil.copy2(search_desc_filepath, eval_desc_filepath)

    return search_conf, eval_conf
def _draw_model(self) -> None:
    if not self._plotsdir:
        return

    train_metrics = self.get_metrics()
    if train_metrics:
        best_train, best_val, best_test = train_metrics.run_metrics.best_epoch()

        # if test is available and is best for this epoch then mark it as best
        is_best = best_test and best_test.index == train_metrics.cur_epoch().index
        # if val is available and is best for this epoch then mark it as best
        is_best = is_best or best_val and best_val.index == train_metrics.cur_epoch().index
        # if neither val nor test is available then use train metrics
        is_best = is_best or best_train.index == train_metrics.cur_epoch().index

        if is_best:
            # log model_desc as an image
            plot_filepath = utils.full_path(os.path.join(
                self._plotsdir,
                f"EP{train_metrics.cur_epoch().index:03d}"), create=True)
            draw_model_desc(self.model.finalize(),
                            filepath=plot_filepath,
                            caption=f"Epoch {train_metrics.cur_epoch().index}")
def eval_arch(conf_eval: Config, cell_builder: Optional[CellBuilder]):
    logger.pushd('eval_arch')

    # region conf vars
    conf_loader = conf_eval['loader']
    model_filename = conf_eval['model_filename']
    metric_filename = conf_eval['metric_filename']
    conf_checkpoint = conf_eval['checkpoint']
    resume = conf_eval['resume']
    conf_train = conf_eval['trainer']
    # endregion

    if cell_builder:
        cell_builder.register_ops()

    model = create_model(conf_eval)

    # get data
    train_dl, _, test_dl = data.get_data(conf_loader)
    assert train_dl is not None and test_dl is not None

    checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume)
    trainer = Trainer(conf_train, model, checkpoint)
    train_metrics = trainer.fit(train_dl, test_dl)
    train_metrics.save(metric_filename)

    # save model
    if model_filename:
        model_filename = utils.full_path(model_filename)
        ml_utils.save_model(model, model_filename)
        logger.info({'model_save_path': model_filename})

    logger.popd()
def untar_dataset(conf_name: str, pt_data_dir: str, conf_data: Config,
                  dataroot: str) -> None:
    if 'storage_name' not in conf_data or not conf_data['storage_name']:
        print(f'data config {conf_name} ignored because storage_name key was not found or not set')
        return

    print(f'Untarring for data config: {conf_name}')
    storage_name = conf_data['storage_name']
    tar_filepath = os.path.join(pt_data_dir, storage_name + '.tar')
    if not os.path.isfile(tar_filepath):
        raise RuntimeError(f'Tar file for dataset at {tar_filepath} was not found')

    tar_size = pathlib.Path(tar_filepath).stat().st_size
    print('tar_filepath:', tar_filepath, 'tar_size:', tar_size)

    local_dataroot = utils.full_path(dataroot)
    print('local_dataroot:', local_dataroot)
    _create_ram_disk(tar_size, local_dataroot)
    # os.makedirs(local_dataroot, exist_ok=True)

    utils.exec_shell_command(f'tar -xf "{tar_filepath}" -C "{local_dataroot}"')

    print(f'dataset copied from {tar_filepath} to {local_dataroot} successfully')
def copy_search_to_eval(self) -> None:
    # do not cache conf_search or conf_eval as they may have values that
    # need env var expansion

    # get desc file path that search has produced
    conf_search = self.get_conf(True)['nas']['search']
    search_desc_filename = conf_search['final_desc_filename']
    search_desc_filepath = utils.full_path(search_desc_filename)
    assert search_desc_filepath and os.path.exists(search_desc_filepath)

    # get file path that eval would need
    conf_eval = self.get_conf(False)['nas']['eval']
    eval_desc_filename = conf_eval['final_desc_filename']
    eval_desc_filepath = utils.full_path(eval_desc_filename)
    assert eval_desc_filepath
    utils.copy_file(search_desc_filepath, eval_desc_filepath)
def evaluate(self, conf_eval: Config,
             model_desc_builder: ModelDescBuilder) -> EvalResult:
    logger.pushd('eval_arch')

    # region conf vars
    conf_checkpoint = conf_eval['checkpoint']
    resume = conf_eval['resume']
    model_filename = conf_eval['model_filename']
    metric_filename = conf_eval['metric_filename']
    # endregion

    model = self.create_model(conf_eval, model_desc_builder)

    checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume)
    train_metrics = self.train_model(conf_eval, model, checkpoint)
    train_metrics.save(metric_filename)

    # save model
    if model_filename:
        model_filename = utils.full_path(model_filename)
        ml_utils.save_model(model, model_filename)
        logger.info({'model_save_path': model_filename})

    logger.popd()
    return EvalResult(train_metrics)
def main():
    parser = argparse.ArgumentParser(description='Archai data install')
    parser.add_argument('--dataroot', type=str, default='~/dataroot',
                        help='path to dataroot on local drive')
    parser.add_argument('--dataset', type=str, default='cifar10',
                        help='Name of the dataset for which confs/dataset/name.yaml '
                             'should exist and have name of folder or tar file it resides in')
    args, extra_args = parser.parse_known_args()

    pt_data_dir = os.environ.get('PT_DATA_DIR', '')
    if not pt_data_dir:
        raise RuntimeError('This script needs PT_DATA_DIR environment variable '
                           'with path to dataroot on cloud drive')
    pt_data_dir = utils.full_path(pt_data_dir)
    print('pt_data_dir:', pt_data_dir)

    conf_data_filepath = f'confs/datasets/{args.dataset}.yaml'
    print('conf_data_filepath:', conf_data_filepath)

    conf = Config(config_filepath=conf_data_filepath)
    for dataset_key in ['dataset', 'dataset_search', 'dataset_eval']:
        if dataset_key in conf:
            conf_data = conf[dataset_key]
            untar_dataset(dataset_key, pt_data_dir, conf_data, args.dataroot)
def _save_trained(self, reductions: int, cells: int, nodes: int,
                  search_iter: int, metrics_stats: MetricsStats) -> None:
    """Save the model and metric info into a log file"""

    # construct path where we will save
    subdir = utils.full_path(self.metrics_dir.format(**vars()), create=True)

    # save metrics_stats
    metrics_stats_filepath = os.path.join(subdir, 'metrics_stats.yaml')
    if metrics_stats_filepath:
        with open(metrics_stats_filepath, 'w') as f:
            yaml.dump(metrics_stats, f)

    # save just metrics separately
    metrics_filepath = os.path.join(subdir, 'metrics.yaml')
    if metrics_filepath:
        with open(metrics_filepath, 'w') as f:
            yaml.dump(metrics_stats.train_metrics, f)

    logger.info({'metrics_stats_filepath': metrics_stats_filepath,
                 'metrics_filepath': metrics_filepath})

    # append key info in root pareto data
    if self._parito_filepath:
        train_top1 = val_top1 = train_epoch = val_epoch = math.nan

        # extract metrics
        if metrics_stats.train_metrics:
            best_metrics = metrics_stats.train_metrics.run_metrics.best_epoch()
            train_top1 = best_metrics[0].top1.avg
            train_epoch = best_metrics[0].index
            if best_metrics[1]:
                val_top1 = best_metrics[1].top1.avg if len(best_metrics) > 1 else math.nan
                val_epoch = best_metrics[1].index if len(best_metrics) > 1 else math.nan

        # extract model stats
        if metrics_stats.model_stats:
            flops = metrics_stats.model_stats.Flops
            parameters = metrics_stats.model_stats.parameters
            inference_memory = metrics_stats.model_stats.inference_memory
            inference_duration = metrics_stats.model_stats.duration
        else:
            flops = parameters = inference_memory = inference_duration = math.nan

        utils.append_csv_file(self._parito_filepath, [
            ('reductions', reductions),
            ('cells', cells),
            ('nodes', nodes),
            ('search_iter', search_iter),
            ('train_top1', train_top1),
            ('train_epoch', train_epoch),
            ('val_top1', val_top1),
            ('val_epoch', val_epoch),
            ('flops', flops),
            ('params', parameters),
            ('inference_memory', inference_memory),
            ('inference_duration', inference_duration)
        ])
def save_trained(self, conf_search: Config, reductions: int, cells: int,
                 nodes: int, model_metrics: ModelMetrics) -> None:
    """Save the model and metric info into a log file"""

    metrics_dir = conf_search['metrics_dir']

    # construct path where we will save
    subdir = utils.full_path(metrics_dir.format(**vars()), create=True)

    model_stats = nas_utils.get_model_stats(model_metrics.model)

    # save model_stats in its own file
    model_stats_filepath = os.path.join(subdir, 'model_stats.yaml')
    if model_stats_filepath:
        with open(model_stats_filepath, 'w') as f:
            yaml.dump(model_stats, f)

    # save just metrics separately for convenience
    metrics_filepath = os.path.join(subdir, 'metrics.yaml')
    if metrics_filepath:
        with open(metrics_filepath, 'w') as f:
            yaml.dump(model_stats.metrics, f)

    logger.info({'model_stats_filepath': model_stats_filepath,
                 'metrics_filepath': metrics_filepath})

    # append key info in root pareto data
    if self._summary_filepath:
        train_top1 = val_top1 = train_epoch = val_epoch = math.nan

        # extract metrics
        if model_metrics.metrics:
            best_metrics = model_metrics.metrics.run_metrics.best_epoch()
            train_top1 = best_metrics[0].top1.avg
            train_epoch = best_metrics[0].index
            if best_metrics[1]:
                val_top1 = best_metrics[1].top1.avg if len(best_metrics) > 1 else math.nan
                val_epoch = best_metrics[1].index if len(best_metrics) > 1 else math.nan

        # extract model stats
        flops = model_stats.Flops
        parameters = model_stats.parameters
        inference_memory = model_stats.inference_memory
        inference_duration = model_stats.duration

        utils.append_csv_file(self._summary_filepath, [
            ('reductions', reductions),
            ('cells', cells),
            ('nodes', nodes),
            ('train_top1', train_top1),
            ('train_epoch', train_epoch),
            ('val_top1', val_top1),
            ('val_epoch', val_epoch),
            ('flops', flops),
            ('params', parameters),
            ('inference_memory', inference_memory),
            ('inference_duration', inference_duration)
        ])
def main():
    in_dataset_file = utils.full_path(
        '~/dataroot/nasbench_ds/nasbench_full.tfrecord.pkl')
    out_dataset_file = utils.full_path(
        '~/dataroot/nasbench_ds/nasbench101_sample.tfrecord.pkl')

    with open(in_dataset_file, 'rb') as f:
        records = pickle.load(f)

    sampled_indices = set()
    adj_samples = 1000
    for i in [0, 4000, 40000, len(records) - 1 - adj_samples + 1]:
        sampled_indices = sampled_indices.union(
            [i + k for k in range(adj_samples)])

    sampled_hashes = set(records[i][0] for i in sorted(list(sampled_indices)))
    sampled = [r for r in records if r[0] in sampled_hashes]

    with open(out_dataset_file, 'wb') as f:
        pickle.dump(sampled, f)
def copy_search_to_eval(self) -> None:
    # get folder of model gallery that search has produced
    conf_search = self.get_conf(True)['nas']['search']
    search_desc_foldername = conf_search['final_desc_foldername']
    search_desc_folderpath = utils.full_path(search_desc_foldername)
    assert search_desc_foldername and os.path.exists(search_desc_folderpath)

    # get folder path that eval would need
    conf_eval = self.get_conf(False)['nas']['eval']
    eval_desc_foldername = conf_eval['final_desc_foldername']
    eval_desc_folderpath = utils.full_path(eval_desc_foldername)
    assert eval_desc_folderpath

    # only later versions of shutil.copytree have the dirs_exist_ok option,
    # so be robust to a pre-existing directory
    if os.path.exists(eval_desc_folderpath):
        shutil.rmtree(eval_desc_folderpath)
    utils.copy_dir(search_desc_folderpath, eval_desc_folderpath, use_shutil=True)
def __init__(self, dataset_file, seed=None):
    self.config = config.build_config()
    random.seed(seed)

    dataset_file = utils.full_path(dataset_file)
    logging.info(f'Loading dataset from file "{dataset_file}"...')
    start = time.time()
    with open(dataset_file, 'rb') as f:
        self.data: OrderedDict[str, dict] = pickle.load(f)
    self.module_hashes = list(self.data.keys())
    elapsed = time.time() - start
    logging.info('Loaded dataset in %d seconds' % elapsed)
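# Usage sketch (assumption: the __init__ above belongs to Nasbench101Dataset, which is
# how the training script further below constructs it). Attribute names come from the
# code above; the helper name is hypothetical.
def print_dataset_summary(dataset_file: str) -> None:
    nsds = Nasbench101Dataset(dataset_file, seed=42)
    print(f'{len(nsds.module_hashes)} architectures loaded')
    first = nsds.data[nsds.module_hashes[0]]
    print('keys of first entry:', list(first.keys()))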
def save(self, filename: str, save_trainables=False) -> Optional[str]:
    if filename:
        filename = utils.full_path(filename)

        if save_trainables:
            state_dict = self.state_dict()
            pt_filepath = ModelDesc._pt_filepath(filename)
            torch.save(state_dict, pt_filepath)

        # save yaml
        cloned = self.clone()
        cloned.clear_trainables()
        utils.write_string(filename, yaml.dump(cloned))

    return filename
def main():
    parser = argparse.ArgumentParser(description='Visualize model description')
    parser.add_argument('-f', '--model-desc-file', type=str,
                        default='models/final_model_desc5.yaml',
                        help='Model desc file')
    args, extra_args = parser.parse_known_args()

    model_desc_filepath = utils.full_path(args.model_desc_file)
    model_desc = ModelDesc.load(model_desc_filepath)
    out_file = pathlib.Path(model_desc_filepath).with_suffix('')
    draw_model_desc(model_desc, str(out_file))
def evaluate(self, conf_eval: Config,
             model_desc_builder: ModelDescBuilder) -> EvalResult:
    """Takes a folder of model descriptions output by the search process and
    trains them in a distributed manner using ray with 1 gpu"""

    logger.pushd('evaluate')

    final_desc_foldername: str = conf_eval['final_desc_foldername']

    # get list of model descs in the gallery folder
    final_desc_folderpath = utils.full_path(final_desc_foldername)
    files = [os.path.join(final_desc_folderpath, f)
             for f in glob.glob(os.path.join(final_desc_folderpath, 'model_desc_*.yaml'))
             if os.path.isfile(os.path.join(final_desc_folderpath, f))]
    logger.info({'model_desc_files': len(files)})

    # to avoid all workers downloading the dataset individually, do it beforehand
    self._ensure_dataset_download(conf_eval)

    future_ids = []
    for model_desc_filename in files:
        future_id = EvaluaterPetridish._train_dist.remote(
            self, conf_eval, model_desc_builder, model_desc_filename,
            common.get_state())
        future_ids.append(future_id)

    # wait for all eval jobs to finish
    ready_refs, remaining_refs = ray.wait(future_ids, num_returns=len(future_ids))

    # plot pareto curve of gallery of models
    hull_points = [ray.get(ready_ref) for ready_ref in ready_refs]
    save_hull(hull_points, common.get_expdir())
    plot_pool(hull_points, common.get_expdir())

    best_point = max(hull_points, key=lambda p: p.metrics.best_val_top1())
    logger.info({'best_val_top1': best_point.metrics.best_val_top1(),
                 'best_MAdd': best_point.model_stats.MAdd})

    logger.popd()
    return EvalResult(best_point.metrics)
def load(filename: str, load_trainables=False) -> 'ModelDesc':
    filename = utils.full_path(filename)
    if not filename or not os.path.exists(filename):
        raise RuntimeError("Model description file is not found. "
                           "Typically this file should be generated from the search. "
                           "Please copy this file to '{}'".format(filename))

    logger.info({'final_desc_filename': filename})
    with open(filename, 'r') as f:
        model_desc = yaml.load(f, Loader=yaml.Loader)

    if load_trainables:
        # look for pth file that should have pytorch parameters state_dict
        pt_filepath = ModelDesc._pt_filepath(filename)
        if os.path.exists(pt_filepath):
            state_dict = torch.load(pt_filepath, map_location=torch.device('cpu'))
            model_desc.load_state_dict(state_dict)
        # else no need to restore weights

    return model_desc
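# Roundtrip sketch combining ModelDesc.save (above) and ModelDesc.load: save a desc
# with trainables, then restore it on CPU. This assumes the surrounding ModelDesc
# class with _pt_filepath, clone and clear_trainables; the helper name and path are
# placeholders for illustration only.
def roundtrip_model_desc(model_desc: 'ModelDesc', filename: str) -> 'ModelDesc':
    saved_path = model_desc.save(filename, save_trainables=True)
    assert saved_path is not None
    return ModelDesc.load(saved_path, load_trainables=True)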
def __init__(self, conf_search: Config, cell_builder: Optional[CellBuilder],
             trainer_class: TArchTrainer, finalizers: Finalizers) -> None:
    # region config vars
    conf_checkpoint = conf_search['checkpoint']
    resume = conf_search['resume']
    self.conf_model_desc = conf_search['model_desc']
    self.conf_loader = conf_search['loader']
    self.conf_train = conf_search['trainer']
    self.final_desc_filename = conf_search['final_desc_filename']
    self.full_desc_filename = conf_search['full_desc_filename']
    self.metrics_dir = conf_search['metrics_dir']
    self.conf_presearch_train = conf_search['seed_train']
    self.conf_postsearch_train = conf_search['post_train']
    conf_pareto = conf_search['pareto']
    self.base_cells = self.conf_model_desc['n_cells']
    self.max_cells = conf_pareto['max_cells']
    self.base_reductions = self.conf_model_desc['n_reductions']
    self.max_reductions = conf_pareto['max_reductions']
    self.base_nodes = self.conf_model_desc['n_nodes']
    self.max_nodes = conf_pareto['max_nodes']
    self.search_iters = conf_search['search_iters']
    self.pareto_enabled = conf_pareto['enabled']
    pareto_summary_filename = conf_pareto['summary_filename']
    # endregion

    self.cell_builder = cell_builder
    self.trainer_class = trainer_class
    self.finalizers = finalizers
    self._data_cache = {}
    self._parito_filepath = utils.full_path(pareto_summary_filename)
    self._checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume)

    logger.info({'pareto_enabled': self.pareto_enabled,
                 'base_reductions': self.base_reductions,
                 'base_cells': self.base_cells,
                 'base_nodes': self.base_nodes,
                 'max_reductions': self.max_reductions,
                 'max_cells': self.max_cells,
                 'max_nodes': self.max_nodes})
def restore_checkpoint(self, conf_search: Config, macro_combinations) \
        -> Tuple[int, Optional[SearchResult]]:
    conf_pareto = conf_search['pareto']
    pareto_summary_filename = conf_pareto['summary_filename']

    summary_filepath = utils.full_path(pareto_summary_filename)

    # if checkpoint is available then restart from last combination we were running
    checkpoint_avail = self._checkpoint is not None
    resumed, state = False, None
    start_macro_i, best_result = 0, None
    if checkpoint_avail:
        state = self._checkpoint.get('search', None)
        if state is not None:
            start_macro_i = state['start_macro_i']
            assert start_macro_i >= 0 and start_macro_i < len(macro_combinations)
            best_result = yaml.load(state['best_result'], Loader=yaml.Loader)

            start_macro_i += 1  # resume after the last checkpoint
            resumed = True

    if not resumed:
        # erase previous file left over from run
        utils.zero_file(summary_filepath)

    logger.warn({'resumed': resumed,
                 'checkpoint_avail': checkpoint_avail,
                 'checkpoint_val': state is not None,
                 'start_macro_i': start_macro_i,
                 'total_macro_combinations': len(macro_combinations)})
    return start_macro_i, best_result
def get_filepath(suffix):
    conf = common_init(config_filepath='confs/algos/darts.yaml',
                       param_args=['--common.experiment_name',
                                   'test_basename' + f'_{suffix}'])
    return utils.full_path(os.path.join('$expdir', 'somefile.txt'))
import tensorflow as tf
import json, base64
import numpy as np
#import model_metrics_pb2
import pickle

from archai.common import utils

dataset_file = utils.full_path(
    '~/dataroot/nasbench_ds/nasbench_only108.tfrecord')

records = []
for serialized_row in tf.python_io.tf_record_iterator(dataset_file):
    module_hash, epochs, raw_adjacency, raw_operations, raw_metrics = (
        json.loads(serialized_row.decode('utf-8')))

    # dim = int(np.sqrt(len(raw_adjacency)))
    # adjacency = np.array([int(e) for e in list(raw_adjacency)], dtype=np.int8)
    # adjacency = np.reshape(adjacency, (dim, dim))
    # operations = raw_operations.split(',')
    # metrics = base64.b64decode(raw_metrics)

    records.append(
        (module_hash, epochs, raw_adjacency, raw_operations, raw_metrics))

with open(dataset_file + '.pkl', 'wb') as f:
    pickle.dump(records, f)
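# Hedged decode sketch for one raw record, mirroring the commented-out lines above and
# the conversion in the other script: the adjacency string is a flattened 0/1 matrix
# and raw_metrics is a base64-encoded ModelMetrics protobuf. The helper name is
# illustrative only.
def decode_record(record):
    module_hash, epochs, raw_adjacency, raw_operations, raw_metrics = record
    dim = int(np.sqrt(len(raw_adjacency)))
    adjacency = np.reshape(
        np.array([int(e) for e in raw_adjacency], dtype=np.int8), (dim, dim))
    operations = raw_operations.split(',')
    metrics_bytes = base64.b64decode(raw_metrics)  # parse with model_metrics_pb2 if available
    return module_hash, epochs, adjacency, operations, metrics_bytes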
def main():
    # accept search and eval scripts to run
    # config file can be supplied using --config
    parser = argparse.ArgumentParser(description='NAS E2E Runs')
    parser.add_argument('--search-script', type=str,
                        default='scripts/darts/cifar_search.py',
                        help='Search script to run')
    parser.add_argument('--eval-script', type=str,
                        default='scripts/darts/cifar_eval.py',
                        help='Eval script to run')
    parser.add_argument('--exp_prefix', type=str, default='darts',
                        help='Experiment prefix to use')
    args, extra_args = parser.parse_known_args()

    # load config to get some of the settings like logdir
    conf = common_init(use_args=True)
    logdir = get_conf_common()['logdir']
    assert logdir

    # get script, resume flag and experiment dir for search
    search_script = args.search_script
    resume = conf['nas']['search']['resume']
    search_script = utils.full_path(search_script.strip())
    experiment_name = args.exp_prefix + '_' + Path(search_script).stem
    experiment_dir = os.path.join(logdir, experiment_name)

    # see if search has already produced the output
    final_desc_filepath = os.path.join(
        experiment_dir, conf['nas']['search']['final_desc_filename'])
    if not resume or not os.path.exists(final_desc_filepath):
        print(f'Starting {search_script}...')
        result = subprocess.run([
            'python', search_script,
            '--config', conf.config_filepath,
            '--config-defaults', conf.config_defaults_filepath,
            '--common.experiment_name', experiment_name
        ])
        print(f'Script {search_script} returned {result.returncode}')
        if result.returncode != 0:
            exit(result.returncode)
    else:
        print(f'Search is skipped because file {final_desc_filepath} already exists')

    # get script, resume flag and experiment dir for eval
    eval_script = args.eval_script
    resume = conf['nas']['eval']['resume']
    eval_script = utils.full_path(eval_script.strip())
    experiment_name = args.exp_prefix + '_' + Path(eval_script).stem
    experiment_dir = os.path.join(logdir, experiment_name)

    # if eval has already produced the output, skip eval run
    model_filepath = os.path.join(experiment_dir,
                                  conf['nas']['eval']['save_filename'])
    if not resume or not os.path.exists(model_filepath):
        # copy output of search to eval folder
        # TODO: take final_desc_filename from eval config
        os.makedirs(experiment_dir, exist_ok=True)
        shutil.copy2(final_desc_filepath, experiment_dir)

        print(f'Starting {eval_script}...')
        result = subprocess.run([
            'python', eval_script,
            '--config', conf.config_filepath,
            '--config-defaults', conf.config_defaults_filepath,
            '--common.experiment_name', experiment_name
        ])
        print(f'Script {eval_script} returned {result.returncode}')
        if result.returncode != 0:
            exit(result.returncode)
    else:
        print(f'Eval is skipped because file {model_filepath} already exists')

    print('Search and eval done.')
    exit(0)
def __init__(self, conf_dataset: Config):
    super().__init__(conf_dataset)
    self._dataroot = utils.full_path(conf_dataset['dataroot'])
def main():
    parser = argparse.ArgumentParser(description='Report creator')
    parser.add_argument('--results-dir', '-d', type=str,
                        #default=r'D:\GitHubSrc\archaiphilly\phillytools\darts_baseline_20200411',
                        default=r'~/logdir/report_test',
                        help='folder with experiment results from pt')
    parser.add_argument('--out-dir', '-o', type=str, default=r'~/logdir/reports',
                        help='folder to output reports')
    args, extra_args = parser.parse_known_args()

    # root dir where all results are stored
    results_dir = pathlib.Path(utils.full_path(args.results_dir))
    print(f'results_dir: {results_dir}')

    # extract experiment name which is top level directory
    exp_name = results_dir.parts[-1]

    # create results dir for experiment
    out_dir = utils.full_path(os.path.join(args.out_dir, exp_name))
    print(f'out_dir: {out_dir}')
    os.makedirs(out_dir, exist_ok=True)

    # get list of all structured logs for each job
    logs = {}
    job_count = 0
    for job_dir in results_dir.iterdir():
        job_count += 1
        for subdir in job_dir.iterdir():
            if not subdir.is_dir():
                continue
            # currently we expect that each job was an ExperimentRunner job which
            # should have _search or _eval folders
            if subdir.stem.endswith('_search'):
                sub_job = 'search'
            elif subdir.stem.endswith('_eval'):
                sub_job = 'eval'
            else:
                raise RuntimeError(f'Sub directory "{subdir}" in job "{job_dir}" must '
                                   'end with either _search or _eval which '
                                   'should be the case if ExperimentRunner was used.')

            logs_filepath = os.path.join(str(subdir), 'logs.yaml')
            if os.path.isfile(logs_filepath):
                fix_yaml(logs_filepath)
                with open(logs_filepath, 'r') as f:
                    key = job_dir.name + ':' + sub_job
                    logs[key] = yaml.load(f, Loader=yaml.Loader)

    # create list of epoch nodes having the same path in the logs
    grouped_logs = group_multi_runs(logs)
    collated_grouped_logs = collect_epoch_nodes(grouped_logs)
    summary_text, details_text = '', ''

    for log_key, grouped_logs in collated_grouped_logs.items():
        # for each path to epoch nodes, compute stats
        for node_path, logs_epochs_nodes in grouped_logs.items():
            collated_epoch_stats = get_epoch_stats(node_path, logs_epochs_nodes)
            summary_text += get_summary_text(log_key, out_dir, node_path,
                                             collated_epoch_stats, len(logs_epochs_nodes))
            details_text += get_details_text(log_key, out_dir, node_path,
                                             collated_epoch_stats, len(logs_epochs_nodes))

    write_report('summary.md', **vars())
    write_report('details.md', **vars())
def _train_dist(evaluater: Evaluater, conf_eval: Config,
                model_desc_builder: ModelDescBuilder,
                model_desc_filename: str, common_state) -> ConvexHullPoint:
    """Train a given model"""

    common.init_from(common_state)

    # region config vars
    conf_model_desc = conf_eval['model_desc']
    max_cells = conf_model_desc['n_cells']
    conf_checkpoint = conf_eval['checkpoint']
    resume = conf_eval['resume']
    conf_petridish = conf_eval['petridish']
    cell_count_scale = conf_petridish['cell_count_scale']
    # endregion

    # register ops as we are in a different process now
    model_desc_builder.pre_build(conf_model_desc)

    model_filename = utils.append_to_filename(model_desc_filename, '_model', '.pt')
    full_desc_filename = utils.append_to_filename(model_desc_filename, '_full', '.yaml')
    metrics_filename = utils.append_to_filename(model_desc_filename, '_metrics', '.yaml')
    model_stats_filename = utils.append_to_filename(model_desc_filename, '_model_stats', '.yaml')

    # create checkpoint for this specific model desc by changing the config
    checkpoint = None
    if conf_checkpoint is not None:
        conf_checkpoint['filename'] = model_filename.split('.')[0] + '_checkpoint.pth'
        checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume)

        if checkpoint is not None and resume:
            if 'metrics_stats' in checkpoint:
                # return the output we had recorded in the checkpoint
                convex_hull_point = checkpoint['metrics_stats']
                return convex_hull_point

    # template model is what we used during the search
    template_model_desc = ModelDesc.load(model_desc_filename)

    # we first scale this model by number of cells, keeping reductions the same as in search
    n_cells = math.ceil(len(template_model_desc.cell_descs()) * cell_count_scale)
    n_cells = min(n_cells, max_cells)

    conf_model_desc = copy.deepcopy(conf_model_desc)
    conf_model_desc['n_cells'] = n_cells
    conf_model_desc['n_reductions'] = n_reductions = \
        template_model_desc.cell_type_count(CellType.Reduction)

    model_desc = model_desc_builder.build(conf_model_desc,
                                          template=template_model_desc)
    # save desc for reference
    model_desc.save(full_desc_filename)

    model = evaluater.model_from_desc(model_desc)

    train_metrics = evaluater.train_model(conf_eval, model, checkpoint)
    train_metrics.save(metrics_filename)

    # get metrics_stats
    model_stats = nas_utils.get_model_stats(model)
    # save metrics_stats
    with open(model_stats_filename, 'w') as f:
        yaml.dump(model_stats, f)

    # save model
    if model_filename:
        model_filename = utils.full_path(model_filename)
        ml_utils.save_model(model, model_filename)
        # TODO: Causes logging error at random times. Commenting out as stop-gap fix.
        # logger.info({'model_save_path': model_filename})

    hull_point = ConvexHullPoint(
        JobStage.EVAL_TRAINED, 0, 0, model_desc,
        (n_cells, n_reductions, len(model_desc.cell_descs()[0].nodes())),
        metrics=train_metrics, model_stats=model_stats)

    if checkpoint:
        checkpoint.new()
        checkpoint['metrics_stats'] = hull_point
        checkpoint.commit()

    return hull_point
def main():
    parser = argparse.ArgumentParser(description='Pytorch cifar training')
    parser.add_argument('--experiment-name', '-n', default='train_pytorch')
    parser.add_argument('--experiment-description', '-d',
                        default='Train cifar using pure PyTorch code')
    parser.add_argument('--epochs', '-e', type=int, default=108)
    parser.add_argument('--model-name', '-m', default='5')
    parser.add_argument('--device', default='',
                        help='"cuda" or "cpu" or "" in which case use cuda if available')
    parser.add_argument('--train-batch-size', '-b', type=int, default=256)
    parser.add_argument('--test-batch-size', type=int, default=256)
    parser.add_argument('--seed', '-s', type=float, default=42)
    parser.add_argument('--half', type=lambda x: x.lower() == 'true',
                        nargs='?', const=True, default=False)
    parser.add_argument('--cutout', type=int, default=0)
    parser.add_argument('--grad-clip', type=float, default=5.0)
    parser.add_argument('--datadir', default='',
                        help='where to find dataset files, default is ~/torchvision_data_dir')
    parser.add_argument('--outdir', default='',
                        help='where to put results, default is ~/logdir')
    parser.add_argument('--loader-workers', type=int, default=-1,
                        help='number of thread/workers for data loader (-1 means auto)')

    args = parser.parse_args()

    if not args.datadir:
        args.datadir = common.default_dataroot()
    nsds_dir = args.datadir
    if os.environ.get('PT_DATA_DIR', ''):
        nsds_dir = os.environ.get('PT_DATA_DIR')

    if not args.outdir:
        args.outdir = os.environ.get('PT_OUTPUT_DIR', '')
    if not args.outdir:
        args.outdir = os.path.join('~/logdir', 'nasbench101', args.experiment_name)

    assert isinstance(nsds_dir, str)

    expdir = utils.full_path(args.outdir)
    os.makedirs(expdir, exist_ok=True)

    utils.setup_cuda(args.seed)
    datadir = utils.full_path(args.datadir)
    os.makedirs(datadir, exist_ok=True)

    utils.create_logger(filepath=os.path.join(expdir, 'logs.log'))

    # log config for reference
    logging.info(f'exp_name="{args.experiment_name}", exp_desc="{args.experiment_description}"')
    logging.info(f'model_name="{args.model_name}", seed={args.seed}, epochs={args.epochs}')
    logging.info(f'half={args.half}, cutout={args.cutout}')
    logging.info(f'datadir="{datadir}"')
    logging.info(f'expdir="{expdir}"')
    logging.info(f'train_batch_size={args.train_batch_size}')

    if args.device:
        device = torch.device(args.device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    nsds = Nasbench101Dataset(
        os.path.join(nsds_dir, 'nasbench_ds', 'nasbench_full.pkl'))

    # load data just before train start so any errors so far are not delayed
    train_dl, val_dl, test_dl = get_data(
        datadir=datadir,
        train_batch_size=args.train_batch_size,
        test_batch_size=args.test_batch_size,
        train_num_workers=args.loader_workers,
        test_num_workers=args.loader_workers,
        cutout=args.cutout)

    model_id = int(args.model_name)  # 5, 401, 4001, 40001, 400001
    epochs = args.epochs

    net = create_model(nsds, model_id, device, args.half)
    crit = create_crit(device, args.half)
    optim, sched, sched_on_epoch = optim_sched_darts(net, epochs)  # optim_sched_darts optim_sched_paper

    train_metrics = train(epochs, train_dl, val_dl, net, device, crit, optim,
                          sched, sched_on_epoch, args.half, False,
                          grad_clip=args.grad_clip)
    test_acc = test(net, test_dl, device, args.half)
    log_metrics(expdir, f'metrics_{model_id}', train_metrics, test_acc, args,
                nsds, model_id)