def eval_classification(model, sessinit, dataflow):
    pred_config = PredictConfig(
        model=model,
        session_init=sessinit,
        input_names=['input', 'label'],
        output_names=['wrong-top1', 'bn5/output:0', 'conv5/output:0']
    )
    acc1 = RatioCounter()
    pred = FeedfreePredictor(
        pred_config, StagingInput(QueueInput(dataflow), device='/gpu:0'))
    for _ in tqdm.trange(dataflow.size()):
        top1, afbn5, beforbn5 = pred()
        dic = {}
        dic['bn5/output:0'] = afbn5
        dic['conv5/output:0'] = beforbn5
        batch_size = top1.shape[0]
        acc1.feed(top1.sum(), batch_size)
    # Save the activations of the last batch.
    save_dir = logger.get_logger_dir()
    fname = os.path.join(save_dir, 'afbn5-{}.npz'.format(int(time.time())))
    np.savez(fname, **dic)
    print("Top1 Error: {}".format(acc1.ratio))
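# Usage sketch (hedged): `model` is expected to be a tensorpack ModelDesc that
# defines the 'input'/'label' placeholders and the 'wrong-top1', 'bn5/output',
# 'conv5/output' tensors read above. SaverRestore is tensorpack's standard
# checkpoint loader; the wrapper name here is a hypothetical illustration.
def run_eval(model, checkpoint_path, val_dataflow):
    from tensorpack.tfutils.sessinit import SaverRestore
    eval_classification(model, SaverRestore(checkpoint_path), val_dataflow)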
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)
    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']
    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'], trainer_params['num_parallel'])
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            # Keep the checkpoint with the lowest validation loss.
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
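# A hedged sketch of the nested `params` structure the train_* functions
# consume. Key names are inferred from the accesses above; concrete values
# are illustrative. The real object must also provide .to_file() (used
# above), so this tiny dict subclass is a hypothetical stand-in.
import json

class ExampleParams(dict):
    def to_file(self, path):
        with open(path, 'w') as f:
            json.dump(self, f)

example_params = ExampleParams({
    'dataset': {'path': 'data/toy', 'train_val_split': 0.9},
    'model': {'image_shape': [32, 32, 3]},
    'trainer': {
        'batch_size': 64,
        'num_parallel': 4,
        'steps_per_epoch': 500,
        'max_epochs': 100,
    },
})
# e.g. train_vqvae(example_params, 'toy', 'train_log/vqvae')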
def server_handle_critic_message(msg_output, controller, mi_info, options):
    """
    Petridish server handles the return message of a forked process
    that watches over a critic job.
    """
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    queues = controller.queues
    queue_name, new_ci = msg_output
    is_fail, _ = is_mark_failure(
        _ci_to_dn(log_dir_root, new_ci, queue_name))
    if is_fail:
        logger.info('Failed {} ci={}'.format(queue_name, new_ci))
        return
    logger.info('Updating w/ msg of CRITIC {} ci={}'.format(
        queue_name, new_ci))
    # Load the new critic.
    ctrl_dn = _ci_to_dn(model_dir_root, new_ci, queue_name)
    controller.update_predictor(ctrl_dn, queue_name)
    # Since we have a new critic model, remove older ones if they exist.
    ctrl_dns = [_ci_to_dn(model_dir_root, ci, queue_name)
                for ci in range(new_ci + 1 - controller.n_critic_procs)]
    for ctrl_dn in filter(os.path.exists, ctrl_dns):
        logger.info('rm -rf {}'.format(ctrl_dn))
        _ = subprocess.check_output(
            'rm -rf {} &'.format(ctrl_dn), shell=True)
    # Sort the affected queue.
    logger.info('Ordering queue {}...'.format(queue_name))
    queue = queues[queue_name]
    controller.update_queue(queue, mi_info)
    logger.info('... done ordering')
def __init__(self, max_to_keep=10,
             keep_checkpoint_every_n_hours=0.5,
             checkpoint_dir=None,
             var_collections=None):
    """
    Args:
        max_to_keep (int): the same as in ``tf.train.Saver``.
        keep_checkpoint_every_n_hours (float): the same as in
            ``tf.train.Saver``. Note that "keep" does not mean "create",
            but means "don't delete".
        checkpoint_dir (str): Defaults to ``logger.get_logger_dir()``.
        var_collections (str or list of str): collection of the variables
            (or list of collections) to save.
    """
    if var_collections is None:
        var_collections = [tf.GraphKeys.GLOBAL_VARIABLES]
    self._max_to_keep = max_to_keep
    self._keep_every_n_hours = keep_checkpoint_every_n_hours
    if not isinstance(var_collections, list):
        var_collections = [var_collections]
    self.var_collections = var_collections
    if checkpoint_dir is None:
        checkpoint_dir = logger.get_logger_dir()
    if checkpoint_dir is not None:
        if not tf.gfile.IsDirectory(checkpoint_dir):  # v2: tf.io.gfile.isdir
            tf.gfile.MakeDirs(checkpoint_dir)  # v2: tf.io.gfile.makedirs
    self.checkpoint_dir = checkpoint_dir
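# Usage sketch: like tensorpack's ModelSaver, this saver is meant to be used
# as a callback inside a train config; the directory name is illustrative.
saver = ModelSaver(max_to_keep=5,
                   keep_checkpoint_every_n_hours=1.0,
                   checkpoint_dir='train_log/experiment')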
def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log.
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # Job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp
    if (search_depth // 2 < options.max_growth and
            (options.search_max_flops is None or
             fp < options.search_max_flops)):
        controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # Choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent.
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1, min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info.
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)
    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'],
            trainer_params['batch_size'],
            trainer_params['num_parallel'])
    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(
        os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path
    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds, infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train_image_embedding_softmax(params, checkpoint_dir,
                                  recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)
    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False,
        dataset_params['train_val_split'],
        trainer_params['batch_size'],
        trainer_params['num_parallel'])
    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(
        os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status',
                     ['loss', 'accuracy',
                      'validation_loss', 'val-correct_prediction'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)
    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    train_ds, val_ds, sample_train, sample_val, _, _ = get_dataflow(
        dataset_params['path'],
        dataset_params['binarizer'],
        dataset_params['train_val_split'],
        trainer_params['batch_size'],
        trainer_params['num_parallel'])
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))
    latent_dim = model_params['latent_dim']

    model = BaseVAE.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train_image_embedding_triplet(params, checkpoint_dir,
                                  recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)
    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    train_ds = get_triplet_dataflow(
        dataset_params['path'],
        trainer_params['items_per_batch'],
        trainer_params['images_per_item'],
        trainer_params['num_parallel'])
    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(
        os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def __init__(self):
    self.model_dir = logger.get_logger_dir()
def crawl_local_auto_scripts_and_launch(auto_dir, nr_gpu=1, launcher="",
                                        n_parallel=10000,
                                        num_init_use_all_gpu=2):
    """
    Process overview: if the resource limit allows (see n_parallel below),
    we launch a job. We first copy the job script into our local log dir to
    prevent other processes from also launching it, and remove the script
    from the original auto_dir after copying. If either the "copy" or the
    "remove" fails, another process already owns the job, and we abort by
    removing the job from our log dir. If we do own the job script in our
    log dir, we launch it by translating its arguments into a command
    "( ... && python xxxx ; rm -rf xxx.sh) &", which is run in a separate
    process. The trailing "...; rm xx.sh" releases the resource counted by
    n_parallel, regardless of the status of the remote job.

    Args:
        auto_dir (str): where to look for auto_scripts.
        nr_gpu (int): the number of GPUs the crawler can use.
            A round-robin schedule is used.
        launcher (str): the name of the launcher, used for the logger.
        n_parallel (int): max number of parallel jobs. We count the number
            of running jobs via the number of .sh files in the launcher's
            own log dir, so it is IMPERATIVE for each launched job to
            remove its own .sh after finishing, regardless of
            success/failure. Bugged xx.sh are renamed to xx.sh.fixme to
            avoid resource leaks.
        num_init_use_all_gpu (int): if mi < num_init_use_all_gpu, the job
            will use all available GPUs. This makes the initial jobs faster.
    """
    device = -1
    while True:
        time.sleep(1)
        if os.path.exists(auto_dir):
            break
    logger.info("Found the auto_dir {}".format(auto_dir))
    launch_log = logger.get_logger_dir()

    # Python 2 vs 3 compatibility.
    check_errno = False
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = OSError
        check_errno = True
    logger.info("Crawler check_errno = {}".format(check_errno))

    def _newFileNotFound():
        e = FileNotFoundError()
        e.errno = errno.ENOENT
        return e

    def _isFileNotFound(e):
        if hasattr(e, 'errno') and e.errno is not None \
                and e.errno == errno.ENOENT:
            return True
        if check_errno:
            return False
        return isinstance(e, FileNotFoundError)

    while True:
        time.sleep(np.random.uniform(low=1.0, high=5.0))
        n_running = len(list(filter(
            lambda x: x.endswith('.sh'), os.listdir(launch_log))))
        if n_running >= n_parallel:
            continue
        l_scripts = os.listdir(auto_dir)
        np.random.shuffle(l_scripts)
        for script in l_scripts:
            if script.endswith('.lck'):
                # This is a lock file; ignore.
                continue
            auto_script = os.path.join(auto_dir, script)
            auto_script_tmp = os.path.join(launch_log, script)
            lock = auto_script + '.lck'
            if os.path.exists(lock):
                # Someone else has already locked the file; ignore.
                continue
            if not os.path.exists(auto_script):
                # Someone else has already removed the script; ignore.
                continue
            try:
                with open(lock, 'wt'):
                    shutil.copyfile(auto_script, auto_script_tmp)
                    if not os.path.exists(auto_script):
                        # This is important: it makes sure that pycmd is
                        # valid. Remove the tmp if we are not the first.
                        os.remove(auto_script_tmp)
                        raise _newFileNotFound()
                    # This may raise due to a race: all processes could
                    # raise here because of strange interactions.
                    os.remove(auto_script)
            except Exception as e:
                if _isFileNotFound(e):
                    # Someone else removed the auto_script before we did,
                    # so that process gets to launch.
                    if os.path.exists(auto_script_tmp):
                        os.remove(auto_script_tmp)
                    logger.info("Race on script {}".format(script))
                else:
                    logger.info("Crazy race on {} : {} : {}".format(
                        script, e.__class__, e))
                    # Other errors mean a race inside os.remove, i.e.,
                    # maybe none of the processes succeeded in removing
                    # the script, so all of them launch.
            while os.path.exists(lock):
                # Every process that opened the lock should attempt
                # to remove it.
                try:
                    os.remove(lock)
                    break
                except:
                    logger.info("Race on rm lock of {}".format(script))
            # This file is only accessible by the current launcher;
            # no need to lock.
            if os.path.exists(auto_script_tmp):
                # Translate the script into a local command.
                pycmd, n_job_gpu = script_to_local_cmd(
                    auto_script_tmp, nr_crawler_gpu=nr_gpu,
                    num_init_use_all_gpu=num_init_use_all_gpu)
                if pycmd is None:
                    logger.info("FIXME: {} failed on {}".format(
                        launcher, script))
                    # Rename it so it no longer counts toward the
                    # resource limit.
                    os.rename(auto_script_tmp, auto_script_tmp + '.fixme')
                    continue
                visible_gpus = os.environ.get('CUDA_VISIBLE_DEVICES', None)
                if visible_gpus:
                    visible_gpus = visible_gpus.strip().split(',')
                    assert len(visible_gpus) >= nr_gpu, \
                        '{} != {}'.format(len(visible_gpus), nr_gpu)
                else:
                    visible_gpus = [str(gpu_id) for gpu_id in range(nr_gpu)]
                job_device = []
                for _ in range(n_job_gpu):
                    device = (device + 1) % nr_gpu
                    job_device.append(visible_gpus[device])
                job_device = ','.join(job_device)
                cmd = ('(export CUDA_VISIBLE_DEVICES="{device}" && {pycmd} '
                       '>> {out_fn} ; rm -rf {script}) &').format(
                    device=job_device,
                    pycmd=pycmd,
                    out_fn=os.path.join(launch_log, 'remote_stdout.txt'),
                    script=auto_script_tmp)
                logger.info("Launch job {} on GPU {} by {}".format(
                    script, job_device, launcher))
                # Launch the script in a different process.
                subprocess.call(cmd, shell=True)
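# A minimal invocation sketch, assuming the crawler runs as a standalone
# worker process on a 4-GPU machine; the directory names are illustrative.
logger.set_logger_dir('train_log/crawler_0')
crawl_local_auto_scripts_and_launch(
    auto_dir='/shared/petridish/auto_scripts',
    nr_gpu=4,
    launcher='crawler_0',
    n_parallel=8)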
    temp = temp[keys[i]]
temp[keys[-1]] = value

# Set which GPUs are visible.
if config['gpu'] in [None, 'None', '']:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    num_gpu = 0
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
    num_gpu = max(get_num_gpu(), 1)
config['num_gpu'] = num_gpu

# Set the log directory.
if config['logdir'] in [None, 'None', '']:
    logger.auto_set_dir()
else:
    logger.set_logger_dir('train_log/' + config['logdir'], action='d')

# Save the configuration.
with open(logger.get_logger_dir() + '/config.json', 'w') as outfile:
    json.dump(config, outfile)

# Get the train config.
train_config = get_train_config(config)

# Train the model.
if num_gpu > 1:
    launch_train_with_config(train_config,
                             SyncMultiGPUTrainerReplicated(num_gpu))
else:
    launch_train_with_config(train_config, SimpleTrainer())
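# The truncated fragment at the top of this block walks a list of keys into
# a nested config dict and overwrites the leaf. A self-contained sketch of
# that pattern, with a hypothetical helper name:
def set_nested_key(config, keys, value):
    temp = config
    for key in keys[:-1]:
        temp = temp[key]
    temp[keys[-1]] = value

# e.g. set_nested_key(config, ['optimizer', 'lr'], 0.01)
# overrides config['optimizer']['lr'].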
parser = argparse.ArgumentParser()
add_app_arguments(parser)
add_model_arguments(parser)
add_controller_arguments(parser)
DiversityOptions.add_parser_arguments(parser)
PetridishRecover.add_parser_arguments(parser)
args, unknown = parser.parse_known_args()
args = model_options_processing(args)

if args.job_type == 'main':
    # The log is set to be {args.log_dir}/petridish_main/log.log
    # FIXME: A bit weird that some of the utils are in the ANN repo.
    # Might be good to refactor.
    ann_app_utils.log_init(args, None)
    logger.info(
        "App has the following unknown arguments : {}".format(unknown))
    log_dir = logger.get_logger_dir()

    # Update nr_gpu-related params based on the runtime config and all
    # containers.
    # FIXME: The block below is Philly-specific, but that is okay: if we
    # are not running on Philly, the functions below check for the Philly
    # environment and skip automatically.
    runtime_config = get_runtime_config()
    args.total_nr_gpu = get_total_nr_gpu(config=runtime_config)
    logger.info("Main has access to {} gpus".format(args.total_nr_gpu))

    # Update more nr_gpu-related params based on the local container.
    cinfo = local_container_info(config=runtime_config)
    container_id = get_container_index(cinfo)
    logger.info("Container index = {}".format(container_id))
    if cinfo is not None:
        nr_gpu = get_container_nr_gpu(cinfo)
        args.nr_gpu = nr_gpu
def __init__(self, model_name):
    self.model_dir = logger.get_logger_dir()
    self.model_name = model_name
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Special replacement of server_handle_child_message for experimenting
    on soft init vs. hard init. This is for experiments only.
    TODO: reuse code with the regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log.
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # Job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp
    if search_depth > 0:
        return curr_iter
    # For the soft-vs-hard experiment, only the root generates hallucinations.
    controller.n_hallu_per_parent_on_idle = 1
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # Choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent.
        net_info_parent = net_info_from_str(model_str)
        # This experiment only creates one hallucination from the root.
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)
        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info.
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
    Server entrance/main.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (mi_info, ipc, qname_to_pool, philly_wa,
     curr_iter, critic_iter, n_recv,
     n_last_train, n_last_mi_save) = server_init(controller, options)
    # Useful aliases:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)
    # Message handlers.
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # Server main loop.
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers.
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Populate workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # Log the pop order of models; important for analysis.
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(
                    ipc=ipc,
                    options=model_options,
                    log_dir=_mi_to_dn(log_dir_root, model_iter),
                    child_dir=_mi_to_dn(model_dir_root, model_iter),
                    prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                    model_str=model_str,
                    model_iter=model_iter,
                    parent_iter=parent_iter,
                    search_depth=search_depth,
                    job_type=job_type)

        # Launch critic sleepers.
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci - 1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options, curr_iter=curr_iter)
        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options,
                n_idle=n_idle, curr_iter=curr_iter)
        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options)

        ## Periodic log/heartbeat and exits.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for training the critic,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # We have explored enough models; quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
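# Usage sketch: the optional *_handle arguments let callers swap in custom
# message handlers, e.g. the soft-vs-hard experimental handler defined above.
# The `controller` and `options` objects come from the surrounding setup code.
def run_soft_vs_hard_experiment(controller, options):
    server_main(controller, options,
                child_handle=server_handle_child_message_soft_vs_hard)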
def server_init(controller, options):
    """
    Initialize params for the server.
    """
    # Names and static/fixed info.
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir

    # Queues.
    queue_names, _queues = controller.init_queues()
    (q_parent, q_hallu, q_child) = (
        controller.q_parent, controller.q_hallu, controller.q_child)
    qname_to_pool = {
        q_child.name: TRAIN_CRITIC_MODEL,
        q_hallu.name: TRAIN_CRITIC_HALLU,
        q_parent.name: TRAIN_CRITIC_PARENT}

    mi_info = []
    if is_debug(options):
        prev_log_root = log_dir_root
        prev_model_root = model_dir_root
    else:
        prev_log_root = previous_trial_log_root(log_dir_root)
        prev_model_root = previous_trial_model_root(model_dir_root)
    is_success = False
    while prev_log_root and prev_model_root:
        logger.info("prev_log_root=\"{}\" && prev_model_root=\"{}\"".format(
            prev_log_root, prev_model_root))
        is_success = controller.recover.recover(
            prev_log_root=prev_log_root, log_root=log_dir_root,
            prev_model_root=prev_model_root, model_root=model_dir_root,
            q_parent=q_parent, q_hallu=q_hallu, q_child=q_child,
            mi_info=mi_info)
        if is_success:
            critic_iter = controller.init_predictors(
                prev_log_root, prev_model_root)
            break
        prev_log_root = previous_trial_log_root(prev_log_root)
        prev_model_root = previous_trial_model_root(prev_model_root)
    if not is_success:
        # The controller initializes predictors from scratch.
        critic_iter = controller.init_predictors(log_dir_root, model_dir_root)

    if len(mi_info) == 0:
        if options.net_info:
            l_init_net_info = [options.net_info]
            # Delete this info because:
            # 1. The options are to be used by future children, and we want
            #    to remove unnecessary params.
            # 2. Having both causes multiple occurrences of net_info_str
            #    on children scripts, which causes bugs.
            delattr(options, 'net_info')
            options.net_info_str = None
        else:
            l_init_net_info = controller.initial_net_info()
        for mi, net_info in enumerate(l_init_net_info):
            mstr = net_info.to_str()
            # Per-model info kept on the server for each model_iter; used
            # for critic features. In order: mi, pi, sd, fp, ve, mstr, stats.
            mi_info.append(ModelSearchInfo(mi, mi, 0, None, 2.0, mstr, [0.0]))
            controller.add_one_to_queue(q_child, mi_info, mi, net_info)

    # Job counters.
    curr_iter = len(mi_info) - 1
    # Queue-related counters. Model counters and pool resources are
    # reset upon reboot for now.
    n_recv = dict([(qname, 0) for qname in queue_names])
    n_last_train = dict([(qname, 0) for qname in queue_names])
    n_last_mi_save = 0

    # IPC.
    pool_sizes = [0] * NUM_POOLS
    pool_sizes[TRAIN_HALLU] = controller.n_hallu_procs
    pool_sizes[TRAIN_MODEL] = controller.n_model_procs
    for qname in qname_to_pool:
        pool_sizes[qname_to_pool[qname]] = controller.n_critic_procs
    ipc = PetridishServerIPC(pool_sizes, hwm=50)
    ipc.initialize()

    # Server progress workaround: wakes up once in a while to advance the
    # progress bar a little.
    philly_wa = PhillyHeartBeatWorkAround(max_cnt=options.max_exploration)
    return (
        mi_info, ipc, qname_to_pool, philly_wa,
        curr_iter, critic_iter, n_recv, n_last_train, n_last_mi_save)
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log.
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # Job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'], jr['l_stats'],
        jr['l_op_indices'], jr['l_op_omega'])
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    ## Compute hallucination-related info in net_info.
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination()  # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # Feature selection based on params.
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select a subset of hallucinations to add to the child model.
    l_selected = []
    # Sort the indices 0, 1, 2, ..., n_hallu-1 by -cos(grad, hallu).
    processed_stats = [process_hallu_stats_for_critic_feat([stats])
                       for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))
    # Greedy selection with gradient boosting.
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i: -processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # Choose randomly among the greedy candidates.
            l_greedy_selected = list(np.random.choice(
                l_greedy_selected, options.n_greed_select_per_init,
                replace=False))
    # Randomly select a subset.
    l_random_selected = []
    if options.n_rand_select_per_init:
        # Also try some random samples.
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)), options.n_hallus_per_select),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## For each selected subset of hallucinations, make a model for q_child.
    # Since more recent entries tend to be better, we insert in reverse
    # order, so the greedy ones are inserted later.
    for selected in reversed(l_selected):
        # New model description.
        child_info = copy.deepcopy(net_info)
        l_hi = [hallu_indices[s] for s in selected]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stats.
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # Update mi_info.
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth + 1,
            None, None, child_str, stat))
        controller.add_one_to_queue(
            q_child, mi_info, curr_iter, child_info)
    return curr_iter