def model_desc_fn_to_option(fullname, ds_name, options, use_latest_input=False,
                            aux_weight=0.4, depth_multiplier=[1]):
    assert os.path.exists(fullname), fullname
    with open(fullname, 'rt') as fin:
        lines = fin.readlines()
    assert lines, 'file is empty'
    line = lines[0].strip()
    assert line, 'the net info line (first line) is empty'
    options.ds_name = ds_name
    if ds_name == 'cifar10' or ds_name == 'cifar100':
        is_ilsvrc = False
        options = cifar_default_train_options(options)
    elif ds_name == 'imagenet' or ds_name == 'ilsvrc':
        is_ilsvrc = True
        options = imagenet_mobile_default_train_options(options)
    else:
        # Fail loudly instead of hitting a NameError on is_ilsvrc below.
        raise ValueError('Unknown ds_name: {}'.format(ds_name))
    net_info = net_info_from_str(line)
    if isinstance(depth_multiplier, int):
        depth_multiplier = [depth_multiplier]
    if any([_x > 1 for _x in depth_multiplier]):
        net_info = increase_net_info_size(net_info, depth_multiplier)
    if is_ilsvrc:
        net_info = net_info_cifar_to_ilsvrc(
            net_info, options.s_type, use_latest_input)
    if aux_weight > 0:
        # Use the aux_weight argument rather than a hard-coded 0.4.
        net_info = add_aux_weight(net_info, aux_weight)
    options.net_info = net_info
    return options
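# Usage sketch (hypothetical file name; not part of the original module):
# convert a model-description file into fully populated training options.
# `args` is assumed to be the argparse.Namespace produced by this app's CLI.
#
#   args = model_desc_fn_to_option(
#       fullname='model_desc.txt', ds_name='cifar10', options=args,
#       use_latest_input=False, aux_weight=0.4, depth_multiplier=[1, 2])
#   # args.net_info now holds the parsed (and possibly deepened) network.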
def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp
    if (search_depth // 2 < options.max_growth and
            (options.search_max_flops is None or
             fp < options.search_max_flops)):
        controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent.
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1, min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
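# A minimal, self-contained sketch of the growth gate above. The helper name
# `_should_grow` is hypothetical; it only mirrors the condition under which a
# finished child is re-enqueued as a parent. Since each growth cycle bumps
# search_depth twice (once for the hallucination, once for the child), the
# halving plausibly counts completed growth cycles.
def _should_grow(search_depth, fp, max_growth, search_max_flops=None):
    # Grow only while depth and FLOPs are under budget.
    return (search_depth // 2 < max_growth
            and (search_max_flops is None or fp < search_max_flops))

# e.g., _should_grow(3, fp=2.5e8, max_growth=4, search_max_flops=5e8) -> True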
def model_options_processing(options):
    """
    Populate some complicated default arguments, and parse
    comma-separated int lists into int lists.
    """
    if options.net_info_str is None:
        options.net_info = None
        return options
    if isinstance(options.net_info_str, str):
        try:
            options.net_info = net_info_from_str(options.net_info_str)
        except:
            logger.info("Failed info str is:\n{}".format(
                options.net_info_str))
            raise
    return options
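# The docstring above mentions parsing comma-separated int lists, while the
# body shown here only handles net_info_str. A minimal helper for that kind
# of parsing might look like the following (hypothetical name, not part of
# the original module):
def _parse_int_list(csv_str):
    # '1,2,3' -> [1, 2, 3]; None and '' pass through as None.
    if not csv_str:
        return None
    return [int(tok) for tok in csv_str.split(',')]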
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Special replacement of server_handle_child_message for experimenting
    on soft init vs. hard init. This is for experiment only.
    TODO reuse code with regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp
    if search_depth > 0:
        return curr_iter
    # For the soft-vs-hard experiment, only the root generates hallucinations.
    controller.n_hallu_per_parent_on_idle = 1
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent. This experiment creates only
        # one set of hallucinations from the root, shared by both methods.
        net_info_parent = net_info_from_str(model_str)
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)
        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
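# Note on the experiment above: both iterations of the 'hard'/'soft' loop
# reuse the exact same sampled hallucinations on deep copies of the same
# parent, so the only difference between the two enqueued models is the
# net-morphism initialization method. This keeps the comparison controlled.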
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
    Server entrance/main.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (
        mi_info, ipc, qname_to_pool, philly_wa,
        curr_iter, critic_iter, n_recv, n_last_train, n_last_mi_save
    ) = server_init(controller, options)
    # useful aliases:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)
    # message handlers
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # server main loop
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers.
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Populate workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # Log the pop order of models; important for analysis.
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(
                    ipc=ipc,
                    options=model_options,
                    log_dir=_mi_to_dn(log_dir_root, model_iter),
                    child_dir=_mi_to_dn(model_dir_root, model_iter),
                    prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                    model_str=model_str,
                    model_iter=model_iter,
                    parent_iter=parent_iter,
                    search_depth=search_depth,
                    job_type=job_type)

        # Launch critic sleepers.
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci - 1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options, curr_iter=curr_iter)
        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options,
                n_idle=n_idle, curr_iter=curr_iter)
        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output, controller=controller,
                mi_info=mi_info, options=options)

        ## Periodic log/heartbeat and exits.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for training critics,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # We have explored enough models; quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
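# High-level shape of server_main's loop, for orientation:
#   1. Drain q_hallu/q_child into idle workers via fork_and_train_model.
#   2. Fork critic training once enough new results have arrived on a queue.
#   3. Block on ipc.get_finished_message() and dispatch to the matching
#      handler, which may enqueue new hallucinations/children.
#   4. Heartbeat, persist mi_info, and stop after max_exploration models.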
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked process
    that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'],
        jr['l_stats'], jr['l_op_indices'], jr['l_op_omega']
    )
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp
    ## Compute hallucination-related info in net_info.
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination()  # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # feature selection based on params
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select a subset of hallucinations to add to the child model.
    l_selected = []
    # sort by -cos(grad, hallu) over the indices 0, 1, ..., n_hallu - 1.
    processed_stats = [process_hallu_stats_for_critic_feat([stats])
                       for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))
    # greedy select with gradient boosting
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i: -processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # Randomly keep a subset of the greedy prefixes.
            # (np.random.choice requires a 1-D array, so sample indices into
            # l_greedy_selected rather than the list of lists itself.)
            chosen = np.random.choice(
                n_greedy_select, options.n_greed_select_per_init,
                replace=False)
            l_greedy_selected = [l_greedy_selected[i] for i in chosen]
    # randomly select a subset
    l_random_selected = []
    if options.n_rand_select_per_init:
        # also try some random samples
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)), options.n_hallus_per_select),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## For each selected subset of hallucinations, make a model for q_child.
    # Since more recent ones tend to be better, we insert in reverse order,
    # so the greedy selections are inserted later.
    for selected in reversed(l_selected):
        # new model description
        child_info = copy.deepcopy(net_info)
        l_hi = [hallu_indices[s] for s in selected]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stat
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # update mi_info
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth + 1,
            None, None, child_str, stat))
        controller.add_one_to_queue(q_child, mi_info, curr_iter, child_info)
    return curr_iter
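# Self-contained sketch (hypothetical helper and data) of the subset-selection
# scheme above: greedy prefixes of the stats-sorted hallucination order, plus
# randomly sampled fixed-size combinations. The real code draws the random
# combinations with reservoir sampling (online_sampling) and randomly subsets
# the greedy prefixes; both are simplified here for brevity.
import itertools
import numpy as np

def _select_subsets(scores, min_select, n_greedy, n_rand):
    # Indices sorted by descending score, mirroring greedy_order above.
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    max_select = max(min_select, len(scores) // 2)
    greedy = [order[:k] for k in range(min_select, max_select + 1)][:n_greedy]
    rand = [list(c) for c in
            itertools.combinations(range(len(scores)), min_select)]
    np.random.shuffle(rand)
    return greedy + rand[:n_rand]

# e.g., _select_subsets([0.9, 0.1, 0.5], min_select=1, n_greedy=2, n_rand=2)
# might return [[0], [2], [1]].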