Example #1
def eval_classification(model, sessinit, dataflow):

    pred_config = PredictConfig(
        model=model,
        session_init=sessinit,
        input_names=['input', 'label'],
        output_names=['wrong-top1', 'bn5/output:0', 'conv5/output:0']
    )
    acc1 = RatioCounter()


    pred = FeedfreePredictor(pred_config, StagingInput(QueueInput(dataflow), device='/gpu:0'))

    for _ in tqdm.trange(dataflow.size()):
        top1, afbn5, beforbn5 = pred()
        dic = {'bn5/output:0': afbn5, 'conv5/output:0': beforbn5}

        batch_size = top1.shape[0]
        acc1.feed(top1.sum(), batch_size)
        log_dir = logger.get_logger_dir()

        fname = os.path.join(
            log_dir, 'afbn5-{}.npz'.format(int(time.time())))
        np.savez(fname, **dic)


    print("Top1 Error: {}".format(acc1.ratio))
Example #2
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'],
        trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
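For reference, a plain-dict sketch of the configuration fields train_vqvae actually reads; the values are illustrative assumptions, and the real params object additionally provides a to_file() method:

params = {
    'dataset': {},          # looked up but not used further in this snippet
    'model': {
        'image_shape': (32, 32, 3),
        # ... plus whatever BaseVQVAE.from_params() expects
    },
    'trainer': {
        'batch_size': 64,
        'num_parallel': 4,
        'steps_per_epoch': 1000,
        'max_epochs': 100,
    },
}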
Example #3
def server_handle_critic_message(
        msg_output, controller, mi_info, options):
    """
    Petridish server handles the return message of a forked
    process that watches over a critic job.
    """
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    queues = controller.queues
    queue_name, new_ci = msg_output
    is_fail, _ = is_mark_failure(
        _ci_to_dn(log_dir_root, new_ci, queue_name))
    if is_fail:
        logger.info('Failed {} ci={}'.format(queue_name, new_ci))
        return
    logger.info('Updating w/ msg of CRITIC {} ci={}'.format(
        queue_name, new_ci))
    # load the new critic
    ctrl_dn = _ci_to_dn(model_dir_root, new_ci, queue_name)
    controller.update_predictor(ctrl_dn, queue_name)
    # as we have a new model for the critic,
    # remove older ones if they exist.
    ctrl_dns = [_ci_to_dn(model_dir_root, ci, queue_name) \
        for ci in range(new_ci + 1 - controller.n_critic_procs)]
    for ctrl_dn in filter(os.path.exists, ctrl_dns):
        logger.info('rm -rf {}'.format(ctrl_dn))
        _ = subprocess.check_output(
            'rm -rf {} &'.format(ctrl_dn), shell=True)
    # Sort the affected queue.
    logger.info('Ordering queue {}...'.format(queue_name))
    queue = queues[queue_name]
    controller.update_queue(queue, mi_info)
    logger.info('... done ordering')
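The cleanup above removes stale critic directories by shelling out to a backgrounded rm -rf. A pure-Python, blocking alternative would be shutil.rmtree; a sketch, not the project's code:

import os
import shutil

def remove_dirs(paths):
    # Blocking equivalent of the backgrounded 'rm -rf' calls above.
    for path in filter(os.path.exists, paths):
        shutil.rmtree(path, ignore_errors=True)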
Example #4
    def __init__(self,
                 max_to_keep=10,
                 keep_checkpoint_every_n_hours=0.5,
                 checkpoint_dir=None,
                 var_collections=None):
        """
        Args:
            max_to_keep (int): the same as in ``tf.train.Saver``.
            keep_checkpoint_every_n_hours (float): the same as in ``tf.train.Saver``.
                Note that "keep" does not mean "create", but means "don't delete".
            checkpoint_dir (str): Defaults to ``logger.get_logger_dir()``.
            var_collections (str or list of str): collection of the variables (or list of collections) to save.
        """
        if var_collections is None:
            var_collections = [tf.GraphKeys.GLOBAL_VARIABLES]
        self._max_to_keep = max_to_keep
        self._keep_every_n_hours = keep_checkpoint_every_n_hours

        if not isinstance(var_collections, list):
            var_collections = [var_collections]
        self.var_collections = var_collections
        if checkpoint_dir is None:
            checkpoint_dir = logger.get_logger_dir()
        if checkpoint_dir is not None:
            if not tf.gfile.IsDirectory(
                    checkpoint_dir):  # v2: tf.io.gfile.isdir
                tf.gfile.MakeDirs(checkpoint_dir)  # v2: tf.io.gfile.makedirs
        self.checkpoint_dir = checkpoint_dir
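The inline comments point at the TF1 tf.gfile calls and their TF2 tf.io.gfile counterparts. If the checkpoint directory lives on a local filesystem (an assumption; tf.gfile additionally handles remote filesystems such as GCS or HDFS), a version-agnostic alternative is plain os.makedirs:

import os

def ensure_checkpoint_dir(checkpoint_dir):
    # Local-filesystem stand-in for tf.gfile.MakeDirs / tf.io.gfile.makedirs.
    if checkpoint_dir is not None:
        os.makedirs(checkpoint_dir, exist_ok=True)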
Example #5
def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if (search_depth // 2 < options.max_growth
            and (options.search_max_flops is None
                    or fp < options.search_max_flops)):
        controller.add_one_to_queue(
            q_parent, mi_info, model_iter, None)

    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1,
            min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
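parse_remote_stop_file is expected to return a dict with at least the keys 'fp', 've' and 'te' (FLOPs, validation error, test error), or None when the child job failed. A hypothetical stand-in under the assumption that the stop file is JSON; the file name and format used by the real project may differ:

import json
import os

def parse_remote_stop_file_stub(job_log_dir):
    path = os.path.join(job_log_dir, 'remote_stop.json')  # file name is an assumption
    if not os.path.exists(path):
        return None  # the caller treats this as a failed job
    with open(path) as f:
        return json.load(f)  # expected keys include 'fp', 've', 'te'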
Example #6
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'], trainer_params['batch_size'],
            trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example #7
def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False, dataset_params['train_val_split'],
        trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', [
                'loss', 'accuracy',
                'validation_loss', 'val-correct_prediction'],
                after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example #8
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'],
                     dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']
    model = BaseVAE.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example #9
def train_image_embedding_triplet(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds = get_triplet_dataflow(
        dataset_params['path'], trainer_params['items_per_batch'],
        trainer_params['images_per_item'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example #10
 def __init__(self):
     self.model_dir = logger.get_logger_dir()
Example #11
def crawl_local_auto_scripts_and_launch(auto_dir,
                                        nr_gpu=1,
                                        launcher="",
                                        n_parallel=10000,
                                        num_init_use_all_gpu=2):
    """
    Process overview: if there is available resource limit (see n_parallel below), we
    launch a job. We first copy the job script into our local log dir to prevent other
    process from also launchng it. We remove the job script from the original auto_dir after
    copying. If either "copy" or "remove" fails, it means other process already owned the job
    and we abort by removing the job from our log dir.

    If we have the job script in our log dir, we actually launch it by translating
    its argument into a cmd "( ... && python xxxx ; rm -rf xxx.sh) &". We call this command
    to do this job in a different process. The cmd also includes a "...; rm xx.sh" to release
    the resource counted by n_parallel, regardless of the status of the remote job.

    Args
    auto_dir (str) : where to look for auto_scripts
    nr_gpu (int) : the numebr of gpu the crawler can use. A round robin schedule is used.
    launcher (str) : the name of the launcher, which is used for logger.
    n_parallel (int) : max number of parallel jobs. We count the number of n_parallel
        using the number of .sh files in the launcher's own log dir. Hence it is IMPERATIVE
        for each launched job to remove its own .sh after finishing regardless of sucess/failure.
        Bugged xx.sh are copied into xx.sh.fixme to avoid resource leak.
    num_init_use_all_gpu (int) : if mi < num_init_use_all_gpu then it will use
        all availabel gpu. This is for the initial jobs to be faster
    """
    device = -1
    while True:
        time.sleep(1)
        if os.path.exists(auto_dir):
            break
    logger.info("Found the auto_dir {}".format(auto_dir))
    launch_log = logger.get_logger_dir()

    # python 2 vs 3 crap
    check_errno = False
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = OSError
        check_errno = True
    logger.info("Crawler check_errno = {}".format(check_errno))

    def _newFileNotFound():
        e = FileNotFoundError()
        e.errno = errno.ENOENT
        return e

    def _isFileNotFound(e):
        if hasattr(
                e,
                'errno') and e.errno is not None and e.errno == errno.ENOENT:
            return True
        if check_errno:
            return False
        return isinstance(e, FileNotFoundError)

    while True:
        time.sleep(np.random.uniform(low=1.0, high=5.0))
        n_running = len(
            list(filter(lambda x: x.endswith('.sh'), os.listdir(launch_log))))
        if n_running >= n_parallel:
            continue

        l_scripts = os.listdir(auto_dir)
        np.random.shuffle(l_scripts)
        for script in l_scripts:
            if script.endswith('.lck'):
                # this is a lock file. ignore
                continue
            auto_script = os.path.join(auto_dir, script)
            auto_script_tmp = os.path.join(launch_log, script)
            lock = auto_script + '.lck'
            if os.path.exists(lock):
                # someone else has already locked the file; ignore
                continue
            if not os.path.exists(auto_script):
                # someone else has already removed the script; ignore
                continue

            try:
                with open(lock, 'wt'):
                    shutil.copyfile(auto_script, auto_script_tmp)
                    if not os.path.exists(auto_script):
                        # this is important. It makes sure that pycmd is valid.
                        # Remove the tmp if we find we are not the first.
                        os.remove(auto_script_tmp)
                        raise _newFileNotFound()
                    # this may raise an error due to a race;
                    # all processes could raise here due to strange interactions.
                    os.remove(auto_script)
            except Exception as e:
                if _isFileNotFound(e):
                    # this means someone else removed the auto_script
                    # before we did, so that process gets to launch it
                    if os.path.exists(auto_script_tmp):
                        os.remove(auto_script_tmp)
                    logger.info("Race on script {}".format(script))
                else:
                    logger.info("Crazy Race on {} : {} : {}".format(
                        script, e.__class__, e))
                    # Other errors mean a race inside os.remove ...
                    # possibly none of the processes succeeded in removing it,
                    # so all of them launch.

            while os.path.exists(lock):
                # every process that opened the lock should attempt to remove it.
                try:
                    os.remove(lock)
                    break
                except:
                    logger.info("Race on rm lock of {}".format(script))

            # this file is only accessible by the current launcher. No need to lock.
            if os.path.exists(auto_script_tmp):
                # Translate
                pycmd, n_job_gpu = script_to_local_cmd(
                    auto_script_tmp,
                    nr_crawler_gpu=nr_gpu,
                    num_init_use_all_gpu=num_init_use_all_gpu)
                if pycmd is None:
                    logger.info("FIXME: {} failed on {}".format(
                        launcher, script))
                    # rename so that it is no longer in the resource limit.
                    os.rename(auto_script_tmp, auto_script_tmp + '.fixme')
                    continue
                visible_gpus = os.environ.get('CUDA_VISIBLE_DEVICES', None)
                if visible_gpus:
                    visible_gpus = visible_gpus.strip().split(',')
                    assert len(visible_gpus) >= nr_gpu, \
                        '{} != {}'.format(len(visible_gpus), nr_gpu)
                else:
                    visible_gpus = [str(gpu_id) for gpu_id in range(nr_gpu)]
                job_device = []
                for _ in range(n_job_gpu):
                    device = (device + 1) % nr_gpu
                    job_device.append(visible_gpus[device])
                job_device = ','.join(job_device)
                cmd = '(export CUDA_VISIBLE_DEVICES="{device}" && {pycmd} >> {out_fn} ; rm -rf {script}) &'.format(\
                    device=job_device, pycmd=pycmd,
                    out_fn=os.path.join(launch_log, 'remote_stdout.txt'),
                    script=auto_script_tmp)
                logger.info("Launch job {} on GPU {} by {}".format(\
                    script, job_device, launcher))
                # launch the script in a different process
                subprocess.call(cmd, shell=True)
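The GPU bookkeeping near the end of the loop walks CUDA_VISIBLE_DEVICES (or the range 0..nr_gpu-1) in round-robin order and hands n_job_gpu device ids to the next job. The same logic, distilled into a standalone helper (the function name is mine, not the project's):

import os

def round_robin_devices(nr_gpu, n_job_gpu, last_device=-1):
    # Use the devices listed in CUDA_VISIBLE_DEVICES, or 0..nr_gpu-1 if it is unset.
    visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    visible = visible.strip().split(',') if visible else [str(i) for i in range(nr_gpu)]
    assert len(visible) >= nr_gpu
    job_device = []
    for _ in range(n_job_gpu):
        last_device = (last_device + 1) % nr_gpu
        job_device.append(visible[last_device])
    # return the comma-separated device string and the cursor for the next call
    return ','.join(job_device), last_device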
Example #12
            temp = temp[keys[i]]
        temp[keys[-1]] = value

    # set GPU machine
    if config['gpu'] in [None, 'None', '']:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        num_gpu = 0
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
        num_gpu = max(get_num_gpu(), 1)
    config['num_gpu'] = num_gpu

    # set log directory
    if config['logdir'] in [None, 'None', '']:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir('train_log/' + config['logdir'], action='d')
    # save configuration
    with open(os.path.join(logger.get_logger_dir(), 'config.json'), 'w') as outfile:
        json.dump(config, outfile)

    # get train config
    train_config = get_train_config(config)

    # train the model
    if num_gpu > 1:
        launch_train_with_config(train_config,
                                 SyncMultiGPUTrainerReplicated(num_gpu))
    else:
        launch_train_with_config(train_config, SimpleTrainer())
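The truncated lines at the top of this snippet are the tail of a loop that walks a list of keys into a nested config dict and assigns the final key. The same pattern as a self-contained helper (a sketch; the key-splitting convention of the surrounding function is not shown in the excerpt):

def set_nested(config, keys, value):
    # Assign config[keys[0]][keys[1]]...[keys[-1]] = value.
    temp = config
    for key in keys[:-1]:
        temp = temp[key]
    temp[keys[-1]] = value

# e.g. override a nested entry of the training config
config = {'optimizer': {'lr': 1e-3}}
set_nested(config, ['optimizer', 'lr'], 3e-4)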
Example #13
    parser = argparse.ArgumentParser()
    add_app_arguments(parser)
    add_model_arguments(parser)
    add_controller_arguments(parser)
    DiversityOptions.add_parser_arguments(parser)
    PetridishRecover.add_parser_arguments(parser)
    args, unknown = parser.parse_known_args()
    args = model_options_processing(args)

    if args.job_type == 'main':
        # The log is set to be {args.log_dir}/petridish_main/log.log
        # FIXME: A bit weird that some of the utils are in ANN repo.
        # Might be good to refactor.
        ann_app_utils.log_init(args, None)
        logger.info("App has the following unknown arguments : {}".format(unknown))
        log_dir = logger.get_logger_dir()

        # Update nr_gpu-related params based on the runtime config and all containers.
        # FIXME: The block of code below is Philly-specific. That is okay because
        # the functions below check for the Philly environment and skip automatically
        # when we are not running on Philly.
        runtime_config = get_runtime_config()
        args.total_nr_gpu = get_total_nr_gpu(config=runtime_config)
        logger.info("Main has access to {} gpus".format(args.total_nr_gpu))
        # Update more nr_gpu-related params based on the local container.
        cinfo = local_container_info(config=runtime_config)
        container_id = get_container_index(cinfo)
        logger.info("Container index = {}".format(container_id))
        if cinfo is not None:
            nr_gpu = get_container_nr_gpu(cinfo)
            args.nr_gpu = nr_gpu
Example #14
 def __init__(self, model_name):
     self.model_dir = logger.get_logger_dir()
     self.model_name = model_name
Example #15
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Special replacement of server_handle_child_message for
    experimenting on soft init vs. hard init.

    This is for experiment only.
    TODO reuse code with regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if search_depth > 0:
        return curr_iter

    controller.n_hallu_per_parent_on_idle = 1
    # for soft vs hard experiment, only root generates hallu.
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)

        # this experiment only creates one hallu from the root
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)

        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
Example #16
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
        Server entrance/main.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (
        mi_info,
        ipc,
        qname_to_pool,
        philly_wa,
        curr_iter,
        critic_iter,
        n_recv,
        n_last_train,
        n_last_mi_save
    ) = server_init(controller, options)
    # useful alias:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)
    # message handles
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # server main loop
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Populate workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # log the pop order of models. Important for analysis
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(ipc=ipc,
                        options=model_options,
                        log_dir=_mi_to_dn(log_dir_root, model_iter),
                        child_dir=_mi_to_dn(model_dir_root, model_iter),
                        prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                        model_str=model_str,
                        model_iter=model_iter,
                        parent_iter=parent_iter,
                        search_depth=search_depth,
                        job_type=job_type)

        # launch critic sleepers
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci-1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                curr_iter=curr_iter)

        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                n_idle=n_idle,
                curr_iter=curr_iter)

        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options)

        ## periodic logging, heartbeat, and exit checks.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for critic training,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # we have explored enough models; quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
Example #17
def server_init(controller, options):
    """
    Initialize params for server.
    """
    # names and static/fixed info
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir

    # Queues.
    queue_names, _queues = controller.init_queues()
    (q_parent, q_hallu, q_child) = (
        controller.q_parent, controller.q_hallu, controller.q_child)
    qname_to_pool = {
        q_child.name : TRAIN_CRITIC_MODEL,
        q_hallu.name : TRAIN_CRITIC_HALLU,
        q_parent.name : TRAIN_CRITIC_PARENT}

    mi_info = []
    if is_debug(options):
        prev_log_root = log_dir_root
        prev_model_root = model_dir_root
    else:
        prev_log_root = previous_trial_log_root(log_dir_root)
        prev_model_root = previous_trial_model_root(model_dir_root)
    is_success = False
    while prev_log_root and prev_model_root:
        logger.info("prev_log_root=\"{}\" && prev_model_root=\"{}\"".format(
            prev_log_root, prev_model_root))
        is_success = controller.recover.recover(
            prev_log_root=prev_log_root,
            log_root=log_dir_root,
            prev_model_root=prev_model_root,
            model_root=model_dir_root,
            q_parent=q_parent,
            q_hallu=q_hallu,
            q_child=q_child,
            mi_info=mi_info)
        if is_success:
            critic_iter = controller.init_predictors(
                prev_log_root, prev_model_root)
            break
        prev_log_root = previous_trial_log_root(prev_log_root)
        prev_model_root = previous_trial_model_root(prev_model_root)
    if not is_success:
        # controller init predictors from scratch
        critic_iter = controller.init_predictors(log_dir_root, model_dir_root)

    if len(mi_info) == 0:
        if options.net_info:
            l_init_net_info = [options.net_info]
            # Need to delete this info because:
            # 1. The options are to be used by future children, and we want to
            # remove unnecessary params.
            # 2. Having both would cause multiple occurrences of net_info_str
            # in children scripts, which causes bugs.
            delattr(options, 'net_info')
            options.net_info_str = None
        else:
            l_init_net_info = controller.initial_net_info()
        for mi, net_info in enumerate(l_init_net_info):
            mstr = net_info.to_str()
            # Server-side model info for each model_iter. Used for critic features.
            # Fields in order: mi, pi, sd, fp, ve, mstr, stats.
            mi_info.append(ModelSearchInfo(mi, mi, 0, None, 2.0, mstr, [0.0]))
            controller.add_one_to_queue(q_child, mi_info, mi, net_info)

    # Job counters
    curr_iter = len(mi_info) - 1

    # queue related counters
    # Model counters and pool resources are reset upon reboot for now.
    n_recv = dict([(qname, 0) for qname in queue_names])
    n_last_train = dict([(qname, 0) for qname in queue_names])
    n_last_mi_save = 0

    # IPC
    pool_sizes = [ 0 ] * NUM_POOLS
    pool_sizes[TRAIN_HALLU] = controller.n_hallu_procs
    pool_sizes[TRAIN_MODEL] = controller.n_model_procs
    for qname in qname_to_pool:
        pool_sizes[qname_to_pool[qname]] = controller.n_critic_procs
    ipc = PetridishServerIPC(pool_sizes, hwm=50)
    ipc.initialize()

    # Server progress workaround
    # This wakes up every once in a while to advance the progress bar a little.
    philly_wa = PhillyHeartBeatWorkAround(max_cnt=options.max_exploration)
    return (
        mi_info,
        ipc,
        qname_to_pool,
        philly_wa,
        curr_iter,
        critic_iter,
        n_recv,
        n_last_train,
        n_last_mi_save
    )
Example #18
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'], jr['l_stats'],
        jr['l_op_indices'], jr['l_op_omega']
    )
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    ## compute hallucination related info in net_info
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination() # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # feature selection based on params
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select a subset of hallucination to add to child model
    l_selected = []
    # sort by -cos(grad, hallu) for the indices, 0,1,2,...,n_hallu-1.
    processed_stats = [process_hallu_stats_for_critic_feat([stats]) \
        for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))

    # greedy select with gradient boosting
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i : - processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # random choose
            l_greedy_selected = list(np.random.choice(
                l_greedy_selected,
                options.n_greed_select_per_init,
                replace=False))
    # random select a subset
    l_random_selected = []
    if options.n_rand_select_per_init:
        # also try some random samples
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)),
                options.n_hallus_per_select
            ),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## for each selected subset of hallu, make a model for q_child
    # since more recent ones tend to be better,
    # we insert in reverse order, so the greedy selections are inserted later.
    for selected in reversed(l_selected):
        # new model description
        child_info = copy.deepcopy(net_info)
        l_hi = [ hallu_indices[s] for s in selected ]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stat
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # update mi_info
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth+1,
            None, None, child_str, stat))
        controller.add_one_to_queue(
            q_child, mi_info, curr_iter, child_info)
    return curr_iter
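online_sampling above draws options.n_rand_select_per_init index subsets from an iterator of combinations without materializing it. A hypothetical stand-in based on reservoir sampling (the project's real implementation may differ):

import itertools

import numpy as np

def online_sampling(iterable, k):
    # Reservoir-sample k items from an arbitrary iterator.
    reservoir = []
    for i, item in enumerate(iterable):
        if i < k:
            reservoir.append(item)
        else:
            j = np.random.randint(0, i + 1)
            if j < k:
                reservoir[j] = item
    return reservoir

# e.g. pick 5 random 3-element index subsets out of all C(10, 3) combinations
subsets = online_sampling(itertools.combinations(range(10), 3), 5)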