Example #1
def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if (search_depth // 2 < options.max_growth
            and (options.search_max_flops is None
                    or fp < options.search_max_flops)):
        controller.add_one_to_queue(
            q_parent, mi_info, model_iter, None)

    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1,
            min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
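
The re-queue condition above caps growth by search depth and, optionally, by FLOPs. Below is a minimal, self-contained sketch of that gate with plain arguments standing in for the options object; the helper name is hypothetical.

def _should_requeue_as_parent(search_depth, fp, max_growth, search_max_flops=None):
    # Each growth adds a hallucination step and a child step, so
    # search_depth // 2 counts completed growth rounds.
    within_depth = search_depth // 2 < max_growth
    within_flops = search_max_flops is None or fp < search_max_flops
    return within_depth and within_flops

# e.g., a model at depth 4 (two growth rounds) using 1.2 GFLOPs:
assert _should_requeue_as_parent(4, 1.2e9, max_growth=3)
assert not _should_requeue_as_parent(6, 1.2e9, max_growth=3)
assert not _should_requeue_as_parent(4, 1.2e9, max_growth=3, search_max_flops=1e9)
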
Example #2
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Special replacement of server_handle_child_message for
    experimenting on soft init vs. hard init.

    This is for experiment only.
    TODO reuse code with regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if search_depth > 0:
        return curr_iter

    controller.n_hallu_per_parent_on_idle = 1
    # For the soft vs. hard experiment, only the root generates hallucinations.
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)

        # This experiment samples hallucinations only once, from the root.
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)

        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
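
The key point of this variant is that the hallucinations are sampled once from the root and then added under both netmorph settings, so 'hard' and 'soft' initialization are compared on identical structural additions. A minimal sketch of that pattern, with a plain dict standing in for the real net_info object:

import copy

def make_soft_vs_hard_children(parent_info, hallus):
    children = []
    for netmorph_method in ['hard', 'soft']:
        child = copy.deepcopy(parent_info)
        child['hallus'] = list(hallus)      # the same sampled hallucinations
        child['init'] = netmorph_method     # only the init method differs
        children.append(child)
    return children

pair = make_soft_vs_hard_children({'layers': ['conv3x3']}, hallus=['sep5x5'])
assert pair[0]['init'] == 'hard' and pair[1]['init'] == 'soft'
assert pair[0]['hallus'] == pair[1]['hallus']
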
Example #3
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
        Server entrance/main.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (
        mi_info,
        ipc,
        qname_to_pool,
        philly_wa,
        curr_iter,
        critic_iter,
        n_recv,
        n_last_train,
        n_last_mi_save
    ) = server_init(controller, options)
    # useful alias:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)
    # message handlers
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # server main loop
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Populate workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # Log the pop order of models; important for analysis.
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(ipc=ipc,
                        options=model_options,
                        log_dir=_mi_to_dn(log_dir_root, model_iter),
                        child_dir=_mi_to_dn(model_dir_root, model_iter),
                        prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                        model_str=model_str,
                        model_iter=model_iter,
                        parent_iter=parent_iter,
                        search_depth=search_depth,
                        job_type=job_type)

        # launch critic sleepers
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci-1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                curr_iter=curr_iter)

        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                n_idle=n_idle,
                curr_iter=curr_iter)

        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options)

        ## Periodic logging, heartbeat, and exit check.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for critic training,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # We have explored enough models; quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
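
One piece of bookkeeping in the loop above is deciding when to retrain a critic: a critic for a queue is launched only after controller_train_every new results have arrived since the last critic training on that queue. A small, self-contained sketch of that counter logic, with plain dicts standing in for the controller state:

def should_train_critic(n_recv, n_last_train, qname, train_every):
    # a critic is retrained only after enough new results since the last one
    return n_recv[qname] - n_last_train[qname] >= train_every

n_recv = {'q_child': 7, 'q_hallu': 3}
n_last_train = {'q_child': 4, 'q_hallu': 3}
assert should_train_critic(n_recv, n_last_train, 'q_child', train_every=3)
assert not should_train_critic(n_recv, n_last_train, 'q_hallu', train_every=3)
# once the critic is launched, the loop above syncs the counter:
n_last_train['q_child'] = n_recv['q_child']
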
Example #4
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'], jr['l_stats'],
        jr['l_op_indices'], jr['l_op_omega']
    )
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    ## compute hallucination related info in net_info
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination() # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # feature selection based on params
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select subsets of hallucinations to add to child models
    l_selected = []
    # sort by -cos(grad, hallu) for the indices, 0,1,2,...,n_hallu-1.
    processed_stats = [
        process_hallu_stats_for_critic_feat([stats]) for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))

    # greedy select with gradient boosting
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i : - processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # randomly choose a subset of the greedy selections
            l_greedy_selected = list(np.random.choice(
                l_greedy_selected,
                options.n_greed_select_per_init,
                replace=False))
    # randomly select subsets
    l_random_selected = []
    if options.n_rand_select_per_init:
        # also try some random samples
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)),
                options.n_hallus_per_select
            ),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## for each selected subset of hallu, make a model for q_child
    # Since more recent ones tend to be better, we insert in
    # reverse order, so the greedy selections are inserted later.
    for selected in reversed(l_selected):
        # new model description
        child_info = copy.deepcopy(net_info)
        l_hi = [ hallu_indices[s] for s in selected ]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stat
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # update mi_info
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth+1,
            None, None, child_str, stat))
        controller.add_one_to_queue(
            q_child, mi_info, curr_iter, child_info)
    return curr_iter
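
The selection step above builds candidate subsets in two ways: greedy prefixes of the hallucinations ranked by their processed statistic, plus some random combinations. The sketch below reproduces that shape in isolation; plain floats stand in for processed_stats, and itertools.islice stands in for the reservoir-style online_sampling and the random down-sampling of the greedy sets.

import itertools

def select_hallu_subsets(scores, min_select, max_greedy_sets, max_rand_sets):
    n = len(scores)
    greedy_order = sorted(range(n), key=lambda i: -scores[i])
    max_select = max(min_select, n // 2)
    # greedy prefixes of increasing size
    l_greedy = [greedy_order[:k] for k in range(min_select, max_select + 1)]
    l_greedy = l_greedy[:max_greedy_sets]
    # a few random combinations of the minimum size
    l_rand = [list(c) for c in itertools.islice(
        itertools.combinations(range(n), min_select), max_rand_sets)]
    return l_greedy + l_rand

subsets = select_hallu_subsets(
    [0.9, 0.1, 0.5, 0.7], min_select=1, max_greedy_sets=2, max_rand_sets=2)
assert subsets[:2] == [[0], [0, 3]]   # greedy prefixes: rank 0 > 3 > 2 > 1
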
    def full_recovery(self, prev_log_root, log_root, prev_model_root,
                      model_root, q_parent, q_hallu, q_child, mi_info):
        # 0. assume that controller predictor is loaded
        # 1. load old mi_info
        # 2. filter mi_info into models from q_parent, q_hallu, and q_child
        # 3. Copy/link finished model log_dir/model_dir for those mi_info that have ve.
        # 4. add to q_hallu and q_child the ones that are not finished
        # 5. add to q_parent the models that are finished (and are not in hallu)
        # 6. TODO Compute counter variables like n_recv and friends
        def _is_hallu(info):
            return info.sd % 2 == 1

        def _is_finished(info):
            return info.ve is not None and info.ve < 1.0

        prev_mi_info_npz = _mi_info_save_fn(prev_log_root)
        mi_info_npz = _mi_info_save_fn(log_root)
        if os.path.exists(mi_info_npz):
            # The current trial already has some mi_info; load from it instead.
            # This happens on local runs because they don't have a trial id.
            # It also happens on preemption on philly, which does not advance the trial id.
            prev_mi_info_npz = mi_info_npz
            prev_model_root = model_root
            prev_log_root = log_root

        if not os.path.exists(prev_mi_info_npz):
            # Nothing to load. Return False to let the caller know.
            return False

        mi_info.extend(np.load(prev_mi_info_npz, encoding='bytes')['mi_info'])

        if mi_info_npz != prev_mi_info_npz:
            os.rename(prev_mi_info_npz, mi_info_npz)

        all_mi_in_log = set(_all_mi(prev_log_root))
        all_mi_in_model = set(_all_mi(prev_model_root))

        for info in mi_info:
            mi = info.mi
            if mi in all_mi_in_log:
                all_mi_in_log.remove(mi)
            if mi in all_mi_in_model:
                all_mi_in_model.remove(mi)

            # TODO use heapify instead of inserting one by one...
            old_log_dir = _mi_to_dn(prev_log_root, mi)
            old_model_dir = _mi_to_dn(prev_model_root, mi)
            queue = None
            if not _is_finished(info):
                queue = q_hallu if _is_hallu(info) else q_child
                # remove partial models/logs to avoid confusion.
                if os.path.exists(old_model_dir):
                    shutil.rmtree(old_model_dir)
                if os.path.exists(old_log_dir):
                    shutil.rmtree(old_log_dir)
                logger.info("Recover: mi={} queue={}".format(mi, queue.name))

            else:
                # copy logs
                new_log_dir = _mi_to_dn(log_root, mi)
                if new_log_dir != old_log_dir:
                    shutil.copytree(old_log_dir, new_log_dir)
                # copy models
                new_model_dir = _mi_to_dn(model_root, mi)
                if new_model_dir != old_model_dir:
                    shutil.copytree(old_model_dir, new_model_dir)
                queue = None if _is_hallu(info) else q_parent
                qname = "" if queue is None else queue.name
                # It's important to log val_err for the purpose of analysis later
                logger.info("Recover: mi={} val_err={} queue={}".format(
                    mi, info.ve, qname))

            if queue is not None:
                self.controller.add_one_to_queue(queue, mi_info, mi, None)
        #end for each info

        # remove old mi that have log or model but are not in mi_info
        for mi in all_mi_in_log:
            old_log_dir = _mi_to_dn(prev_log_root, mi)
            if os.path.exists(old_log_dir):
                shutil.rmtree(old_log_dir)

        for mi in all_mi_in_model:
            old_model_dir = _mi_to_dn(prev_model_root, mi)
            if os.path.exists(old_model_dir):
                shutil.rmtree(old_model_dir)

        if self.n_models_to_recover is not None:
            # TODO compute priority and do nsmallest instead of trimming.
            q_parent.keep_top_k(self.n_models_to_recover)
            logger.info("full_recovery: trim q_parent to size {}".format(
                self.n_models_to_recover))

        # Recovered successfully.
        return True
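
full_recovery routes every recovered entry by two predicates: depth parity decides hallucination vs. child, and a recorded val_err below 1.0 decides finished vs. unfinished. The standalone sketch below captures that routing; the namedtuple is a stand-in for ModelSearchInfo.

from collections import namedtuple

Info = namedtuple('Info', ['mi', 'sd', 've'])

def recovery_queue_name(info):
    is_hallu = info.sd % 2 == 1
    is_finished = info.ve is not None and info.ve < 1.0
    if not is_finished:
        return 'q_hallu' if is_hallu else 'q_child'
    return None if is_hallu else 'q_parent'

assert recovery_queue_name(Info(mi=3, sd=1, ve=None)) == 'q_hallu'    # unfinished hallu
assert recovery_queue_name(Info(mi=4, sd=2, ve=None)) == 'q_child'    # unfinished child
assert recovery_queue_name(Info(mi=5, sd=2, ve=0.07)) == 'q_parent'   # finished child
assert recovery_queue_name(Info(mi=6, sd=3, ve=0.08)) is None         # finished hallu
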
    def partial_recovery(self, prev_log_root, log_root, prev_model_root,
                         model_root, q_parent, q_hallu, q_child, mi_info):
        """
        deprecated DO NOT USE

        prev_log_root (str) : root of previous log, e.g., on philly: xxx/app_id/logs/2/petridish_main
        log_root (str) : current root of log
        prev_model_root (str) : previous model root, e.g., on philly: xxx/app_id/models/2
        model_root (str) : current model root
        q_parent (PetridishQueue) : see PetridishController.init_queues
        q_hallu (PetridishQueue) :
        q_child (PetridishQueue) :
        mi_info (list) : of ModelSearchInfo
        """
        old_npz = _mi_info_save_fn(prev_log_root)
        old_mi_info = list(np.load(old_npz, encoding='bytes')['mi_info'])

        def _is_hallu(info):
            return info.sd % 2 == 1

        def _is_finished(info):
            return info.ve is not None and info.ve <= 1.0

        min_ve_info = None
        min_ve = None
        for info in old_mi_info:
            if not _is_finished(info):
                continue
            if min_ve is None or (info.ve <= 1.0 and info.ve < min_ve):
                min_ve = info.ve
                min_ve_info = info

        if min_ve is None:
            # Nothing finished; start regularly.
            return

        # get the path to root.
        info = min_ve_info
        l_info = [info]
        while info.pi != info.mi:
            info = old_mi_info[info.pi]
            l_info.append(info)

        curr_iter = -1
        for info in reversed(l_info):
            curr_iter += 1
            # copy logs into the new dir
            old_log_dir = _mi_to_dn(prev_log_root, info.mi)
            new_log_dir = _mi_to_dn(log_root, curr_iter)
            shutil.copytree(old_log_dir, new_log_dir)
            # copy models
            old_model_dir = _mi_to_dn(prev_model_root, info.mi)
            new_model_dir = _mi_to_dn(model_root, curr_iter)
            shutil.copytree(old_model_dir, new_model_dir)

            info.mi = curr_iter
            info.pi = max(curr_iter - 1, 0)
            mi_info.append(info)

        info = mi_info[-1]
        queue = q_hallu if _is_hallu(info) else q_child
        queue.add(model_str=info.mstr,
                  model_iter=info.mi,
                  parent_iter=info.pi,
                  search_depth=info.sd,
                  priority=info.ve)
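
partial_recovery replays only the ancestry of the best finished model. The chain is found by following parent indices (pi) until reaching the root, which is its own parent, and is then replayed from root to leaf. A small stand-alone sketch of that walk, with a tiny class standing in for ModelSearchInfo:

class _Node(object):
    def __init__(self, mi, pi, ve=None):
        self.mi, self.pi, self.ve = mi, pi, ve

def path_to_root(old_mi_info, start_mi):
    info = old_mi_info[start_mi]
    l_info = [info]
    while info.pi != info.mi:       # the root is its own parent
        info = old_mi_info[info.pi]
        l_info.append(info)
    return list(reversed(l_info))   # root first, best model last

# root 0 -> 1 -> 3; model 2 is an unrelated sibling of 1.
infos = [_Node(0, 0), _Node(1, 0), _Node(2, 0), _Node(3, 1, ve=0.05)]
assert [n.mi for n in path_to_root(infos, 3)] == [0, 1, 3]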