Example no. 1
    def evaluate(self, conf_eval: Config,
                 model_desc_builder: ModelDescBuilder) -> EvalResult:
        """Takes a folder of model descriptions output by search process and
        trains them in a distributed manner using ray with 1 gpu"""

        logger.pushd('evaluate')

        final_desc_foldername: str = conf_eval['final_desc_foldername']

        # get list of model descs in the gallery folder
        final_desc_folderpath = utils.full_path(final_desc_foldername)
        files = [f for f in glob.glob(os.path.join(final_desc_folderpath, 'model_desc_*.yaml'))
                 if os.path.isfile(f)]
        logger.info({'model_desc_files': len(files)})

        # to avoid having all workers download the dataset individually, do it beforehand
        self._ensure_dataset_download(conf_eval)

        future_ids = []
        for model_desc_filename in files:
            future_id = EvaluaterPetridish._train_dist.remote(
                self, conf_eval, model_desc_builder, model_desc_filename,
                common.get_state())
            future_ids.append(future_id)

        # wait for all eval jobs to be finished
        ready_refs, remaining_refs = ray.wait(future_ids,
                                              num_returns=len(future_ids))

        # plot the Pareto curve of the gallery of models
        hull_points = [ray.get(ready_ref) for ready_ref in ready_refs]
        save_hull(hull_points, common.get_expdir())
        plot_pool(hull_points, common.get_expdir())

        best_point = max(hull_points, key=lambda p: p.metrics.best_val_top1())
        logger.info({
            'best_val_top1': best_point.metrics.best_val_top1(),
            'best_MAdd': best_point.model_stats.MAdd
        })

        logger.popd()

        return EvalResult(best_point.metrics)
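
The evaluate method above fans one training job out per model description and then blocks until every job has returned. Below is a minimal, self-contained sketch of that submit-all/wait-all Ray pattern; the task _train_one and the file names are illustrative stand-ins, not part of the original code.

    import ray

    ray.init(ignore_reinit_error=True)

    @ray.remote
    def _train_one(model_desc_filename: str) -> str:
        # stand-in for the real distributed training job
        return f'trained:{model_desc_filename}'

    files = ['model_desc_0.yaml', 'model_desc_1.yaml']
    future_ids = [_train_one.remote(f) for f in files]

    # block until every job has finished, then collect all results
    ready_refs, _ = ray.wait(future_ids, num_returns=len(future_ids))
    results = [ray.get(ready_ref) for ready_ref in ready_refs]
    print(results)
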
Example no. 2
    def _create_seed_jobs(self, conf_search: Config,
                          model_desc_builder: ModelDescBuilder) -> list:
        conf_model_desc = conf_search['model_desc']
        conf_seed_train = conf_search['seed_train']

        future_ids = [] # ray job IDs
        seed_model_stats = [] # seed model stats for visualization and debugging 
        macro_combinations = list(self.get_combinations(conf_search))
        for reductions, cells, nodes in macro_combinations:
            # skip combinations that cannot satisfy the N R N R N R (normal/reduction) pattern
            if cells < reductions * 2 + 1:
                continue

            # create seed model
            model_desc = self.build_model_desc(model_desc_builder,
                                               conf_model_desc,
                                               reductions, cells, nodes)

            hull_point = ConvexHullPoint(JobStage.SEED, 0, 0, model_desc,
                                         (cells, reductions, nodes))

            # pre-train the seed model
            future_id = SearcherPetridish.train_model_desc_dist.remote(self,
                conf_seed_train, hull_point, common.get_state())

            future_ids.append(future_id)

            # build a model so we can get its model stats
            temp_model = Model(model_desc, droppath=True, affine=True)
            seed_model_stats.append(nas_utils.get_model_stats(temp_model))
        
        # save the model stats in a plot and tsv file so we can
        # visualize the spread on the x-axis
        expdir = common.get_expdir()
        assert expdir
        plot_seed_model_stats(seed_model_stats, expdir)

        return future_ids
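
The seed-job loop above keeps only macro combinations whose cell count can accommodate the alternating normal/reduction layout. Here is a toy sketch of that feasibility check, with assumed parameter ranges standing in for get_combinations.

    import itertools

    # assumed ranges, for illustration only
    reductions_range, cells_range, nodes_range = [1, 2], [3, 5, 7], [4]

    for reductions, cells, nodes in itertools.product(reductions_range, cells_range, nodes_range):
        # the N R N R N R layout needs at least reductions * 2 + 1 cells
        feasible = cells >= reductions * 2 + 1
        print(f'reductions={reductions} cells={cells} nodes={nodes} feasible={feasible}')
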
Example no. 3
    def update_alphas(self, eta: float, current_t: int, total_t: int,
                      grad_clip: float):
        grad_flat = torch.flatten(self._grad)
        rewards = torch.tensor([
            -torch.dot(grad_flat, torch.flatten(activ))
            for activ in self._activs
        ])
        exprewards = torch.exp(eta * rewards).cuda()
        # NOTE: Will this remain registered?
        self._alphas[0] = torch.mul(self._alphas[0], exprewards)

        # weak learner eviction
        conf = get_conf()
        to_evict = conf['nas']['search']['xnas']['to_evict']
        if to_evict:
            theta = max(self._alphas[0]) * ma.exp(-2 * eta * grad_clip *
                                                  (total_t - current_t))
            assert len(self._ops) == self._alphas[0].shape[0]
            to_keep_mask = self._alphas[0] >= theta
            num_ops_kept = torch.sum(to_keep_mask).item()
            assert num_ops_kept > 0
            # zero out the weights which are evicted
            self._alphas[0] = torch.mul(self._alphas[0], to_keep_mask)

        # save some debugging info
        expdir = get_expdir()
        filename = os.path.join(expdir, str(id(self)) + '.txt')

        # append the current alphas to the file
        alphas = [
            str(self._alphas[0][i].item())
            for i in range(self._alphas[0].shape[0])
        ]
        with open(filename, 'a') as f:
            f.write(str(alphas))
            f.write('\n')
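
update_alphas above multiplies each architecture weight by the exponential of its reward and then evicts ops whose weight falls below a time-dependent threshold. The following toy, self-contained sketch replays that update on synthetic tensors; none of these values are the class's real state.

    import math
    import torch

    eta, grad_clip, current_t, total_t = 0.1, 5.0, 99, 100
    grad = torch.randn(8)                          # stand-in for self._grad
    activs = [torch.randn(8) for _ in range(4)]    # stand-in for self._activs
    alphas = torch.ones(4)                         # stand-in for self._alphas[0]

    # reward of each op is the negative inner product of the gradient and its activation
    rewards = torch.tensor([-torch.dot(grad, a) for a in activs])
    alphas = alphas * torch.exp(eta * rewards)

    # evict ops whose weight falls below the threshold; the boolean mask zeroes them out
    theta = alphas.max() * math.exp(-2 * eta * grad_clip * (total_t - current_t))
    alphas = alphas * (alphas >= theta)
    print(alphas)
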
Example no. 4
    def finalize_node(self, node: nn.ModuleList, node_index: int,
                      node_desc: NodeDesc, max_final_edges: int,
                      cov: np.ndarray, cell: Cell, node_id: int,
                      *args, **kwargs) -> NodeDesc:
        # node is a list of edges
        assert len(node) >= max_final_edges

        # covariance matrix shape must be square 2-D
        assert len(cov.shape) == 2
        assert cov.shape[0] == cov.shape[1]

        # the number of primitive operators has to be greater
        # than or equal to the maximum number of final edges
        # allowed
        assert cov.shape[0] >= max_final_edges

        # gather (edge, op, alpha, edge index) for all ops other than 'none'
        in_ops = [(edge,op,alpha,i) for i, edge in enumerate(node) \
                            for op, alpha in edge._op.ops()
                            if not isinstance(op, Zero)]
        assert len(in_ops) >= max_final_edges

        # order all the ops by alpha
        in_ops_sorted = sorted(in_ops, key=lambda in_op:in_op[2], reverse=True)

        # keep the top half of the ops under consideration (at least max_final_edges)
        num_to_keep = max(max_final_edges, len(in_ops_sorted)//2)
        top_ops = in_ops_sorted[:num_to_keep]

        # get the covariance submatrix of the top ops only
        cov_inds = []
        for edge, op, alpha, edge_num in top_ops:
            ind = self._divnas_cells[cell].node_num_to_node_op_to_cov_ind[node_id][op]
            cov_inds.append(ind)

        cov_top_ops = cov[np.ix_(cov_inds, cov_inds)]

        assert len(cov_inds) == len(top_ops)
        assert len(top_ops) >= max_final_edges
        assert cov_top_ops.shape[0] == cov_top_ops.shape[1]
        assert len(cov_top_ops.shape) == 2

        # run brute force set selection algorithm
        # only on the top ops
        max_subset, max_mi = compute_brute_force_sol(cov_top_ops, max_final_edges)

        # note that elements of max_subset are indices into top_ops only
        selected_edges = []
        for ind in max_subset:
            edge, op, alpha, edge_num = top_ops[ind]
            op_desc, _ = op.finalize()
            new_edge = EdgeDesc(op_desc, edge.input_ids)
            logger.info(f'selected edge: {edge_num}, op: {op_desc.name}')
            selected_edges.append(new_edge)

        # save diagnostic information to disk
        expdir = get_expdir()
        sns.heatmap(cov_top_ops, annot=True, fmt='.1g', cmap='coolwarm')
        savename = os.path.join(
            expdir, f'cell_{cell.desc.id}_node_{node_id}_cov.png')
        plt.savefig(savename)

        logger.info('')
        return NodeDesc(selected_edges, node_desc.conv_params)
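
finalize_node above restricts the covariance matrix to the retained ops with np.ix_, which selects the same row and column indices at once. A small sketch with a random symmetric matrix and hypothetical indices:

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.standard_normal((6, 6))
    cov = a @ a.T                        # symmetric PSD matrix, standing in for the op covariance
    cov_inds = [0, 2, 5]                 # hypothetical indices of the ops kept under consideration
    cov_top_ops = cov[np.ix_(cov_inds, cov_inds)]
    assert cov_top_ops.shape == (3, 3)
    print(cov_top_ops)
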
Example no. 5
    def search(self, conf_search: Config, model_desc_builder: ModelDescBuilder,
               trainer_class: TArchTrainer,
               finalizers: Finalizers) -> SearchResult:

        logger.pushd('search')

        # region config vars
        self.conf_search = conf_search
        conf_checkpoint = conf_search['checkpoint']
        resume = conf_search['resume']

        conf_post_train = conf_search['post_train']
        final_desc_foldername = conf_search['final_desc_foldername']

        conf_petridish = conf_search['petridish']

        # petridish distributed search related parameters
        self._convex_hull_eps = conf_petridish['convex_hull_eps']
        self._sampling_max_try = conf_petridish['sampling_max_try']
        self._max_madd = conf_petridish['max_madd']
        self._max_hull_points = conf_petridish['max_hull_points']
        self._checkpoints_foldername = conf_petridish['checkpoints_foldername']
        # endregion

        self._checkpoint = nas_utils.create_checkpoint(conf_checkpoint, resume)

        # parent models list
        self._hull_points: List[ConvexHullPoint] = []

        self._ensure_dataset_download(conf_search)

        # checkpoint will restore the hull we had
        is_restored = self._restore_checkpoint()

        # seed the pool with many seed models of different macro parameters
        # (number of cells, reductions, etc.) if the parent pool could not be
        # restored and/or this is the first time this job has been run
        future_ids = [] if is_restored else self._create_seed_jobs(
            conf_search, model_desc_builder)

        while not self._is_search_done():
            logger.info(f'Ray jobs running: {len(future_ids)}')

            if future_ids:
                # get first completed job
                job_id_done, future_ids = ray.wait(future_ids)

                hull_point = ray.get(job_id_done[0])

                logger.info(
                    f'Hull point id {hull_point.id} with stage {hull_point.job_stage.name} completed'
                )

                if hull_point.is_trained_stage():
                    self._update_convex_hull(hull_point)

                    # sample a point and search
                    sampled_point = sample_from_hull(self._hull_points,
                                                     self._convex_hull_eps,
                                                     self._sampling_max_try)

                    future_id = SearcherPetridish.search_model_desc_dist.remote(
                        self, conf_search, sampled_point, model_desc_builder,
                        trainer_class, finalizers, common.get_state())
                    future_ids.append(future_id)
                    logger.info(
                        f'Added sampled point {sampled_point.id} for search')
                elif hull_point.job_stage == JobStage.SEARCH:
                    # create the job to train the searched model
                    future_id = SearcherPetridish.train_model_desc_dist.remote(
                        self, conf_post_train, hull_point, common.get_state())
                    future_ids.append(future_id)
                    logger.info(
                        f'Added sampled point {hull_point.id} for post-search training'
                    )
                else:
                    raise RuntimeError(
                        f'Job stage "{hull_point.job_stage}" is not expected in search loop'
                    )

        # cancel any remaining jobs to free up GPUs for the eval phase
        for future_id in future_ids:
            ray.cancel(future_id,
                       force=True)  # without force, main process stops
            ray.wait([future_id])

        # plot and save the hull
        expdir = common.get_expdir()
        assert expdir
        plot_frontier(self._hull_points, self._convex_hull_eps, expdir)
        best_point = save_hull_frontier(self._hull_points,
                                        self._convex_hull_eps,
                                        final_desc_foldername, expdir)
        save_hull(self._hull_points, expdir)
        plot_pool(self._hull_points, expdir)

        # return best point as search result
        search_result = SearchResult(best_point.model_desc,
                                     search_metrics=None,
                                     train_metrics=best_point.metrics)
        self.clean_log_result(conf_search, search_result)

        logger.popd()

        return search_result
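
The search loop above relies on ray.wait with its default num_returns=1 to react to whichever job finishes first while the remaining jobs keep running. Below is a minimal sketch of that process-as-completed pattern, with toy tasks in place of the searcher's real jobs.

    import ray

    ray.init(ignore_reinit_error=True)

    @ray.remote
    def job(i: int) -> int:
        return i * i

    future_ids = [job.remote(i) for i in range(4)]
    while future_ids:
        done_ids, future_ids = ray.wait(future_ids)  # first completed job
        result = ray.get(done_ids[0])
        print(f'completed job result: {result}')     # follow-up jobs could be enqueued here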