Esempio n. 1
0
File: pba.py Progetto: zeyefkey/vega
 def _init_next_rung(self):
     """Init next rung to search.

     Every config first inherits its current policy for the next rung. The
     configs are then ranked by their best score on the current rung and the
     top-ranked ones overwrite the bottom-ranked ones: the checkpoint folder
     is copied, the config history is cloned and the cloned next-rung policy
     is perturbed through ``self.explore``. Finally the next rung's scores
     are reset and one waiting row per config is added to the board.

     If the next rung would reach ``self.total_rungs`` only ``self.rung_id``
     is advanced.
     """
     import copy  # local import: the file's import block is not visible here

     next_rung_id = self.rung_id + 1
     if next_rung_id >= self.total_rungs:
         self.rung_id = next_rung_id
         return
     for i in range(self.config_count):
         self.all_config_dict[i][next_rung_id] = self.all_config_dict[i][self.rung_id]
     # Rank (config_id, score) pairs by score, ascending: worst first.
     current_score = [(i, self.best_score_dict[self.rung_id][i])
                      for i in range(self.config_count)]
     current_score.sort(key=lambda pair: pair[1])
     # At most 4 pairs, but never more than half of the population: beyond
     # that a config would be paired with itself (its checkpoint folder would
     # be removed and then copied from the removed path), a lower-ranked
     # config would overwrite a higher-ranked one, or the index would run
     # past the ranking.
     exploit_count = min(4, self.config_count // 2)
     for i in range(exploit_count):
         better_id = current_score[self.config_count - 1 - i][0]
         worse_id = current_score[i][0]
         better_worker_result_path = FileOps.join_path(self.local_base_path, 'cache', 'pba',
                                                       str(better_id), 'checkpoint')
         FileOps.make_dir(better_worker_result_path)
         worse_worker_result_path = FileOps.join_path(self.local_base_path, 'cache', 'pba',
                                                      str(worse_id), 'checkpoint')
         FileOps.make_dir(worse_worker_result_path)
         # copytree needs a destination that does not exist yet.
         shutil.rmtree(worse_worker_result_path)
         shutil.copytree(better_worker_result_path, worse_worker_result_path)
         # Clone instead of aliasing: with a shared dict the explored policy
         # written below would also replace the better config's own policy.
         self.all_config_dict[worse_id] = copy.deepcopy(self.all_config_dict[better_id])
         policy_unchange = self.all_config_dict[worse_id][next_rung_id]
         policy_changed = self.explore(policy_unchange)
         self.all_config_dict[worse_id][next_rung_id] = policy_changed
     for config_id in range(self.config_count):
         self.best_score_dict[next_rung_id][config_id] = -1 * float('inf')
         tmp_row_data = {'config_id': config_id,
                         'rung_id': next_rung_id,
                         'status': StatusType.WAITTING}
         self._add_to_board(tmp_row_data)
     self.rung_id = next_rung_id
Esempio n. 2
0
    def search(self):
        """Mutate a code picked from the pareto front and build a model from it.

        A random entry of the ``Code`` column of ``pareto_front.csv`` is mutated
        until ``self.num_mutate`` effective changes have been applied; the
        result is stored in the custom search-space description and decoded.

        :return: the incremented sample counter and the network description
        """
        space = self.search_space
        desc = space.search_space.custom
        # Default location of the pareto front, unless the config overrides it.
        front_dir = FileOps.join_path(self.local_base_path, "result")
        cfg = space.cfg
        if 'pareto_folder' in cfg and cfg.pareto_folder is not None:
            front_dir = cfg.pareto_folder.replace("{local_base_path}", self.local_base_path)
        front_csv = FileOps.join_path(front_dir, "pareto_front.csv")
        code_to_mutate = random.choice(pd.read_csv(front_csv)['Code'])

        num_candidates = len(desc["candidates"])
        code_mutated = code_to_mutate
        applied = 0
        # Only mutations that actually change the code are counted.
        while applied < self.num_mutate:
            attempt = self.mutate_once(code_mutated, num_candidates)
            if attempt != code_mutated:
                code_mutated = attempt
                applied += 1

        logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
        desc['code'] = code_mutated
        desc['method'] = "mutate"
        self.codec.decode(desc)
        self.sample_count += 1
        return self.sample_count, NetworkDesc(space.search_space)
Esempio n. 3
0
 def _save_checkpoint(self, epoch):
     """Pickle the trainer's model and write a checkpoint for ``epoch``.

     The checkpoint holds the epoch number and the state dicts of the model,
     the optimizer and the lr scheduler. Both resulting file paths are
     recorded on the trainer.

     :param epoch: epoch number stored in the checkpoint
     """
     trainer = self.trainer
     logging.debug("Start Save Checkpoint, file_name=%s", trainer.checkpoint_file_name)
     ckpt_path = FileOps.join_path(trainer.get_local_worker_path(),
                                   trainer.checkpoint_file_name)
     logging.debug("Start Save Model, model_file=%s", trainer.model_pickle_file_name)
     pickle_path = FileOps.join_path(trainer.get_local_worker_path(),
                                     trainer.model_pickle_file_name)
     # full model object, pickled
     with open(pickle_path, 'wb') as handle:
         pickle.dump(trainer.model, handle, protocol=pickle.HIGHEST_PROTOCOL)
     # training state
     state = {}
     state['epoch'] = epoch
     state['weight'] = trainer.model.state_dict()
     state['optimizer'] = trainer.optimizer.state_dict()
     state['lr_scheduler'] = trainer.lr_scheduler.state_dict()
     torch.save(state, ckpt_path)
     trainer.checkpoint_file = ckpt_path
     trainer.model_path = pickle_path
Esempio n. 4
0
 def _save_best_model(self):
     """Save the model weights as ``best_model.pth`` and back up the worker folder.

     The weights go into the step folder below the local worker path. When
     the trainer has a backup base path, the whole local worker folder is
     copied to ``workers/<worker_id>`` beneath it.
     """
     trainer = self.trainer
     best_file = FileOps.join_path(trainer.get_local_worker_path(),
                                   trainer.step_name, "best_model.pth")
     FileOps.make_base_dir(best_file)
     torch.save(self.model.state_dict(), best_file)
     backup_root = trainer.backup_base_path
     if backup_root is None:
         return
     backup_dir = FileOps.join_path(backup_root, "workers", str(trainer.worker_id))
     FileOps.copy_folder(trainer.get_local_worker_path(), backup_dir)
Esempio n. 5
0
 def __init__(self):
     """Init HpoBase.

     Reads the policy from the config and derives the paths of the best-hps,
     cache and score-board files inside the step's output folder.
     """
     super(HpoGenerator, self).__init__(self.cfg)
     self.hpo = None
     self.policy = self.cfg.get('policy')
     self._hps_cache = {}
     step_dir = FileOps.join_path(self.local_output_path, self.cfg.step_name)
     self._best_hps_file, self._cache_file, self._board_file = [
         FileOps.join_path(step_dir, file_name)
         for file_name in ('best_hps.json', 'cache.csv', 'score_board.csv')
     ]
Esempio n. 6
0
 def _copy_needed_file(self):
     """Copy the initial pareto-front and random files into the step output folder.

     Both source paths come from the config items ``pareto_front_file`` and
     ``random_file``; the ``{local_base_path}`` placeholder in them is
     replaced by ``self.local_base_path``. The copies are stored as
     ``pareto_front.csv`` and ``random.csv`` in the step folder and their
     paths are kept on ``self.pareto_front_file`` / ``self.random_file``.

     :raises FileNotFoundError: if either config item is missing or None
     """
     if "pareto_front_file" not in self.cfg or self.cfg.pareto_front_file is None:
         # The message must name the config key exactly as the user has to spell it.
         raise FileNotFoundError("Config item pareto_front_file not found in config file.")
     init_pareto_front_file = self.cfg.pareto_front_file.replace("{local_base_path}", self.local_base_path)
     self.pareto_front_file = FileOps.join_path(self.local_output_path, self.cfg.step_name, "pareto_front.csv")
     # Both target files live in the same folder, so it is created once here.
     FileOps.make_base_dir(self.pareto_front_file)
     FileOps.copy_file(init_pareto_front_file, self.pareto_front_file)
     if "random_file" not in self.cfg or self.cfg.random_file is None:
         raise FileNotFoundError("Config item random_file not found in config file.")
     init_random_file = self.cfg.random_file.replace("{local_base_path}", self.local_base_path)
     self.random_file = FileOps.join_path(self.local_output_path, self.cfg.step_name, "random.csv")
     FileOps.copy_file(init_random_file, self.random_file)
Esempio n. 7
0
    def save_metrics_value(self):
        """Append the metric values of the trained model to the step's csv file.

        One row with the bit-width encoding, the flops, the parameter count and
        the metric value is written to ``performace.csv`` in the step folder;
        the header row is only written when the file is created. When the
        trainer has a backup base path, the local output folder is copied
        there afterwards.
        """
        trainer = self.trainer
        csv_path = FileOps.join_path(trainer.local_output_path,
                                     trainer.step_name, "performace.csv")
        FileOps.make_base_dir(csv_path)
        row = [self.model.nbit_w_list + self.model.nbit_a_list,
               self.flops_count, self.params_count, self.metric]
        header = ["encoding", "flops", "parameters",
                  self.cfg.get("valid_metric", "acc")]
        df = pd.DataFrame([row], columns=header)
        # Create the file with a header, or append a bare row to an existing one.
        is_new_file = not os.path.exists(csv_path)
        with open(csv_path, "w" if is_new_file else "a") as file:
            df.to_csv(file, index=False, header=is_new_file)
        if trainer.backup_base_path is not None:
            FileOps.copy_folder(trainer.local_output_path, trainer.backup_base_path)
Esempio n. 8
0
    def _get_performance(self, step_name, worker_id):
        """Read the performance values a worker stored in performance.txt.

        Every non-blank line of the file is parsed as JSON; when a line holds a
        list, only its first element is kept.

        :param step_name: step name in the pipeline.
        :type step_name: str.
        :param worker_id: the worker's worker id.
        :type worker_id: str.
        :return: parsed values, one per non-blank line; empty if the file is missing
        :rtype: list

        """
        perf_path = FileOps.join_path(self.get_local_worker_path(step_name, worker_id), "performance.txt")
        if not os.path.isfile(perf_path):
            logger.info("Performance file is not exited, file={}".format(perf_path))
            return []
        performance = []
        with open(perf_path, 'r') as f:
            for raw in f:
                text = raw.strip()
                if text == "":
                    continue
                value = json.loads(text)
                performance.append(value[0] if isinstance(value, list) else value)
        logger.info("performance={}".format(performance))
        return performance
Esempio n. 9
0
 def __init__(self, **kwargs):
     """Construct the Imagenet class.

     The image folder is the ``train`` or ``val`` sub-folder of the configured
     data path, chosen by ``self.train``.
     """
     Dataset.__init__(self, **kwargs)
     subset = 'val'
     if self.train:
         subset = 'train'
     image_root = FileOps.join_path(self.args.data_path, subset)
     pipeline = Compose(self.transforms.__transform__)
     ImageFolder.__init__(self, root=image_root, transform=pipeline)
Esempio n. 10
0
 def __init__(self, search_space):
     """Init PruneEA from its policy and prepare the output locations.

     :param search_space: search space handed to the base class and the codec
     """
     super(PruneEA, self).__init__(search_space)
     # values taken from the policy
     policy = self.policy
     self.length = policy.length
     self.num_individual = policy.num_individual
     self.num_generation = policy.num_generation
     self.random_models = policy.random_models
     # columns used as the two objectives
     self.x_axis, self.y_axis = 'flops', 'acc'
     self.codec = Codec(self.cfg.codec, search_space)
     # progress counters
     self.random_count = self.ea_count = self.ea_epoch = 0
     # files and folders inside the step's output folder
     step_dir = FileOps.join_path(self.local_output_path, self.cfg.step_name)
     self.step_path = step_dir
     self.pd_file_name = FileOps.join_path(step_dir, "performance.csv")
     self.pareto_front_file = FileOps.join_path(step_dir, "pareto_front.csv")
     self.pd_path = FileOps.join_path(step_dir, "pareto_front")
     FileOps.make_dir(self.pd_path)
Esempio n. 11
0
 def get_pareto_list_size(self):
     """Get the number of entries in the pareto list.

     Reads ``pareto_front.csv`` from the ``result`` folder below
     ``self.local_base_path``.

     :return: number of rows in the pareto front file, 0 if the file does not exist
     :rtype: int
     """
     pareto_file_locate = FileOps.join_path(self.local_base_path, "result",
                                            "pareto_front.csv")
     if not os.path.exists(pareto_file_locate):
         return 0
     pareto_front_df = pd.read_csv(pareto_file_locate)
     # len() counts rows; DataFrame.size would count rows * columns and
     # overstate the list size for any file with more than one column.
     return len(pareto_front_df)
Esempio n. 12
0
    def save_backup(self, performance):
        """Write the performance file and copy the worker folder to the backup path.

        Nothing is done when no backup base path is set.

        :param performance: validated performance
        :type performance: float, list or dict
        """
        if self.backup_base_path is not None:
            performance_path = os.path.join(self.get_local_worker_path(), 'performance.txt')
            with open(performance_path, 'w') as f:
                f.write("{}".format(performance))
            destination = FileOps.join_path(self.backup_base_path, self.get_worker_subpath())
            FileOps.copy_folder(self.get_local_worker_path(), destination)
Esempio n. 13
0
    def update(self, record):
        """Update the score board and refresh the cached checkpoint of the worker.

        After the base-class update, the worker's local result folder is copied
        to ``cache/pba/<worker_id>/checkpoint`` below the local base path,
        replacing whatever was there.

        :param record: record providing ``worker_id`` and ``step_name`` via ``get``
        """
        super().update(record)
        step_name = record.get('step_name')
        config_id = str(record.get('worker_id'))
        source_dir = self.get_local_worker_path(step_name, config_id)
        cache_dir = FileOps.join_path(self.local_base_path, 'cache', 'pba', config_id, 'checkpoint')
        for folder in (source_dir, cache_dir):
            FileOps.make_dir(folder)
        # copytree needs a destination that does not exist yet
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        shutil.copytree(source_dir, cache_dir)
Esempio n. 14
0
 def _save_model_desc(self):
     """Save final model desc of NAS.

     Each entry of the ``encoding`` column of the step's ``pareto_front.csv``
     is decoded into a backbone description and handed to the trainer
     together with its row position. Nothing happens if the file is missing.
     """
     trainer = self.trainer
     front_csv = FileOps.join_path(trainer.local_output_path,
                                   trainer.step_name, "pareto_front.csv")
     if FileOps.exists(front_csv):
         with open(front_csv, "r") as file:
             front = pd.read_csv(file)
         encodings = front["encoding"].tolist()
         search_space = SearchSpace()
         codec = QuantCodec('QuantCodec', search_space)
         for index, text in enumerate(encodings):
             # drop the first and last character, then read the comma separated ints
             bits = list(map(int, text[1:-1].split(',')))
             model_desc = Config()
             model_desc.modules = search_space.search_space.modules
             model_desc.backbone = codec.decode(bits)._desc.backbone
             trainer.output_model_desc(index, model_desc)
Esempio n. 15
0
 def get_pareto_front(self):
     """Select the pareto front from the recorded performance file.

     The selected rows are additionally written to a per-epoch csv file and
     to the pareto front file whenever ``self.ea_count`` is a multiple of
     ``self.num_individual``; the epoch counter is advanced in that case.

     :return: the selected rows of the performance table
     """
     with open(self.pd_file_name, "r") as file:
         records = pd.read_csv(file)
     objectives = records[[self.x_axis, self.y_axis]].values.transpose()
     # acc2error
     objectives[1, :] = 1 - objectives[1, :]
     _unused_a, _unused_b, selected = SortAndSelectPopulation(objectives, self.num_individual)
     front = records.loc[selected, :]
     if self.ea_count % self.num_individual != 0:
         return front
     epoch_csv = FileOps.join_path(self.pd_path, "{}_epoch.csv".format(str(self.ea_epoch)))
     for target in (epoch_csv, self.pareto_front_file):
         with open(target, "w") as file:
             front.to_csv(file, index=False)
     self.ea_epoch += 1
     return front
Esempio n. 16
0
 def _get_performance(self, step_name, worker_id):
     """Load the performance values a worker wrote to performance.txt.

     Blank lines are skipped, every other line is parsed as JSON, and a line
     holding a list contributes only its first element.

     :param step_name: step name used to locate the worker folder
     :param worker_id: worker id used to locate the worker folder
     :return: list of parsed values, empty if the file does not exist
     """
     def _first_value(text):
         """Parse one line and unwrap a list to its first element."""
         value = json.loads(text)
         return value[0] if isinstance(value, list) else value

     performance_file = FileOps.join_path(
         self.get_local_worker_path(step_name, worker_id), "performance.txt")
     if os.path.isfile(performance_file):
         with open(performance_file, 'r') as f:
             stripped = (line.strip() for line in f.readlines())
             performance = [_first_value(text) for text in stripped if text != ""]
         logging.info("performance={}".format(performance))
         return performance
     logging.info("Performance file is not exited, file={}".format(
         performance_file))
     return []
Esempio n. 17
0
 def search(self):
     """Ask the hpo for a proposal and turn it into a worker description.

     :return: None when the hpo proposes nothing, otherwise a dict with the
         worker id, the hyper parameters (``desc``) and the rung id (``info``)
     """
     proposal = self.hpo.propose()
     if proposal is None:
         return None
     proposal = copy.deepcopy(proposal)
     config_id = proposal.get('config_id')
     transformer = {'type': 'PBATransformer',
                    'para_array': proposal.get('configs'),
                    'operation_names': self.operation_names}
     hps = {'dataset.transforms': [transformer]}
     rung = proposal.get('rung_id')
     ckpt_dir = FileOps.join_path(self.local_base_path, 'cache', 'pba', str(config_id), 'checkpoint')
     FileOps.make_dir(ckpt_dir)
     if os.path.exists(ckpt_dir):
         hps['trainer.checkpoint_path'] = ckpt_dir
     if 'epoch' in proposal:
         hps['trainer.epochs'] = proposal.get('epoch')
     return dict(worker_id=config_id, desc=hps, info=rung)
Esempio n. 18
0
    def update_performance(self, hps, performance):
        """Record the score in the hpo and refresh the cached checkpoint.

        The first element of ``performance`` is used as score; when
        ``performance`` is not a non-empty list the score is -1 and an error is
        logged. Afterwards the worker's local result folder is copied to
        ``cache/pba/<config_id>/checkpoint`` below the local base path.

        :param hps: hyper parameters need to update
        :param performance:  trainer performance
        """
        has_score = isinstance(performance, list) and len(performance) > 0
        score = performance[0] if has_score else -1
        self.hpo.add_score(int(hps.get('config_id')), int(hps.get('rung_id')), score)
        if not has_score:
            logging.error("hpo get empty performance!")
        config_name = str(hps.get('config_id'))
        source_dir = self.get_local_worker_path(self.cfg.step_name, config_name)
        cache_dir = FileOps.join_path(self.local_base_path, 'cache', 'pba',
                                      config_name, 'checkpoint')
        for folder in (source_dir, cache_dir):
            FileOps.make_dir(folder)
        # copytree needs a destination that does not exist yet
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        shutil.copytree(source_dir, cache_dir)
Esempio n. 19
0
    def _new_model_init(self, model_prune):
        """Build the initial model that carries the pruned model's encoding.

        If the configured init model file contains a ``:``, it is first copied
        into the local worker folder and the config is pointed at that copy.
        The network description is a deep copy of ``self.base_net_desc`` with
        the backbone channels reset to their base values.

        :param model_prune: searched pruned model
        :type model_prune: torch.nn.Module
        :return: model created from the adjusted network description
        :rtype: torch.nn.Module
        """
        model_file = self.config.init_model_file
        if ":" in model_file:
            worker_dir = self.trainer.get_local_worker_path()
            local_copy = FileOps.join_path(worker_dir, os.path.basename(model_file))
            FileOps.copy_file(model_file, local_copy)
            self.config.init_model_file = local_copy
        desc = copy.deepcopy(self.base_net_desc)
        backbone = desc.backbone
        backbone.chn = backbone.base_chn
        backbone.chn_node = backbone.base_chn_node
        backbone.encoding = model_prune.encoding
        return NetworkDesc(desc).to_model()
Esempio n. 20
0
    def __init__(self, search_space=None):
        """Init SpNas from its config and, if configured, a previous search result.

        When the config has a ``last_search_result`` item, that csv is loaded,
        an architecture is picked from it through ``select_from_remote`` and
        stored in ``self.init_code``. With ``regnition`` enabled the codec's
        config template is additionally pointed at the matching
        ``<arch>_imagenet.pth`` file.

        :param search_space: search space handed to the base class and the codec
        """
        super(SpNas, self).__init__(search_space)
        cfg = self.cfg
        self.search_space = search_space
        self.codec = Codec(cfg.codec, search_space)
        self.sample_level = cfg.sample_level
        self.max_sample = cfg.max_sample
        self.max_optimal = cfg.max_optimal
        self._total_list_name = cfg.total_list
        self.serial_settings = cfg.serial_settings

        self._total_list = ListDict()
        self.sample_count = 0
        self.init_code = None
        step_output_path = FileOps.join_path(self.local_output_path, cfg.step_name)

        if 'last_search_result' in cfg:
            result_name = cfg.last_search_result
            assert FileOps.exists(os.path.join(step_output_path, result_name)), \
                "Not found serial results!"
            previous = ListDict.load_csv(os.path.join(self.local_output_path, result_name))
            pre_worker_id, pre_arch = self.select_from_remote(self.max_optimal, previous)
            # re-write config template
            if cfg.regnition:
                template = self.codec.config_template
                template['model']['backbone']['reignition'] = True
                weight_name = pre_arch + '_imagenet.pth'
                assert FileOps.exists(os.path.join(step_output_path, weight_name)), \
                    "Not found {} pretrained .pth file!".format(pre_arch)
                template['model']['pretrained'] = os.path.join(self.local_output_path, weight_name)
                pre_worker_id = -1
            # update config template
            self.init_code = dict(arch=pre_arch,
                                  pre_arch=pre_arch.split('_')[1],
                                  pre_worker_id=pre_worker_id)

        logging.info("inited SpNas {}-level search...".format(self.sample_level))
Esempio n. 21
0
    def update(self, step_name, worker_id):
        """Read a worker's performance and store it in the hpo score board.

        Workers without an entry in the hps cache are only reported as an
        error. Otherwise the cached performance is replaced, the hpo is
        updated, cache, score board and best result are saved and, when backup
        is enabled, the local output folder is copied to the backup location.

        :param step_name: step name in pipeline
        :param worker_id: worker id of worker

        """
        worker_id = str(worker_id)
        performance = self._get_performance(step_name, worker_id)
        if worker_id not in self._hps_cache:
            logger.error("worker_id not in hps_cache.")
            return
        cached = self._hps_cache[worker_id]
        hps = cached[0]
        cached[1] = copy.deepcopy(performance)
        logging.info("get hps need to update, worker_id=%s, hps=%s", worker_id, str(hps))
        self.update_performance(hps, performance)
        logging.info("hpo_id=%s, hps=%s, performance=%s", worker_id, str(hps), str(performance))
        for save in (self._save_hpo_cache, self._save_score_board, self._save_best):
            save()
        if self.need_backup and self.backup_base_path is not None:
            backup_dir = FileOps.join_path(self.backup_base_path, self.output_subpath)
            FileOps.copy_folder(self.local_output_path, backup_dir)
        logger.info("Hpo update finished.")
Esempio n. 22
0
    def sample(self):
        """Ask the hpo for a proposal and turn it into an id and hyper parameters.

        The proposal is remembered in the hps cache under its config id.

        :return: id, hps; ``(None, None)`` when the hpo proposes nothing
        :rtype: int, dict
        """
        proposal = self.hpo.propose()
        if proposal is None:
            return None, None
        proposal = copy.deepcopy(proposal)
        config_id = proposal.get('config_id')
        self._hps_cache[str(config_id)] = [copy.deepcopy(proposal), []]
        transformer = {'type': 'PBATransformer',
                       'para_array': proposal.get('configs'),
                       'operation_names': self.operation_names}
        hps = {'dataset.transforms': [transformer]}
        ckpt_dir = FileOps.join_path(self.local_base_path, 'cache', 'pba', str(config_id), 'checkpoint')
        FileOps.make_dir(ckpt_dir)
        if os.path.exists(ckpt_dir):
            hps['trainer.checkpoint_path'] = ckpt_dir
        if 'epoch' in proposal:
            hps['trainer.epochs'] = proposal.get('epoch')
        return config_id, hps
Esempio n. 23
0
    def _load_tf_model(self, model_prune, model_init, chn_node_mask):
        """Load tensorflow pretrained model.

        Restores the graph and weights of ``self.config.init_model_file`` into
        a new session, multiplies the matched variables by 0/1 masks derived
        from the channel masks, and saves the result as ``prune_model`` in the
        local worker folder.

        :param model_prune: pruned model; only its ``chn_mask`` is read here
        :param model_init: returned unchanged
        :param chn_node_mask: masks indexed by layer number
            (presumably one 0/1 entry per channel -- TODO confirm against caller)
        :return: ``model_init``
        """
        with tf.Session(config=self.trainer._init_session_config()) as sess:
            saver = tf.train.import_meta_graph("{}.meta".format(
                self.config.init_model_file))
            saver.restore(sess, self.config.init_model_file)
            # NOTE(review): chn_node_id and chn_id are assigned but never read below.
            chn_node_id = 0
            chn_id = 0
            chn_mask = model_prune.chn_mask
            start_mask = []
            end_mask = []

            # All graph variables except those whose name ends with 'Momentum:0'.
            all_weight = tf.get_collection(tf.GraphKeys.VARIABLES)
            all_weight = [
                t for t in all_weight if not t.name.endswith('Momentum:0')
            ]
            # Variables are handled in collection order; the bn and dense
            # branches reuse the end_mask left behind by an earlier iteration.
            for op in all_weight:
                name = op.name
                if name.startswith('conv_1'):
                    end_mask = chn_node_mask[0]
                    end_mask = np.asarray(end_mask)
                    # idx1: positions where (1 - end_mask) is non-zero
                    idx1 = np.squeeze(
                        np.argwhere(
                            np.asarray(np.ones(end_mask.shape) - end_mask)))
                    mask = np.ones(op.get_shape())
                    # zero those positions along the last axis
                    mask[:, :, :, idx1.tolist()] = 0
                    sess.run(
                        tf.assign(op, op * tf.constant(mask, dtype=op.dtype)))
                elif name.startswith('bn_1'):
                    idx1 = np.squeeze(
                        np.argwhere(
                            np.asarray(np.ones(end_mask.shape) - end_mask)))
                    mask = np.ones(op.get_shape())
                    mask[idx1.tolist()] = 0
                    sess.run(
                        tf.assign(op, op * tf.constant(mask, dtype=op.dtype)))
                elif name.startswith('dense/kernel'):
                    idx1 = np.squeeze(
                        np.argwhere(
                            np.asarray(np.ones(end_mask.shape) - end_mask)))
                    mask = np.ones(op.get_shape())
                    # zero those positions along the first axis
                    mask[idx1.tolist(), :] = 0
                    sess.run(
                        tf.assign(op, op * tf.constant(mask, dtype=op.dtype)))
                elif name.startswith('layer'):
                    parsed_name = list(name.split('/'))
                    # Only the last character of each name component is used as index.
                    layer_idx = parsed_name[0][-1]
                    block_idx = parsed_name[1][-1]
                    operation = parsed_name[2]
                    if operation.startswith('conv'):
                        if operation == 'conv_1':
                            start_mask = chn_node_mask[int(layer_idx) - 1]
                            end_mask = chn_mask[int(block_idx)]
                        if operation == 'conv_2':
                            # uses the end_mask of the previously handled variable as input mask
                            start_mask = end_mask
                            end_mask = chn_node_mask[int(layer_idx)]
                        # shortcut
                        if operation == 'conv_3':
                            start_mask = chn_node_mask[int(layer_idx) - 1]
                            end_mask = chn_node_mask[int(layer_idx)]
                        start_mask = np.asarray(start_mask)
                        end_mask = np.asarray(end_mask)
                        # idx0 / idx1: positions where (1 - mask) is non-zero
                        idx0 = np.squeeze(
                            np.argwhere(
                                np.asarray(
                                    np.ones(start_mask.shape) - start_mask)))
                        idx1 = np.squeeze(
                            np.argwhere(
                                np.asarray(np.ones(end_mask.shape) -
                                           end_mask)))
                        mask = np.ones(op.get_shape())
                        # start mask applies to the third axis, end mask to the last axis
                        mask[:, :, idx0.tolist(), :] = 0
                        mask[:, :, :, idx1.tolist()] = 0
                        sess.run(
                            tf.assign(op,
                                      op * tf.constant(mask, dtype=op.dtype)))
                    elif operation.startswith('bn'):
                        idx1 = np.squeeze(
                            np.argwhere(
                                np.asarray(np.ones(end_mask.shape) -
                                           end_mask)))
                        mask = np.ones(op.get_shape())
                        mask[idx1.tolist()] = 0
                        sess.run(
                            tf.assign(op,
                                      op * tf.constant(mask, dtype=op.dtype)))
            # Store the masked weights next to the other worker outputs.
            save_file = FileOps.join_path(self.trainer.get_local_worker_path(),
                                          'prune_model')
            saver.save(sess, save_file)
            return model_init