Example no. 1
0
    def save_results(self):
        """Save the results of evolution contains the information of pupulation and elitism."""
        step_name = Config(deepcopy(UserConfig().data)).general.step_name
        _path = FileOps.join_path(self.local_output_path, step_name)
        FileOps.make_dir(_path)
        arch_file = FileOps.join_path(_path, 'arch.txt')
        arch_child = FileOps.join_path(_path, 'arch_child.txt')
        sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
        sel_arch = []
        with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
            writer_a = csv.writer(fw_a, lineterminator='\n')
            writer_ac = csv.writer(fw_ac, lineterminator='\n')
            writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
            for c in range(self.individual_num):
                writer_ac.writerow(
                    self._log_data(net_info_type='active_only', pop=self.pop[c],
                                   value=self.pop[c].fitness))

            writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
            for c in range(self.elitism_num):
                writer_a.writerow(self._log_data(net_info_type='active_only',
                                                 pop=self.elitism[c],
                                                 value=self.elit_fitness[c]))
                sel_arch.append(self.elitism[c].gene)
        sel_arch = np.stack(sel_arch)
        np.save(sel_arch_file, sel_arch)
        if self.backup_base_path is not None:
            FileOps.copy_folder(self.local_output_path, self.backup_base_path)
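
A minimal read-back sketch (not part of the original code): the directory below is hypothetical, and allow_pickle may or may not be needed depending on the gene dtype.

# Hypothetical paths; mirrors the files written by save_results above.
import numpy as np

sel_arch = np.load('output/nas/selected_arch.npy', allow_pickle=True)
print(sel_arch.shape)              # one stacked row per elitism gene
with open('output/nas/arch.txt') as f:
    print(f.read())                # CSV rows appended per population iteration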
Example no. 2
0
    def _save_performance(self, performance, model_desc=None):
        """Save result of the model, and calculate pareto front.

        :param performance: The dict that contains all the result needed
        :param model_desc: config of the model
        """
        performance_str = json.dumps(performance, indent=4, sort_keys=True)
        self.trainer._save_performance(performance_str)
        method = model_desc.method
        code = model_desc.code
        metric_method = self.cfg.metric.method
        FileOps.make_dir(self.result_path)
        result_file_name = FileOps.join_path(self.result_path,
                                             "{}.csv".format(method))
        header = "Code,GFlops,KParams,{0},Best {0},Worker_id\n".format(
            metric_method)
        if not os.path.exists(result_file_name):
            with open(result_file_name, 'w') as file:
                file.write(header)
        with open(result_file_name, 'a') as file:
            file.write('{},{},{},{},{},{}\n'.format(
                code, performance['gflops'], performance['kparams'],
                performance["cur_valid_perf"], performance["best_valid_perf"],
                self.trainer.worker_id))
        logging.info("Model result saved to {}".format(result_file_name))
        self._save_pareto_front("GFlops", "Best {}".format(metric_method))
Example no. 3
0
def dump_model_visual_info(trainer, epoch, model, inputs):
    """Dump model to tensorboard event files.

    :param trainer: trainer.
    :type trainer: object inherited from DistributedWorker.
    :param epoch: current epoch.
    :type epoch: int.
    :param model: model.
    :type model: model.
    :param inputs: input data.
    :type inputs: data.

    """
    (_, visual, interval, title, worker_id,
     output_path) = _get_trainer_info(trainer)
    if visual is not True:
        return
    if epoch % interval != 0:
        return
    title = str(worker_id)
    _path = FileOps.join_path(output_path, title)
    FileOps.make_dir(_path)
    try:
        with SummaryWriter(_path) as writer:
            writer.add_graph(model, (inputs, ))
    except Exception as e:
        logging.error(
            "Failed to dump model visual info, worker id: {}, epoch: {}, error: {}"
            .format(worker_id, epoch, str(e)))
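
A standalone sketch of the same add_graph call with a toy model, assuming SummaryWriter comes from torch.utils.tensorboard (the original may import it from tensorboardX instead); the log directory is arbitrary.

# Toy model and arbitrary log directory, for illustration only.
import torch
from torch.utils.tensorboard import SummaryWriter

toy_model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())
dummy_input = torch.randn(1, 3, 32, 32)
with SummaryWriter('runs/worker_0') as writer:
    writer.add_graph(toy_model, (dummy_input,))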
Example no. 4
0
    def _save_checkpoint(self, epoch, best=False):
        """Save model weights.

        :param epoch: current epoch
        :type epoch: int
        :param best: whether to additionally save the weights as the best model so far
        :type best: bool
        """
        save_dir = os.path.join(self.worker_path, str(epoch))
        FileOps.make_dir(save_dir)
        for name in self.model.model_names:
            if isinstance(name, str):
                save_filename = '%s_net_%s.pth' % (epoch, name)
                save_path = FileOps.join_path(save_dir, save_filename)
                net = getattr(self.model, 'net' + name)
                best_file = FileOps.join_path(self.worker_path,
                                              "model_{}.pth".format(name))
                if self.cfg.cuda and torch.cuda.is_available():
                    # torch.save(net.module.cpu().state_dict(), save_path)
                    torch.save(net.module.state_dict(), save_path)
                    # net.cuda()
                    if best:
                        torch.save(net.module.state_dict(), best_file)
                else:
                    torch.save(net.cpu().state_dict(), save_path)
                    if best:
                        torch.save(net.cpu().state_dict(), best_file)
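
A minimal sketch of restoring weights saved in the "{epoch}_net_{name}.pth" layout above; the toy network, epoch number, and net name "G" are illustrative, not from the original trainer.

# Toy stand-in for one of the trainer's nets; in practice load into the same architecture used when saving.
import torch
import torch.nn as nn

netG = nn.Sequential(nn.Conv2d(3, 8, 3))
torch.save(netG.state_dict(), '10_net_G.pth')                        # mirrors the save above
netG.load_state_dict(torch.load('10_net_G.pth', map_location='cpu'))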
Example no. 5
0
    def before_train(self, logs=None):
        """Be called before the whole train process."""
        self.trainer.config.call_metrics_on_train = False
        self.cfg = self.trainer.config
        self.worker_id = self.trainer.worker_id
        self.local_base_path = self.trainer.local_base_path
        self.local_output_path = self.trainer.local_output_path

        self.result_path = FileOps.join_path(self.trainer.local_base_path,
                                             "result")
        FileOps.make_dir(self.result_path)
        self.logger_patch()
Example no. 6
0
 def copy_pareto_output(self, step_name=None, worker_ids=None):
     """Copy pareto-related files from worker to output."""
     worker_ids = worker_ids or []
     taskops = TaskOps()
     local_output_path = os.path.join(taskops.local_output_path, step_name)
     if not (step_name and os.path.exists(local_output_path)):
         return
     for worker_id in worker_ids:
         dest_dir = os.path.join(local_output_path, str(worker_id))
         FileOps.make_dir(dest_dir)
         local_worker_path = taskops.get_worker_subpath(
             step_name, str(worker_id))
         src_dir = FileOps.join_path(taskops.local_base_path,
                                     local_worker_path)
         copy_search_file(src_dir, dest_dir)
Example no. 7
0
    def _save_performance(self, results):
        """Save performance into performance.pkl and save checkpoint to output_dir.

        :param results: performance results
        :type results: dict
        """
        logging.info("performance=%s", str(results))
        performance_dir = os.path.join(self.worker_path, 'performance')
        FileOps.make_dir(performance_dir)
        FileOps.dump_pickle(results,
                            os.path.join(performance_dir, 'performance.pkl'))
        logging.info("performance save to %s", performance_dir)
        # copy pth to output dir
        output_dir = os.path.join(self.output_path, str(self._worker_id))
        FileOps.make_dir(output_dir)
        shutil.copy(
            os.path.join(self.worker_path, 'latest.pth'),
            os.path.join(output_dir, results['arch'].split('_')[1] + '.pth'))
        logging.info("Latest checkpoint save to %s", output_dir)
Example no. 8
0
 def before_train(self, logs=None):
     """Be called before the training process."""
     self.cfg = self.trainer.cfg
     self.trainer.auto_save_ckpt = False
     self.trainer.auto_save_perf = False
     self.worker_id = self.trainer.worker_id
     self.local_base_path = self.trainer.local_base_path
     self.local_output_path = self.trainer.local_output_path
     self.result_path = FileOps.join_path(self.local_output_path,
                                          self.cfg.step_name)
     FileOps.make_dir(self.result_path)
     count_input = torch.FloatTensor(1, 3, 192, 192).cuda()
     flops_count, params_count = calc_model_flops_params(
         self.trainer.model, count_input)
     GFlops, KParams = flops_count * 1e-9, params_count * 1e-3
     logger.info("Flops: {:.2f} G, Params: {:.1f} K".format(
         GFlops, KParams))
     if GFlops > 0.6:
         logger.info("Flop too large!")
         self.trainer.skip_train = True
     self._copy_needed_file()
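
As a cross-check of the KParams figure, parameters can also be counted with plain PyTorch; this sketch uses a toy model rather than the project's calc_model_flops_params helper.

# Toy model; counts parameters directly instead of using calc_model_flops_params.
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.Conv2d(16, 32, 3))
params_count = sum(p.numel() for p in model.parameters())
print("Params: {:.1f} K".format(params_count * 1e-3))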
Example no. 9
0
 def _output_records(self,
                     step_name,
                     records,
                     desc=True,
                     weights_file=False,
                     performance=False):
     """Dump records."""
     columns = ["worker_id", "performance", "desc"]
     outputs = []
     for record in records:
         record = record.serialize()
         _record = {}
         for key in columns:
             _record[key] = record[key]
         outputs.append(deepcopy(_record))
     data = pd.DataFrame(outputs)
     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     _file = FileOps.join_path(step_path, "output.csv")
     try:
         data.to_csv(_file, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(_file))
     for record in outputs:
         worker_id = record["worker_id"]
         worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
         outputs_globs = []
         if desc:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "desc_*.json"))
         if weights_file:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "model_*.pth"))
         if performance:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "performance_*.json"))
         for _file in outputs_globs:
             FileOps.copy_file(_file, step_path)
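
For illustration, the output.csv written above boils down to a DataFrame built from the serialized records; the record values here are made up.

# Made-up record values; shows only the DataFrame/to_csv step used above.
import pandas as pd

outputs = [{"worker_id": 1, "performance": {"accuracy": 0.91}, "desc": {"type": "ResNet"}}]
pd.DataFrame(outputs).to_csv("output.csv", index=False)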
Example no. 10
0
    def build(self,
              model=None,
              optimizer=None,
              loss=None,
              lr_scheduler=None,
              metrics=None,
              hps=None,
              callbacks=None,
              train_loader=None,
              valid_loader=None,
              make_batch=None,
              train_step=None,
              valid_step=None,
              load_ckpt_flag=False,
              checkpoint_file_name="weights.pth",
              model_pickle_file_name="model.pkl",
              performance_file_name="performance.txt"):
        """Build the trainer by assembling the necessary components."""
        # Initialize hyperparameters from parameters or configurations
        self.checkpoint_file_name = checkpoint_file_name
        self.model_pickle_file_name = model_pickle_file_name
        self.performance_file_name = performance_file_name

        self._init_cuda_setting()
        self._init_hps(hps)

        self.do_validation = self.cfg.with_valid
        self.model = self._init_model(model)
        self.load_ckpt_flag = load_ckpt_flag
        if self.load_ckpt_flag:
            self.load_checkpoint()
        else:
            self._load_pretrained_model()
        if self.model is not None and self.use_cuda:
            self.model = self.model.cuda()

        self.use_syncbn = self.cfg.get('syncbn', False)
        if self.use_syncbn:
            self.model = apex.parallel.convert_syncbn_model(self.model)
        self.optimizer = self._init_optimizer(optimizer)
        self.loss = self._init_loss(loss)
        self.lr_scheduler = self._init_lr_scheduler(lr_scheduler)
        # Some trainers use a different batch size for train and valid
        self.train_metrics = self._init_metrics(metrics)
        self.valid_metrics = self._init_metrics(metrics)
        self.train_loader = self._init_dataloader(mode='train',
                                                  loader=train_loader)
        self.valid_loader = self._init_dataloader(mode='test',
                                                  loader=valid_loader)
        self._init_horovod_setting()
        self.use_amp = self.cfg.get('amp', False)
        if self.use_amp:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
        if self.callbacks is None:
            self.callbacks = callbacks
        self._init_step_functions(make_batch, train_step, valid_step)
        # self.output_model_desc()
        cur_working_dir = FileOps.join_path(self.local_output_path,
                                            self.step_name)
        FileOps.make_dir(cur_working_dir)
        # Make sure Trainer has been built for training
        self.has_built = True
Example no. 11
0
 def performance_path(self, worker_result_path):
     """Get performance path."""
     performance_dir = os.path.join(worker_result_path, 'performance')
     if not os.path.exists(performance_dir):
         FileOps.make_dir(performance_dir)
     return os.path.join(performance_dir, 'performance.pkl')
Example no. 12
0
    def build(self,
              model=None,
              optimizer=None,
              loss=None,
              lr_scheduler=None,
              metrics=None,
              hps=None,
              callbacks=None,
              train_loader=None,
              valid_loader=None,
              make_batch=None,
              train_step=None,
              valid_step=None,
              model_fn=None,
              train_input_fn=None,
              valid_input_fn=None,
              load_ckpt_flag=False,
              checkpoint_file_name="checkpoint.pth",
              model_pickle_file_name="model.pkl"):
        """Build the trainer by assembling the necessary components."""
        # Initialize hyperparameters from parameters or configurations
        self._init_hps(hps)
        logging.debug("Trainer Config: {}".format(obj2config(self.config)))
        self.checkpoint_file_name = checkpoint_file_name
        self.model_pickle_file_name = model_pickle_file_name
        if vega.is_torch_backend():
            self._init_step_functions(make_batch, train_step, valid_step)
        elif vega.is_tf_backend():
            self._init_estimator_fn(model_fn, train_input_fn, valid_input_fn)
        self._init_tf_session()
        self._init_distributed_setting()
        self._init_cuda_setting()
        self._init_tf_estimator()
        self.do_validation = self.config.with_valid
        self.model = self._init_model(model)
        self.load_ckpt_flag = load_ckpt_flag
        if self.load_ckpt_flag:
            self.load_checkpoint()
        else:
            self._load_pretrained_model()
        self.use_syncbn = self.config.syncbn
        if self.use_syncbn and vega.is_torch_backend():
            self.model = apex.parallel.convert_syncbn_model(self.model)
        self.train_loader = self._init_dataloader(mode='train',
                                                  loader=train_loader)
        self.valid_loader = self._init_dataloader(mode='val',
                                                  loader=valid_loader)
        if vega.is_torch_backend():
            self.optimizer = Optimizer()(model=self.model, distributed=self.distributed) \
                if optimizer is None else optimizer
            self.loss = Loss()() if loss is None else loss
            self.lr_scheduler = LrScheduler()(
                self.optimizer) if lr_scheduler is None else lr_scheduler
        # Some trainers use a different batch size for train and valid
        self.train_metrics = self._init_metrics(
            metrics) if vega.is_torch_backend() else None
        self.valid_metrics = self._init_metrics(metrics)

        self._init_horovod_setting()
        if self.use_amp and vega.is_torch_backend():
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
        if self.callbacks is None:
            self.callbacks = callbacks
        # self.output_model_desc()
        cur_working_dir = FileOps.join_path(self.local_output_path,
                                            self.step_name)
        FileOps.make_dir(cur_working_dir)
        # Make sure Trainer has been built for training
        self.has_built = True