Beispiel #1
0
 def _train_single_model(self,
                         model_desc=None,
                         model_id=None,
                         weights_file=None):
     """Create a trainer, publish its report record and run full training.

     :param model_desc: model description. When ``None`` the trainer is
         built as ``cls_trainer(None, 0)`` and the record is filled from the
         trainer's own ``step_name``, ``worker_id`` and ``model_desc``.
     :param model_id: worker id used for both the record and the trainer.
     :param weights_file: handed to the trainer as ``pretrained_model_file``.
     """
     trainer_cls = ClassFactory.get_cls(ClassType.TRAINER,
                                        PipeStepConfig.trainer.type)
     step_name = self.task.step_name
     if model_desc is None:
         # No description given: the record mirrors what the trainer reports.
         trainer = trainer_cls(None, 0)
         record = ReportRecord(trainer.step_name,
                               trainer.worker_id,
                               desc=trainer.model_desc)
     else:
         fields = {
             "worker_id": model_id,
             "desc": model_desc,
             "step_name": step_name,
         }
         record = ReportRecord().load_dict(fields)
         logging.debug("update record=%s", str(record))
         trainer = trainer_cls(model_desc=model_desc,
                               id=model_id,
                               pretrained_model_file=weights_file)
     ReportClient().update(**record.to_dict())
     # Resume training: only flagged on the torch backend when resume is set.
     if vega.is_torch_backend() and General._resume:
         trainer.load_checkpoint = True
         trainer._resume_training = True
     # Pick the training entry point that matches the configured mode.
     if self._distributed_training:
         run_training = self._do_distributed_fully_train
     else:
         run_training = self._do_single_fully_train
     run_training(trainer)
 def _evaluate_single_model(self, record):
     """Submit the enabled evaluators for one record to the master.

     The record is first pushed to the report client, then a host evaluator
     and/or a device evaluator is created and run through ``self.master``,
     depending on the ``EvaluatorConfig`` switches. Any exception is logged
     (message plus traceback) and swallowed, so one failing record does not
     interrupt the caller.

     :param record: object exposing ``step_name``, ``worker_id``, ``desc``
         and ``weights_file``.
     :return: None
     """
     # Bound before the ``try`` so the error handler can always reference
     # it, even when reading the record's attributes is what failed.
     worker_info = None
     try:
         worker_info = {
             "step_name": record.step_name,
             "worker_id": record.worker_id
         }
         _record = dict(worker_id=record.worker_id,
                        desc=record.desc,
                        step_name=record.step_name)
         ReportClient().update(**_record)
         if EvaluatorConfig.host_evaluator_enable:
             cls_evaluator = ClassFactory.get_cls(ClassType.HOST_EVALUATOR,
                                                  "HostEvaluator")
             evaluator = cls_evaluator(worker_info=worker_info,
                                       model_desc=record.desc,
                                       weights_file=record.weights_file)
             self.master.run(evaluator)
         if EvaluatorConfig.device_evaluator_enable:
             cls_evaluator = ClassFactory.get_cls(
                 ClassType.DEVICE_EVALUATOR, "DeviceEvaluator")
             evaluator = cls_evaluator(worker_info=worker_info,
                                       model_desc=record.desc,
                                       weights_file=record.weights_file)
             self.master.run(evaluator)
     except Exception:
         # Best-effort evaluation: report the failure and carry on.
         logger.error(
             "Failed to evaluate model, worker info={}".format(worker_info))
         logger.error(traceback.format_exc())
         return
Beispiel #3
0
 def _update(self, step_name, worker_id):
     """Mark a worker's record as finished and invoke the update callback.

     The callback stored in ``self.update_func`` is called as
     ``update_func(step_name, worker_id)`` when the second entry of its
     code object's ``co_varnames`` is ``"step_name"``; otherwise it is
     called with a single dict holding both values.

     :param step_name: step name
     :param worker_id: worker id
     :return: None
     """
     # Waiting report thread update all record
     ReportClient().set_finished(step_name, worker_id)
     if not self.update_func:
         return
     varnames = self.update_func.__code__.co_varnames
     # Compare by position instead of ``.index()``: ``.index()`` raises
     # ValueError when the callback has no ``step_name`` variable at all,
     # which is exactly the case the dict-style call below is for.
     if len(varnames) > 1 and varnames[1] == "step_name":
         self.update_func(step_name, worker_id)
     else:
         self.update_func({"step_name": step_name, "worker_id": worker_id})
Beispiel #4
0
    def update(self, step_name, worker_id):
        """Feed a worker's report record back into the search algorithm.

        The record is fetched from the report client, its serialized form is
        passed to ``self.search_alg.update``, and the generator is dumped
        afterwards. A ``TypeError`` raised by the dump is only logged.

        :param step_name: step name
        :param worker_id: current worker id
        :return: None
        """
        client = ReportClient()
        record = client.get_record(step_name, worker_id)
        logging.debug("Get Record=%s", str(record))
        search_alg = self.search_alg
        search_alg.update(record.serialize())
        try:
            self.dump()
        except TypeError:
            # Dumping is best-effort; an unpicklable member must not abort.
            logging.warning(
                "The Generator contains object which can't be pickled.")
        success_msg = f"Update Success. step_name={step_name}, worker_id={worker_id}"
        logging.info(success_msg)
        # The best values are looked up for General.step_name, not the argument.
        best_values = ReportServer().print_best(step_name=General.step_name)
        logging.info("Best values: %s", best_values)
 def _train_single_model(self, model_desc, model_id, hps, multi_task):
     """Create a trainer for one model, publish its record and train it.

     :param model_desc: model description stored in the record and passed
         to the trainer.
     :param model_id: worker id used for the record and the trainer.
     :param hps: forwarded to the trainer as ``hps``.
     :param multi_task: forwarded to the trainer as ``multi_task``.
     """
     trainer_cls = ClassFactory.get_cls(ClassType.TRAINER, PipeStepConfig.trainer.type)
     step_name = self.task.step_name
     fields = {
         "worker_id": model_id,
         "desc": model_desc,
         "step_name": step_name,
     }
     record = ReportRecord().load_dict(fields)
     logging.debug("update record=%s", str(record))
     trainer = trainer_cls(model_desc=model_desc,
                           id=model_id,
                           hps=hps,
                           multi_task=multi_task)
     ReportClient().update(**record.to_dict())
     # Pick the training entry point that matches the configured mode.
     if self._distributed_training:
         run_training = self._do_distributed_fully_train
     else:
         run_training = self._do_single_fully_train
     run_training(trainer)
Beispiel #6
0
    def sample(self):
        """Draw candidates from the search algorithm and register them.

        ``self.search_alg.search()`` is called up to 10 times. Each result
        is treated as a list of candidates (a non-list result counts as one
        candidate). Candidates rejected by ``self.quota`` or ``self.affinity``
        are skipped; every remaining one is sent to the report client.

        :return: ``None`` as soon as a search call returns a falsy result;
            otherwise the list of ``(worker_id, desc, hps)`` tuples from the
            first attempt that kept at least one candidate, or an empty
            list when all 10 attempts kept none.
        """
        for _ in range(10):
            found = self.search_alg.search()
            if not found:
                return None
            # A truthy list is used as is; anything else is a single candidate.
            candidates = found if isinstance(found, list) else [found]
            out = []
            for candidate in candidates:
                if isinstance(candidate, dict):
                    worker_id = candidate["worker_id"]
                    desc = self._decode_hps(candidate["encoded_desc"])
                    # The dict is stripped in place; what is left becomes the
                    # extra keyword arguments for the report update.
                    for consumed_key in ("worker_id", "encoded_desc"):
                        candidate.pop(consumed_key)
                    kwargs = candidate
                    parts = _split_sample((worker_id, desc))
                else:
                    kwargs = {}
                    parts = _split_sample(candidate)
                if hasattr(self, "objective_keys") and self.objective_keys:
                    kwargs["objective_keys"] = self.objective_keys
                worker_id, desc, hps = parts

                if "modules" in desc:
                    PipeStepConfig.model.model_desc = deepcopy(desc)
                elif "network" in desc:
                    # Merge the sampled network part with the configured
                    # description and flatten the result into ``desc``.
                    base_desc = PipeStepConfig.model.model_desc
                    merged_desc = update_dict(desc["network"], base_desc)
                    PipeStepConfig.model.model_desc = merged_desc
                    desc.pop('network')
                    desc.update(merged_desc)

                if self.quota.is_filtered(desc):
                    continue
                if self.affinity and not self.affinity.is_affinity(desc):
                    continue
                ReportClient().update(General.step_name,
                                      worker_id,
                                      desc=desc,
                                      hps=hps,
                                      **kwargs)
                out.append((worker_id, desc, hps))
            if out:
                return out
        return out
Beispiel #7
0
 def train_process(self):
     """Evaluate the loaded model on the device and report the result.

     Sets up a worker-specific log file, loads the model, builds the
     ``'test'`` dataloader into ``self.valid_loader``, runs ``self.valid()``
     and sends the returned performance to the report client.
     """
     log_level = General.logger.level
     log_name = f"{self.step_name}_device_evaluator_{self.worker_id}.log"
     init_log(level=log_level,
              log_file=log_name,
              log_path=self.local_log_path)
     logging.info("start Davinci or mobile evaluate process")
     self.load_model()
     self.valid_loader = self._init_dataloader(mode='test')
     performance = self.valid()
     client = ReportClient()
     client.update(self.step_name, self.worker_id, performance=performance)
     done_msg = f"finished device evaluation, id: {self.worker_id}, performance: {performance}"
     logging.info(done_msg)
Beispiel #8
0
 def after_train(self, logs=None):
     """Finalize training: collect the estimator result and update the record.

     Backs up the trainer, notifies the wrapped trainer, waits for the
     estimator thread when one exists, then stores the best score as the
     trainer performance and reports the description carrying the best
     architecture.

     :param logs: passed through to the ``'after_train'`` notification.
     """
     self.trainer._backup()
     payload = {
         'logs': logs,
     }
     self.wrp_trainer.notify('after_train', payload)
     if self.estim_th:
         self.estim_th.join()
     final = self.estim_ret.get('final')
     best_score = final.get('best_score')
     self.trainer.performance = {'default': best_score}
     desc = self.trainer.model_desc.copy()
     # NOTE(review): if model_desc is a plain dict, ``.copy()`` is shallow,
     # so the nested 'custom' mapping is shared with the trainer's
     # model_desc and this write is visible there too; confirm intent.
     desc['custom']['arch_desc'] = final.get('best_arch_desc')
     # Force update of the trainer record with the new description.
     client = ReportClient()
     client.update(self.trainer.step_name,
                   self.trainer.worker_id,
                   desc=desc)
Beispiel #9
0
    def epoch_end(self, run_context):
        """Evaluate the model after an epoch and report the metrics.

        The evaluation result is merged into ``self.trainer.performance``.
        In distributed mode only the process whose ``DEVICE_ID`` environment
        variable is ``"0"`` sends the update to the report client.

        :param run_context: callback context; its ``original_args()``
            provides ``cur_epoch_num`` and ``epoch_num``.
        """
        params = run_context.original_args()
        metric = self.model.eval(self.eval_dataset,
                                 dataset_sink_mode=self.dataset_sink_mode)
        epoch_msg = "Current epoch : [{}/{}], current valid metric {}.".format(
            params.cur_epoch_num, params.epoch_num, metric)
        logging.info(epoch_msg)

        self.trainer.performance.update(metric)
        # Guard: in distributed runs every device except "0" stays silent.
        if self.trainer.distributed and os.environ["DEVICE_ID"] != "0":
            return
        ReportClient().update(self.trainer.step_name,
                              self.trainer.worker_id,
                              num_epochs=params.epoch_num,
                              current_epoch=params.cur_epoch_num,
                              performance=self.trainer.performance)
Beispiel #10
0
 def do(self):
     """Evaluate every record of the current step through a master.

     Returns early, after logging an error, when there is nothing to
     evaluate. Otherwise each record is published to the report client and
     handed to ``_evaluate_single_model``; once the master has joined, the
     step records are output, the master is closed and the output path is
     backed up.
     """
     logger.info("BenchmarkPipeStep started...")
     records = self._get_current_step_records()
     if not records:
         logger.error("There is no model to evaluate.")
         return
     self.update_status(Status.running)
     self.master = create_master()
     for current in records:
         # Publish the record first, then schedule its evaluation.
         ReportClient().update(current.step_name,
                               current.worker_id,
                               desc=current.desc)
         self._evaluate_single_model(current)
     # Wait for all submitted evaluators before producing the outputs.
     self.master.join()
     ReportServer().output_step_all_records(step_name=General.step_name)
     self.master.close()
     ReportServer().backup_output_path()
     self.update_status(Status.finished)
Beispiel #11
0
 def _save_best(self, desc):
     """Report ``desc`` with a fixed accuracy of 100.

     The update is sent for ``self.step_name`` with ``self.sample_count + 1``
     in the position other calls use for the worker id.

     :param desc: description stored with the update.
     """
     fixed_performance = {"accuracy": 100}
     client = ReportClient()
     client.update(self.step_name,
                   self.sample_count + 1,
                   performance=fixed_performance,
                   desc=desc)
Beispiel #12
0
 def _update_report(self, epoch, performance):
     """Send the trainer's performance to the report client and log the result.

     :param epoch: not used by this method.
     :param performance: value reported as ``performance``.
     """
     client = ReportClient()
     updated = client.update(self.trainer.step_name,
                             self.trainer.worker_id,
                             performance=performance)
     logging.debug("report_callback record: {}".format(updated))
Beispiel #13
0
 def _init_report(self):
     """Publish the initial record for this trainer and log what was stored.

     The update carries the trainer's worker id and step name, the
     configured model description and ``self.best_model_file`` as the
     weights file.
     """
     client = ReportClient()
     initial = client.update(worker_id=self.trainer.worker_id,
                             desc=self.cfg.model_desc,
                             step_name=self.trainer.step_name,
                             weights_file=self.best_model_file)
     logging.debug("update record=%s", str(initial))