Example #1
    def dump(self):
        """Dump report to file."""
        try:
            _file = FileOps.join_path(TaskOps().step_path, "reports.csv")
            FileOps.make_base_dir(_file)
            data = self.all_records
            data_dict = {}
            for step in data:
                step_data = step.serialize().items()
                for k, v in step_data:
                    if k in data_dict:
                        data_dict[k].append(v)
                    else:
                        data_dict[k] = [v]

            data = pd.DataFrame(data_dict)
            data.to_csv(_file, index=False)
            _file = os.path.join(TaskOps().step_path, ".reports")
            _dump_data = [
                ReportServer._hist_records, ReportServer.__instances__
            ]
            with open(_file, "wb") as f:
                pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL)

            self.backup_output_path()
        except Exception:
            logging.warning(traceback.format_exc())
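The method above accumulates serialized records column-wise before handing them to pandas. A minimal standalone sketch of the same idea, using a hypothetical Record class in place of the report records:

import pandas as pd

# hypothetical stand-in for a serializable report record
class Record:
    def __init__(self, step_name, worker_id):
        self.step_name, self.worker_id = step_name, worker_id

    def serialize(self):
        return {"step_name": self.step_name, "worker_id": self.worker_id}

records = [Record("nas", 1), Record("nas", 2)]
data_dict = {}
for record in records:
    for k, v in record.serialize().items():
        data_dict.setdefault(k, []).append(v)  # one column per field

pd.DataFrame(data_dict).to_csv("reports.csv", index=False)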
Example #2
 def copy_pareto_output(self, step_name=None, worker_ids=None):
     """Copy Pareto-related files from worker to output."""
     worker_ids = worker_ids or []
     taskops = TaskOps()
     local_output_path = os.path.join(taskops.local_output_path, step_name)
     if not (step_name and os.path.exists(local_output_path)):
         return
     for worker_id in worker_ids:
         dest_dir = os.path.join(local_output_path, str(worker_id))
         FileOps.make_dir(dest_dir)
         local_worker_path = taskops.get_worker_subpath(
             step_name, str(worker_id))
         src_dir = FileOps.join_path(taskops.local_base_path,
                                     local_worker_path)
         copy_search_file(src_dir, dest_dir)
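The original signature declared worker_ids=[]; Python evaluates defaults once at function definition, so the same list is shared across calls. The rewrite above uses None as a sentinel. A minimal demonstration of the pitfall:

def buggy(item, items=[]):       # the default list is created once
    items.append(item)
    return items

def fixed(item, items=None):     # sentinel; fresh list on every call
    items = items if items is not None else []
    items.append(item)
    return items

print(buggy(1), buggy(2))  # [1, 2] [1, 2] -- same list object both times
print(fixed(1), fixed(2))  # [1] [2]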
Example #3
 def _append_record_to_csv(self,
                           record_name=None,
                           step_name=None,
                           record=None,
                           mode='a'):
     """Append a record to a csv file."""
     local_output_path = os.path.join(TaskOps().local_output_path,
                                      step_name)
     logging.debug(
         "record to csv, local_output_path={}".format(local_output_path))
     if not record_name:
         return
     file_path = os.path.join(local_output_path,
                              "{}.csv".format(record_name))
     FileOps.make_base_dir(file_path)
     try:
         # stringify nested values so each fits in a single csv cell
         for key in record:
             if isinstance(record[key], (dict, list)):
                 record[key] = str(record[key])
         data = pd.DataFrame([record])
         if not os.path.exists(file_path):
             data.to_csv(file_path, index=False)
         elif os.path.getsize(file_path) and mode == 'a':
             # the file already has a header; append rows without repeating it
             data.to_csv(file_path, index=False, mode=mode, header=False)
         else:
             data.to_csv(file_path, index=False, mode=mode)
     except Exception as ex:
         logging.warning(
             'Cannot transfer record to csv file. Error: {}'.format(ex))
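The branching around mode='a' exists only to write the csv header exactly once. The same logic collapses into a single call; a standalone sketch assuming nothing beyond pandas:

import os
import pandas as pd

def append_row(file_path, record, mode='a'):
    """Append one record, writing the header only for a new or empty file."""
    write_header = not (os.path.exists(file_path)
                        and os.path.getsize(file_path) and mode == 'a')
    pd.DataFrame([record]).to_csv(
        file_path, index=False, mode=mode, header=write_header)

append_row("reports.csv", {"worker_id": 1, "mAP": 0.42})
append_row("reports.csv", {"worker_id": 2, "mAP": 0.45})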
Example #4
    def _init_model(self):
        """Initialize model if fully training a model.

        :return: config of fully train model
        :rtype: Config
        """
        config = Config(self.cfg.config_template)
        config['total_epochs'] = self.cfg.epoch
        if hasattr(self.cfg, 'model_desc_file') and self.cfg.model_desc_file:
            _model_desc_file = self.cfg.model_desc_file.replace(
                "{local_base_path}", TaskOps().local_base_path)
            _total_list = ListDict.load_csv(_model_desc_file)
            # pick the best record by mAP (sort once instead of twice)
            best_record = _total_list.sort('mAP')[0]
            pre_arch = best_record['arch']
            pretrained = pre_arch.split('_')[1]
            pre_worker_id = best_record['pre_worker_id']
            model_desc = dict(arch=pre_arch,
                              pre_arch=pretrained,
                              pre_worker_id=-1)
            logging.info(
                "Initialize fully train model from: {}".format(model_desc))
            if self.cfg.regnition:
                # re-write config from previous result
                config['model']['backbone']['reignition'] = True
                config['model']['pretrained'] = os.path.join(
                    self.output_path, pretrained + '_imagenet.pth')
            else:
                config['model']['pretrained'] = extract_backbone_from_pth(
                    self.output_path, pre_worker_id, pretrained)
            self.sample_results = dict(
                arch=pre_arch, worker_id=self._worker_id,
                pre_arch=pre_arch, pre_worker_id=pre_worker_id)
        elif 'model_desc' in self.cfg:
            model_desc = self.cfg.model_desc
        else:
            raise ValueError('Missing model description!')
        model_desc = update_config(config, model_desc)
        return model_desc
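The ListDict.sort('mAP')[0] lookup picks the best architecture from the csv. With plain dicts the equivalent selection looks as follows, under the assumption that sort orders best-first by the given metric (the sample values are made up):

records = [
    {"arch": "fb_resnet50", "mAP": 0.381, "pre_worker_id": 3},
    {"arch": "fb_resnet101", "mAP": 0.402, "pre_worker_id": 7},
]
best = sorted(records, key=lambda r: r["mAP"], reverse=True)[0]
pre_arch = best["arch"]                # "fb_resnet101"
pretrained = pre_arch.split('_')[1]    # "resnet101"
print(pre_arch, pretrained, best["pre_worker_id"])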
Example #5
 def _save_worker_record(cls, record):
     """Save worker record as desc/performance json files."""
     step_name = record.get('step_name')
     worker_id = record.get('worker_id')
     _path = TaskOps().get_local_worker_path(step_name, worker_id)
     for record_name in ["desc", "performance"]:
         record_value = record.get(record_name)
         if not record_value:
             continue
         _file = None
         try:
             # for cars/darts save multi-desc
             if isinstance(record_value, list) and record_name == "desc":
                 for idx, value in enumerate(record_value):
                     _file = FileOps.join_path(
                         _path, "desc_{}.json".format(idx))
                     with open(_file, "w") as f:
                         json.dump(value, f)
             else:
                 _file = FileOps.join_path(
                     _path, "{}_{}.json".format(record_name, worker_id))
                 with open(_file, "w") as f:
                     json.dump(record_value, f)
         except Exception as ex:
             logging.error(
                 "Failed to save {}, file={}, desc={}, msg={}".format(
                     record_name, _file, record_value, str(ex)))
Example #6
 def restore(cls):
     """Transfer cvs_file to records."""
     step_path = TaskOps().step_path
     _file = os.path.join(step_path, ".reports")
     if os.path.exists(_file):
         with open(_file, "rb") as f:
             data = pickle.load(f)
         cls._hist_records = data[0]
         cls.__instances__ = data[1]
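restore is the read side of the pickle dump in Examples #1 and #11. A self-contained round-trip of the same save/load pattern:

import os
import pickle

# stand-in for [ReportServer._hist_records, ReportServer.__instances__]
state = [[{"step_name": "nas", "worker_id": 1}], {"nas": "instance"}]

with open(".reports", "wb") as f:
    pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL)

if os.path.exists(".reports"):
    with open(".reports", "rb") as f:
        hist_records, instances = pickle.load(f)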
Example #7
 def _output_records(self,
                     step_name,
                     records,
                     desc=True,
                     weights_file=False,
                     performance=False):
     """Dump records."""
     columns = ["worker_id", "performance", "desc"]
     outputs = []
     for record in records:
         record = record.serialize()
         _record = {}
         for key in columns:
             _record[key] = record[key]
         outputs.append(deepcopy(_record))
     data = pd.DataFrame(outputs)
     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     _file = FileOps.join_path(step_path, "output.csv")
     try:
         data.to_csv(_file, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(_file))
     for record in outputs:
         worker_id = record["worker_id"]
         worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
         outputs_globs = []
         if desc:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "desc_*.json"))
         if weights_file:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "model_*"))
         if performance:
             outputs_globs += glob.glob(
                 FileOps.join_path(worker_path, "performance_*.json"))
         for _file in outputs_globs:
             if os.path.isfile(_file):
                 FileOps.copy_file(_file, step_path)
             elif os.path.isdir(_file):
                 FileOps.copy_folder(
                     _file,
                     FileOps.join_path(step_path, os.path.basename(_file)))
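The closing loop gathers per-worker artifacts by glob pattern and copies files or folders. A standalone sketch with shutil, assuming FileOps.copy_file and FileOps.copy_folder behave like shutil.copy and shutil.copytree:

import glob
import os
import shutil

def collect_outputs(worker_path, step_path, patterns=("desc_*.json",)):
    """Copy everything matching the patterns into step_path."""
    for pattern in patterns:
        for path in glob.glob(os.path.join(worker_path, pattern)):
            if os.path.isfile(path):
                shutil.copy(path, step_path)
            elif os.path.isdir(path):
                shutil.copytree(
                    path, os.path.join(step_path, os.path.basename(path)),
                    dirs_exist_ok=True)  # Python 3.8+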
Example #8
 def _get_current_step_records(self):
     """Get records of the current pipeline step."""
     step_name = self.task.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     records = []
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index >= 1 or models_folder:
         # records = Report().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
         if not models_folder:
             models_folder = FileOps.join_path(
                 TaskOps().local_output_path,
                 PipelineConfig.steps[cur_index - 1])
         models_folder = models_folder.replace("{local_base_path}",
                                               TaskOps().local_base_path)
         records = Report().load_records_from_model_folder(models_folder)
     else:
         records = [ReportRecord(step_name, 0)]
     logging.debug("Records: {}".format(records))
     for record in records:
         record.step_name = step_name
     return records
Example #9
 def _get_search_space_list(self):
     """Get search space list from models folder."""
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     if not models_folder:
         self.search_space_list = None
         return
     self.search_space_list = []
     models_folder = models_folder.replace("{local_base_path}", TaskOps().local_base_path)
     pattern = FileOps.join_path(models_folder, "*.json")
     files = glob.glob(pattern)
     for file in files:
         with open(file) as f:
             self.search_space_list.append(json.load(f))
Example #10
    def __init__(self):
        """Initialize Visual callback."""
        super(VisualCallBack, self).__init__()
        self.priority = 290
        self._archive_root = TaskOps().local_visual_path
        self._fix_path = None
        self.summary = None
        self.writer = None

        self.input = None
        self.model = None

        self._need_keys = {"loss_avg", "lr"}
        self._info = {k: 0. for k in self._need_keys}
Example #11
    def dump(self):
        """Dump report to file."""
        try:
            _file = FileOps.join_path(TaskOps().step_path, "reports.json")
            FileOps.make_base_dir(_file)
            data = {}
            for record in self.all_records:
                if record.step_name in data:
                    data[record.step_name].append(record.to_dict())
                else:
                    data[record.step_name] = [record.to_dict()]
            with open(_file, "w") as f:
                json.dump(data, f, indent=4)

            _file = os.path.join(TaskOps().step_path, ".reports")
            _dump_data = [
                ReportServer._hist_records, ReportServer.__instances__
            ]
            with open(_file, "wb") as f:
                pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL)

            self.backup_output_path()
        except Exception:
            logging.warning(traceback.format_exc())
Example #12
    def dump_report(self, step_name=None, record=None):
        """Save one records."""
        try:
            if record and step_name:
                self._append_record_to_csv(self.REPORT_FILE_NAME, step_name,
                                           record.serialize())
            self.backup_output_path()

            step_path = TaskOps().step_path
            _file = os.path.join(step_path, ".reports")
            _dump_data = [Report._hist_records, Report.__instances__]
            with open(_file, "wb") as f:
                pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            logging.warning(traceback.format_exc())
Example #13
 def __init__(self, update_func=None):
     """Init master attrs, setup and start dask distributed cluster and local multiprocess pool."""
     self.cfg = General()
     self.task_count = 0
     self.eval_count = General.worker.eval_count
     self.dask_env = DaskEnv(General.env, self.__master_path__,
                             General.devices_per_trainer,
                             TaskOps().temp_path)
     status = self.dask_env.start()
     if not status or not self.dask_env.is_master:
         sys.exit(0)
     self._start_cluster()
     self.t_queue = Queue()
     self.update_func = update_func
     self.evaluator_list = {}
     self._thread_runing = True
     self._lock = Lock()
     self._thread = self._run_monitor_thread()
     ReportServer().renew()
Example #14
 def _start_cluster(self):
     """Set up and start the dask distributed cluster."""
     self.md = ClusterDaskDistributor(self.dask_env.master_address)
     self.client = self.md.get_client()
     local_host = None
     if "BATCH_CURRENT_HOST" in os.environ:
         local_host = os.environ["BATCH_CURRENT_HOST"]
     elif "BATCH_CUSTOM0_HOSTS" in os.environ:
         local_host = os.environ["BATCH_CUSTOM0_HOSTS"]
     # save CUDA_VISIBLE_DEVICES: registering the worker plugin may change it
     if "CUDA_VISIBLE_DEVICES" in os.environ:
         os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"] = os.environ[
             "CUDA_VISIBLE_DEVICES"]
     self._remove_worker_number_file()
     plugin = WorkerEnv(self.dask_env.slave_proc_num,
                        self.dask_env.slave_device_num_per_proc, local_host,
                        os.getpid(),
                        TaskOps().temp_path)
     self.client.register_worker_plugin(plugin)
     # restore the saved value, or drop the variable if it was set as a side effect
     if "ORIGIN_CUDA_VISIBLE_DEVICES" in os.environ:
         os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[
             "ORIGIN_CUDA_VISIBLE_DEVICES"]
     elif "CUDA_VISIBLE_DEVICES" in os.environ:
         del os.environ["CUDA_VISIBLE_DEVICES"]
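The save/restore handling of CUDA_VISIBLE_DEVICES generalizes to a small context manager. A sketch of the same idea; the helper itself is hypothetical, only the variable name comes from the example:

import os
from contextlib import contextmanager

@contextmanager
def preserve_env(name):
    """Restore an env variable on exit; delete it if it was originally unset."""
    saved = os.environ.get(name)
    try:
        yield
    finally:
        if saved is not None:
            os.environ[name] = saved
        else:
            os.environ.pop(name, None)

with preserve_env("CUDA_VISIBLE_DEVICES"):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # e.g. set by a worker plugin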
Example #15
 def backup_output_path(self):
     """Back up output to local path."""
     backup_path = TaskOps().backup_base_path
     if backup_path is None:
         return
     FileOps.copy_folder(TaskOps().local_output_path, backup_path)
Example #16
 def _remove_worker_number_file(self):
     """Remove worker-number marker files from the temp path."""
     _worker_number_file = os.path.join(TaskOps().temp_path,
                                        ".*worker_number")
     files = glob.glob(_worker_number_file)
     for _file in files:
         os.remove(_file)
Example #17
 def __init__(self):
     """Initialize, binding the current TaskOps instance."""
     self.task = TaskOps()