Example #1
 def _get_current_step_records(self):
     step_name = General.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index >= 1 or models_folder:
         # records = Report().get_step_records(PipelineConfig.steps[cur_index - 1])
         if not models_folder:
             models_folder = FileOps.join_path(
                 TaskOps().local_output_path,
                 PipelineConfig.steps[cur_index - 1])
         models_folder = models_folder.replace("{local_base_path}",
                                               TaskOps().local_base_path)
         records = Report().load_records_from_model_folder(models_folder)
     else:
         records = self._load_single_model_records()
     final_records = []
     for record in records:
         if not record.weights_file:
             logger.error("Model file is not existed, id={}".format(
                 record.worker_id))
         else:
             record.step_name = General.step_name
             final_records.append(record)
     logging.debug("Records: {}".format(final_records))
     return final_records
Example #2
 def _simulate_tiny_pipeline(self, cfg_tiny):
     """Simulate tiny pipeline by using one sample one epoch."""
     report = ReportServer()
     for i, step_name in enumerate(PipelineConfig.steps):
         step_cfg = cfg_tiny.get(step_name)
         if step_cfg.pipe_step.type != 'SearchPipeStep':
             continue
         step_cfg.trainer.distributed = False
         step_cfg.trainer.epochs = 1
         self.restrict_config.trials[step_name] = 1
         General.step_name = step_name
         PipeStepConfig.from_dict(step_cfg)
         pipestep = PipeStep()
         if i == 0:
             pipestep.do()
             record = report.get_step_records(step_name)[-1]
             self.epoch_time = record.runtime
             _worker_path = TaskOps().local_base_path
             if os.path.exists(_worker_path):
                 os.system('rm -rf {}'.format(_worker_path))
         self.params_dict[step_name][
             'max_samples'] = pipestep.generator.search_alg.max_samples
         _file = os.path.join(TaskOps().step_path, ".generator")
         if os.path.exists(_file):
             os.remove(_file)
Example #3
File: run.py Project: ylfzr/vega
def _backup_cfg(cfg_path):
    """Backup yml file.

    :param cfg_path: path of yml file.
    """
    if isinstance(cfg_path, str):
        output_path = FileOps.join_path(TaskOps().local_output_path,
                                        os.path.basename(cfg_path))
        FileOps.copy_file(cfg_path, output_path)
    else:
        output_path = FileOps.join_path(TaskOps().local_output_path,
                                        'config.yml')
        with open(output_path, 'w') as f:
            f.write(yaml.dump(cfg_path))
Example #4
 def _save_worker_record(cls, record):
     step_name = record.get('step_name')
     worker_id = record.get('worker_id')
     _path = TaskOps().get_local_worker_path(step_name, worker_id)
     for record_name in ["desc", "hps", "performance"]:
         _file_name = None
         _file = None
         record_value = remove_np_value(record.get(record_name))
         if not record_value:
             continue
         try:
             # for cars/darts save multi-desc
             if isinstance(record_value, list) and record_name == "desc":
                 for idx, value in enumerate(record_value):
                     _file_name = "desc_{}.json".format(idx)
                     _file = FileOps.join_path(_path, _file_name)
                     with open(_file, "w") as f:
                         json.dump(value, f)
             else:
                 _file_name = "{}_{}.json".format(record_name, worker_id)
                 _file = FileOps.join_path(_path, _file_name)
                 with open(_file, "w") as f:
                     json.dump(record_value, f)
         except Exception as ex:
             logging.error("Failed to save {}, file={}, desc={}, msg={}".format(
                 record_name, _file, record_value, str(ex)))
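For orientation, a minimal invocation sketch follows. The owning class is not shown in the snippet above, so `ReportServer` and the classmethod binding are assumptions; the file names mirror the `"{record_name}_{worker_id}.json"` pattern used above.

    record = {
        "step_name": "nas",
        "worker_id": 12,
        "desc": {"type": "ResNet"},
        "performance": {"accuracy": 0.9},
    }
    ReportServer._save_worker_record(record)
    # expected files under TaskOps().get_local_worker_path("nas", 12):
    #   desc_12.json and performance_12.json ("hps" is skipped since it is absent)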
Example #5
def _backup_config(args):
    _file = args.config_file
    from zeus.common.task_ops import TaskOps
    from zeus.common.file_ops import FileOps
    dest_file = FileOps.join_path(TaskOps().local_output_path,
                                  os.path.basename(_file))
    FileOps.make_base_dir(dest_file)
    FileOps.copy_file(_file, dest_file)
Example #6
 def restore(cls):
     """Restore generator from file."""
     step_path = TaskOps().step_path
     _file = os.path.join(step_path, ".generator")
     if os.path.exists(_file):
         with open(_file, "rb") as f:
             return pickle.load(f)
     else:
         return None
Example #7
 def _get_current_step_records(self):
     step_name = self.task.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index >= 1 or models_folder:
         # records = ReportServer().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
         if not models_folder:
             models_folder = FileOps.join_path(
                 TaskOps().local_output_path,
                 PipelineConfig.steps[cur_index - 1])
         models_folder = models_folder.replace("{local_base_path}",
                                               TaskOps().local_base_path)
         records = ReportServer().load_records_from_model_folder(
             models_folder)
     else:
         records = [ReportRecord(step_name, 0)]
     logging.debug("Records: {}".format(records))
     for record in records:
         record.step_name = step_name
     return records
Example #8
 def _show_pipeline_info(self, steps_time, step_name):
     logging.info("-" * 48)
     logging.info("  Pipeline end.")
     logging.info("")
     logging.info("  task id: {}".format(General.task.task_id))
     logging.info("  output folder: {}".format(TaskOps().local_output_path))
     logging.info("")
     self._show_step_time(steps_time)
     logging.info("")
     self._show_report(step_name)
     logging.info("-" * 48)
Example #9
def _calc_forward_latency_davinci(model,
                                  input,
                                  sess_config=None,
                                  num=10,
                                  evaluate_config=None):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param num: forward number
    :type num: int
    :param evaluate_config: some config for evaluate in davinci
    :type evaluate_config: dict
    :return: forward latency
    :rtype: float
    """
    from zeus.evaluator.tools.evaluate_davinci_bolt import evaluate
    from zeus.common.task_ops import TaskOps
    # backend = evaluate_config.get("backend")
    hardware = evaluate_config.get("hardware")
    remote_host = evaluate_config.get("remote_host")
    worker_path = TaskOps().local_base_path
    save_data_file = os.path.join(worker_path, "input.bin")

    latency = 0.
    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    job_id = "pre_evaluate_" + now_time
    logging.info("The job id of evaluate service is {}.".format(job_id))
    if zeus.is_torch_backend():
        import torch
        input_shape = input.shape
        if torch.is_tensor(input):
            input = input.cpu().numpy()
        input.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0
            results = evaluate("pytorch", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model,
                               job_id)
            latency += float(results.get("latency"))
    elif zeus.is_tf_backend():
        input_shape = input.shape.as_list()
        test_data = np.random.random(input_shape).astype(np.float32)
        test_data.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0
            results = evaluate("tensorflow", hardware, remote_host, model,
                               None, save_data_file, input_shape, reuse_model,
                               job_id)
            latency += float(results.get("latency"))
    return latency / num
Example #10
def _show_performance():
    output_file = FileOps.join_path(TaskOps().local_output_path,
                                    General.step_name, "output.csv")
    try:
        data = pd.read_csv(output_file)
    except Exception:
        logging.info("  Result file output.csv is not existed or empty.")
        return
    if data.shape[1] < 2 or data.shape[0] == 0:
        logging.info("  Result file output.csv is empty.")
        return
    logging.info("-" * 48)
    data = json.loads(data.to_json())
    logging.info("  result: {}".format(data["performance"]["0"]))
    logging.info("-" * 48)
Example #11
 def _show_report(self, step_name):
     performance_file = FileOps.join_path(
         TaskOps().local_output_path, step_name, "output.csv")
     try:
         data = pd.read_csv(performance_file)
     except Exception:
         logging.info("  result file output.csv is not existed or empty")
         return
     if data.shape[1] < 2 or data.shape[0] == 0:
         logging.info("  result file output.csv is empty")
         return
     logging.info("  result:")
     data = json.loads(data.to_json())
     for key in data["worker_id"].keys():
         logging.info("  {:>3s}:  {}".format(str(data["worker_id"][key]), data["performance"][key]))
Example #12
def save_master_ip(ip_address, port, args):
    """Write the ip and port in a system path.

    :param str ip_address: the IP address to write.
    :param str port: the port to write.
    :param argparse.ArgumentParser args: parsed arguments that should
         contain `init_method`, `rank` and `world_size`.

    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(file_path))
    with open(file_path, 'w') as f:
        f.write(ip_address + "\n")
        f.write(port + "\n")
Example #13
def load_master_ip():
    """Get the ip and port that write in a system path.

    here will not download anything from S3.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            ip = f.readline().strip()
            port = f.readline().strip()
            logging.info("get write ip, ip={}, port={}".format(ip, port))
            return ip, port
    else:
        return None, None
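Together with Example #12, this forms a simple file-based handshake: one process writes the master address under TaskOps().temp_path, another reads it back. A minimal round-trip sketch (the `args` parameter is unused inside save_master_ip, so None is passed purely for illustration):

    save_master_ip("192.168.0.1", "8080", None)
    ip, port = load_master_ip()
    assert (ip, port) == ("192.168.0.1", "8080")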
Example #14
File: run.py Project: ylfzr/vega
def _init_env(cfg_path):
    """Init config and evn parameters.

    :param cfg_path: config file path
    """
    logging.getLogger().setLevel(logging.DEBUG)
    UserConfig().load(cfg_path)
    # load general
    General.from_json(UserConfig().data.get("general"), skip_check=False)
    init_log(level=General.logger.level, log_path=TaskOps().local_log_path)
    cluster_args = env_args()
    if not cluster_args:
        cluster_args = init_local_cluster_args()
    setattr(PipelineConfig, "steps", UserConfig().data.pipeline)
    General.env = cluster_args
    set_backend(General.backend, General.device_category)
Example #15
 def _clean_checkpoint(self):
     worker_parent_folder = os.path.abspath(
         os.path.join(TaskOps().get_local_worker_path(General.step_name, 1),
                      ".."))
     patterns = [
         ".*.pkl",
         "*.pth",
         "model_*",
         "model.ckpt-*",
         "*.pb",
         "graph.*",
         "eval",
         "events*",
         "CKP-*",
         "checkpoint",
         ".*.log",
         "*.ckpt",
         "*.air",
         "*.onnx",
         "*.caffemodel",
         "*.pbtxt",
         "*.bin",
         "kernel_meta",
         "*.prototxt",
     ]
     all_files = []
     worker_folders = glob.glob(worker_parent_folder + "/*")
     for worker_folder in worker_folders:
         for pattern in patterns:
             file_pattern = worker_folder + "/" + pattern
             all_files += glob.glob(file_pattern)
     if all_files:
         logging.info(
             "Clean worker folder {}.".format(worker_parent_folder))
         for item in all_files:
             try:
                 if os.path.isfile(item):
                     os.remove(item)
                 elif os.path.isdir(item):
                     shutil.rmtree(item)
             except Exception:
                 logging.warning("Failed to remove {}".format(item))
Example #16
 def _load_single_model_records(self):
     model_desc = PipeStepConfig.model.model_desc
     model_desc_file = PipeStepConfig.model.model_desc_file
     if model_desc_file:
         model_desc_file = model_desc_file.replace(
             "{local_base_path}",
             TaskOps().local_base_path)
         model_desc = Config(model_desc_file)
     if not model_desc:
         logger.error("Model desc or Model desc file is None.")
         return []
     model_file = PipeStepConfig.model.pretrained_model_file
     if not model_file:
         logger.error("Model file is None.")
         return []
     if not os.path.exists(model_file):
         logger.error("Model file is not existed.")
         return []
     return [
         ReportRecord().load_dict(
             dict(worker_id="1", desc=model_desc, weights_file=model_file))
     ]
Example #17
 def _get_abs_path(cls, _path):
     if "{local_base_path}" in _path:
         from zeus.common.task_ops import TaskOps
         return os.path.abspath(_path.replace("{local_base_path}", TaskOps().local_base_path))
     return _path
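A short sketch of the substitution, assuming TaskOps().local_base_path resolves to /tmp/task (the actual value depends on the task configuration):

    # _get_abs_path("{local_base_path}/output")  ->  "/tmp/task/output"
    # _get_abs_path("/data/models")              ->  "/data/models"  (returned unchanged)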
Example #18
def _init_env():
    if sys.version_info < (3, 6):
        sys.exit('Sorry, Python < 3.6 is not supported.')
    init_log(level=General.logger.level, log_path=TaskOps().local_log_path)
    General.env = init_cluster_args()
    _print_task_id()
Example #19
 def dump(self):
     """Dump generator to file."""
     step_path = TaskOps().step_path
     _file = os.path.join(step_path, ".generator")
     with open(_file, "wb") as f:
         pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
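Examples #6 and #19 are the two halves of a pickle-based checkpoint: dump writes the generator to <step_path>/.generator, and restore loads it again on resume. A minimal round-trip sketch (Generator stands in for the class that defines both methods, which is not named in these snippets):

    generator = Generator()         # hypothetical owning class
    generator.dump()                # writes TaskOps().step_path + "/.generator"
    restored = Generator.restore()  # returns the pickled instance, or None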