Example #1
 def clean_tmp_files(self):
     """删除过程中生成的临时文件(本地的)"""
     cmd = f"rm -rf {self.tmp_dir_path}"
     cmd_util.run_cmd(cmd)
     tmp_dir_parent = "/".join(self.tmp_dir_path.split("/")[:-1])
     # Make sure the deletion is safe
     if ".luwu/tmp/" in tmp_dir_parent:
         cmd = f"rm -rf {tmp_dir_parent}"
         cmd_util.run_cmd(cmd)
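These examples call a project-internal `cmd_util.run_cmd` helper (and, later, `cmd_util.get_python_execute_path`) whose implementation is not shown. A minimal sketch of what such helpers could look like, under the assumption that commands are simply run in a shell and a non-zero exit code is treated as an error:

import subprocess
import sys


def run_cmd(cmd: str) -> None:
    """Hypothetical stand-in for cmd_util.run_cmd: run a shell command, raise on failure."""
    subprocess.run(cmd, shell=True, check=True)


def get_python_execute_path() -> str:
    """Hypothetical stand-in for cmd_util.get_python_execute_path."""
    return sys.executable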
Example #2
def clean_tmp_dir(days=3):
    """清理陆吾的临时文件夹,默认清理三天前的

    Args:
        days (int, optional): clean up temporary files older than this many days. Defaults to 3.
    """
    timestamp = int(time.time())
    days_timestamp = 86400 * days
    cnt = 0
    for dir_path in glob(os.path.join(LUWU_TMP_DIR_ROOT, "*")):
        dir_timestamp = int(dir_path.split("/")[-1])
        if timestamp - dir_timestamp > days_timestamp:
            cmd = f"rm -rf {abspath(dir_path)}"
            cmd_util.run_cmd(cmd)
            cnt += 1
    logger.info(f"已清理掉 {cnt} 个临时文件夹.")
Example #3
 def download_result_from_kaggle(self):
     output_path = os.path.join(self.tmp_dir_path, "kaggle-output")
     logger.info(f"创建文件夹 {output_path} ...")
     file_util.mkdirs(output_path)
     logger.info("从kaggle拉取运行结果...")
     cmd = f"kaggle kernels output {self.kernel_id} -p {output_path}"
     cmd_util.run_cmd(cmd)
     model_save_path = self.kwargs.get("model_save_path", "")
     if not model_save_path:
         model_save_path = "luwu-output"
     project_path = file_util.abspath(model_save_path)
     file_util.mkdirs(project_path)
     output_files_path = os.path.join(output_path, "*")
     logger.info(f"将运行结果移动到指定目录 {project_path} ...")
     cmd = f"cp -r {output_files_path} {project_path}"
     cmd_util.run_cmd(cmd)
     logger.info("Done.")
Example #4
    def export_model(self):
        """将训练好的模型导出为pb格式"""
        logger.info("正在导出模型...")
        python_execute_path = cmd_util.get_python_execute_path()
        cmd = f"{python_execute_path} -m object_detection.exporter_main_v2 --input_type image_tensor --pipeline_config_path {self.train_pipeline_config_path} --trained_checkpoint_dir {self.train_checkpoint_path} --output_directory {self.export_model_path}"
        cmd_util.run_cmd(cmd)

        # Copy label_map.pbtxt into the export directory as well
        logger.info("Exporting label_map.pbtxt ...")
        target_path = os.path.join(self.export_model_path, "label_map.pbtxt")
        cmd = f"cp {self.label_map_file_path} {target_path}"
        cmd_util.run_cmd(cmd)

        logger.info("导出测试代码...")
        if os.path.exists(self.origin_dataset_path):
            filenames = glob(os.path.join(self.origin_dataset_path, "*.jpg"))
            if len(filenames):
                eval_image_path = filenames[0]
            else:
                eval_image_path = ""
        else:
            eval_image_path = ""
        params = {
            "path_to_saved_model": os.path.join(self.export_model_path, "saved_model"),
            "path_to_label_map": target_path,
            "path_to_eval_image": eval_image_path,
        }
        template_path = os.path.join(
            os.path.dirname(__file__), "templates/project/eval.jinja")
        content = self.render_template(template_path, params)
        target_path = os.path.join(self.project_save_path, "eval.py")
        with open(target_path, "w") as f:
            f.write(content)
        logger.info("导出完成!")
Example #5
 def upload_dataset(self):
     origin_dataset_path = self.kwargs.get("origin_dataset_path", "")
     if os.path.exists(origin_dataset_path):
         # First make a copy of the dataset
         # Create a directory for it
         dataset_path = os.path.join(self.tmp_dir_path, "kaggle-data")
         copy_path = os.path.join(dataset_path, "data")
         logger.info(f"创建文件夹 {copy_path} ...")
         file_util.mkdirs(copy_path)
         # Copy the dataset into the temporary directory
         logger.info("Copying dataset to the temporary directory ...")
         if os.path.isdir(origin_dataset_path):
             cmd = f'cp -r {os.path.join(origin_dataset_path,"*")} {copy_path}'
         else:
             cmd = f"cp -r {origin_dataset_path} {copy_path}"
         cmd_util.run_cmd(cmd)
         # Initialize the dataset with the Kaggle API
         logger.info("Initializing dataset with the Kaggle API ...")
         cmd = f"kaggle datasets init -p {dataset_path}"
         cmd_util.run_cmd(cmd)
         # Configure the dataset metadata
         dataset_meta_path = os.path.join(dataset_path,
                                          "dataset-metadata.json")
         with open(dataset_meta_path, "r") as f:
             dataset_meta = json.load(f)
         dataset_meta["title"] = f"luwu-dataset-{self.uuid}"
         dataset_meta["id"] = (dataset_meta["id"].split("/")[0] + "/" +
                               f"luwu-dataset-{self.uuid}")
         with open(dataset_meta_path, "w") as f:
             json.dump(dataset_meta, f, ensure_ascii=False, indent=2)
         # Upload the dataset
         logger.info("Uploading dataset ...")
         cmd = f"kaggle datasets create -r zip -p {dataset_path}"
         cmd_util.run_cmd(cmd)
         logger.info("数据集上传完成!")
         logger.info("等待 kaggle 处理数据集,这可能需要几分钟时间 ...")
         self.dataset_id = dataset_meta["id"]
         self.dataset_title = dataset_meta["title"]
         cmd = f"kaggle datasets status {self.dataset_id}"
         while True:
             code, output = subprocess.getstatusoutput(cmd)
             if code != 0:
                 logger.error(output)
                 raise Exception("查询数据集状态失败!")
             if output:
                 if "ready" in output:
                     logger.info("数据集准备完成!")
                 else:
                     logger.warning(output)
                 break
             else:
                 logger.info("暂未查询到数据,等待中 ...")
                 time.sleep(10)
     else:
         raise FileNotFoundError(
             f"The specified origin_dataset_path does not exist! {origin_dataset_path}")
Example #6
 def train(self):
     logger.info("开始训练...")
     python_execute_path = cmd_util.get_python_execute_path()
     cmd = f"{python_execute_path} -m object_detection.model_main_tf2 --model_dir={self.train_checkpoint_path} --pipeline_config_path={self.train_pipeline_config_path}"
     cmd_util.run_cmd(cmd)
     logger.info("训练完成!")
Example #7
    def preprocess_dataset(self):
        """对数据集进行预处理,并定义pipeline.config"""
        # 生成 label_map.pbtxt
        if self.need_generate_label_map:
            logger.info("遍历数据集,生成 label_map.pbtxt ...")
            if not os.path.exists(self.origin_dataset_path):
                raise FileNotFoundError("origin_dataset_path 未指定!")
            label_map_util.create_label_map(self.origin_dataset_path,
                                            self.label_map_file_path)
            logger.info(f"label_map 文件已保存到 {self.label_map_file_path}.")
        else:
            logger.info(f"label_map 文件已存在,路径为{self.label_map_file_path},跳过!")

        # Generate tfrecord
        if self.need_generate_tfrecord:
            logger.info("Scanning the dataset to generate tfrecord ...")
            python_execute_path = cmd_util.get_python_execute_path()
            script_path = os.path.join(os.path.dirname(__file__), "utils",
                                       "generate_tfrecord.py")
            csv_path = os.path.join(self.tfrecord_dataset_dir, "tmp.csv")
            cmd = f"{python_execute_path} {script_path} -x {self.origin_dataset_path} -l {self.label_map_file_path} -o {self.tfrecord_dataset_file_path} -c {csv_path}"
            cmd_util.run_cmd(cmd)
            if os.path.exists(csv_path):
                os.remove(csv_path)
        else:
            logger.info(
                f"tfrecord file already exists at {self.tfrecord_dataset_file_path}, skipping!")

        if self.do_fine_tune:
            # Download pre-trained weights
            url = self.fine_tune_models_config_map[
                self.fine_tune_model_name]["url"]
            download_dir_name = url.split("/")[-1].split(".")[0]
            cache_dir_path = os.path.expanduser(
                "~/.luwu/tensorflow-models/object-detection/")
            if self.fine_tune_checkpoint_path:
                # The given path points to the ckpt-0.index file inside the checkpoint directory
                if self.fine_tune_checkpoint_path.endswith(".index"):
                    file_path = self.fine_tune_checkpoint_path
                    # Drop the ".index" suffix (rstrip(".index") would strip arbitrary trailing characters)
                    self.fine_tune_checkpoint_path = (
                        self.fine_tune_checkpoint_path[:-len(".index")])
                # The path points to the checkpoint directory
                elif (
                    (self.fine_tune_checkpoint_path.endswith("checkpoint/")
                     or self.fine_tune_checkpoint_path.endswith("checkpoint"))
                        and os.path.exists(self.fine_tune_checkpoint_path)
                        and os.path.isdir(self.fine_tune_checkpoint_path)):
                    file_path = os.path.join(self.fine_tune_checkpoint_path,
                                             "ckpt-0.index")
                    self.fine_tune_checkpoint_path = os.path.join(
                        self.fine_tune_checkpoint_path, "ckpt-0")
                # Fall back to the default checkpoint path
                else:
                    self.fine_tune_checkpoint_path = os.path.join(
                        cache_dir_path,
                        download_dir_name,
                        "checkpoint/ckpt-0",
                    )
                    file_path = self.fine_tune_checkpoint_path + ".index"
            else:
                self.fine_tune_checkpoint_path = os.path.join(
                    cache_dir_path,
                    download_dir_name,
                    "checkpoint/ckpt-0",
                )
                file_path = self.fine_tune_checkpoint_path + ".index"
            # Check whether the checkpoint file already exists
            if os.path.exists(file_path):
                logger.info(f"预训练权重已存在 {self.fine_tune_checkpoint_path},跳过!")
            else:
                tf.keras.utils.get_file(download_dir_name,
                                        url,
                                        untar=True,
                                        cache_subdir=cache_dir_path)
                logger.info("预训练权重下载完成!")
        else:
            self.fine_tune_checkpoint_path = ""
            logger.info("不使用预训练权重!")

        # Create the project directory structure
        # Root directory; if it already exists, ask the user whether to clear it
        if os.path.exists(self.project_save_path):
            while True:
                text = input(
                    f"目录 {self.project_save_path} 已存在,请更换目录或者确认清空该文件夹!确认清空?[Y/N]"
                )
                text = text.lower().strip()
                if text == "y":
                    cmd = f"rm -rf {self.project_save_path} -y"
                    cmd_util.run_cmd(cmd)
                    os.makedirs(self.project_save_path)
                    break
                elif text == "n":
                    logger.info("请重新选择模型保存的目录,程序已退出。")
                    exit(-1)
                else:
                    continue
        else:
            os.makedirs(self.project_save_path)
        # Path where training checkpoints are saved
        self.train_checkpoint_path = os.path.join(self.project_save_path,
                                                  "train_models")
        os.makedirs(self.train_checkpoint_path, exist_ok=True)
        # Path where the exported model (SavedModel) is saved
        self.export_model_path = os.path.join(self.project_save_path,
                                              "exported-models")
        os.makedirs(self.export_model_path, exist_ok=True)

        # Create pipeline.config
        self.generate_pipeline_config()
        logger.info("pipeline.config 生成完毕!")
Example #8
    def train_on_kaggle(self, task_type):
        # Generate the training code
        # Create a directory for the kernel
        kernel_path = os.path.join(self.tmp_dir_path, "kaggle-kernel")
        logger.info(f"创建文件夹 {kernel_path} ...")
        file_util.mkdirs(kernel_path)
        # Initialize the kernel
        logger.info("Initializing kernel with the Kaggle API ...")
        cmd = f"kaggle kernels init -p {kernel_path}"
        cmd_util.run_cmd(cmd)
        # Generate the training script
        override_params = {"project_id", "cmd", "luwu_version"}
        train_cmd_params = []
        if task_type == "classification":
            project_name = "luwu-classification-project"
            override_params.update(["net_name", "network_name"])
            # Path to the tfrecord dataset
            tfrecord_dataset_path = "./dataset"
            train_cmd_params.append(
                f"--tfrecord_dataset_path {tfrecord_dataset_path}")
            override_params.add("tfrecord_dataset_path")
        elif task_type == "detection":
            project_name = "luwu-object-detection-project"
            override_params.update([
                "label_map_path",
                "fine_tune_checkpoint_path",
            ])
            # Path to the tfrecord dataset
            tfrecord_dataset_path = "./dataset"
            train_cmd_params.append(
                f"--tfrecord_dataset_path {tfrecord_dataset_path}")
            override_params.add("tfrecord_dataset_path")
        elif task_type == "text_classification":
            project_name = "luwu-text-classification-project"
        else:
            raise Exception(f"不支持的任务类型! {task_type}")

        # Path to the original dataset
        origin_dataset_path = os.path.join("../input", self.dataset_title)
        if self.kwargs.get("cmd") == "text_classification":
            filename = self.kwargs.get("origin_dataset_path").split("/")[-1]
            origin_dataset_path = os.path.join(origin_dataset_path, filename)
        train_cmd_params.append(f"--origin_dataset_path {origin_dataset_path}")
        override_params.add("origin_dataset_path")
        # Path where the model is saved
        model_save_path = "./project"
        train_cmd_params.append(f"--model_save_path {model_save_path}")
        override_params.add("model_save_path")
        # Other parameters
        for arg_name, arg_value in self.kwargs.items():
            if "kaggle" in arg_name:
                continue
            if arg_name in override_params:
                continue
            # Handle bool-type parameters
            if arg_value != False:
                train_cmd_params.append(f'--{arg_name} "{arg_value}"')
            # else:
            #     train_cmd_params.append(f"--{arg_name}")
        if task_type == "classification":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)} {self.luwu_model_class.__name__}\n"
        elif task_type == "detection":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
        elif task_type == "text_classification":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
        else:
            raise Exception(f"不支持的任务类型! {task_type}")
        project_path = os.path.join(model_save_path, project_name)
        if task_type == "classification":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf {tfrecord_dataset_path} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        elif task_type == "detection":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f'&& rm -rf {os.path.join(project_name,"train_models")} '
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf {tfrecord_dataset_path} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        elif task_type == "text_classification":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        luwu_version = self.kwargs.get("luwu_version")
        if luwu_version:
            install_cmd = f"!pip install luwu=={luwu_version}\n"
        else:
            install_cmd = "!pip install luwu\n"
        codes = [
            "# 安装 luwu\n",
            install_cmd,
            "# 执行训练指令\n",
            train_cmd,
            "# 打包待下载文件的指令\n",
            zip_cmd,
            "    ",
        ]
        script_metadata = self.load_notebook_metadata()
        self.update_notebook_codes(script_metadata, codes)
        kernel_file_path = os.path.join(kernel_path,
                                        f"luwu-kernel-{self.uuid}.ipynb")
        with open(kernel_file_path, "w") as f:
            json.dump(script_metadata, f, ensure_ascii=False, indent=2)
        # Edit kernel-metadata.json
        kernel_metadata_path = os.path.join(kernel_path,
                                            "kernel-metadata.json")
        with open(kernel_metadata_path, "r") as f:
            kernel_metadata = json.load(f)
        kernel_metadata["id"] = (kernel_metadata["id"].split("/")[0] + "/" +
                                 f"luwu-kernel-{self.uuid}")
        kernel_metadata["title"] = f"luwu-kernel-{self.uuid}"
        kernel_metadata["code_file"] = kernel_file_path
        kernel_metadata["language"] = "python"
        kernel_metadata["kernel_type"] = "notebook"
        kaggle_accelerator = self.kwargs.get("kaggle_accelerator", False)
        if kaggle_accelerator:
            kernel_metadata["enable_gpu"] = "true"
        else:
            kernel_metadata["enable_gpu"] = "false"
        kernel_metadata["dataset_sources"] = [
            self.dataset_id,
        ]
        with open(kernel_metadata_path, "w") as f:
            json.dump(kernel_metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"kernel metadata :{kernel_metadata}")
        self.kernel_id = kernel_metadata["id"]
        self.kernel_title = kernel_metadata["title"]
        # Push and run the kernel
        logger.info("Pushing the kernel to Kaggle and running it ...")
        cmd = f"kaggle kernels push -p {kernel_path}"
        logger.debug(cmd)
        cmd_util.run_cmd(cmd)
        logger.info("推送完成!等待运行中 ...")
        running = False
        error_cnt = 0
        while True:
            cmd = f"kaggle kernels status {self.kernel_id}"
            code, output = subprocess.getstatusoutput(cmd)
            if code != 0:
                logger.error(output)
                raise Exception(output)
            pattern = 'has status "([^"]*)"'
            matches = re.findall(pattern, output)
            if not matches:
                logger.error(f"未查询到状态!{output}")
                error_cnt += 1
                if error_cnt > 10:
                    raise Exception(
                        f"Failed to get the running status of kernel {self.kernel_id} 10 times in a row!")
            else:
                status = matches[0]
                # Ignore all statuses until the kernel starts running
                if not running:
                    if status == "running":
                        logger.info(f"{self.kernel_id} running ...")
                        running = True
                else:
                    # Once it has been running, exit on the first non-running status
                    if status == "running":
                        logger.info(f"{self.kernel_id} running ...")
                    else:
                        self.kernel_exit_status = status
                        logger.info(output)
                        logger.info(
                            f"{self.kernel_id} exit status: {self.kernel_exit_status}. Exited!"
                        )
                        break
            # Poll every 10 seconds (also when no status could be parsed)
            time.sleep(10)
        logger.info("kernel 运行已结束!")