Example 1
 def __init__(
     self,
     origin_dataset_path: str = "",
     tfrecord_dataset_path: str = "",
     model_save_path: str = "",
     validation_split: float = 0.2,
     batch_size: int = 32,
     epochs: int = 30,
     project_id: int = 0,
     image_size: int = 224,
     do_fine_tune=False,
     with_image_net=True,
     **kwargs,
 ):
     """
     Args:
         origin_dataset_path (str): Path to the dataset before processing
         tfrecord_dataset_path (str): Path to the processed (tfrecord) dataset
         model_save_path (str): Path where the model is saved
         validation_split (float): Fraction of data split off for validation
         batch_size (int): Mini-batch size
         epochs (int): Number of training epochs
         project_id (int): Training project id
         with_image_net (bool): Whether to normalize the data with the imagenet mean
     """
     self._call_code = ""
     self.project_id = project_id
     self.do_fine_tune = do_fine_tune
     self.with_image_net = with_image_net
     origin_dataset_path = file_util.abspath(origin_dataset_path)
     tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
     model_save_path = file_util.abspath(model_save_path)
     self.image_size = image_size
     self.origin_dataset_path = origin_dataset_path
     # If no path is given for the processed dataset, default to the same directory as the original dataset
     if tfrecord_dataset_path:
         self.tfrecord_dataset_path = tfrecord_dataset_path
     else:
         self.tfrecord_dataset_path = origin_dataset_path
     # If no model save path is given, default to the same directory as the processed dataset
     if self.project_id:
         self.project_save_name = f"luwu-classification-project-{self.project_id}"
     else:
         self.project_save_name = "luwu-classification-project"
     if model_save_path:
         self.project_save_path = os.path.join(model_save_path,
                                               self.project_save_name)
     else:
         self.project_save_path = os.path.join(self.tfrecord_dataset_path,
                                               self.project_save_name)
     self.model_save_path = os.path.join(self.project_save_path,
                                         "best_weights.h5")
     self.validation_split = validation_split
     self.batch_size = batch_size
     self.epochs = epochs
     file_util.mkdirs(self.project_save_path)
     file_util.mkdirs(self.tfrecord_dataset_path)
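A small self-contained sketch of the default-path logic above; the paths and project id are illustrative:

    import os

    origin_dataset_path = "/data/flowers"   # illustrative
    tfrecord_dataset_path = ""              # not given -> falls back to the origin path
    model_save_path = ""                    # not given -> falls back to the tfrecord path
    project_id = 3

    tfrecord_dataset_path = tfrecord_dataset_path or origin_dataset_path
    project_save_name = (f"luwu-classification-project-{project_id}"
                         if project_id else "luwu-classification-project")
    project_save_path = os.path.join(model_save_path or tfrecord_dataset_path,
                                     project_save_name)
    model_save_path = os.path.join(project_save_path, "best_weights.h5")
    # project_save_path == "/data/flowers/luwu-classification-project-3"
    # model_save_path   == "/data/flowers/luwu-classification-project-3/best_weights.h5"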
Example 2
 def upload_dataset(self):
     origin_dataset_path = self.kwargs.get("origin_dataset_path", "")
     if os.path.exists(origin_dataset_path):
         # Make a copy of the dataset first
         # Create a directory for it
         dataset_path = os.path.join(self.tmp_dir_path, "kaggle-data")
         copy_path = os.path.join(dataset_path, "data")
         logger.info(f"Creating directory {copy_path} ...")
         file_util.mkdirs(copy_path)
         # Copy the dataset into the temporary directory
         logger.info("Copying the dataset into the temporary directory ...")
         if os.path.isdir(origin_dataset_path):
             cmd = f'cp -r {os.path.join(origin_dataset_path,"*")} {copy_path}'
         else:
             cmd = f"cp -r {origin_dataset_path} {copy_path}"
         cmd_util.run_cmd(cmd)
         # Initialize the dataset with the kaggle api
         logger.info("Initializing the dataset with the kaggle api ...")
         cmd = f"kaggle datasets init -p {dataset_path}"
         cmd_util.run_cmd(cmd)
         # Configure the dataset metadata
         dataset_meta_path = os.path.join(dataset_path,
                                          "dataset-metadata.json")
         with open(dataset_meta_path, "r") as f:
             dataset_meta = json.load(f)
         dataset_meta["title"] = f"luwu-dataset-{self.uuid}"
         dataset_meta["id"] = (dataset_meta["id"].split("/")[0] + "/" +
                               f"luwu-dataset-{self.uuid}")
         with open(dataset_meta_path, "w") as f:
             json.dump(dataset_meta, f, ensure_ascii=False, indent=2)
         # Upload the dataset
         logger.info("Uploading the dataset ...")
         cmd = f"kaggle datasets create -r zip -p {dataset_path}"
         cmd_util.run_cmd(cmd)
         logger.info("数据集上传完成!")
         logger.info("等待 kaggle 处理数据集,这可能需要几分钟时间 ...")
         self.dataset_id = dataset_meta["id"]
         self.dataset_title = dataset_meta["title"]
         cmd = f"kaggle datasets status {self.dataset_id}"
         while True:
             code, output = subprocess.getstatusoutput(cmd)
             if code != 0:
                 logger.error(output)
                 raise Exception("查询数据集状态失败!")
             if output:
                 if "ready" in output:
                     logger.info("数据集准备完成!")
                 else:
                     logger.warning(output)
                 break
             else:
                 logger.info("暂未查询到数据,等待中 ...")
                 time.sleep(10)
     else:
         raise FileNotFoundError(
             f"指定的 origin_dataset_path 不存在!{origin_dataset_path}")
Example 3
 def download_result_from_kaggle(self):
     output_path = os.path.join(self.tmp_dir_path, "kaggle-output")
     logger.info(f"创建文件夹 {output_path} ...")
     file_util.mkdirs(output_path)
     logger.info("从kaggle拉取运行结果...")
     cmd = f"kaggle kernels output {self.kernel_id} -p {output_path}"
     cmd_util.run_cmd(cmd)
     model_save_path = self.kwargs.get("model_save_path", "")
     if not model_save_path:
         model_save_path = "luwu-output"
     project_path = file_util.abspath(model_save_path)
     file_util.mkdirs(project_path)
     output_files_path = os.path.join(output_path, "*")
     logger.info(f"将运行结果移动到指定目录 {project_path} ...")
     cmd = f"cp -r {output_files_path} {project_path}"
     cmd_util.run_cmd(cmd)
     logger.info("Done.")
Example 4
def run_project(project):
    params = project["params"]
    model_save_path = params["model_save_path"]
    log_path = os.path.join(
        model_save_path,
        f"luwu-classification-project-{project['id']}",
        f"train.log",
    )
    file_util.mkdirs(os.path.dirname(log_path))
    curdir = os.path.abspath(os.path.dirname(__file__))
    cd_path = os.path.abspath(os.path.join(os.path.join(curdir, ".."), ".."))
    py_path = os.path.join("." + curdir[len(cd_path) :], "train_project.py")
    cmd = f"""cd {cd_path};python {py_path} {project['id']} > {log_path} 2>&1"""
    st, out = subprocess.getstatusoutput(cmd)
    if st == 0:
        logger.info("处理成功!")
    else:
        logger.info("处理失败!")
        raise Exception(out)
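A self-contained illustration of the relative-path computation above; the directory layout is illustrative, not taken from the source:

    import os

    # Suppose this scheduler module lives two levels below the repo root (illustrative paths)
    curdir = "/opt/luwu/luwu/scheduler"                           # stands in for os.path.dirname(__file__)
    cd_path = os.path.abspath(os.path.join(curdir, "..", ".."))  # "/opt/luwu"
    py_path = os.path.join("." + curdir[len(cd_path):], "train_project.py")
    # py_path == "./luwu/scheduler/train_project.py", meant to be run after cd-ing into cd_path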
Example 5
    def __init__(
        self,
        origin_dataset_path: str,
        validation_dataset_path: str = "",
        test_dataset_path: str = "",
        model_save_path: str = "",
        validation_split: float = 0.1,
        test_split: float = 0.1,
        batch_size: int = 32,
        epochs: int = 30,
        learning_rate: float = 0.01,
        project_id: int = 0,
        maxlen: int = 128,
        frezee_pre_trained_model=False,
        optimizer: str = "Adam",
        optimize_with_piecewise_linear_lr: bool = False,
        simplified_tokenizer: bool = False,
        pre_trained_model_type: str = "bert_base",
        language: str = "chinese",
        *args,
        **kwargs,
    ):
        """
        Args:
            origin_dataset_path (str): Path to the dataset before processing
            validation_dataset_path (str): Path to the validation dataset. If not
                    specified, it is split off from origin_dataset_path.
            test_dataset_path (str): Path to the test dataset. If not specified,
                    it is split off from origin_dataset_path.
            model_save_path (str): Path where the model is saved
            validation_split (float): Fraction of data split off for validation
            test_split (float): Fraction of data split off for testing
            batch_size (int): Mini-batch size
            learning_rate (float): Learning rate
            epochs (int): Number of training epochs
            project_id (int): Training project id
            maxlen (int, optional): Maximum length of a single text. Defaults to 128.
            frezee_pre_trained_model (bool, optional): Whether to freeze the pre-trained
                    model weights while training the downstream network. Defaults to False.
            optimizer (str, optional): Optimizer type. Defaults to "Adam".
            optimize_with_piecewise_linear_lr (bool): Whether to optimize with a piecewise
                    linear learning rate. Defaults to False.
            simplified_tokenizer (bool): Whether to prune the tokenizer vocabulary. Defaults to False.
            pre_trained_model_type (str): Which pre-trained model to use
            language (str): Language of the pre-training corpus
        self._call_code = ""
        self.project_id = project_id
        self.frezee_pre_trained_model = frezee_pre_trained_model
        self.learning_rate = learning_rate

        self.optimize_with_piecewise_linear_lr = optimize_with_piecewise_linear_lr
        self.optimizer_cls = self.get_optimizer_cls(optimizer)

        origin_dataset_path = file_util.abspath(origin_dataset_path)
        model_save_path = file_util.abspath(model_save_path)

        self.simplified_tokenizer = simplified_tokenizer
        self.pre_trained_model_type = pre_trained_model_type
        self.language = language
        if self.pre_trained_model_type not in self.model_lang_weights_dict:
            raise Exception(
                f"指定模型 {self.pre_trained_model_type} 不存在!当前支持的模型为:{list(self.model_lang_weights_dict.keys())}"
            )
        if (self.language not in self.model_lang_weights_dict[
                self.pre_trained_model_type]):
            languages = list(self.model_lang_weights_dict[
                self.pre_trained_model_type].keys())
            raise Exception(
                f"指定语料 {self.language} 的预训练模型 {self.pre_trained_model_type} 不存在!支持的语料为:{languages}"
            )

        self.maxlen = maxlen
        self.origin_dataset_path = origin_dataset_path

        if validation_dataset_path:
            self.validation_dataset_path = file_util.abspath(
                validation_dataset_path)
        else:
            self.validation_dataset_path = validation_dataset_path

        if test_dataset_path:
            self.test_dataset_path = file_util.abspath(test_dataset_path)
        else:
            self.test_dataset_path = test_dataset_path

        # If no model save path is given, default to the same directory as the origin dataset
        self.project_save_name = self.init_project_save_name(project_id)
        if model_save_path:
            self.project_save_path = os.path.join(model_save_path,
                                                  self.project_save_name)
        else:
            self.project_save_path = os.path.join(
                os.path.dirname(self.origin_dataset_path),
                self.project_save_name)
        self.model_save_path = os.path.join(self.project_save_path,
                                            "best_weights.h5")

        self.validation_split = validation_split
        self.test_split = test_split
        self.batch_size = batch_size
        self.epochs = epochs

        file_util.mkdirs(self.project_save_path)

        self.model = None
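`get_optimizer_cls` is referenced above but not shown in these examples. A minimal sketch of what such a lookup could look like, assuming tf.keras optimizers; the mapping is illustrative and not the source implementation:

    import tensorflow as tf

    def get_optimizer_cls(name: str):
        # Map a user-facing optimizer name to a tf.keras optimizer class (illustrative mapping)
        optimizers = {
            "Adam": tf.keras.optimizers.Adam,
            "SGD": tf.keras.optimizers.SGD,
            "RMSprop": tf.keras.optimizers.RMSprop,
        }
        if name not in optimizers:
            raise Exception(f"Unsupported optimizer: {name}")
        return optimizers[name]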
Example 6
    def train_on_kaggle(self, task_type):
        # Generate the training code
        # Create a directory
        kernel_path = os.path.join(self.tmp_dir_path, "kaggle-kernel")
        logger.info(f"Creating directory {kernel_path} ...")
        file_util.mkdirs(kernel_path)
        # Initialize the kernel
        logger.info("Initializing the kernel with the kaggle api ...")
        cmd = f"kaggle kernels init -p {kernel_path}"
        cmd_util.run_cmd(cmd)
        # Generate the training script
        override_params = {"project_id", "cmd", "luwu_version"}
        train_cmd_params = []
        if task_type == "classification":
            project_name = "luwu-classification-project"
            override_params.update(["net_name", "network_name"])
            # Path of the tfrecord dataset
            tfrecord_dataset_path = "./dataset"
            train_cmd_params.append(
                f"--tfrecord_dataset_path {tfrecord_dataset_path}")
            override_params.add("tfrecord_dataset_path")
        elif task_type == "detection":
            project_name = "luwu-object-detection-project"
            override_params.update([
                "label_map_path",
                "fine_tune_checkpoint_path",
            ])
            # Path of the tfrecord dataset
            tfrecord_dataset_path = "./dataset"
            train_cmd_params.append(
                f"--tfrecord_dataset_path {tfrecord_dataset_path}")
            override_params.add("tfrecord_dataset_path")
        elif task_type == "text_classification":
            project_name = "luwu-text-classification-project"
        else:
            raise Exception(f"不支持的任务类型! {task_type}")

        # Path of the original dataset
        origin_dataset_path = os.path.join("../input", self.dataset_title)
        if self.kwargs.get("cmd") == "text_classification":
            filename = self.kwargs.get("origin_dataset_path").split("/")[-1]
            origin_dataset_path = os.path.join(origin_dataset_path, filename)
        train_cmd_params.append(f"--origin_dataset_path {origin_dataset_path}")
        override_params.add("origin_dataset_path")
        # Path where the model is saved
        model_save_path = "./project"
        train_cmd_params.append(f"--model_save_path {model_save_path}")
        override_params.add("model_save_path")
        # Other parameters
        for arg_name, arg_value in self.kwargs.items():
            if "kaggle" in arg_name:
                continue
            if arg_name in override_params:
                continue
            # Handle bool-typed parameters: skip flags that are False
            if arg_value != False:
                train_cmd_params.append(f'--{arg_name} "{arg_value}"')
            # else:
            #     train_cmd_params.append(f"--{arg_name}")
        if task_type == "classification":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)} {self.luwu_model_class.__name__}\n"
        elif task_type == "detection":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
        elif task_type == "text_classification":
            train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
        else:
            raise Exception(f"不支持的任务类型! {task_type}")
        project_path = os.path.join(model_save_path, project_name)
        if task_type == "classification":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf {tfrecord_dataset_path} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        elif task_type == "detection":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f'&& rm -rf {os.path.join(project_name,"train_models")} '
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf {tfrecord_dataset_path} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        elif task_type == "text_classification":
            zip_cmd = (
                f"!mv {project_path} ./ "
                f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} "
                f"&& rm -rf ./{project_name} "
                f"&& rm -rf {model_save_path} \n")
        luwu_version = self.kwargs.get("luwu_version")
        if luwu_version:
            install_cmd = f"!pip install luwu=={luwu_version}\n"
        else:
            install_cmd = "!pip install luwu\n"
        codes = [
            "# 安装 luwu\n",
            install_cmd,
            "# 执行训练指令\n",
            train_cmd,
            "# 打包待下载文件的指令\n",
            zip_cmd,
            "    ",
        ]
        script_metadata = self.load_notebook_metadata()
        self.update_notebook_codes(script_metadata, codes)
        kernel_file_path = os.path.join(kernel_path,
                                        f"luwu-kernel-{self.uuid}.ipynb")
        with open(kernel_file_path, "w") as f:
            json.dump(script_metadata, f, ensure_ascii=False, indent=2)
        # Update kernel-metadata.json
        kernel_metadata_path = os.path.join(kernel_path,
                                            "kernel-metadata.json")
        with open(kernel_metadata_path, "r") as f:
            kernel_metadata = json.load(f)
        kernel_metadata["id"] = (kernel_metadata["id"].split("/")[0] + "/" +
                                 f"luwu-kernel-{self.uuid}")
        kernel_metadata["title"] = f"luwu-kernel-{self.uuid}"
        kernel_metadata["code_file"] = kernel_file_path
        kernel_metadata["language"] = "python"
        kernel_metadata["kernel_type"] = "notebook"
        kaggle_accelerator = self.kwargs.get("kaggle_accelerator", False)
        if kaggle_accelerator:
            kernel_metadata["enable_gpu"] = "true"
        else:
            kernel_metadata["enable_gpu"] = "false"
        kernel_metadata["dataset_sources"] = [
            self.dataset_id,
        ]
        with open(kernel_metadata_path, "w") as f:
            json.dump(kernel_metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"kernel metadata :{kernel_metadata}")
        self.kernel_id = kernel_metadata["id"]
        self.kernel_title = kernel_metadata["title"]
        # Push the kernel to kaggle and run it
        logger.info("Pushing the kernel to Kaggle and running it ...")
        cmd = f"kaggle kernels push -p {kernel_path}"
        logger.debug(cmd)
        cmd_util.run_cmd(cmd)
        logger.info("推送完成!等待运行中 ...")
        running = False
        error_cnt = 0
        while True:
            cmd = f"kaggle kernels status {self.kernel_id}"
            code, output = subprocess.getstatusoutput(cmd)
            if code != 0:
                logger.error(output)
                raise Exception(output)
            pattern = 'has status "([^"]*)"'
            matches = re.findall(pattern, output)
            if not matches:
                logger.error(f"未查询到状态!{output}")
                error_cnt += 1
                if error_cnt > 10:
                    raise Exception(
                        f"Failed to get the running status of kernel {self.kernel_id} 10 times in a row!")
            else:
                status = matches[0]
                # Before the kernel starts running, ignore all statuses
                if not running:
                    if status == "running":
                        logger.info(f"{self.kernel_id} running ...")
                        running = True
                else:
                    # Once it has started running, exit on the first non-running status
                    if status == "running":
                        logger.info(f"{self.kernel_id} running ...")
                    else:
                        self.kernel_exit_status = status
                        logger.info(output)
                        logger.info(
                            f"{self.kernel_id} exit status: {self.kernel_exit_status}. Exited!"
                        )
                        break
                time.sleep(10)
        logger.info("kernel 运行已结束!")
Example 7
 def __init__(
     self,
     origin_dataset_path: str = "",
     validation_dataset_path: str = "",
     test_dataset_path: str = "",
     tfrecord_dataset_path: str = "",
     model_save_path: str = "",
     validation_split: float = 0.1,
     test_split: float = 0.1,
     batch_size: int = 32,
     epochs: int = 30,
     learning_rate: float = 0.01,
     project_id: int = 0,
     image_size: int = 224,
     do_fine_tune=False,
     with_image_net=True,
     optimizer: str = "Adam",
     freeze_epochs_ratio: float = 0.1,
     image_augmentation_random_flip_horizontal: bool = False,
     image_augmentation_random_flip_vertival: bool = False,
     image_augmentation_random_crop: bool = False,
     image_augmentation_random_brightness: bool = False,
     image_augmentation_random_hue: bool = False,
     **kwargs,
 ):
     """
     Args:
         origin_dataset_path (str): Path to the dataset before processing
         validation_dataset_path (str): Path to the validation dataset. If not
                 specified, it is split off from origin_dataset_path.
         test_dataset_path (str): Path to the test dataset. If not specified,
                 it is split off from origin_dataset_path.
         tfrecord_dataset_path (str): Path to the processed (tfrecord) dataset
         model_save_path (str): Path where the model is saved
         validation_split (float): Fraction of data split off for validation
         test_split (float): Fraction of data split off for testing
         batch_size (int): Mini-batch size
         learning_rate (float): Learning rate
         epochs (int): Number of training epochs
         project_id (int): Training project id
         with_image_net (bool): Whether to normalize the data with the imagenet mean
         optimizer (str): Optimizer type
         freeze_epochs_ratio (float): When fine-tuning, the pre-trained model is first
                 frozen and trained for some epochs, then all parameters are unfrozen
                 and trained for the remaining epochs. This parameter is the fraction
                 of the total epochs spent with the model frozen (only used when
                 do_fine_tune = True). Defaults to 0.1 (as long as a ratio is set and
                 the total number of epochs is greater than 1, at least one frozen
                 epoch is trained).
         image_augmentation_random_flip_horizontal (bool): Data augmentation option,
                 whether to apply a random horizontal flip. Defaults to False.
         image_augmentation_random_flip_vertival (bool): Data augmentation option,
                 whether to apply a random vertical flip. Defaults to False.
         image_augmentation_random_crop (bool): Data augmentation option, whether to
                 apply a random crop at 0.9 of the original size. Defaults to False.
         image_augmentation_random_brightness (bool): Data augmentation option, whether
                 to apply a random brightness adjustment. Defaults to False.
         image_augmentation_random_hue (bool): Data augmentation option, whether to
                 apply a random hue adjustment. Defaults to False.
     """
     self._call_code = ""
     self.project_id = project_id
     self.do_fine_tune = do_fine_tune
     self.with_image_net = with_image_net
     self.learning_rate = learning_rate
     self.freeze_epochs_ratio = freeze_epochs_ratio
     self.image_augmentation_random_flip_horizontal = (
         image_augmentation_random_flip_horizontal)
     self.image_augmentation_random_flip_vertival = (
         image_augmentation_random_flip_vertival)
     self.image_augmentation_random_crop = image_augmentation_random_crop
     self.image_augmentation_random_brightness = image_augmentation_random_brightness
     self.image_augmentation_random_hue = image_augmentation_random_hue
     self.optimizer_cls = self.get_optimizer_cls(optimizer)
     origin_dataset_path = file_util.abspath(origin_dataset_path)
     tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
     model_save_path = file_util.abspath(model_save_path)
     self.image_size = image_size
     self.origin_dataset_path = origin_dataset_path
     if validation_dataset_path:
         self.validation_dataset_path = file_util.abspath(
             validation_dataset_path)
     else:
         self.validation_dataset_path = validation_dataset_path
     if test_dataset_path:
         self.test_dataset_path = file_util.abspath(test_dataset_path)
     else:
         self.test_dataset_path = test_dataset_path
     # If no path is given for the processed dataset, default to the same directory as the original dataset
     if tfrecord_dataset_path:
         self.tfrecord_dataset_path = tfrecord_dataset_path
     else:
         self.tfrecord_dataset_path = origin_dataset_path
     # If no model save path is given, default to the same directory as the processed dataset
     if self.project_id:
         self.project_save_name = f"luwu-classification-project-{self.project_id}"
     else:
         self.project_save_name = "luwu-classification-project"
     if model_save_path:
         self.project_save_path = os.path.join(model_save_path,
                                               self.project_save_name)
     else:
         self.project_save_path = os.path.join(self.tfrecord_dataset_path,
                                               self.project_save_name)
     self.model_save_path = os.path.join(self.project_save_path,
                                         "best_weights.h5")
     self.validation_split = validation_split
     self.test_split = test_split
     self.batch_size = batch_size
     self.epochs = epochs
     file_util.mkdirs(self.project_save_path)
     file_util.mkdirs(self.tfrecord_dataset_path)
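The fine-tune schedule controlled by `freeze_epochs_ratio` is documented above but implemented elsewhere. A minimal sketch of how the frozen-epoch count could be derived from it, offered as an assumption rather than the source implementation:

    def split_epochs(epochs: int, freeze_epochs_ratio: float):
        # Frozen warm-up epochs followed by fully unfrozen epochs; train at least
        # one frozen epoch whenever a ratio is set and there is more than one epoch.
        freeze_epochs = int(round(epochs * freeze_epochs_ratio))
        if freeze_epochs_ratio > 0 and epochs > 1:
            freeze_epochs = max(1, freeze_epochs)
        return freeze_epochs, epochs - freeze_epochs

    # e.g. epochs=30, ratio=0.1 -> (3, 27); epochs=5, ratio=0.1 -> (1, 4)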