Example #1
0
 def __init__(
     self,
     origin_dataset_path: str = "",
     tfrecord_dataset_path: str = "",
     model_save_path: str = "",
     validation_split: float = 0.2,
     batch_size: int = 32,
     epochs: int = 30,
     project_id: int = 0,
     image_size: int = 224,
     do_fine_tune: bool = False,
     with_image_net: bool = True,
     **kwargs,
 ):
     """Set up dataset/model paths and training hyper-parameters.

     Args:
         origin_dataset_path (str): Path of the dataset before processing.
         tfrecord_dataset_path (str): Path of the processed dataset.
         model_save_path (str): Path where the model is saved.
         validation_split (float): Fraction of data used for validation.
         batch_size (int): Mini-batch size.
         epochs (int): Number of training epochs.
         project_id (int): Training project id.
         image_size (int): Target image edge length in pixels.
         do_fine_tune (bool): Whether to fine-tune the pre-trained model.
         with_image_net (bool): Whether to initialize data with the
             ImageNet mean values.
     """
     self._call_code = ""
     self.project_id = project_id
     self.do_fine_tune = do_fine_tune
     self.with_image_net = with_image_net
     origin_dataset_path = file_util.abspath(origin_dataset_path)
     tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
     model_save_path = file_util.abspath(model_save_path)
     self.image_size = image_size
     self.origin_dataset_path = origin_dataset_path
     # When no processed-dataset path is given, default to the original
     # dataset's path.
     if tfrecord_dataset_path:
         self.tfrecord_dataset_path = tfrecord_dataset_path
     else:
         self.tfrecord_dataset_path = origin_dataset_path
     # Project directory name; include the project id when one is set.
     if self.project_id:
         self.project_save_name = f"luwu-classification-project-{self.project_id}"
     else:
         # Plain string: no placeholder, so no f-prefix needed (F541).
         self.project_save_name = "luwu-classification-project"
     # When no model save path is given, default to the processed
     # dataset's path.
     if model_save_path:
         self.project_save_path = os.path.join(model_save_path,
                                               self.project_save_name)
     else:
         self.project_save_path = os.path.join(self.tfrecord_dataset_path,
                                               self.project_save_name)
     self.model_save_path = os.path.join(self.project_save_path,
                                         "best_weights.h5")
     self.validation_split = validation_split
     self.batch_size = batch_size
     self.epochs = epochs
     file_util.mkdirs(self.project_save_path)
     file_util.mkdirs(self.tfrecord_dataset_path)
Example #2
0
    def download_pre_trained_model(self):
        """Fetch the pre-trained weight archive and record its local paths."""
        # Look up the weight bundle for the configured model type/language.
        weights_cfg = self.model_lang_weights_dict[
            self.pre_trained_model_type][self.language]
        url = weights_cfg["url"]
        config_path = weights_cfg["config_path"]
        checkpoint_path = weights_cfg["checkpoint_path"]
        dict_path = weights_cfg["dict_path"]

        filename = url.split("/")[-1]
        cache_subdir = file_util.abspath("~/.luwu/cache_models")
        # Download and unzip into the cache dir; get_file reuses an
        # already-cached copy instead of re-downloading.
        tf.keras.utils.get_file(
            filename,
            url,
            cache_dir=".",
            cache_subdir=cache_subdir,
            extract=True,
            archive_format="zip",
        )

        # The archive extracts into a directory named after the file stem.
        extracted_dir = os.path.join(cache_subdir, filename.split(".")[0])
        self.pre_trained_model_config_path = os.path.join(
            extracted_dir, config_path)
        self.pre_trained_model_checkpoint_path = os.path.join(
            extracted_dir, checkpoint_path)
        self.pre_trained_model_dict_path = os.path.join(
            extracted_dir, dict_path)
Example #3
0
    def __init__(
        self,
        origin_dataset_path: str = "",
        tfrecord_dataset_path: str = "",
        label_map_path: str = "",
        do_fine_tune: bool = True,
        fine_tune_checkpoint_path: str = "",
        fine_tune_model_name: str = "",
        model_save_path: str = "",
        batch_size: int = 8,
        steps: int = 2000,
        project_id: int = 0,
        **kwargs,
    ):
        """Luwu object-detection base class with fine-tuning options.
        # TODO: validation-set support is not implemented yet; add later.

        Args:
            origin_dataset_path (str): Path of the dataset before processing.
            tfrecord_dataset_path (str): Path of the processed tfrecord dataset.
            label_map_path (str): Detection class map file (pbtxt).
            do_fine_tune (bool): Whether to fine-tune a pre-trained model.
            fine_tune_checkpoint_path (str): Pre-trained checkpoint path.
            fine_tune_model_name (str): Pre-trained model name.
            model_save_path (str): Path where the model is saved.
            batch_size (int): Mini-batch size.
            steps (int, optional): Number of training steps. Defaults to 2000.
            project_id (int, optional): Project id. Defaults to 0.
        """
        base_kwargs = dict(
            origin_dataset_path=origin_dataset_path,
            tfrecord_dataset_path=tfrecord_dataset_path,
            label_map_path=label_map_path,
            model_save_path=model_save_path,
            batch_size=batch_size,
            steps=steps,
            project_id=project_id,
        )
        super().__init__(**base_kwargs, **kwargs)
        self.do_fine_tune = do_fine_tune
        self.fine_tune_checkpoint_path = file_util.abspath(
            fine_tune_checkpoint_path)
        self.fine_tune_model_name = fine_tune_model_name
        # Reject model names that have no known configuration up front.
        if self.fine_tune_model_name not in self.fine_tune_models_config_map:
            raise Exception(
                f"暂不支持的 object detection model! {self.fine_tune_model_name}")
Example #4
0
 def download_result_from_kaggle(self):
     """Pull the finished kernel output from Kaggle and copy it locally."""
     output_path = os.path.join(self.tmp_dir_path, "kaggle-output")
     logger.info(f"创建文件夹 {output_path} ...")
     file_util.mkdirs(output_path)
     logger.info("从kaggle拉取运行结果...")
     cmd_util.run_cmd(f"kaggle kernels output {self.kernel_id} -p {output_path}")
     # Fall back to a default output directory when none was configured.
     model_save_path = self.kwargs.get("model_save_path", "") or "luwu-output"
     project_path = file_util.abspath(model_save_path)
     file_util.mkdirs(project_path)
     output_files_path = os.path.join(output_path, "*")
     logger.info(f"将运行结果移动到指定目录 {project_path} ...")
     cmd_util.run_cmd(f"cp -r {output_files_path} {project_path}")
     logger.info("Done.")
Example #5
0
    def __init__(
        self,
        origin_dataset_path: str = "",
        tfrecord_dataset_path: str = "",
        label_map_path: str = "",
        model_save_path: str = "",
        batch_size: int = 8,
        steps: int = 2000,
        project_id: int = 0,
        **kwargs,
    ):
        """Luwu object-detection model base class.
        # TODO: validation-set support is not implemented yet; add later.

        Args:
            origin_dataset_path (str): Path of the dataset before processing.
            tfrecord_dataset_path (str): Path of the processed tfrecord
                dataset — either a directory (tfrecord will be generated
                inside) or an existing tfrecord file (used as-is).
            label_map_path (str): Detection class map (pbtxt) — either an
                existing file or a directory to generate one in.
            model_save_path (str): Path where the model is saved.
            batch_size (int): Mini-batch size.
            steps (int, optional): Number of training steps. Defaults to 2000.
            project_id (int, optional): Project id. Defaults to 0.
        """
        self._call_code = ""
        self.project_id = project_id
        origin_dataset_path = file_util.abspath(origin_dataset_path)
        tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
        label_map_path = file_util.abspath(label_map_path)
        model_save_path = file_util.abspath(model_save_path)
        self.origin_dataset_path = origin_dataset_path
        # When no processed-dataset path is given, default to the original
        # dataset's path.
        if tfrecord_dataset_path:
            # Distinguish a directory from a file:
            # a directory means the tfrecord file must be generated there;
            # a file is used directly and the generation step is skipped.
            if os.path.isfile(tfrecord_dataset_path):
                self.tfrecord_dataset_file_path = tfrecord_dataset_path
                self.tfrecord_dataset_dir = os.path.dirname(
                    tfrecord_dataset_path)
                self.need_generate_tfrecord = False
            else:
                self.tfrecord_dataset_dir = tfrecord_dataset_path
                self.tfrecord_dataset_file_path = os.path.join(
                    self.tfrecord_dataset_dir, "train.tfrecord")
                self.need_generate_tfrecord = True
        else:
            self.tfrecord_dataset_dir = self.origin_dataset_path
            self.tfrecord_dataset_file_path = os.path.join(
                self.tfrecord_dataset_dir, "train.tfrecord")
            self.need_generate_tfrecord = True
        # When no pbtxt path is given, also default to the tfrecord directory.
        if label_map_path:
            if os.path.isfile(label_map_path):
                self.label_map_file_path = label_map_path
                self.label_map_dir = os.path.dirname(self.label_map_file_path)
                self.need_generate_label_map = False
            else:
                self.label_map_dir = label_map_path
                self.label_map_file_path = os.path.join(
                    self.label_map_dir, "label_map.pbtxt")
                self.need_generate_label_map = True
        else:
            self.label_map_dir = self.tfrecord_dataset_dir
            self.label_map_file_path = os.path.join(self.label_map_dir,
                                                    "label_map.pbtxt")
            self.need_generate_label_map = True
        # When no model save path is given, default to the processed
        # dataset's directory.
        if self.project_id:
            self.project_save_name = f"luwu-object-detection-project-{self.project_id}"
        else:
            # Plain string: no placeholder, so no f-prefix needed (F541).
            self.project_save_name = "luwu-object-detection-project"
        if model_save_path:
            self.project_save_path = os.path.join(model_save_path,
                                                  self.project_save_name)
        else:
            self.project_save_path = os.path.join(self.tfrecord_dataset_dir,
                                                  self.project_save_name)
        self.batch_size = batch_size
        self.steps = steps
Example #6
0
    def __init__(
        self,
        origin_dataset_path: str,
        validation_dataset_path: str = "",
        test_dataset_path: str = "",
        model_save_path: str = "",
        validation_split: float = 0.1,
        test_split: float = 0.1,
        batch_size: int = 32,
        epochs: int = 30,
        learning_rate: float = 0.01,
        project_id: int = 0,
        maxlen: int = 128,
        frezee_pre_trained_model=False,
        optimizer: str = "Adam",
        optimize_with_piecewise_linear_lr: bool = False,
        simplified_tokenizer: bool = False,
        pre_trained_model_type: str = "bert_base",
        language: str = "chinese",
        *args,
        **kwargs,
    ):
        """Configure dataset paths, tokenizer, and training hyper-parameters.

        Args:
            origin_dataset_path (str): Path of the dataset before processing.
            validation_dataset_path (str): Validation dataset path; when
                empty it is split from origin_dataset_path.
            test_dataset_path (str): Test dataset path; when empty it is
                split from origin_dataset_path.
            model_save_path (str): Path where the model is saved.
            validation_split (float): Validation split ratio.
            test_split (float): Test split ratio.
            batch_size (int): Mini-batch size.
            learning_rate (float): Learning rate.
            epochs (int): Number of training epochs.
            project_id (int): Training project id.
            maxlen (int, optional): Maximum length of a single text.
                Defaults to 128.
            frezee_pre_trained_model (bool, optional): Whether to freeze the
                pre-trained weights while training the downstream network.
                (The "frezee" spelling is part of the public interface and is
                kept for compatibility.) Defaults to False.
            optimizer (str, optional): Optimizer class name. Defaults to "Adam".
            optimize_with_piecewise_linear_lr (bool): Whether to optimize with
                a piecewise linear learning rate. Defaults to False.
            simplified_tokenizer (bool): Whether to shrink the tokenizer's
                vocabulary. Defaults to False.
            pre_trained_model_type (str): Which pre-trained model to use.
            language (str): Language of the pre-training corpus.
        """
        self._call_code = ""
        self.project_id = project_id
        self.frezee_pre_trained_model = frezee_pre_trained_model
        self.learning_rate = learning_rate
        self.optimize_with_piecewise_linear_lr = optimize_with_piecewise_linear_lr
        self.optimizer_cls = self.get_optimizer_cls(optimizer)

        origin_dataset_path = file_util.abspath(origin_dataset_path)
        model_save_path = file_util.abspath(model_save_path)

        self.simplified_tokenizer = simplified_tokenizer
        self.pre_trained_model_type = pre_trained_model_type
        self.language = language
        # Validate the model type first, then the language within it.
        if self.pre_trained_model_type not in self.model_lang_weights_dict:
            raise Exception(
                f"指定模型 {self.pre_trained_model_type} 不存在!当前支持的模型为:{list(self.model_lang_weights_dict.keys())}"
            )
        lang_weights = self.model_lang_weights_dict[self.pre_trained_model_type]
        if self.language not in lang_weights:
            languages = list(lang_weights.keys())
            raise Exception(
                f"指定语料 {self.language} 的预训练模型 {self.pre_trained_model_type} 不存在!支持的语料为:{languages}"
            )

        self.maxlen = maxlen
        self.origin_dataset_path = origin_dataset_path

        # Normalize optional dataset paths only when they were provided;
        # an empty string stays empty (meaning "split from origin").
        self.validation_dataset_path = (
            file_util.abspath(validation_dataset_path)
            if validation_dataset_path else validation_dataset_path)
        self.test_dataset_path = (
            file_util.abspath(test_dataset_path)
            if test_dataset_path else test_dataset_path)

        # When no model save path is given, default next to the origin dataset.
        self.project_save_name = self.init_project_save_name(project_id)
        if model_save_path:
            base_dir = model_save_path
        else:
            base_dir = os.path.dirname(self.origin_dataset_path)
        self.project_save_path = os.path.join(base_dir, self.project_save_name)
        self.model_save_path = os.path.join(self.project_save_path,
                                            "best_weights.h5")

        self.validation_split = validation_split
        self.test_split = test_split
        self.batch_size = batch_size
        self.epochs = epochs

        file_util.mkdirs(self.project_save_path)

        self.model = None
Example #7
0
 def __init__(
     self,
     origin_dataset_path: str = "",
     validation_dataset_path: str = "",
     test_dataset_path: str = "",
     tfrecord_dataset_path: str = "",
     model_save_path: str = "",
     validation_split: float = 0.1,
     test_split: float = 0.1,
     batch_size: int = 32,
     epochs: int = 30,
     learning_rate: float = 0.01,
     project_id: int = 0,
     image_size: int = 224,
     do_fine_tune: bool = False,
     with_image_net: bool = True,
     optimizer: str = "Adam",
     freeze_epochs_ratio: float = 0.1,
     image_augmentation_random_flip_horizontal: bool = False,
     image_augmentation_random_flip_vertival: bool = False,
     image_augmentation_random_crop: bool = False,
     image_augmentation_random_brightness: bool = False,
     image_augmentation_random_hue: bool = False,
     **kwargs,
 ):
     """Set up paths, hyper-parameters, and augmentation flags for training.

     Args:
         origin_dataset_path (str): Path of the dataset before processing.
         validation_dataset_path (str): Validation dataset path; when empty
                 it is split from origin_dataset_path.
         test_dataset_path (str): Test dataset path; when empty it is
                 split from origin_dataset_path.
         tfrecord_dataset_path (str): Path of the processed dataset.
         model_save_path (str): Path where the model is saved.
         validation_split (float): Validation split ratio.
         test_split (float): Test split ratio.
         batch_size (int): Mini-batch size.
         learning_rate (float): Learning rate.
         epochs (int): Number of training epochs.
         project_id (int): Training project id.
         image_size (int): Target image edge length in pixels.
         do_fine_tune (bool): Whether to fine-tune the pre-trained model.
         with_image_net (bool): Whether to initialize data with the
                 ImageNet mean values.
         optimizer (str): Optimizer class name.
         freeze_epochs_ratio (float): When fine-tuning, the pre-trained
                 model is first trained frozen for some epochs, then
                 unfrozen and trained for the rest; this is the frozen
                 fraction of the total epochs (only effective when
                 do_fine_tune = True). Defaults to 0.1 (when total
                 epochs > 1 and the ratio is set, at least one frozen
                 epoch runs).
         image_augmentation_random_flip_horizontal (bool): Augmentation
                 option — random horizontal flip. Defaults to False.
         image_augmentation_random_flip_vertival (bool): Augmentation
                 option — random vertical flip. Defaults to False.
         image_augmentation_random_crop (bool): Augmentation option —
                 random crop to 0.9 of the original size. Defaults to False.
         image_augmentation_random_brightness (bool): Augmentation option —
                 random brightness adjustment. Defaults to False.
         image_augmentation_random_hue (bool): Augmentation option —
                 random hue adjustment. Defaults to False.
     """
     self._call_code = ""
     self.project_id = project_id
     self.do_fine_tune = do_fine_tune
     self.with_image_net = with_image_net
     self.learning_rate = learning_rate
     self.freeze_epochs_ratio = freeze_epochs_ratio
     self.image_augmentation_random_flip_horizontal = (
         image_augmentation_random_flip_horizontal)
     self.image_augmentation_random_flip_vertival = (
         image_augmentation_random_flip_vertival)
     self.image_augmentation_random_crop = image_augmentation_random_crop
     self.image_augmentation_random_brightness = image_augmentation_random_brightness
     self.image_augmentation_random_hue = image_augmentation_random_hue
     self.optimizer_cls = self.get_optimizer_cls(optimizer)
     origin_dataset_path = file_util.abspath(origin_dataset_path)
     tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
     model_save_path = file_util.abspath(model_save_path)
     self.image_size = image_size
     self.origin_dataset_path = origin_dataset_path
     if validation_dataset_path:
         self.validation_dataset_path = file_util.abspath(
             validation_dataset_path)
     else:
         self.validation_dataset_path = validation_dataset_path
     if test_dataset_path:
         self.test_dataset_path = file_util.abspath(test_dataset_path)
     else:
         self.test_dataset_path = test_dataset_path
     # When no processed-dataset path is given, default to the original
     # dataset's path.
     if tfrecord_dataset_path:
         self.tfrecord_dataset_path = tfrecord_dataset_path
     else:
         self.tfrecord_dataset_path = origin_dataset_path
     # Project directory name; include the project id when one is set.
     if self.project_id:
         self.project_save_name = f"luwu-classification-project-{self.project_id}"
     else:
         # Plain string: no placeholder, so no f-prefix needed (F541).
         self.project_save_name = "luwu-classification-project"
     # When no model save path is given, default to the processed
     # dataset's path.
     if model_save_path:
         self.project_save_path = os.path.join(model_save_path,
                                               self.project_save_name)
     else:
         self.project_save_path = os.path.join(self.tfrecord_dataset_path,
                                               self.project_save_name)
     self.model_save_path = os.path.join(self.project_save_path,
                                         "best_weights.h5")
     self.validation_split = validation_split
     self.test_split = test_split
     self.batch_size = batch_size
     self.epochs = epochs
     file_util.mkdirs(self.project_save_path)
     file_util.mkdirs(self.tfrecord_dataset_path)