Ejemplo n.º 1
0
 def prepare_solver_config(self):
     """Assemble the optimizer settings and store them in ``self.solver_config``.

     Momentum, weight decay and learning rate are read from ``params`` with
     defaults of 0.9, 0.0005 and 0.01. A ``MultiFactorScheduler`` with factor
     0.1 is attached as the learning-rate scheduler. ``gpu_counts`` is the
     core count reported by ``utils.get_cores()`` when a GPU is in use, and
     0 otherwise.
     """
     use_gpu, cores = utils.get_cores()
     if use_gpu:
         gpu_counts = cores
     else:
         gpu_counts = 0

     # The batch sizes are not needed for the solver settings; the lookup is
     # kept so the sequence of helper calls stays the same.
     _batch, _actual_batch, _val_batch = utils.get_batch_size()

     # Step boundaries handed to the learning-rate scheduler.
     lr_decay_steps = [550, 1650, 2750, 4400]

     momentum = params.get_value("momentum", default=0.9)
     weight_decay = params.get_value("wd", default=0.0005)
     base_lr = params.get_value("learning_rate", default=0.01)
     scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_decay_steps,
                                                      factor=0.1)

     sgd_settings = {
         "momentum": momentum,
         "wd": weight_decay,
         "learning_rate": base_lr,
         "lr_scheduler": scheduler,
     }

     self.solver_config = {
         "gpu_counts": gpu_counts,
         # fit_args
         "optimizer": "SGD",
         "optimizer_params": sgd_settings,
     }
Ejemplo n.º 2
0
    def prepare_train_config(self):
        """Collect the training settings and store them in ``self.train_config``.

        Settings are read through the AVA-SDK ``params`` helper, which can be
        used in two ways::

            all_params = params.get_all()                 # every setting
            value1 = all_params["key1"]
            value1 = params.get_value("key1", default=1)  # a single setting

        The validation batch size is set to the actual training batch size,
        and a fixed 16x16 crop is used rather than ``utils.get_crop_size()``.
        """
        checkpoint_every = params.get_value(
            "intervals.snapshotIntervalEpochs", default=1)
        epoch_limit = params.get_value("stopCondition.maxEpochs", default=3)
        use_rand_crop = params.get_value(
            "inputTransform.randomCrop", default=True)
        use_rand_mirror = params.get_value(
            "inputTransform.randomMirror", default=True)

        # The third value (validation batch size) is ignored on purpose:
        # validation uses the same batch size as training.
        batch_size, actual_batch_size, _ = utils.get_batch_size()
        val_batch_size = actual_batch_size

        # Use a small default crop size.
        crop_h = crop_w = 16

        # Monitoring and checkpointing.
        snapshot_dir = self.train_ins.get_snapshot_base_path()
        snapshot_prefix = snapshot_dir + "/snapshot"
        kv = mx.kvstore.create("device")
        # Disabled: per-worker snapshot prefix.
        #   rank = int(kv.rank)
        #   if rank > 0:
        #       snapshot_prefix += "-%s" % rank

        # Batch granularity for printing / reporting metrics.
        report_every = 10

        # AVA-SDK mxnet monitor callback.
        batch_end_cb = self.train_ins.get_monitor_callback(
            "mxnet", batch_size=actual_batch_size, batch_freq=report_every)

        # Default mxnet checkpoint callback, followed by the AVA-SDK
        # epoch-end callback.
        checkpoint_cb = mx.callback.do_checkpoint(snapshot_prefix,
                                                  checkpoint_every)
        sdk_epoch_cb = self.train_ins.get_epoch_end_callback(
            "mxnet", epoch_interval=checkpoint_every, other_files=[])
        epoch_end_cb = [checkpoint_cb, sdk_epoch_cb]

        # Training settings; users may adjust these as needed.
        self.train_config = {
            "input_data_shape": (CROP_CHANNELS, crop_h, crop_w),
            "rand_crop": use_rand_crop,
            "rand_mirror": use_rand_mirror,
            "batch_size": batch_size,
            "actual_batch_size": actual_batch_size,
            "val_batch_size": val_batch_size,
            # fit_args
            # AVA-SDK list of mxnet metrics.
            "eval_metric": mxnet_monitor.full_mxnet_metrics(),
            "epoch_end_callback": epoch_end_cb,
            "batch_end_callback": batch_end_cb,
            "kvstore": kv,
            "num_epoch": epoch_limit,
        }
Ejemplo n.º 3
0
    def prepare_train_config(self):
        """Collect the training settings and store them in ``self.train_config``.

        Settings are read through the AVA-SDK ``params`` helper, which can be
        used in two ways::

            param_dict = params.get_all()                 # every setting
            value1 = param_dict["key1"]
            value1 = params.get_value("key1", default=1)  # a single setting

        The per-device batch size comes from the ``batchSize`` parameter
        (default 8) and is multiplied by the core count when a GPU is used.
        The validation batch size always equals that actual training batch
        size. ``crop_w`` and ``crop_h`` are read without a default.
        """
        snapshot_interval_epochs = params.get_value("snapshot_interval_epochs",
                                                    default=1)
        max_epochs = params.get_value("max_epochs", default=3)
        rand_crop = params.get_value("rand_crop", default=True)
        rand_mirror = params.get_value("rand_mirror", default=True)

        # NOTE(review): every value returned here is superseded below; the
        # call is kept only in case the helper has side effects. Confirm and
        # remove if it is a pure getter.
        utils.get_batch_size()

        batch_size = params.get_value("batchSize", default=8)
        use_gpu, cores = utils.get_cores()
        logger.info("Cores GPU=%s, count=%d", use_gpu, cores)
        actual_batch_size = batch_size * cores if use_gpu else batch_size

        # Validation uses the training batch size.
        # NOTE(review): the "valBatchSize" parameter was previously read and
        # then unconditionally overwritten, so it has never had any effect;
        # the dead lookup has been dropped.
        val_batch_size = actual_batch_size

        # Choose the crop size according to the model's input requirements.
        # NOTE(review): no default is supplied, so the result when these keys
        # are missing depends on params.get_value -- confirm.
        crop_w = params.get_value("crop_w")
        crop_h = params.get_value("crop_h")

        # Monitoring and checkpointing.
        snapshot_prefix = self.train_ins.get_snapshot_base_path() + "/snapshot"
        kv = mx.kvstore.create("device")

        batch_freq = 10  # batch granularity for printing / reporting metrics

        # Convert to float BEFORE dividing. With integer operands under
        # Python 2, `float(a / b)` truncates first, so a partial final batch
        # would be lost before utils.ceil_by_level (presumably a round-up
        # helper -- confirm) ever sees the fraction.
        batch_of_epoch = utils.ceil_by_level(
            float(utils.get_sampleset_num()) / actual_batch_size)

        # AVA-SDK mxnet monitor callback.
        batch_end_cb = self.train_ins.get_monitor_callback(
            "mxnet", batch_size=actual_batch_size, batch_freq=batch_freq)
        epoch_end_cb = [
            # mxnet default epoch callback
            mx.callback.do_checkpoint(snapshot_prefix,
                                      snapshot_interval_epochs),
            self.train_ins.get_epoch_end_callback(
                "mxnet",
                batch_of_epoch=batch_of_epoch,
                epoch_interval=snapshot_interval_epochs,
                other_files=[]),
        ]

        # Training settings; users may adjust these as needed.
        self.train_config = {
            "input_data_shape": (CROP_CHANNELS, crop_h, crop_w),
            "rand_crop": rand_crop,
            "rand_mirror": rand_mirror,
            "batch_size": batch_size,
            "actual_batch_size": actual_batch_size,
            "val_batch_size": val_batch_size,
            # fit_args
            # AVA-SDK list of mxnet metrics.
            "eval_metric": mxnet_monitor.full_mxnet_metrics(),
            "epoch_end_callback": epoch_end_cb,
            "batch_end_callback": batch_end_cb,
            "kvstore": kv,
            "num_epoch": max_epochs,
        }