예제 #1
0
    def prepare_model(self):
        """Load the pretrained network and replace its output layer.

        The Inception-BN checkpoint (epoch 126) is loaded, the graph is cut at
        the ``flatten`` layer, and a new fully-connected + softmax head is
        attached whose width is the class count returned by
        ``utils.get_sampleset_class_num()``. The module is bound to the shapes
        of ``self.train_data``, initialised with Xavier, overlaid with the
        pretrained weights and finally stored in ``self.mod``.
        """
        # AVA-SDK: class count of the dataset -> width of the new output layer.
        output_layer_num = utils.get_sampleset_class_num()

        sym, arg_params, aux_params = mx.model.load_checkpoint(
            prefix='/workspace/model/Inception-BN', epoch=126)

        # Keep the network up to the 'flatten' layer and attach a new head.
        layer_name = 'flatten'
        all_layers = sym.get_internals()
        net = all_layers[layer_name + '_output']
        net = mx.symbol.FullyConnected(
            data=net, num_hidden=output_layer_num, name='fc1_new')
        sym = mx.symbol.SoftmaxOutput(data=net, name='softmax')

        # One context per configured GPU, or a single CPU context otherwise.
        # ``range`` instead of ``xrange`` keeps this working on Python 2 and 3.
        gpu_count = self.solver_config.get('gpu_counts', 0)
        if gpu_count == 0:
            ctx = [mx.cpu()]
        else:
            ctx = [mx.gpu(i) for i in range(gpu_count)]
        mod = mx.mod.Module(symbol=sym, context=ctx)

        mod.bind(data_shapes=self.train_data.provide_data,
                 label_shapes=self.train_data.provide_label)

        # Default weight initialisation.
        mod.init_params(initializer=mx.init.Xavier(rnd_type='gaussian',
                                                   factor_type="in",
                                                   magnitude=2))

        # Drop every checkpoint parameter whose name contains 'fc1'
        # (presumably the checkpoint's original classifier layer - confirm),
        # so the freshly initialised 'fc1_new' head is not overwritten.
        arg_params = {
            name: value
            for name, value in arg_params.items()
            if 'fc1' not in name
        }

        if arg_params:
            logger.info("set pretrained weights")
            mod.set_params(arg_params, aux_params, allow_missing=True)

        self.mod = mod
예제 #2
0
    def prepare_model(self):
        """Load the network and replace its output layer.

        Loads the ``resnet-50`` checkpoint (epoch 0) from ``/workspace/model``,
        asks ``self.get_fine_tune_model`` for the fine-tuning symbol and
        argument parameters (with ``num_hidden`` fixed to 10), binds a module
        to the shapes of ``self.train_data``, loads the pretrained weights into
        it and stores the module in ``self.mod``.
        """
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            '/workspace/model/resnet-50', 0)
        num_hidden = 10
        new_sym, new_args = self.get_fine_tune_model(sym, arg_params,
                                                     num_hidden)

        # One context per configured GPU, or a single CPU context otherwise.
        # ``range`` instead of ``xrange`` keeps this working on Python 2 and 3.
        gpu_count = self.solver_config.get('gpu_counts', 0)
        if gpu_count == 0:
            ctx = [mx.cpu()]
        else:
            ctx = [mx.gpu(i) for i in range(gpu_count)]

        mod = mx.mod.Module(symbol=new_sym, context=ctx)
        mod.bind(data_shapes=self.train_data.provide_data,
                 label_shapes=self.train_data.provide_label)

        # allow_missing=True: parameters absent from ``new_args`` are not
        # treated as an error.
        logger.info("set pretrained weights")
        mod.set_params(new_args, aux_params, allow_missing=True)
        self.mod = mod
예제 #3
0
    def prepare_model(self):
        """Load the network and replace its output layer.

        Rewrites the resnet-50 symbol file so that its output layer matches the
        class count of the dataset, builds a module from the rewritten symbol,
        binds it to the shapes of ``self.train_data``, initialises it with
        Xavier and, when pretrained weights are returned by
        ``model_utils.load_model_params``, loads them. The module is stored in
        ``self.mod``.
        """
        # Only needed when the number of output classes has to change; the
        # model file locations are chosen by the user. When no change is
        # needed the model can be loaded directly with the training framework.
        # Replace the file names below with the model / params you need.
        model_dir = self.train_ins.get_model_base_path()
        origin_model_path = model_dir + "/resnet-50-symbol.json"
        weight_file_path = model_dir + "/resnet-50-0000.params"
        fixed_model_path = model_dir + "/fixed_resnet-50-symbol.json"

        # AVA-SDK: fetch the dataset class count and update the output layer.
        # NOTE(review): update_model_output_num presumably writes the adjusted
        # symbol to fixed_model_path and returns the old output layer's name -
        # confirm against model_utils.
        output_layer_num = utils.get_sampleset_class_num()
        old_output_layer_name = model_utils.update_model_output_num(
            origin_model_path, fixed_model_path, output_layer_num)

        sym = mx.symbol.load(fixed_model_path)

        # One context per configured GPU, or a single CPU context otherwise.
        # ``range`` instead of ``xrange`` keeps this working on Python 2 and 3.
        gpu_count = self.solver_config.get('gpu_counts', 0)
        if gpu_count == 0:
            ctx = [mx.cpu()]
        else:
            ctx = [mx.gpu(i) for i in range(gpu_count)]
        mod = mx.mod.Module(symbol=sym, context=ctx)

        mod.bind(data_shapes=self.train_data.provide_data,
                 label_shapes=self.train_data.provide_label)

        # Default weight initialisation.
        mod.init_params(initializer=mx.init.Xavier(
            rnd_type='gaussian', factor_type="in", magnitude=2))

        # AVA-SDK: read the weights for the replaced-output-layer scenario.
        arg_params, aux_params = model_utils.load_model_params(
            weight_file_path, old_output_layer_name)
        if arg_params:
            logger.info("set pretrained weights")
            mod.set_params(arg_params, aux_params, allow_missing=True)

        self.mod = mod
예제 #4
0
 def signal_handler(self, signum, stack):
     """Handle a bound signal: log it, run ``clean_up`` and exit the process.

     ``signum`` is the received signal number; ``stack`` is the frame object
     passed by the ``signal`` module and is not used here.
     """
     logger.info("received signal: %s, do clean_up", signum)
     self.clean_up()
     # sys.exit() without an argument raises SystemExit with no exit code.
     sys.exit()
예제 #5
0
    def start_new_training(self):
        """Run one full training session, then call ``clean_up``.

        Binds SIGINT/SIGTERM to ``self.signal_handler``, creates the AVA-SDK
        training instance, runs the ``prepare_*`` steps, merges the solver
        config into the train config and calls ``self.mod.fit``. Any
        ``Exception`` is logged and reported through ``clean_up(err_msg=...)``;
        on success ``err_msg`` is the empty string.
        """
        err_msg = ""
        try:
            # Bind the signals: receiving one means the user chose to stop the
            # training instance, which then counts as a normal termination.
            for signum in (signal.SIGINT, signal.SIGTERM):
                try:
                    signal.signal(signum, self.signal_handler)
                    logger.info("Bind signal '%s' success to %s", signum,
                                self.signal_handler)
                except Exception as bind_err:
                    logger.warning("Bind signal '%s' failed, err: %s", signum,
                                   bind_err)

            # AVA-SDK: create a training instance.
            self.train_ins = train.TrainInstance()
            training_ins_id = self.train_ins.get_training_ins_id()
            logger.info("start new tarining, training_ins_id: %s",
                        training_ins_id)

            # (log message, name of the method to run) in execution order.
            preparation_steps = (
                ("prepare_train_config", "prepare_train_config"),
                ("prepare_solver_config", "prepare_solver_config"),
                ("prepare_sampleset_config", "prepare_sampleset_data"),
                ("prepare_model", "prepare_model"),
            )
            for message, method_name in preparation_steps:
                logger.info(message)
                getattr(self, method_name)()

            # The solver config is merged into (and mutates) the train config.
            merged = self.train_config
            merged.update(self.solver_config)
            fit_args = dict(
                (key, merged.get(key)) for key in FIT_KWARGS_KEYS)
            fit_summary = "fit args: %s" % fit_args
            logger.info(fit_summary)
            self.mod.fit(self.train_data, eval_data=self.val_data, **fit_args)

            logger.info("training finish")
        except Exception as err:
            err_msg = "training failed, err: %s" % (err)
            logger.info(err_msg)
            traceback.print_exc(file=sys.stderr)

        self.clean_up(err_msg=err_msg)
예제 #6
0
def start_new_training():
    """Parse the CLI arguments, train the selected network and report the result.

    Binds SIGINT/SIGTERM to ``signal_handler``, builds the argument parser with
    the ``fit`` / ``data`` helpers, creates the AVA-SDK training instance,
    installs its batch-end monitor callback, loads the symbol from
    ``symbols.<network>`` and runs ``fit.fit``. The outcome is reported through
    ``train_ins.done(err_msg=...)``: an empty message on success, the failure
    text otherwise.
    """
    # Bound before the ``try`` so the ``except`` branch can always inspect it,
    # even when the failure happens before the instance is created (previously
    # that path raised UnboundLocalError and hid the real error).
    train_ins = None

    # binding signals
    SUPPORTED_SIGNALS = (
        signal.SIGINT,
        signal.SIGTERM,
    )
    for signum in SUPPORTED_SIGNALS:
        try:
            signal.signal(signum, signal_handler)
            logger.info("Bind signal '%s' success to %s", signum,
                        signal_handler)
        except Exception as identifier:
            logger.warning("Bind signal '%s' failed, err: %s", signum,
                           identifier)
    try:
        # parse args
        parser = argparse.ArgumentParser(
            description="train imagenet-1k",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        fit.add_fit_args(parser)
        data.add_data_args(parser)
        data.add_data_aug_args(parser)
        # use a large aug level
        data.set_data_aug_level(parser, 3)
        parser.set_defaults(
            # network
            network='resnet',
            num_layers=50,
            # data
            num_classes=10,
            num_examples=60000,
            image_shape='3,28,28',
            min_random_scale=1,  # if input image has min size k, suggest to use
            # 256.0/x, e.g. 0.533 for 480
            # train
            num_epochs=80,
            lr_step_epochs='30,60',
            dtype='float32',
            batch_size=32)
        args = parser.parse_args()

        # AVA-SDK: create the training instance
        train_ins = train.TrainInstance()
        # add CALLBACK
        batch_end_cb = train_ins.get_monitor_callback(
            "mxnet", batch_size=args.batch_size, batch_freq=10)
        args.batch_end_callback = batch_end_cb

        # load network
        from importlib import import_module
        net = import_module('symbols.' + args.network)
        sym = net.get_symbol(**vars(args))

        # train
        fit.fit(args, sym, data.get_rec_iter)

        logger.info("training finish")
        err_msg = ""
        if train_ins is None:
            return
        train_ins.done(err_msg=err_msg)
    except Exception as err:
        err_msg = "training failed, err: %s" % (err)
        logger.info(err_msg)
        traceback.print_exc(file=sys.stderr)

        # Nothing to report to when the instance was never created.
        if train_ins is None:
            return
        train_ins.done(err_msg=err_msg)
예제 #7
0
def signal_handler(signum, frame):
    """Handle a bound signal: clean up the training instance and exit.

    ``frame`` is the frame that was executing when the signal arrived, which
    is usually somewhere inside the training call stack rather than the
    function that owns ``train_ins``. The handler therefore walks outward
    through ``f_back`` until it finds a frame with a ``train_ins`` local,
    instead of assuming the interrupted frame has one (which raised KeyError).

    When no non-None ``train_ins`` is found, ``clean_up`` is skipped and a
    warning is logged. The process exits in either case.
    """
    import sys

    logger.info("received signal: %s, do clean_up", signum)

    train_ins = None
    current = frame
    while current is not None:
        frame_locals = current.f_locals
        if 'train_ins' in frame_locals:
            train_ins = frame_locals['train_ins']
            break
        current = current.f_back

    if train_ins is not None:
        clean_up(train_ins)
    else:
        logger.warning("no train_ins found on the call stack, skip clean_up")

    # sys.exit() rather than the ``site``-provided exit() helper, which is
    # meant for the interactive shell and is not guaranteed to exist.
    sys.exit()
예제 #8
0
def start_new_training():
    """Run a caffe training process under the AVA-SDK and report the result.

    Binds SIGINT/SIGTERM to ``signal_handler``, creates the training instance,
    rewrites the lenet solver so that snapshots go to the instance's snapshot
    path, starts ``caffe train`` on GPU 0, attaches the SDK's caffe monitor
    callback to the process output and waits for it to finish. A non-zero exit
    code is turned into an exception. The outcome is reported through
    ``train_ins.done(err_msg=...)``.
    """
    # Bound before the ``try`` so the ``except`` branch can always inspect it,
    # even when creating the instance itself fails (previously that path
    # raised UnboundLocalError and hid the real error).
    train_ins = None

    # Bind the termination signals.
    SUPPORTED_SIGNALS = (signal.SIGINT, signal.SIGTERM,)
    for signum in SUPPORTED_SIGNALS:
        try:
            signal.signal(signum, signal_handler)
            logger.info("Bind signal '%s' success to %s",
                        signum, signal_handler)
        except Exception as identifier:
            logger.warning(
                "Bind signal '%s' failed, err: %s", signum, identifier)

    try:
        # AVA-SDK training instance
        train_ins = train.TrainInstance()
        err_msg = ''

        # Load the solver definition and redirect its snapshots.
        solver_param = caffe_pb2.SolverParameter()
        with open('/workspace/model/lenet_solver.prototxt', 'r') as f:
            text_format.Merge(f.read(), solver_param)
        solver_param.snapshot_prefix = train_ins.get_snapshot_base_path()
        logger.info("saving to  %s", solver_param.snapshot_prefix)

        # Write the adjusted solver next to the instance's other files.
        fixed_solver = train_ins.get_base_path() + "/solver.prototxt"
        with open(fixed_solver, 'w') as f:
            f.write(str(solver_param))
        logger.info("write fixed solver to %s", fixed_solver)

        # AVA-SDK start caffe process
        training_cmd = ['caffe', 'train', '-solver', fixed_solver, '-gpu', '0']
        proc = cmd.startproc(training_cmd)
        logger.info("Started %s", proc)
        # AVA-SDK add caffe callback
        cmd.logproc(proc, [train_ins.get_monitor_callback("caffe")])
        exit_code = proc.wait()
        logger.info("Finished proc with code %s", exit_code)
        logger.info("Gracefully shutdown after 5s, wait cleaner ...")
        time.sleep(5)
        logger.info("Done.")
        if exit_code != 0:
            logger.error(
                "training exit code [%d] != 0, raise Exception", exit_code)
            # Handled by the ``except`` below so the failure gets reported.
            raise Exception("training exit code [%d] != 0" % (exit_code))
        train_ins.done(err_msg=err_msg)
    except Exception as err:
        err_msg = "training failed, err: %s" % (err)
        logger.info(err_msg)
        traceback.print_exc(file=sys.stderr)

        # Nothing to report to when the instance was never created.
        if train_ins is None:
            return
        train_ins.done(err_msg=err_msg)
예제 #9
0
    def prepare_train_config(self):
        """Build the training configuration and store it in ``self.train_config``.

        Reads the user-supplied training parameters through ``params``, derives
        the effective batch sizes from the available cores, creates the
        kvstore, and sets up the AVA-SDK batch-end / epoch-end callbacks. The
        resulting dict holds both data-iterator settings and the keyword
        arguments later passed to ``fit``.

        The validation batch size always equals the effective training batch
        size; intermediate values that were computed and then overwritten have
        been removed.
        """
        # AVA-SDK: read the training parameters. The ``params`` API offers:
        #   1) all settings at once:
        #        param_dict = params.get_all()
        #        value1 = param_dict["key1"]
        #   2) a single value:
        #        value1 = params.get_value("key1", default=1)
        snapshot_interval_epochs = params.get_value("snapshot_interval_epochs",
                                                    default=1)
        max_epochs = params.get_value("max_epochs", default=3)
        rand_crop = params.get_value("rand_crop", default=True)
        rand_mirror = params.get_value("rand_mirror", default=True)

        batch_size = params.get_value("batchSize", default=8)
        use_gpu, cores = utils.get_cores()
        logger.info("Cores GPU=%s, count=%d", use_gpu, cores)
        # With GPUs the configured batch size is multiplied by the core count.
        actual_batch_size = batch_size * cores if use_gpu else batch_size

        # Use the training batch size as the validation batch size.
        val_batch_size = actual_batch_size

        # Pick the crop size according to the model's input requirements.
        crop_w = params.get_value("crop_w")
        crop_h = params.get_value("crop_h")

        # Monitoring / snapshots. Per-rank snapshot prefixes (appending
        # "-<rank>" for rank > 0) are currently not applied.
        snapshot_prefix = self.train_ins.get_snapshot_base_path() + "/snapshot"
        kv_store = "device"
        kv = mx.kvstore.create(kv_store)

        batch_freq = 10  # batch granularity for printing / reporting metrics
        # Convert to float BEFORE dividing: under Python 2, int / int floors,
        # so the helper would receive an already truncated value and a partial
        # last batch would not be counted.
        batch_of_epoch = utils.ceil_by_level(
            float(utils.get_sampleset_num()) / actual_batch_size)

        # AVA-SDK: initialise the mxnet monitor callbacks.
        batch_end_cb = self.train_ins.get_monitor_callback(
            "mxnet", batch_size=actual_batch_size, batch_freq=batch_freq)
        epoch_end_cb = [
            # mxnet default epoch callback
            mx.callback.do_checkpoint(snapshot_prefix,
                                      snapshot_interval_epochs),
            self.train_ins.get_epoch_end_callback(
                "mxnet",
                batch_of_epoch=batch_of_epoch,
                epoch_interval=snapshot_interval_epochs,
                other_files=[])
        ]

        # Training parameters; users may adjust these.
        self.train_config = {
            "input_data_shape": (CROP_CHANNELS, crop_h, crop_w),
            "rand_crop": rand_crop,
            "rand_mirror": rand_mirror,
            "batch_size": batch_size,
            "actual_batch_size": actual_batch_size,
            "val_batch_size": val_batch_size,
            # fit_args
            # AVA-SDK: list of mxnet metrics
            "eval_metric": mxnet_monitor.full_mxnet_metrics(),
            "epoch_end_callback": epoch_end_cb,
            "batch_end_callback": batch_end_cb,
            "kvstore": kv,
            "num_epoch": max_epochs,
        }
예제 #10
0
def start_new_training():
    """Run the faster-rcnn training script under the AVA-SDK and report the result.

    Binds SIGINT/SIGTERM to ``signal_handler``, creates the training instance,
    starts ``detect_py_faster_rcnn.py`` on GPU 0 with the instance's snapshot
    and training-set cache paths, attaches the SDK's caffe monitor callback to
    the process output and waits for it to finish. A non-zero exit code is
    turned into an exception. The outcome is reported through
    ``train_ins.done(err_msg=...)``.
    """
    # Bound before the ``try`` so the ``except`` branch can always inspect it,
    # even when creating the instance itself fails (previously that path
    # raised UnboundLocalError and hid the real error).
    train_ins = None

    # Bind the termination signals.
    SUPPORTED_SIGNALS = (signal.SIGINT, signal.SIGTERM,)
    for signum in SUPPORTED_SIGNALS:
        try:
            signal.signal(signum, signal_handler)
            logger.info("Bind signal '%s' success to %s",
                        signum, signal_handler)
        except Exception as identifier:
            logger.warning(
                "Bind signal '%s' failed, err: %s", signum, identifier)

    try:
        # AVA-SDK training instance
        train_ins = train.TrainInstance()
        err_msg = ''

        # AVA-SDK start the training process
        out_dir = train_ins.get_snapshot_base_path()
        cache_dir = train_ins.get_trainset_base_path() + '/cache'
        roidb_path = cache_dir + "/gt_roidb.pkl"
        training_cmd = ['python', 'detect_py_faster_rcnn.py',
                        '--solver', 'vgg_solver.prototxt', '--gpu', '0',
                        '--output_path', out_dir,
                        '--ava_roidb_path', roidb_path,
                        '--train_base_path', cache_dir]
        proc = cmd.startproc(training_cmd)
        logger.info("Started %s", proc)
        # AVA-SDK add caffe callback
        cmd.logproc(proc, [train_ins.get_monitor_callback("caffe")])
        exit_code = proc.wait()
        logger.info("Finished proc with code %s", exit_code)
        logger.info("Gracefully shutdown after 5s, wait cleaner ...")
        time.sleep(5)
        logger.info("Done.")
        if exit_code != 0:
            logger.error(
                "training exit code [%d] != 0, raise Exception", exit_code)
            # Handled by the ``except`` below so the failure gets reported.
            raise Exception("training exit code [%d] != 0" % (exit_code))
        train_ins.done(err_msg=err_msg)
    except Exception as err:
        err_msg = "training failed, err: %s" % (err)
        logger.info(err_msg)
        traceback.print_exc(file=sys.stderr)

        # Nothing to report to when the instance was never created.
        if train_ins is None:
            return
        train_ins.done(err_msg=err_msg)