def __init__(self,
                 symbol,
                 data_names,
                 label_names,
                 logger=logging,
                 context=ctx.cpu(),
                 work_load_list=None,
                 asymbol=None,
                 args=None):
        super(ParallModule, self).__init__(logger=logger)
        self._symbol = symbol
        self._asymbol = asymbol
        self._data_names = data_names
        self._label_names = label_names
        self._context = context
        self._work_load_list = work_load_list
        self._num_classes = config.num_classes
        self._batch_size = args.batch_size
        self._verbose = args.verbose
        self._emb_size = config.emb_size
        self._local_class_start = args.local_class_start
        assert self._local_class_start == 0
        self._iter = 0

        self._backbone_module = None

        self._num_workers = config.num_workers
        self._num_ctx = len(self._context)
        self._ctx_num_classes = args.ctx_num_classes
        self._nd_cache = {}
        self._ctx_single_gpu = self._context[-1]
        self._fixed_param_names = None
        self._backbone_module = Module(
            self._symbol,
            self._data_names,
            self._label_names,
            logger=self.logger,
            context=self._context,
            work_load_list=self._work_load_list,
            fixed_param_names=self._fixed_param_names)
        self._arcface_modules = []
        self._ctx_class_start = []
        for i in range(len(self._context)):
            args._ctxid = i
            _module = Module(self._asymbol(args),
                             self._data_names,
                             self._label_names,
                             logger=self.logger,
                             context=self._context[i],
                             work_load_list=self._work_load_list,
                             fixed_param_names=self._fixed_param_names)
            self._arcface_modules.append(_module)
            _c = args.local_class_start + i * args.ctx_num_classes
            self._ctx_class_start.append(_c)
        self._usekv = False

        if self._usekv:
            self._distkv = mx.kvstore.create('dist_sync')
            self._kvinit = {}
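Note: a minimal sketch of the per-GPU class partitioning set up above, assuming hypothetical values for the number of contexts and for args.ctx_num_classes / args.local_class_start:

# Illustrative only: each context (GPU) owns a contiguous slice of the class range.
# The values below are hypothetical; in ParallModule they come from `args` and `config`.
num_ctx = 4                # e.g. 4 GPUs
ctx_num_classes = 1000     # classes handled by each GPU
local_class_start = 0      # asserted to be 0 in __init__

ctx_class_start = [local_class_start + i * ctx_num_classes for i in range(num_ctx)]
print(ctx_class_start)     # [0, 1000, 2000, 3000]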
Example 2
    def forward(self, data_batch, is_train=None):
        assert self.binded and self.params_initialized

        # get current_shapes
        if self._curr_module.label_shapes is not None:
            current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes)
        else:
            current_shapes = dict(self._curr_module.data_shapes)

        # get input_shapes
        if data_batch.provide_label is not None:
            input_shapes = dict(data_batch.provide_data + data_batch.provide_label)
        else:
            input_shapes = dict(data_batch.provide_data)

        # decide if shape changed
        shape_changed = False
        for k, v in current_shapes.items():
            if v != input_shapes[k]:
                shape_changed = True

        if shape_changed:
            module = Module(self._symbol, self._data_names, self._label_names,
                            logger=self.logger, context=self._context,
                            work_load_list=self._work_load_list,
                            fixed_param_names=self._fixed_param_names)
            module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training,
                        self._curr_module.inputs_need_grad, force_rebind=False,
                        shared_module=self._curr_module)
            self._curr_module = module

        self._curr_module.forward(data_batch, is_train=is_train)
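Note: the forward above rebinds the module whenever the incoming batch shapes differ from the currently bound shapes. A minimal standalone sketch of that shape-change test (the helper name and the shapes are hypothetical):

def shapes_changed(current_shapes, input_shapes):
    """Return True if any bound shape differs from the incoming batch shape."""
    return any(v != input_shapes[k] for k, v in current_shapes.items())

current = {'data': (32, 3, 112, 112), 'softmax_label': (32,)}
incoming = {'data': (64, 3, 112, 112), 'softmax_label': (64,)}
print(shapes_changed(current, incoming))   # True -> a new Module would be bound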
Example 3
    def forward(self, data_batch, is_train=None):
        assert self.binded and self.params_initialized

        # get current_shapes
        if self._curr_module.label_shapes is not None:
            print(self._curr_module.data_shapes)
            print(self._curr_module.label_shapes)
            print(data_batch.provide_data)
            print(data_batch.provide_label)
            current_shapes = [
                dict(self._curr_module.data_shapes[i] +
                     self._curr_module.label_shapes[i])
                for i in range(len(self._context))
            ]
        else:
            current_shapes = [
                dict(self._curr_module.data_shapes[i])
                for i in range(len(self._context))
            ]

        # get input_shapes
        if is_train:
            input_shapes = [
                dict(data_batch.provide_data[i] + data_batch.provide_label[i])
                for i in range(len(self._context))
            ]
        else:
            input_shapes = [
                dict(data_batch.provide_data[i])
                for i in range(len(data_batch.provide_data))
            ]

        # decide if shape changed
        shape_changed = len(current_shapes) != len(input_shapes)
        for pre, cur in zip(current_shapes, input_shapes):
            for k, v in pre.items():
                if v != cur[k]:
                    shape_changed = True

        if shape_changed:
            # self._curr_module.reshape(data_batch.provide_data, data_batch.provide_label)
            module = Module(self._symbol,
                            self._data_names,
                            self._label_names,
                            logger=self.logger,
                            context=[
                                self._context[i]
                                for i in range(len(data_batch.provide_data))
                            ],
                            work_load_list=self._work_load_list,
                            fixed_param_names=self._fixed_param_names)
            module.bind(data_batch.provide_data,
                        data_batch.provide_label,
                        self._curr_module.for_training,
                        self._curr_module.inputs_need_grad,
                        force_rebind=False,
                        shared_module=self._curr_module)
            self._curr_module = module

        self._curr_module.forward(data_batch, is_train=is_train)
Example 4
    def bind(self, data_shapes, label_shapes=None, for_training=True,
             inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req="write"):
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one wants to switch from
        # the training to the prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            self.logger.warning('Already bound, ignoring bind()')
            return

        assert shared_module is None, 'shared_module for KTModule is not supported'

        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        module = Module(self._symbol, self._data_names, self._label_names, logger=self.logger,
                        context=self._context, work_load_list=self._work_load_list)
        module.bind(self._data_shapes, self._label_shapes, for_training, inputs_need_grad,
                    force_rebind=False, shared_module=None)
        self._curr_module = module

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)
Example 5
    def bind(self,
             data_shapes,
             label_shapes=None,
             for_training=True,
             inputs_need_grad=False,
             force_rebind=False,
             shared_module=None):
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one wants to switch from
        # the training to the prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            self.logger.warning('Already bound, ignoring bind()')
            return

        assert shared_module is None, 'shared_module for MutableModule is not supported'

        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        max_shapes_dict = dict(self._max_data_shapes + self._max_label_shapes)
        max_data_shapes = list()
        for name, shape in data_shapes:
            if name in max_shapes_dict:
                max_data_shapes.append((name, max_shapes_dict[name]))
            else:
                max_data_shapes.append((name, shape))
        max_label_shapes = list()
        for name, shape in label_shapes:
            if name in max_shapes_dict:
                max_label_shapes.append((name, max_shapes_dict[name]))
            else:
                max_label_shapes.append((name, shape))

        module = Module(self._symbol,
                        self._data_names,
                        self._label_names,
                        logger=self.logger,
                        context=self._context,
                        work_load_list=self._work_load_list,
                        fixed_param_names=self._fixed_param_names)
        module.bind(max_data_shapes,
                    max_label_shapes,
                    for_training,
                    inputs_need_grad,
                    force_rebind=False,
                    shared_module=None)
        self._curr_module = module

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)
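Note: a small sketch of the max-shape lookup used in bind above; for each input name the preset maximum shape wins, otherwise the shape from data_shapes is kept. All names and shapes here are hypothetical:

max_shapes_dict = {'data': (64, 3, 640, 640)}
data_shapes = [('data', (32, 3, 600, 800)), ('im_info', (32, 3))]

max_data_shapes = [(name, max_shapes_dict.get(name, shape))
                   for name, shape in data_shapes]
print(max_data_shapes)   # [('data', (64, 3, 640, 640)), ('im_info', (32, 3))]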
Example 6
    def __init__(self, symbol, bn_symbol, batch_size, fc7_model, size,
                 rank, local_rank, memory_bank_list, memory_optimizer,
                 backbone_grad_rescale, memory_lr_scale_list,
                 embedding_size=512, head_num=1, logger=logging):
        # configure horovod
        self.memory_lr_scale_list = memory_lr_scale_list
        self.size = size
        self.rank = rank
        self.local_rank = local_rank
        self.gpu = mx.gpu(self.local_rank)
        self.cpu = mx.cpu()                                     # `device_id` is not needed for CPU.
        self.nd_cache = {}
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.num_update = 0

        self.batch_end_param = namedtuple(
            'batch_end_param',
            ['loss_list', 'num_epoch_list', 'epoch', 'num_update'])

        self.symbol = symbol
        # self.bn_symbol = bn_symbol
        #
        self.logger = logger
        self.backbone_module = Module(self.symbol, ['data'], ['softmax_label'],
                                      logger=self.logger, context=self.gpu)
        # self.bn_module       = Module(self.bn_symbol, ['data'], None, logger=self.logger, context=self.gpu)
        self.head_num = head_num
        self.memory_bank_list = memory_bank_list
        self.memory_optimizer = memory_optimizer
        self.memory_lr = None
        self.loss_cache = None
        self.grad_cache = None

        assert isinstance(self.memory_bank_list, list)

        # init
        self.fc7_model = fc7_model

        # fp16
        self.backbone_grad_rescale = backbone_grad_rescale

        self.binded = False
        self.for_training = False
        self.inputs_need_grad = False
        self.params_initialized = False
        self.optimizer_initialized = False
        self._total_exec_bytes = 0

        self.global_label = None
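Note: a minimal sketch of how the batch_end_param namedtuple defined above could be filled and read at the end of a batch; the field values are hypothetical:

from collections import namedtuple

batch_end_param = namedtuple(
    'batch_end_param',
    ['loss_list', 'num_epoch_list', 'epoch', 'num_update'])

param = batch_end_param(loss_list=[0.42], num_epoch_list=[1], epoch=0, num_update=100)
print(param.epoch, param.num_update, param.loss_list)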
Example 7
def train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch, imdb, batch_size, thread_num,
              net=12, with_cls=True, with_bbox=True, with_landmark=False, frequent=50, initialize=True, base_lr=0.01, lr_epoch=[6, 14]):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    train_data = ImageLoader(imdb, net, batch_size, thread_num, True, shuffle=True, ctx=ctx)

    if not initialize:
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print "init weights and bias:"
        data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
        init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
        args = dict()
        auxs = dict()
        print('hello3')

        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue

            #print 'init', k

            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            if k.startswith('fc'):
                args[k][:] /= 10

            '''
            if k.endswith('weight'):
                if k.startswith('conv'):
                    args[k] = mx.random.normal(loc=0, scale=0.001, shape=arg_shape_dict[k])
                else:
                    args[k] = mx.random.normal(loc=0, scale=0.01, shape=arg_shape_dict[k])
            else: # bias
                args[k] = mx.nd.zeros(shape=arg_shape_dict[k])
            '''

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            #print aux_shape_dict[k]
            init(k, auxs[k])

    lr_factor = 0.1
    image_num = len(imdb)
    
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * image_num / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix, period=10)
    eval_metrics = mx.metric.CompositeEvalMetric()
    eval_metrics.add(metric_human14.LANDMARK_MSE())
    eval_metrics.add(metric_human14.LANDMARK_L1())
    
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.00001,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0}

    mod = Module(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx)
    mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch)
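Note: a quick sketch of the learning-rate schedule arithmetic used in train_net above; image_num, batch_size and the epoch boundaries are hypothetical numbers:

base_lr = 0.01
lr_factor = 0.1
lr_epoch = [6, 14]
begin_epoch = 0
image_num = 100000
batch_size = 128

lr_epoch_diff = [e - begin_epoch for e in lr_epoch if e > begin_epoch]
lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
lr_iters = [int(e * image_num / batch_size) for e in lr_epoch_diff]
print(lr, lr_iters)   # 0.01 [4687, 10937]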
Example 8
    def bind(self,
             data_shapes,
             label_shapes=None,
             for_training=True,
             inputs_need_grad=False,
             force_rebind=False,
             shared_module=None):
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one wants to switch from
        # the training to the prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            self.logger.warning('Already bound, ignoring bind()')
            return

        assert shared_module is None, 'shared_module for MutableModule is not supported'

        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        max_shapes_dict = dict()
        if self._max_data_shapes is not None:
            # dict1.update(dict2) merges the key/value pairs of dict2 into dict1
            max_shapes_dict.update(dict(self._max_data_shapes))
        if self._max_label_shapes is not None:
            max_shapes_dict.update(dict(self._max_label_shapes))

        max_data_shapes = list()
        for name, shape in data_shapes:
            if name in max_shapes_dict:
                max_data_shapes.append((name, max_shapes_dict[name]))
            else:
                max_data_shapes.append((name, shape))

        max_label_shapes = list()
        if label_shapes is not None:
            for name, shape in label_shapes:
                if name in max_shapes_dict:
                    max_label_shapes.append((name, max_shapes_dict[name]))
                else:
                    max_label_shapes.append((name, shape))

        if len(max_label_shapes) == 0:
            max_label_shapes = None
        # reference: https://mxnet.incubator.apache.org/api/python/module/module.html#mxnet.module.Module
        # initialize the module
        module = Module(self._symbol,
                        self._data_names,
                        self._label_names,
                        logger=self.logger,
                        context=self._context,
                        work_load_list=self._work_load_list,
                        fixed_param_names=self._fixed_param_names)
        module.bind(max_data_shapes,
                    max_label_shapes,
                    for_training,
                    inputs_need_grad,
                    force_rebind=False,
                    shared_module=None)
        self._curr_module = module

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)
Example 9
def train_net(mode,
              sym,
              prefix,
              ctx,
              pretrained,
              epoch,
              begin_epoch,
              end_epoch,
              imdb,
              batch_size,
              thread_num,
              im_size,
              net=112,
              frequent=50,
              initialize=True,
              base_lr=0.01,
              lr_epoch=[6, 14]):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    train_data = ImageLoader(imdb,
                             net,
                             batch_size,
                             thread_num,
                             shuffle=True,
                             ctx=ctx)

    if not initialize:
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print "init weights and bias:"
        data_shape_dict = dict(train_data.provide_data +
                               train_data.provide_label)
        print(data_shape_dict)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        #print(arg_shape)
        #print(aux_shape)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
        init = mx.init.Xavier(factor_type="in",
                              rnd_type='gaussian',
                              magnitude=2)
        args = dict()
        auxs = dict()
        #print 'hello3'

        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue

            #print 'init', k

            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            if k.startswith('fc'):
                args[k][:] /= 10

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            #print aux_shape_dict[k]
            init(k, auxs[k])

    lr_factor = 0.1
    #lr_epoch = config.LR_EPOCH
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(imdb) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    batch_end_callback = mx.callback.Speedometer(train_data.batch_size,
                                                 frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    eval_metrics = mx.metric.CompositeEvalMetric()

    metric1 = metric.GenderAccuracy()
    metric2 = metric.GenderLogLoss()
    if mode == "gender_age":
        metric3 = metric.AGE_MAE()
        for child_metric in [metric1, metric2, metric3]:
            eval_metrics.add(child_metric)
    else:
        for child_metric in [metric1, metric2]:
            eval_metrics.add(child_metric)
    #eval_metrics = mx.metric.CompositeEvalMetric([metric.AccMetric(), metric.MAEMetric(), metric.CUMMetric()])
    optimizer_params = {
        'momentum': 0.9,
        'wd': 0.00001,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0
    }

    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx)
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=args,
            aux_params=auxs,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
Example 10
def train_net(sym,
              prefix,
              ctx,
              pretrained,
              epoch,
              begin_epoch,
              end_epoch,
              imdb,
              net=12,
              frequent=50,
              initialize=True,
              base_lr=0.01):

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)  # log to standard output

    # training data
    train_data = ImageLoader(imdb,
                             net,
                             config.BATCH_SIZE,
                             shuffle=True,
                             ctx=ctx)

    if not initialize:  # if not initializing, load the pretrained parameters
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print("init weights and bias:")
        data_shape_dict = dict(train_data.provide_data +
                               train_data.provide_label)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))

        # weight initialization with a Xavier initializer
        init = mx.init.Xavier(factor_type="in",
                              rnd_type='gaussian',
                              magnitude=2)
        args = dict()  # dict of model arguments / network weights
        auxs = dict()  # dict of auxiliary states

        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue

            print('init', k)

            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            if k.startswith('fc'):
                args[k][:] /= 10
            '''
            if k.endswith('weight'):
                if k.startswith('conv'):
                    args[k] = mx.random.normal(loc=0, scale=0.001, shape=arg_shape_dict[k])
                else:
                    args[k] = mx.random.normal(loc=0, scale=0.01, shape=arg_shape_dict[k])
            else: # bias
                args[k] = mx.nd.zeros(shape=arg_shape_dict[k])
            '''

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            init(k, auxs[k])

    lr_factor = 0.1
    lr_epoch = config.LR_EPOCH
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(imdb) / config.BATCH_SIZE) for epoch in lr_epoch_diff
    ]
    print('lr:{},lr_epoch:{},lr_epoch_diff:{}'.format(lr, lr_epoch,
                                                      lr_epoch_diff))
    # print('lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff)

    # MXNet dynamic learning rate: at each update count listed in lr_iters, the learning rate is multiplied by lr_factor
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    # report training speed and metrics every `frequent` batches
    batch_end_callback = mx.callback.Speedometer(train_data.batch_size,
                                                 frequent=frequent)
    # save a checkpoint of the trained model every `period` epochs
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    # set up the composite evaluation metric
    eval_metrics = mx.metric.CompositeEvalMetric()
    metric1 = metric.Accuracy()
    metric2 = metric.LogLoss()
    metric3 = metric.BBOX_MSE()
    # add each metric class via the add() method
    for child_metric in [metric1, metric2, metric3]:
        eval_metrics.add(child_metric)
    # optimizer-related parameters
    optimizer_params = {
        'momentum': 0.9,
        'wd': 0.00001,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': 5
    }
    # create a trainable module
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx)
    # train the model
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=args,
            aux_params=auxs,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
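Note: a self-contained sketch (requires MXNet) of the shape-inference step the examples above use to size the parameter arrays before initialization; the toy symbol and shapes are hypothetical:

import mxnet as mx

data = mx.sym.Variable('data')
fc1 = mx.sym.FullyConnected(data=data, num_hidden=10, name='fc1')
sym = mx.sym.SoftmaxOutput(data=fc1, name='softmax')

data_shape_dict = {'data': (32, 100), 'softmax_label': (32,)}
arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
print(arg_shape_dict)   # includes 'fc1_weight': (10, 100) and 'fc1_bias': (10,)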