def __init__(self, symbol, data_names, label_names,
             logger=logging, context=ctx.cpu(), work_load_list=None,
             asymbol=None, args=None):
    super(ParallModule, self).__init__(logger=logger)
    self._symbol = symbol
    self._asymbol = asymbol
    self._data_names = data_names
    self._label_names = label_names
    self._context = context
    self._work_load_list = work_load_list

    self._num_classes = config.num_classes
    self._batch_size = args.batch_size
    self._verbose = args.verbose
    self._emb_size = config.emb_size
    self._local_class_start = args.local_class_start
    assert self._local_class_start == 0
    self._iter = 0

    self._num_workers = config.num_workers
    self._num_ctx = len(self._context)
    self._ctx_num_classes = args.ctx_num_classes
    self._nd_cache = {}
    self._ctx_single_gpu = self._context[-1]
    self._fixed_param_names = None

    # One backbone module spanning all contexts.
    self._backbone_module = Module(self._symbol, self._data_names, self._label_names,
                                   logger=self.logger, context=self._context,
                                   work_load_list=self._work_load_list,
                                   fixed_param_names=self._fixed_param_names)

    # One classifier (ArcFace) module per context; each owns a contiguous
    # slice of the class range starting at local_class_start + i * ctx_num_classes.
    self._arcface_modules = []
    self._ctx_class_start = []
    for i in range(len(self._context)):
        args._ctxid = i
        _module = Module(self._asymbol(args), self._data_names, self._label_names,
                         logger=self.logger, context=self._context[i],
                         work_load_list=self._work_load_list,
                         fixed_param_names=self._fixed_param_names)
        self._arcface_modules.append(_module)
        _c = args.local_class_start + i * args.ctx_num_classes
        self._ctx_class_start.append(_c)

    self._usekv = False
    if self._usekv:
        self._distkv = mx.kvstore.create('dist_sync')
        self._kvinit = {}

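# Hedged sketch (standalone, not from the source): how the per-context class
# starts computed above partition the softmax. With local_class_start == 0,
# context i owns classes [i * ctx_num_classes, (i + 1) * ctx_num_classes).
# The helper name is illustrative only.
def ctx_class_ranges(num_ctx, ctx_num_classes, local_class_start=0):
    """Return one (start, end) class range per context."""
    return [(local_class_start + i * ctx_num_classes,
             local_class_start + (i + 1) * ctx_num_classes)
            for i in range(num_ctx)]

# e.g. 4 GPUs with 25000 classes each cover a 100000-class softmax:
# ctx_class_ranges(4, 25000) -> [(0, 25000), (25000, 50000), (50000, 75000), (75000, 100000)]
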
def forward(self, data_batch, is_train=None):
    assert self.binded and self.params_initialized

    # Shapes the current module was bound with.
    if self._curr_module.label_shapes is not None:
        current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes)
    else:
        current_shapes = dict(self._curr_module.data_shapes)

    # Shapes of the incoming batch.
    if data_batch.provide_label is not None:
        input_shapes = dict(data_batch.provide_data + data_batch.provide_label)
    else:
        input_shapes = dict(data_batch.provide_data)

    # Decide whether any shape changed.
    shape_changed = False
    for k, v in current_shapes.items():
        if v != input_shapes[k]:
            shape_changed = True

    # Rebind with shared_module so the new executor reuses the old parameters.
    if shape_changed:
        module = Module(self._symbol, self._data_names, self._label_names,
                        logger=self.logger, context=self._context,
                        work_load_list=self._work_load_list,
                        fixed_param_names=self._fixed_param_names)
        module.bind(data_batch.provide_data, data_batch.provide_label,
                    self._curr_module.for_training,
                    self._curr_module.inputs_need_grad,
                    force_rebind=False, shared_module=self._curr_module)
        self._curr_module = module

    self._curr_module.forward(data_batch, is_train=is_train)

def forward(self, data_batch, is_train=None):
    assert self.binded and self.params_initialized

    # Shapes the current module was bound with, one dict per context.
    if self._curr_module.label_shapes is not None:
        print(self._curr_module.data_shapes)
        print(self._curr_module.label_shapes)
        print(data_batch.provide_data)
        print(data_batch.provide_label)
        current_shapes = [
            dict(self._curr_module.data_shapes[i] + self._curr_module.label_shapes[i])
            for i in range(len(self._context))
        ]
    else:
        current_shapes = [
            dict(self._curr_module.data_shapes[i])
            for i in range(len(self._context))
        ]

    # Shapes of the incoming batch, one dict per context.
    if is_train:
        input_shapes = [
            dict(data_batch.provide_data[i] + data_batch.provide_label[i])
            for i in range(len(self._context))
        ]
    else:
        input_shapes = [
            dict(data_batch.provide_data[i])
            for i in range(len(data_batch.provide_data))
        ]

    # Decide whether the number of contexts or any shape changed.
    shape_changed = len(current_shapes) != len(input_shapes)
    for pre, cur in zip(current_shapes, input_shapes):
        for k, v in pre.items():
            if v != cur[k]:
                shape_changed = True

    if shape_changed:
        module = Module(self._symbol, self._data_names, self._label_names,
                        logger=self.logger,
                        context=[self._context[i]
                                 for i in range(len(data_batch.provide_data))],
                        work_load_list=self._work_load_list,
                        fixed_param_names=self._fixed_param_names)
        module.bind(data_batch.provide_data, data_batch.provide_label,
                    self._curr_module.for_training,
                    self._curr_module.inputs_need_grad,
                    force_rebind=False, shared_module=self._curr_module)
        self._curr_module = module

    self._curr_module.forward(data_batch, is_train=is_train)

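# Hedged sketch (standalone, illustrative values): the rebind-on-shape-change
# test above, reduced to plain dicts. MXNet's provide_data entries behave like
# (name, shape) pairs; when a mismatch is found, a new Module is bound with
# shared_module=old so the rebound executor reuses the old parameter arrays
# instead of reallocating them.
bound = dict([('data', (64, 3, 112, 112)), ('softmax_label', (64,))])
incoming = dict([('data', (32, 3, 112, 112)), ('softmax_label', (32,))])
shape_changed = any(v != incoming[k] for k, v in bound.items())
assert shape_changed  # batch size shrank from 64 to 32, so we would rebind
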
def bind(self, data_shapes, label_shapes=None, for_training=True,
         inputs_need_grad=False, force_rebind=False, shared_module=None,
         grad_req="write"):
    # In case we already initialized params, keep them.
    if self.params_initialized:
        arg_params, aux_params = self.get_params()

    # Force rebinding is typically used when one wants to switch from the
    # training phase to the prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        self.logger.warning('Already bound, ignoring bind()')
        return

    assert shared_module is None, 'shared_module for KTModule is not supported'

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True

    module = Module(self._symbol, self._data_names, self._label_names,
                    logger=self.logger, context=self._context,
                    work_load_list=self._work_load_list)
    module.bind(data_shapes, label_shapes, for_training, inputs_need_grad,
                force_rebind=False, shared_module=None)
    self._curr_module = module

    # Copy back saved params, if already initialized.
    if self.params_initialized:
        self.set_params(arg_params, aux_params)

def bind(self, data_shapes, label_shapes=None, for_training=True,
         inputs_need_grad=False, force_rebind=False, shared_module=None):
    # In case we already initialized params, keep them.
    if self.params_initialized:
        arg_params, aux_params = self.get_params()

    # Force rebinding is typically used when one wants to switch from the
    # training phase to the prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        self.logger.warning('Already bound, ignoring bind()')
        return

    assert shared_module is None, 'shared_module for MutableModule is not supported'

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True

    # Substitute each shape with its registered maximum, so the module is
    # bound once with the largest buffers it will ever need.
    max_shapes_dict = dict(self._max_data_shapes + self._max_label_shapes)
    max_data_shapes = list()
    for name, shape in data_shapes:
        if name in max_shapes_dict:
            max_data_shapes.append((name, max_shapes_dict[name]))
        else:
            max_data_shapes.append((name, shape))

    max_label_shapes = list()
    for name, shape in label_shapes:
        if name in max_shapes_dict:
            max_label_shapes.append((name, max_shapes_dict[name]))
        else:
            max_label_shapes.append((name, shape))

    module = Module(self._symbol, self._data_names, self._label_names,
                    logger=self.logger, context=self._context,
                    work_load_list=self._work_load_list,
                    fixed_param_names=self._fixed_param_names)
    module.bind(max_data_shapes, max_label_shapes, for_training,
                inputs_need_grad, force_rebind=False, shared_module=None)
    self._curr_module = module

    # Copy back saved params, if already initialized.
    if self.params_initialized:
        self.set_params(arg_params, aux_params)

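# Hedged sketch (standalone, illustrative names and shapes, not from the
# source): the max-shape substitution that the two loops above perform,
# reduced to one helper.
def apply_max_shapes(shapes, max_shapes_dict):
    """Replace each (name, shape) pair with its registered maximum, if any."""
    return [(name, max_shapes_dict.get(name, shape)) for name, shape in shapes]

print(apply_max_shapes([('data', (1, 3, 224, 224)), ('im_info', (1, 3))],
                       {'data': (1, 3, 600, 1000)}))
# [('data', (1, 3, 600, 1000)), ('im_info', (1, 3))]
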
def __init__(self, symbol, bn_symbol, batch_size, fc7_model, size, rank,
             local_rank, memory_bank_list, memory_optimizer,
             backbone_grad_rescale, memory_lr_scale_list,
             embedding_size=512, head_num=1, logger=logging):
    # Horovod configuration: one training process per GPU, indexed by local_rank.
    self.memory_lr_scale_list = memory_lr_scale_list
    self.size = size
    self.rank = rank
    self.local_rank = local_rank
    self.gpu = mx.gpu(self.local_rank)
    self.cpu = mx.cpu()  # no device_id is needed for the CPU context
    self.nd_cache = {}
    self.embedding_size = embedding_size
    self.batch_size = batch_size
    self.num_update = 0

    self.batch_end_param = namedtuple(
        'batch_end_param',
        ['loss_list', 'num_epoch_list', 'epoch', 'num_update'])

    self.symbol = symbol
    # self.bn_symbol = bn_symbol
    self.logger = logger
    self.backbone_module = Module(self.symbol, ['data'], ['softmax_label'],
                                  logger=self.logger, context=self.gpu)
    # self.bn_module = Module(self.bn_symbol, ['data'], None,
    #                         logger=self.logger, context=self.gpu)
    self.head_num = head_num
    self.memory_bank_list = memory_bank_list
    self.memory_optimizer = memory_optimizer
    self.memory_lr = None
    self.loss_cache = None
    self.grad_cache = None
    assert isinstance(self.memory_bank_list, list)

    self.fc7_model = fc7_model

    # fp16: gradients flowing back into the backbone are rescaled.
    self.backbone_grad_rescale = backbone_grad_rescale

    self.binded = False
    self.for_training = False
    self.inputs_need_grad = False
    self.params_initialized = False
    self.optimizer_initialized = False
    self._total_exec_bytes = 0
    self.global_label = None

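# Hedged sketch (standalone, illustrative values): how the batch_end_param
# namedtuple defined above would be filled at the end of a batch and handed
# to a batch-end callback.
from collections import namedtuple

batch_end_param = namedtuple('batch_end_param',
                             ['loss_list', 'num_epoch_list', 'epoch', 'num_update'])
param = batch_end_param(loss_list=[0.42], num_epoch_list=[0], epoch=0, num_update=100)
print(param.loss_list, param.num_update)  # [0.42] 100
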
def train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch,
              imdb, batch_size, thread_num, net=12,
              with_cls=True, with_bbox=True, with_landmark=False,
              frequent=50, initialize=True, base_lr=0.01, lr_epoch=[6, 14]):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    train_data = ImageLoader(imdb, net, batch_size, thread_num, True,
                             shuffle=True, ctx=ctx)

    if not initialize:
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print("init weights and bias:")
        data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
        init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
        args = dict()
        auxs = dict()
        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue
            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            # Damp the initial scale of fully-connected layers.
            if k.startswith('fc'):
                args[k][:] /= 10

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            init(k, auxs[k])

    # Step the learning rate down by lr_factor at each epoch in lr_epoch
    # that lies after begin_epoch.
    lr_factor = 0.1
    image_num = len(imdb)
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * image_num / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix, period=10)

    eval_metrics = mx.metric.CompositeEvalMetric()
    eval_metrics.add(metric_human14.LANDMARK_MSE())
    eval_metrics.add(metric_human14.LANDMARK_L1())

    optimizer_params = {'momentum': 0.9,
                        'wd': 0.00001,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0}

    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=args, aux_params=auxs,
            begin_epoch=begin_epoch, num_epoch=end_epoch)

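# Hedged sketch (pure arithmetic, all values illustrative): the learning-rate
# schedule computed above, with base_lr=0.01, lr_epoch=[6, 14], begin_epoch=0,
# 160000 images, and batch_size=128.
base_lr, lr_factor, begin_epoch = 0.01, 0.1, 0
lr_epoch, image_num, batch_size = [6, 14], 160000, 128
lr_epoch_diff = [e - begin_epoch for e in lr_epoch if e > begin_epoch]  # [6, 14]
lr = base_lr * lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))        # 0.01
lr_iters = [int(e * image_num / batch_size) for e in lr_epoch_diff]     # [7500, 17500]
# MultiFactorScheduler then multiplies the rate by 0.1 at updates 7500 and 17500.
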
def bind(self, data_shapes, label_shapes=None, for_training=True,
         inputs_need_grad=False, force_rebind=False, shared_module=None):
    # In case we already initialized params, keep them.
    if self.params_initialized:
        arg_params, aux_params = self.get_params()

    # Force rebinding is typically used when one wants to switch from the
    # training phase to the prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        self.logger.warning('Already bound, ignoring bind()')
        return

    assert shared_module is None, 'shared_module for MutableModule is not supported'

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True

    max_shapes_dict = dict()
    if self._max_data_shapes is not None:
        # dict1.update(dict2) copies dict2's key/value pairs into dict1.
        max_shapes_dict.update(dict(self._max_data_shapes))
    if self._max_label_shapes is not None:
        max_shapes_dict.update(dict(self._max_label_shapes))

    max_data_shapes = list()
    for name, shape in data_shapes:
        if name in max_shapes_dict:
            max_data_shapes.append((name, max_shapes_dict[name]))
        else:
            max_data_shapes.append((name, shape))

    max_label_shapes = list()
    if label_shapes is not None:
        for name, shape in label_shapes:
            if name in max_shapes_dict:
                max_label_shapes.append((name, max_shapes_dict[name]))
            else:
                max_label_shapes.append((name, shape))
    if len(max_label_shapes) == 0:
        max_label_shapes = None

    # Reference: https://mxnet.incubator.apache.org/api/python/module/module.html#mxnet.module.Module
    # Initialize the module and bind it with the maximum shapes.
    module = Module(self._symbol, self._data_names, self._label_names,
                    logger=self.logger, context=self._context,
                    work_load_list=self._work_load_list,
                    fixed_param_names=self._fixed_param_names)
    module.bind(max_data_shapes, max_label_shapes, for_training,
                inputs_need_grad, force_rebind=False, shared_module=None)
    self._curr_module = module

    # Copy back saved params, if already initialized.
    if self.params_initialized:
        self.set_params(arg_params, aux_params)

def train_net(mode, sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch,
              imdb, batch_size, thread_num, im_size, net=112,
              frequent=50, initialize=True, base_lr=0.01, lr_epoch=[6, 14]):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    train_data = ImageLoader(imdb, net, batch_size, thread_num, shuffle=True, ctx=ctx)

    if not initialize:
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print("init weights and bias:")
        data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
        print(data_shape_dict)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
        init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
        args = dict()
        auxs = dict()
        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue
            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            # Damp the initial scale of fully-connected layers.
            if k.startswith('fc'):
                args[k][:] /= 10

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            init(k, auxs[k])

    lr_factor = 0.1
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(imdb) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)

    # Gender metrics are always reported; age MAE is added only in gender_age mode.
    eval_metrics = mx.metric.CompositeEvalMetric()
    metric1 = metric.GenderAccuracy()
    metric2 = metric.GenderLogLoss()
    if mode == "gender_age":
        metric3 = metric.AGE_MAE()
        for child_metric in [metric1, metric2, metric3]:
            eval_metrics.add(child_metric)
    else:
        for child_metric in [metric1, metric2]:
            eval_metrics.add(child_metric)

    optimizer_params = {'momentum': 0.9,
                        'wd': 0.00001,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0}

    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=args, aux_params=auxs,
            begin_epoch=begin_epoch, num_epoch=end_epoch)

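# Hedged sketch (standalone, illustrative): the mode-dependent metric
# composition above, with built-in MXNet metrics standing in for the custom
# gender/age metric classes from the source's `metric` module.
import mxnet as mx

mode = "gender_age"
eval_metrics = mx.metric.CompositeEvalMetric()
metrics = [mx.metric.Accuracy(), mx.metric.CrossEntropy()]
if mode == "gender_age":
    metrics.append(mx.metric.MAE())  # stand-in for the age regression error
for child_metric in metrics:
    eval_metrics.add(child_metric)
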
def train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch,
              imdb, net=12, frequent=50, initialize=True, base_lr=0.01):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)  # log to stdout

    # Training data loader.
    train_data = ImageLoader(imdb, net, config.BATCH_SIZE, shuffle=True, ctx=ctx)

    if not initialize:
        # Not initializing from scratch: load pretrained parameters.
        args, auxs = load_param(pretrained, epoch, convert=True)

    if initialize:
        print("init weights and bias:")
        data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
        arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
        aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
        # Xavier weight initializer.
        init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
        args = dict()  # network weights (arguments)
        auxs = dict()  # auxiliary states
        for k in sym.list_arguments():
            if k in data_shape_dict:
                continue
            print('init', k)
            args[k] = mx.nd.zeros(arg_shape_dict[k])
            init(k, args[k])
            # Damp the initial scale of fully-connected layers.
            if k.startswith('fc'):
                args[k][:] /= 10

        for k in sym.list_auxiliary_states():
            auxs[k] = mx.nd.zeros(aux_shape_dict[k])
            init(k, auxs[k])

    lr_factor = 0.1
    lr_epoch = config.LR_EPOCH
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(imdb) / config.BATCH_SIZE) for epoch in lr_epoch_diff]
    print('lr:{},lr_epoch:{},lr_epoch_diff:{}'.format(lr, lr_epoch, lr_epoch_diff))
    # Dynamic learning rate: after each milestone in lr_iters the learning
    # rate is multiplied by lr_factor.
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    # Report throughput every `frequent` batches.
    batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent)
    # Save a model checkpoint at the end of every epoch.
    epoch_end_callback = mx.callback.do_checkpoint(prefix)

    # Evaluation metrics, added one by one to the composite metric.
    eval_metrics = mx.metric.CompositeEvalMetric()
    metric1 = metric.Accuracy()
    metric2 = metric.LogLoss()
    metric3 = metric.BBOX_MSE()
    for child_metric in [metric1, metric2, metric3]:
        eval_metrics.add(child_metric)

    # Optimizer settings.
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.00001,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': 5}

    # Create a trainable module and fit it.
    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=args, aux_params=auxs,
            begin_epoch=begin_epoch, num_epoch=end_epoch)

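# Hedged sketch (standalone, illustrative shapes; uses MXNet's legacy
# string-name initializer call, which emits a deprecation warning on recent
# versions): the Xavier-then-damp pattern applied to 'fc*' weights above,
# shown on a toy parameter.
import mxnet as mx

init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
w = mx.nd.zeros((128, 256))
init('fc1_weight', w)  # fill in-place with Xavier-scaled gaussian noise
w[:] /= 10             # damp fully-connected weights by 10x, as in train_net
print(float(w.std()))
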