def train_net(net, train_path, num_classes, batch_size,
              data_shape, mean_pixels, resume, finetune, pretrained, epoch,
              prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate,
              momentum, weight_decay, lr_refactor_step, lr_refactor_ratio,
              freeze_layer_pattern='', shape_range=(320, 512),
              random_shape_step=0, random_shape_epoch=10,
              num_example=10000, label_pad_width=350,
              nms_thresh=0.45, force_nms=False, ovp_thresh=0.5,
              use_difficult=False, class_names=None,
              voc07_metric=False, nms_topk=400, force_suppress=False,
              train_list="", val_path="", val_list="", iter_monitor=0,
              monitor_pattern=".*", log_file=None):
    """
    Wrapper for training phase.

    Training runs as one or more consecutive ``Module.fit`` segments. With
    ``random_shape_step`` > 0 the epoch range is cut into chunks of
    ``random_shape_epoch`` epochs and a square input shape is drawn at random
    for each chunk; otherwise a single segment with ``data_shape`` is run.
    Parameters obtained from one segment are fed into the next one.

    Parameters:
    ----------
    net : str
        symbol name for the network structure; the module ``symbol_<net>``
        is imported from ``cfg.ROOT_DIR/symbol``
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_pixels : tuple of floats
        mean pixel values for red, green and blue; a single number is
        replicated to all three channels
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune from previous checkpoint if > 0
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of the pretrained model
    prefix : str
        prefix for saving checkpoints; the network name (without a trailing
        ``_yolo``) and the data height are appended to it
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum
    weight_decay : float
        training weight decay param
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90'
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed
    shape_range : tuple of (min, max)
        random data shape range
    random_shape_step : int
        step size for random data shape, defined by network, 0 to disable
    random_shape_epoch : int
        number of epoch before next random shape
    num_example : int
        number of training images
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        accepted for interface compatibility; not referenced in this function
        (``force_suppress`` is the flag forwarded to the symbol)
    ovp_thresh : float
        overlap threshold handed to the validation mAP metric
    use_difficult : boolean
        forwarded to the validation mAP metric
    class_names : list or None
        forwarded to the validation mAP metric
    voc07_metric : boolean
        use ``VOC07MApMetric`` instead of ``MApMetric`` for validation
    nms_topk : int
        forwarded to ``get_symbol``
    force_suppress : boolean
        forwarded to ``get_symbol``
    train_list : str
        list file path for training, this will replace the embeded labels in record
    val_path : str
        record file path for validation
    val_list : str
        list file path for validation, this will replace the embeded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log to file if enabled
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    # str.strip('_yolo') would strip any of the characters '_', 'y', 'o', 'l'
    # from BOTH ends of the name; only the literal '_yolo' suffix must go.
    yolo_suffix = '_yolo'
    net_tag = net[:-len(yolo_suffix)] if net.endswith(yolo_suffix) else net
    prefix += '_' + net_tag + '_' + str(data_shape[1])

    if isinstance(mean_pixels, (int, float)):
        mean_pixels = [mean_pixels, mean_pixels, mean_pixels]
    assert len(mean_pixels) == 3, "must provide all RGB mean values"

    # load symbol
    sys.path.append(os.path.join(cfg.ROOT_DIR, 'symbol'))
    symbol_module = importlib.import_module("symbol_" + net)
    net = symbol_module.get_symbol(num_classes, nms_thresh=nms_thresh,
                                   force_suppress=force_suppress, nms_topk=nms_topk)

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [name for name in net.list_arguments()
                             if re_prog.match(name)]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    allow_missing = True
    if resume > 0:
        logger.info("Resume training with {} from epoch {}".format(
            ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
        allow_missing = False
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}".format(
            ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(prefix, finetune)
        begin_epoch = finetune
        # the prediction convolution layers name starts with relu, so it's fine
        fixed_param_names = [name for name in net.list_arguments()
                             if name.startswith('conv')]
    elif pretrained:
        logger.info("Start training with {} from pretrained model {}".format(
            ctx_str, pretrained))
        _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
        args = convert_pretrained(pretrained, args)
    else:
        logger.info("Experimental: start training from scratch with {}".format(
            ctx_str))
        args = None
        auxs = None
        # training from scratch discards any freeze pattern given above
        fixed_param_names = None

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # fit parameters
    batch_end_callback = mx.callback.Speedometer(batch_size, frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None

    # run fit net, every n epochs we run evaluation network to get mAP
    if voc07_metric:
        valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names,
                                      pred_idx=0)
    else:
        valid_metric = MApMetric(ovp_thresh, use_difficult, class_names,
                                 pred_idx=0)

    # init training module
    mod = mx.mod.Module(net, label_names=('yolo_output_label', ), logger=logger,
                        context=ctx, fixed_param_names=fixed_param_names)

    # split [begin_epoch, end_epoch) into fit segments and list candidate shapes
    random_shape_step = int(random_shape_step)
    if random_shape_step > 0:
        fit_begins = list(range(begin_epoch, end_epoch, random_shape_epoch))
        fit_ends = fit_begins[1:] + [end_epoch]
        assert (len(shape_range) == 2)
        lowest = shape_range[0] // random_shape_step
        highest = shape_range[1] // random_shape_step
        data_shapes = [(3, x * random_shape_step, x * random_shape_step)
                       for x in range(lowest, highest + 1)]
        logger.info("Candidate random shapes:" + str(data_shapes))
    else:
        fit_begins = [begin_epoch]
        fit_ends = [end_epoch]
        data_shapes = [data_shape]

    for begin, end in zip(fit_begins, fit_ends):
        if len(data_shapes) == 1:
            data_shape = data_shapes[0]
        else:
            data_shape = random.choice(data_shapes)
            logger.info("Setting random data shape: " + str(data_shape))

        # iterators are rebuilt per segment because the input shape may change
        train_iter = DetRecordIter(train_path, batch_size, data_shape,
                                   mean_pixels=mean_pixels,
                                   label_pad_width=label_pad_width,
                                   path_imglist=train_list, **cfg.train)

        if val_path:
            val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                     mean_pixels=mean_pixels,
                                     label_pad_width=label_pad_width,
                                     path_imglist=val_list, **cfg.valid)
        else:
            val_iter = None

        # NOTE(review): this is re-evaluated every segment with the overall
        # begin_epoch and the learning_rate returned by the previous call --
        # confirm this matches how get_lr_scheduler / Module.fit are meant to
        # interact across segments.
        learning_rate, lr_scheduler = get_lr_scheduler(
            learning_rate, lr_refactor_step, lr_refactor_ratio,
            num_example, batch_size, begin_epoch)
        optimizer_params = {
            'learning_rate': learning_rate,
            'momentum': momentum,
            'wd': weight_decay,
            'lr_scheduler': lr_scheduler,
            'clip_gradient': 10,
            'rescale_grad': 1.0
        }

        mod.fit(train_iter,
                val_iter,
                eval_metric=MultiBoxMetric(),
                validation_metric=valid_metric,
                batch_end_callback=batch_end_callback,
                epoch_end_callback=epoch_end_callback,
                optimizer='sgd',
                optimizer_params=optimizer_params,
                begin_epoch=begin,
                num_epoch=end,
                initializer=mx.init.Xavier(),
                arg_params=args,
                aux_params=auxs,
                allow_missing=allow_missing,
                monitor=monitor,
                force_rebind=True,
                force_init=True)

        # carry the trained weights into the next segment; from now on every
        # parameter must be present
        args, auxs = mod.get_params()
        allow_missing = False
def train_net(net, train_path, num_classes, batch_size,
              data_shape, mean_pixels, resume, finetune, pretrained, epoch,
              prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate,
              momentum, weight_decay, lr_refactor_step, lr_refactor_ratio,
              freeze_layer_pattern='',
              num_example=10000, label_pad_width=350,
              nms_thresh=0.45, force_nms=False, ovp_thresh=0.5,
              use_difficult=False, class_names=None,
              voc07_metric=False, nms_topk=400, force_suppress=False,
              train_list="", val_path="", val_list="", iter_monitor=0,
              monitor_pattern=".*", log_file=None):
    """
    Wrapper for training phase.

    Builds the data iterators and the training symbol, loads initial
    parameters according to ``resume`` / ``finetune`` / ``pretrained`` and
    runs a single ``Module.fit`` from ``begin_epoch`` to ``end_epoch``.

    Parameters:
    ----------
    net : str
        symbol name for the network structure
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_pixels : tuple of floats
        mean pixel values for red, green and blue; a single number is
        replicated to all three channels
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune from previous checkpoint if > 0
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of the pretrained model
    prefix : str
        prefix for saving checkpoints; when it ends with '_' the string
        '_<height>' is appended
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum
    weight_decay : float
        training weight decay param
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90'
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed
    num_example : int
        number of training images
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        accepted for interface compatibility; not referenced in this function
        (``force_suppress`` is the flag forwarded to the symbol)
    ovp_thresh : float
        overlap threshold handed to the validation mAP metric
    use_difficult : boolean
        forwarded to the validation mAP metric
    class_names : list or None
        forwarded to the validation mAP metric
    voc07_metric : boolean
        use ``VOC07MApMetric`` instead of ``MApMetric`` for validation
    nms_topk : int
        forwarded to ``get_symbol_train``
    force_suppress : boolean
        forwarded to ``get_symbol_train``
    train_list : str
        list file path for training, this will replace the embeded labels in record
    val_path : str
        record file path for validation
    val_list : str
        list file path for validation, this will replace the embeded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log to file if enabled
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    if prefix.endswith('_'):
        prefix += '_' + str(data_shape[1])

    if isinstance(mean_pixels, (int, float)):
        mean_pixels = [mean_pixels, mean_pixels, mean_pixels]
    assert len(mean_pixels) == 3, "must provide all RGB mean values"

    train_iter = DetRecordIter(train_path, batch_size, data_shape,
                               mean_pixels=mean_pixels,
                               label_pad_width=label_pad_width,
                               path_imglist=train_list, **cfg.train)

    if val_path:
        val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                 mean_pixels=mean_pixels,
                                 label_pad_width=label_pad_width,
                                 path_imglist=val_list, **cfg.valid)
    else:
        val_iter = None

    # load symbol
    net = get_symbol_train(net, data_shape[1], num_classes=num_classes,
                           nms_thresh=nms_thresh, force_suppress=force_suppress,
                           nms_topk=nms_topk)

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [name for name in net.list_arguments()
                             if re_prog.match(name)]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    if resume > 0:
        logger.info("Resume training with {} from epoch {}"
                    .format(ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}"
                    .format(ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(prefix, finetune)
        begin_epoch = finetune
        # check what layers mismatch with the loaded parameters; bind with the
        # configured input shape (not a fixed 300x300) so the expected
        # argument shapes are those the training module will actually use
        exe = net.simple_bind(mx.cpu(), data=(1,) + tuple(data_shape),
                              label=(1, 1, 5), grad_req='null')
        arg_dict = exe.arg_dict
        # finetuning replaces any freeze pattern: every loaded parameter whose
        # shape matches is frozen, except those with 'pred' in their name
        fixed_param_names = []
        for k, v in arg_dict.items():
            if k not in args:
                continue
            if v.shape != args[k].shape:
                del args[k]
                logger.info("Removed %s", k)
            elif 'pred' not in k:
                fixed_param_names.append(k)
    elif pretrained:
        logger.info("Start training with {} from pretrained model {}"
                    .format(ctx_str, pretrained))
        _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
        args = convert_pretrained(pretrained, args)
    else:
        logger.info("Experimental: start training from scratch with {}"
                    .format(ctx_str))
        args = None
        auxs = None
        # training from scratch discards any freeze pattern given above
        fixed_param_names = None

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # init training module
    mod = mx.mod.Module(net, label_names=('label',), logger=logger, context=ctx,
                        fixed_param_names=fixed_param_names)

    # fit parameters
    batch_end_callback = mx.callback.Speedometer(train_iter.batch_size,
                                                 frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    learning_rate, lr_scheduler = get_lr_scheduler(
        learning_rate, lr_refactor_step, lr_refactor_ratio,
        num_example, batch_size, begin_epoch)
    optimizer_params = {
        'learning_rate': learning_rate,
        'momentum': momentum,
        'wd': weight_decay,
        'lr_scheduler': lr_scheduler,
        'clip_gradient': None,
        # guard against an empty context list
        'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0
    }
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None

    # run fit net, every n epochs we run evaluation network to get mAP
    if voc07_metric:
        valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names,
                                      pred_idx=3)
    else:
        valid_metric = MApMetric(ovp_thresh, use_difficult, class_names,
                                 pred_idx=3)

    mod.fit(train_iter,
            val_iter,
            eval_metric=MultiBoxMetric(),
            validation_metric=valid_metric,
            batch_end_callback=batch_end_callback,
            epoch_end_callback=epoch_end_callback,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            initializer=mx.init.Xavier(),
            arg_params=args,
            aux_params=auxs,
            allow_missing=True,
            monitor=monitor)
def _ensure_dir(path):
    """Create directory ``path`` (with parents) unless it is empty or exists."""
    if path and not os.path.isdir(path):
        os.makedirs(path)


def _count_idx_entries(rec_path):
    """Return the line count of the ``.idx`` file beside ``rec_path``, or None."""
    root, ext = os.path.splitext(rec_path)
    if ext != '.rec':
        return None
    idx_path = root + '.idx'
    if not os.path.isfile(idx_path):
        return None
    with open(idx_path, 'r') as f:
        return sum(1 for _ in f)


def train_net(network, train_path, num_classes, batch_size,
              data_shape, mean_pixels, resume, finetune, pretrained, epoch,
              prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate,
              momentum, weight_decay, lr_refactor_step, lr_refactor_ratio,
              freeze_layer_pattern='',
              num_example=10000, label_pad_width=350,
              nms_thresh=0.45, force_nms=False, ovp_thresh=0.5,
              use_difficult=False, class_names=None,
              voc07_metric=False, nms_topk=400, force_suppress=False,
              train_list="", val_path="", val_list="", iter_monitor=0,
              monitor_pattern=".*", log_file=None, optimizer='sgd',
              tensorboard=False, checkpoint_period=5, min_neg_samples=0):
    """
    Wrapper for training phase.

    Builds the data iterators and the training symbol, loads initial
    parameters according to ``resume`` / ``finetune`` / ``pretrained``,
    optionally wires tensorboard callbacks and runs a single ``Module.fit``
    from ``begin_epoch`` to ``end_epoch``.

    Parameters:
    ----------
    network : str
        symbol name for the network structure
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_pixels : tuple of floats
        mean pixel values for red, green and blue; a single number is
        replicated to all three channels
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune if > 0; parameters are loaded from ``pretrained`` at epoch
        ``finetune``
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of the pretrained model
    prefix : str
        prefix for saving checkpoints; its directory also receives the log
        file, tensorboard logs, roc output and network visualizations
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum
    weight_decay : float
        training weight decay param
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90'
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed
    num_example : int
        number of training images; overridden by the line count of the
        ``.idx`` file next to ``train_path`` when that file exists and is
        not empty
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        accepted for interface compatibility; not referenced in this function
        (``force_suppress`` is the flag forwarded to the symbol)
    ovp_thresh : float
        overlap threshold handed to the validation mAP metric
    use_difficult : boolean
        forwarded to the validation mAP metric
    class_names : list or None
        forwarded to the validation mAP metric and tensorboard callbacks
    voc07_metric : boolean
        use ``VOC07MApMetric`` instead of ``MApMetric`` for validation
    nms_topk : int
        forwarded to ``get_symbol_train``
    force_suppress : boolean
        forwarded to ``get_symbol_train``
    train_list : str
        list file path for training, this will replace the embeded labels in record
    val_path : str
        record file path for validation
    val_list : str
        list file path for validation, this will replace the embeded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log file name, created inside the directory of ``prefix``; required
        when ``tensorboard`` is enabled
    optimizer : str
        usage of different optimizers, other than default sgd
    tensorboard : bool
        record logs into tensorboard
    checkpoint_period : int
        a checkpoint will be saved every "checkpoint_period" epochs
    min_neg_samples : int
        always have some negative examples, no matter how many positive there are.
        this is useful when training on images with no ground-truth.

    Raises:
    ----------
    ValueError
        if ``tensorboard`` is enabled without ``log_file``
    """
    # the tensorboard log parser reads the training log file, so fail early
    # with a clear message instead of an unbound-variable error later on
    if tensorboard and not log_file:
        raise ValueError("tensorboard logging requires log_file to be set")

    # check actual number of train_images; only the '.rec' extension is
    # swapped for '.idx' so other occurrences of 'rec' in the path are kept
    idx_count = _count_idx_entries(train_path)
    if idx_count:
        num_example = idx_count

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = None
    if log_file:
        log_file_path = os.path.join(os.path.dirname(prefix), log_file)
        # dirname may be '' when prefix has no directory component
        _ensure_dir(os.path.dirname(log_file_path))
        fh = logging.FileHandler(log_file_path)
        logger.addHandler(fh)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    if prefix.endswith('_'):
        prefix += '_' + str(data_shape[1])

    if isinstance(mean_pixels, (int, float)):
        mean_pixels = [mean_pixels, mean_pixels, mean_pixels]
    assert len(mean_pixels) == 3, "must provide all RGB mean values"

    train_iter = DetRecordIter(train_path, batch_size, data_shape,
                               mean_pixels=mean_pixels,
                               label_pad_width=label_pad_width,
                               path_imglist=train_list, **cfg.train)

    if val_path:
        val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                 mean_pixels=mean_pixels,
                                 label_pad_width=label_pad_width,
                                 path_imglist=val_list, **cfg.valid)
    else:
        val_iter = None

    # load symbol
    net = get_symbol_train(network, data_shape[1], num_classes=num_classes,
                           nms_thresh=nms_thresh, force_suppress=force_suppress,
                           nms_topk=nms_topk,
                           minimum_negative_samples=min_neg_samples)

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [name for name in net.list_arguments()
                             if re_prog.match(name)]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    if resume > 0:
        logger.info("Resume training with {} from epoch {}".format(
            ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}".format(
            ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(pretrained, finetune)
        begin_epoch = finetune
        # check what layers mismatch with the loaded parameters; bind with the
        # configured input shape (not a fixed 300x300) so the expected
        # argument shapes are those the training module will actually use
        exe = net.simple_bind(mx.cpu(), data=(1,) + tuple(data_shape),
                              label=(1, 1, 5), grad_req='null')
        arg_dict = exe.arg_dict
        # finetuning in this variant freezes nothing: the list replaces any
        # freeze pattern and stays empty, mismatching parameters are dropped
        fixed_param_names = []
        for k, v in arg_dict.items():
            if k in args and v.shape != args[k].shape:
                del args[k]
                logger.info("Removed %s", k)
    elif pretrained and not finetune:
        logger.info("Start training with {} from pretrained model {}".format(
            ctx_str, pretrained))
        _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
        args = convert_pretrained(pretrained, args)
    else:
        logger.info("Experimental: start training from scratch with {}".format(
            ctx_str))
        args = None
        auxs = None
        # training from scratch discards any freeze pattern given above
        fixed_param_names = None

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # visualize net - both train and test
    output_dir = os.path.dirname(prefix)
    net_visualization(net=net, network=network, data_shape=data_shape[2],
                      output_dir=output_dir, train=True)
    net_visualization(net=None, network=network, data_shape=data_shape[2],
                      output_dir=output_dir, train=False,
                      num_classes=num_classes)

    # init training module
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=fixed_param_names)

    batch_end_callback = []
    eval_end_callback = []
    epoch_end_callback = [
        mx.callback.do_checkpoint(prefix, period=checkpoint_period)
    ]

    # add logging to tensorboard
    if tensorboard:
        tensorboard_dir = os.path.join(os.path.dirname(prefix), 'logs')
        # create every sub-directory that is missing, even when the top-level
        # 'logs' directory already exists from an earlier run
        for parts in (('train', 'scalar'), ('train', 'dist'), ('val', 'roc'),
                      ('val', 'scalar'), ('val', 'images')):
            _ensure_dir(os.path.join(tensorboard_dir, *parts))
        batch_end_callback.append(
            ParseLogCallback(
                dist_logging_dir=os.path.join(tensorboard_dir, 'train', 'dist'),
                scalar_logging_dir=os.path.join(tensorboard_dir, 'train',
                                                'scalar'),
                logfile_path=log_file_path,
                batch_size=batch_size,
                iter_monitor=iter_monitor,
                frequent=frequent))
        eval_end_callback.append(
            mx.contrib.tensorboard.LogMetricsCallback(
                os.path.join(tensorboard_dir, 'val/scalar'), 'ssd'))
        eval_end_callback.append(
            LogROCCallback(
                logging_dir=os.path.join(tensorboard_dir, 'val/roc'),
                roc_path=os.path.join(os.path.dirname(prefix), 'roc'),
                class_names=class_names))
        eval_end_callback.append(
            LogDetectionsCallback(
                logging_dir=os.path.join(tensorboard_dir, 'val/images'),
                images_path=os.path.join(os.path.dirname(prefix), 'images'),
                class_names=class_names,
                batch_size=batch_size,
                mean_pixels=mean_pixels))

    # this callback should be the last in a series of batch_callbacks
    # since it is resetting the metric evaluation every $frequent batches
    batch_end_callback.append(
        mx.callback.Speedometer(train_iter.batch_size, frequent=frequent))

    learning_rate, lr_scheduler = get_lr_scheduler(
        learning_rate, lr_refactor_step, lr_refactor_ratio,
        num_example, batch_size, begin_epoch)
    # add possibility for different optimizer
    opt, opt_params = get_optimizer_params(optimizer=optimizer,
                                           learning_rate=learning_rate,
                                           momentum=momentum,
                                           weight_decay=weight_decay,
                                           lr_scheduler=lr_scheduler,
                                           ctx=ctx,
                                           logger=logger)
    # TODO monitor the gradient flow as in 'https://github.com/dmlc/tensorboard/blob/master/docs/tutorial/understanding-vanish-gradient.ipynb'
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None

    # run fit net, every n epochs we run evaluation network to get mAP
    roc_output_path = os.path.join(os.path.dirname(prefix), 'roc')
    if voc07_metric:
        valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names,
                                      pred_idx=3,
                                      roc_output_path=roc_output_path)
    else:
        valid_metric = MApMetric(ovp_thresh, use_difficult, class_names,
                                 pred_idx=3,
                                 roc_output_path=roc_output_path)

    mod.fit(train_iter,
            val_iter,
            eval_metric=MultiBoxMetric(),
            validation_metric=valid_metric,
            batch_end_callback=batch_end_callback,
            eval_end_callback=eval_end_callback,
            epoch_end_callback=epoch_end_callback,
            optimizer=opt,
            optimizer_params=opt_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            initializer=mx.init.Xavier(),
            arg_params=args,
            aux_params=auxs,
            allow_missing=True,
            monitor=monitor)
arg_params=arg_params, aux_params=aux_params, ctx=ctx, excluded_sym_names=excluded_sym_names, calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, logger=logger) sym_name = '%s-symbol.json' % ('./model/qssd_vgg16_reduced_300') param_name = '%s-%04d.params' % ('./model/qssd_vgg16_reduced_300', epoch) save_symbol(sym_name, qsym, logger) else: logger.info('Creating ImageRecordIter for reading calibration dataset') eval_iter = DetRecordIter(os.path.join(os.getcwd(), 'data', 'val.rec'), batch_size, data_shape, mean_pixels=(123, 117, 104), path_imglist="", **cfg.valid) qsym, qarg_params, aux_params = quantize_model( sym=sym, arg_params=arg_params, aux_params=aux_params, ctx=ctx, excluded_sym_names=excluded_sym_names, calib_mode=calib_mode, calib_data=eval_iter, num_calib_examples=num_calib_batches * batch_size, calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, label_names=(label_name, ),
def evaluate_net(net, path_imgrec, num_classes, mean_pixels, data_shape,
                 model_prefix, epoch, ctx=mx.cpu(), batch_size=1,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False):
    """
    Score a saved checkpoint on a validation record file and print the metrics.

    Parameters:
    ----------
    net : str or None
        network name used to rebuild the symbol, or None to use the symbol
        stored with the checkpoint as-is
    path_imgrec : str
        path to the record validation file
    num_classes : int
        number of classes, not including background
    mean_pixels : tuple
        (mean_r, mean_g, mean_b)
    data_shape : tuple or int
        (3, height, width) or height/width
    model_prefix : str
        model prefix of saved checkpoint; '_<height>' is appended before loading
    epoch : int
        load model epoch
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    path_imglist : str
        path to the list file to replace labels in record file, optional
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects (passed to the symbol as
        ``force_suppress``)
    ovp_thresh : float
        AP overlap threshold for true/false positives
    use_difficult : boolean
        whether to use difficult objects in evaluation if applicable
    class_names : comma separated str
        class names in string, must correspond to num_classes if set
    voc07_metric : boolean
        whether to use 11-point evaluation as in VOC07 competition
    """
    # logger setup
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # normalise the shape argument to (3, height, width)
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    model_prefix = model_prefix + '_' + str(data_shape[1])

    # validation data
    eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape,
                              mean_pixels=mean_pixels,
                              path_imglist=path_imglist, **cfg.valid)

    # checkpoint: stored symbol plus parameters
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)

    # pick the symbol to evaluate
    if net is not None:
        net = get_symbol(net, data_shape[1], num_classes=num_classes,
                         nms_thresh=nms_thresh, force_suppress=force_nms)
    else:
        net = load_net
    # the module below is created with a 'label' input, so add one to the
    # symbol if it is not already an argument
    if 'label' not in net.list_arguments():
        label = mx.sym.Variable(name='label')
        net = mx.sym.Group([net, label])

    # module with every argument fixed (inference only)
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=False, force_init=True)

    # choose the mAP flavour and score
    metric_cls = VOC07MApMetric if voc07_metric else MApMetric
    metric = metric_cls(ovp_thresh, use_difficult, class_names)
    results = mod.score(eval_iter, metric, num_batch=None)
    for name, value in results:
        print("{}: {}".format(name, value))
def evaluate_net(net, path_imgrec, num_classes, num_batch, mean_pixels,
                 data_shape, model_prefix, epoch, ctx=mx.cpu(), batch_size=32,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False, lite=False):
    """
    evaluate network given validation record file and report throughput

    After a few warm-up forward passes on random data, the whole validation
    iterator is scored; the number of images and images-per-second are logged
    and every metric is printed.

    Parameters:
    ----------
    net : str or None
        Network name or use None to load from json without modifying
    path_imgrec : str
        path to the record validation file
    num_classes : int
        number of classes, not including background
    num_batch : int
        kept for interface compatibility; scoring always runs over the whole
        iterator and the reported image count is taken from the batches
        actually scored.
        NOTE(review): confirm whether this was meant to be forwarded to
        ``mod.score`` to cap the number of batches.
    mean_pixels : tuple
        (mean_r, mean_g, mean_b)
    data_shape : tuple or int
        (3, height, width) or height/width
    model_prefix : str
        model prefix of saved checkpoint; '_<height>_<width>' is appended
        before loading
    epoch : int
        load model epoch
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    path_imglist : str
        path to the list file to replace labels in record file, optional
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects (passed to the symbol as
        ``force_suppress``)
    ovp_thresh : float
        AP overlap threshold for true/false positives
    use_difficult : boolean
        whether to use difficult objects in evaluation if applicable
    class_names : comma separated str
        class names in string, must correspond to num_classes if set
    voc07_metric : boolean
        whether to use 11-point evaluation as in VOC07 competition
    lite : boolean
        forwarded to ``get_symbol`` when ``net`` is given
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    model_prefix += '_' + str(data_shape[1]) + '_' + str(data_shape[2])

    # iterator
    eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape,
                              mean_pixels=mean_pixels,
                              path_imglist=path_imglist, **cfg.valid)
    # model params
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)
    # network
    if net is None:
        net = load_net
    else:
        net = get_symbol(net, data_shape, num_classes=num_classes,
                         nms_thresh=nms_thresh, force_suppress=force_nms,
                         lite=lite)
    # the module below is created with a 'label' input, so add one to the
    # symbol if it is not already an argument
    if 'label' not in net.list_arguments():
        label = mx.sym.Variable(name='label')
        net = mx.sym.Group([net, label])

    # init module
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=False, force_init=True)

    # run evaluation
    if voc07_metric:
        metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names)
    else:
        metric = MApMetric(ovp_thresh, use_difficult, class_names)

    # warm up with random input so one-off start-up cost is kept out of the
    # timed section
    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx)
            for _, shape in mod.data_shapes]
    batch = mx.io.DataBatch(data, [])  # empty label
    dry_run = 5  # use 5 iterations to warm up
    for _ in range(dry_run):
        mod.forward(batch, is_train=False)
        for output in mod.get_outputs():
            output.wait_to_read()

    # count the batches that are really scored: the statistics below used to
    # be derived from num_batch although scoring covers the whole iterator
    scored_batches = [0]  # list cell so the nested callback can update it

    def _count_batch(_param):
        scored_batches[0] += 1

    tic = time.time()
    results = mod.score(eval_iter, metric, num_batch=None,
                        batch_end_callback=[
                            _count_batch,
                            mx.callback.Speedometer(batch_size, frequent=10,
                                                    auto_reset=False),
                        ])
    elapsed = time.time() - tic
    # image count assumes every scored batch holds batch_size images
    num = scored_batches[0] * batch_size
    speed = num / elapsed if elapsed > 0 else float('inf')
    logger.info('Finished inference with %d images', num)
    logger.info('Finished with %f images per second', speed)

    for k, v in results:
        print("{}: {}".format(k, v))
def evaluate_net(net, path_imgrec, num_classes, mean_pixels, data_shape,
                 model_prefix, epoch, ctx=mx.cpu(), batch_size=1,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False, frequent=20):
    """
    Evaluate a network on a validation record file.

    The network graph is also rendered, and checkpoint parameters that the
    evaluation symbol does not use are dropped before loading.

    Parameters:
    ----------
    net : str or None
        Network name or use None to load from json without modifying
    path_imgrec : str
        path to the record validation file
    path_imglist : str
        path to the list file to replace labels in record file, optional
    num_classes : int
        number of classes, not including background
    mean_pixels : tuple
        (mean_r, mean_g, mean_b); accepted but not passed to the iterator here
    data_shape : tuple or int
        (3, height, width) or height/width
    model_prefix : str
        model prefix of saved checkpoint
    epoch : int
        load model epoch
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects
    ovp_thresh : float
        AP overlap threshold for true/false positives
    use_difficult : boolean
        whether to use difficult objects in evaluation if applicable
    class_names : comma separated str
        class names in string, must correspond to num_classes if set
    voc07_metric : boolean
        whether to use 11-point evaluation as in VOC07 competition
    frequent : int
        frequency to print out validation status
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3

    # iterator
    # NOTE(review): mean_pixels is not forwarded; presumably cfg.valid
    # supplies it - confirm.
    eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape,
                              path_imglist=path_imglist, **cfg.valid)

    # model params
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)

    # network
    if net is None:
        net = load_net
    else:
        net = get_symbol(net, data_shape[1], num_classes=num_classes,
                         nms_thresh=nms_thresh, force_suppress=force_nms)
    if 'label' not in net.list_arguments():
        label = mx.sym.Variable(name='label')
        net = mx.sym.Group([net, label])

    # Single-image shape used for plotting and for the probe bind below.
    # Derived from data_shape instead of the former hard-coded 300x300.
    probe_shape = (1,) + tuple(data_shape)
    probe_label_shape = (1, 1, 5)
    mx.viz.plot_network_detail(net,
                               shape={
                                   "data": probe_shape,
                                   "label": probe_label_shape
                               },
                               node_attrs={
                                   "hide_weights": "true",
                                   "fixedsize": 'false',
                                   "shape": 'oval'
                               }).view()

    # Bind once on CPU only to learn which argument names the symbol has,
    # then keep just those checkpoint parameters. A new dict is built because
    # deleting from `args` while iterating over it raises RuntimeError on
    # Python 3.
    exe = net.simple_bind(mx.cpu(), data=probe_shape,
                          label=probe_label_shape, grad_req='null')
    arg_dict = exe.arg_dict
    args = {k: v for k, v in args.items() if k in arg_dict}

    # init module
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=True, force_init=True)

    # run evaluation
    roc_path = os.path.join(os.path.dirname(model_prefix), 'roc')
    metric_cls = VOC07MApMetric if voc07_metric else MApMetric
    metric = metric_cls(ovp_thresh, use_difficult, class_names,
                        roc_output_path=roc_path)
    results = mod.score(eval_iter, metric, num_batch=None,
                        batch_end_callback=mx.callback.Speedometer(
                            batch_size, frequent=frequent, auto_reset=False))

    for k, v in results:
        print("{}: {}".format(k, v))
def evaluate_net(net, path_imgrec, num_classes, mean_img, data_shape,
                 model_prefix, epoch, path_img, ctx=mx.cpu(), batch_size=1,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False):
    """
    Run the network on a validation record file and analyse wrong detections.

    Predictions are classified by find_wrong_detection, per-category counts
    and a recall figure are printed, and an IoU histogram is drawn.

    Parameters:
    ----------
    net : str or None
        Network name or use None to load from json without modifying
    path_imgrec : str
        path to the record validation file
    path_imglist : str
        path to the list file to replace labels in record file, optional;
        also forwarded to find_wrong_detection
    num_classes : int
        number of classes, not including background
    mean_img : object
        forwarded to DetRecordIter as ``mean_img``
    data_shape : tuple or int
        (3, height, width) or height/width
    model_prefix : str
        model prefix of saved checkpoint; '_<height>' is appended
    epoch : int
        load model epoch
    path_img : str
        forwarded to find_wrong_detection (presumably the image directory -
        confirm against that helper)
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects
    ovp_thresh : float
        overlap threshold forwarded to find_wrong_detection
    use_difficult : boolean
        unused in this variant
    class_names : comma separated str
        unused in this variant
    voc07_metric : boolean
        unused in this variant
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    model_prefix += '_' + str(data_shape[1])
    netname = net  # remember the name before `net` is rebound to a symbol

    # iterator
    eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape,
                              mean_img=mean_img, path_imglist=path_imglist,
                              **cfg.valid)

    # model params
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)

    # network
    if net is None:
        net = load_net
    else:
        net = get_symbol(net, data_shape[1], num_classes=num_classes,
                         nms_thresh=nms_thresh, force_suppress=force_nms)
    if 'label' not in net.list_arguments():
        label = mx.sym.Variable(name='label')
        net = mx.sym.Group([net, label])

    # init module
    mod = mx.mod.Module(net, label_names=('label',), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=False, force_init=True)

    # predict over the whole iterator; output 0 holds the detections and
    # output 1 the grouped label symbol
    predict_results = mod.predict(eval_iter, merge_batches=True)
    preds = predict_results[0]
    labels = predict_results[1]

    (flags, ious) = find_wrong_detection.find_wrong_detection(
        labels, preds, path_imglist, path_img, ovp_thresh=ovp_thresh)

    # len() is used instead of truthiness because the helper's return type
    # is not visible here and may be an array.
    if len(flags) == 0 or len(ious) == 0:
        # Nothing to analyse: avoid ZeroDivisionError / min() on empty input.
        logger.warning("No detection results to analyse.")
        return

    flags_dict = {0: 'correct', 1: 'lower iou', 2: 'wrong class'}
    flag_count = Counter(flags)
    for flag in set(flags):
        print("%s image number is : %d" % (flags_dict[flag], flag_count[flag]))
    print("recall is %f" % ((len(flags) - flag_count[1]) / float(len(flags))))

    out_dir = './model/iou_distribution'
    if not os.path.exists(out_dir):
        # makedirs also creates './model' when it is missing
        os.makedirs(out_dir)

    # pad the histogram range by 0.1 on each side, clamped to [0, 1]
    lowest = min(ious)
    highest = max(ious)
    xmin = lowest - 0.1 if lowest > 0.1 else 0
    xmax = highest + 0.1 if highest < 0.9 else 1
    draw_hist(ious, "iou distribution", "iou", "image number", xmin, xmax, 0,
              len(ious) / 20, netname)
def train_net(net, train_path, num_classes, batch_size, data_shape,
              mean_pixels, resume, finetune, pretrained, epoch, prefix, ctx,
              begin_epoch, end_epoch, frequent, learning_rate, momentum,
              weight_decay, use_plateau, lr_refactor_step, lr_refactor_ratio,
              use_global_stats=0, freeze_layer_pattern='', num_example=10000,
              label_pad_width=350, nms_thresh=0.45, force_nms=False,
              ovp_thresh=0.5, use_difficult=False, class_names=None,
              ignore_names=None, optimizer_name='sgd', voc07_metric=False,
              nms_topk=400, force_suppress=False, train_list="", val_path="",
              val_list="", iter_monitor=0, monitor_pattern=".*",
              log_file=None):
    """
    Wrapper for training phase.

    Parameters:
    ----------
    net : str
        symbol name for the network structure
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_pixels : tuple of floats
        mean pixel values for red, green and blue
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune from previous checkpoint if > 0
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of either resume/finetune/pretrained model
    prefix : str
        prefix for saving checkpoints
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum, only used when optimizer_name is 'sgd'
    weight_decay : float
        training weight decay param
    use_plateau : boolean
        train with PlateauModule / PlateauScheduler instead of mx.mod.Module
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90';
        passed as ``patient_epochs`` to PlateauScheduler when use_plateau
    use_global_stats : int
        forwarded to get_symbol_train
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed
    num_example : int
        number of training images
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        suppress overlapped objects from different classes
    ignore_names : object
        forwarded to get_symbol_train
    optimizer_name : str
        optimizer name handed to init_optimizer / fit
    train_list : str
        list file path for training, this will replace the embedded labels in record
    val_path : str
        record file path for validation
    val_list : str
        list file path for validation, this will replace the embedded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log to file if enabled
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    prefix += '_' + net + '_' + str(data_shape[1])

    if isinstance(mean_pixels, (int, float)):
        mean_pixels = [mean_pixels, mean_pixels, mean_pixels]
    assert len(mean_pixels) == 3, "must provide all RGB mean values"

    train_iter = DetRecordIter(train_path, batch_size, data_shape,
                               mean_pixels=mean_pixels,
                               label_pad_width=label_pad_width,
                               path_imglist=train_list, **cfg.train)

    if val_path:
        val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                 mean_pixels=mean_pixels,
                                 label_pad_width=label_pad_width,
                                 path_imglist=val_list, **cfg.valid)
    else:
        val_iter = None

    # load symbol
    net_str = net
    net = get_symbol_train(net, data_shape[1],
                           use_global_stats=use_global_stats,
                           num_classes=num_classes, ignore_names=ignore_names,
                           nms_thresh=nms_thresh,
                           force_suppress=force_suppress, nms_topk=nms_topk)

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [
            name for name in net.list_arguments() if re_prog.match(name)
        ]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    if resume > 0:
        logger.info("Resume training with {} from epoch {}".format(
            ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}".format(
            ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(prefix, finetune)
        begin_epoch = finetune
        # the prediction convolution layers name starts with relu, so it's fine
        fixed_param_names = [name for name in net.list_arguments()
                             if name.startswith('conv')]
    elif pretrained:
        try:
            logger.info(
                "Start training with {} from pretrained model {}".format(
                    ctx_str, pretrained))
            _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
            args = convert_pretrained(pretrained, args)
            if net_str == 'ssd_pva':
                args, auxs = convert_pvanet(args, auxs)
        except Exception:
            # Best-effort load: fall back to training from scratch, but do not
            # swallow KeyboardInterrupt/SystemExit, and keep the traceback so
            # the reason for the failure is visible in the log.
            logger.info(
                "Failed to load the pretrained model. Start from scratch.",
                exc_info=True)
            args = None
            auxs = None
            fixed_param_names = None
    else:
        logger.info("Experimental: start training from scratch with {}".format(
            ctx_str))
        args = None
        auxs = None
        fixed_param_names = None

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # init training module
    if not use_plateau:  # focal loss does not go well with plateau
        mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                            context=ctx, fixed_param_names=fixed_param_names)
    else:
        mod = PlateauModule(net, label_names=('label', ), logger=logger,
                            context=ctx, fixed_param_names=fixed_param_names)

    # robust parameter setting
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    mod = set_mod_params(mod, args, auxs, logger)

    # fit parameters
    batch_end_callback = mx.callback.Speedometer(train_iter.batch_size,
                                                 frequent=frequent,
                                                 auto_reset=True)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None
    optimizer_params = {
        'learning_rate': learning_rate,
        'wd': weight_decay,
        'clip_gradient': 4.0,
        'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0
    }
    if optimizer_name == 'sgd':
        optimizer_params['momentum'] = momentum

    # #7847
    mod.init_optimizer(optimizer=optimizer_name,
                       optimizer_params=optimizer_params, force_init=True)

    if not use_plateau:
        # NOTE(review): lr_scheduler and the adjusted learning_rate are
        # computed here but never placed into optimizer_params, and the
        # optimizer has already been initialised above - confirm whether the
        # step schedule is meant to take effect.
        learning_rate, lr_scheduler = get_lr_scheduler(learning_rate,
                                                       lr_refactor_step,
                                                       lr_refactor_ratio,
                                                       num_example, batch_size,
                                                       begin_epoch)
    else:
        w_l1 = cfg.train['smoothl1_weight']
        eval_weights = {
            'CrossEntropy': 1.0,
            'SmoothL1': w_l1,
            'ObjectRecall': 0.0
        }
        plateau_lr = PlateauScheduler(
            patient_epochs=lr_refactor_step,
            factor=float(lr_refactor_ratio),
            eval_weights=eval_weights)
        # NOTE(review): machine-specific absolute path.
        plateau_metric = MultiBoxMetric(
            fn_stat='/home/hyunjoon/github/additions_mxnet/ssd/stat.txt')
        mod.init_optimizer(optimizer=optimizer_name,
                           optimizer_params=optimizer_params)

    eval_metric = MultiBoxMetric()

    # run fit net, every n epochs we run evaluation network to get mAP
    if voc07_metric:
        map_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names,
                                    pred_idx=4)
        recall_metric = RecallMetric(ovp_thresh, use_difficult, pred_idx=4)
        valid_metric = mx.metric.create([map_metric, recall_metric])
    else:
        valid_metric = MApMetric(ovp_thresh, use_difficult, class_names,
                                 pred_idx=4)

    if not use_plateau:
        mod.fit(train_iter,
                eval_data=val_iter,
                eval_metric=eval_metric,
                validation_metric=valid_metric,
                batch_end_callback=batch_end_callback,
                epoch_end_callback=epoch_end_callback,
                optimizer=optimizer_name,
                optimizer_params=optimizer_params,
                begin_epoch=begin_epoch,
                num_epoch=end_epoch,
                initializer=mx.init.Xavier(),
                arg_params=args,
                aux_params=auxs,
                allow_missing=True,
                monitor=monitor)
    else:
        mod.fit(train_iter,
                plateau_lr,
                plateau_metric=plateau_metric,
                fn_curr_model=prefix + '-1000.params',
                plateau_backtrace=False,
                eval_data=val_iter,
                eval_metric=eval_metric,
                validation_metric=valid_metric,
                validation_period=5,
                kvstore='local',
                batch_end_callback=batch_end_callback,
                epoch_end_callback=epoch_end_callback,
                optimizer=optimizer_name,
                optimizer_params=optimizer_params,
                begin_epoch=begin_epoch,
                num_epoch=end_epoch,
                initializer=mx.init.Xavier(),
                arg_params=args,
                aux_params=auxs,
                allow_missing=True,
                monitor=monitor)
def evaluate_net(net, path_imgrec, num_classes, mean_pixels, data_shape,
                 model_prefix, epoch, ctx=mx.cpu(), batch_size=1,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False, use_second_network=False, net1=None,
                 path_imgrec1=None, epoch1=None, model_prefix1=None,
                 data_shape1=None):
    """
    Evaluate a two-output detection network, optionally followed by a second
    ("sub") network scored through ``score_m``.

    Parameters:
    ----------
    net : str or None
        Network name or use None to load from json without modifying
    path_imgrec : str
        path to the record validation file (not read in this variant; the
        primary iterator is built from the Caltech image database)
    path_imglist : str
        path to the list file to replace labels in record file, optional
    num_classes : int
        number of classes, not including background
    mean_pixels : tuple
        (mean_r, mean_g, mean_b) (not read in this variant)
    data_shape : tuple, list or int
        (3, height, width), [height, width] or height/width
    model_prefix : str
        model prefix of saved checkpoint
    epoch : int
        load model epoch
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects
    ovp_thresh : float
        AP overlap threshold for true/false positives
    use_difficult : boolean
        whether to use difficult objects in evaluation if applicable
    class_names : comma separated str
        class names in string, must correspond to num_classes if set
    voc07_metric : boolean
        whether to use 11-point evaluation as in VOC07 competition
    use_second_network : boolean
        when True, skip scoring the primary module and evaluate the second
        network instead
    net1, path_imgrec1, epoch1, model_prefix1, data_shape1 :
        counterparts of net / path_imgrec / epoch / model_prefix / data_shape
        for the second network
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # normalise the primary data shape to (3, height, width)
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    elif isinstance(data_shape, list):
        data_shape = (3, data_shape[0], data_shape[1])
    assert len(data_shape) == 3 and data_shape[0] == 3

    # primary iterator: Caltech validation set located relative to this file
    here = os.path.abspath(os.path.dirname(__file__))
    caltech_root = os.path.join(here, '..', 'data',
                                'caltech-pedestrian-dataset-converter')
    imdb_val = load_caltech(image_set='val', caltech_path=caltech_root,
                            shuffle=False)
    # NOTE(review): mean is fixed at 128 and is_train=True although this is an
    # evaluation iterator - confirm both are intentional.
    eval_iter = DetIter(imdb_val, batch_size, (data_shape[1], data_shape[2]),
                        mean_pixels=[128, 128, 128], rand_samplers=[],
                        rand_mirror=False, shuffle=False, rand_seed=None,
                        is_train=True, max_crop_trial=50)

    # model params
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)

    # network
    if net is None:
        net = load_net
    else:
        net = get_symbol_concat(net, data_shape[1], num_classes=num_classes,
                                nms_thresh=nms_thresh,
                                force_suppress=force_nms)
    if not 'label' in net.list_arguments():
        label = mx.sym.Variable(name='label')
        label2 = mx.sym.Variable(name='label2')
        net = mx.sym.Group([net, label, label2])

    # init module
    mod = mx.mod.Module(net, label_names=('label', 'label2'), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=False, force_init=True)

    # both metric flavours take the same arguments; only the class differs
    metric_cls = VOC07MApMetric if voc07_metric else MApMetric
    metric = metric_cls(
        ovp_thresh, use_difficult, class_names, pred_idx=[0, 1],
        output_names=['detection_output', 'detection2_output'],
        label_names=['label', 'label2'])

    # run evaluation
    if not use_second_network:
        results = mod.score(eval_iter, metric, num_batch=None)
        for k, v in results:
            print("{}: {}".format(k, v))
        return

    logging.basicConfig()
    logger1 = logging.getLogger()
    logger1.setLevel(logging.INFO)

    # load sub network
    if isinstance(data_shape1, int):
        data_shape1 = (3, data_shape1, data_shape1)
    elif isinstance(data_shape1, list):
        data_shape1 = (3, data_shape1[0], data_shape1[1])
    assert len(data_shape1) == 3 and data_shape1[0] == 3

    # iterator
    eval_iter1 = DetRecordIter(path_imgrec1, batch_size, data_shape1,
                               path_imglist=path_imglist, **cfg.valid)

    # model params
    load_net1, args1, auxs1 = mx.model.load_checkpoint(model_prefix1, epoch1)

    # network
    # NOTE(review): a non-None net1 is replaced by the primary symbol `net`
    # rather than being built from its own name - confirm this is intended.
    net1 = load_net1 if net1 is None else net
    if 'label' not in net1.list_arguments():
        label1 = mx.sym.Variable(name='label')
        net1 = mx.sym.Group([net1, label1])

    # init module
    mod1 = mx.mod.Module(net1, label_names=('label', ), logger=logger1,
                         context=ctx,
                         fixed_param_names=net1.list_arguments())
    mod1.bind(data_shapes=eval_iter1.provide_data,
              label_shapes=eval_iter1.provide_label)
    mod1.set_params(args1, auxs1, allow_missing=False, force_init=True)

    metric1 = metric_cls(ovp_thresh, use_difficult, class_names)

    # NOTE(review): machine-specific absolute output path.
    filepath1 = '/home/binghao/workspace/MXNet-SSD/matlab/kitti/outputs/ssd_small/'
    mod1.score_m(filepath1, eval_iter1, metric1, num_batch=None)
def evaluate_net(net, path_imgrec, num_classes, mean_pixels, data_shape,
                 model_prefix, epoch, ctx=mx.cpu(), batch_size=1,
                 path_imglist="", nms_thresh=0.45, force_nms=False,
                 ovp_thresh=0.5, use_difficult=False, class_names=None,
                 voc07_metric=False, frequent=20):
    """
    Evaluate a network on a validation record file with mAP and pose metrics.

    Results are printed and written next to the checkpoint (in the directory
    of ``model_prefix``): 'evaluate_results' as text, and
    'reprojection_error' and 'gt_count' as pickles (protocol 2).

    Parameters:
    ----------
    net : str or None
        Network name or use None to load from json without modifying
    path_imgrec : str
        path to the record validation file
    path_imglist : str
        path to the list file to replace labels in record file, optional
    num_classes : int
        number of classes, not including background
    mean_pixels : tuple
        (mean_r, mean_g, mean_b)
    data_shape : tuple or int
        (3, height, width) or height/width
    model_prefix : str
        model prefix of saved checkpoint
    epoch : int
        load model epoch
    ctx : mx.ctx
        mx.gpu() or mx.cpu()
    batch_size : int
        validation batch size
    nms_thresh : float
        non-maximum suppression threshold
    force_nms : boolean
        whether suppress different class objects
    ovp_thresh : float
        AP overlap threshold for true/false positives
    use_difficult : boolean
        whether to use difficult objects in evaluation if applicable
    class_names : comma separated str
        class names in string, must correspond to num_classes if set
    voc07_metric : boolean
        whether to use 11-point evaluation as in VOC07 competition
    frequent : int
        frequency to print out validation status
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3

    # iterator
    eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape,
                              mean_pixels=mean_pixels, label_pad_width=350,
                              path_imglist=path_imglist, **cfg.valid)

    # model params
    load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch)

    # network
    if net is None:
        net = load_net
    else:
        net = get_symbol(net, data_shape[1], num_classes=num_classes,
                         nms_thresh=nms_thresh, force_suppress=force_nms)
    if 'label' not in net.list_arguments():
        label = mx.sym.Variable(name='label')
        net = mx.sym.Group([net, label])

    # init module
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=net.list_arguments())
    mod.bind(data_shapes=eval_iter.provide_data,
             label_shapes=eval_iter.provide_label)
    mod.set_params(args, auxs, allow_missing=False, force_init=True)

    # every artefact is written into the checkpoint's directory
    out_dir = os.path.dirname(model_prefix)

    # run evaluation
    metric_cls = VOC07MApMetric if voc07_metric else MApMetric
    metric = metric_cls(ovp_thresh, use_difficult, class_names,
                        roc_output_path=os.path.join(out_dir, 'roc'))

    # NOTE(review): machine-specific absolute dataset path.
    posemetric = PoseMetric(
        LINEMOD_path='/data/ZHANGXIN/DATASETS/SIXD_CHALLENGE/LINEMOD/',
        classes=class_names)

    # quantitative results
    results = mod.score(eval_iter, [metric, posemetric], num_batch=None,
                        batch_end_callback=mx.callback.Speedometer(
                            batch_size, frequent=frequent, auto_reset=False))

    # The `with` blocks close the files; no explicit close() is needed.
    with open(os.path.join(out_dir, 'evaluate_results'), 'w') as f:
        for k, v in results:
            print("{}: {}".format(k, v))
            f.write("{}: {}\n".format(k, v))

    with open(os.path.join(out_dir, 'reprojection_error'), 'wb') as f:
        pickle.dump(posemetric.Reproj, f, protocol=2)

    with open(os.path.join(out_dir, 'gt_count'), 'wb') as f:
        pickle.dump(posemetric.counts, f, protocol=2)
# Disabled export experiment, kept for reference:
# net.hybridize()
# out = net(data)
# net.export(path='./', epoch=0)

# set up logger
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
if log_file:
    # the log file lives next to the checkpoints (directory of `prefix`)
    log_file_path = os.path.join(os.path.dirname(prefix), log_file)
    log_dir = os.path.dirname(log_file_path)
    # Only create the directory when there is one: os.makedirs('') raises
    # when both prefix and log_file are bare file names.
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

# data iterators; validation is optional
train_iter = DetRecordIter(train_path, batch_size, data_shape,
                           mean_pixels=mean_pixels,
                           label_pad_width=label_pad_width,
                           path_imglist=train_list, **cfg.train)

if val_path:
    val_iter = DetRecordIter(val_path, batch_size, data_shape,
                             mean_pixels=mean_pixels,
                             label_pad_width=label_pad_width,
                             path_imglist=val_list, **cfg.valid)
else:
    val_iter = None

# losses and metrics for the classification and box-regression heads
cls_loss = MaskSoftmaxCELoss(class_axis=-1, batch_axis=0)
# cls_loss = FocalLoss(class_axis=-1, batch_axis=0, alpha=0.25, gamma=2)
box_loss = SmoothL1Loss(batch_axis=0)
cls_metric = mx.metric.Accuracy(axis=-1)
box_metric = MaskMAE()
def train_net(net, train_path, num_classes, batch_size, data_shape, mean_img,
              mean_img_dir, resume, finetune, pretrained, epoch, prefix, ctx,
              begin_epoch, end_epoch, frequent, learning_rate, momentum,
              weight_decay, lr_refactor_step, lr_refactor_ratio,
              convert_numpy=1, freeze_layer_pattern='', num_example=10000,
              label_pad_width=350, nms_thresh=0.45, force_nms=False,
              ovp_thresh=0.5, use_difficult=False, class_names=None,
              voc07_metric=False, nms_topk=400, force_suppress=False,
              train_list="", val_path="", val_list="", iter_monitor=0,
              monitor_pattern=".*", log_file=None, summarywriter=0,
              flush_secs=180):
    """
    Wrapper for training phase.

    Parameters:
    ----------
    net : str
        symbol name for the network structure
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_img : object
        forwarded to DetRecordIter as ``mean_img`` and to _convert_mean_numpy
    mean_img_dir : object
        forwarded to _convert_mean_numpy
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune from previous checkpoint if > 0
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of either resume/finetune/pretrained model
    prefix : str
        prefix for saving checkpoints
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum
    weight_decay : float
        training weight decay param
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90'
    convert_numpy : int
        forwarded to _convert_mean_numpy
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed
    num_example : int
        number of training images
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        suppress overlapped objects from different classes
    train_list : str
        list file path for training, this will replace the embedded labels in record
    val_path : str
        record file path for validation
    val_list : str
        list file path for validation, this will replace the embedded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log to file if enabled
    summarywriter : int
        when truthy, write the graph and metrics through a SummaryWriter
    flush_secs : int
        forwarded to SummaryWriter
    """
    # NOTE(review): installation-specific absolute path.
    summary_dir = '/opt/incubator-mxnet/example/ssd/logs'

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    prefix += '_' + net + '_' + str(data_shape[1])

    train_iter = DetRecordIter(train_path, batch_size, data_shape,
                               mean_img=mean_img,
                               label_pad_width=label_pad_width,
                               path_imglist=train_list, **cfg.train)

    if val_path:
        val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                 mean_img=mean_img,
                                 label_pad_width=label_pad_width,
                                 path_imglist=val_list, **cfg.valid)
    else:
        val_iter = None

    # convert mean.bin to mean.npy
    _convert_mean_numpy(convert_numpy, mean_img_dir, mean_img)

    # load symbol
    net = get_symbol_train(net, data_shape[1], num_classes=num_classes,
                           nms_thresh=nms_thresh,
                           force_suppress=force_suppress, nms_topk=nms_topk)

    if summarywriter:
        if os.path.exists(summary_dir):
            shutil.rmtree(summary_dir)  # clear the previous logs
        # makedirs so a missing parent directory does not abort training
        os.makedirs(summary_dir)
        sw = SummaryWriter(logdir=summary_dir, flush_secs=flush_secs)
        sw.add_graph(net)
    else:
        sw = None

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [
            name for name in net.list_arguments() if re_prog.match(name)
        ]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    if resume > 0:
        logger.info("Resume training with {} from epoch {}".format(
            ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}".format(
            ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(prefix, finetune)
        begin_epoch = finetune
        # the prediction convolution layers name starts with relu, so it's fine
        fixed_param_names = [name for name in net.list_arguments()
                             if name.startswith('conv')]
    elif pretrained:
        logger.info("Start training with {} from pretrained model {}".format(
            ctx_str, pretrained))
        _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
        args = convert_pretrained(pretrained, args)
    else:
        logger.info("Experimental: start training from scratch with {}".format(
            ctx_str))
        args = None
        auxs = None
        fixed_param_names = None

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # init training module
    mod = mx.mod.Module(net, label_names=('label', ), logger=logger,
                        context=ctx, fixed_param_names=fixed_param_names)

    # fit parameters
    if summarywriter:
        # Add the visualisation callback. With several callbacks, none except
        # the last one may reset the metric, so the Speedometer must run with
        # auto_reset=False; otherwise the summary callback that follows it
        # would read metrics that were just cleared.
        batch_end_callbacks = [
            mx.callback.Speedometer(train_iter.batch_size, frequent=frequent,
                                    auto_reset=False),
            summary_writter_callback.summary_writter_eval_metric(sw)
        ]
    else:
        batch_end_callbacks = [
            mx.callback.Speedometer(train_iter.batch_size, frequent=frequent,
                                    auto_reset=False)
        ]
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    learning_rate, lr_scheduler = get_lr_scheduler(learning_rate,
                                                   lr_refactor_step,
                                                   lr_refactor_ratio,
                                                   num_example, batch_size,
                                                   begin_epoch)
    optimizer_params = {
        'learning_rate': learning_rate,
        'momentum': momentum,
        'wd': weight_decay,
        'lr_scheduler': lr_scheduler,
        'clip_gradient': None,
        'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0
    }
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None

    # run fit net, every n epochs we run evaluation network to get mAP
    metric_cls = VOC07MApMetric if voc07_metric else MApMetric
    valid_metric = metric_cls(ovp_thresh, use_difficult, class_names,
                              pred_idx=3)

    try:
        mod.fit(train_iter,
                val_iter,
                eval_metric=MultiBoxMetric(),
                validation_metric=valid_metric,
                batch_end_callback=batch_end_callbacks,
                epoch_end_callback=epoch_end_callback,
                optimizer='sgd',
                optimizer_params=optimizer_params,
                begin_epoch=begin_epoch,
                num_epoch=end_epoch,
                initializer=mx.init.Xavier(),
                arg_params=args,
                aux_params=auxs,
                allow_missing=True,
                monitor=monitor)
    finally:
        # close the writer even when training is interrupted or fails, so
        # buffered summaries are not lost
        if summarywriter:
            sw.close()
def _attach_file_handler(logger, log_file):
    """Attach a FileHandler for ``log_file`` to ``logger`` at most once.

    Calling the training entry point repeatedly in one process would
    otherwise stack one handler per call, duplicating every log line and
    keeping an extra open file handle each time.
    """
    # logging.FileHandler stores the absolute path in ``baseFilename``
    target = os.path.abspath(log_file)
    for handler in logger.handlers:
        if (isinstance(handler, logging.FileHandler)
                and handler.baseFilename == target):
            return
    logger.addHandler(logging.FileHandler(log_file))


def _load_initial_params(net, logger, ctx, resume, finetune, pretrained,
                         epoch, prefix, begin_epoch, fixed_param_names):
    """Pick the starting weights according to resume/finetune/pretrained.

    The modes are checked in that priority order; the first one that
    applies wins, otherwise training starts from scratch.

    Returns:
    ----------
    tuple of (args, auxs, begin_epoch, fixed_param_names)
        args/auxs are the loaded parameters (None when training from
        scratch), begin_epoch is replaced by the checkpoint epoch for
        resume/finetune, and fixed_param_names is the possibly overridden
        list of frozen parameters.
    """
    ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')'
    if resume > 0:
        logger.info("Resume training with {} from epoch {}".format(
            ctx_str, resume))
        _, args, auxs = mx.model.load_checkpoint(prefix, resume)
        begin_epoch = resume
    elif finetune > 0:
        logger.info("Start finetuning with {} from epoch {}".format(
            ctx_str, finetune))
        _, args, auxs = mx.model.load_checkpoint(prefix, finetune)
        begin_epoch = finetune
        # the prediction convolution layers name starts with relu, so it's fine
        fixed_param_names = [name for name in net.list_arguments()
                             if name.startswith('conv')]
    elif pretrained:
        logger.info("Start training with {} from pretrained model {}".format(
            ctx_str, pretrained))
        # any selection made through freeze_layer_pattern is dropped here
        fixed_param_names = None
        _, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
        args = convert_pretrained(pretrained, args)
    else:
        logger.info("Experimental: start training from scratch with {}".format(
            ctx_str))
        args = None
        auxs = None
        fixed_param_names = None
    return args, auxs, begin_epoch, fixed_param_names


def train_net(net, train_path, num_classes, batch_size,
              data_shape, mean_pixels, resume, finetune, pretrained, epoch,
              prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate,
              momentum, weight_decay, lr_refactor_step, lr_refactor_ratio,
              freeze_layer_pattern='',
              num_example=10000, label_pad_width=350,
              nms_thresh=0.45, force_nms=False, ovp_thresh=0.5,
              use_difficult=False, class_names=None,
              voc07_metric=False, nms_topk=2000, force_suppress=False,
              train_list="", val_path="", val_list="", iter_monitor=0,
              monitor_pattern=".*", log_file=None):
    """
    Wrapper for training phase.

    Builds the train/validation record iterators, creates the training
    symbol, loads the starting weights and runs ``Module.fit`` with SGD,
    saving a checkpoint under ``prefix`` at the end of every epoch.

    Parameters:
    ----------
    net : str
        symbol name for the network structure
    train_path : str
        record file path for training
    num_classes : int
        number of object classes, not including background
    batch_size : int
        training batch-size
    data_shape : int or tuple
        width/height as integer or (3, height, width) tuple
    mean_pixels : tuple of floats
        mean pixel values for red, green and blue; a single number is
        repeated for all three channels
    resume : int
        resume from previous checkpoint if > 0
    finetune : int
        fine-tune from previous checkpoint if > 0
    pretrained : str
        prefix of pretrained model, including path
    epoch : int
        load epoch of the pretrained model
    prefix : str
        prefix for saving checkpoints; '_<net>_<data_shape[1]>' is appended
    ctx : [mx.cpu()] or [mx.gpu(x)]
        list of mxnet contexts
    begin_epoch : int
        starting epoch for training, should be 0 if not otherwise specified;
        replaced by the checkpoint epoch when resuming or fine-tuning
    end_epoch : int
        end epoch of training
    frequent : int
        frequency to print out training status
    learning_rate : float
        training learning rate
    momentum : float
        training momentum
    weight_decay : float
        training weight decay param
    lr_refactor_step : comma separated integers
        at which epoch to rescale learning rate, e.g. '30, 60, 90'
    lr_refactor_ratio : float
        multiplier for reducing learning rate
    freeze_layer_pattern : str
        regex pattern for layers need to be fixed. The resulting selection
        is only kept when resuming: fine-tuning replaces it with every
        argument whose name starts with 'conv', and starting from a
        pretrained model or from scratch clears it.
    num_example : int
        number of training images
    label_pad_width : int
        force padding training and validation labels to sync their label widths
    nms_thresh : float
        non-maximum suppression threshold for validation
    force_nms : boolean
        suppress overlapped objects from different classes.
        NOTE(review): this argument is never read in the function body;
        only ``force_suppress`` is forwarded to the symbol - confirm
        whether callers expect it to have an effect.
    ovp_thresh : float
        forwarded as the first argument of the validation mAP metric
    use_difficult : boolean
        forwarded to the validation mAP metric
    class_names : list of str or None
        forwarded to the validation mAP metric
    voc07_metric : boolean
        validate with VOC07MApMetric instead of MApMetric if true
    nms_topk : int
        forwarded to get_symbol_train
    force_suppress : boolean
        forwarded to get_symbol_train
    train_list : str
        list file path for training, this will replace the embedded labels in record
    val_path : str
        record file path for validation; no validation is run if empty
    val_list : str
        list file path for validation, this will replace the embedded labels in record
    iter_monitor : int
        monitor internal stats in networks if > 0, specified by monitor_pattern
    monitor_pattern : str
        regex pattern for monitoring network stats
    log_file : str
        log to file if enabled
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        _attach_file_handler(logger, log_file)

    # check args
    if isinstance(data_shape, int):
        data_shape = (3, data_shape, data_shape)
    assert len(data_shape) == 3 and data_shape[0] == 3
    # ``net`` is still the symbol name here; it becomes a symbol below
    prefix += '_' + net + '_' + str(data_shape[1])

    # a scalar mean is broadcast to all three channels
    if isinstance(mean_pixels, (int, float)):
        mean_pixels = [mean_pixels, mean_pixels, mean_pixels]
    assert len(mean_pixels) == 3, "must provide all RGB mean values"

    # data iterators; validation is optional
    train_iter = DetRecordIter(train_path, batch_size, data_shape,
                               mean_pixels=mean_pixels,
                               label_pad_width=label_pad_width,
                               path_imglist=train_list, **cfg.train)

    if val_path:
        val_iter = DetRecordIter(val_path, batch_size, data_shape,
                                 mean_pixels=mean_pixels,
                                 label_pad_width=label_pad_width,
                                 path_imglist=val_list, **cfg.valid)
    else:
        val_iter = None

    # load symbol
    net = get_symbol_train(net, data_shape[1], num_classes=num_classes,
                           nms_thresh=nms_thresh,
                           force_suppress=force_suppress, nms_topk=nms_topk)

    # define layers with fixed weight/bias
    if freeze_layer_pattern.strip():
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [name for name in net.list_arguments()
                             if re_prog.match(name)]
    else:
        fixed_param_names = None

    # load pretrained or resume from previous state
    args, auxs, begin_epoch, fixed_param_names = _load_initial_params(
        net, logger, ctx, resume, finetune, pretrained, epoch, prefix,
        begin_epoch, fixed_param_names)

    # helper information
    if fixed_param_names:
        logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']')

    # init training module
    mod = mx.mod.Module(net, label_names=('label',), logger=logger,
                        context=ctx, fixed_param_names=fixed_param_names)

    # fit parameters
    batch_end_callback = mx.callback.Speedometer(train_iter.batch_size,
                                                 frequent=frequent)
    epoch_end_callback = mx.callback.do_checkpoint(prefix)
    learning_rate, lr_scheduler = get_lr_scheduler(
        learning_rate, lr_refactor_step, lr_refactor_ratio,
        num_example, batch_size, begin_epoch)
    optimizer_params = {
        'learning_rate': learning_rate,
        'momentum': momentum,
        'wd': weight_decay,
        'lr_scheduler': lr_scheduler,
        'clip_gradient': None,
        # guard against an empty context list to avoid dividing by zero
        'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0,
    }
    monitor = mx.mon.Monitor(
        iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None

    # run fit net, every n epochs we run evaluation network to get mAP
    if voc07_metric:
        valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names,
                                      pred_idx=3)
    else:
        valid_metric = MApMetric(ovp_thresh, use_difficult, class_names,
                                 pred_idx=3)

    mod.fit(train_data=train_iter,
            eval_data=val_iter,
            eval_metric=MultiBoxMetric(),
            validation_metric=valid_metric,
            batch_end_callback=batch_end_callback,
            epoch_end_callback=epoch_end_callback,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            initializer=mx.init.Xavier(),
            arg_params=args,
            aux_params=auxs,
            allow_missing=True,
            monitor=monitor)