def forward_backward_v2(net, criterions, ctx, packet, is_train=True):
    data, ht8, ht8_mask, paf8, paf8_mask = packet
    criterion, criterion_ohkm = criterions
    # split to gpus
    data = gl.utils.split_and_load(data, ctx)
    ht8 = gl.utils.split_and_load(ht8, ctx)
    ht8_mask = gl.utils.split_and_load(ht8_mask, ctx)
    paf8 = gl.utils.split_and_load(paf8, ctx)
    paf8_mask = gl.utils.split_and_load(paf8_mask, ctx)
    # run
    ag.set_recording(is_train)
    ag.set_training(is_train)
    losses = []
    for data_, ht8_, paf8_, ht8_mask_, paf8_mask_ in zip(
            data, ht8, paf8, ht8_mask, paf8_mask):
        # forward
        out_ = net(data_)
        losses_ = []
        num_stage = len(out_)
        for i in range(num_stage):
            losses_.append(criterion(out_[i][0], ht8_, ht8_mask_))
            losses_.append(criterion(out_[i][1], paf8_, paf8_mask_))
        losses.append(losses_)
        # backward
        if is_train:
            ag.backward(losses_)
    ag.set_recording(False)
    ag.set_training(False)
    return losses
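
# A minimal usage sketch for forward_backward_v2 above, assuming `gl` is
# mxnet.gluon, `ag` is mxnet.autograd, and that `net`, `criterions`, `ctx`,
# a gluon DataLoader `train_loader`, and a gluon `trainer` already exist.
# The helper name `reduce_loss` is hypothetical, added for illustration.
def reduce_loss(losses):
    # sum the per-stage losses across gpu shards into one scalar for logging
    return sum(loss.sum().asscalar()
               for losses_ in losses for loss in losses_)

for batch_idx, packet in enumerate(train_loader):
    losses = forward_backward_v2(net, criterions, ctx, packet, is_train=True)
    trainer.step(packet[0].shape[0])  # normalize the update by batch size
    if batch_idx % 100 == 0:
        print('batch %d loss %.4f' % (batch_idx, reduce_loss(losses)))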
def evaluate(model, dataset_type='train', ema=None):
    r"""Evaluate the model on the train/dev/test dataset.

    This function is just an encapsulation of the official evaluate
    function. The official evaluate code can be found at
    https://rajpurkar.github.io/SQuAD-explorer/

    Parameters
    ----------
    dataset_type : string, default 'train'
        Which dataset to evaluate.
    ema : object or None, default None
        Whether to use the shadow (exponential moving average) variables
        for evaluation.
    """
    model.save_parameters('tmp')
    if ema is not None:
        for name, params in model.collect_params().items():
            params.set_data(ema.get(name))
    if dataset_type == 'train':
        data_loader = DataLoader(batch_size=EVAL_BATCH_SIZE, dev_set=False)
    else:
        data_loader = DataLoader(batch_size=EVAL_BATCH_SIZE, dev_set=True)
    autograd.set_training(False)
    total_answers = {}
    for batch_data in tqdm(data_loader.next_batch()):
        ids = [x[0] for x in batch_data]
        context = nd.array([x[1] for x in batch_data], ctx=ctx)
        context_mask = context > 0
        query = nd.array([x[2] for x in batch_data], ctx=ctx)
        query_mask = query > 0
        context_char = nd.array([x[3] for x in batch_data], ctx=ctx)
        query_char = nd.array([x[4] for x in batch_data], ctx=ctx)
        raw_context = [x[7] for x in batch_data]
        spans = [x[8] for x in batch_data]
        begin_hat, end_hat, _, _ = model(context, query, context_char,
                                         query_char, context_mask,
                                         query_mask, None, None)
        begin_hat = begin_hat.softmax(axis=1)
        end_hat = end_hat.softmax(axis=1)
        answer_span_pair = matrix_answer_select(begin_hat, end_hat)
        for i, a, r, s in zip(ids, answer_span_pair, raw_context, spans):
            total_answers[i] = format_answer(a, r, s)
    model.load_parameters('tmp', ctx=CTX)
    autograd.set_training(True)
    if dataset_type == 'train':
        with open(DATA_PATH + RAW_TRAIN_FILE) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
    else:
        with open(DATA_PATH + RAW_DEV_FILE) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
    result = offical_eval(dataset, total_answers)
    f1_score = result['f1']
    em_score = result['exact_match']
    return f1_score, em_score
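
# The `ema` argument above only needs a get(name) method returning shadow
# values for each parameter name. A minimal sketch of such a helper follows;
# the class and its decay constant are hypothetical, not from the source.
class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {name: p.data().copy()
                       for name, p in model.collect_params().items()}

    def update(self, model):
        # shadow = decay * shadow + (1 - decay) * current parameter value
        for name, p in model.collect_params().items():
            self.shadow[name] = (self.decay * self.shadow[name] +
                                 (1 - self.decay) * p.data())

    def get(self, name):
        return self.shadow[name]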
def export_model(self):
    if not isinstance(self.model, nn.HybridBlock):
        raise ValueError(
            "Expected a HybridBlock but the model does not appear to be one.")
    autograd.set_training(False)
    autograd.set_recording(False)
    loader = self.create_dataloader("train")
    raw_data = next(iter(loader))
    splitted_data = utils.split_and_load(raw_data, self.ctx)
    # run one forward pass in inference mode so the hybridized graph is
    # built and cached before exporting
    for data in splitted_data:
        inputs, labels = self.parse_data(data, "train")
        self.model(*inputs)
    self.model.export(os.path.join(self.config.PARAM_DIR, "model"), 9999)
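
# A sketch of loading the artifacts written by export_model above.
# HybridBlock.export(prefix, 9999) writes "<prefix>-symbol.json" and
# "<prefix>-9999.params", which SymbolBlock can import for deployment
# without the original Python class. Here `config` and `ctx` are assumed
# to be in scope, and the input name 'data' is an assumption.
import os
from mxnet import gluon

prefix = os.path.join(config.PARAM_DIR, "model")
deployed = gluon.nn.SymbolBlock.imports(prefix + "-symbol.json", ['data'],
                                        prefix + "-9999.params", ctx=ctx)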
def forward_backward(net, criterions, ctx, data, rois, is_train=True):
    criterion_cls1, criterion_cls2, criterion_reg = criterions
    data = gl.utils.split_and_load(data, ctx)
    rois = gl.utils.split_and_load(rois, ctx)
    ag.set_recording(is_train)
    ag.set_training(is_train)
    # forward rpn
    rpn_cls1, rpn_reg1, rpn_cls2, rpn_reg2 = [], [], [], []
    for data_ in data:
        rpn_cls1_, rpn_reg1_, rpn_cls2_, rpn_reg2_ = net(data_)
        rpn_cls1.append(rpn_cls1_)
        rpn_reg1.append(rpn_reg1_)
        rpn_cls2.append(rpn_cls2_)
        rpn_reg2.append(rpn_reg2_)
    losses = []
    anchor_proposals = net.anchor_proposals
    for data_, rois_, rpn_cls1_, rpn_reg1_, rpn_cls2_, rpn_reg2_ in zip(
            data, rois, rpn_cls1, rpn_reg1, rpn_cls2, rpn_reg2):
        im_info = data_.shape[-2:]
        # anchor target
        # feat 1/8, parallel stops here
        batch_label1, batch_label_weight1, batch_bbox_targets1, \
            batch_bbox_weights1 = anchor_proposals[0].target(
                rpn_cls1_, rois_, im_info)
        # loss cls
        loss_cls1 = criterion_cls1(rpn_cls1_, batch_label1,
                                   batch_label_weight1) / data_.shape[0]
        # loss reg
        loss_reg1 = criterion_reg(rpn_reg1_, batch_bbox_targets1,
                                  batch_bbox_weights1) / data_.shape[0]
        # feat 1/16, parallel stops here
        batch_label2, batch_label_weight2, batch_bbox_targets2, \
            batch_bbox_weights2 = anchor_proposals[1].target(
                rpn_cls2_, rois_, im_info)
        # loss cls
        loss_cls2 = criterion_cls2(rpn_cls2_, batch_label2,
                                   batch_label_weight2) / data_.shape[0]
        # loss reg
        loss_reg2 = criterion_reg(rpn_reg2_, batch_bbox_targets2,
                                  batch_bbox_weights2) / data_.shape[0]
        loss = [loss_cls1, loss_reg1, loss_cls2, loss_reg2]
        # backward
        if is_train:
            ag.backward(loss)
        losses.append(loss)
    ag.set_recording(False)
    ag.set_training(False)
    return losses
def validate_faster_rcnn(net, val_data, cfg):
    """Test on validation dataset."""
    # When hybridize is true, set the network to test mode and set the
    # proposal nms test params, then clear and cache a new compute graph.
    # FIXME Will raise a deferred init error if the hybridized net is
    # called in test mode first.
    net.proposal.set_nms(cfg.rpn_test_pre_nms_top_n,
                         cfg.rpn_test_post_nms_top_n)
    if cfg.hybridize:
        autograd.set_training(train_mode=False)
        net.hybridize()
    metric = VOC07MApMetric(iou_thresh=0.5, class_names=cfg.classes)
    for batch in val_data:
        pred_bboxes = []
        pred_cls = []
        pred_scores = []
        gt_bboxes = []
        gt_cls = []
        gt_difficults = []
        # Split and load data for multi-gpu
        data_list = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                               batch_axis=0)
        gt_box_list = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                 batch_axis=0)
        im_info_list = gluon.utils.split_and_load(batch[2], ctx_list=ctx,
                                                  batch_axis=0)
        for data, gt_box, im_info in zip(data_list, gt_box_list,
                                         im_info_list):
            # get prediction results
            cls, scores, bboxes = net(data, im_info)
            pred_cls.append(cls)
            pred_scores.append(scores)
            pred_bboxes.append(bboxes)
            # split ground truths
            gt_cls.append(gt_box.slice_axis(axis=-1, begin=4, end=5))
            gt_bboxes.append(gt_box.slice_axis(axis=-1, begin=0, end=4))
            gt_difficults.append(
                gt_box.slice_axis(axis=-1, begin=5, end=6)
                if gt_box.shape[-1] > 5 else None)
        # update metric
        metric.update(pred_bboxes, pred_cls, pred_scores, gt_bboxes, gt_cls,
                      gt_difficults)
    return metric.get()
def forward_backward_v3(net, criterions, ctx, packet, is_train=True):
    (data, ht4, ht4_mask, paf4, paf4_mask, ht8, ht8_mask, paf8, paf8_mask,
     ht16, ht16_mask, paf16, paf16_mask) = packet
    criterion, criterion_ohkm = criterions
    ht = [ht4, ht8, ht16]
    paf = [paf4, paf8, paf16]
    ht_mask = [ht4_mask, ht8_mask, ht16_mask]
    paf_mask = [paf4_mask, paf8_mask, paf16_mask]
    # split to gpus
    data = gl.utils.split_and_load(data, ctx)
    ht = [gl.utils.split_and_load(x, ctx) for x in ht]
    paf = [gl.utils.split_and_load(x, ctx) for x in paf]
    ht_mask = [gl.utils.split_and_load(x, ctx) for x in ht_mask]
    paf_mask = [gl.utils.split_and_load(x, ctx) for x in paf_mask]
    # run
    ag.set_recording(is_train)
    ag.set_training(is_train)
    losses = []
    for idx, data_ in enumerate(data):
        # forward
        (g_ht4, g_paf4, r_ht4, r_paf4, g_ht8, g_paf8, r_ht8, r_paf8,
         g_ht16, g_paf16, r_ht16, r_paf16) = net(data_)
        ht4_, ht8_, ht16_ = [h[idx] for h in ht]
        paf4_, paf8_, paf16_ = [p[idx] for p in paf]
        ht4_mask_, ht8_mask_, ht16_mask_ = [hm[idx] for hm in ht_mask]
        paf4_mask_, paf8_mask_, paf16_mask_ = [pm[idx] for pm in paf_mask]
        # loss
        losses_ = [
            criterion(g_ht4, ht4_, ht4_mask_),
            criterion_ohkm(r_ht4, ht4_, ht4_mask_),
            criterion(g_ht8, ht8_, ht8_mask_),
            criterion_ohkm(r_ht8, ht8_, ht8_mask_),
            criterion(g_ht16, ht16_, ht16_mask_),
            criterion_ohkm(r_ht16, ht16_, ht16_mask_),
            criterion(g_paf4, paf4_, paf4_mask_),
            criterion(r_paf4, paf4_, paf4_mask_),
            criterion(g_paf8, paf8_, paf8_mask_),
            criterion(r_paf8, paf8_, paf8_mask_),
            criterion(g_paf16, paf16_, paf16_mask_),
            criterion(r_paf16, paf16_, paf16_mask_)
        ]
        losses.append(losses_)
        # backward
        if is_train:
            ag.backward(losses_)
    ag.set_recording(False)
    ag.set_training(False)
    return losses
def forward_backward(net, criterion, ctx, packet, is_train=True):
    data, ht, mask = packet
    data = gl.utils.split_and_load(data, ctx)
    ht = gl.utils.split_and_load(ht, ctx)
    mask = gl.utils.split_and_load(mask, ctx)
    # run
    ag.set_recording(is_train)
    ag.set_training(is_train)
    losses = []
    for data_, ht_, mask_ in zip(data, ht, mask):
        pred_ = net(data_)
        losses_ = [criterion(ht_, pred_, mask_)]
        losses.append(losses_)
        if is_train:
            ag.backward(losses_)
    ag.set_recording(False)
    ag.set_training(False)
    return losses
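
# A minimal sketch of how a forward_backward helper like the one above is
# typically driven, assuming `net`, `criterion`, `ctx`, a gluon `trainer`,
# and DataLoaders `train_loader` / `val_loader` are already set up.
for packet in train_loader:
    forward_backward(net, criterion, ctx, packet, is_train=True)
    trainer.step(packet[0].shape[0])

# With is_train=False the same helper runs a recording-free, test-mode
# forward pass, so it can be reused to compute validation loss.
val_losses = [forward_backward(net, criterion, ctx, packet, is_train=False)
              for packet in val_loader]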
def _process_epoch(self, mode):
    color_code = esc_seq.GREEN if sys.platform != "win32" else ""
    end_color_code = esc_seq.END if sys.platform != "win32" else ""
    print(color_code +
          "{}: epoch {:3d}/{:3d}".format(mode, self.latest_state + 1,
                                         self.config.MAX_EPOCHS) +
          end_color_code)
    loader = self.create_dataloader(mode)
    handler = self.create_handler(mode=mode, num_batch=len(loader))
    for i, raw_data in enumerate(loader):
        gathered_outputs = []
        gathered_losses = []
        losses = []
        tick = time.time()
        splitted_data = utils.split_and_load(raw_data, self.ctx)
        if mode == "train":
            autograd.set_training(True)
            autograd.set_recording(True)
        elif mode == "test":
            autograd.set_training(False)
            autograd.set_recording(False)
        for data in splitted_data:
            inputs, labels = self.parse_data(data, mode)
            outputs = self.parse_output(self.model(*inputs), mode)
            gathered_outputs.append(outputs)
            loss = self.compute_loss(outputs, labels)
            gathered_losses.append(loss)
            if mode == "train":
                losses.extend(loss)
        autograd.set_training(False)
        autograd.set_recording(False)
        if mode == "train":
            autograd.backward(losses)
            self.trainer.step(raw_data[0].shape[0])
        handler.cleanup_batch(raw_data, gathered_outputs, gathered_losses,
                              i, tick)
    handler.cleanup_epoch()
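
# _process_epoch above references an `esc_seq` helper that is not shown in
# this section. A minimal sketch of what it presumably contains (standard
# ANSI color escape codes); the exact definition is an assumption.
class esc_seq:
    GREEN = "\033[92m"
    END = "\033[0m"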
def train_rpn(net, train_data, cfg):
    """Training pipeline"""
    rpn_loss = RPNLoss(cfg.rpn_batch_size)
    rpn_loss.initialize(ctx=ctx)
    if cfg.hybridize:
        autograd.set_training(train_mode=True)
        net.hybridize()
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'learning_rate': cfg.lr, 'wd': cfg.wd, 'momentum': cfg.momentum,
         'clip_gradient': 5})
    # lr decay policy
    lr_decay = float(cfg.lr_decay)
    lr_steps = sorted(cfg.lr_decay_epochs)
    # Create Metrics
    log_metric = LogLossMetric(name='LogLoss', batch_size=cfg.rpn_batch_size)
    smoothl1_metric = SmoothL1LossMetric(name='SmoothL1Loss',
                                         batch_size=cfg.rpn_batch_size)
    logger.info('Config for training RPN:\n%s' % cfg)
    logger.info('Start training from [Epoch %d]' % args.start_epoch)
    for epoch in range(cfg.start_epoch, cfg.end_epoch):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        tic = time.time()
        btic = time.time()
        log_metric.reset()
        smoothl1_metric.reset()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data_list = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                                   batch_axis=0)
            gt_box_list = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                     batch_axis=0)
            im_info_list = gluon.utils.split_and_load(batch[2], ctx_list=ctx,
                                                      batch_axis=0)
            with autograd.record():
                cls_loss_list = []
                bbox_loss_list = []
                label_list = []
                for data, gt_box, im_info in zip(data_list, gt_box_list,
                                                 im_info_list):
                    rpn_cls_prob, rpn_bbox_pred, labels, bbox_targets = net(
                        data, im_info, gt_box)
                    cls_loss, bbox_loss = rpn_loss(rpn_cls_prob,
                                                   rpn_bbox_pred, labels,
                                                   bbox_targets)
                    cls_loss_list.append(cls_loss)
                    bbox_loss_list.append(bbox_loss)
                    label_list.append(labels)
                autograd.backward(cls_loss_list + bbox_loss_list)
            trainer.step(batch_size)
            log_metric.update(label_list, cls_loss_list)
            smoothl1_metric.update(label_list, bbox_loss_list)
            if cfg.log_interval and not (i + 1) % cfg.log_interval:
                name1, loss1 = log_metric.get()
                name2, loss2 = smoothl1_metric.get()
                logger.info(
                    '[Epoch %d][Batch %d], Speed: %f samples/sec, '
                    '%s=%f, %s=%f' %
                    (epoch, i, batch_size / (time.time() - btic),
                     name1, loss1, name2, loss2))
                btic = time.time()
        name1, loss1 = log_metric.get()
        name2, loss2 = smoothl1_metric.get()
        logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f' % (
            epoch, (time.time() - tic), name1, loss1, name2, loss2))
        save_params(net, epoch, cfg.save_interval, cfg.save_prefix)
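
# Standalone illustration of the lr-decay while-loop used in train_rpn
# above: when training resumes at an epoch past several decay points, all
# pending decays are applied at once. The values here are made up.
lr, lr_decay, lr_steps = 0.01, 0.1, [14, 20]
epoch = 21  # e.g. resuming from a checkpoint
while lr_steps and epoch >= lr_steps[0]:
    lr *= lr_decay
    lr_steps.pop(0)
print(lr)  # ~0.0001 after both decays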
def train_faster_rcnn(net, train_data, val_data, cfg):
    """Training pipeline"""
    rpn_loss = RPNLoss(cfg.rpn_batch_size)
    rcnn_loss = RCNNLoss(cfg.roi_batch_size)
    rpn_loss.initialize(ctx=ctx)
    rcnn_loss.initialize(ctx=ctx)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': cfg.lr,
            'wd': cfg.wd,
            'momentum': cfg.momentum,
            'clip_gradient': 5
        })
    # lr decay policy
    lr_decay = float(cfg.lr_decay)
    lr_steps = sorted(cfg.lr_decay_epochs)
    # Create Metrics
    rpn_log_metric = LogLossMetric(name='RPNLogLoss',
                                   batch_size=cfg.rpn_batch_size)
    rpn_smoothl1_metric = SmoothL1LossMetric(name='RPNSmoothL1Loss',
                                             batch_size=cfg.rpn_batch_size)
    rcnn_log_metric = LogLossMetric(name='RCNNLogLoss',
                                    batch_size=cfg.roi_batch_size)
    rcnn_smoothl1_metric = SmoothL1LossMetric(name='RCNNSmoothL1Loss',
                                              batch_size=cfg.roi_batch_size)
    # New lists to store losses and labels for backward and metric updates
    rpn_cls_loss_list = []
    rpn_bbox_loss_list = []
    rcnn_cls_loss_list = []
    rcnn_bbox_loss_list = []
    logger.info('Config for end to end training FasterRCNN:\n%s' % cfg)
    logger.info('Start training from [Epoch %d]' % args.start_epoch)
    best_map = [0]
    for epoch in range(cfg.start_epoch, cfg.end_epoch):
        # When hybridize is true, set the network to train mode and reset
        # the proposal nms params, then clear and cache a new compute graph
        net.proposal.set_nms(cfg.rpn_pre_nms_top_n, cfg.rpn_post_nms_top_n)
        if cfg.hybridize:
            autograd.set_training(train_mode=True)
            net.hybridize()
        # Check and update learning rate
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        # Refresh time and metrics
        tic = time.time()
        btic = time.time()
        rpn_log_metric.reset()
        rpn_smoothl1_metric.reset()
        rcnn_log_metric.reset()
        rcnn_smoothl1_metric.reset()
        for i, batch in enumerate(train_data):
            # Empty lists
            rpn_cls_loss_list[:] = []
            rpn_bbox_loss_list[:] = []
            rcnn_cls_loss_list[:] = []
            rcnn_bbox_loss_list[:] = []
            # Split and load data for multi-gpu
            batch_size = batch[0].shape[0]
            data_list = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                                   batch_axis=0)
            gt_box_list = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                     batch_axis=0)
            im_info_list = gluon.utils.split_and_load(batch[2], ctx_list=ctx,
                                                      batch_axis=0)
            # Network forward
            with autograd.record():
                for data, gt_box, im_info in zip(data_list, gt_box_list,
                                                 im_info_list):
                    rpn_cls_prob, rpn_bbox_pred, rpn_label, rpn_bbox_target, \
                        rcnn_cls_prob, rcnn_bbox_pred, rcnn_label, \
                        rcnn_bbox_target = net(data, im_info, gt_box)
                    rpn_cls_loss, rpn_bbox_loss = \
                        rpn_loss(rpn_cls_prob, rpn_bbox_pred, rpn_label,
                                 rpn_bbox_target)
                    rcnn_cls_loss, rcnn_bbox_loss = \
                        rcnn_loss(rcnn_cls_prob, rcnn_bbox_pred, rcnn_label,
                                  rcnn_bbox_target)
                    rpn_cls_loss_list.append(rpn_cls_loss)
                    rpn_bbox_loss_list.append(rpn_bbox_loss)
                    rcnn_cls_loss_list.append(rcnn_cls_loss)
                    rcnn_bbox_loss_list.append(rcnn_bbox_loss)
                # Backward and update parameters and metrics
                autograd.backward(rpn_cls_loss_list + rpn_bbox_loss_list +
                                  rcnn_cls_loss_list + rcnn_bbox_loss_list)
            trainer.step(1)
            rpn_log_metric.update(preds=rpn_cls_loss_list)
            rpn_smoothl1_metric.update(preds=rpn_bbox_loss_list)
            rcnn_log_metric.update(preds=rcnn_cls_loss_list)
            rcnn_smoothl1_metric.update(preds=rcnn_bbox_loss_list)
            # Log training states
            if cfg.log_interval and not (i + 1) % cfg.log_interval:
                name1, loss1 = rpn_log_metric.get()
                name2, loss2 = rpn_smoothl1_metric.get()
                name3, loss3 = rcnn_log_metric.get()
                name4, loss4 = rcnn_smoothl1_metric.get()
                logger.info(
                    '[Epoch %d][Batch %d], Speed: %f samples/sec, '
                    '%s=%f, %s=%f, %s=%f, %s=%f' %
                    (epoch, i, batch_size / (time.time() - btic),
                     name1, loss1, name2, loss2, name3, loss3,
                     name4, loss4))
                btic = time.time()
        name1, loss1 = rpn_log_metric.get()
        name2, loss2 = rpn_smoothl1_metric.get()
        name3, loss3 = rcnn_log_metric.get()
        name4, loss4 = rcnn_smoothl1_metric.get()
        logger.info(
            '[Epoch %d] Training cost: %f, %s=%f, %s=%f, %s=%f, %s=%f' %
            (epoch, (time.time() - tic), name1, loss1, name2, loss2,
             name3, loss3, name4, loss4))
        map_name, mean_ap = validate_faster_rcnn(net, val_data, cfg)
        val_msg = '\n'.join(
            ['%s=%f' % (k, v) for k, v in zip(map_name, mean_ap)])
        logger.info('[Epoch %d] Validation: \n%s' % (epoch, val_msg))
        save_params(net, best_map, mean_ap[-1], epoch, cfg.save_interval,
                    cfg.save_prefix)
print("Using network architecture: ", opt.arch) if opt.mode == "symbolic": print("Mode: symbolic") if opt.flag_finetune: with warnings.catch_warnings(): warnings.simplefilter("ignore") opt.cur_epoch = int(opt.pretrained_path.split('.')[0][-4:]) params_path = opt.pretrained_path json_path = opt.pretrained_path[:-11] + "symbol.json" model = gluon.nn.SymbolBlock.imports(json_path, ['data'], params_path, ctx=ctx) else: opt.cur_epoch = 0 autograd.set_training(0) model = create_model(opt.arch, opt.heads, opt.head_conv, ctx=ctx) model.hybridize() autograd.set_training(1) else: print("Mode: imperative") opt.cur_epoch = 0 model = create_model(opt.arch, opt.heads, opt.head_conv, ctx=ctx) if opt.flag_finetune: model = load_model(model, opt.pretrained_path, ctx=ctx) #model = model.load_parameters(opt.pretrained_path, ctx=ctx, ignore_extra=True, allow_missing = True) opt.cur_epoch = int(opt.pretrained_path.split('.')[0][-4:]) elif opt.arch != "res_18": model.collect_params().initialize(init=init.Xavier(), ctx=ctx) """ 2. Dataset """ train_dataset, val_dataset = get_coco(opt, "./data/coco")
def dropout2(X, drop_rate):
    # Force training mode so that nd.Dropout actually drops units; outside
    # of autograd.record(), dropout would otherwise be an identity op.
    # Note: this leaves the global training mode enabled on return.
    autograd.set_training(True)
    Z = nd.zeros_like(X)
    nd.Dropout(X, p=drop_rate, out=Z)
    return Z
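
# A quick sanity check for dropout2 above, assuming mxnet is installed.
# MXNet's Dropout is "inverted": kept units are scaled by 1/(1 - p), so
# the output mean stays close to the input mean while roughly a fraction
# p of the entries become zero.
from mxnet import nd, autograd

X = nd.ones((1000, 100))
Z = dropout2(X, 0.5)
print('zeros: %.3f' % (Z == 0).mean().asscalar())  # ~0.5
print('mean : %.3f' % Z.mean().asscalar())         # ~1.0
autograd.set_training(False)  # dropout2 leaves training mode on; reset it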