def test(net, val_data, ctx):
    """Evaluate the network on the validation set."""
    metric_acc = metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                          batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                           batch_axis=0, even_split=False)
        outputs = []
        data = data[0]
        label = label[0]
        for idx in range(data.shape[0]):
            # index with the per-sample counter `idx`, not the batch counter `i`
            outputs.append(net(data[idx]))
        metric_acc.update(label, outputs)
    return metric_acc.get()
def train_net(train_path, val_path, anno_file, num_class, batch_size,
              pretrained, pretrained_path, epochs, ctx, learning_rate,
              weight_decay, optimizer, momentum, lr_refactor_steps,
              lr_refactor_ratio, log_file, tensorboard, num_workers,
              per_device_batch_size):
    """Train the network."""
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # split the dataset into training and validation annotation files
    train_anno_file, val_anno_file = split_image_dataset(
        train_path, val_path, anno_file)

    # load datasets
    train_data = DataLoader(
        eco_dataset.ImageNpyDataset(
            train_path, train_anno_file).transform_first(get_transform('train')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_data = DataLoader(
        eco_dataset.ImageNpyDataset(
            val_path, val_anno_file).transform_first(get_transform('test')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)

    # build network
    net = eco_full.eco_full()

    # initialize from a pretrained model when requested
    if pretrained:
        logger.info("Start training from pretrained model {}".format(pretrained))
        params_file = get_latest_params_file(pretrained_path)
        if not params_file:
            logger.info("No params file exists, the net will be initialized by Xavier")
            net.collect_params().initialize(mx.init.Xavier(), ctx)
            net.hybridize()
        else:
            # logger.info("Initialize network by symbol parameters.")
            # net = gluon.SymbolBlock.imports("eco_gluon_to_symbol-symbol.json", ["data"],
            #                                 "eco_gluon_to_symbol-0000.params", ctx=mx.gpu())
            logger.info("Initialize network by %s" % params_file)
            net.load_parameters(
                '/home/lijie/ECO_Full_kinetics_pretrained/model/' + params_file, ctx)
            net.hybridize()
    else:
        net.collect_params().initialize(mx.init.Xavier(), ctx)
        net.hybridize()

    # learning rate refactor steps
    if lr_refactor_steps is None:
        decay_interval = int(epochs / 3)
        lr_refactor_steps = [i for i in range(1, epochs, decay_interval)]
    else:
        lr_refactor_steps = [int(i.strip()) for i in lr_refactor_steps.split(',')]

    trainer = gluon.Trainer(net.collect_params(), optimizer, {
        'learning_rate': learning_rate,
        'momentum': momentum,
        'wd': weight_decay
    })
    metric_acc = metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_counter = 0
    num_batch = len(train_data)
    for epoch in range(epochs):
        epoch_start = time.time()
        if lr_counter < len(lr_refactor_steps) and epoch == lr_refactor_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * lr_refactor_ratio)
            lr_counter += 1
        train_loss = 0
        metric_acc.reset()
        for i, batch in enumerate(train_data):
            batch_start = time.time()
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                              batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                               batch_axis=0, even_split=False)
            with ag.record():
                outputs = []
                data = data[0]
                label = label[0]
                for idx in range(data.shape[0]):
                    outputs.append(net(data[idx]))
                loss = 0
                for yhat, y in zip(outputs, label):
                    loss = loss + mx.nd.mean(L(yhat, y))
            loss.backward()
            trainer.step(batch_size, ignore_stale_grad=True)
            # accumulate the batch loss so the per-epoch average below is meaningful
            train_loss += loss.mean().asscalar() / batch_size
            metric_acc.update(label, outputs)
            _, train_acc = metric_acc.get()
            # periodically save parameters and report progress
            if i % 100 == 0 and i != 0:
                logger.info("Save parameters")
                net.save_parameters(
                    os.path.join(pretrained_path,
                                 'eco_net_iter_{}.params'.format(str(i))))
                logger.info(
                    '[Epoch %d] Iter: %d, Train-acc: %.3f, loss: %.3f | time: %.1f'
                    % (epoch, i, train_acc, train_loss, time.time() - batch_start))
        _, train_acc = metric_acc.get()
        train_loss /= num_batch
        _, val_acc = test(net, val_data, ctx)
        logger.info(
            '[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f'
            % (epoch, train_acc, train_loss, val_acc, time.time() - epoch_start))
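# For reference, the default step schedule above decays the learning rate three
# times, evenly spaced over training. A quick sketch of the arithmetic (the
# `epochs` value here is only illustrative, not from the script):
#
#     epochs = 9
#     lr_refactor_ratio = 0.1
#     decay_interval = int(epochs / 3)                                   # 3
#     lr_refactor_steps = [i for i in range(1, epochs, decay_interval)]  # [1, 4, 7]
#     # At the start of epochs 1, 4 and 7 the trainer's learning rate is
#     # multiplied by lr_refactor_ratio, i.e. lr -> lr * 0.1 each time.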
def train_net(train_path, val_path, anno_file, num_class, batch_size,
              pretrained, pretrained_path, epochs, ctx, learning_rate,
              weight_decay, optimizer, momentum, lr_refactor_steps,
              lr_refactor_ratio, log_file, tensorboard, num_workers,
              per_device_batch_size):
    """Train the network (symbol-based variant)."""
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # split the dataset into training and validation annotation files
    train_anno_file, val_anno_file = split_image_dataset(train_path, val_path, anno_file)

    # load datasets
    train_data = DataLoader(
        eco_dataset.ImageNpyDataset(
            train_path, train_anno_file).transform_first(get_transform('train')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_data = DataLoader(
        eco_dataset.ImageNpyDataset(
            val_path, val_anno_file).transform_first(get_transform('test')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)

    # build network and initialize
    logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " construct network")
    net_path = '/home/lijie/ECO_Full_kinetics_pretrained/pretrained_models/'
    json_file = 'eco_full_with_63_classes-symbol.json'
    params_file = 'eco_full_with_63_classes-0000.params'

    # initialize from a pretrained model when requested
    if pretrained:
        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                    " Start training from pretrained model {}".format(pretrained))
        saved_params_file = get_latest_params_file(pretrained_path)
        if saved_params_file:
            logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                        " Initialize network by saved parameter file {}".format(saved_params_file))
            model_path = '/home/lijie/ECO_Full_kinetics_pretrained/model/'
            net = eco_full_symbol.eco_full(net_path + json_file,
                                           model_path + saved_params_file, ctx)
        else:
            logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                        " Initialize network by pretrained parameter file {}".format(params_file))
            net = eco_full_symbol.eco_full(net_path + json_file,
                                           net_path + params_file, ctx)
    else:
        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                    " Construct network with {} and initialize it by Xavier.".format(json_file))
        net = eco_full_symbol.eco_full(net_path + json_file, None, ctx)
    net.hybridize()

    # learning rate refactor steps
    if lr_refactor_steps is None:
        decay_interval = int(epochs / 3)
        lr_refactor_steps = [i for i in range(1, epochs, decay_interval)]
    else:
        lr_refactor_steps = [int(i.strip()) for i in lr_refactor_steps.split(',')]

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': learning_rate,
                             'momentum': momentum,
                             'wd': weight_decay})
    metric_acc = metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_counter = 0
    num_batch = len(train_data)
    for epoch in range(epochs):
        epoch_start = time.time()
        if lr_counter < len(lr_refactor_steps) and epoch == lr_refactor_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * lr_refactor_ratio)
            lr_counter += 1
        train_loss = 0
        metric_acc.reset()
        for i, batch in enumerate(train_data):
            batch_start = time.time()
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                              batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                               batch_axis=0, even_split=False)
            with ag.record():
                outputs = []
                data = data[0]
                label = label[0]
                for idx in range(data.shape[0]):
                    outputs.append(net(data[idx]))
                loss = 0
                for yhat, y in zip(outputs, label):
                    loss = loss + mx.nd.mean(L(yhat, y))
            loss.backward()
            trainer.step(batch_size, ignore_stale_grad=True)
            # accumulate the batch loss so the per-epoch average below is meaningful
            train_loss += loss.mean().asscalar() / batch_size
            metric_acc.update(label, outputs)
            _, train_acc = metric_acc.get()
            # periodically save parameters and report progress
            if i % 100 == 0 and i != 0:
                logger.info("Save parameters")
                net.save_parameters(
                    os.path.join(pretrained_path,
                                 'eco_net_iter_{}.params'.format(str(i))))
                logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                            ' [Epoch %d] Iter: %d, Train-acc: %.3f, loss: %.3f | time: %.1f'
                            % (epoch, i, train_acc, train_loss, time.time() - batch_start))
            # Exporting a Gluon network to a symbol graph and parameters requires running
            # at least one iteration after block.hybridize(); only then can export() work.
            # logger.info(" Export network to symbol graph and parameters. ")
            # net.export("eco_gluon_to_symbol")
        _, train_acc = metric_acc.get()
        train_loss /= num_batch
        _, val_acc = test(net, val_data, ctx)
        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
                    ' [Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f'
                    % (epoch, train_acc, train_loss, val_acc, time.time() - epoch_start))
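# As the export comment above notes, a hybridized Gluon block can only be exported to a
# symbol JSON/params pair after at least one forward pass. A minimal, self-contained sketch;
# the small dense block and input shape are stand-ins, not the actual ECO network:

import mxnet as mx
from mxnet import gluon, nd

demo_net = gluon.nn.HybridSequential()
demo_net.add(gluon.nn.Dense(10))
demo_net.initialize(mx.init.Xavier())
demo_net.hybridize()

# The symbolic graph is only cached after a forward pass, so run a dummy batch first.
demo_net(nd.zeros((1, 16)))
demo_net.export("eco_gluon_to_symbol", epoch=0)  # writes *-symbol.json and *-0000.params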
# Initialize the ship / no-ship detection network
num_classes = 1
print("Loading ship detection model ({})...".format(config["resnet_size"]))
net = models.resnet(config["resnet_size"], num_classes)
print(net)

# Loss function: binary cross entropy with logits. It expects logits, so the
# output layer must return logits instead of probabilities.
criterion = torch.nn.BCEWithLogitsLoss()

# Optimizer: Adam
optimizer = torch.optim.Adam(net.parameters(), lr=config["lr_rate"])

# If a model checkpoint has been specified, try to load its weights
start_epoch = 1
metrics = metric.MetricList([metric.Accuracy()])
if args.model_checkpoint:
    print("Loading weights from {}...".format(args.model_checkpoint))
    checkpoint = torch.load(args.model_checkpoint, map_location=torch.device("cpu"))
    net.load_state_dict(checkpoint["model"])

    # If the --resume flag is specified, training continues from the checkpoint
    # as if it had never been aborted. Otherwise, training keeps only the loaded
    # weights and starts from scratch.
    if args.resume:
        start_epoch = checkpoint["epoch"] + 1
        optimizer.load_state_dict(checkpoint["optimizer"])
        metrics = checkpoint["metrics"]
        print("Resuming training from epoch {}: Metrics - {}".format(start_epoch, metrics))
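# To illustrate the BCEWithLogitsLoss comment above: the loss applies the sigmoid
# internally, so it is fed raw logits and is numerically equivalent to (but more
# stable than) BCELoss on sigmoid outputs. A small sketch with made-up values:

import torch

logits = torch.tensor([[0.3], [-1.2]])   # raw model outputs, no sigmoid applied
targets = torch.tensor([[1.0], [0.0]])

loss_logits = torch.nn.BCEWithLogitsLoss()(logits, targets)
loss_probs = torch.nn.BCELoss()(torch.sigmoid(logits), targets)
assert torch.allclose(loss_logits, loss_probs)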
def fit(self, train_iter, optimizer, lr_scheduler, eval_iter=None,
        metrics=metric.Accuracy(topk=1), epoch_start=0, epoch_end=10000,
        **kwargs):

    """ checking """
    if kwargs:
        logging.warning("Unknown kwargs: {}".format(kwargs))
    # assert torch.cuda.is_available(), "only support GPU version"

    """ start the main loop """
    pause_sec = 0.
    for i_epoch in range(epoch_start, epoch_end):
        self.callback_kwargs['epoch'] = i_epoch
        epoch_start_time = time.time()

        ###########
        # 1] TRAINING
        ###########
        metrics.reset()
        self.net.train()
        sum_sample_inst = 0
        sum_sample_elapse = 0.
        sum_update_elapse = 0.
        batch_start_time = time.time()
        logging.info("Start epoch {:d}:".format(i_epoch))
        # for i_batch, (data, target, sampled_idx, vid_subpath) in enumerate(train_iter):
        for i_batch, (data, target) in enumerate(train_iter):
            self.callback_kwargs['batch'] = i_batch
            update_start_time = time.time()

            # [forward] making next step
            outputs, losses = self.forward(data, target)

            # [backward]
            optimizer.zero_grad()
            for loss in losses:
                loss.backward()
            self.adjust_learning_rate(optimizer=optimizer, lr=lr_scheduler.update())
            optimizer.step()

            metrics.update([output.data.cpu() for output in outputs],
                           target.cpu(),
                           [loss.data.cpu() for loss in losses])

            # timing each batch
            sum_sample_elapse += time.time() - batch_start_time
            sum_update_elapse += time.time() - update_start_time
            batch_start_time = time.time()
            sum_sample_inst += data.shape[0]

            if (i_batch % self.step_callback_freq) == 0:
                # retrieve eval results and reset metric
                self.callback_kwargs['namevals'] = metrics.get_name_value()
                metrics.reset()
                # speed monitor
                self.callback_kwargs['sample_elapse'] = sum_sample_elapse / sum_sample_inst
                self.callback_kwargs['update_elapse'] = sum_update_elapse / sum_sample_inst
                sum_update_elapse = 0.
                sum_sample_elapse = 0.
                sum_sample_inst = 0
                # callbacks
                self.step_end_callback()

        ###########
        # 2] END OF EPOCH
        ###########
        self.callback_kwargs['epoch_elapse'] = time.time() - epoch_start_time
        self.callback_kwargs['optimizer_dict'] = optimizer.state_dict()
        self.epoch_end_callback()

        ###########
        # 3] Evaluation
        ###########
        if (eval_iter is not None) \
                and ((i_epoch + 1) % max(1, int(self.save_checkpoint_freq / 2))) == 0:
            logging.info("Start evaluating epoch {:d}:".format(i_epoch))
            metrics.reset()
            self.net.eval()
            sum_sample_elapse = 0.
            sum_sample_inst = 0
            sum_forward_elapse = 0.
            batch_start_time = time.time()
            for i_batch, (data, target) in enumerate(eval_iter):
                self.callback_kwargs['batch'] = i_batch
                forward_start_time = time.time()

                outputs, losses = self.forward(data, target)

                metrics.update([output.data.cpu() for output in outputs],
                               target.cpu(),
                               [loss.data.cpu() for loss in losses])

                sum_forward_elapse += time.time() - forward_start_time
                sum_sample_elapse += time.time() - batch_start_time
                batch_start_time = time.time()
                sum_sample_inst += data.shape[0]

            # evaluation callbacks
            self.callback_kwargs['sample_elapse'] = sum_sample_elapse / sum_sample_inst
            self.callback_kwargs['update_elapse'] = sum_forward_elapse / sum_sample_inst
            self.callback_kwargs['namevals'] = metrics.get_name_value()
            self.step_end_callback()

    logging.info("Optimization done!")
# initialization of the dynamic model
net = model(net=sym_c3d,
            optimizer=optimizer,
            criterion=torch.nn.CrossEntropyLoss().cuda())

# load the pretrained model
if resume:
    net.load_checkpoint(epoch=load_epoch)
elif pretained:
    pretrained_model_state_dic = GetPretrainedModel(name='resnet')
    net.load_state(state_dic=pretrained_model_state_dic, strict=False)
else:
    logging.info("Train from scratch using random initialization")

# prepare optimization
metrics = metric.MetricList(metric.Accuracy(topk=1, name="acc-top1"),
                            metric.Accuracy(topk=5, name="acc-top5"))
lr_scheduler = MultiFactorScheduler(steps=[300, 1000], base_lr=0.1, factor=0.1)

tr_iter, ts_iter = dataiter_factory.creat(
    name='ucf101',
    data_root='../../dataset/UCF101',
    batch_size=1,
)

net.fit(
    iter_train=tr_iter,
    metrics_train=metrics,
    epoch_start=0,
def test(data_loader, model, opt, class_names):
    print('test')
    model.eval()

    # eval metrics
    metrics = metric.MetricList(
        metric.Accuracy(topk=1, name="top1"),
        metric.Accuracy(topk=2, name="top2"),
        metric.Accuracy(topk=3, name="top3"),
        metric.Accuracy(topk=4, name="top4"),
        metric.Accuracy(topk=5, name="top5"),
    )
    metrics.reset()

    avg_score = {}
    sum_batch_elapse = 0.
    sum_batch_inst = 0
    duplication = 1
    total_round = 1
    out_target = []
    out_output = []

    with open('datasets/template.csv', 'r') as f:
        template_sample = {}
        for line in f.readlines():
            name = line.split(',')[0]
            template_sample[name] = -1

    interval = len(data_loader) // 10

    for i_round in range(total_round):
        i_batch = 0
        print("round #{}/{}".format(i_round, total_round))
        with torch.no_grad():
            for i, (inputs, targets, bbox) in enumerate(data_loader):
                batch_start_time = time.time()
                targets_ori = targets[0].cuda()

                if opt.model == 'slowfast':
                    # SlowFast takes two pathways sampled at different temporal strides
                    slow = inputs[:, :, ::8, :, :]
                    fast = inputs[:, :, ::2, :, :]
                    outputs = model([slow, fast])
                else:
                    outputs = model(inputs)

                output_np = outputs.data.cpu().numpy()
                target_np = targets_ori.data.cpu().numpy()
                out_output.append(output_np)
                out_target.append(target_np[:, np.newaxis])

                sum_batch_elapse += time.time() - batch_start_time
                sum_batch_inst += 1

                if not opt.no_softmax_in_test:
                    outputs = F.softmax(outputs, dim=1)
                outputs = outputs.data.cpu()

                # accumulate per-clip scores into per-video scores
                for i_item in range(0, outputs.shape[0]):
                    output_i = outputs[i_item, :].view(1, -1)
                    target_i = torch.LongTensor([targets[0][i_item]])
                    video_subpath_i = targets[1][i_item]
                    if video_subpath_i in avg_score:
                        avg_score[video_subpath_i][1] += output_i
                        avg_score[video_subpath_i][2] += 1
                        duplication = 0.92 * duplication + 0.08 * avg_score[video_subpath_i][2]
                    else:
                        avg_score[video_subpath_i] = [
                            torch.LongTensor(target_i.numpy().copy()),
                            torch.FloatTensor(output_i.numpy().copy()),
                            1
                        ]  # the last element is a counter

                # show progress
                if (i_batch % interval) == 0:
                    metrics.reset()
                    for _, video_info in avg_score.items():
                        target, pred, _ = video_info
                        metrics.update([pred], target)
                    name_value = metrics.get_name_value()
                    print("{:.1f}%, {:.1f} \t| Batch [0,{}] \tAvg: {} = {:.5f}, {} = {:.5f}".format(
                        float(100 * i_batch) / len(data_loader),
                        duplication,
                        i_batch,
                        name_value[0][0][0], name_value[0][0][1],
                        name_value[1][0][0], name_value[1][0][1]))
                i_batch += 1

    # finished
    print("Evaluation of one epoch finished!")

    # save raw outputs and per-video scores
    output_array = np.concatenate(out_output, axis=0)
    target_array = np.concatenate(out_target, axis=0)
    if opt.annotation_path.endswith('split.json'):
        name = 'AUTSL_' + opt.model + '.npy'
        pkl_name = 'AUTSL_' + opt.model + '2_all.pkl'
    else:
        name = 'AUTSL_' + opt.model + '_all.npy'
        pkl_name = 'AUTSL_' + opt.model + '_all.pkl'
    # np.save(os.path.join(name), output_array, allow_pickle=False)
    import pickle
    with open(pkl_name, 'wb') as f:
        pickle.dump(avg_score, f)

    metrics.reset()
    class_num = {}
    class_acc = {}
    for _, video_info in avg_score.items():
        # overall video-level accuracy
        target, pred, _ = video_info
        metrics.update([pred], target)
        # per-class sample counts and accuracy
        if target.item() not in class_num:
            class_num[target.item()] = 1
        else:
            class_num[target.item()] += 1
        _, pred_topk = pred.topk(1, 1, True, True)
        pred_topk = pred_topk.t()
        correct = pred_topk.eq(target.view(1, -1).expand_as(pred_topk))
        if target.item() not in class_acc:
            class_acc[target.item()] = float(
                correct.view(-1).float().sum(0, keepdim=True).numpy())
        else:
            class_acc[target.item()] += float(
                correct.view(-1).float().sum(0, keepdim=True).numpy())

    for video_name, video_info in avg_score.items():
        target, pred, _ = video_info
        template_sample[video_name] = torch.argmax(pred).item()
    # with open('predictions.csv', 'w') as f2:
    #     for k, v in template_sample.items():
    #         line = k + ',' + str(v) + '\n'
    #         f2.writelines(line)

    print("Total time cost: {:.1f} sec".format(sum_batch_elapse))
    print("Speed: {:.4f} samples/sec".format(
        opt.batch_size * sum_batch_inst / sum_batch_elapse))
    print("Accuracy:")
    print(json.dumps(metrics.get_name_value(), indent=4, sort_keys=True))
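# For clarity, the per-class bookkeeping above uses the usual topk/eq idiom to check
# whether the accumulated video score predicts the right class. A tiny sketch with
# made-up numbers (three classes, one video):

import torch

pred = torch.tensor([[0.1, 0.7, 0.2]])   # accumulated class scores, shape (1, num_classes)
target = torch.tensor([1])               # ground-truth class index

_, pred_topk = pred.topk(1, 1, True, True)                       # index of the best class
pred_topk = pred_topk.t()
correct = pred_topk.eq(target.view(1, -1).expand_as(pred_topk))  # True if top-1 matches
print(float(correct.view(-1).float().sum(0, keepdim=True)))      # 1.0 -> correctly classified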