Example #1
def test(net, val_data, ctx):
    """ Evaluate the result"""
    metric_acc = metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        outputs = []
        # only the first context's shard is used here
        data = data[0]
        label = label[0]
        for idx in range(data.shape[0]):
            outputs.append(net(data[idx]))
        metric_acc.update(label, outputs)

    return metric_acc.get()
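A minimal way to exercise this helper, assuming a trained `net` and a Gluon `DataLoader` as in Example #2 (all names here are placeholders):

import mxnet as mx

# Hypothetical usage of the `test` helper above; `net` and `val_data`
# are assumed to exist as in Example #2.
ctx = [mx.gpu(0)] if mx.context.num_gpus() > 0 else [mx.cpu()]
name, acc = test(net, val_data, ctx)  # metric.get() returns (name, value)
print('{} = {:.3f}'.format(name, acc))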
Example #2
def train_net(train_path, val_path, anno_file, num_class, batch_size,
              pretrained, pretrained_path, epochs, ctx, learning_rate,
              weight_decay, optimizer, momentum, lr_refactor_steps,
              lr_refactor_ratio, log_file, tensorboard, num_workers,
              per_device_batch_size):
    """ Training network """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # split training dataset into training and validation dataset
    train_anno_file, val_anno_file = split_image_dataset(
        train_path, val_path, anno_file)
    # load dataset
    train_data = DataLoader(eco_dataset.ImageNpyDataset(
        train_path, train_anno_file).transform_first(get_transform('train')),
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=num_workers)
    val_data = DataLoader(eco_dataset.ImageNpyDataset(
        val_path, val_anno_file).transform_first(get_transform('test')),
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers)

    # build network
    net = eco_full.eco_full()

    # pre-train model
    if pretrained:
        logger.info(
            "Start training from pretrained model {}".format(pretrained))
        params_file = get_latest_params_file(pretrained_path)
        if not params_file:
            logger.info(
                "No params file exists; the net will be initialized with Xavier")
            net.collect_params().initialize(mx.init.Xavier(), ctx)
            net.hybridize()
        else:
            #            logger.info("Initialize network by symbol parameters.")
            #            net = gluon.SymbolBlock.imports("eco_gluon_to_symbol-symbol.json",
            #                        ["data"], "eco_gluon_to_symbol-0000.params", ctx=mx.gpu())

            logger.info("Initialize network by %s" % params_file)
            net.load_parameters(
                '/home/lijie/ECO_Full_kinetics_pretrained/model/' +
                params_file, ctx)
            net.hybridize()
    else:
        net.collect_params().initialize(mx.init.Xavier(), ctx)
        net.hybridize()

    # learning rate refactor steps
    if lr_refactor_steps is None:
        decay_interval = max(1, epochs // 3)  # guard against a zero range step
        lr_refactor_steps = list(range(1, epochs, decay_interval))
    else:
        lr_refactor_steps = [
            int(i.strip()) for i in lr_refactor_steps.split(',')
        ]

    trainer = gluon.Trainer(net.collect_params(), optimizer, {
        'learning_rate': learning_rate,
        'momentum': momentum,
        'wd': weight_decay
    })

    metric_acc = metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_counter = 0
    num_batch = len(train_data)

    for epoch in range(epochs):
        epoch_start = time.time()
        if lr_counter < len(lr_refactor_steps) and epoch == lr_refactor_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate *
                                      lr_refactor_ratio)
            lr_counter += 1
        train_loss = 0
        metric_acc.reset()

        for i, batch in enumerate(train_data):
            batch_start = time.time()
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0,
                                              even_split=False)
            label = gluon.utils.split_and_load(batch[1],
                                               ctx_list=ctx,
                                               batch_axis=0,
                                               even_split=False)
            with ag.record():
                # print('data length : {}'.format(len(data)))
                outputs = []
                data = data[0]
                label = label[0]
                for idx in range(data.shape[0]):
                    outputs.append(net(data[idx]))
                loss = 0
                for yhat, y in zip(outputs, label):
                    loss = loss + mx.nd.mean(L(yhat, y))
                loss.backward()
            # for l in loss:
            #     l.backward()

            trainer.step(batch_size, ignore_stale_grad=True)
            # accumulate the mean per-sample loss; averaged over batches at epoch end
            batch_loss = loss.asscalar() / batch_size
            train_loss += batch_loss
            metric_acc.update(label, outputs)
            _, train_acc = metric_acc.get()
            # save parameters
            if i % 100 == 0 and i != 0:
                logger.info("Save parameters")
                net.save_parameters(
                    os.path.join(pretrained_path,
                                 'eco_net_iter_{}.params'.format(str(i))))
            logger.info(
                '[Epoch %d] Iter: %d, Train-acc: %.3f, loss: %.3f | time: %.1f'
                % (epoch, i, train_acc, batch_loss, time.time() - batch_start))

        _, train_acc = metric_acc.get()
        train_loss /= num_batch

        _, val_acc = test(net, val_data, ctx)

        logger.info(
            '[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f'
            %
            (epoch, train_acc, train_loss, val_acc, time.time() - epoch_start))
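The lr_refactor_steps handling above implements a simple step-decay schedule; a standalone sketch of the same logic, with illustrative values:

# Standalone sketch of the step-decay schedule used above (values assumed).
epochs, lr, lr_refactor_ratio = 30, 0.01, 0.1
lr_refactor_steps = [10, 20]  # e.g. parsed from the string "10,20"
lr_counter = 0
for epoch in range(epochs):
    if lr_counter < len(lr_refactor_steps) and epoch == lr_refactor_steps[lr_counter]:
        lr *= lr_refactor_ratio
        lr_counter += 1
# lr is 0.01 for epochs 0-9, 0.001 for 10-19, and 0.0001 from epoch 20 on.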
Example #3
def train_net(train_path, val_path, anno_file,
              num_class, batch_size,
              pretrained, pretrained_path, epochs,
              ctx, learning_rate, weight_decay,
              optimizer, momentum,
              lr_refactor_steps, lr_refactor_ratio,
              log_file, tensorboard,
              num_workers, per_device_batch_size):
    """ Training network """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if log_file:
        fh = logging.FileHandler(log_file)
        logger.addHandler(fh)

    # split training dataset into training and validation dataset
    train_anno_file, val_anno_file = split_image_dataset(train_path, val_path, anno_file)
    # load dataset
    train_data = DataLoader(
        eco_dataset.ImageNpyDataset(train_path, train_anno_file).transform_first(get_transform('train')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_data = DataLoader(
        eco_dataset.ImageNpyDataset(val_path, val_anno_file).transform_first(get_transform('test')),
        batch_size=batch_size, shuffle=True, num_workers=num_workers)

    # build network and initialize
    logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " construct network")
    net_path = '/home/lijie/ECO_Full_kinetics_pretrained/pretrained_models/'
    json_file = 'eco_full_with_63_classes-symbol.json'
    params_file = 'eco_full_with_63_classes-0000.params'
    # pre-train model
    if pretrained:
        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " Start training from pretrained model {}".format(pretrained))
        saved_params_file = get_latest_params_file(pretrained_path)
        if saved_params_file:
            logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " Initialize network by saved parameter file {}".format(saved_params_file))
            model_path = '/home/lijie/ECO_Full_kinetics_pretrained/model/'
            net = eco_full_symbol.eco_full(net_path + json_file, model_path + saved_params_file, ctx)
        else:
            logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " Initialized network by pre_trained parameter file {}".format(params_file))
            net = eco_full_symbol.eco_full(net_path + json_file, net_path + params_file, ctx)
    else:
        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " Just construct network with {}, and initialized network by Xavier.".format(json_file))
        net = eco_full_symbol.eco_full(net_path + json_file, None, ctx)
    net.hybridize()

    # learning rate refactor steps
    if lr_refactor_steps is None:
        decay_interval = max(1, epochs // 3)  # guard against a zero range step
        lr_refactor_steps = list(range(1, epochs, decay_interval))
    else:
        lr_refactor_steps = [int(i.strip()) for i in lr_refactor_steps.split(',')]

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': learning_rate, 'momentum': momentum, 'wd': weight_decay})

    metric_acc = metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_counter = 0
    num_batch = len(train_data)

    for epoch in range(epochs):
        epoch_start = time.time()
        if lr_counter < len(lr_refactor_steps) and epoch == lr_refactor_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate*lr_refactor_ratio)
            lr_counter += 1
        train_loss = 0
        metric_acc.reset()

        for i, batch in enumerate(train_data):
            batch_start = time.time()
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            with ag.record():
                outputs = []
                data = data[0]
                label = label[0]
                for idx in range(data.shape[0]):
                    outputs.append(net(data[idx]))
                loss = 0
                for yhat, y in zip(outputs, label):
                    loss = loss + mx.nd.mean(L(yhat, y))
                loss.backward()

            trainer.step(batch_size, ignore_stale_grad=True)
            batch_loss = loss.asscalar() / batch_size
            train_loss += batch_loss
            metric_acc.update(label, outputs)
            _, train_acc = metric_acc.get()
            # save parameters
            if i % 100 == 0 and i != 0:
                logger.info("Save parameters")
                net.save_parameters(os.path.join(pretrained_path, 'eco_net_iter_{}.params'.format(str(i))))
            logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' [Epoch %d] Iter: %d, Train-acc: %.3f, loss: %.3f | time: %.1f' % (epoch, i, train_acc, batch_loss, time.time() - batch_start))

            # Exporting the symbol-format network structure and parameters from
            # Gluon requires running at least one iteration after
            # block.hybridize() before net.export() can be called.
            # logger.info(" Export function export to symbol network and parameters. ")
            # net.export("eco_gluon_to_symbol")

        _, train_acc = metric_acc.get()
        train_loss /= num_batch

        _, val_acc = test(net, val_data, ctx)

        logger.info(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' [Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f' % (epoch, train_acc, train_loss, val_acc, time.time() - epoch_start))
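Both Gluon training loops above rely on gluon.utils.split_and_load; a minimal sketch of what it returns in the single-context case:

import mxnet as mx
from mxnet import gluon

# split_and_load slices a batch along batch_axis and copies each slice to one
# context in ctx_list; with a single context it returns a one-element list,
# which is why the loops above immediately take data[0] / label[0].
batch = mx.nd.arange(12).reshape((6, 2))
parts = gluon.utils.split_and_load(batch, ctx_list=[mx.cpu()],
                                   batch_axis=0, even_split=False)
print(len(parts), parts[0].shape)  # -> 1 (6, 2)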
Example #4
    # Initialize ship or no-ship detection network
    num_classes = 1
    print("Loading ship detection model ({})...".format(config["resnet_size"]))
    net = models.resnet(config["resnet_size"], num_classes)
    print(net)

    # Loss function: binary cross-entropy with logits. It expects logits, so
    # the output layer must return logits instead of probabilities
    criterion = torch.nn.BCEWithLogitsLoss()

    # Optimizer: adam
    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr_rate"])

    # If a model checkpoint has been specified try to load its weights
    start_epoch = 1
    metrics = metric.MetricList([metric.Accuracy()])
    if args.model_checkpoint:
        print("Loading weights from {}...".format(args.model_checkpoint))
        checkpoint = torch.load(args.model_checkpoint,
                                map_location=torch.device("cpu"))
        net.load_state_dict(checkpoint["model"])

        # If the --resume flag is specified, training will continue from the
        # checkpoint as if it was never aborted. Otherwise, training starts
        # from scratch using only the loaded weights
        if args.resume:
            start_epoch = checkpoint["epoch"] + 1
            optimizer.load_state_dict(checkpoint["optimizer"])
            metrics = checkpoint["metrics"]
            print("Resuming training from epoch {}: Metrics - {}".format(
                start_epoch, metrics))
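For reference, a checkpoint that this loading code can consume could be written as in the sketch below; the key names come from the loads above, while the path and surrounding variables are assumptions:

import torch

# Hypothetical save side matching the keys read above ("model", "epoch",
# "optimizer", "metrics"); `net`, `optimizer`, `metrics`, `epoch` and the
# output path are placeholders.
checkpoint = {
    "model": net.state_dict(),
    "epoch": epoch,
    "optimizer": optimizer.state_dict(),
    "metrics": metrics,
}
torch.save(checkpoint, "ship_detector_checkpoint.pth")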
Example #5
    def fit(self,
            train_iter,
            optimizer,
            lr_scheduler,
            eval_iter=None,
            metrics=metric.Accuracy(topk=1),
            epoch_start=0,
            epoch_end=10000,
            **kwargs):
        """
        checking
        """
        if kwargs:
            logging.warning("Unknown kwargs: {}".format(kwargs))

        # assert torch.cuda.is_available(), "only support GPU version"
        """
        start the main loop
        """
        pause_sec = 0.

        for i_epoch in range(epoch_start, epoch_end):
            self.callback_kwargs['epoch'] = i_epoch
            epoch_start_time = time.time()
            ###########
            # 1] TRAINING
            ###########
            metrics.reset()
            self.net.train()
            sum_sample_inst = 0
            sum_sample_elapse = 0.
            sum_update_elapse = 0
            batch_start_time = time.time()
            logging.info("Start epoch {:d}:".format(i_epoch))
            # for i_batch, (data, target,sampled_idx,vid_subpath) in enumerate(train_iter):
            for i_batch, (data, target) in enumerate(train_iter):
                self.callback_kwargs['batch'] = i_batch

                update_start_time = time.time()

                # [forward] making next step
                outputs, losses = self.forward(data, target)
                """
                for l in range(len(outputs)):
                    print("output.shape:::",outputs[l].shape)
                    
                """
                # [backward]
                optimizer.zero_grad()
                for loss in losses:
                    loss.backward()

                self.adjust_learning_rate(optimizer=optimizer,
                                          lr=lr_scheduler.update())
                optimizer.step()

                metrics.update([output.data.cpu() for output in outputs],
                               target.cpu(),
                               [loss.data.cpu() for loss in losses])

                # timing each batch
                sum_sample_elapse += time.time() - batch_start_time
                sum_update_elapse += time.time() - update_start_time
                batch_start_time = time.time()
                sum_sample_inst += data.shape[0]

                if (i_batch % self.step_callback_freq) == 0:
                    # retrieve eval results and reset metric
                    self.callback_kwargs['namevals'] = metrics.get_name_value()
                    metrics.reset()
                    # speed monitor
                    self.callback_kwargs['sample_elapse'] = sum_sample_elapse / sum_sample_inst
                    self.callback_kwargs['update_elapse'] = sum_update_elapse / sum_sample_inst
                    sum_update_elapse = 0
                    sum_sample_elapse = 0
                    sum_sample_inst = 0
                    # callbacks
                    self.step_end_callback()

            ###########
            # 2] END OF EPOCH
            ###########
            self.callback_kwargs['epoch_elapse'] = time.time() - epoch_start_time
            self.callback_kwargs['optimizer_dict'] = optimizer.state_dict()
            self.epoch_end_callback()

            ###########
            # 3] Evaluation
            ###########
            if (eval_iter is not None) \
                    and ((i_epoch + 1) % max(1, int(self.save_checkpoint_freq / 2))) == 0:
                logging.info("Start evaluating epoch {:d}:".format(i_epoch))

                metrics.reset()
                self.net.eval()
                sum_sample_elapse = 0.
                sum_sample_inst = 0
                sum_forward_elapse = 0.
                batch_start_time = time.time()
                for i_batch, (data, target) in enumerate(eval_iter):
                    self.callback_kwargs['batch'] = i_batch

                    forward_start_time = time.time()

                    outputs, losses = self.forward(data, target)

                    metrics.update([output.data.cpu() for output in outputs],
                                   target.cpu(),
                                   [loss.data.cpu() for loss in losses])

                    sum_forward_elapse += time.time() - forward_start_time
                    sum_sample_elapse += time.time() - batch_start_time
                    batch_start_time = time.time()
                    sum_sample_inst += data.shape[0]

                # evaluation callbacks
                self.callback_kwargs['sample_elapse'] = sum_sample_elapse / sum_sample_inst
                self.callback_kwargs['update_elapse'] = sum_forward_elapse / sum_sample_inst
                self.callback_kwargs['namevals'] = metrics.get_name_value()
                self.step_end_callback()

        logging.info("Optimization done!")
Example #6
    # initialize the dynamic model
    net = model(net=sym_c3d,
                optimizer=optimizer,
                criterion=torch.nn.CrossEntropyLoss().cuda())

    # load the pretrained model
    if resume:
        net.load_checkpoint(epoch=load_epoch)
    elif pretrained:
        pretrained_model_state_dic = GetPretrainedModel(name='resnet')
        net.load_state(state_dic=pretrained_model_state_dic, strict=False)
    else:
        logging.info("Train from scratch using random initialization")

    # prepare optimization
    metrics = metric.MetricList(metric.Accuracy(topk=1, name="acc-top1"),
                                metric.Accuracy(topk=5, name="acc-top5"))
    lr_scheduler = MultiFactorScheduler(steps=[300, 1000],
                                        base_lr=0.1,
                                        factor=0.1)
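    # (Assumption: MultiFactorScheduler multiplies the learning rate by
    # `factor` each time its step counter passes an entry in `steps`, i.e.
    # lr = 0.1 until step 300, 0.01 until step 1000, 0.001 afterwards.)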

    tr_iter, ts_iter = dataiter_factory.creat(
        name='ucf101',
        data_root='../../dataset/UCF101',
        batch_size=1,
    )

    net.fit(
        iter_train=tr_iter,
        metrics_train=metrics,
        epoch_start=0,
Example #7
def test(data_loader, model, opt, class_names):
    print('test')

    model.eval()

    # eval metrics
    metrics = metric.MetricList(
        metric.Accuracy(topk=1, name="top1"),
        metric.Accuracy(topk=2, name="top2"),
        metric.Accuracy(topk=3, name="top3"),
        metric.Accuracy(topk=4, name="top4"),
        metric.Accuracy(topk=5, name="top5"),
    )
    metrics.reset()

    avg_score = {}
    sum_batch_elapse = 0.
    sum_batch_inst = 0
    duplication = 1
    total_round = 1

    out_target = []
    out_output = []

    with open('datasets/template.csv', 'r') as f:
        template_sample = {}
        for line in f.readlines():
            name = line.split(',')[0]
            template_sample[name] = -1

    interval = max(1, len(data_loader) // 10)  # avoid modulo by zero on tiny loaders
    for i_round in range(total_round):

        i_batch = 0
        print("round #{}/{}".format(i_round, total_round))

        with torch.no_grad():
            for i, (inputs, targets, bbox) in enumerate(data_loader):
                # data_time.update(time.time() - end_time)
                batch_start_time = time.time()
                targets_ori = targets[0].cuda()

                if opt.model == 'slowfast':
                    slow = inputs[:, :, ::8, :, :]
                    fast = inputs[:, :, ::2, :, :]
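                    # (Assumption: the stride-8 and stride-2 temporal slices
                    # form the slow and fast pathways of a SlowFast model,
                    # e.g. 64 input frames become 8 slow and 32 fast frames.)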
                    outputs = model([slow, fast])
                else:
                    outputs = model(inputs)

                output_np = outputs.data.cpu().numpy()
                target_np = targets_ori.data.cpu().numpy()
                out_output.append(output_np)
                out_target.append(target_np[:, np.newaxis])

                sum_batch_elapse += time.time() - batch_start_time
                sum_batch_inst += 1
                if not opt.no_softmax_in_test:
                    outputs = F.softmax(outputs, dim=1)

                outputs = outputs.data.cpu()
                # targets = targets.cpu()

                for i_item in range(0, outputs.shape[0]):
                    output_i = outputs[i_item, :].view(1, -1)
                    target_i = torch.LongTensor([targets[0][i_item]])
                    video_subpath_i = targets[1][i_item]
                    if video_subpath_i in avg_score:
                        avg_score[video_subpath_i][1] += output_i
                        avg_score[video_subpath_i][2] += 1
                        duplication = 0.92 * duplication + 0.08 * avg_score[
                            video_subpath_i][2]
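                        # (Assumption: `duplication` tracks an exponential
                        # moving average of clips seen per video; it is only
                        # used in the progress printout below.)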
                    else:
                        avg_score[video_subpath_i] = [
                            torch.LongTensor(target_i.numpy().copy()),
                            torch.FloatTensor(output_i.numpy().copy()), 1
                        ]  # the last one is counter

                # show progress
                if (i_batch % interval) == 0:
                    metrics.reset()
                    for _, video_info in avg_score.items():
                        target, pred, _ = video_info
                        metrics.update([pred], target)
                    name_value = metrics.get_name_value()
                    print(
                        "{:.1f}%, {:.1f} \t| Batch [0,{}]    \tAvg: {} = {:.5f}, {} = {:.5f}".format(
                            float(100 * i_batch) / len(data_loader),
                            duplication,
                            i_batch,
                            name_value[0][0][0], name_value[0][0][1],
                            name_value[1][0][0], name_value[1][0][1]))
                i_batch += 1

        # finished
        print("Evaluation one epoch Finished!")

        # savefig
        output_array = np.concatenate(out_output, axis=0)
        target_array = np.concatenate(out_target, axis=0)
        if opt.annotation_path.endswith('split.json'):
            name = 'AUTSL_' + opt.model + '.npy'
            pkl_name = 'AUTSL_' + opt.model + '2_all.pkl'
        else:
            name = 'AUTSL_' + opt.model + '_all.npy'
            pkl_name = 'AUTSL_' + opt.model + '_all.pkl'
        # np.save(os.path.join(name), output_array, allow_pickle=False)

        import pickle
        with open(pkl_name, 'wb') as f:
            pickle.dump(avg_score, f)

        metrics.reset()
        class_num = {}
        class_acc = {}
        for _, video_info in avg_score.items():
            # total video
            target, pred, _ = video_info
            metrics.update([pred], target)

            # class acc
            if target.item() not in class_num:
                class_num[target.item()] = 1
            else:
                class_num[target.item()] += 1

            _, pred_topk = pred.topk(1, 1, True, True)

            pred_topk = pred_topk.t()
            correct = pred_topk.eq(target.view(1, -1).expand_as(pred_topk))
            if target.item() not in class_acc:
                # class_acc[target.item()] = correct.item()
                class_acc[target.item()] = float(
                    correct.view(-1).float().sum(0, keepdim=True).numpy())
            else:
                # class_acc[target.item()] += correct.item()
                class_acc[target.item()] += float(
                    correct.view(-1).float().sum(0, keepdim=True).numpy())

        for video_name, video_info in avg_score.items():
            target, pred, _ = video_info
            template_sample[video_name] = torch.argmax(pred).item()
        # with open('predictions.csv', 'w') as f2:
        #     for k, v in template_sample.items():
        #         line = k + ',' + str(v) + '\n'
        #         f2.writelines(line)

        print("Total time cost: {:.1f} sec".format(sum_batch_elapse))
        print("Speed: {:.4f} samples/sec".format(
            opt.batch_size * sum_batch_inst / sum_batch_elapse))
        print("Accuracy:")
        print(json.dumps(metrics.get_name_value(), indent=4, sort_keys=True))
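The per-class bookkeeping in this example reduces to a per-video top-1 check; a condensed, self-contained sketch of that computation (shapes assumed: pred is 1 x num_classes, target a 1-element LongTensor):

import torch

# Condensed sketch of the top-1 check performed above.
pred = torch.tensor([[0.1, 0.7, 0.2]])
target = torch.tensor([1])
_, pred_topk = pred.topk(1, 1, True, True)      # index of the best class
correct = pred_topk.t().eq(target.view(1, -1))  # True if prediction matches
print(float(correct.view(-1).float().sum()))    # -> 1.0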