Example 1
# Imports used by the functions below; `metadata` and `roi_feature_model`
# are project-local modules referenced throughout this example.
import os
from os import listdir

import cv2
import numpy as np
import scipy.misc
import torch
import torchvision

import metadata
import roi_feature_model

def get_model(paths, feature_type):
    if feature_type == 'vgg':
        feature_network = roi_feature_model.Vgg16(
            num_classes=len(metadata.action_classes))
    elif feature_type == 'resnet':
        feature_network = roi_feature_model.Resnet152(
            num_classes=len(metadata.action_classes))
    elif feature_type == 'densenet':
        feature_network = roi_feature_model.Densenet(
            num_classes=len(metadata.action_classes))
    else:
        raise ValueError('feature type not recognized')

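    # AlexNet and VGG hold most of their parameters in the fully connected
    # classifier, so only the convolutional features are data-parallelized
    # (the same convention as the official PyTorch ImageNet example).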
    if feature_type.startswith('alexnet') or feature_type.startswith('vgg'):
        feature_network.features = torch.nn.DataParallel(
            feature_network.features)
        feature_network.cuda()
    else:
        feature_network = torch.nn.DataParallel(feature_network).cuda()

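    # Load the best checkpoint produced by finetuning this backbone on HICO.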
    checkpoint_dir = os.path.join(paths.tmp_root, 'checkpoints', 'hico',
                                  'finetune_{}'.format(feature_type))
    best_model_file = os.path.join(checkpoint_dir, 'model_best.pth')
    checkpoint = torch.load(best_model_file)
    feature_network.load_state_dict(checkpoint['state_dict'])
    return feature_network
def extract_node_features(paths, mode):
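    # Crop each annotated bounding box out of every frame and run it through
    # the backbone, storing one 1000-d feature per node, one array per video.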

    input_h, input_w = 224, 224
    node_feature_len = 1000
    transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                torchvision.transforms.Normalize(metadata.train_mean_value,
                                                                                 metadata.train_std_value)])
    feature_network = roi_feature_model.Resnet152(num_classes=len(metadata.node_classes))
    feature_network = torch.nn.DataParallel(feature_network).cuda()
    feature_network.eval()  # inference only: freeze batch-norm statistics during extraction

    # get the processed annotation and corresponding original image

    ant_files = [f for f in sorted(listdir(os.path.join(paths.data_root, mode, 'ant_processed')))
                 if f.endswith('_ant_all.npy')]

    #for ant_file_ind in [3]:

    for ant_file_ind in range(len(ant_files)):
        ant_f=ant_files[ant_file_ind]

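        # Strip the 'vid_' prefix and the '_ant_all.npy' suffix to get the video id.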
        vid = ant_f[4:-12]

        # if os.path.isfile(os.path.join(paths.data_root, mode, 'node_feature_1000', 'vid_{}_resnet_node_feature.npy'.format(vid))):
        #     continue


        print('node feature vid {}'.format(vid))

        ant_all = np.load(os.path.join(paths.data_root, mode, 'ant_processed', ant_f),
                          allow_pickle=True)  # annotations are object arrays of dicts

        frame_num = len(ant_all)

        node_feature_all = list()

        for frame_ind in range(frame_num):
            ant = ant_all[frame_ind]
            orig_img = scipy.misc.imread(
                os.path.join(paths.data_root, mode, 'img', vid, '{}.png'.format(str(frame_ind + 1).zfill(5))),
                mode='RGB')

            node_feature_tmp = np.zeros((len(ant), node_feature_len))

            for ant_ind in range(len(ant)):

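                # pos is [x1, y1, x2, y2]; rows index y and columns index x.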
                pos = ant[ant_ind]['pos']
                roi_img = orig_img[int(pos[1]):(int(pos[3]) + 1), int(pos[0]):(int(pos[2]) + 1), :]

                # fig, ax = plt.subplots(1)
                # ax.imshow(roi_img)
                #
                # plt.show()

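                # Resize the crop to the network input size, normalize it and
                # run the backbone to get the 1000-d feature.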
                roi_img = transform(cv2.resize(roi_img, (input_h, input_w), interpolation=cv2.INTER_LINEAR))
                roi_img = torch.autograd.Variable(roi_img.unsqueeze(0)).cuda()
                feature, _ = feature_network(roi_img)

                #node_feature_tmp.append(feature.data.cpu().numpy())
                node_feature_tmp[ant_ind, :] = feature.data.cpu().numpy()

            node_feature_all.append(node_feature_tmp)

        np.save(os.path.join(paths.data_root, mode, 'node_feature_1000', 'vid_{}_resnet_node_feature'.format(vid)), node_feature_all)
def extract_edge_features(paths, mode):
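    # For every (human, node) pair in each frame, crop the union of the two
    # boxes and store a 1000-d feature per directed edge, one array per video.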

    input_h, input_w = 224, 224
    edge_feature_len = 1000
    transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                torchvision.transforms.Normalize(metadata.train_mean_value,
                                                                                 metadata.train_std_value)])

    # get the finetuned feature network ready!

    feature_network = roi_feature_model.Resnet152(num_classes=len(metadata.node_classes))
    feature_network = torch.nn.DataParallel(feature_network).cuda()
    feature_network.eval()  # inference only: freeze batch-norm statistics during extraction

    # checkpoint_dir = os.path.join(paths.tmp_root, 'checkpoints', 'finetune_resnet')
    # if not os.path.exists(checkpoint_dir):
    #     os.makedirs(checkpoint_dir)
    #
    # best_model_file = os.path.join(checkpoint_dir, 'model_best.pth')
    #
    # if os.path.isfile(best_model_file):
    #     checkpoint = torch.load(best_model_file)
    #     feature_network.load_state_dict(checkpoint['state_dict'])
    #     print("Loading trained model successfully!")

    # get the processed annotation and corresponding original image

    ant_files = [f for f in sorted(listdir(os.path.join(paths.data_root, mode, 'ant_processed'))) if
                 f.endswith('_ant_all.npy')]

    #for ant_file_ind in [3]:

    for ant_file_ind in range(len(ant_files)):

        ant_f = ant_files[ant_file_ind]

        vid = ant_f[4:-12]

        # if os.path.isfile(os.path.join(paths.data_root, mode, 'edge_feature_1000', 'vid_{}_resnet_edge_feature.npy'.format(vid))):
        #
        #     continue


        print('edge feature vid {}'.format(vid))

        ant_all = np.load(os.path.join(paths.data_root, mode, 'ant_processed', ant_f),
                          allow_pickle=True)  # annotations are object arrays of dicts

        frame_num = len(ant_all)

        edge_feature_all = list()

        for frame_ind in range(frame_num):
            ant = ant_all[frame_ind]

            # get the human node amount and object node amount in the current frame
            human_num = 0
            obj_num = 0

            for i in range(len(ant)):
                if ant[i]['label'].startswith('Person'):
                    human_num += 1
                elif ant[i]['label'].startswith('Object'):
                    obj_num += 1

            orig_img = scipy.misc.imread(
                os.path.join(paths.data_root, mode, 'img', vid, '{}.png'.format(str(frame_ind + 1).zfill(5))),
                mode='RGB')


            edge_feature_tmp_per_frame = np.zeros((human_num + obj_num, human_num + obj_num, edge_feature_len))

            #edge_feature_tmp_per_frame = list()

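            # Human nodes occupy the first human_num slots; build an edge
            # feature from each human to every other node in the frame.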
            for ant_ind1 in range(human_num):
                #edge_feature_tmp_per_person=list()

                for ant_ind2 in range(human_num + obj_num):

                    if ant_ind2 == ant_ind1:
                        continue

                    pos1 = ant[ant_ind1]['pos']
                    pos2 = ant[ant_ind2]['pos']

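                    # Union bounding box that covers both nodes.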
                    min_xy = np.minimum([int(pos1[0]), int(pos1[1])], [int(pos2[0]), int(pos2[1])])
                    max_xy = np.maximum([int(pos1[2]), int(pos1[3])], [int(pos2[2]), int(pos2[3])])

                    # scipy.misc.imshow(orig_img[min_xy[1]:(max_xy[1] + 1), min_xy[0]:(max_xy[0] + 1), :])
                    roi_img = orig_img[min_xy[1]:(max_xy[1] + 1), min_xy[0]:(max_xy[0] + 1), :]

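                    # Normalize edge orientation: flip the union crop so the
                    # first node always appears on the left.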
                    pos1_x_center = (int(pos1[0]) + int(pos1[2])) / 2.0
                    pos2_x_center = (int(pos2[0]) + int(pos2[2])) / 2.0

                    if pos1_x_center > pos2_x_center:
                        roi_img = np.fliplr(roi_img).copy()  # copy: negative-stride views break cv2/ToTensor

                    # scipy.misc.imshow(roi_img)

                    roi_img = transform(cv2.resize(roi_img, (input_h, input_w), interpolation=cv2.INTER_LINEAR))
                    roi_img = torch.autograd.Variable(roi_img.unsqueeze(0)).cuda()
                    feature, _ = feature_network(roi_img)

                    edge_feature_tmp_per_frame[ant_ind1, ant_ind2, :] = feature.data.cpu().numpy()

                    #edge_feature_tmp_per_person.append(feature.data.cpu().numpy())

                #edge_feature_tmp_per_frame.append(edge_feature_tmp_per_person)

            edge_feature_all.append(edge_feature_tmp_per_frame)

        np.save(os.path.join(paths.data_root, mode, 'edge_feature_1000', 'vid_{}_resnet_edge_feature'.format(vid)), edge_feature_all)
def main(args):
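    # Finetune the selected backbone on HICO action classification and keep
    # the checkpoint with the best top-1 precision.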
    best_prec1 = 0.0
    args.distributed = args.world_size > 1
    if args.distributed:
        torch.distributed.init_process_group(backend=args.dist_backend,
                                             init_method=args.dist_url,
                                             world_size=args.world_size)

    # create model
    if args.feature_type == 'vgg':
        model = roi_feature_model.Vgg16(
            num_classes=len(metadata.action_classes))
    elif args.feature_type == 'resnet':
        model = roi_feature_model.Resnet152(
            num_classes=len(metadata.action_classes))
    elif args.feature_type == 'densenet':
        model = roi_feature_model.Densenet(
            num_classes=len(metadata.action_classes))
    else:
        raise ValueError('feature type not recognized')
    input_imsize = (224, 224)

    if not args.distributed:
        if args.feature_type.startswith(
                'alexnet') or args.feature_type.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(os.path.join(args.resume, 'model_best.pth')):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(os.path.join(args.resume,
                                                 'model_best.pth'))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(
                os.path.join(args.resume, 'model_best.pth')))

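    # Input size is fixed, so let cuDNN benchmark and cache the fastest
    # convolution algorithms.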
    torch.backends.cudnn.benchmark = True

    # Data loading code
    normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        normalize,
    ])
    train_dataset = roi_feature_model.HICO(args.data, input_imsize, transform,
                                           'train')
    #val_dataset = roi_feature_model.HICO(args.data, input_imsize, transform, 'val')
    test_dataset = roi_feature_model.HICO(args.data, input_imsize, transform,
                                          'test')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.workers,
                                              pin_memory=False)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        if epoch == 0 or epoch >= 5:
            # evaluate on validation set
            prec1 = validate(test_loader, model, criterion)

            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            print('Best precision: {:.03f}'.format(best_prec1))
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.feature_type,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)

    test_prec = validate(test_loader, model, criterion, test=True)
    print('Testing precision: {:.04f}'.format(test_prec))