# Example 1
    def __init__(self, opt, train=True):
        """Hurricane video dataset: resolve the split path, filter the clip
        list, and build the per-frame transform pipeline."""
        super(HurricaneVideoDataset, self).__init__(opt, train=train)

        # 3 channels for the plain "hurricane" dataset, 6 otherwise.
        self.nc = 3 if self.opt.dataset == "hurricane" else 6

        split = 'train' if self.train else 'test'
        self.image_path = os.path.join('./dataset/Hurricane/', split)

        # Clips shorter than the sampling window cannot be used.
        threshold = self.window_size if opt.irregular else self.sample_size
        self.image_list = sorted(
            remove_files_under_sample_size(image_path=self.image_path,
                                           threshold=threshold))

        transforms = [vtransforms.Pad(padding=(1, 0), fill=0)]
        if self.train:
            # Train-time augmentations (flip/rotation) are intentionally
            # disabled for this dataset.
            pass
        transforms.append(vtransforms.ToTensor(scale=False))
        if opt.input_norm:
            transforms.append(vtransforms.Normalize(0.5, 0.5))
        self.vtrans = T.Compose(transforms)
def VideoSpatialPrediction(vid_name,
                           target,
                           net,
                           num_categories,
                           num_samples=25,
                           new_size=299,
                           batch_size=2):
    """Run Grad-CAM for class `target` on one frame of a video and save it.

    Loads `num_samples` grayscale visual-rhythm frames ('vr_XX.png') from
    the directory `vid_name`, normalizes them, back-propagates the score of
    `target` through `net`, and writes the Grad-CAM heatmap for layer
    "Mixed_7c" to '<video name>.png' in the working directory.

    Returns an all-zero (num_categories, num_samples) array — the scores
    are never filled in; callers should not rely on the return value.
    """

    gc = GradCAM(model=net)

    # Normalization statistics, replicated once per sampled frame.
    clip_mean = [0.5] * num_samples
    clip_std = [0.226] * num_samples

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    deep = 1  # single (grayscale) channel

    # inception = 299,299, resnet = 224,224
    dims = (new_size, new_size, deep, num_samples)
    rgb = np.zeros(shape=dims, dtype=np.float64)
    rgb_flip = np.zeros(shape=dims, dtype=np.float64)  # NOTE(review): filled but never used

    for i in range(num_samples):
        img_file = os.path.join(vid_name, 'vr_{0:02d}.png'.format(i))
        img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
        rgb[:, :, 0, i] = img
        rgb_flip[:, :, 0, i] = img[:, ::-1]  # horizontal flip

    # Transform each frame independently and stack into (N, C, H, W).
    _, _, _, c = rgb.shape
    rgb_list = []
    for c_index in range(c):
        cur_img = rgb[:, :, :, c_index]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    rgb_np = np.concatenate(rgb_list, axis=0)
    prediction = np.zeros((num_categories, rgb.shape[3]))

    # NOTE(review): index=50 exceeds the default num_samples=25, which would
    # make this slice empty — confirm the intended frame index.
    index = 50
    input_data = rgb_np[index:index + 1, :, :, :]
    imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
    imgDataVar = torch.autograd.Variable(imgDataTensor)

    # Forward pass, then backprop the target class id to obtain CAM regions.
    probs, ids = gc.forward(imgDataVar)
    ids_ = torch.LongTensor([[target]] * len(imgDataVar)).to(
        torch.device("cuda"))
    gc.backward(ids=ids_)
    regions = gc.generate(target_layer="Mixed_7c")
    save_gradcam(vid_name.split("/")[-1] + ".png",
                 gcam=regions[0, 0],
                 raw_image=rgb[:, :, :, index])

    return prediction
# Example 3
def VideoSpatialPrediction(
        vid_name,
        net,
        num_categories,
        num_samples=25,
        new_size = 299,
        batch_size = 2
        ):
    """Score `num_samples` visual-rhythm frames of one video with `net`.

    Reads grayscale frames 'vr_XX.png' from the directory `vid_name`,
    normalizes each one, and runs the network over them in mini-batches
    on the GPU.

    Returns a (num_categories, num_samples) array of raw network outputs,
    one column per frame.
    """
    # Normalization statistics, replicated once per sampled frame.
    frame_mean = [0.5] * num_samples
    frame_std = [0.226] * num_samples
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        video_transforms.Normalize(mean=frame_mean, std=frame_std),
    ])

    channels = 1  # grayscale input

    # inception = 299x299, resnet = 224x224
    shape = (new_size, new_size, channels, num_samples)
    rgb = np.zeros(shape=shape, dtype=np.float64)
    rgb_flip = np.zeros(shape=shape, dtype=np.float64)

    for frame_idx in range(num_samples):
        frame_path = os.path.join(vid_name,
                                  'vr_{0:02d}.png'.format(frame_idx))
        frame = cv2.imread(frame_path, cv2.IMREAD_GRAYSCALE)
        rgb[:, :, 0, frame_idx] = frame
        rgb_flip[:, :, 0, frame_idx] = frame[:, ::-1]

    # Transform every frame and stack into one (N, C, H, W) batch array.
    total = rgb.shape[3]
    rgb_np = np.concatenate(
        [np.expand_dims(val_transform(rgb[:, :, :, k]).numpy(), 0)
         for k in range(total)],
        axis=0)

    prediction = np.zeros((num_categories, total))
    num_batches = int(math.ceil(float(total) / batch_size))

    for batch_idx in range(num_batches):
        span = range(batch_size * batch_idx,
                     min(total, batch_size * (batch_idx + 1)))
        batch = torch.from_numpy(rgb_np[span, :, :, :]).type(
            torch.FloatTensor).cuda()
        output = net(torch.autograd.Variable(batch))
        prediction[:, span] = np.transpose(output.data.cpu().numpy())

    return prediction
# Example 4
def get_video_transform(data_name, split_name, opt):
    """Build the frame-level transform pipeline for the given split.

    The 'train' split gets stochastic augmentation (random resized crop,
    horizontal flip, color jitter); every other split gets a deterministic
    resize + center crop. Both pipelines end with tensor conversion and
    ImageNet normalization. `data_name` is accepted for interface
    compatibility but not consulted.
    """
    imagenet_norm = video_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                               std=[0.229, 0.224, 0.225])

    if split_name == 'train':
        stages = [
            video_transforms.RandomResizedCrop(opt.crop_size),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ColorJitter(brightness=0.1, contrast=0.1,
                                         hue=0.1),
        ]
    else:
        stages = [
            video_transforms.Resize(256),
            video_transforms.CenterCrop(opt.crop_size),
        ]
    stages += [video_transforms.ToTensor(), imagenet_norm]

    return video_transforms.Compose(stages)
# Example 5
    def __init__(self, opt, train=True):
        """Video dataset: resolve the data root, transform pipeline, and
        clip list for the configured `opt.dataset`.

        Args:
            opt: options namespace; reads `dataset`, `irregular`,
                `input_norm`.
            train: selects the 'train' or 'test' split and enables the
                flip/rotation augmentations.

        Raises:
            ValueError: if `opt.dataset` has no configured data root
                (previously this surfaced as an opaque NameError on
                `vtrans` below).
        """
        super(VideoDataset, self).__init__(opt, train=train)

        # Dataroot & Transform
        if opt.dataset == 'mgif':
            data_root = './dataset/moving-gif'
            vtrans = [vtransforms.Scale(size=128)]
        elif opt.dataset == 'kth':
            data_root = './dataset/kth_action/'
            vtrans = [
                vtransforms.CenterCrop(size=120),
                vtransforms.Scale(size=128)
            ]
        elif opt.dataset == 'penn':
            data_root = './dataset/penn_action/'
            vtrans = [vtransforms.Scale(size=128)]
        else:
            # Fail fast with a clear message instead of an unbound-name
            # error further down.
            raise ValueError('Unsupported dataset: {!r}'.format(opt.dataset))

        if self.train:
            vtrans += [vtransforms.RandomHorizontalFlip()]
            vtrans += [vtransforms.RandomRotation()]

        vtrans += [vtransforms.ToTensor(scale=True)]
        if opt.input_norm:
            vtrans += [vtransforms.Normalize(0.5, 0.5)]
        self.vtrans = T.Compose(vtrans)

        self.image_path = os.path.join(data_root,
                                       'train' if self.train else 'test')

        # Clips shorter than the sampling window cannot be used.
        threshold = self.window_size if opt.irregular else self.sample_size
        if opt.dataset in ['kth', 'sintel', 'ucf101', 'penn']:
            self.image_list = os.listdir(self.image_path)
        elif opt.dataset in ['mgif', 'stickman']:
            self.image_list = remove_files_under_sample_size(
                image_path=self.image_path, threshold=threshold)
        self.image_list = sorted(self.image_list)
def main():
    """Entry point: train or evaluate the three-stream temporal model.

    Reads hyper-parameters from the module-level argparse `parser`, builds
    the model/criterion/optimizer/scheduler, constructs the train and val
    datasets and loaders, then either runs a single validation pass
    (``args.evaluate``) or the full training loop with checkpointing.
    """
    # NOTE(review): `best_acc1` is read before first assignment below, so it
    # is assumed to be initialized at module level — confirm.
    global args, best_acc1
    args = parser.parse_args()
    num_classes = args.num_classes
    start_epoch=0
    writer = SummaryWriter(args.logdir)

    model = build_model(num_classes=num_classes, input_length=args.new_length)

    print(model)

    # create model
    print("Building model ... ")

    # Wrap for multi-GPU data parallelism and move to GPU.
    model = torch.nn.DataParallel(model)
    model.cuda()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    print("Saving everything to directory %s." % (args.out_dir))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # Drop the learning rate when validation loss plateaus for 4 epochs.
    scheduler = ReduceLROnPlateau(optimizer, verbose=True, patience=4)

    # if resume set to True, load the model and continue training
    if args.resume or args.evaluate:
        if os.path.isfile(args.model_path):
            model, optimizer, start_epoch = load_checkpoint(model, optimizer, args.model_path)

    cudnn.benchmark = True

    is_color = True
    # scale_ratios = [1.0, 0.875, 0.75, 0.66]
    # Per-stream normalization statistics, replicated across the input length.
    clip_mean = {'rgb': [0.485, 0.456, 0.406] * args.new_length, 'flow': [0.9432, 0.9359, 0.9511] *args.new_length,
                 'skeleton': [0.0071, 0.0078, 0.0079]*args.new_length}
    clip_std = {'rgb': [0.229, 0.224, 0.225] * args.new_length, 'flow': [0.0788, 0.0753, 0.0683] * args.new_length,
                'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length}

    normalize = video_transforms.Normalize(mean=clip_mean,
                                           std=clip_std)
    train_transform = video_transforms.Compose([
            video_transforms.Resize((args.new_width, args.new_height)),
            video_transforms.ToTensor(),
            normalize,
        ])

    val_transform = video_transforms.Compose([
            video_transforms.Resize((args.new_width, args.new_height)),
            video_transforms.ToTensor(),
            normalize,
        ])

    # Dataset classes are looked up by name in the project `datasets` module.
    train_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                    source=args.train_split_file,
                                                    phase="train",
                                                    is_color=is_color,
                                                    new_length=args.new_length,
                                                    video_transform=train_transform)
    val_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                  source=args.test_split_file,
                                                  phase="val",
                                                  is_color=is_color,
                                                  new_length=args.new_length,
                                                  video_transform=val_transform,
                                                  return_id=True)

    print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset),
                                                                           len(train_dataset),
                                                                           len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        # Evaluation-only mode: one validation pass, then exit.
        validate(val_loader, model, criterion, epoch=0, writer=writer, classes=val_dataset.classes)
        return

    for epoch in range(start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        acc1, loss = validate(val_loader, model, criterion, epoch, writer)
        scheduler.step(loss, epoch=epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'ThreeStreamTemporal',
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, is_best, 'last_checkpoint.pth.tar', args.out_dir)

    writer.close()
# Example 7
def flow_test(param_model):
    """Run the optical-flow stream over every .mp4 in the global `video`
    directory and write per-window top-5 scores to 'flow_result.txt'.

    Farneback optical flow is computed between consecutive frames; every
    10 flow pairs (20 channels) are stacked, normalized, and classified.
    The probability of the global `label` class (when it appears in the
    top-5) is logged for each window.

    Args:
        param_model: network accepting a (1, 20, 224, 224) CUDA tensor.
    """
    model = param_model
    video_list = os.listdir(video)
    f = open(video + "flow_result.txt", 'w')
    for file in video_list:
        if file.endswith("mp4"):
            f.write(file + "\n")
            frame_count = 0
            clip_mean = [0.5, 0.5] * 10
            clip_std = [0.226, 0.226] * 10
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            # config the transform to match the network's format
            transform = video_transforms.Compose([
                video_transforms.CenterCrop((224)),
                video_transforms.ToTensor(),
                normalize,
            ])

            # prepare the translation dictionary label-action
            data_handler = UCF101_splitter(
                os.getcwd() + '/datasets/ucf101_splits/', None)
            data_handler.get_action_index()
            class_to_idx = data_handler.action_label
            idx_to_class = {v: k for k, v in class_to_idx.items()}

            vs = cv2.VideoCapture(video + file)
            softmax = torch.nn.Softmax()
            nn_output = torch.FloatTensor(2 * 10, 224, 224)
            count = 0
            idx = 0
            temp = ''
            x = []
            sampled_list = []
            while (vs.isOpened()):
                ret, image = vs.read()
                if ret is False:
                    break
                image = cv2.resize(image, (342, 256),
                                   interpolation=cv2.INTER_LINEAR)
                x.append(temp)
                if count == 11:
                    # A full window of 10 flow pairs is ready: classify it.
                    sampled_list = []
                    temp = ''
                    input_var = clip_input.view(1, 20, 224, 224).cuda()
                    output = model(input_var)
                    output = softmax(output)
                    output = output.data.cpu().numpy()
                    preds = output.argsort()[0][-5:][::-1]
                    pred_classes = [(idx_to_class[str(pred + 1)],
                                     output[0, pred]) for pred in preds]
                    value = 0
                    for i in range(5):
                        if pred_classes[i][0] == label:
                            value = pred_classes[i][1]

                        temp += '{} - {:.2f}\n'.format(
                            pred_classes[i][0], pred_classes[i][1])
                    f.write(str(value) + "\n")
                    nn_output = torch.FloatTensor(2 * 10, 224, 224)
                    count = 1

                if count == 0:
                    # First frame only seeds the previous-gray buffer.
                    old_frame = image.copy()
                    prev = cv2.cvtColor(old_frame, cv2.COLOR_BGR2GRAY)
                else:
                    curr_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    # NOTE(review): the third positional argument differs
                    # between OpenCV 2.x and 3.x+ signatures — confirm this
                    # matches the installed OpenCV version.
                    flow = cv2.calcOpticalFlowFarneback(
                        prev, curr_gray, 1, 0.5, 3, 15, 3, 5, 1.2, 0)
                    # Scale each flow component into a [0, 255] gray image.
                    horz = cv2.normalize(flow[..., 0], None, 0, 255,
                                         cv2.NORM_MINMAX)
                    vert = cv2.normalize(flow[..., 1], None, 0, 255,
                                         cv2.NORM_MINMAX)
                    horz = horz.astype('uint8')
                    vert = vert.astype('uint8')
                    imgH = Image.fromarray(horz)
                    imgV = Image.fromarray(vert)

                    sampled_list.append(np.expand_dims(imgH, 2))
                    sampled_list.append(np.expand_dims(imgV, 2))

                    clip_input = np.concatenate(sampled_list, axis=2)
                    clip_input = transform(clip_input)
                    # BUG FIX: `async` became a reserved keyword in Python
                    # 3.7; PyTorch renamed the kwarg to `non_blocking`.
                    clip_input = clip_input.float().cuda(non_blocking=True)
                    imgH.close()
                    imgV.close()
                    prev = curr_gray.copy()

                count += 1
                idx += 1
            f.write("----\n")
            print(idx)
            vs.release()

    f.close()
def VideoSpatialPrediction(mode,
                           vid_name,
                           net,
                           num_categories,
                           start_frame=0,
                           num_frames=0,
                           num_samples=25,
                           index=1,
                           new_size=299):
    """Ten-crop spatial prediction over sampled frames of one video.

    In 'rgb' mode, `num_samples` evenly spaced RGB frames are read from
    `vid_name`; in 'rhythm' mode the single visual-rhythm image `index`
    is replicated `num_samples` times. Five crops (four corners + center)
    of each frame plus their horizontal flips are scored by `net`.

    Returns a (num_categories, 10 * num_samples) array of raw network
    outputs, one column per crop.
    """

    # Frame count: count the files in the directory unless given.
    if num_frames == 0:
        imglist = os.listdir(vid_name)
        #imglist = list(filter(lambda x: x[:3]=='img',imglist))
        duration = len(imglist)
        # print(duration)
    else:
        duration = num_frames

    # selection
    if mode == 'rgb':
        step = int(math.floor((duration - 1) / (num_samples - 1)))
        clip_mean = [0.485, 0.456, 0.406]
        clip_std = [0.229, 0.224, 0.225]
    else:
        # NOTE(review): `step` is only assigned in the 'rgb' branch, but
        # the loop below uses it for every mode other than 'rhythm' — a
        # flow-style mode would hit a NameError. Confirm callers only pass
        # 'rgb' or 'rhythm'.
        clip_mean = [0.5, 0.5]
        clip_std = [0.226, 0.226]

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    deep = 1 if mode == 'rhythm' else 3  # channels: grayscale vs RGB

    # inception = 320,360, resnet = 240, 320
    width = 320 if new_size == 299 else 240
    height = 360 if new_size == 299 else 320
    dims = (width, height, deep, num_samples)
    rgb = np.zeros(shape=dims, dtype=np.float64)
    rgb_flip = np.zeros(shape=dims, dtype=np.float64)

    for i in range(num_samples):
        if mode == 'rhythm':
            # The same visual-rhythm image is replicated for every sample.
            img_file = os.path.join(vid_name,
                                    'visual_rhythm_{0:05d}.png'.format(index))
            print(img_file)
            img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
            img = cv2.resize(img, dims[1::-1])
            rgb[:, :, 0, i] = img
            rgb_flip[:, :, 0, i] = img[:, ::-1]
        else:
            img_file = os.path.join(vid_name,
                                    'img_{0:05d}.jpg'.format(i * step + 1))
            img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)
            img = cv2.resize(img, dims[1::-1])
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            rgb[:, :, :, i] = img
            rgb_flip[:, :, :, i] = img[:, ::-1, :]

    # crop 299 = inception, 224 = resnet
    # Five crops of the frame stack and five of its flipped copy.
    size = new_size
    corner = [(height - size) // 2, (width - size) // 2]
    rgb_1 = rgb[:size, :size, :, :]
    rgb_2 = rgb[:size, -size:, :, :]
    rgb_3 = rgb[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :]
    rgb_4 = rgb[-size:, :size, :, :]
    rgb_5 = rgb[-size:, -size:, :, :]
    rgb_f_1 = rgb_flip[:size, :size, :, :]
    rgb_f_2 = rgb_flip[:size, -size:, :, :]
    rgb_f_3 = rgb_flip[corner[1]:corner[1] + size,
                       corner[0]:corner[0] + size, :, :]
    rgb_f_4 = rgb_flip[-size:, :size, :, :]
    rgb_f_5 = rgb_flip[-size:, -size:, :, :]

    rgb = np.concatenate((rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_f_1, rgb_f_2,
                          rgb_f_3, rgb_f_4, rgb_f_5),
                         axis=3)

    # Normalize each crop and stack into an (N, C, H, W) batch array.
    _, _, _, c = rgb.shape
    rgb_list = []
    for c_index in range(c):
        cur_img = rgb[:, :, :, c_index]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    rgb_np = np.concatenate(rgb_list, axis=0)
    #batch_size = 25
    batch_size = 5
    prediction = np.zeros((num_categories, rgb.shape[3]))
    num_batches = int(math.ceil(float(rgb.shape[3]) / batch_size))

    # Batched GPU forward passes; one output column per crop.
    for bb in range(num_batches):
        span = range(batch_size * bb, min(rgb.shape[3], batch_size * (bb + 1)))
        input_data = rgb_np[span, :, :, :]
        imgDataTensor = torch.from_numpy(input_data).type(
            torch.FloatTensor).cuda()
        imgDataVar = torch.autograd.Variable(imgDataTensor)
        output = net(imgDataVar)
        result = output.data.cpu().numpy()
        prediction[:, span] = np.transpose(result)

    return prediction
# Example 9
def VideoSpatialPrediction3D(vid_name,
                             net,
                             num_categories,
                             architecture_name,
                             start_frame=0,
                             num_frames=0,
                             length=16,
                             extension='img_{0:05d}.jpg',
                             ten_crop=False):
    """Clip-level 3D-CNN prediction for one video frame directory.

    Frames are grouped into non-overlapping `length`-frame clips,
    center-cropped (optionally ten-cropped), normalized with
    architecture-specific statistics, and scored by `net` in batches.

    Returns (prediction, mean_result, top3): the argmax class index, the
    class-score vector averaged over all clips/crops, and the top-3 class
    indices.

    NOTE(review): `val_transform` and `scale` are only assigned for the
    architecture names matched below; an unrecognized name would raise
    UnboundLocalError later — confirm callers pass supported names.
    """

    # Count usable frames when the caller did not supply a frame count.
    if num_frames == 0:
        imglist = os.listdir(vid_name)
        newImageList = []
        if 'rgb' in architecture_name or 'pose' in architecture_name:
            for item in imglist:
                if 'img' in item:
                    newImageList.append(item)
        elif 'flow' in architecture_name:
            for item in imglist:
                if 'flow_x' in item:
                    newImageList.append(item)
        duration = len(newImageList)
    else:
        duration = num_frames

    # Per-architecture normalization statistics and input scale factor.
    if 'rgb' in architecture_name or 'pose' in architecture_name:
        if 'I3D' in architecture_name:

            if not 'resnet' in architecture_name:
                clip_mean = [0.5, 0.5, 0.5]
                clip_std = [0.5, 0.5, 0.5]
            else:
                clip_mean = [0.45, 0.45, 0.45]
                clip_std = [0.225, 0.225, 0.225]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose([
                video_transforms.ToTensor(),
                normalize,
            ])
            if '112' in architecture_name:
                scale = 0.5
            else:
                scale = 1
        elif 'MFNET3D' in architecture_name:
            clip_mean = [0.48627451, 0.45882353, 0.40784314]
            clip_std = [0.234, 0.234, 0.234]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose(
                [video_transforms.ToTensor(), normalize])
            if '112' in architecture_name:
                scale = 0.5
            else:
                scale = 1
        elif 'tsm' in architecture_name:
            clip_mean = [0.485, 0.456, 0.406]
            clip_std = [0.229, 0.224, 0.225]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose(
                [video_transforms.ToTensor(), normalize])
            scale = 1
        elif "r2plus1d" in architecture_name:
            clip_mean = [0.43216, 0.394666, 0.37645]
            clip_std = [0.22803, 0.22145, 0.216989]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose(
                [video_transforms.ToTensor(), normalize])
            scale = 0.5
        elif 'rep_flow' in architecture_name:
            clip_mean = [0.5, 0.5, 0.5]
            clip_std = [0.5, 0.5, 0.5]

            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose([
                video_transforms.ToTensor(),
                normalize,
            ])
            scale = 1
        elif "slowfast" in architecture_name:
            clip_mean = [0.45, 0.45, 0.45]
            clip_std = [0.225, 0.225, 0.225]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose([
                video_transforms.ToTensor(),
                normalize,
            ])
            scale = 1
        else:
            # Fallback: 0-255 pixel statistics with ToTensor2.
            scale = 0.5
            clip_mean = [114.7748, 107.7354, 99.4750]
            clip_std = [1, 1, 1]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose([
                video_transforms.ToTensor2(),
                normalize,
            ])
    elif 'flow' in architecture_name:
        if 'I3D' in architecture_name:
            clip_mean = [0.5] * 2
            clip_std = [0.5] * 2
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)

            val_transform = video_transforms.Compose([
                video_transforms.ToTensor(),
                normalize,
            ])
            scale = 1
        elif "3D" in architecture_name:
            scale = 0.5
            clip_mean = [127.5, 127.5]
            clip_std = [1, 1]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)
            val_transform = video_transforms.Compose([
                video_transforms.ToTensor2(),
                normalize,
            ])
        elif "r2plus1d" in architecture_name:
            clip_mean = [0.5] * 2
            clip_std = [0.226] * 2
            normalize = video_transforms.Normalize(mean=clip_mean,
                                                   std=clip_std)

            val_transform = video_transforms.Compose([
                video_transforms.ToTensor(),
                normalize,
            ])
            scale = 0.5

    # Explicit input-size markers in the name override the scale above.
    if '224' in architecture_name:
        scale = 1
    if '112' in architecture_name:
        scale = 0.5
    # selection
    #step = int(math.floor((duration-1)/(num_samples-1)))
    dims2 = (224, 224, 3, duration)  # NOTE(review): unused

    imageSize = int(224 * scale)
    dims = (int(256 * scale), int(340 * scale), 3, duration)
    #dims = (int(256 * scale),int(256 * scale),3,duration)
    duration = duration - 1

    # Build the list of 1-based frame indices, one run of `length` per clip;
    # videos shorter than one clip are tiled to fill a single clip.
    offsets = []

    offsetMainIndexes = list(range(1, duration - length, length))
    if len(offsetMainIndexes) == 0:
        offsets = list(range(1, duration + 2)) * int(
            np.floor(length / (duration + 1))) + list(
                range(1, length % (duration + 1) + 1))
    else:
        # Center the covered span within the video.
        shift = int((duration - (offsetMainIndexes[-1] + length)) / 2)
        for mainOffsetValue in offsetMainIndexes:
            for lengthID in range(1, length + 1):
                offsets.append(lengthID + mainOffsetValue + shift)

#    offsetMainIndexes = list(range(0,duration,length))
#    for mainOffsetValue in offsetMainIndexes:
#        for lengthID in range(1, length+1):
#            loaded_frame_index = lengthID + mainOffsetValue
#            moded_loaded_frame_index = loaded_frame_index % (duration + 1)
#            if moded_loaded_frame_index == 0:
#                moded_loaded_frame_index = (duration + 1)
#            offsets.append(moded_loaded_frame_index)

    # imageList1..5: center + four corner crops; 6..10: same for the flip.
    # NOTE(review): imageList11/12 are never filled — apparent leftovers.
    imageList = []
    imageList1 = []
    imageList2 = []
    imageList3 = []
    imageList4 = []
    imageList5 = []
    imageList6 = []
    imageList7 = []
    imageList8 = []
    imageList9 = []
    imageList10 = []
    imageList11 = []
    imageList12 = []
    interpolation = cv2.INTER_LINEAR

    for index in offsets:
        if 'rgb' in architecture_name or 'pose' in architecture_name:
            img_file = os.path.join(vid_name, extension.format(index))
            img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)

            img = cv2.resize(img, dims[1::-1], interpolation)

            #img2 = cv2.resize(img, dims2[1::-1],interpolation)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_flip = img[:, ::-1, :].copy()
        elif 'flow' in architecture_name:
            # x/y flow components are stored as separate grayscale files.
            flow_x_file = os.path.join(vid_name, extension.format('x', index))
            flow_y_file = os.path.join(vid_name, extension.format('y', index))
            img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE)
            img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE)
            img_x = np.expand_dims(img_x, -1)
            img_y = np.expand_dims(img_y, -1)
            img = np.concatenate((img_x, img_y), 2)
            img = cv2.resize(img, dims[1::-1], interpolation)
            img_flip = img[:, ::-1, :].copy()
        #img_flip2 = img2[:,::-1,:].copy()
        #imageList1.append(img[int(16 * scale):int(16 * scale + imageSize), int(16 * scale) : int(16 * scale + imageSize), :])
        imageList1.append(img[int(16 * scale):int(16 * scale + imageSize),
                              int(58 * scale):int(58 * scale + imageSize), :])
        imageList2.append(img[:imageSize, :imageSize, :])
        imageList3.append(img[:imageSize, -imageSize:, :])
        imageList4.append(img[-imageSize:, :imageSize, :])
        imageList5.append(img[-imageSize:, -imageSize:, :])
        imageList6.append(img_flip[int(16 * scale):int(16 * scale + imageSize),
                                   int(58 * scale):int(58 * scale +
                                                       imageSize), :])
        imageList7.append(img_flip[:imageSize, :imageSize, :])
        imageList8.append(img_flip[:imageSize, -imageSize:, :])
        imageList9.append(img_flip[-imageSize:, :imageSize, :])
        imageList10.append(img_flip[-imageSize:, -imageSize:, :])
#        imageList11.append(img2)
#        imageList12.append(img_flip2)

    if ten_crop:
        imageList = imageList1 + imageList2 + imageList3 + imageList4 + imageList5 + imageList6 + imageList7 + imageList8 + imageList9 + imageList10
    else:
        imageList = imageList1

    #imageList=imageList11+imageList12

    # Normalize every crop and stack into clips of `length` frames.
    rgb_list = []

    for i in range(len(imageList)):
        cur_img = imageList[i]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    input_data = np.concatenate(rgb_list, axis=0)
    if 'rgb' in architecture_name or 'pose' in architecture_name:
        input_data = input_data.reshape(-1, length, 3, imageSize, imageSize)
    elif 'flow' in architecture_name:
        input_data = input_data.reshape(-1, length, 2, imageSize, imageSize)

    # Batched GPU inference over the clips.
    batch_size = 10
    result = np.zeros([input_data.shape[0], num_categories])
    num_batches = int(math.ceil(float(input_data.shape[0]) / batch_size))

    with torch.no_grad():
        for bb in range(num_batches):
            span = range(batch_size * bb,
                         min(input_data.shape[0], batch_size * (bb + 1)))
            input_data_batched = input_data[span, :, :, :, :]
            imgDataTensor = torch.from_numpy(input_data_batched).type(
                torch.FloatTensor).cuda()
            if 'rgb' in architecture_name or 'pose' in architecture_name:
                imgDataTensor = imgDataTensor.view(-1, length, 3, imageSize,
                                                   imageSize).transpose(1, 2)
            elif 'flow' in architecture_name:
                imgDataTensor = imgDataTensor.view(-1, length, 2, imageSize,
                                                   imageSize).transpose(1, 2)

            # Some architectures return auxiliary outputs alongside logits.
            if 'bert' in architecture_name or 'pooling' in architecture_name or 'NLB' in architecture_name \
                or 'lstm' in architecture_name or 'adamw' in architecture_name:
                output, input_vectors, sequenceOut, maskSample = net(
                    imgDataTensor)
            else:
                output = net(imgDataTensor)
            #span = range(sample_size*bb, min(int(input_data.shape[0]/length),sample_size*(bb+1)))
            result[span, :] = output.data.cpu().numpy()
        mean_result = np.mean(result, 0)
        prediction = np.argmax(mean_result)
        top3 = mean_result.argsort()[::-1][:3]
        top5 = mean_result.argsort()[::-1][:5]  # NOTE(review): computed but never returned

    return prediction, mean_result, top3
# Example 10
def VideoSpatialPrediction(vid_name,
                           net,
                           num_categories,
                           start_frame=0,
                           num_frames=0,
                           num_samples=25):
    """10-crop spatial (RGB) prediction over uniformly sampled video frames.

    Samples ``num_samples`` frames from the frame directory ``vid_name``,
    takes the standard five crops (four corners + centre) of both the
    original and the horizontally flipped frames, and scores all
    ``10 * num_samples`` crops with ``net``.

    Args:
        vid_name: directory containing frames named ``frame%06d.jpg``.
        net: CUDA model mapping a (N, 3, 224, 224) float batch to class scores.
        num_categories: number of output classes of ``net``.
        start_frame: unused; kept for interface compatibility with callers.
        num_frames: clip length; 0 means "count the files in ``vid_name``".
        num_samples: number of frames sampled uniformly from the clip.

    Returns:
        ndarray of shape (num_categories, 10 * num_samples), one score
        column per crop.
    """
    if num_frames == 0:
        # Infer the clip length from the number of files on disk.
        imglist = os.listdir(vid_name)
        duration = len(imglist)
    else:
        duration = num_frames

    # ImageNet normalization, matching the network's training statistics.
    clip_mean = [0.485, 0.456, 0.406]
    clip_std = [0.229, 0.224, 0.225]
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    # Uniform temporal stride.  Guard num_samples == 1, which divided by
    # zero in the original expression.
    if num_samples > 1:
        step = int(math.floor((duration - 1) / (num_samples - 1)))
    else:
        step = 0
    dims = (256, 340, 3, num_samples)
    rgb = np.zeros(shape=dims, dtype=np.float64)
    rgb_flip = np.zeros(shape=dims, dtype=np.float64)

    for i in range(num_samples):
        img_file = os.path.join(vid_name,
                                'frame{0:06d}.jpg'.format(i * step + 1))
        img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)
        if img is None:
            # Fail loudly with the offending path instead of letting
            # cv2.resize raise a cryptic error on None.
            raise IOError('Could not read frame image: %s' % img_file)
        img = cv2.resize(img, dims[1::-1])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        rgb[:, :, :, i] = img
        rgb_flip[:, :, :, i] = img[:, ::-1, :]

    # Five 224x224 crops (four corners + centre) of the frames and of their
    # horizontal mirrors -> 10x test-time augmentation.
    rgb_1 = rgb[:224, :224, :, :]
    rgb_2 = rgb[:224, -224:, :, :]
    rgb_3 = rgb[16:240, 60:284, :, :]
    rgb_4 = rgb[-224:, :224, :, :]
    rgb_5 = rgb[-224:, -224:, :, :]
    rgb_f_1 = rgb_flip[:224, :224, :, :]
    rgb_f_2 = rgb_flip[:224, -224:, :, :]
    rgb_f_3 = rgb_flip[16:240, 60:284, :, :]
    rgb_f_4 = rgb_flip[-224:, :224, :, :]
    rgb_f_5 = rgb_flip[-224:, -224:, :, :]

    rgb = np.concatenate((rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_f_1, rgb_f_2,
                          rgb_f_3, rgb_f_4, rgb_f_5),
                         axis=3)

    # Normalize every crop and stack into a (10 * num_samples, 3, 224, 224)
    # batch array for the network.
    _, _, _, c = rgb.shape
    rgb_list = []
    for c_index in range(c):
        cur_img = rgb[:, :, :, c_index].squeeze()
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    rgb_np = np.concatenate(rgb_list, axis=0)
    batch_size = 25
    prediction = np.zeros((num_categories, rgb.shape[3]))
    num_batches = int(math.ceil(float(rgb.shape[3]) / batch_size))

    # Pure inference: disable autograd.  The original wrapped inputs in the
    # long-deprecated torch.autograd.Variable and left gradient tracking on,
    # wasting memory for no benefit.
    with torch.no_grad():
        for bb in range(num_batches):
            span = range(batch_size * bb,
                         min(rgb.shape[3], batch_size * (bb + 1)))
            input_data = rgb_np[span, :, :, :]
            imgDataTensor = torch.from_numpy(input_data).type(
                torch.FloatTensor).cuda()
            output = net(imgDataTensor)
            result = output.data.cpu().numpy()
            prediction[:, span] = np.transpose(result)

    return prediction
Ejemplo n.º 11
0
def VideoTemporalPrediction(mode,
                            vid_name,
                            target,
                            net,
                            num_categories,
                            start_frame=0,
                            num_frames=0,
                            num_samples=25,
                            optical_flow_frames=10,
                            new_size=299,
                            ext=".jpg"):
    """Generate Grad-CAM heatmaps for one crop of a temporal (optical-flow) clip.

    Loads stacked x/y optical-flow frames from ``vid_name``, builds the
    standard 10-crop augmentation, then runs Grad-CAM (target layer
    ``Mixed_7c``, i.e. an Inception-style backbone) on a single hard-coded
    crop and saves two overlay images next to the working directory.

    Args:
        mode: filename prefix of the flow images (files are
            ``<mode>_x_%05d<ext>`` / ``<mode>_y_%05d<ext>``).
        vid_name: directory containing the flow frames.
        target: class index used to drive the Grad-CAM backward pass.
        net: CUDA model wrapped by ``GradCAM``.
        num_categories: number of classes (used only to size the return array).
        start_frame: offset added to every frame index.
        num_frames: clip length; 0 means "count the files in ``vid_name``".
        num_samples: number of flow stacks sampled from the clip.
        optical_flow_frames: frames per stack (x and y interleaved -> 2x channels).
        new_size: crop size; 299 selects the Inception geometry, otherwise ResNet.
        ext: flow-image file extension.

    Returns:
        A (num_categories, 10 * num_samples) zero array.
        NOTE(review): the return value is never filled in -- callers get all
        zeros; confirm whether this function is used for visualisation only.
    """
    gc = GradCAM(model=net)

    if num_frames == 0:
        # Infer the clip length from the number of files on disk.
        imglist = os.listdir(vid_name)
        duration = len(imglist)
    else:
        duration = num_frames

    # selection: uniform stride over valid stack start positions.
    step = int(math.floor((duration - optical_flow_frames + 1) / num_samples))
    # Flow images are normalized around 0.5 per channel (x and y interleaved).
    clip_mean = [0.5] * optical_flow_frames * 2
    clip_std = [0.226] * optical_flow_frames * 2

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    test_transform = video_transforms.Compose(
        [video_transforms.ToTensor(), normalize])

    # inception = 320,360, resnet = 240, 320
    width = 320 if new_size == 299 else 240
    height = 360 if new_size == 299 else 320
    deep = optical_flow_frames * 2
    # Layout: (rows=width, cols=height, channels, samples).
    dims = (width, height, deep, num_samples)
    flow = np.zeros(shape=dims, dtype=np.float64)
    flow_flip = np.zeros(shape=dims, dtype=np.float64)

    for i in range(num_samples):
        for j in range(optical_flow_frames):
            flow_x_file = os.path.join(
                vid_name, mode +
                '_x_{0:05d}{1}'.format(i * step + j + 1 + start_frame, ext))
            flow_y_file = os.path.join(
                vid_name, mode +
                '_y_{0:05d}{1}'.format(i * step + j + 1 + start_frame, ext))
            img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE)
            img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE)
            img_x = cv2.resize(img_x, dims[1::-1])
            img_y = cv2.resize(img_y, dims[1::-1])

            # x and y flow interleaved along the channel axis.
            flow[:, :, j * 2, i] = img_x
            flow[:, :, j * 2 + 1, i] = img_y

            # Mirroring inverts horizontal motion, so x-flow is negated
            # (255 - v for uint8-encoded flow); y-flow is only mirrored.
            flow_flip[:, :, j * 2, i] = 255 - img_x[:, ::-1]
            flow_flip[:, :, j * 2 + 1, i] = img_y[:, ::-1]

    # crop 299 = inception, 224 = resnet
    size = new_size
    # corner = [centre offset along cols, centre offset along rows]; note the
    # indices are used swapped below so each axis gets the matching offset.
    corner = [(height - size) // 2, (width - size) // 2]
    flow_1 = flow[:size, :size, :, :]
    flow_2 = flow[:size, -size:, :, :]
    flow_3 = flow[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :]
    flow_4 = flow[-size:, :size, :, :]
    flow_5 = flow[-size:, -size:, :, :]
    flow_f_1 = flow_flip[:size, :size, :, :]
    flow_f_2 = flow_flip[:size, -size:, :, :]
    flow_f_3 = flow_flip[corner[1]:corner[1] + size,
                         corner[0]:corner[0] + size, :, :]
    flow_f_4 = flow_flip[-size:, :size, :, :]
    flow_f_5 = flow_flip[-size:, -size:, :, :]

    # 10-crop stack: five crops of the clip and five of its mirror.
    flow = np.concatenate((flow_1, flow_2, flow_3, flow_4, flow_5, flow_f_1,
                           flow_f_2, flow_f_3, flow_f_4, flow_f_5),
                          axis=3)

    _, _, _, c = flow.shape
    flow_list = []
    for c_index in range(c):
        cur_img = flow[:, :, :, c_index].squeeze()
        cur_img_tensor = test_transform(cur_img)
        flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    flow_np = np.concatenate(flow_list, axis=0)
    prediction = np.zeros((num_categories, flow.shape[3]))

    # NOTE(review): the Grad-CAM pass runs on one hard-coded crop (index 50,
    # i.e. the first mirrored crop of sample 0 when num_samples == 25) --
    # confirm this is intentional rather than a leftover debug choice.
    index = 50
    input_data = flow_np[index:index + 1, :, :, :]
    # Debug views of the first 5 x- and y-flow channels; only printed.
    raw_image_x = flow[:, :, [0, 2, 4, 6, 8], index]
    raw_image_y = flow[:, :, [1, 3, 5, 7, 9], index]
    print(raw_image_x.shape, raw_image_y.shape)
    imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
    imgDataVar = torch.autograd.Variable(imgDataTensor)

    # Forward once, then backprop from the requested target class for every
    # item in the (single-element) batch.
    probs, ids = gc.forward(imgDataVar)
    ids_ = torch.LongTensor([[target]] * len(imgDataVar)).to(
        torch.device("cuda"))
    gc.backward(ids=ids_)
    regions = gc.generate(target_layer="Mixed_7c")
    # Overlay the same heatmap on one x-flow and one y-flow channel.
    save_gradcam(vid_name.split("/")[-1] + "_x.png",
                 gcam=regions[0, 0],
                 raw_image=flow[:, :, 4:5, index])
    save_gradcam(vid_name.split("/")[-1] + "_y.png",
                 gcam=regions[0, 0],
                 raw_image=flow[:, :, 5:6, index])

    return prediction
def main():
    """Train or evaluate an action-recognition model from CLI flags.

    Parses the module-level ``parser``, builds the model (fresh, continued
    from a checkpoint, or validation-only), constructs modality-specific
    transforms and data loaders, then runs the train/validate loop with
    periodic checkpointing of the best model.  Relies on module-level
    helpers (``build_model``, ``build_model_validate``,
    ``build_model_continue``, ``train``, ``validate``, ``save_checkpoint``)
    and mutates the globals declared below.
    """
    global args, best_prec1, model, writer, best_loss, length, width, height, input_size, scheduler
    args = parser.parse_args()
    # NOTE(review): `contine` looks like a typo for `continue`; presumably it
    # mirrors a misspelled argparse flag -- confirm against the parser setup.
    training_continue = args.contine
    # Pick the input-resolution scale from the architecture name:
    # 112-pixel 3D variants and r2plus1d use half resolution.
    if '3D' in args.arch:
        if 'I3D' in args.arch or 'MFNET3D' in args.arch:
            if '112' in args.arch:
                scale = 0.5
            else:
                scale = 1
        else:
            if '224' in args.arch:
                scale = 1
            else:
                scale = 0.5
    elif 'r2plus1d' in args.arch:
        scale = 0.5
    else:
        scale = 1

    print('scale: %.1f' % (scale))

    # Base geometry (224 crop from 340x256 frames), scaled per architecture.
    input_size = int(224 * scale)
    width = int(340 * scale)
    height = int(256 * scale)

    saveLocation = "./checkpoint/" + args.dataset + "_" + args.arch + "_split" + str(
        args.split)
    if not os.path.exists(saveLocation):
        os.makedirs(saveLocation)
    writer = SummaryWriter(saveLocation)

    # create model

    if args.evaluate:
        print("Building validation model ... ")
        model = build_model_validate()
        # Optimizer is unused in evaluation but built for interface parity.
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    elif training_continue:
        # Resume: model, epoch, optimizer state and best accuracy come from
        # the checkpoint; the loop below only reads the stored lr for logging.
        model, startEpoch, optimizer, best_prec1 = build_model_continue()
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
            #param_group['lr'] = lr
        print(
            "Continuing with best precision: %.3f and start epoch %d and lr: %f"
            % (best_prec1, startEpoch, lr))
    else:
        print("Building model with ADAMW... ")
        model = build_model()
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
        startEpoch = 0

    if HALF:
        # FP16 training; BatchNorm stays in FP32 for numerical stability.
        model.half()  # convert to half precision
        for layer in model.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()

    print("Model %s is loaded. " % (args.arch))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    criterion2 = nn.MSELoss().cuda()

    # Halve lr when validation classification loss plateaus for 5 epochs.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               'min',
                                               patience=5,
                                               verbose=True)

    print("Saving everything to directory %s." % (saveLocation))
    if args.dataset == 'ucf101':
        dataset = './datasets/ucf101_frames'
    elif args.dataset == 'hmdb51':
        dataset = './datasets/hmdb51_frames'
    elif args.dataset == 'smtV2':
        dataset = './datasets/smtV2_frames'
    elif args.dataset == 'window':
        dataset = './datasets/window_frames'
    elif args.dataset == 'haa500_basketball':
        dataset = './datasets/haa500_basketball_frames'
    else:
        print("No convenient dataset entered, exiting....")
        return 0

    cudnn.benchmark = True
    # Modality is encoded as the first underscore-separated token of --arch.
    modality = args.arch.split('_')[0]
    # Clip length: 3D/temporal models consume 16/32/64-frame clips,
    # 2D models consume single frames.
    if "3D" in args.arch or 'tsm' in args.arch or 'slowfast' in args.arch or 'r2plus1d' in args.arch:
        if '64f' in args.arch:
            length = 64
        elif '32f' in args.arch:
            length = 32
        else:
            length = 16
    else:
        length = 1
    # Data transforming
    # Per-architecture normalization statistics, repeated per segment/frame.
    if modality == "rgb" or modality == "pose":
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        if 'I3D' in args.arch:
            if 'resnet' in args.arch:
                clip_mean = [0.45, 0.45, 0.45] * args.num_seg * length
                clip_std = [0.225, 0.225, 0.225] * args.num_seg * length
            else:
                clip_mean = [0.5, 0.5, 0.5] * args.num_seg * length
                clip_std = [0.5, 0.5, 0.5] * args.num_seg * length
            #clip_std = [0.25, 0.25, 0.25] * args.num_seg * length
        elif 'MFNET3D' in args.arch:
            clip_mean = [0.48627451, 0.45882353, 0.40784314
                         ] * args.num_seg * length
            clip_std = [0.234, 0.234, 0.234] * args.num_seg * length
        elif "3D" in args.arch:
            # Plain 3D convnets use raw-pixel means (0-255 range), std 1.
            clip_mean = [114.7748, 107.7354, 99.4750] * args.num_seg * length
            clip_std = [1, 1, 1] * args.num_seg * length
        elif "r2plus1d" in args.arch:
            clip_mean = [0.43216, 0.394666, 0.37645] * args.num_seg * length
            clip_std = [0.22803, 0.22145, 0.216989] * args.num_seg * length
        elif "rep_flow" in args.arch:
            clip_mean = [0.5, 0.5, 0.5] * args.num_seg * length
            clip_std = [0.5, 0.5, 0.5] * args.num_seg * length
        elif "slowfast" in args.arch:
            clip_mean = [0.45, 0.45, 0.45] * args.num_seg * length
            clip_std = [0.225, 0.225, 0.225] * args.num_seg * length
        else:
            clip_mean = [0.485, 0.456, 0.406] * args.num_seg * length
            clip_std = [0.229, 0.224, 0.225] * args.num_seg * length
    elif modality == "pose":
        # NOTE(review): unreachable -- "pose" is already handled by the first
        # branch above, so this dead block never executes.
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406] * args.num_seg
        clip_std = [0.229, 0.224, 0.225] * args.num_seg
    elif modality == "flow":
        # Optical flow: 2 channels (x/y) per frame, loaded as grayscale.
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        if 'I3D' in args.arch:
            clip_mean = [0.5, 0.5] * args.num_seg * length
            clip_std = [0.5, 0.5] * args.num_seg * length
        elif "3D" in args.arch:
            clip_mean = [127.5, 127.5] * args.num_seg * length
            clip_std = [1, 1] * args.num_seg * length
        else:
            clip_mean = [0.5, 0.5] * args.num_seg * length
            clip_std = [0.226, 0.226] * args.num_seg * length
    elif modality == "both":
        # RGB (3ch) + flow (2ch) stacked -> 5 channels per frame.
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406, 0.5, 0.5] * args.num_seg * length
        clip_std = [0.229, 0.224, 0.225, 0.226, 0.226] * args.num_seg * length
    else:
        # NOTE(review): falls through with clip_mean/clip_std undefined --
        # the Normalize call below would then raise NameError.
        print("No such modality. Only rgb and flow supported.")

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)

    # Non-I3D 3D models use ToTensor2 (presumably a different channel layout
    # -- confirm in the video_transforms module).
    if "3D" in args.arch and not ('I3D' in args.arch):
        train_transform = video_transforms.Compose([
            video_transforms.MultiScaleCrop((input_size, input_size),
                                            scale_ratios),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ToTensor2(),
            normalize,
        ])

        val_transform = video_transforms.Compose([
            video_transforms.CenterCrop((input_size)),
            video_transforms.ToTensor2(),
            normalize,
        ])
    else:
        train_transform = video_transforms.Compose([
            video_transforms.MultiScaleCrop((input_size, input_size),
                                            scale_ratios),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ToTensor(),
            normalize,
        ])

        val_transform = video_transforms.Compose([
            video_transforms.CenterCrop((input_size)),
            video_transforms.ToTensor(),
            normalize,
        ])

    # data loading
    train_setting_file = "train_%s_split%d.txt" % (modality, args.split)
    train_split_file = os.path.join(args.settings, args.dataset,
                                    train_setting_file)
    val_setting_file = "val_%s_split%d.txt" % (modality, args.split)
    val_split_file = os.path.join(args.settings, args.dataset,
                                  val_setting_file)
    if not os.path.exists(train_split_file) or not os.path.exists(
            val_split_file):
        # NOTE(review): this only warns and continues; the dataset
        # constructor below will then fail on the missing split file.
        print(
            "No split file exists in %s directory. Preprocess the dataset first"
            % (args.settings))

    train_dataset = datasets.__dict__[args.dataset](
        root=dataset,
        source=train_split_file,
        phase="train",
        modality=modality,
        is_color=is_color,
        new_length=length,
        new_width=width,
        new_height=height,
        video_transform=train_transform,
        num_segments=args.num_seg)

    val_dataset = datasets.__dict__[args.dataset](
        root=dataset,
        source=val_split_file,
        phase="val",
        modality=modality,
        is_color=is_color,
        new_length=length,
        new_width=width,
        new_height=height,
        video_transform=val_transform,
        num_segments=args.num_seg)

    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset),
        len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        # Evaluation-only mode: a single validation pass, then exit.
        prec1, prec3, _ = validate(val_loader, model, criterion, criterion2,
                                   modality)
        return

    for epoch in range(startEpoch, args.epochs):
        #        if learning_rate_index > max_learning_rate_decay_count:
        #            break
        #        adjust_learning_rate(optimizer, epoch)
        train(train_loader, model, criterion, criterion2, optimizer, epoch,
              modality)

        # evaluate on validation set
        # Validation (and the lr scheduler step) runs only every
        # args.save_freq epochs; other epochs keep prec1 at 0.
        prec1 = 0.0
        lossClassification = 0
        if (epoch + 1) % args.save_freq == 0:
            prec1, prec3, lossClassification = validate(
                val_loader, model, criterion, criterion2, modality)
            writer.add_scalar('data/top1_validation', prec1, epoch)
            writer.add_scalar('data/top3_validation', prec3, epoch)
            writer.add_scalar('data/classification_loss_validation',
                              lossClassification, epoch)
            scheduler.step(lossClassification)
        # remember best prec@1 and save checkpoint

        is_best = prec1 >= best_prec1
        best_prec1 = max(prec1, best_prec1)
        #        best_in_existing_learning_rate = max(prec1, best_in_existing_learning_rate)
        #
        #        if best_in_existing_learning_rate > prec1 + 1:
        #            learning_rate_index = learning_rate_index
        #            best_in_existing_learning_rate = 0

        if (epoch + 1) % args.save_freq == 0:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar")
            if is_best:
                print("Model works well")
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, checkpoint_name, saveLocation)

    # Final checkpoint after the last epoch (uses the loop's last
    # epoch/is_best values; raises NameError if the loop never ran).
    checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar")
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict(),
        }, is_best, checkpoint_name, saveLocation)
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def _extract_and_dump(network, data_iter, device):
    """Run ``network`` over every clip in ``data_iter`` and persist features.

    One FeaturesWriter per loader: features are grouped per video (under the
    module-level ``features_dir``) and flushed with a final ``dump()``.
    """
    features_writer = FeaturesWriter()
    for i_batch, (data, target, sampled_idx, dirs,
                  vid_names) in tqdm(enumerate(data_iter)):
        data = data.to(device)
        # Pure feature extraction -- no gradients needed.  (The original
        # also wrapped `data` in torch.autograd.Variable, a no-op that has
        # been deprecated since PyTorch 0.4.)
        with torch.no_grad():
            outputs = network(data)

            for i, (dir, vid_name, start_frame) in enumerate(
                    zip(dirs, vid_names,
                        sampled_idx.cpu().numpy())):
                dir = path.join(features_dir, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      start_frame=start_frame,
                                      dir=dir)

    features_writer.dump()


def main():
    """Extract C3D features for the train and validation video sets.

    Builds clip loaders from CLI arguments, runs a (optionally pretrained)
    C3D backbone over every clip, and writes per-video feature files into
    the module-level ``features_dir``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    args = parser.parse_args()
    set_logger(log_file=args.log_file, debug_mode=args.debug_mode)

    # Reproducibility: fix RNG seeds; benchmark mode speeds up fixed-shape
    # convolutions.
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    cudnn.benchmark = True

    # C3D normalization statistics (Sports-1M-style raw-pixel mean/std).
    mean = [124 / 255, 117 / 255, 104 / 255]
    std = [1 / (.0167 * 255)] * 3
    normalize = transforms.Normalize(mean=mean, std=std)

    clip_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])

    train_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path,
        clip_length=args.clip_length,
        frame_interval=args.train_frame_interval,
        video_transform=clip_transform,
        name='train',
        return_item_subpath=False,
    )

    train_iter = torch.utils.data.DataLoader(
        train_loader,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=32,  # 4, # change this part accordingly
        pin_memory=True)

    val_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path_test,
        clip_length=args.clip_length,
        frame_interval=args.val_frame_interval,
        video_transform=transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]),
        name='val',
        return_item_subpath=False,
    )

    val_iter = torch.utils.data.DataLoader(
        val_loader,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=32,  # 4, # change this part accordingly
        pin_memory=True)

    network = C3D(pretrained=args.pretrained_3d)
    network.to(device)

    if not path.exists(features_dir):
        mkdir(features_dir)

    # The train and validation passes were duplicated verbatim in the
    # original; both now go through the shared helper.
    _extract_and_dump(network, train_iter, device)
    _extract_and_dump(network, val_iter, device)
Ejemplo n.º 14
0
def rgb_test(param_model):
    """Score every .mp4 in the module-level ``video`` directory frame by frame.

    For each video, every 10th frame's softmax score for the module-level
    ``label`` class is appended to ``<video>/rgb_result.txt`` (0 when the
    class is not in the frame's top-5), with ``----`` separating videos.

    Args:
        param_model: CUDA classifier mapping a (1, 3, 224, 224) RGB batch
            to class scores (23 classes, per the original buffer size).
    """
    model = param_model

    # Per-frame preprocessing matching the network's training normalization.
    # Built once; it was needlessly recreated for every video before.
    clip_mean = [0.485, 0.456, 0.406] * 1
    clip_std = [0.229, 0.224, 0.225] * 1
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    # config the transform to match the network's format
    transform = video_transforms.Compose([
        # video_transforms.Scale((256)),
        video_transforms.CenterCrop((224)),
        video_transforms.ToTensor(),
        normalize,
    ])

    # prepare the translation dictionary label-action (index -> class name)
    data_handler = UCF101_splitter(
        os.getcwd() + '/datasets/ucf101_splits/', None)
    data_handler.get_action_index()
    class_to_idx = data_handler.action_label
    idx_to_class = {v: k for k, v in class_to_idx.items()}

    softmax = torch.nn.Softmax()

    # `with` guarantees the result file is closed even if a video fails.
    with open(video + "rgb_result.txt", 'w') as f:
        for file in os.listdir(video):
            if not file.endswith("mp4"):
                continue
            f.write(file + "\n")
            frame_count = 2
            vs = cv2.VideoCapture(video + file)
            while True:
                # read each frame and prepare it for feedforward in nn
                ret, orig_frame = vs.read()
                if ret is False:
                    break

                orig_frame = cv2.resize(orig_frame, (342, 256),
                                        interpolation=cv2.INTER_LINEAR)
                frame = cv2.cvtColor(orig_frame, cv2.COLOR_BGR2RGB)
                frame = transform(frame).view(1, 3, 224, 224).cuda()
                # BUG FIX: `cuda(async=True)` is a SyntaxError on Python 3.7+
                # (`async` became a keyword); the argument was renamed to
                # `non_blocking` in PyTorch 0.4.
                frame = frame.float().cuda(non_blocking=True)
                # feed the frame to the neural network
                nn_output = model(frame)
                # record the target-class score every 10 frames
                if frame_count % 10 == 0:
                    nn_output = softmax(nn_output)
                    nn_output = nn_output.data.cpu().numpy()
                    # Top-5 predicted class indices, best first.
                    preds = nn_output.argsort()[0][-5:][::-1]
                    pred_classes = [(idx_to_class[str(pred + 1)],
                                     nn_output[0, pred]) for pred in preds]

                    # Probability of the target label if it is in the top-5,
                    # else 0.
                    value = 0
                    for i in range(5):
                        if pred_classes[i][0] == label:
                            value = pred_classes[i][1]

                    f.write(str(value) + "\n")
                frame_count += 1

            # When everything done, release the capture
            f.write("----\n")
            vs.release()
Ejemplo n.º 15
0
def main(args):
    """Train a DynamicTrajectoryPredictor on BDD-10k optical-flow crops.

    Loads train/val/test splits for the chosen detector, trains for a fixed
    number of epochs keeping the best-MSE@15 model, evaluates it on the test
    split, and saves weights, predictions, targets and a results CSV.

    Args:
        args: parsed CLI namespace; only ``args.detector`` ('yolo' or other)
            is read, to pick the flow-image root and file names.
    """
    ############################################################################
    # Path to optical flow images
    if args.detector == 'yolo':
        img_root = './data/yolov3/'
    else:
        img_root = './data/faster-rcnn/'
    # Path to training and testing files
    load_path = './data/'
    # CPU or GPU?
    device = torch.device("cuda")

    # Model saving and loading
    model_save_path = './data/'
    model_load_path = './data/'

    # Training settings
    epochs = 15
    batch_size = 64
    learning_rate = 1e-5
    num_workers = 8
    weight_decay = 1e-2
    NUM_FLOW_FRAMES = 9
    training_proportion = 100  #  How much of the dataset to use? 100 = 100percent

    # Transformers for training and validation
    transform_train = video_transforms.Compose([
        video_transforms.MultiScaleCrop((224, 224), [1.0]),
        video_transforms.ToTensor(),
    ])

    transform_val = video_transforms.Compose([
        video_transforms.Scale((224)),
        video_transforms.ToTensor(),
    ])

    ############################################################################

    print('################### Training settings ###################')
    print('epochs:', epochs, '   batch_size:', batch_size, '   learning_rate:',
          learning_rate, '   num_workers:', num_workers, '   NUM_FLOW_FRAMES:',
          NUM_FLOW_FRAMES)

    results = pd.DataFrame()

    print('Training model')
    print(args.detector + '_bdd10k_val.pkl')

    try:
        testset = LocationDatasetBDD(filename='bdd10k_val_' + args.detector +
                                     '.pkl',
                                     root_dir=load_path,
                                     transform=transform_val,
                                     img_root=img_root,
                                     NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
        test_loader = torch.utils.data.DataLoader(testset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=num_workers)

        trainset = LocationDatasetBDD(filename='bdd10k_train_' +
                                      args.detector + '.pkl',
                                      root_dir=load_path,
                                      transform=transform_train,
                                      img_root=img_root,
                                      NUM_FLOW_FRAMES=NUM_FLOW_FRAMES,
                                      proportion=training_proportion)
        train_loader = torch.utils.data.DataLoader(trainset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=num_workers)
        valset = LocationDatasetBDD(filename='bdd10k_val_' + args.detector +
                                    '.pkl',
                                    root_dir=load_path,
                                    transform=transform_val,
                                    img_root=img_root,
                                    NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
        val_loader = torch.utils.data.DataLoader(valset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=num_workers)
    except Exception as err:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) and now reports the real cause
        # instead of hiding it.
        sys.exit(
            'ERROR: Could not load pkl data file. Check the bdd .pkl files are in the correct path.'
            ' (%s)' % err)

    model = DynamicTrajectoryPredictor(NUM_FLOW_FRAMES).to(device)
    model = model.float()

    model = nn.DataParallel(model)

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    loss_function = torch.nn.MSELoss()
    best_FDE = np.inf
    best_MSE = np.inf
    best_model = copy.deepcopy(model)

    # Begin training; keep a deep copy of the best model by MSE@15 frames.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch, loss_function)
        MSE_5, FDE_5, MSE_10, FDE_10, MSE_15, FDE_15, _, _ = test(
            model, device, val_loader, loss_function)
        if MSE_15 < best_MSE:
            best_MSE = MSE_15
            best_model = copy.deepcopy(model)
            best_FDE = FDE_15
            torch.save(
                best_model.state_dict(),
                model_save_path + args.detector + '_rn18_bdd10k_flow_css_' +
                str(NUM_FLOW_FRAMES) + 'stack_training_proportion_' +
                str(training_proportion) + '_shuffled_disp.weights')
        print(epoch)
        print('Best MSE:', round(best_MSE, 0))

    # Final held-out evaluation of the best validation model.
    test_mse_5, test_fde_5, test_mse_10, test_fde_10, test_mse_15, test_fde_15, all_outputs, all_targets = test(
        best_model, device, test_loader, loss_function)
    print('Test mse @ 15:', round(test_mse_15, 0))

    # Save the model
    torch.save(
        best_model.state_dict(),
        model_save_path + args.detector + 'bdd10k_rn18_flow_css_' +
        str(NUM_FLOW_FRAMES) + 'stack_training_proportion_' +
        str(training_proportion) + '_shuffled_disp.weights')

    # Save the predictions and the targets
    np.save(
        './' + args.detector + '_predictions_rn18_flow_css_' +
        str(NUM_FLOW_FRAMES) + 'stack_bdd10k_training_proportion_' +
        str(training_proportion) + '_shuffled_disp.npy', all_outputs)
    np.save(
        './' + args.detector + '_targets_rn18_flow_css_' +
        str(NUM_FLOW_FRAMES) + 'stack_bdd10k__shuffled_disp.npy', all_targets)

    # Save the results
    result = {
        'NUM_FLOW_FRAMES': NUM_FLOW_FRAMES,
        'training_proportion': training_proportion,
        'val_mse': best_MSE,
        'val_fde': best_FDE,
        'test_mse_5': test_mse_5,
        'test_fde_5': test_fde_5,
        'test_mse_10': test_mse_10,
        'test_fde_10': test_fde_10,
        'test_mse_15': test_mse_15,
        'test_fde_15': test_fde_15
    }
    # DataFrame.append was removed in pandas 2.0; build the row frame and
    # concatenate instead.
    results = pd.concat([results, pd.DataFrame([result])], ignore_index=True)
    results.to_csv('./' + args.detector + '_results_rn18_bdd10k.csv',
                   index=False)
Ejemplo n.º 16
0
def main(args):
    """Build JAAD train/val/test datasets and DataLoaders for flow-based
    trajectory prediction and report the flow-stack size of one test batch.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``model_load_path`` and ``model_save_path``.
    """
    ############################################################################
    # Path to optical flow images
    img_root = './data/human-annotated/'
    # Path to training and testing files
    load_path = './data/'
    # CPU or GPU?
    device = torch.device("cuda")

    # Training settings
    epochs = 30
    batch_size = 64
    learning_rate = 1e-5
    num_workers = 8
    pretrained = False
    weight_decay = 1e-2
    NUM_FLOW_FRAMES = 9

    model_load_path = args.model_load_path
    model_save_path = args.model_save_path

    # Transformers for training and validation
    transform_train = video_transforms.Compose([
        video_transforms.MultiScaleCrop((224, 224), [1.0]),
        video_transforms.ToTensor(),
    ])
    transform_val = video_transforms.Compose([
        video_transforms.Scale((224)),
        video_transforms.ToTensor(),
    ])
    ############################################################################

    print('################### Training settings ###################')
    print('epochs:', epochs, '   batch_size:', batch_size, '   learning_rate:',
          learning_rate, '   num_workers:', num_workers, '   model_load_path:',
          model_load_path, '   NUM_FLOW_FRAMES:', NUM_FLOW_FRAMES)

    results = pd.DataFrame()
    testset = LocationDatasetJAAD(filename='jaad_cv_test.pkl',
                                  root_dir=load_path,
                                  transform=transform_val,
                                  img_root=img_root,
                                  NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers)
    trainset = LocationDatasetJAAD(filename='jaad_cv_train_' + str(1) + '.pkl',
                                   root_dir=load_path,
                                   transform=transform_train,
                                   img_root=img_root,
                                   NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    valset = LocationDatasetJAAD(filename='jaad_cv_val_' + str(1) + '.pkl',
                                 root_dir=load_path,
                                 transform=transform_val,
                                 img_root=img_root,
                                 NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers)
    # BUG FIX: a DataLoader is not subscriptable, so the original
    # ``test_loader['flow_stack']`` raised TypeError.  Pull one batch and
    # inspect its flow-stack tensor instead.
    sample_batch = next(iter(test_loader))
    print('test_loader flow_stack size = ', sample_batch['flow_stack'].size())
def main():
    """Train an action-recognition network on RGB or optical-flow frames.

    Relies on module-level globals ``args``, ``f_log`` and ``best_prec1``,
    and on helpers defined elsewhere in this file: ``build_model``,
    ``train``, ``validate``, ``save_checkpoint``, ``adjust_learning_rate``.
    Checkpoints (and validates) every ``args.save_freq`` epochs.
    """
    global best_prec1

    # create model
    print("Building model ... ")
    print("Building model ... ", file=f_log)
    model = build_model(resume_path=args.resume)
    print("Model %s is loaded. " % (args.arch))
    print("Model %s is loaded. " % (args.arch), file=f_log)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    # Data transforming: per-modality channel statistics and crop ratios.
    if args.modality == "rgb":
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406]  # ImageNet RGB mean
        clip_std = [0.229, 0.224, 0.225]   # ImageNet RGB std
    elif args.modality == "tvl1_flow" or args.modality == "lk_flow":
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75]
        clip_mean = [0.5, 0.5]             # x/y flow channels
        clip_std = [0.226, 0.226]
    else:
        print("No such modality. Only rgb and flow supported.")
        print("No such modality. Only rgb and flow supported.", file=f_log)
        # BUG FIX: the original fell through after this error and crashed
        # later with NameError on clip_mean/clip_std; abort explicitly.
        return

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    train_transform = video_transforms.Compose([
        #video_transforms.Scale((288)),
        video_transforms.MultiScaleCrop((256, 256), scale_ratios),
        video_transforms.RandomHorizontalFlip(),
        video_transforms.ToTensor(),
        normalize,
    ])

    val_transform = video_transforms.Compose([
        #video_transforms.Scale((288)),
        video_transforms.CenterCrop((256)),
        video_transforms.ToTensor(),
        normalize,
    ])

    # data loading
    train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split)
    train_split_file = os.path.join(args.settings, args.dataset, train_setting_file)
    val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split)
    val_split_file = os.path.join(args.settings, args.dataset, val_setting_file)
    if not os.path.exists(train_split_file) or not os.path.exists(val_split_file):
        print("No split file exists in %s directory. Preprocess the dataset first" % (args.settings))
        print("No split file exists in %s directory. Preprocess the dataset first" % (args.settings), file=f_log)
        # BUG FIX: abort here instead of failing inside the dataset loader.
        return

    train_dataset = datasets.__dict__[args.dataset](setting=train_split_file, root=args.data, train=True,
                               new_width=args.new_width, new_height=args.new_height, new_length=args.new_length,
                               target_width=args.new_width, target_height=args.new_height,
                               modality=args.modality, num_segments=args.num_segments, transform=train_transform,
                               name_pattern='frame%06d.jpg')

    val_dataset = datasets.__dict__[args.dataset](setting=val_split_file, root=args.data, train=False,
                             new_width=args.new_width, new_height=args.new_height, new_length=args.new_length,
                             target_width=args.new_width, target_height=args.new_height,
                             modality=args.modality, num_segments=args.num_segments, transform=val_transform,
                             name_pattern='frame%06d.jpg')

    print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset),
                                                                           len(train_dataset),
                                                                           len(val_dataset)))
    print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset),
                                                                           len(train_dataset),
                                                                           len(val_dataset)), file=f_log)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers) #, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers) #, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        print("start epoch ", epoch)
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set (only on checkpoint epochs)
        prec1 = 0.0
        if (epoch + 1) % args.save_freq == 0:
            prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if (epoch + 1) % args.save_freq == 0:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar")
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer' : optimizer.state_dict(),
            }, is_best, checkpoint_name, args.save_path)
# Ejemplo n.º 18
# 0
def main():
    """Train an action-recognition network on rgb/rgb2/flow/rhythm modalities,
    with optional early stopping (``args.es``).

    Uses module-level globals ``args`` and ``prec_list``; helpers
    ``logging``, ``build_model``, ``train``, ``validate``, ``save_checkpoint``,
    ``save_precision``, ``adjust_learning_rate``, ``createNewDataset`` and
    ``EarlyStopping`` are defined elsewhere in this file.
    """
    global args, prec_list
    prec_list = []
    args = parser.parse_args()
    full_path = logging(args)

    print(args.modality + " network trained with the split " +
          str(args.split) + ".")

    # create model
    print("Building model ... ")
    exits_model, model = build_model(int(args.start_epoch),
                                     args.pretrain_weights)
    if not exits_model:
        return
    else:
        print("Model %s is loaded. " % (args.arch))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    # Data transforming: per-modality channel statistics and crop ratios.
    if args.modality == "rgb" or args.modality == "rgb2":
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406] * args.new_length  # ImageNet mean
        clip_std = [0.229, 0.224, 0.225] * args.new_length   # ImageNet std
    elif args.modality == "flow" or args.modality == "rhythm":
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75]
        clip_mean = [0.5, 0.5] * args.new_length
        clip_std = [0.226, 0.226] * args.new_length
    else:
        print("No such modality. Only rgb and flow supported.")
        # BUG FIX: the original fell through and crashed later with a
        # NameError on clip_mean/clip_std; bail out explicitly.
        return

    # BUG FIX: ``args.arch.find("inception_v3") > 0`` chose 224 when the arch
    # string *started* with "inception_v3" (find() == 0); membership test is
    # both correct and idiomatic.
    new_size = 299 if "inception_v3" in args.arch else 224

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    train_transform = video_transforms.Compose([
        #video_transforms.Scale((256)),
        video_transforms.MultiScaleCrop((new_size, new_size), scale_ratios),
        video_transforms.RandomHorizontalFlip(),
        video_transforms.ToTensor(),
        normalize,
    ])

    if args.es:
        val_transform = video_transforms.Compose([
            # video_transforms.Scale((256)),
            video_transforms.CenterCrop((new_size)),
            video_transforms.ToTensor(),
            normalize,
        ])

    modality_ = "rgb" if (args.modality == "rhythm"
                          or args.modality[:3] == "rgb") else "flow"

    if args.modality == "rgb2":
        createNewDataset("train_split%d.txt", "new_train.txt", modality_)
        #createNewDataset("val_%s_split%d.txt", "new_val.txt",modality_)

    # data loading
    train_setting_file = "new_train.txt" if args.modality == "rgb2" else "train_split%d.txt" % (
        args.split)
    train_split_file = os.path.join(args.settings, args.dataset,
                                    train_setting_file)

    if not os.path.exists(
            train_split_file):  # or not os.path.exists(val_split_file):
        print(
            "No split file exists in %s directory. Preprocess the dataset first"
            % (args.settings))
        # BUG FIX: abort instead of failing later inside the dataset loader.
        return

    extension = ".png" if args.dataset == "hmdb51" and args.modality == "rhythm" else ".jpg"
    direction_file = "direction.txt" if args.vr_approach == 3 else "direction_video.txt"
    direction_path = os.path.join(args.settings, args.dataset, direction_file)

    train_dataset = datasets.__dict__['dataset'](
        root=args.data,
        source=train_split_file,
        phase="train",
        modality=args.modality,
        is_color=is_color,
        new_length=args.new_length,
        new_width=args.new_width,
        new_height=args.new_height,
        video_transform=train_transform,
        approach_VR=args.vr_approach,
        extension=extension,
        direction_path=direction_path)

    if args.es:
        val_setting_file = "val_split%d.txt" % (args.split)
        val_split_file = os.path.join(args.settings, args.dataset,
                                      val_setting_file)

        if not os.path.exists(val_split_file):
            print(
                "No split file exists in %s directory. Preprocess the dataset first"
                % (args.settings))
            # BUG FIX: abort instead of failing later inside the dataset loader.
            return

        val_dataset = datasets.__dict__['dataset'](
            root=args.data,
            source=val_split_file,
            phase="val",
            modality=args.modality,
            is_color=is_color,
            new_length=args.new_length,
            new_width=args.new_width,
            new_height=args.new_height,
            video_transform=val_transform,
            approach_VR=args.vr_approach,
            extension=extension,
            direction_path=direction_path)

        print('{} samples found, {} train samples and {} validation samples.'.
              format(
                  len(val_dataset) + len(train_dataset), len(train_dataset),
                  len(val_dataset)))
    else:
        print('{} train samples found.'.format(len(train_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    if args.es:
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=True,
                                                 num_workers=args.workers,
                                                 pin_memory=True)

        early_stop = EarlyStopping(verbose=True,
                                   log_path=os.path.join(
                                       full_path, "early_stopping.json"))

    is_best = False

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        if args.es:
            # evaluate on validation set and update early-stopping state
            losses = validate(val_loader, model, criterion)

            is_best = early_stop(losses.avg, epoch)

        if (epoch + 1) % args.save_freq == 0 or is_best:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" +
                                           args.modality + "_split_" +
                                           str(args.split) + ".pth.tar")
            es_val = float('inf') if not args.es else early_stop.val_loss_min
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'val_loss_min': es_val
                }, is_best, checkpoint_name,
                os.path.join(full_path, "checkpoints"))

        prec_name = "%03d_%s" % (epoch + 1,
                                 "prec_split_" + str(args.split) + ".txt")
        save_precision(prec_name, os.path.join(full_path, "precision"))

        if args.es and early_stop.early_stop:
            break

    if not args.es:  # Final model (no early stopping was active)
        checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" +
                                       args.modality + "_split_" +
                                       str(args.split) + ".pth.tar")
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'val_loss_min': float('inf')
            }, True, checkpoint_name, os.path.join(full_path, "checkpoints"))
def VideoTemporalPrediction(
        mode,
        vid_name,
        net,
        num_categories,
        start_frame=0,
        num_frames=0,
        num_samples=25,
        optical_flow_frames=10,
        new_size = 299,
        ext = ".jpg"
        ):
    """Run 10-crop temporal-stream prediction over a video's optical-flow frames.

    Stacks ``optical_flow_frames`` x/y flow pairs at ``num_samples`` evenly
    spaced positions, builds 5 spatial crops plus their horizontal flips
    (10 views per sample), and feeds all views through ``net`` on the GPU.

    Args:
        mode: filename prefix of the flow images ('<mode>_x_00001<ext>', ...).
        vid_name: directory containing the per-frame flow images.
        net: network returning ``num_categories`` scores per input.
        num_categories: size of the output score vector.
        start_frame: frame offset added to every flow index.
        num_frames: video length; 0 means count files in ``vid_name``.
        num_samples: number of temporal positions to sample.
        optical_flow_frames: flow pairs stacked per sample (2 channels each).
        new_size: network input size; 299 selects Inception-style resizing,
            anything else the ResNet-style sizes.
        ext: flow image extension.

    Returns:
        np.ndarray of shape (num_categories, num_samples * 10) with one score
        column per sampled view.
    """
    # Determine video duration either from the directory listing or the caller.
    if num_frames == 0:
        imglist = os.listdir(vid_name)
        duration = len(imglist)
    else:
        duration = num_frames

    # selection: stride between consecutive temporal sample positions
    step = int(math.floor((duration-optical_flow_frames+1)/num_samples))
    # One mean/std per channel: optical_flow_frames pairs x 2 directions.
    clip_mean = [0.5] * optical_flow_frames * 2
    clip_std = [0.226] * optical_flow_frames * 2

    normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
    test_transform = video_transforms.Compose([
            video_transforms.ToTensor(),
            normalize
        ])


    # inception = 320,360, resnet = 240, 320
    width = 320 if new_size==299 else 240
    height = 360 if new_size==299 else 320
    deep = optical_flow_frames*2
    # NOTE: axis 0 of the arrays has extent ``width`` and axis 1 ``height``
    # (cv2.resize below takes dsize=(height, width) and returns that shape).
    dims = (width,height,deep,num_samples)
    flow = np.zeros(shape=dims, dtype=np.float64)
    flow_flip = np.zeros(shape=dims, dtype=np.float64)

    # Fill the flow stacks; flow indices are 1-based on disk.
    for i in range(num_samples):
        for j in range(optical_flow_frames):
            flow_x_file = os.path.join(vid_name, mode+'_x_{0:05d}{1}'.format(i*step+j+1 + start_frame, ext))
            flow_y_file = os.path.join(vid_name, mode+'_y_{0:05d}{1}'.format(i*step+j+1 + start_frame, ext))
            img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE)
            img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE)
            img_x = cv2.resize(img_x, dims[1::-1])
            img_y = cv2.resize(img_y, dims[1::-1])

            flow[:,:,j*2  ,i] = img_x
            flow[:,:,j*2+1,i] = img_y

            # Mirroring reverses horizontal flow direction, hence 255 - x.
            flow_flip[:,:,j*2  ,i] = 255 - img_x[:, ::-1]
            flow_flip[:,:,j*2+1,i] = img_y[:, ::-1]

    # crop 299 = inception, 224 = resnet
    size = new_size
    # corner = [offset along the ``height`` axis, offset along the ``width``
    # axis]; note corner[1] indexes axis 0 below — looks swapped, but axis 0
    # has extent ``width``, so the center crop is correct.
    corner = [(height-size)//2, (width-size)//2]
    flow_1 = flow[:size, :size, :,:]
    flow_2 = flow[:size, -size:, :,:]
    flow_3 = flow[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:]
    flow_4 = flow[-size:, :size, :,:]
    flow_5 = flow[-size:, -size:, :,:]
    flow_f_1 = flow_flip[:size, :size, :,:]
    flow_f_2 = flow_flip[:size, -size:, :,:]
    flow_f_3 = flow_flip[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:]
    flow_f_4 = flow_flip[-size:, :size, :,:]
    flow_f_5 = flow_flip[-size:, -size:, :,:]

    # Concatenate the 10 views along the sample axis: 5 crops + 5 flipped.
    flow = np.concatenate((flow_1,flow_2,flow_3,flow_4,flow_5,flow_f_1,flow_f_2,flow_f_3,flow_f_4,flow_f_5), axis=3)

    # Transform each view to a normalized tensor, then batch through the net.
    _, _, _, c = flow.shape
    flow_list = []
    for c_index in range(c):
        cur_img = flow[:,:,:,c_index].squeeze()
        cur_img_tensor = test_transform(cur_img)
        flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    flow_np = np.concatenate(flow_list,axis=0)
    batch_size = 15
    prediction = np.zeros((num_categories,flow.shape[3]))
    num_batches = int(math.ceil(float(flow.shape[3])/batch_size))

    for bb in range(num_batches):
        span = range(batch_size*bb, min(flow.shape[3],batch_size*(bb+1)))

        input_data = flow_np[span,:,:,:]
        imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
        imgDataVar = torch.autograd.Variable(imgDataTensor)
        output = net(imgDataVar)
        result = output.data.cpu().numpy()
        # Scores come back (batch, categories); store as (categories, batch).
        prediction[:, span] = np.transpose(result)

    return prediction
def main():
    """Train an action-recognition network on rgb/rhythm/history/flow data.

    Uses module-level globals ``args`` and ``best_prec1``; helpers
    ``build_model``, ``train``, ``validate``, ``save_checkpoint`` and
    ``adjust_learning_rate`` are defined elsewhere in this file.
    """
    global args, best_prec1
    args = parser.parse_args()

    print(args.modality + " network trained whith the split " +
          str(args.split) + ".")

    # create model
    print("Building model ... ")
    exits_model, model = build_model(int(args.start_epoch))
    if not exits_model:
        return
    else:
        print("Model %s is loaded. " % (args.arch))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # create file where we allocate the models by each args.save_freq epochs
    if not os.path.exists(args.resume):
        os.makedirs(args.resume)
    print("Saving everything to directory %s." % (args.resume))

    cudnn.benchmark = True

    # Data transforming: per-modality channel statistics and crop ratios.
    if args.modality == "rgb" or args.modality == "rhythm" or args.modality == "history":
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406] * args.new_length
        # BUG FIX: the first std was 0.299 — a typo for the standard ImageNet
        # value 0.229 (consistent with the other training scripts in this file).
        clip_std = [0.229, 0.224, 0.225] * args.new_length
    elif args.modality == "flow":
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75]
        clip_mean = [0.5, 0.5] * args.new_length
        clip_std = [0.226, 0.226] * args.new_length
    else:
        print("No such modality. Only rgb and flow supported.")
        # BUG FIX: the original fell through and crashed later with a
        # NameError on clip_mean/clip_std; bail out explicitly.
        return

    new_size = 299 if args.arch == 'rgb_inception_v3' else 224

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    train_transform = video_transforms.Compose([
        #video_transforms.Scale((256)),
        video_transforms.MultiScaleCrop((new_size, new_size), scale_ratios),
        video_transforms.RandomHorizontalFlip(),
        video_transforms.ToTensor(),
        normalize,
    ])

    val_transform = video_transforms.Compose([
        # video_transforms.Scale((256)),
        video_transforms.CenterCrop((new_size)),
        video_transforms.ToTensor(),
        normalize,
    ])

    #createNewDataset("train_%s_split%d.txt" , "new_train.txt")
    #createNewDataset("val_%s_split%d.txt", "new_test.txt")

    # data loading: rhythm/history reuse the rgb split files
    #train_setting_file = 'new_train.txt'
    modality_ = "rgb" if (args.modality == "rhythm"
                          or args.modality == "history") else args.modality
    train_setting_file = "train_%s_split%d.txt" % (modality_, args.split)
    train_split_file = os.path.join(args.settings, args.dataset,
                                    train_setting_file)
    #val_setting_file = 'new_test.txt'
    val_setting_file = "val_%s_split%d.txt" % (modality_, args.split)
    val_split_file = os.path.join(args.settings, args.dataset,
                                  val_setting_file)
    if not os.path.exists(train_split_file) or not os.path.exists(
            val_split_file):
        print(
            "No split file exists in %s directory. Preprocess the dataset first"
            % (args.settings))
        # BUG FIX: abort instead of failing later inside the dataset loader.
        return

    train_dataset = datasets.__dict__['dataset'](
        root=args.data,
        source=train_split_file,
        phase="train",
        modality=args.modality,
        is_color=is_color,
        new_length=args.new_length,
        new_width=args.new_width,
        new_height=args.new_height,
        video_transform=train_transform)
    val_dataset = datasets.__dict__['dataset'](root=args.data,
                                               source=val_split_file,
                                               phase="val",
                                               modality=args.modality,
                                               is_color=is_color,
                                               new_length=args.new_length,
                                               new_width=args.new_width,
                                               new_height=args.new_height,
                                               video_transform=val_transform)

    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset),
        len(val_dataset)))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set (only on checkpoint epochs)
        prec1 = 0.0
        if (epoch + 1) % args.save_freq == 0:
            prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if (epoch + 1) % args.save_freq == 0:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" +
                                           args.modality + "_split_" +
                                           str(args.split) + ".pth.tar")
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, checkpoint_name, args.resume)
# Ejemplo n.º 21
# 0
def VideoTemporalPrediction(vid_name,
                            net,
                            num_categories,
                            start_frame=0,
                            num_frames=0,
                            num_samples=5,
                            optical_flow_frames=25):
    """Run 10-crop temporal-stream prediction over a video's optical-flow frames.

    Stacks ``optical_flow_frames`` x/y flow pairs at ``num_samples`` evenly
    spaced positions, builds 5 spatial 224x224 crops plus their horizontal
    flips (10 views per sample), and feeds every view through ``net``.

    Args:
        vid_name: directory containing 'flow_x_NNNN.jpg' / 'flow_y_NNNN.jpg'.
        net: network returning ``num_categories`` scores per input.
        num_categories: size of the output score vector.
        start_frame: frame offset added to every flow index.
        num_frames: video length; 0 means count flow_x files in ``vid_name``.
        num_samples: number of temporal positions to sample.
        optical_flow_frames: flow pairs stacked per sample (2 channels each).

    Returns:
        np.ndarray of shape (num_categories, num_samples * 10) with one score
        column per sampled view.
    """
    if num_frames == 0:
        # print(vid_name)
        imglist = glob.glob(os.path.join(vid_name, '*flow_x*.jpg'))
        duration = len(imglist)
    else:
        duration = num_frames

    # BUG FIX: the mean/std lists were hard-coded to 20 channels (10 flow
    # pairs) while the stacked input actually has optical_flow_frames * 2
    # channels (50 with the default of 25), so Normalize did not match the
    # tensor.  Size them from the parameter, as the sibling implementation
    # in this file does.
    num_channels = optical_flow_frames * 2
    clip_mean = [0.5] * num_channels
    clip_std = [0.226] * num_channels
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    # selection: stride between consecutive temporal sample positions
    step = int(math.floor((duration - optical_flow_frames + 1) / num_samples))
    dims = (256, 340, optical_flow_frames * 2, num_samples)
    flow = np.zeros(shape=dims, dtype=np.float64)
    flow_flip = np.zeros(shape=dims, dtype=np.float64)

    # Fill the flow stacks; flow indices are 1-based on disk.
    for i in range(num_samples):
        for j in range(optical_flow_frames):
            flow_x_file = os.path.join(
                vid_name,
                'flow_x_{0:04d}.jpg'.format(i * step + j + 1 + start_frame))
            flow_y_file = os.path.join(
                vid_name,
                'flow_y_{0:04d}.jpg'.format(i * step + j + 1 + start_frame))
            img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE)
            img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE)
            img_x = cv2.resize(img_x, dims[1::-1])
            img_y = cv2.resize(img_y, dims[1::-1])

            flow[:, :, j * 2, i] = img_x
            flow[:, :, j * 2 + 1, i] = img_y

            # Mirroring reverses horizontal flow direction, hence 255 - x.
            flow_flip[:, :, j * 2, i] = 255 - img_x[:, ::-1]
            flow_flip[:, :, j * 2 + 1, i] = img_y[:, ::-1]

    # crop: 4 corners + center, for both the original and flipped stacks
    flow_1 = flow[:224, :224, :, :]
    flow_2 = flow[:224, -224:, :, :]
    flow_3 = flow[16:240, 60:284, :, :]
    flow_4 = flow[-224:, :224, :, :]
    flow_5 = flow[-224:, -224:, :, :]
    flow_f_1 = flow_flip[:224, :224, :, :]
    flow_f_2 = flow_flip[:224, -224:, :, :]
    flow_f_3 = flow_flip[16:240, 60:284, :, :]
    flow_f_4 = flow_flip[-224:, :224, :, :]
    flow_f_5 = flow_flip[-224:, -224:, :, :]

    flow = np.concatenate((flow_1, flow_2, flow_3, flow_4, flow_5, flow_f_1,
                           flow_f_2, flow_f_3, flow_f_4, flow_f_5),
                          axis=3)

    # Transform each view to a normalized tensor, then batch through the net.
    _, _, _, c = flow.shape
    flow_list = []
    for c_index in range(c):
        cur_img = flow[:, :, :, c_index].squeeze()
        cur_img_tensor = val_transform(cur_img)
        flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    flow_np = np.concatenate(flow_list, axis=0)

    batch_size = 25
    prediction = np.zeros((num_categories, flow.shape[3]))
    num_batches = int(math.ceil(float(flow.shape[3]) / batch_size))

    for bb in range(num_batches):
        span = range(batch_size * bb, min(flow.shape[3],
                                          batch_size * (bb + 1)))

        input_data = flow_np[span, :, :, :]
        imgDataTensor = torch.from_numpy(input_data).type(
            torch.FloatTensor).cuda()
        imgDataVar = torch.autograd.Variable(imgDataTensor)
        output = net(imgDataVar)
        result = output.data.cpu().numpy()
        # Scores come back (batch, categories); store as (categories, batch).
        prediction[:, span] = np.transpose(result)

    return prediction
# Ejemplo n.º 22
# 0
def main():
    """Load a pretrained DynamicTrajectoryPredictor and run it over the
    train/val/test folds to extract and save per-fold feature arrays.

    NOTE(review): this function reads a module-level ``result`` list that it
    clears but never appends to — presumably a forward hook registered inside
    the model populates it during ``model(flow)``; confirm where it is filled.
    """
    # Select GPU when available; the model is wrapped in DataParallel below.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DynamicTrajectoryPredictor(9).to(device)
    model = model.float()
    model = nn.DataParallel(model)
    # summary(model, input_size=(18, 224, 224))
    # strict=False (positional) tolerates missing/unexpected keys in the
    # checkpoint.
    model.load_state_dict(
        torch.load(
            './data/yolomyvideo_rn50_flow_css_9stack_training_proportion_100_shuffled_disp.weights'
        ), False)
    model.eval()

    load_path = './data/'
    img_root = '../../flow_result/'

    # Training settings (only batch_size/num_workers/NUM_FLOW_FRAMES are
    # actually used by the extraction loop below)
    epochs = 15
    batch_size = 1
    learning_rate = 1e-5
    num_workers = 8
    weight_decay = 1e-2
    NUM_FLOW_FRAMES = 9
    training_proportion = 100

    # Transformers
    transform_val = video_transforms.Compose([
        video_transforms.Scale((224)),
        video_transforms.ToTensor(),
    ])

    # Extract features for every fold of every split.
    for fold_type in ['train', 'val', 'test']:
        for fold_num in range(1, 4):
            result.clear()
            valset = LocationDatasetBDD(filename=fold_type + str(fold_num) +
                                        '_myvideo_location_features_yolo.pkl',
                                        root_dir=load_path,
                                        transform=transform_val,
                                        img_root=img_root,
                                        NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
            val_loader = torch.utils.data.DataLoader(valset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)

            # Inference only: freeze all parameters.
            for param in model.parameters():
                param.requires_grad = False

            start_time = time.time()
            for batch_idx, data in enumerate(val_loader):
                # Progress report every 100 batches.
                if batch_idx % 100 == 0:
                    end_time = time.time()
                    print(fold_type + ':', fold_num, ' Batch ', batch_idx,
                          ' of ', len(val_loader), ' Cost time: ',
                          end_time - start_time)
                    start_time = end_time
                #    break

                # if batch_idx == 20:
                #     break

                # The forward pass presumably triggers the hook that fills
                # ``result``; the direct output is unused here.
                flow = data['flow_stack'].to(device)
                flow = flow.float()
                output = model(flow)

                # print('Processing: ', batch_idx)

            # assumes each hooked feature has 2048 elements — TODO confirm
            ans = np.array(result).reshape(-1, 2048)
            print(ans.shape)

            with open('record_extract.txt', 'w') as f:
                f.write(fold_type + ' ' + str(fold_num) + ' ' + str(ans.shape))

            np.save(
                './data/sted_feature/fold_' + str(fold_num) + '_' + fold_type +
                '_dtp_features.npy', ans)
# Ejemplo n.º 23
# 0
def main(args):
    """Cross-validated training of DynamicTrajectoryPredictor on JAAD.

    For each of the five folds this loads the train/val/test splits, trains
    for a fixed number of epochs, keeps the model with the best validation
    MSE@15, evaluates it on the test set, and saves the model weights, the
    predictions/targets arrays, and a CSV of per-fold metrics.

    Args:
        args: Parsed command-line arguments providing ``model_load_path``
            (optional checkpoint to warm-start from) and ``model_save_path``.
    """
    ############################################################################
    # Path to optical flow images
    img_root = './data/human-annotated/'
    # Path to training and testing files
    load_path = './data/'
    # CPU or GPU?
    device = torch.device("cuda")

    # Training settings
    epochs = 30
    batch_size = 64
    learning_rate = 1e-5
    num_workers = 8
    pretrained = False
    weight_decay = 1e-2
    NUM_FLOW_FRAMES = 9

    model_load_path = args.model_load_path
    model_save_path = args.model_save_path

    # Transformers for training and validation
    transform_train = video_transforms.Compose([
        video_transforms.MultiScaleCrop((224, 224), [1.0]),
        video_transforms.ToTensor(),
    ])
    transform_val = video_transforms.Compose([
        video_transforms.Scale((224)),
        video_transforms.ToTensor(),
    ])
    ############################################################################

    print('################### Training settings ###################')
    print('epochs:', epochs, '   batch_size:', batch_size, '   learning_rate:',
          learning_rate, '   num_workers:', num_workers, '   model_load_path:',
          model_load_path, '   NUM_FLOW_FRAMES:', NUM_FLOW_FRAMES)

    results = pd.DataFrame()

    for fold in [1, 2, 3, 4, 5]:
        # Pretrained weights fine-tune with a smaller LR for fewer epochs.
        if pretrained:
            learning_rate = 1e-6
            epochs = 30
        else:
            learning_rate = 1e-5
            epochs = 40

        print('Training on fold ' + str(fold))

        try:
            testset = LocationDatasetJAAD(filename='jaad_cv_test.pkl',
                                          root_dir=load_path,
                                          transform=transform_val,
                                          img_root=img_root,
                                          NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
            test_loader = torch.utils.data.DataLoader(testset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=num_workers)
            trainset = LocationDatasetJAAD(filename='jaad_cv_train_' +
                                           str(fold) + '.pkl',
                                           root_dir=load_path,
                                           transform=transform_train,
                                           img_root=img_root,
                                           NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
            train_loader = torch.utils.data.DataLoader(trainset,
                                                       batch_size=batch_size,
                                                       shuffle=True,
                                                       num_workers=num_workers)
            valset = LocationDatasetJAAD(filename='jaad_cv_val_' + str(fold) +
                                         '.pkl',
                                         root_dir=load_path,
                                         transform=transform_val,
                                         img_root=img_root,
                                         NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
            val_loader = torch.utils.data.DataLoader(valset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        except Exception:
            sys.exit(
                'ERROR: Could not load pkl data file. Check the jaad .pkl files are in the correct path.'
            )

        model = DynamicTrajectoryPredictor(NUM_FLOW_FRAMES).to(device)
        model = model.float()

        model = nn.DataParallel(model)

        if model_load_path is not None:
            print('loading model from', model_load_path)
            model.load_state_dict(torch.load(model_load_path))

        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
        loss_function = torch.nn.MSELoss()
        best_FDE = np.inf
        best_MSE = np.inf
        best_model = copy.deepcopy(model)

        # Begin training
        for epoch in range(1, epochs + 1):
            # Set learning rate to 1e-6 after 30 epochs.
            # Fix: was `weight_weight_decay=decay`, which raised a TypeError
            # as soon as epoch 31 was reached.
            if epoch > 30:
                optimizer = optim.Adam(model.parameters(),
                                       lr=1e-6,
                                       weight_decay=weight_decay)

            train(model, device, train_loader, optimizer, epoch, loss_function)
            MSE_5, FDE_5, MSE_10, FDE_10, MSE_15, FDE_15, _, _ = test(
                model, device, val_loader, loss_function)
            # Model selection on validation MSE at the 15-frame horizon.
            if MSE_15 < best_MSE:
                best_MSE = MSE_15
                best_model = copy.deepcopy(model)
                best_FDE = FDE_15
            print(epoch)
            print('Best MSE:', round(best_MSE, 0))

        test_mse_5, test_fde_5, test_mse_10, test_fde_10, test_mse_15, test_fde_15, all_outputs, all_targets = test(
            best_model, device, test_loader, loss_function)
        print('Test mse @ 15:', round(test_mse_15, 0))

        # Save the model
        torch.save(
            best_model.state_dict(), model_save_path + 'rn18_flow_css_' +
            str(NUM_FLOW_FRAMES) + 'stack_fold_' + str(fold) + '_pretrained-' +
            str(pretrained) + '_disp.weights')

        # Save the predictions and the targets
        np.save(
            './predictions_rn18_flow_css_' + str(NUM_FLOW_FRAMES) +
            'stack_jaad_fold_' + str(fold) + 'pretrained-' + str(pretrained) +
            '_disp.npy', all_outputs)
        np.save(
            './targets_rn18_flow_css_' + str(NUM_FLOW_FRAMES) +
            'stack_jaad_fold_' + str(fold) + 'pretrained-' + str(pretrained) +
            '_disp.npy', all_targets)

        # Save the results (written every fold so partial runs keep output).
        # `DataFrame.append` was removed in pandas 2.0; concat is equivalent.
        result = {
            'NUM_FLOW_FRAMES': NUM_FLOW_FRAMES,
            'fold': fold,
            'val_mse': best_MSE,
            'val_fde': best_FDE,
            'test_mse_5': test_mse_5,
            'test_fde_5': test_fde_5,
            'test_mse_10': test_mse_10,
            'test_fde_10': test_fde_10,
            'test_mse_15': test_mse_15,
            'test_fde_15': test_fde_15,
            'pretrained': pretrained
        }
        results = pd.concat([results, pd.DataFrame([result])],
                            ignore_index=True)
        results.to_csv('./results_rn18_jaad.csv', index=False)
Ejemplo n.º 24
0
def main():
    """Train or evaluate a two-stream (rgb/flow) action-recognition model.

    Parses global ``args``, builds the model, loss, SGD optimizer and the
    train/val data pipelines for the chosen modality, restores a hard-coded
    checkpoint, then runs the epoch loop, validating and checkpointing every
    ``args.save_freq`` epochs.
    """
    global args, best_prec1
    args = parser.parse_args()

    # create model
    print("Building model ... ")
    model = build_model()
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model)

    print("Model %s is loaded. " % (args.arch))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # `args.resume` is used as the checkpoint *output* directory here.
    if not os.path.exists(args.resume):
        os.makedirs(args.resume)
    print("Saving everything to directory %s." % (args.resume))

    cudnn.benchmark = True

    # Data transforming: per-channel stats are repeated once per stacked
    # input frame (`args.new_length`).
    if args.modality == "rgb":
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406] * args.new_length
        clip_std = [0.229, 0.224, 0.225] * args.new_length
    elif args.modality == "flow":
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75]
        clip_mean = [0.5, 0.5] * args.new_length
        clip_std = [0.226, 0.226] * args.new_length
    else:
        # NOTE(review): for any other modality this only prints; is_color,
        # scale_ratios and the clip stats stay unbound and the lines below
        # raise NameError — confirm callers only ever pass rgb/flow.
        print("No such modality. Only rgb and flow supported.")

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    train_transform = video_transforms.Compose([
        # video_transforms.Scale((256)),
        video_transforms.MultiScaleCrop((224, 224), scale_ratios),
        video_transforms.RandomHorizontalFlip(),
        video_transforms.ToTensor(),
        normalize,
    ])

    val_transform = video_transforms.Compose([
        # video_transforms.Scale((256)),
        video_transforms.CenterCrop((224)),
        video_transforms.ToTensor(),
        normalize,
    ])

    # data loading
    #     train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split)
    #     train_split_file = os.path.join(args.settings, args.dataset, train_setting_file)
    #     val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split)
    #     val_split_file = os.path.join(args.settings, args.dataset, val_setting_file)
    #     if not os.path.exists(train_split_file) or not os.path.exists(val_split_file):
    #         print("No split file exists in %s directory. Preprocess the dataset first" % (args.settings))
    # NOTE(review): split files are hard-coded rather than derived from
    # args.settings (see the commented-out block above) — confirm intended.
    train_split_file = './datasets/settings/train_set_detail.csv'
    val_split_file = './datasets/settings/val_set_detail.csv'

    train_dataset = datasets.__dict__[args.dataset](
        root=args.data,  # NOTE: need to change
        source=train_split_file,
        phase="train",
        modality=args.modality,
        is_color=is_color,
        new_length=args.new_length,
        new_width=args.new_width,
        new_height=args.new_height,
        video_transform=train_transform,
        name_pattern="frame%06d.jpg")  # frame000001
    val_dataset = datasets.__dict__[args.dataset](
        root=args.data,
        source=val_split_file,
        phase="val",
        modality=args.modality,
        is_color=is_color,
        new_length=args.new_length,
        new_width=args.new_width,
        new_height=args.new_height,
        video_transform=val_transform,
        name_pattern="frame%06d.jpg")

    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset),
        len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return
    # NOTE(review): hard-coded absolute checkpoint path — the freshly built
    # model is always overwritten with these weights before training starts;
    # verify this is intended and consider making it a CLI argument.
    model_path = '/home/thl/Desktop/challeng/checkpoints/Mulity_100step_900epoch_batch80/model_best.pth.tar'
    params = torch.load(model_path)
    model.load_state_dict(params['state_dict'])
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set (only every `save_freq`-th epoch;
        # otherwise prec1 stays 0.0 so best_prec1 is left unchanged)
        prec1 = 0.0
        if (epoch + 1) % args.save_freq == 0:
            prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if (epoch + 1) % args.save_freq == 0:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar")
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, checkpoint_name, args.resume)
Ejemplo n.º 25
0
import math
import numpy as np
from PIL import Image

from tsn_dataset import TSNDataSet
from p3d_model import P3D199,get_optim_policies
import video_transforms

from tsn_models import TSN
from torch.nn.utils import clip_grad_norm

# Validation-time preprocessing pipeline: resize, centre-crop, convert to a
# tensor, then normalise with the standard ImageNet channel statistics.
val_transform = video_transforms.Compose([
    video_transforms.Resize((182, 242)),
    video_transforms.CenterCrop(160),
    video_transforms.ToTensor(),
    video_transforms.Normalize((0.485, 0.456, 0.406),
                               (0.229, 0.224, 0.225)),
])

val_loader=torch.utils.data.DataLoader(
    TSNDataSet("","tsntest_01.lst",
               num_segments=2,
               new_length=16,
               modality="RGB",
               image_tmpl="frame{:06d}.jpg",
               transform=val_transform,
               random_shift=False),
    batch_size=1,
    shuffle=False,
    num_workers=1,
Ejemplo n.º 26
0
def VideoSpatialPrediction(
        vid_name,
        net,
        num_categories,
        num_frames=0,
        ext_batch_sz=100,
        int_batch_sz=5,
        new_size = 299
        ):
    """Run 10-crop spatial (RGB) prediction over every frame of a video.

    Each frame from ``vid_name`` (``img_00001.jpg`` numbering) is expanded
    into four corner crops + centre crop plus their horizontal flips, and
    the crops are pushed through ``net`` in bounded-size batches to limit
    host RAM and GPU memory usage.

    Args:
        vid_name: Directory containing the extracted video frames.
        net: CUDA model; returns one output tensor per head listed in
            ``num_categories``.
        num_categories: Sequence with the class count of each output head.
        num_frames: Number of frames to use; 0 means "use all frames found".
        ext_batch_sz: Frames loaded into host memory at a time.
        int_batch_sz: Crops sent to the GPU at a time.
        new_size: Crop size (299 for Inception, 224 for ResNet).

    Returns:
        A list with one entry per head; each entry is the head's
        (num_classes, duration) score matrix split into the 10 crop views.
    """
    # Fix: `duration` used to be computed here but everything below sized
    # itself on `num_frames`, so num_frames=0 (auto-detect) silently
    # produced empty prediction arrays and processed no frames. All sizing
    # now uses `duration`, which equals num_frames whenever it was given.
    if num_frames == 0:
        imglist = os.listdir(vid_name)
        duration = len(imglist)
    else:
        duration = num_frames

    # ImageNet channel statistics.
    clip_mean = [0.485, 0.456, 0.406]
    clip_std = [0.229, 0.224, 0.225]

    normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)

    val_transform = video_transforms.Compose([
            video_transforms.ToTensor(),
            normalize,
        ])

    deep = 3

    # inception = 320,360, resnet = 240, 320
    width = 320 if new_size==299 else 240
    height = 360 if new_size==299 else 320
    predictions = []
    for i in range(len(num_categories)):
        predictions.append(np.zeros((num_categories[i], duration*10)))

    # control memory (RAM) usage
    num_ext_batch = int(math.ceil(float(duration)/ext_batch_sz))

    for i in range(num_ext_batch):
        start = i*ext_batch_sz
        end = min(start+ext_batch_sz, duration)

        dims = (width,height,deep,end-start)
        rgb = np.zeros(shape=dims, dtype=np.float64)
        rgb_flip = np.zeros(shape=dims, dtype=np.float64)

        # Load this slice of frames (1-based file numbering).
        for j in range(end-start):
            img_file = os.path.join(vid_name, 'img_{0:05d}.jpg'.format(j+start+1))
            img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)
            img = cv2.resize(img, dims[1::-1])
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            rgb[:,:,:,j] = img
            rgb_flip[:,:,:,j] = img[:,::-1,:]

        # crop 299 = inception, 224 = resnet
        # Four corners + centre, for the original and the flipped frames.
        size = new_size
        corner = [(height-size)//2, (width-size)//2]
        rgb_1 = rgb[:size, :size, :,:]
        rgb_2 = rgb[:size, -size:, :,:]
        rgb_3 = rgb[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:]
        rgb_4 = rgb[-size:, :size, :,:]
        rgb_5 = rgb[-size:, -size:, :,:]
        rgb_f_1 = rgb_flip[:size, :size, :,:]
        rgb_f_2 = rgb_flip[:size, -size:, :,:]
        rgb_f_3 = rgb_flip[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:]
        rgb_f_4 = rgb_flip[-size:, :size, :,:]
        rgb_f_5 = rgb_flip[-size:, -size:, :,:]

        rgb = np.concatenate((rgb_1,rgb_2,rgb_3,rgb_4,rgb_5,rgb_f_1,rgb_f_2,rgb_f_3,rgb_f_4,rgb_f_5), axis=3)

        # Drop intermediates early to bound peak host memory.
        rgb_1, rgb_2, rgb_3, rgb_4, rgb_5 = [],[],[],[],[]
        rgb_f_1, rgb_f_2, rgb_f_3, rgb_f_4, rgb_f_5 = [],[],[],[],[]
        rgb_flip = []

        _, _, _, c = rgb.shape
        rgb_list = []
        for c_index in range(c):
            cur_img = rgb[:,:,:,c_index]
            cur_img_tensor = val_transform(cur_img)
            rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

        rgb_shape = rgb.shape
        rgb = []

        rgb_np = np.concatenate(rgb_list,axis=0)

        # control memory (GPU) usage
        num_int_batches = int(math.ceil(float(rgb_shape[3])/int_batch_sz))

        rgb_list = []

        for bb in range(num_int_batches):
            span = range(int_batch_sz*bb, min(rgb_shape[3],int_batch_sz*(bb+1)))
            input_data = rgb_np[span,:,:,:]
            imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
            imgDataVar = torch.autograd.Variable(imgDataTensor)
            output = net(imgDataVar)

            for ii in range(len(output)):
                output_ = output[ii].reshape(-1, num_categories[ii])
                result = output_.data.cpu().numpy()
                # Map each crop back to its (crop-view, frame) column:
                # columns are laid out as 10 consecutive views of `duration`.
                pos = [ x%(end-start) + start + int(x/(end-start))*duration  for x in span ]
                predictions[ii][:, pos] = np.transpose(result)

        rgb_np = []

    # Split each head's columns back into the 10 crop views.
    result = []
    for ii in range(len(predictions)):
        result.append(np.split(predictions[ii],10,axis=1))

    return result
Ejemplo n.º 27
0
def main():
    """Run DynamicTrajectoryPredictor inference over a validation set.

    Loads pretrained weights (non-strict), streams optical-flow stacks
    through the model and saves the flattened predictions to
    ``./data_inference/val_prediction.npy``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # 9 = number of stacked flow frames (NUM_FLOW_FRAMES below).
    model = DynamicTrajectoryPredictor(9).to(device)
    model = model.float()
    model = nn.DataParallel(model)
    # Print the network structure
    # summary(model, input_size=(18, 224, 224))
    # Second positional argument is `strict=False`: missing/unexpected
    # checkpoint keys are tolerated.
    model.load_state_dict(torch.load('./model.weights'), False)

    load_path = './data_inference/'
    img_root = '../../../flow_result/'

    # Training settings (most are unused in this inference-only script).
    epochs = 15
    batch_size = 1
    learning_rate = 1e-5
    num_workers = 8
    weight_decay = 1e-2
    NUM_FLOW_FRAMES = 9
    training_proportion = 100

    # Transformers
    transform_val = video_transforms.Compose([
        video_transforms.Scale((224)),
        video_transforms.ToTensor(),
    ])

    valset = LocationDatasetBDD(filename='myvideo_val_yolo_0.pkl',
                                root_dir=load_path,
                                transform=transform_val,
                                img_root=img_root,
                                NUM_FLOW_FRAMES=NUM_FLOW_FRAMES)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers)

    # for param in model.parameters():
    #     param.requires_grad = False

    model.eval()
    tmp_result = []
    start_time = time.time()
    with torch.no_grad():
        for batch_idx, data in enumerate(val_loader):
            # Periodic progress report with per-100-batch wall time.
            if batch_idx % 100 == 0:
                end_time = time.time()
                print(' Batch ', batch_idx, ' of ', len(val_loader),
                      ' Cost time: ', end_time - start_time)
                start_time = end_time

            flow = data['flow_stack'].to(device)
            flow = flow.float()
            output = model(flow).detach().cpu().numpy()
            tmp_result.append(output)
            # print(output.shape)
            # if batch_idx == 1:
            #     print(batch_idx, ' : \n', output)
            # print(batch_idx, ' : \n', output)
            # NOTE(review): stops after the very first batch — looks like a
            # debugging leftover; confirm before relying on the saved file.
            if batch_idx == 0:
                break

    # 120 output values per sample — presumably 60 future (x, y)
    # displacement pairs; verify against the model's output head.
    ans = np.array(tmp_result).reshape(-1, 120)
    print(ans)
    print(ans.shape)

    # with open('record_extract.txt', 'w') as f:
    #     f.write(fold_type + ' ' + str(fold_num) + ' ' + str(ans.shape))

    # np.save('./data_inference/model_val_inference_result.npy', ans)
    np.save('./data_inference/val_prediction.npy', ans)
def VideoSpatialPrediction3D_bert(
        vid_name,
        net,
        num_categories,
        architecture_name,
        start_frame=0,
        num_frames=0,
        num_seg=4,
        length = 16,
        extension = 'img_{0:05d}.jpg',
        ten_crop = False
        ):
    """Classify one video with a 3D/BERT model and return its prediction.

    Samples ``num_seg`` clips of ``length`` frames, crops them (centre crop,
    or 10-crop when ``ten_crop``), normalises according to the architecture
    family encoded in ``architecture_name``, and averages the per-clip
    scores from ``net``.

    Args:
        vid_name: Directory containing the extracted frames / flow images.
        net: CUDA model; BERT/pooling variants return extra tensors.
        num_categories: Number of classes (used only implicitly via `net`).
        architecture_name: String whose substrings ('rgb', 'flow', 'I3D',
            '112', 'bert', ...) select preprocessing and input layout.
        start_frame: Unused here — kept for interface compatibility.
        num_frames: Frame count; 0 means "count matching files on disk".
        num_seg: Number of temporal segments to sample.
        length: Frames per segment.
        extension: Frame filename pattern ('x'/'y' + index for flow).
        ten_crop: If True, evaluate all 10 spatial crops, else centre only.

    Returns:
        Tuple of (argmax class, mean score vector, top-3 class indices).
    """
    # Determine the clip duration: either given, or counted from the frame
    # files on disk that match the modality's filename convention.
    if num_frames == 0:
        imglist = os.listdir(vid_name)
        newImageList=[]
        if 'rgb' in architecture_name or 'pose' in architecture_name:
            for item in imglist:
                if 'img' in item:
                   newImageList.append(item) 
        elif 'flow' in architecture_name:
            for item in imglist:
                if 'flow_x' in item:
                   newImageList.append(item) 
        duration = len(newImageList)
    else:
        duration = num_frames
    
    # Pick normalisation statistics and the spatial scale per architecture
    # family. NOTE(review): if architecture_name contains neither 'rgb' nor
    # 'flow', val_transform and scale stay unbound and the code below raises
    # NameError — confirm callers always pass one of the two.
    if 'rgb' in architecture_name:
        if 'I3D' in architecture_name:
            
            if not 'resnet' in architecture_name:
                clip_mean = [0.5, 0.5, 0.5] 
                clip_std = [0.5, 0.5, 0.5]
            else:
                clip_mean = [0.45, 0.45, 0.45]
                clip_std = [0.225, 0.225, 0.225] 
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize,
                ])
            if '112' in architecture_name:
                scale = 0.5
            else:
                scale = 1
        elif 'MFNET3D' in architecture_name:
            clip_mean = [0.48627451, 0.45882353, 0.40784314]
            clip_std = [0.234, 0.234, 0.234] 
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize])
            if '112' in architecture_name:
                scale = 0.5
            else:
                scale = 1
        elif 'tsm' in architecture_name:
            clip_mean = [0.485, 0.456, 0.406]
            clip_std = [0.229, 0.224, 0.225] 
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize])
            scale = 1
        elif "r2plus1d" in architecture_name:
            clip_mean = [0.43216, 0.394666, 0.37645]
            clip_std = [0.22803, 0.22145, 0.216989]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize])
            scale = 0.5
        elif 'rep_flow' in architecture_name:
            clip_mean = [0.5, 0.5, 0.5] 
            clip_std = [0.5, 0.5, 0.5]
    
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize,
                ])
            scale = 1
        elif "slowfast" in architecture_name:
            clip_mean = [0.45, 0.45, 0.45]
            clip_std = [0.225, 0.225, 0.225] 
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize,       
                ])
            scale = 1
        else:
            # Fallback rgb path: pixel-value (0-255) mean subtraction.
            scale = 0.5
            clip_mean = [114.7748, 107.7354, 99.4750]
            clip_std = [1, 1, 1]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor2(),
                    normalize,
                ])
    elif 'flow' in architecture_name:
        if 'I3D' in architecture_name:
            clip_mean = [0.5] * 2
            clip_std = [0.5] * 2
            normalize = video_transforms.Normalize(mean=clip_mean,
                                             std=clip_std)
            
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor(),
                    normalize,
                ])
            scale = 1
        else:
            scale = 0.5
            clip_mean = [127.5, 127.5]
            clip_std = [1, 1]
            normalize = video_transforms.Normalize(mean=clip_mean,
                                     std=clip_std)
            val_transform = video_transforms.Compose([
                    video_transforms.ToTensor2(),
                    normalize,
                ])

    # selection
    #step = int(math.floor((duration-1)/(num_samples-1)))
    # Explicit input-resolution markers in the name override the scale
    # chosen above.
    if '224' in architecture_name:
        scale = 1
    if '112' in architecture_name:
        scale = 0.5

    # Compute 1-based frame indices: num_seg evenly spaced segments of
    # `length` frames each, wrapped into [1, duration] for short videos.
    imageSize=int(224 * scale)
    dims = (int(256 * scale),int(340 * scale),3,duration)
    duration = duration - 1
    average_duration = int(duration / num_seg)
    offsetMainIndexes = []
    offsets = []
    for seg_id in range(num_seg):
        if average_duration >= length:
            offsetMainIndexes.append(int((average_duration - length + 1)/2 + seg_id * average_duration))
        elif duration >=length:
            average_part_length = int(np.floor((duration-length)/num_seg))
            offsetMainIndexes.append(int((average_part_length*(seg_id) + average_part_length*(seg_id+1))/2))
        else:
            increase = int(duration / num_seg)
            offsetMainIndexes.append(0 + seg_id * increase)
    for mainOffsetValue in offsetMainIndexes:
        for lengthID in range(1, length+1):
            loaded_frame_index = lengthID + mainOffsetValue
            moded_loaded_frame_index = loaded_frame_index % (duration + 1)
            if moded_loaded_frame_index == 0:
                moded_loaded_frame_index = (duration + 1)
            offsets.append(moded_loaded_frame_index)
             
    # One list per spatial crop: 1 = centre, 2-5 = corners, 6-10 = the
    # same crops of the horizontally flipped frame. 11/12 are unused.
    imageList=[]
    imageList1=[]
    imageList2=[]
    imageList3=[]
    imageList4=[]    
    imageList5=[]  
    imageList6=[]
    imageList7=[]
    imageList8=[]
    imageList9=[]    
    imageList10=[] 
    imageList11=[] 
    imageList12=[] 
    interpolation = cv2.INTER_LINEAR
    
    for index in offsets:
        # Load one frame: RGB/pose reads a colour image; flow stacks the
        # x/y flow components into a 2-channel image.
        if 'rgb' in architecture_name or 'pose' in architecture_name:
            img_file = os.path.join(vid_name, extension.format(index))
            img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)
    
            img = cv2.resize(img, dims[1::-1],interpolation)
    
            #img2 = cv2.resize(img, dims2[1::-1],interpolation)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_flip = img[:,::-1,:].copy()
        elif 'flow' in architecture_name:
            flow_x_file = os.path.join(vid_name, extension.format('x',index))
            flow_y_file = os.path.join(vid_name, extension.format('y',index))
            img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE)
            img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE)
            img_x = np.expand_dims(img_x,-1)
            img_y = np.expand_dims(img_y,-1)
            img = np.concatenate((img_x,img_y),2)    
            img = cv2.resize(img, dims[1::-1],interpolation)
            img_flip = img[:,::-1,:].copy()
        #img_flip2 = img2[:,::-1,:].copy()
        imageList1.append(img[int(16 * scale):int(16 * scale + imageSize), int(58 * scale) : int(58 * scale + imageSize), :])
        imageList2.append(img[:imageSize, :imageSize, :])
        imageList3.append(img[:imageSize, -imageSize:, :])
        imageList4.append(img[-imageSize:, :imageSize, :])
        imageList5.append(img[-imageSize:, -imageSize:, :])
        imageList6.append(img_flip[int(16 * scale):int(16 * scale + imageSize), int(58 * scale) : int(58 * scale + imageSize), :])
        imageList7.append(img_flip[:imageSize, :imageSize, :])
        imageList8.append(img_flip[:imageSize, -imageSize:, :])
        imageList9.append(img_flip[-imageSize:, :imageSize, :])
        imageList10.append(img_flip[-imageSize:, -imageSize:, :])
#        imageList11.append(img2)
#        imageList12.append(img_flip2)

    if ten_crop:
        imageList=imageList1+imageList2+imageList3+imageList4+imageList5+imageList6+imageList7+imageList8+imageList9+imageList10
    else:
        imageList=imageList1
    
    #imageList=imageList11+imageList12
    
    # Normalise every crop and stack into one batch array.
    rgb_list=[]     

    for i in range(len(imageList)):
        cur_img = imageList[i]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))
         
    input_data=np.concatenate(rgb_list,axis=0)   

    with torch.no_grad():
        imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
        # Reshape to (clips, channels, length, H, W): 3 channels for
        # rgb/pose, 2 for flow.
        if 'rgb' in architecture_name or 'pose' in architecture_name:
            imgDataTensor = imgDataTensor.view(-1,length,3,imageSize,imageSize).transpose(1,2)
        elif 'flow' in architecture_name:
            imgDataTensor = imgDataTensor.view(-1,length,2,imageSize,imageSize).transpose(1,2)
            
        # BERT/pooling variants return auxiliary tensors besides the logits.
        if 'bert' in architecture_name or 'pooling' in architecture_name:
            output, input_vectors, sequenceOut, maskSample = net(imgDataTensor)
        else:
            output = net(imgDataTensor)
#        outputSoftmax=soft(output)
        # Average scores over all sampled clips/crops, then rank classes.
        result = output.data.cpu().numpy()
        mean_result=np.mean(result,0)
        prediction=np.argmax(mean_result)
        top3 = mean_result.argsort()[::-1][:3]
        
    return prediction, mean_result, top3
Ejemplo n.º 29
0
def main():
    """Train (or evaluate) a stacked-frame action-recognition model.

    Parses global ``args``, builds the model, loss and SGD optimizer, sets
    up the train/val pipelines for the chosen modality, then runs the epoch
    loop, checkpointing every ``args.save_freq`` epochs.

    Raises:
        ValueError: If ``args.modality`` is neither "rgb" nor "flow".
    """
    global args, best_prec1
    args = parser.parse_args()

    # create model
    print("Building model ... ")
    model = build_model()
    print("Model %s is loaded. " % (args.modality + "_" + args.arch))

    # `args.resume` is used as the checkpoint output directory here.
    if not os.path.exists(args.resume):
        os.makedirs(args.resume)
    print("Saving everything to directory %s." % (args.resume))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    # Data transforming: ImageNet per-channel statistics repeated once per
    # stacked input frame.
    clip_mean = [0.485, 0.456, 0.406] * args.new_length
    clip_std = [0.229, 0.224, 0.225] * args.new_length
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)

    if args.modality == "rgb":
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
    elif args.modality == "flow":
        scale_ratios = [1.0, 0.875, 0.75]
    else:
        # Fail fast: this used to only print a warning and then crash later
        # with a NameError on `scale_ratios`.
        raise ValueError("No such modality. Only rgb and flow supported.")

    train_transform = video_transforms.Compose([
        video_transforms.Scale((256)),
        video_transforms.MultiScaleCrop((224, 224), scale_ratios),
        video_transforms.RandomHorizontalFlip(),
        video_transforms.ToTensor(),
        normalize,
    ])

    val_transform = video_transforms.Compose([
        video_transforms.Scale((256)),
        video_transforms.CenterCrop((224)),
        video_transforms.ToTensor(),
        normalize,
    ])

    # data loading
    train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split)
    train_split_file = os.path.join(args.settings, args.dataset,
                                    train_setting_file)
    val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split)
    val_split_file = os.path.join(args.settings, args.dataset,
                                  val_setting_file)
    if not os.path.exists(train_split_file) or not os.path.exists(
            val_split_file):
        print(
            "No split file exists in %s directory. Preprocess the dataset first"
            % (args.settings))

    train_dataset = datasets.__dict__[args.dataset](
        args.data,
        train_split_file,
        "train",
        args.new_length,
        video_transform=train_transform)
    val_dataset = datasets.__dict__[args.dataset](
        args.data,
        val_split_file,
        "val",
        args.new_length,
        video_transform=val_transform)

    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset),
        len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if (epoch + 1) % args.save_freq == 0:
            checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar")
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, checkpoint_name, args.resume)
Ejemplo n.º 30
0
def main():
    """Evaluate a trained action-recognition model on a list of test clips.

    Loads the checkpoint given by ``args.weights`` into the architecture
    named by ``args.arch``, builds the validation transform for the chosen
    modality (rgb or flow), runs every clip in ``args.test_list`` through
    the network, prints running per-video timing and the final top-1
    accuracy, and saves the stacked raw class scores to a ``.npy`` file.

    Raises:
        ValueError: if ``args.dataset`` or ``args.modality`` is unknown.
    """
    global args
    args = parser.parse_args()

    # Map the dataset name to its number of action classes.
    if args.dataset == 'ucf101':
        num_categories = 101
    elif args.dataset == 'hmdb51':
        num_categories = 51
    elif args.dataset == 'kinetics':
        num_categories = 400
    else:
        raise ValueError('Unknown dataset ' + args.dataset)

    model_start_time = time.time()
    params = torch.load(args.weights)

    #hard code
    net = models.__dict__[args.arch](pretrained=False,
                                     num_classes=num_categories)
    net.load_state_dict(params['state_dict'])
    net.cuda()
    net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))
    ###

    # Per-modality preprocessing constants: ImageNet mean/std for RGB
    # frames, conventional 0.5 / 0.226 stats for stacked optical flow.
    if args.modality == "rgb":
        new_length = 1
        is_color = True
        clip_mean = [0.485, 0.456, 0.406] * new_length
        clip_std = [0.229, 0.224, 0.225] * new_length
    elif args.modality == "flow":
        new_length = 10
        is_color = False
        clip_mean = [0.5, 0.5] * new_length
        clip_std = [0.226, 0.226] * new_length
    else:
        # BUGFIX: an unknown modality previously fell through and crashed
        # later with a NameError on clip_mean; fail fast instead.
        raise ValueError('Unknown modality ' + args.modality)

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.GroupCenterCrop(net.input_size),
        video_transforms.CenterCrop((224)),
        video_transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.load_clip(root=args.data,
                                 source=args.test_list,
                                 phase="val",
                                 modality=args.modality,
                                 is_color=is_color,
                                 new_length=new_length,
                                 new_width=args.new_width,
                                 new_height=args.new_height,
                                 video_transform=val_transform)

    # batch_size=1 so each iteration scores exactly one clip.
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    total_num = len(data_loader.dataset)
    output = []
    # BUGFIX: match_count was read (`+= 1`) without ever being initialized,
    # raising UnboundLocalError on the first correct prediction.
    match_count = 0

    proc_start_time = time.time()

    for i, (data, label) in enumerate(data_loader):
        # BUGFIX: `async` is a reserved keyword since Python 3.7 (it made
        # `cuda(async=True)` a SyntaxError); non_blocking is the supported
        # spelling of the same argument.
        data = data.float().cuda(non_blocking=True)
        label = label.cuda(non_blocking=True)

        # volatile=True Variables were removed in PyTorch 0.4; no_grad()
        # is the supported way to disable autograd during inference.
        with torch.no_grad():
            rst = net(data).data.cpu().numpy()

        pred_index = np.argmax(rst)
        output.append(rst)
        if label.cpu().numpy()[0] == pred_index:
            match_count += 1
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num,
            float(cnt_time) / (i + 1)))

    print(match_count)
    print(total_num)
    print("Accuracy is %4.4f" % (float(match_count) / float(total_num)))

    # Persist the raw per-clip score arrays for later fusion/analysis.
    np.save("{}_sX_{}_{}.npy".format(args.dataset, args.modality, args.arch),
            np.array(output))  #hard code