import os
import time

import numpy as np
import torch

# `parser`, `logging`, `models`, `VideoSpatialPrediction`, and
# `VideoTemporalPrediction` are project-local helpers (the argument parser,
# an output-directory setup routine, the model factory, and the per-video
# prediction functions); they are assumed to be imported from the
# surrounding repository.


# Example 1: spatial-stream (RGB or visual-rhythm) evaluation on the test
# split.
def main():
    args = parser.parse_args()
    output_path = logging(args)
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
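    # 25 frames are sampled per RGB video, but only a single visual-rhythm
    # image exists per video; HMDB51 rhythm images are stored as .png,
    # everything else as .jpg.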
    num_samples = 25 if args.modality[:3] == 'rgb' else 1
    ext = ".png" if args.modality == "rhythm" and args.dataset == "hmdb51" else ".jpg"
    num_categories = 51 if args.dataset == 'hmdb51' else 101
    new_size = 224

    model_start_time = time.time()
    params = torch.load(model_path)

    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False,
                                                   channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False,
                                                  channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False,
                                                channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False,
                                               channels=3,
                                               num_classes=num_categories)

    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition spatial model is loaded in %4.4f seconds." %
          (model_time))

    test_path = os.path.join(args.settings, args.dataset)
    if args.w:
        test_file = os.path.join(test_path, "dataset_list.txt")
    else:
        test_file = os.path.join(test_path, "test_split%d.txt" % args.split)
    print(test_file)
    with open(test_file, "r") as f_test:
        test_list = f_test.readlines()
    print("we got %d videos" % len(test_list))

    line_id = 1
    match_count = 0
    result_list = []

    lines = []
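    # vr_approach selects the visual-rhythm direction: approach 3 reads one
    # direction per class from direction.txt, approach 4 reads one direction
    # per video from direction_video.txt, and any other value is used
    # directly as a fixed direction.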
    if args.vr_approach == 3:
        direction_path = os.path.join(args.settings, args.dataset,
                                      "direction.txt")
        with open(direction_path) as f_dir:
            lines = [int(line.rstrip('\n')) for line in f_dir]
    elif args.vr_approach == 4:
        direction_path = os.path.join(args.settings, args.dataset,
                                      "direction_video.txt")
        with open(direction_path) as f_dir:
            lines = {
                line.split()[0]: int(line.split()[1])
                for line in f_dir
            }

    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        video_name = clip_path.split("/")[-1]
        index = lines[input_video_label] if args.vr_approach == 3 else (
            lines[video_name] if args.vr_approach == 4 else args.vr_approach)

        spatial_prediction = VideoSpatialPrediction(args.modality,
                                                    clip_path,
                                                    spatial_net,
                                                    num_categories,
                                                    start_frame,
                                                    num_frames,
                                                    num_samples,
                                                    index,
                                                    new_size=new_size,
                                                    ext=ext)
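        # Per-sample class scores are averaged over axis 1 to give a single
        # score vector per video.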
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result_list.append(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is: %4.4f" % (float(match_count) / len(test_list)))

    npy_name = "%s_%s_%s_s%d.npy" % (args.dataset, args.modality,
                                     args.architecture, args.split)
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result_list))
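

# All four examples parse the same kind of command line. A minimal sketch of
# the module-level `parser` they assume; the flag names are taken from the
# code, while the types, defaults, and help strings are guesses.
import argparse

parser = argparse.ArgumentParser(
    description="Evaluate an action-recognition stream")
parser.add_argument('--model_path', type=str,
                    help='path to the trained checkpoint (.pth.tar)')
parser.add_argument('--data_dir', type=str,
                    help='root directory of the extracted frames')
parser.add_argument('--dataset', type=str, choices=['hmdb51', 'ucf101'])
parser.add_argument('--architecture', type=str,
                    choices=['inception_v3', 'resnet152'])
parser.add_argument('--modality', type=str,
                    help="input modality, e.g. 'rgb' or 'rhythm'")
parser.add_argument('--split', type=int, default=1)
parser.add_argument('--settings', type=str,
                    help='directory holding the split and direction files')
parser.add_argument('--w', action='store_true',
                    help='evaluate dataset_list.txt instead of a test split')
parser.add_argument('--vr_approach', type=int, default=3,
                    help='how the visual-rhythm direction is chosen')
parser.add_argument('--batch_size', type=int, default=32)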


# Example 2: spatial-stream evaluation on the validation split, with
# hard-coded split and settings paths.
def main():
    args = parser.parse_args()

    #model_path = '../../parameters/'+args.architecture+"/"+args.modality+'_s'+str(args.split)+'.pth.tar'
    #data_dir = '../datasets/'+args.dataset+'_frames'
    #data_dir = '/home/Datasets/UCF-101-OF_CPU'
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
    if args.modality[:3] == 'rgb':
        num_samples = 25
    else:
        num_samples = 1
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    model_start_time = time.time()
    params = torch.load(model_path)

    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False,
                                                   channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False,
                                                  channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False,
                                                channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False,
                                               channels=3,
                                               num_classes=num_categories)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    val_file = "./splits/" + args.dataset + "/val_split%d.txt" % (args.split)
    #val_file = 'spatial_testlist01_with_labels.txt'
    with open(val_file, "r") as f_val:
        val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0

    result = []
    # One visual-rhythm direction per class (used when vr_approach == 3).
    with open('../datasets/settings/' + args.dataset +
              '/direction.txt') as f_dir:
        lines = [int(line.rstrip('\n')) for line in f_dir]
    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
        index = lines[input_video_label] if args.vr_approach == 3 \
            else args.vr_approach
        spatial_prediction = VideoSpatialPrediction(args.modality, clip_path,
                                                    spatial_net,
                                                    num_categories,
                                                    start_frame, num_frames,
                                                    num_samples, index,
                                                    new_size)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)

        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(val_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is : %4.4f" % ((float(match_count) / len(val_list))))
    np.save(
        args.dataset + "_" + args.modality + "_" + args.architecture + "_s" +
        str(args.split) + ".npy", np.array(result))


# Example 3: visual-rhythm stream evaluation on the test split.
def main():
    args = parser.parse_args()
    output_path = logging(args)
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
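    # Ten samples are scored per visual-rhythm image (presumably spatial
    # crops/flips; the exact sampling lives inside this example's
    # VideoSpatialPrediction).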
    num_samples = 10
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    model_start_time = time.time()

    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        spatial_net = models.flow_inception_v3(pretrained=False,
                                               channels=1,
                                               num_classes=num_categories)
    else:
        spatial_net = models.flow_resnet152(pretrained=False,
                                            channels=1,
                                            num_classes=num_categories)

    params = torch.load(model_path)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    test_path = os.path.join(args.settings, args.dataset)
    if args.w:
        test_file = os.path.join(test_path, "dataset_list.txt")
    else:
        test_file = os.path.join(test_path, "test_split%d.txt" % args.split)

    with open(test_file, "r") as f_test:
        test_list = f_test.readlines()
    print("we got %d test videos" % len(test_list))

    line_id = 1
    match_count = 0

    result = []
    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
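        # Note: this example's VideoSpatialPrediction takes a batch size and
        # no modality/frame arguments (its signature differs from the one in
        # the earlier examples); num_frames is parsed above but not used here.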
        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories,
                                                    num_samples, new_size,
                                                    args.batch_size)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)

        print("Rhythm split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is : %4.4f" % ((float(match_count) / len(test_list))))

    npy_name = "%s_rhythm_%s_s%d.npy" % (args.dataset, args.architecture,
                                         args.split)
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result))


# Example 4: temporal-stream (stacked optical-flow) evaluation on the
# validation split.
def main():
    args = parser.parse_args()

    #model_path = '../../parameters/'+args.architecture+'/'+args.modality+'_s'+str(args.split)+'.pth.tar'
    #data_path = '/home/Datasets/UCF-101-OF_CPU'
    model_path = args.model_path
    data_path = args.data_dir

    start_frame = 0
    num_categories = 51 if args.dataset == 'hmdb51' else 101
    new_size = 224
    model_start_time = time.time()
    params = torch.load(model_path)
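    # The temporal stream takes a 20-channel input, which in two-stream
    # networks is typically a stack of 10 optical-flow frames with x and y
    # components interleaved.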
    if args.architecture == "inception_v3":
        new_size = 299
        temporal_net = models.flow_inception_v3(pretrained=False,
                                                channels=20,
                                                num_classes=num_categories)
    else:
        temporal_net = models.flow_resnet152(pretrained=False,
                                             channels=20,
                                             num_classes=num_categories)
    temporal_net.load_state_dict(params['state_dict'])
    temporal_net.cuda()
    temporal_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition temporal model is loaded in %4.4f seconds." %
          (model_time))

    val_file = "./splits/" + args.dataset + "/val_split%d.txt" % (args.split)
    #val_file = 'spatial_testlist01_with_labels.txt'
    with open(val_file, "r") as f_val:
        val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []

    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_path, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
        temporal_prediction = VideoTemporalPrediction(args.modality,
                                                      clip_path,
                                                      temporal_net,
                                                      num_categories,
                                                      start_frame,
                                                      num_frames,
                                                      new_size=new_size)

        avg_temporal_pred_fc8 = np.mean(temporal_prediction, axis=1)
        # print(avg_temporal_pred_fc8.shape)
        result_list.append(avg_temporal_pred_fc8)
        # avg_temporal_pred = softmax(avg_temporal_pred_fc8)

        pred_index = np.argmax(avg_temporal_pred_fc8)
        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d" %
              (line_id, len(val_list), input_video_label, pred_index))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count) / len(val_list)))
    np.save(
        args.dataset + "_" + args.modality + "_resnet152_s" + str(args.split) +
        ".npy", np.array(result_list))