def load_data(data_dir, batch_size, num_workers, segments, train):
    """Create the shuffled training ``DataLoader`` for a custom video dataset.

    Parameters
    ----------
    data_dir : str
        Base directory; ``segments`` and ``train`` are resolved below it.
    batch_size : int
        Batch size handed to the ``DataLoader``.
    num_workers : int
        Number of loader worker processes.
    segments : str
        Name (relative to ``data_dir``) of the directory used as dataset root.
    train : str
        Name (relative to ``data_dir``) of the prepared annotation text file.

    Returns
    -------
    gluon.data.DataLoader
        Loader over a ``VideoClsCustom`` dataset with 32-frame clips.
    """
    # Training transform: 224x224 output, scale ratios 1.0 and 0.8, and
    # normalization with the ImageNet channel mean / standard deviation.
    augment = video.VideoGroupTrainTransform(
        size=(224, 224),
        scale_ratios=[1.0, 0.8],
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])

    # For another dataset, point ``root`` and ``setting`` at your own data
    # directory and prepared text file.
    samples_root = data_dir + '/' + segments
    annotation_file = data_dir + '/' + train
    dataset = VideoClsCustom(root=samples_root,
                             setting=annotation_file,
                             train=True,
                             new_length=32,
                             transform=augment)

    # Show what is actually present under the dataset root.
    print(os.listdir(data_dir + '/' + segments))
    print('Load %d training samples.' % len(dataset))

    loader = gluon.data.DataLoader(dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=num_workers)
    return loader
# Just use our general dataloader `VideoClsCustom <https://github.com/dmlc/gluon-cv/blob/master/gluoncv/data/kinetics400/classification.py>`_ to load your data.
#
# In this tutorial, we will use the UCF101 dataset as an example.
# For your own dataset, you can just replace the values of ``root`` and ``setting``
# with your data directory and your prepared text file.
# Let's first define some basics.

num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]

# Training transform: 224x224 output, scale ratios 1.0 / 0.8, ImageNet mean/std.
transform_train = video.VideoGroupTrainTransform(
    size=(224, 224),
    scale_ratios=[1.0, 0.8],
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225])

per_device_batch_size = 5
num_workers = 8
batch_size = per_device_batch_size * num_gpus

train_dataset = VideoClsCustom(
    root=os.path.expanduser('~/.mxnet/datasets/ucf101/rawframes'),
    setting=os.path.expanduser(
        '~/.mxnet/datasets/ucf101/ucfTrainTestlist/ucf101_train_split_1_rawframes.txt'),
    train=True,
    new_length=32,
    transform=transform_train)
print('Load %d training samples.' % len(train_dataset))

train_data = gluon.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

################################################################
# Custom Network
# --------------
#
# You can always define your own network architecture. Here, we want to show how to fine-tune a pre-trained model.
# Since the I3D model is a very popular network, we will use I3D with a ResNet50 backbone trained on the
# Kinetics400 dataset (i.e., ``i3d_resnet50_v1_kinetics400``) as an example.
#
# For simple fine-tuning, people usually just replace the last classification (dense) layer to the number of classes in your dataset
def get_data_loader(opt, batch_size, num_workers, logger, kvstore=None):
    """Build the training/validation data loaders and the batch-splitting helper.

    Parameters
    ----------
    opt : namespace
        Parsed options. Reads ``data_dir``, ``val_data_dir``, ``scale_ratios``
        (comma-separated floats), ``input_size``, ``data_aug``, ``dataset``,
        ``train_list``, ``val_list``, ``prefetch_ratio`` and the frame
        sampling options forwarded to the dataset classes.
    batch_size : int
        Batch size used by both loaders.
    num_workers : int
        Number of loader worker processes.
    logger : logging.Logger
        Receives the sample counts and any "not supported" message.
    kvstore : optional
        When given, samples are partitioned with ``ShuffleSplitSampler`` using
        ``kvstore.num_workers`` and ``kvstore.rank``.

    Returns
    -------
    tuple
        ``(train_data, val_data, batch_fn)``.

    Raises
    ------
    ValueError
        If ``opt.data_aug`` or ``opt.dataset`` is not supported. Previously
        this was only logged and the function then failed later with an
        ``UnboundLocalError``.
    """
    data_dir = opt.data_dir
    val_data_dir = opt.val_data_dir
    scale_ratios = [float(i) for i in opt.scale_ratios.split(',')]
    input_size = opt.input_size
    default_mean = [0.485, 0.456, 0.406]
    default_std = [0.229, 0.224, 0.225]

    # Fail fast, before any transform or dataset is constructed.
    if opt.data_aug not in ('v1', 'v2', 'v3', 'v4'):
        message = 'Data augmentation %s is not supported yet.' % (opt.data_aug)
        logger.info(message)
        raise ValueError(message)
    if opt.dataset not in ('kinetics400', 'ucf101', 'somethingsomethingv2',
                           'hmdb51', 'custom'):
        message = 'Dataset %s is not supported yet.' % (opt.dataset)
        logger.info(message)
        raise ValueError(message)

    def batch_fn(batch, ctx):
        """Split one (data, label) batch across the given contexts."""
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                              even_split=False)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0,
                               even_split=False)
        return data, label

    crop = (input_size, input_size)
    if opt.data_aug == 'v1':
        # GluonCV style, not keeping aspect ratio, multi-scale crop
        transform_train = video.VideoGroupTrainTransform(
            size=crop, scale_ratios=scale_ratios,
            mean=default_mean, std=default_std)
        transform_test = video.VideoGroupValTransform(
            size=input_size, mean=default_mean, std=default_std)
    else:
        if opt.data_aug == 'v2':
            # GluonCV style, keeping aspect ratio, multi-scale crop,
            # same as mmaction style
            transform_train = video.VideoGroupTrainTransformV2(
                size=crop, short_side=opt.new_height,
                scale_ratios=scale_ratios,
                mean=default_mean, std=default_std)
        elif opt.data_aug == 'v3':
            # PySlowFast style, keeping aspect ratio, random short side
            # scale jittering
            transform_train = video.VideoGroupTrainTransformV3(
                crop_size=crop, min_size=opt.new_height,
                max_size=opt.new_width,
                mean=default_mean, std=default_std)
        else:
            # 'v4': mmaction style, keeping aspect ratio, random crop and
            # resize, only for SlowFast family models, similar to 'v3'
            transform_train = video.VideoGroupTrainTransformV4(
                size=crop, mean=default_mean, std=default_std)
        # v2, v3 and v4 all validate with the same transform.
        transform_test = video.VideoGroupValTransformV2(
            crop_size=crop, short_side=opt.new_height,
            mean=default_mean, std=default_std)

    def _video_kwargs():
        """Extra options for datasets that can decode video files."""
        return dict(new_step=opt.new_step,
                    video_loader=opt.video_loader,
                    use_decord=opt.use_decord)

    def _slowfast_kwargs():
        """Video options plus the SlowFast sampling options."""
        kwargs = _video_kwargs()
        kwargs.update(slowfast=opt.slowfast,
                      slow_temporal_stride=opt.slow_temporal_stride,
                      fast_temporal_stride=opt.fast_temporal_stride)
        return kwargs

    # Per dataset: class, dataset-specific keyword arguments, validation root.
    # Only kinetics400 and custom read validation data from ``val_data_dir``;
    # the other datasets use ``data_dir`` for both splits.
    if opt.dataset == 'kinetics400':
        dataset_cls, extra, val_root = Kinetics400, _slowfast_kwargs(), val_data_dir
    elif opt.dataset == 'ucf101':
        dataset_cls, extra, val_root = UCF101, {}, data_dir
    elif opt.dataset == 'somethingsomethingv2':
        dataset_cls, extra, val_root = SomethingSomethingV2, _video_kwargs(), data_dir
    elif opt.dataset == 'hmdb51':
        dataset_cls, extra, val_root = HMDB51, _video_kwargs(), data_dir
    else:  # 'custom' (validated above)
        dataset_cls, extra, val_root = VideoClsCustom, _slowfast_kwargs(), val_data_dir

    def _build(setting, root, is_train, transform):
        """Instantiate the selected dataset class for one split."""
        return dataset_cls(setting=setting,
                           root=root,
                           train=is_train,
                           new_width=opt.new_width,
                           new_height=opt.new_height,
                           new_length=opt.new_length,
                           target_width=input_size,
                           target_height=input_size,
                           data_aug=opt.data_aug,
                           num_segments=opt.num_segments,
                           transform=transform,
                           **extra)

    train_dataset = _build(opt.train_list, data_dir, True, transform_train)
    val_dataset = _build(opt.val_list, val_root, False, transform_test)

    logger.info('Load %d training samples and %d validation samples.' %
                (len(train_dataset), len(val_dataset)))

    prefetch = int(opt.prefetch_ratio * num_workers)
    if kvstore is not None:
        # Distributed training: each worker reads its own shard.
        train_data = gluon.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(train_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=prefetch,
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(val_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=prefetch,
            last_batch='discard')
    else:
        train_data = gluon.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers,
                                           prefetch=prefetch,
                                           last_batch='rollover')
        val_data = gluon.data.DataLoader(val_dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers,
                                         prefetch=prefetch,
                                         last_batch='discard')

    return train_data, val_data, batch_fn
def main(logger):
    """Extract one feature array per listed video and save it as ``.npy``.

    Options come from ``parse_args()``. For every non-blank line of
    ``opt.data_list`` the first whitespace-separated token is taken as the
    video path, the clip is loaded with ``read_data``, passed through the
    network built with ``feat_ext=True``, and the output is written to
    ``<save_dir>/<model>_<video name>_feat.npy``.

    Parameters
    ----------
    logger : logging.Logger
        Receives the progress and timing messages.
    """
    opt = parse_args()
    logger.info(opt)
    gc.set_threshold(100, 5, 5)

    if not os.path.exists(opt.save_dir):
        os.makedirs(opt.save_dir)

    # set env
    if opt.gpu_id == -1:
        context = mx.cpu()
    else:
        gpu_id = opt.gpu_id
        context = mx.gpu(gpu_id)

    # get data preprocess
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    # get model
    if opt.use_pretrained and len(opt.hashtag) > 0:
        # A hashtag selects a specific pretrained checkpoint.
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name,
                    nclass=classes,
                    pretrained=opt.use_pretrained,
                    feat_ext=True,
                    num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        logger.info('Pre-trained model %s is successfully loaded.' %
                    (opt.resume_params))
    elif opt.use_pretrained:
        logger.info(
            'Pre-trained model is successfully loaded from the model zoo.')
    else:
        # Neither a checkpoint nor model-zoo weights were requested; do not
        # claim that anything was loaded.
        logger.warning('No pre-trained weights were loaded.')
    logger.info("Successfully built model {}".format(model_name))

    # get data
    anno_file = opt.data_list
    # Close the annotation file promptly and drop blank lines, which would
    # otherwise raise IndexError on ``vline.split()[0]`` below.
    with open(anno_file, 'r') as f:
        data_list = [line for line in f if line.strip()]
    logger.info('Load %d video samples.' % len(data_list))

    # build a pseudo dataset instance to use its children class methods
    video_utils = VideoClsCustom(root=opt.data_dir,
                                 setting=opt.data_list,
                                 num_segments=opt.num_segments,
                                 num_crop=opt.num_crop,
                                 new_length=opt.new_length,
                                 new_step=opt.new_step,
                                 new_width=opt.new_width,
                                 new_height=opt.new_height,
                                 video_loader=opt.video_loader,
                                 use_decord=opt.use_decord,
                                 slowfast=opt.slowfast,
                                 slow_temporal_stride=opt.slow_temporal_stride,
                                 fast_temporal_stride=opt.fast_temporal_stride,
                                 data_aug=opt.data_aug,
                                 lazy_init=True)

    start_time = time.time()
    for vid, vline in enumerate(data_list):
        video_path = vline.split()[0]
        video_name = video_path.split('/')[-1]
        if opt.need_root:
            video_path = os.path.join(opt.data_dir, video_path)
        video_data = read_data(opt, video_path, transform_test, video_utils)
        video_input = video_data.as_in_context(context)
        video_feat = net(video_input.astype(opt.dtype, copy=False))

        feat_file = '%s_%s_feat.npy' % (model_name, video_name)
        np.save(os.path.join(opt.save_dir, feat_file), video_feat.asnumpy())

        if vid > 0 and vid % opt.log_interval == 0:
            logger.info('%04d/%04d is done' % (vid, len(data_list)))

    end_time = time.time()
    logger.info('Total feature extraction time is %4.2f minutes' %
                ((end_time - start_time) / 60))
def main():
    """Run inference on every listed video and log the predicted class.

    Options come from ``parse_args()``. Log output goes both to
    ``<save_dir>/<logging_file>`` and to the console. For every non-blank
    line of ``opt.data_list`` the first whitespace-separated token is taken
    as the video path. Depending on ``opt.save_logits`` / ``opt.save_preds``
    the raw network output and the arg-max label are also saved as ``.npy``.
    """
    opt = parse_args()

    makedirs(opt.save_dir)

    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    gc.set_threshold(100, 5, 5)

    # set env
    if opt.gpu_id == -1:
        context = mx.cpu()
    else:
        gpu_id = opt.gpu_id
        context = mx.gpu(gpu_id)

    # get data preprocess
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    # get model
    if opt.use_pretrained and len(opt.hashtag) > 0:
        # A hashtag selects a specific pretrained checkpoint.
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name,
                    nclass=classes,
                    pretrained=opt.use_pretrained,
                    num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        logger.info('Pre-trained model %s is successfully loaded.' %
                    (opt.resume_params))
    elif opt.use_pretrained:
        logger.info(
            'Pre-trained model is successfully loaded from the model zoo.')
    else:
        # Neither a checkpoint nor model-zoo weights were requested; do not
        # claim that anything was loaded.
        logger.warning('No pre-trained weights were loaded.')
    logger.info("Successfully built model {}".format(model_name))

    # get classes list, if we are using a pretrained network from the model_zoo
    classes = None
    if opt.use_pretrained:
        if "kinetics400" in model_name:
            classes = Kinetics400Attr().classes
        elif "ucf101" in model_name:
            classes = UCF101Attr().classes
        elif "hmdb51" in model_name:
            classes = HMDB51Attr().classes
        elif "sthsth" in model_name:
            classes = SomethingSomethingV2Attr().classes

    # get data
    anno_file = opt.data_list
    # Close the annotation file promptly and drop blank lines, which would
    # otherwise raise IndexError on ``vline.split()[0]`` below.
    with open(anno_file, 'r') as f:
        data_list = [line for line in f if line.strip()]
    logger.info('Load %d video samples.' % len(data_list))

    # build a pseudo dataset instance to use its children class methods
    video_utils = VideoClsCustom(root=opt.data_dir,
                                 setting=opt.data_list,
                                 num_segments=opt.num_segments,
                                 num_crop=opt.num_crop,
                                 new_length=opt.new_length,
                                 new_step=opt.new_step,
                                 new_width=opt.new_width,
                                 new_height=opt.new_height,
                                 video_loader=opt.video_loader,
                                 use_decord=opt.use_decord,
                                 slowfast=opt.slowfast,
                                 slow_temporal_stride=opt.slow_temporal_stride,
                                 fast_temporal_stride=opt.fast_temporal_stride,
                                 data_aug=opt.data_aug,
                                 lazy_init=True)

    start_time = time.time()
    for vid, vline in enumerate(data_list):
        video_path = vline.split()[0]
        video_name = video_path.split('/')[-1]
        if opt.need_root:
            video_path = os.path.join(opt.data_dir, video_path)
        video_data = read_data(opt, video_path, transform_test, video_utils)
        video_input = video_data.as_in_context(context)
        pred = net(video_input.astype(opt.dtype, copy=False))
        # Copy the output to host memory once and reuse it below.
        pred_np = pred.asnumpy()
        if opt.save_logits:
            logits_file = '%s_%s_logits.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, logits_file), pred_np)
        pred_label = np.argmax(pred_np)
        if opt.save_preds:
            preds_file = '%s_%s_preds.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, preds_file), pred_label)

        # Try to report a text label instead of the number.
        if classes:
            pred_label = classes[pred_label]

        logger.info('%04d/%04d: %s is predicted to class %s' %
                    (vid, len(data_list), video_name, pred_label))

    end_time = time.time()
    logger.info('Total inference time is %4.2f minutes' %
                ((end_time - start_time) / 60))
def main(logger):
    """Evaluate a video classifier on one split and save a metrics report.

    Options come from ``parse_args(parser)``. The function refuses to run if
    ``opt.save_dir`` already exists, builds (or imports, with ``opt.deploy``)
    the network, optionally benchmarks or calibrates it and exits, and
    otherwise runs ``test`` on the validation loader and writes a report to
    ``<save_dir>/<val_list stem>``.

    Parameters
    ----------
    logger : logging.Logger
        Passed to ``calibration`` and used for the unsupported-dataset message.

    Raises
    ------
    ValueError
        If ``opt.dataset`` is not supported. Previously this was only logged
        and the function then failed with an ``UnboundLocalError``.
    """
    opt = parse_args(parser)
    print(opt)
    assert not (os.path.isdir(opt.save_dir)), "already done this experiment..."
    Path(opt.save_dir).mkdir(parents=True)

    # Garbage collection, default threshold is (700, 10, 10).
    # Set threshold lower to collect garbage more frequently and release more
    # CPU memory for heavy data loading.
    gc.set_threshold(100, 5, 5)

    num_gpus = 1
    context = [mx.gpu(i) for i in range(num_gpus)]
    per_device_batch_size = 5
    batch_size = per_device_batch_size * num_gpus
    num_workers = opt.num_workers
    print('Total batch size is set to %d on %d GPUs' % (batch_size, num_gpus))

    # get data
    default_mean = [0.485, 0.456, 0.406]
    default_std = [0.229, 0.224, 0.225]

    def _single_crop_transform():
        """Single-crop validation transform for the built-in datasets.

        Mirrors the single-crop branch of the transform selection that used
        to live here. Without it ``transform_test`` was unbound for every
        dataset except 'custom'.
        """
        if opt.data_aug == 'v1':
            return video.VideoGroupValTransform(size=opt.input_size,
                                                mean=default_mean,
                                                std=default_std)
        return video.VideoGroupValTransformV2(
            crop_size=(opt.input_size, opt.input_size),
            short_side=opt.input_size,
            mean=default_mean,
            std=default_std)

    # NOTE(review): the ten-crop / three-crop transform selection, which also
    # set ``opt.num_crop``, is disabled. ``opt.num_crop`` is therefore used
    # exactly as ``parse_args`` provides it; confirm it is 1 when evaluating
    # the built-in datasets with the single-crop transform above.

    if not opt.deploy:
        # get model
        if opt.use_pretrained and len(opt.hashtag) > 0:
            opt.use_pretrained = opt.hashtag
        classes = opt.num_classes
        model_name = opt.model
        # Currently, there is no hashtag for int8 models.
        if opt.quantized:
            model_name += '_int8'
            opt.use_pretrained = True

        net = get_model(name=model_name,
                        nclass=classes,
                        pretrained=opt.use_pretrained,
                        num_segments=opt.num_segments,
                        num_crop=opt.num_crop)
        net.cast(opt.dtype)
        net.collect_params().reset_ctx(context)
        resume_params = find_model_params(opt)
        if opt.mode == 'hybrid':
            net.hybridize(static_alloc=True, static_shape=True)
        # Compare by value: ``is not ''`` tests object identity and only
        # worked by accident of string interning.
        if resume_params != '' and not opt.use_pretrained:
            net.load_parameters(resume_params, ctx=context)
            print('Pre-trained model %s is successfully loaded.' %
                  (resume_params))
        elif opt.use_pretrained:
            print(
                'Pre-trained model is successfully loaded from the model zoo.')
        else:
            # Nothing was loaded; do not claim otherwise.
            print('No pre-trained weights were loaded.')
    else:
        model_name = 'deploy'
        net = mx.gluon.SymbolBlock.imports(
            '{}-symbol.json'.format(opt.model_prefix), ['data'],
            '{}-0000.params'.format(opt.model_prefix))
        net.hybridize(static_alloc=True, static_shape=True)

    print("Successfully loaded model {}".format(model_name))

    # dummy data for benchmarking performance
    if opt.benchmark:
        benchmarking(opt, net, context)
        sys.exit()

    if opt.dataset == 'ucf101':
        val_dataset = UCF101(setting=opt.val_list,
                             root=opt.data_dir,
                             train=False,
                             new_width=opt.new_width,
                             new_height=opt.new_height,
                             new_length=opt.new_length,
                             target_width=opt.input_size,
                             target_height=opt.input_size,
                             test_mode=True,
                             data_aug=opt.data_aug,
                             num_segments=opt.num_segments,
                             transform=_single_crop_transform())
    elif opt.dataset == 'kinetics400':
        val_dataset = Kinetics400(
            setting=opt.val_list,
            root=opt.data_dir,
            train=False,
            new_width=opt.new_width,
            new_height=opt.new_height,
            new_length=opt.new_length,
            new_step=opt.new_step,
            target_width=opt.input_size,
            target_height=opt.input_size,
            video_loader=opt.video_loader,
            use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            test_mode=True,
            data_aug=opt.data_aug,
            num_segments=opt.num_segments,
            num_crop=opt.num_crop,
            transform=_single_crop_transform())
    elif opt.dataset == 'somethingsomethingv2':
        val_dataset = SomethingSomethingV2(setting=opt.val_list,
                                           root=opt.data_dir,
                                           train=False,
                                           new_width=opt.new_width,
                                           new_height=opt.new_height,
                                           new_length=opt.new_length,
                                           new_step=opt.new_step,
                                           target_width=opt.input_size,
                                           target_height=opt.input_size,
                                           video_loader=opt.video_loader,
                                           use_decord=opt.use_decord,
                                           data_aug=opt.data_aug,
                                           num_segments=opt.num_segments,
                                           transform=_single_crop_transform())
    elif opt.dataset == 'hmdb51':
        val_dataset = HMDB51(setting=opt.val_list,
                             root=opt.data_dir,
                             train=False,
                             new_width=opt.new_width,
                             new_height=opt.new_height,
                             new_length=opt.new_length,
                             new_step=opt.new_step,
                             target_width=opt.input_size,
                             target_height=opt.input_size,
                             video_loader=opt.video_loader,
                             use_decord=opt.use_decord,
                             data_aug=opt.data_aug,
                             num_segments=opt.num_segments,
                             transform=_single_crop_transform())
    elif opt.dataset == 'custom':
        # NOTE(review): the custom split is evaluated with the *training*
        # transform; kept as-is, confirm this is intended.
        transform_test = video.VideoGroupTrainTransform(
            size=(224, 224),
            scale_ratios=[1.0, 0.8],
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225])
        val_dataset = VideoClsCustom(
            root=opt.val_data_dir,
            setting=opt.val_list,
            train=False,
            new_length=32,
            name_pattern='frame_%d.jpg',
            transform=transform_test,
            video_loader=False,
            slowfast=True,
            use_decord=True,
        )
    else:
        message = 'Dataset %s is not supported yet.' % (opt.dataset)
        logger.info(message)
        raise ValueError(message)

    val_data = gluon.data.DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=num_workers)
    print('Load %d test samples in %d iterations.' %
          (len(val_dataset), len(val_data)))

    # calibrate FP32 model into INT8 model
    if opt.calibration:
        calibration(net, val_data, opt, context, logger)
        sys.exit()

    start_time = time.time()
    acc_top1_val, acc_top5_val, true_labels, predicted_probabilities = test(
        context, val_data, opt, net)

    # The report folder is named after the validation list file (no extension).
    split_filename = os.path.split(opt.val_list)[1]
    split = os.path.splitext(split_filename)[0]

    # load encoder
    # NOTE: joblib.load unpickles the file; only point encoder_path at a
    # trusted artifact.
    encoder = joblib.load(opt.encoder_path)

    # set-up metrics
    classes = np.arange(len(encoder.classes_))
    metrics_dict = {
        "Accuracy": balanced_accuracy_score,
        "Mcc": matthews_corrcoef,
        "Precision_Avg": [precision_score, {
            "average": "micro"
        }],
        "Recall_Avg": [recall_score, {
            "average": "micro"
        }],
        "Precision_Class": [precision_score, {
            "labels": classes,
            "average": None
        }],
        "Recall_Class": [recall_score, {
            "labels": classes,
            "average": None
        }],
    }

    split_folder = os.path.join(opt.save_dir, split)

    # set-up evaluator
    evaluator = Evaluator_video(split_folder, encoder, true_labels,
                                predicted_probabilities, metrics_dict)

    # compute report
    report = get_split_report(evaluator)

    # save report
    save_results(report, split_folder)
    print(f"Correctly process split {split}")

    end_time = time.time()

    print('Test accuracy: acc-top1=%f acc-top5=%f' %
          (acc_top1_val * 100, acc_top5_val * 100))
    print('Total evaluation time is %4.2f minutes' %
          ((end_time - start_time) / 60))
def read_video_data(s3_video_path, num_frames=32):
    """Read and preprocess video data from the S3 bucket.

    The object is downloaded to a uniquely named file under ``/tmp/``, decoded
    with decord, transformed with ``VideoGroupValTransform`` (size 224,
    ImageNet mean/std) and returned as an MXNet array. Both temporary files
    are removed even when a step fails.

    Parameters
    ----------
    s3_video_path : str
        S3 location of the video, e.g. ``s3://bucket/key.mp4``.
    num_frames : int, default 32
        Clip length passed to the dataset helper as ``new_length``.

    Returns
    -------
    mxnet.nd.NDArray
        Clip laid out as ``(N, 3, num_frames, 224, 224)``; when
        ``num_frames == 1`` the frame axis is squeezed out (2D input case).
    """
    print('read and preprocess video data here ')
    s3_client = boto3.client('s3')

    fname = s3_video_path.replace('s3://', '')
    fname = fname.replace('S3://', '')
    fname = fname.replace('/', '')

    # Download straight to a unique path (stem + uuid + extension). The old
    # download-then-rename left a window in which two concurrent calls for
    # the same object shared one file.
    stem, ext = os.path.splitext('/tmp/' + fname)
    download_path = stem + str(uuid.uuid4()) + ext
    video_list_path = '/tmp/video_list' + str(uuid.uuid4()) + '.txt'

    bucket, key = get_bucket_and_key(s3_video_path)

    # Constants
    data_dir = '/tmp/'
    num_segments = 1
    new_length = num_frames
    new_step = 1
    # Only decord-based, non-SlowFast video loading is supported here.
    use_decord = True
    video_loader = True
    slowfast = False

    # Preprocessing params
    input_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    try:
        s3_client.download_file(bucket, key, download_path)

        # Dummy duration and label with each video path
        video_list = '{} {} {}'.format(download_path, 10, 1)
        with open(video_list_path, 'w') as fopen:
            fopen.write(video_list)

        transform = video.VideoGroupValTransform(size=input_size,
                                                 mean=mean,
                                                 std=std)
        # Use the general gluoncv dataset class VideoClsCustom for its
        # sampling and loading helpers.
        video_utils = VideoClsCustom(root=data_dir,
                                     setting=video_list_path,
                                     num_segments=num_segments,
                                     new_length=new_length,
                                     new_step=new_step,
                                     video_loader=video_loader,
                                     use_decord=use_decord,
                                     slowfast=slowfast)

        # Read the video
        video_name = download_path
        decord = try_import_decord()
        decord_vr = decord.VideoReader(video_name)
        duration = len(decord_vr)

        segment_indices, skip_offsets = video_utils._sample_test_indices(
            duration)
        clip_input = video_utils._video_TSN_decord_batch_loader(
            video_name, decord_vr, duration, segment_indices, skip_offsets)

        clip_input = transform(clip_input)

        # Stack the frames, group them into clips of ``new_length`` frames
        # and move the channel axis in front of the frame axis.
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape(
            (-1, ) + (new_length, 3, input_size, input_size))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

        if new_length == 1:
            # this is for 2D input case
            clip_input = np.squeeze(clip_input, axis=2)

        clip_input = nd.array(clip_input)
    finally:
        # Cleanup temp files. A file may not exist yet if an earlier step
        # failed; that must not mask the original error.
        for temp_path in (download_path, video_list_path):
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass

    return clip_input
def get_data_loader(opt, batch_size, num_workers, logger, kvstore=None):
    """Build the training/validation data loaders and the batch-splitting helper.

    Parameters
    ----------
    opt : namespace
        Parsed options. Reads ``data_dir``, ``val_data_dir``, ``scale_ratios``
        (comma-separated floats), ``input_size``, ``dataset``, ``train_list``,
        ``val_list``, ``prefetch_ratio`` and the frame sampling options
        forwarded to the dataset classes.
    batch_size : int
        Batch size used by both loaders.
    num_workers : int
        Number of loader worker processes.
    logger : logging.Logger
        Receives the sample counts and any "not supported" message.
    kvstore : optional
        When given, samples are partitioned with ``ShuffleSplitSampler`` using
        ``kvstore.num_workers`` and ``kvstore.rank``.

    Returns
    -------
    tuple
        ``(train_data, val_data, batch_fn)``.

    Raises
    ------
    ValueError
        If ``opt.dataset`` is not supported. Previously this was only logged
        and the function then failed with an ``UnboundLocalError``.
    """
    data_dir = opt.data_dir
    val_data_dir = opt.val_data_dir
    scale_ratios = [float(i) for i in opt.scale_ratios.split(',')]
    input_size = opt.input_size

    # Fail fast, before any transform or dataset is constructed.
    if opt.dataset not in ('kinetics400', 'ucf101', 'somethingsomethingv2',
                           'hmdb51', 'custom'):
        message = 'Dataset %s is not supported yet.' % (opt.dataset)
        logger.info(message)
        raise ValueError(message)

    def batch_fn(batch, ctx):
        """Split one (data, label) batch across the given contexts."""
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                              even_split=False)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0,
                               even_split=False)
        return data, label

    transform_train = video.VideoGroupTrainTransform(
        size=(input_size, input_size),
        scale_ratios=scale_ratios,
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])
    transform_test = video.VideoGroupValTransform(size=input_size,
                                                  mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])

    def _video_kwargs():
        """Extra options for datasets that can decode video files."""
        return dict(new_step=opt.new_step,
                    video_loader=opt.video_loader,
                    use_decord=opt.use_decord)

    def _slowfast_kwargs():
        """Video options plus the SlowFast sampling options."""
        kwargs = _video_kwargs()
        kwargs.update(slowfast=opt.slowfast,
                      slow_temporal_stride=opt.slow_temporal_stride,
                      fast_temporal_stride=opt.fast_temporal_stride)
        return kwargs

    # Per dataset: class, dataset-specific keyword arguments, validation root.
    # Only kinetics400 and custom read validation data from ``val_data_dir``;
    # the other datasets use ``data_dir`` for both splits.
    if opt.dataset == 'kinetics400':
        dataset_cls, extra, val_root = Kinetics400, _slowfast_kwargs(), val_data_dir
    elif opt.dataset == 'ucf101':
        dataset_cls, extra, val_root = UCF101, {}, data_dir
    elif opt.dataset == 'somethingsomethingv2':
        dataset_cls, extra, val_root = SomethingSomethingV2, _video_kwargs(), data_dir
    elif opt.dataset == 'hmdb51':
        dataset_cls, extra, val_root = HMDB51, _video_kwargs(), data_dir
    else:  # 'custom' (validated above)
        dataset_cls, extra, val_root = VideoClsCustom, _slowfast_kwargs(), val_data_dir

    def _build(setting, root, is_train, transform):
        """Instantiate the selected dataset class for one split."""
        return dataset_cls(setting=setting,
                           root=root,
                           train=is_train,
                           new_width=opt.new_width,
                           new_height=opt.new_height,
                           new_length=opt.new_length,
                           target_width=input_size,
                           target_height=input_size,
                           num_segments=opt.num_segments,
                           transform=transform,
                           **extra)

    train_dataset = _build(opt.train_list, data_dir, True, transform_train)
    val_dataset = _build(opt.val_list, val_root, False, transform_test)

    logger.info('Load %d training samples and %d validation samples.' %
                (len(train_dataset), len(val_dataset)))

    prefetch = int(opt.prefetch_ratio * num_workers)
    if kvstore is not None:
        # Distributed training: each worker reads its own shard.
        train_data = gluon.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(train_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=prefetch,
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(val_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=prefetch,
            last_batch='discard')
    else:
        train_data = gluon.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers,
                                           prefetch=prefetch,
                                           last_batch='rollover')
        val_data = gluon.data.DataLoader(val_dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers,
                                         prefetch=prefetch,
                                         last_batch='discard')

    return train_data, val_data, batch_fn
from gluoncv.utils import (makedirs, LRSequential, LRScheduler,
                           split_and_load, TrainingHistory)

# Device setup: a single GPU.
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]

# Training transform: 224x224 output, scale ratios 1.0 / 0.8, ImageNet mean/std.
transform_train = video.VideoGroupTrainTransform(
    size=(224, 224),
    scale_ratios=[1.0, 0.8],
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225])

per_device_batch_size = 5
num_workers = 8
batch_size = per_device_batch_size * num_gpus

# 32-frame clips decoded from video files (``video_loader=True``).
train_dataset = VideoClsCustom(
    root=os.path.expanduser(root_dir),
    setting=os.path.expanduser(train_val_txt_path),
    train=True,
    new_length=32,
    video_loader=True,
    transform=transform_train)
print('Load %d training samples.' % len(train_dataset))

train_data = gluon.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# Network with an 18-class output, placed on the selected device(s).
net = get_model(name='slowfast_8x8_resnet50_kinetics400', nclass=18)
net.collect_params().reset_ctx(ctx)
# print(net)

# Learning rate decay factor
lr_decay = 0.1