def get_loaders(opt):
    """Make dataloaders for the train and validation sets."""
    # Train loader: ImageNet mean/std normalization.
    norm_method = Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    # Note: the leading Scale is effectively superseded by Resize(256) + CenterCrop(224).
    spatial_transform = Compose([
        Scale((opt.sample_size, opt.sample_size)),
        Resize(256),
        CenterCrop(224),
        ToTensor(),
        norm_method
    ])
    temporal_transform = TemporalRandomCrop(25)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True)

    # Validation loader: loop-pad short clips instead of randomly cropping them.
    target_transform = ClassLabel()
    temporal_transform = LoopPadding(25)
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.num_workers,
                                             pin_memory=True)
    return train_loader, val_loader
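# The temporal transforms above are referenced but not defined in these snippets.
# A minimal sketch of what TemporalRandomCrop and LoopPadding are assumed to do:
# both operate on a list of frame indices, which is the interface the snippets
# below rely on. Details may differ from the actual implementations.
import random

class TemporalRandomCrop:
    """Pick a random contiguous window of `size` frame indices; loop-pad if short."""
    def __init__(self, size):
        self.size = size

    def __call__(self, frame_indices):
        begin = random.randint(0, max(0, len(frame_indices) - self.size))
        out = list(frame_indices[begin:begin + self.size])
        for index in out:  # loop the clip until the window is full
            if len(out) >= self.size:
                break
            out.append(index)
        return out

class LoopPadding:
    """Keep the clip as-is, repeating frames from the start until `size` is reached."""
    def __init__(self, size):
        self.size = size

    def __call__(self, frame_indices):
        out = list(frame_indices[:self.size])
        for index in out:
            if len(out) >= self.size:
                break
            out.append(index)
        return out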
def get_dataloader(opt):
    mean = [110.63666788 / 255, 103.16065604 / 255, 96.29023126 / 255]
    std = [1, 1, 1]
    norm_method = Normalize(mean, std)
    spatial_transform = Compose(
        [Scale(112), CornerCrop(112, 'c'), ToTensor(255), norm_method])
    temporal_transform = LoopPadding(16)
    target_transform = ClassLabel()
    test_data = SurgicalDataset(os.path.abspath(opt.frames_path),
                                os.path.abspath(opt.video_phase_annotation_path),
                                opt.class_names,
                                spatial_transform=spatial_transform,
                                temporal_transform=temporal_transform,
                                target_transform=target_transform,
                                sample_duration=16)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=4,
                                              pin_memory=True)
    return test_loader
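# ClassLabel (and the VideoID variant used later) are simple target transforms.
# A plausible minimal version, assuming each annotation target is a dict with
# 'label' and 'video_id' keys as in the 3D-ResNets-style datasets:
class ClassLabel:
    """Target transform that keeps only the class label."""
    def __call__(self, target):
        return target['label']

class VideoID:
    """Target transform that keeps only the video identifier."""
    def __call__(self, target):
        return target['video_id']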
def get_traininfo(opt, norm_method):
    assert opt.train_crop in ['random', 'corner', 'center']
    if opt.train_crop == 'random':
        crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
    elif opt.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
    elif opt.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                           crop_positions=['c'])
    spatial_transform = Compose([
        RandomRotate(),
        RandomResize(),
        crop_method,
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = TemporalRandomCrop(opt.sample_duration)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.n_threads,
                                               pin_memory=True)
    train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                          ['epoch', 'loss', 'prec1', 'prec5', 'lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'prec1', 'prec5', 'lr'])
    return train_loader, train_logger, train_batch_logger
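# Logger is used throughout but never shown. A minimal sketch consistent with
# how it is called here (a path plus a header list, then log(dict) per row);
# the real class may differ, e.g. in its delimiter or flushing behavior.
import csv

class Logger:
    """Append rows of named values to a tab-separated log file."""
    def __init__(self, path, header):
        self.log_file = open(path, 'w', newline='')
        self.writer = csv.writer(self.log_file, delimiter='\t')
        self.header = header
        self.writer.writerow(header)

    def log(self, values):
        # Write the columns in header order.
        self.writer.writerow([values[col] for col in self.header])
        self.log_file.flush()

    def __del__(self):
        self.log_file.close()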
def __init__(self, model_file, sample_duration, model_type, cuda_id=0):
    self.opt = parse_opts()
    self.opt.model = model_type
    self.opt.root_path = './C3D_ResNet/data'
    self.opt.resume_path = os.path.join(self.opt.root_path, model_file)
    self.opt.pretrain_path = os.path.join(self.opt.root_path,
                                          'models/resnet-18-kinetics.pth')
    self.opt.cuda_id = cuda_id
    self.opt.dataset = 'ucf101'
    self.opt.n_classes = 400
    self.opt.n_finetune_classes = 3
    self.opt.ft_begin_index = 4
    self.opt.model_depth = 18
    self.opt.resnet_shortcut = 'A'
    self.opt.sample_duration = sample_duration
    self.opt.batch_size = 1
    self.opt.n_threads = 1
    self.opt.checkpoint = 5
    self.opt.arch = '{}-{}'.format(self.opt.model, self.opt.model_depth)
    self.opt.mean = get_mean(self.opt.norm_value, dataset=self.opt.mean_dataset)
    self.opt.std = get_std(self.opt.norm_value)
    # print(self.opt)
    print('Loading C3D action-recognition model..')
    self.model, parameters = generate_model(self.opt)
    # print(self.model)
    if self.opt.no_mean_norm and not self.opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not self.opt.std_norm:
        norm_method = Normalize(self.opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(self.opt.mean, self.opt.std)
    if self.opt.resume_path:
        print('    loading checkpoint {}'.format(self.opt.resume_path))
        checkpoint = torch.load(self.opt.resume_path)
        # assert self.opt.arch == checkpoint['arch']
        self.opt.begin_epoch = checkpoint['epoch']
        self.model.load_state_dict(checkpoint['state_dict'])
    self.spatial_transform = Compose([
        ScaleQC(int(self.opt.sample_size / self.opt.scale_in_test)),
        CornerCrop(self.opt.sample_size, self.opt.crop_position_in_test),
        ToTensor(self.opt.norm_value),
        norm_method
    ])
    self.target_transform = ClassLabel()
    self.model.eval()
def get_loaders(opt):
    """Make dataloaders for the train and validation sets."""
    # train loader
    opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    spatial_transform = Compose([
        # crop_method,
        Scale((opt.sample_size, opt.sample_size)),
        # RandomHorizontalFlip(),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = TemporalRandomCrop(16)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True)

    # validation loader
    spatial_transform = Compose([
        Scale((opt.sample_size, opt.sample_size)),
        # CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value),
        norm_method
    ])
    target_transform = ClassLabel()
    temporal_transform = LoopPadding(16)
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.num_workers,
                                             pin_memory=True)
    return train_loader, val_loader
def get_testinfo(opt, norm_method):
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value),
        norm_method
    ])
    # temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = ClassLabel()
    test_data = get_test_set(opt, spatial_transform, target_transform)
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=1,  # batch size must be 1
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True)
    return test_loader
def get_valinfo(opt, norm_method):
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = TemporalCenterCrop(opt.sample_duration)
    target_transform = ClassLabel()
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=True)
    val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                        ['epoch', 'loss', 'prec1', 'prec5'])
    return validation_data, val_loader, val_logger
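# Several snippets pass opt.norm_value into ToTensor. The assumed convention,
# sketched below, is that pixel values are divided by norm_value (1 keeps the
# 0-255 range, 255 maps to 0-1), after which Normalize subtracts the mean and
# divides by the std per channel. The actual classes may handle PIL/numpy
# conversion differently; this is a simplified stand-in.
import numpy as np
import torch

class ToTensor:
    """Convert a HxWxC uint8 frame to a CxHxW float tensor scaled by 1/norm_value."""
    def __init__(self, norm_value=255):
        self.norm_value = norm_value

    def __call__(self, pic):
        arr = np.asarray(pic, dtype=np.float32) / self.norm_value
        return torch.from_numpy(arr).permute(2, 0, 1).contiguous()

class Normalize:
    """Channel-wise (x - mean) / std on a CxHxW tensor."""
    def __init__(self, mean, std):
        self.mean = torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1)
        self.std = torch.tensor(std, dtype=torch.float32).view(-1, 1, 1)

    def __call__(self, tensor):
        return (tensor - self.mean) / self.std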
def create_dataloader(args):
    if args.root_path != '':
        args.video_path = os.path.join(args.root_path, args.video_path)
        args.annotation_path = os.path.join(args.root_path, args.annotation_path)
        args.result_path = os.path.join(args.root_path, args.result_path)
        if args.resume_path:
            args.resume_path = os.path.join(args.root_path, args.resume_path)
        if args.pretrain_path:
            # args.pretrain_path = os.path.join(args.root_path, args.pretrain_path)
            args.pretrain_path = os.path.abspath(args.pretrain_path)
    args.scales = [args.initial_scale]
    for i in range(1, args.n_scales):
        args.scales.append(args.scales[-1] * args.scale_step)
    args.mean = get_mean(args.norm_value, dataset=args.mean_dataset)
    args.std = get_std(args.norm_value)
    if args.no_mean_norm and not args.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not args.std_norm:
        norm_method = Normalize(args.mean, [1, 1, 1])
    else:
        norm_method = Normalize(args.mean, args.std)
    assert args.train_crop in ['random', 'corner', 'center']
    if args.train_crop == 'random':
        crop_method = MultiScaleRandomCrop(args.scales, args.sample_size)
    elif args.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(args.scales, args.sample_size)
    elif args.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(args.scales, args.sample_size,
                                           crop_positions=['c'])
    spatial_transform = Compose([
        crop_method,
        RandomHorizontalFlip(),
        ToTensor(args.norm_value),
        norm_method
    ])
    temporal_transform = TemporalRandomCrop(args.sample_duration)
    target_transform = ClassLabel()
    training_data = get_training_set(args, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.n_threads,
                                               pin_memory=True)
    spatial_transform = Compose([
        # Scale(args.sample_size),
        Scale(int(args.sample_size / args.scale_in_test)),
        # CenterCrop(args.sample_size),
        CornerCrop(args.sample_size, args.crop_position_in_test),
        ToTensor(args.norm_value),
        norm_method
    ])
    temporal_transform = TemporalCenterCrop(args.sample_duration)
    target_transform = ClassLabel()
    validation_data = get_validation_set(args, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=args.n_threads,
                                             pin_memory=True)
    return train_loader, val_loader
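# MultiScaleRandomCrop and MultiScaleCornerCrop come from the 3D-ResNets-style
# spatial_transforms module. Roughly, both pick one scale from `scales`, crop a
# scale * min(w, h) square (at a random position, or at a corner/center), and
# resize it to sample_size. A simplified sketch of the random variant, ignoring
# the per-clip parameter randomization the real classes perform:
import random
from PIL import Image

class MultiScaleRandomCrop:
    """Crop a randomly scaled square at a random position, then resize."""
    def __init__(self, scales, size):
        self.scales = scales
        self.size = size

    def __call__(self, img):
        scale = random.choice(self.scales)
        crop_size = int(min(img.size) * scale)
        w, h = img.size
        x1 = random.randint(0, w - crop_size)
        y1 = random.randint(0, h - crop_size)
        img = img.crop((x1, y1, x1 + crop_size, y1 + crop_size))
        return img.resize((self.size, self.size), Image.BILINEAR)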
parser.add_argument('--manual_seed', default=1, type=int,
                    help='Manually set random seed')
args = parser.parse_args()
return args


if __name__ == '__main__':
    args = opt()  # read the command-line arguments
    args.arch = "ResNet-{}".format(args.model_depth)  # record the architecture to run
    spatial_transform = Compose([
        ToTensor(),  # convert each frame (a PIL Image) loaded per iteration to a Tensor
    ])
    temporal_transform = TemporalRandomCrop4flow()  # temporal preprocessing; none in this case
    target_transform = ClassLabel()  # ground-truth targets: class labels (binary classification)
    # accuracies = AverageMeter()  # per-run accuracy and its average
    model = test_generate_model(args)  # load the model (and pretrained weights, if any)
    test_data = get_training_set(args, spatial_transform, temporal_transform,
                                 target_transform)  # build the dataset fed to the dataloader
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=20)
    pred = []
    Y = []
    for i, (x, y) in enumerate(test_loader):
        x = x.cuda()  # x is already a tensor; re-wrapping with torch.tensor() is redundant
        with torch.no_grad():
            output = model(x)
        pred += [int(l.argmax()) for l in output]
def test(self, annotation_path='', video_path=''):
    opt = self.opt
    if annotation_path != '':
        opt.annotation_path = annotation_path
        if opt.root_path != '':
            opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
    # if video_path != '':
    #     opt.video_path = video_path
    #     if opt.root_path != '':
    #         opt.video_path = os.path.join(opt.root_path, opt.video_path)
    if not os.path.exists(opt.result_path):
        os.makedirs(opt.result_path)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    # original
    spatial_transform = Compose([
        # Scale(opt.sample_size),
        Scale(112),
        CenterCrop(112),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = TemporalCenterCrop(opt.sample_duration)
    target_transform = ClassLabel()
    test_data = get_test_set(opt, spatial_transform, temporal_transform,
                             target_transform)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    test_logger = Logger(os.path.join(opt.result_path, 'test.log'),
                         ['top1', 'precision', 'recall'])
    if opt.resume_path:
        print('loading checkpoint {}'.format(opt.resume_path))
        checkpoint = torch.load(opt.resume_path)
        assert opt.arch == checkpoint['arch']
        opt.begin_epoch = checkpoint['epoch']
        self.model.load_state_dict(checkpoint['state_dict'])

    recorder = []
    self.model.eval()
    batch_time = AverageMeter()
    top1 = AverageMeter()
    precisions = AverageMeter()
    recalls = AverageMeter()
    y_true = []
    y_pred = []
    end_time = time.time()
    for i, (inputs, targets) in enumerate(test_loader):
        if not opt.no_cuda:
            targets = targets.cuda(non_blocking=True)
        # inputs = Variable(torch.squeeze(inputs), volatile=True)
        with torch.no_grad():
            inputs = Variable(inputs)
            targets = Variable(targets)
            outputs = self.model(inputs)
            if not opt.no_softmax_in_test:
                outputs = F.softmax(outputs, dim=1)
            recorder.append(outputs.data.cpu().numpy().copy())
            y_true.extend(targets.cpu().numpy().tolist())
            y_pred.extend(outputs.argmax(1).cpu().numpy().tolist())
        _cls = outputs.argmax(1).cpu().numpy().tolist()[0]
        prec1 = self.calculate_accuracy(outputs, targets, topk=(1,))
        precision = calculate_precision(outputs, targets)
        recall = calculate_recall(outputs, targets)
        top1.update(prec1[0], inputs.size(0))
        precisions.update(precision, inputs.size(0))
        recalls.update(recall, inputs.size(0))
        batch_time.update(time.time() - end_time)
        end_time = time.time()
    test_logger.log({
        'top1': top1.avg,
        'precision': precisions.avg,
        'recall': recalls.avg
    })
    print('-----Evaluation is finished------')
    print('Overall Prec@1 {:.05f}%'.format(top1.avg * 100))
    return y_pred, y_true, test_data
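# AverageMeter is the usual running-average helper from the PyTorch ImageNet
# example; a minimal version for completeness:
class AverageMeter:
    """Tracks the most recent value and the running average."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count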
def main():
    detector, classifier = load_models(opt)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    spatial_transform = Compose([
        Scale(112),
        CenterCrop(112),
        ToTensor(opt.norm_value),
        norm_method
    ])
    target_transform = ClassLabel()

    ## Get list of videos to test
    if opt.dataset == 'egogesture':
        subject_list = ['Subject{:02d}'.format(i)
                        for i in [2, 9, 11, 14, 18, 19, 28, 31, 41, 47]]
        test_paths = []
        for subject in subject_list:
            for x in glob.glob(os.path.join(opt.video_path, subject, '*/*/rgb*')):
                test_paths.append(x)
    elif opt.dataset == 'nvgesture':
        df = pd.read_csv(os.path.join(opt.video_path,
                                      'nvgesture_test_correct_cvpr2016_v2.lst'),
                         delimiter=' ', header=None)
        test_paths = []
        for x in df[0].values:
            test_paths.append(os.path.join(opt.video_path,
                                           x.replace('path:', ''),
                                           'sk_color_all'))

    print('Start Evaluation')
    detector.eval()
    classifier.eval()

    levenshtein_accuracies = AverageMeter()
    videoidx = 0
    for path in test_paths[:]:
        if opt.dataset == 'egogesture':
            opt.whole_path = os.path.join(*path.rsplit(os.sep, 4)[1:])
        elif opt.dataset == 'nvgesture':
            opt.whole_path = os.path.join(*path.rsplit(os.sep, 5)[1:])
        videoidx += 1
        active_index = 0
        passive_count = 0
        active = False
        prev_active = False
        finished_prediction = None
        pre_predict = False

        cum_sum = np.zeros(opt.n_classes_clf, )
        clf_selected_queue = np.zeros(opt.n_classes_clf, )
        det_selected_queue = np.zeros(opt.n_classes_det, )
        myqueue_det = Queue(opt.det_queue_size, n_classes=opt.n_classes_det)
        myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)

        print('[{}/{}]============'.format(videoidx, len(test_paths)))
        print(path)
        opt.sample_duration = max(opt.sample_duration_clf, opt.sample_duration_det)
        # note: temporal_transform is created but not passed below (None is used)
        temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
        test_data = get_online_data(opt, spatial_transform, None, target_transform)
        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)

        results = []
        prev_best1 = opt.n_classes_clf
        dataset_len = len(test_loader.dataset)
        for i, (inputs, targets) in enumerate(test_loader):
            if not opt.no_cuda:
                targets = targets.cuda()
            ground_truth_array = np.zeros(opt.n_classes_clf + 1, )
            with torch.no_grad():
                inputs = Variable(inputs)
                targets = Variable(targets)
                if opt.modality_det == 'RGB':
                    inputs_det = inputs[:, :-1, -opt.sample_duration_det:, :, :]
                elif opt.modality_det == 'Depth':
                    inputs_det = inputs[:, -1, -opt.sample_duration_det:, :, :].unsqueeze(1)
                elif opt.modality_det == 'RGB-D':
                    inputs_det = inputs[:, :, -opt.sample_duration_det:, :, :]
                outputs_det = detector(inputs_det)
                outputs_det = F.softmax(outputs_det, dim=1)
                outputs_det = outputs_det.cpu().numpy()[0].reshape(-1, )
                # enqueue the probabilities to the detector queue
                myqueue_det.enqueue(outputs_det.tolist())
                if opt.det_strategy == 'raw':
                    det_selected_queue = outputs_det
                elif opt.det_strategy == 'median':
                    det_selected_queue = myqueue_det.median
                elif opt.det_strategy == 'ma':
                    det_selected_queue = myqueue_det.ma
                elif opt.det_strategy == 'ewma':
                    det_selected_queue = myqueue_det.ewma
                prediction_det = np.argmax(det_selected_queue)
                prob_det = det_selected_queue[prediction_det]

                #### The detector state is checked here: it acts as a switch for the classifier
                if prediction_det == 1:
                    if opt.modality_clf == 'RGB':
                        inputs_clf = inputs[:, :-1, :, :, :]
                    elif opt.modality_clf == 'Depth':
                        inputs_clf = inputs[:, -1, :, :, :].unsqueeze(1)
                    elif opt.modality_clf == 'RGB-D':
                        inputs_clf = inputs[:, :, :, :, :]
                    inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
                    outputs_clf = classifier(inputs_clf)
                    outputs_clf = F.softmax(outputs_clf, dim=1)
                    outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
                    # Push the probabilities to queue
                    myqueue_clf.enqueue(outputs_clf.tolist())
                    passive_count = 0
                    if opt.clf_strategy == 'raw':
                        clf_selected_queue = outputs_clf
                    elif opt.clf_strategy == 'median':
                        clf_selected_queue = myqueue_clf.median
                    elif opt.clf_strategy == 'ma':
                        clf_selected_queue = myqueue_clf.ma
                    elif opt.clf_strategy == 'ewma':
                        clf_selected_queue = myqueue_clf.ewma
                else:
                    outputs_clf = np.zeros(opt.n_classes_clf, )
                    # Push the probabilities to queue
                    myqueue_clf.enqueue(outputs_clf.tolist())
                    passive_count += 1

            if passive_count >= opt.det_counter or i == (dataset_len - 2):
                active = False
            else:
                active = True

            # one of the two cum_sum updates below needs to be commented out !!!!
            if active:
                active_index += 1
                cum_sum = ((cum_sum * (active_index - 1)) +
                           (weighting_func(active_index) * clf_selected_queue)) / active_index  # weighted approach
                # cum_sum = ((cum_sum * (active_index - 1)) + (1.0 * clf_selected_queue)) / active_index  # unweighted approach
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if float(cum_sum[best1] - cum_sum[best2]) > opt.clf_threshold_pre:
                    finished_prediction = True
                    pre_predict = True
            else:
                active_index = 0

            if active == False and prev_active == True:
                finished_prediction = True
            elif active == True and prev_active == False:
                finished_prediction = False

            if finished_prediction == True:
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if cum_sum[best1] > opt.clf_threshold_final:
                    if pre_predict == True:
                        if best1 != prev_best1:
                            if cum_sum[best1] > opt.clf_threshold_final:
                                results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                                print('Early Detected - class : {} with prob : {} at frame {}'.format(
                                    best1, cum_sum[best1],
                                    (i * opt.stride_len) + opt.sample_duration_clf))
                    else:
                        if cum_sum[best1] > opt.clf_threshold_final:
                            if best1 == prev_best1:
                                # note: softmax probabilities never exceed 1, so this branch never fires
                                if cum_sum[best1] > 5:
                                    results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                                    print('Late Detected - class : {} with prob : {} at frame {}'.format(
                                        best1, cum_sum[best1],
                                        (i * opt.stride_len) + opt.sample_duration_clf))
                            else:
                                results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                                print('Late Detected - class : {} with prob : {} at frame {}'.format(
                                    best1, cum_sum[best1],
                                    (i * opt.stride_len) + opt.sample_duration_clf))
                finished_prediction = False
                prev_best1 = best1
                cum_sum = np.zeros(opt.n_classes_clf, )

            if active == False and prev_active == True:
                pre_predict = False

            prev_active = active

        if opt.dataset == 'egogesture':
            target_csv_path = os.path.join(
                opt.video_path, 'labels-final-revised1',
                opt.whole_path.rsplit(os.sep, 2)[0],
                'Group' + opt.whole_path[-1] + '.csv').replace('Subject', 'subject')
            true_classes = []
            with open(target_csv_path) as csvfile:
                readCSV = csv.reader(csvfile, delimiter=',')
                for row in readCSV:
                    true_classes.append(int(row[0]) - 1)
        elif opt.dataset == 'nvgesture':
            true_classes = []
            with open('./annotation_nvGesture/vallistall.txt') as csvfile:
                readCSV = csv.reader(csvfile, delimiter=' ')
                for row in readCSV:
                    if row[0] == opt.whole_path:
                        if row[1] != '26':
                            true_classes.append(int(row[1]) - 1)

        if len(results) != 0:
            predicted = np.array(results)[:, 1]
        else:
            predicted = []
        true_classes = np.array(true_classes)
        levenshtein_distance = LevenshteinDistance(true_classes, predicted)
        levenshtein_accuracy = 1 - (levenshtein_distance / len(true_classes))
        if levenshtein_distance < 0:  # distance cannot be less than 0
            levenshtein_accuracies.update(0, len(true_classes))
        else:
            levenshtein_accuracies.update(levenshtein_accuracy, len(true_classes))

        print('predicted classes: \t', predicted)
        print('True classes :\t\t', true_classes)
        print('Levenshtein Accuracy = {} ({})'.format(levenshtein_accuracies.val,
                                                      levenshtein_accuracies.avg))

    print('Average Levenshtein Accuracy= {}'.format(levenshtein_accuracies.avg))
    print('-----Evaluation is finished------')
    with open("./results/online-results.log", "a") as myfile:
        # trailing newline added so appended runs don't run together
        myfile.write("{}, {}, {}, {}, {}, {}\n".format(
            datetime.datetime.now(), opt.resume_path_clf, opt.model_clf,
            opt.width_mult_clf, opt.modality_clf, levenshtein_accuracies.avg))
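# weighting_func is not defined in these snippets. In the single-time-activation
# scheme this code follows, early class scores are down-weighted so a prediction
# can only fire once enough active frames have accumulated. A plausible
# sigmoid-shaped sketch; the exact constants are assumptions:
import numpy as np

def weighting_func(active_index, center=10.0, slope=0.5):
    """Monotonically increasing weight in (0, 1) over the active-frame index."""
    return 1.0 / (1.0 + np.exp(-slope * (active_index - center)))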
    RandomHorizontalFlip(),
    ucf_crop,
    ToTensor(args.norm_value),
    norm_method,
])
ucf_temporal = TemporalRandomCrop(args.sample_duration, args.downsample)
# ucf_temporal = TemporalCenterCrop(args.sample_duration, args.downsample)
# kinetics_temporal = TransformTwice(TemporalRandomCrop(args.sample_duration, args.downsample))
kinetics_temporal = TemporalRandomCrop(args.sample_duration, args.downsample)
# kinetics_temporal = TemporalCenterCrop(args.sample_duration, args.downsample)
spatial_transform = [ucf_spatial, kinetics_spatial]
temporal_transform = [ucf_temporal, kinetics_temporal]
# the second target transform is irrelevant here since we don't use the Kinetics labels
target_transform = [ClassLabel(), ClassLabel_fromarray(labels)]
combined_dataset = UCF_with_Kinetics(args.l_vids_path,
                                     args.l_annotation_path,
                                     args.ul_vids_path,
                                     args.ul_annotation_path,
                                     'training',
                                     1,
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     sample_duration=args.sample_duration)
label_length = combined_dataset.labeled_length
unlabel_length = combined_dataset.unlabeled_length
assert label_length + unlabel_length == len(
def classify_video(opt, video_path):
    classifier = load_models(opt)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    spatial_transform = Compose(
        [Scale(112), CenterCrop(112), ToTensor(opt.norm_value), norm_method])
    target_transform = ClassLabel()

    # video open
    idx2label = [
        "Zoom_in_with_fingers", "Click_with_index_finger", "Sweep_diagonal",
        "Sweep_circle", "Sweep_cross", "Make_a_phone_call", "Wave_finger",
        "Knock", "Dual_hands_heart", "Move_fingers_left"
    ]
    opt.sample_duration = opt.sample_duration_clf
    fps = ""
    cap = cv2.VideoCapture(video_path)  # was commented out, but `cap` is used below
    # cap = cv2.VideoCapture(0)
    num_frame = 0
    clip = []
    active_index = 0
    passive_count = 0
    active = False
    prev_active = False
    finished_prediction = None
    pre_predict = False
    classifier.eval()
    cum_sum = np.zeros(opt.n_classes_clf, )
    clf_selected_queue = np.zeros(opt.n_classes_clf, )
    myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)
    results = []
    prev_best1 = opt.n_classes_clf
    spatial_transform.randomize_parameters()
    temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
    pre_len_result = 0
    cur_label = ""
    step = 2
    fps_r = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    t1 = time.time()
    while cap.isOpened():
        ret, frame = cap.read()
        if num_frame == 0:
            active = True
            cur_frame = cv2.resize(frame, (320, 240))
            cur_frame = Image.fromarray(cv2.cvtColor(cur_frame, cv2.COLOR_BGR2RGB))
            cur_frame = cur_frame.convert('RGB')
            for i in range(opt.sample_duration):
                clip.append(cur_frame)
            clip = [spatial_transform(img) for img in clip]
        elif num_frame == total_frames:
            break
        elif num_frame == total_frames - 3:
            active = False
        if num_frame % step == 0:
            clip.pop(0)
            _frame = cv2.resize(frame, (320, 240))
            _frame = Image.fromarray(cv2.cvtColor(_frame, cv2.COLOR_BGR2RGB))
            _frame = _frame.convert('RGB')
            _frame = spatial_transform(_frame)
            clip.append(_frame)
            im_dim = clip[0].size()[-2:]
            try:
                test_data = torch.cat(clip, 0).view(
                    (opt.sample_duration, -1) + im_dim).permute(1, 0, 2, 3)
            except Exception as e:
                pdb.set_trace()
                raise e
            inputs = torch.cat([test_data], 0).view(1, 3, opt.sample_duration, 112, 112)
            # print(inputs.size())
            with torch.no_grad():
                inputs = Variable(inputs)
                if opt.modality_clf == 'RGB':
                    inputs_clf = inputs[:, :, :, :, :]
                inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
                outputs_clf = classifier(inputs_clf)
                outputs_clf = F.softmax(outputs_clf, dim=1)
                outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
                myqueue_clf.enqueue(outputs_clf.tolist())
                if opt.clf_strategy == 'raw':
                    clf_selected_queue = outputs_clf
                elif opt.clf_strategy == 'median':
                    clf_selected_queue = myqueue_clf.median
                elif opt.clf_strategy == 'ma':
                    clf_selected_queue = myqueue_clf.ma
                elif opt.clf_strategy == 'ewma':
                    clf_selected_queue = myqueue_clf.ewma
            # print(clf_selected_queue)
            # one of the cum_sum updates needs to be commented out !!!!
            if active:
                active_index += 1
                cum_sum = ((cum_sum * (active_index - 1)) +
                           (1.0 * clf_selected_queue)) / active_index  # unweighted approach
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if float(cum_sum[best1] - cum_sum[best2]) > opt.clf_threshold_pre:
                    finished_prediction = True
                    pre_predict = True
            else:
                active_index = 0
            if active == False and prev_active == True:
                finished_prediction = True
            elif active == True and prev_active == False:
                finished_prediction = False
            if finished_prediction == True:
                # print("finished_prediction")
                # print(finished_prediction, pre_predict)
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if cum_sum[best1] > opt.clf_threshold_final:
                    # note: `i` is the stale index left over from the clip-initialization loop above
                    results.append(((i * opt.stride_len) + opt.sample_duration_clf, best1))
                finished_prediction = False
                prev_best1 = best1
                cum_sum = np.zeros(opt.n_classes_clf, )
            if active == False and prev_active == True:
                pre_predict = False
            prev_active = active
            if len(results) != 0:
                predicted = np.array(results)[:, 1]
                prev_best1 = -1
            else:
                predicted = []
            if len(results) > pre_len_result:
                cur_label = idx2label[predicted[pre_len_result]]
                pre_len_result = len(results)
        num_frame += 1
    elapsedTime = time.time() - t1
    return cur_label, elapsedTime
def classify_video(opt, video_path):
    classifier = load_models(opt)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    spatial_transform = Compose(
        [Scale(112), CenterCrop(112), ToTensor(opt.norm_value), norm_method])
    target_transform = ClassLabel()

    # video open
    idx2label = [
        "Zoom_in_with_fingers", "Click_with_index_finger", "Sweep_diagonal",
        "Sweep_circle", "Sweep_cross", "Make_a_phone_call", "Wave_finger",
        "Knock", "Dual_hands_heart", "Move_fingers_left"
    ]
    opt.sample_duration = opt.sample_duration_clf
    cap = cv2.VideoCapture(video_path)
    # cap = cv2.VideoCapture(0)
    num_frame = 0
    clip = []
    classifier.eval()
    spatial_transform.randomize_parameters()
    temporal_transform = TemporalRandomCrop(opt.sample_duration, opt.downsample)
    cur_label = ""
    step = 2
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    t1 = time.time()
    print('total:', total_frames)
    while cap.isOpened():
        num_frame += 1
        if num_frame == total_frames - 1:
            break
        ret, frame = cap.read()
        cur_frame = cv2.resize(frame, (320, 240))
        cur_frame = Image.fromarray(cv2.cvtColor(cur_frame, cv2.COLOR_BGR2RGB))
        cur_frame = cur_frame.convert('RGB')
        if num_frame % step == 0:
            clip.append(cur_frame)
            # re-classify on the frames sampled so far
            indexes = temporal_transform([i for i in range(len(clip))])
            new_clip = []
            for i in indexes:
                new_clip.append(clip[i])
            new_clip = [spatial_transform(img) for img in new_clip]
            im_dim = new_clip[0].size()[-2:]
            try:
                test_data = torch.cat(new_clip, 0).view(
                    (opt.sample_duration, -1) + im_dim).permute(1, 0, 2, 3)
            except Exception as e:
                pdb.set_trace()
                raise e
            inputs = torch.cat([test_data], 0).view(1, 3, opt.sample_duration, 112, 112)
            # print(inputs.size())
            with torch.no_grad():
                inputs = Variable(inputs)
                if opt.modality_clf == 'RGB':
                    inputs_clf = inputs[:, :, :, :, :]
                inputs_clf = torch.Tensor(inputs_clf.numpy()[:, :, ::2, :, :])
                outputs_clf = classifier(inputs_clf)
                outputs_clf = F.softmax(outputs_clf, dim=1)
                outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
            best2, best1 = tuple(outputs_clf.argsort()[-2:][::1])
            cur_label = idx2label[best1]
    elapsedTime = time.time() - t1
    return cur_label, elapsedTime
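# Queue above is a fixed-length probability buffer with median, ma (moving
# average), and ewma (exponentially weighted moving average) views over the
# stored class-probability vectors. A minimal sketch matching how it is called
# (Queue(size, n_classes=...), enqueue(list)); the decay constant in ewma is an
# assumption:
from collections import deque
import numpy as np

class Queue:
    """Fixed-size FIFO of probability vectors with simple temporal filters."""
    def __init__(self, max_size, n_classes):
        self.queue = deque(maxlen=max_size)
        self.n_classes = n_classes

    def enqueue(self, probs):
        self.queue.append(np.asarray(probs))

    @property
    def median(self):
        return np.median(np.stack(self.queue), axis=0)

    @property
    def ma(self):
        return np.mean(np.stack(self.queue), axis=0)

    @property
    def ewma(self):
        arr = np.stack(self.queue)
        # newest entries get the largest weight; 0.5 decay is an assumed constant
        weights = np.array([0.5 ** i for i in range(len(arr))][::-1])
        return (arr * weights[:, None]).sum(axis=0) / weights.sum()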
def objective(trial):
    opt = parse_opts()
    if trial:
        opt.weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
        # was "1 - 5, 1 - 4", a typo for the intended 1e-5..1e-4 range
        opt.learning_rate = trial.suggest_uniform('learning_rate', 1e-5, 1e-4)
    if opt.root_path != '':
        opt.video_path = os.path.join(opt.root_path, opt.video_path)
        opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
        opt.result_path = os.path.join(opt.root_path, opt.result_path)
        if opt.resume_path:
            opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
        if opt.pretrain_path:
            opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
    opt.std = get_std(opt.norm_value)
    print(opt)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)
    torch.manual_seed(opt.manual_seed)

    model, parameters = generate_model(opt)
    print(model)
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    # norm_method = Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if not opt.no_train:
        assert opt.train_crop in ['random', 'corner', 'center']
        if opt.train_crop == 'random':
            crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'corner':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'center':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                               crop_positions=['c'])
        spatial_transform = Compose([
            crop_method,
            RandomHorizontalFlip(),
            ToTensor(opt.norm_value),
            norm_method
        ])
        temporal_transform = TemporalRandomCrop(opt.sample_duration)
        target_transform = ClassLabel()
        training_data = get_training_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
        train_loader = torch.utils.data.DataLoader(
            training_data,
            batch_size=opt.batch_size,
            # the sampler option is mutually exclusive with shuffle
            shuffle=False,
            sampler=ImbalancedDatasetSampler(training_data),
            num_workers=opt.n_threads,
            pin_memory=True)
        train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                              ['epoch', 'loss', 'acc', 'lr'])
        train_batch_logger = Logger(
            os.path.join(opt.result_path, 'train_batch.log'),
            ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
        optimizer = optim.Adam(parameters,
                               lr=opt.learning_rate,
                               weight_decay=opt.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               verbose=True,
                                                               factor=0.1**0.5)
    if not opt.no_val:
        spatial_transform = Compose([
            Scale(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor(opt.norm_value),
            norm_method
        ])
        temporal_transform = LoopPadding(opt.sample_duration)
        target_transform = ClassLabel()
        validation_data = get_validation_set(opt, spatial_transform,
                                             temporal_transform,
                                             target_transform)
        val_loader = torch.utils.data.DataLoader(
            validation_data,
            batch_size=opt.batch_size,
            shuffle=False,
            sampler=ImbalancedDatasetSampler(validation_data),
            num_workers=opt.n_threads,
            pin_memory=True)
        val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                            ['epoch', 'loss', 'acc'])

    if opt.resume_path:
        print('loading checkpoint {}'.format(opt.resume_path))
        checkpoint = torch.load(opt.resume_path)
        assert opt.arch == checkpoint['arch']
        opt.begin_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        if not opt.no_train:
            optimizer.load_state_dict(checkpoint['optimizer'])

    print('run')
    writer = SummaryWriter(
        comment=f"_wd{opt.weight_decay}_lr{opt.learning_rate}_ft_begin{opt.ft_begin_index}_pretrain{not opt.pretrain_path == ''}"
    )
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            epoch, losses_avg, accuracies_avg = train_epoch(
                i, train_loader, model, criterion, optimizer, opt,
                train_logger, train_batch_logger)
            writer.add_scalar('loss/train', losses_avg, epoch)
            writer.add_scalar('acc/train', accuracies_avg, epoch)
        if not opt.no_val:
            epoch, val_losses_avg, val_accuracies_avg = val_epoch(
                i, val_loader, model, criterion, opt, val_logger)
            writer.add_scalar('loss/val', val_losses_avg, epoch)
            writer.add_scalar('acc/val', val_accuracies_avg, epoch)
        if not opt.no_train and not opt.no_val:
            scheduler.step(val_losses_avg)
        print('=' * 100)

    if opt.test:
        spatial_transform = Compose([
            Scale(int(opt.sample_size / opt.scale_in_test)),
            CornerCrop(opt.sample_size, opt.crop_position_in_test),
            ToTensor(opt.norm_value),
            norm_method
        ])
        temporal_transform = LoopPadding(opt.sample_duration)
        target_transform = VideoID()
        test_data = get_test_set(opt, spatial_transform, temporal_transform,
                                 target_transform)
        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)
        test.test(test_loader, model, opt, test_data.class_names)
    writer.close()
    return val_losses_avg
def main():
    resnet_in = generate_model(opt)
    resnet_in.module.fc = Identity()
    model = ReNet34(resnet_in, encode_length=encode_length)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    if not opt.no_train:
        assert opt.train_crop in ['random', 'corner', 'center']
        if opt.train_crop == 'random':
            crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'corner':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'center':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                               crop_positions=['c'])

        ## train loader
        spatial_transform = Compose([
            crop_method,
            RandomHorizontalFlip(),
            ToTensor(opt.norm_value),
            norm_method
        ])
        temporal_transform = TemporalRandomCrop(opt.sample_duration)
        target_transform = ClassLabel()
        training_data = get_training_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
        train_loader = torch.utils.data.DataLoader(training_data,
                                                   batch_size=opt.batch_size,
                                                   shuffle=True,
                                                   num_workers=opt.n_threads,
                                                   pin_memory=True)

    ## test loader
    spatial_transform = Compose([
        Scale(int(opt.sample_size / opt.scale_in_test)),
        CornerCrop(opt.sample_size, opt.crop_position_in_test),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = ClassLabel()
    test_data = get_test_set(opt, spatial_transform, temporal_transform,
                             target_transform)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    ## database loader
    spatial_transform = Compose([
        Scale(int(opt.sample_size / opt.scale_in_test)),
        CornerCrop(opt.sample_size, opt.crop_position_in_test),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = ClassLabel()
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    database_loader = torch.utils.data.DataLoader(validation_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          dampening=dampening,
                          weight_decay=opt.weight_decay,
                          nesterov=opt.nesterov)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=opt.lr_patience)
    if opt.resume_path:
        print('loading checkpoint {}'.format(opt.resume_path))
        checkpoint = torch.load(opt.resume_path)
        assert opt.arch == checkpoint['arch']
        opt.begin_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        if not opt.no_train:
            optimizer.load_state_dict(checkpoint['optimizer'])
            for state in optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

    print('run')
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        model.cuda().train()
        for i, (images, labels) in enumerate(train_loader):
            images = Variable(images.cuda())
            labels = Variable(labels.cuda().long())
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            x, _, b = model(images)
            target_b = F.cosine_similarity(b[:int(labels.size(0) / 2)],
                                           b[int(labels.size(0) / 2):])
            target_x = F.cosine_similarity(x[:int(labels.size(0) / 2)],
                                           x[int(labels.size(0) / 2):])
            loss = F.mse_loss(target_b, target_x)
            loss.backward()
            optimizer.step()
            # ReduceLROnPlateau.step() requires a metric; the batch loss is used here.
            scheduler.step(loss.item())

        # Test the model
        if (epoch + 1) % 10 == 0:
            model.eval()
            retrievalB, retrievalL, queryB, queryL = compress(database_loader,
                                                              test_loader, model)
            result_map = calculate_top_map(qB=queryB,
                                           rB=retrievalB,
                                           queryL=queryL,
                                           retrievalL=retrievalL,
                                           topk=100)
            print('--------mAP@100: {}--------'.format(result_map))
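# compress and calculate_top_map come from the deep-hashing evaluation code this
# snippet borrows from. A sketch of calculate_top_map under the usual conventions:
# binary codes in {-1, +1}, one-hot label matrices, Hamming ranking, and mean
# average precision over the top-k retrieved items.
import numpy as np

def calculate_top_map(qB, rB, queryL, retrievalL, topk):
    """Mean average precision at topk, ranking by Hamming distance."""
    num_query = queryL.shape[0]
    topkmap = 0.0
    for it in range(num_query):
        # 1 where the retrieved item shares at least one label with the query
        gnd = (queryL[it, :] @ retrievalL.T > 0).astype(np.float32)
        hamm = 0.5 * (rB.shape[1] - qB[it, :] @ rB.T)  # Hamming distance from +/-1 codes
        ind = np.argsort(hamm)
        gnd = gnd[ind][:topk]
        tsum = int(gnd.sum())
        if tsum == 0:
            continue
        count = np.arange(1, tsum + 1)          # 1..#relevant in top-k
        tindex = np.nonzero(gnd)[0] + 1.0       # ranks of the relevant items
        topkmap += np.mean(count / tindex)
    return topkmap / num_query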
elif opt.train_crop == 'corner':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
elif opt.train_crop == 'center':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                       crop_positions=['c'])
elif opt.train_crop == 'driver focus':
    crop_method = DriverFocusCrop(opt.scales, opt.sample_size)
train_spatial_transform = Compose([
    crop_method,
    MultiScaleRandomCrop(opt.scales, opt.sample_size),
    ToTensor(opt.norm_value),
    norm_method
])
train_temporal_transform = UniformRandomSample(opt.sample_duration, opt.end_second)
train_target_transform = ClassLabel()
train_horizontal_flip = RandomHorizontalFlip()
training_data = get_training_set(opt, train_spatial_transform,
                                 train_horizontal_flip,
                                 train_temporal_transform,
                                 train_target_transform)
train_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True)
train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                      ['epoch', 'loss', 'acc', 'lr'])
train_batch_logger = Logger(
    os.path.join(opt.result_path, 'train_batch.log'),
    ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
def main(clf_threshold_pre):
    print(f'Early-detection threshold: {clf_threshold_pre}')
    opt = parse_opts_online()
    detector, classifier = load_models(opt)
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    if opt.model_clf == 'ssar':
        opt.sample_size_clf = (126, 224)
        opt.mean_clf = (0.485, 0.456, 0.406)
        opt.std_clf = (0.229, 0.224, 0.225)
        spatial_transform_clf = transforms.Compose([
            transforms.Resize(opt.sample_size_clf),
            transforms.ToTensor(),
            transforms.Normalize(opt.mean_clf, opt.std_clf)
        ])
    spatial_transform = Compose(
        [Scale(112), CenterCrop(112), ToTensor(opt.norm_value), norm_method])
    target_transform = ClassLabel()

    ## Get list of videos to test
    if opt.dataset == 'egogesture':
        subject_list = [
            'Subject{:02d}'.format(i)
            for i in [2, 9, 11, 14, 18, 19, 28, 31, 41, 47]
        ]
        test_paths = []
        for subject in subject_list:
            for x in glob.glob(os.path.join(opt.video_path, subject, '*/*/rgb*/')):
                test_paths.append(x)
    elif opt.dataset == 'nv':
        df = pd.read_csv(os.path.join(opt.video_path,
                                      'nvgesture_test_correct_cvpr2016_v2.lst'),
                         delimiter=' ', header=None)
        test_paths = []
        for x in df[0].values:
            test_paths.append(
                os.path.join(opt.video_path, x.replace('path:', ''),
                             'sk_color_all').replace(os.sep, '/'))

    # Figures setup
    # fig, ax = plt.subplots(nrows=6, ncols=1)
    # x_data, y_datas = [], []
    # lines = []
    # for j in range(6):
    #     if j != 0:
    #         ax[j].set_xlim(0, 400)
    #         ax[j].set_ylim(0, 1)
    #     y_datas.append([])
    #     lines.append([])
    #     for _ in range(opt.n_classes_clf):
    #         y_data = []
    #         y_datas[j].append(y_data)
    #         line, = ax[j].plot(x_data, y_data)
    #         lines[j].append(line)

    print('Start Evaluation')
    detector.eval()
    classifier.eval()

    levenshtein_accuracies = AverageMeter()
    frames_early_meter = AverageMeter()
    videoidx = 0
    for path in test_paths[4:]:
        path = os.path.normpath(path)
        if opt.dataset == 'egogesture':
            opt.whole_path = path.rsplit(os.sep, 4)[-4:]
            opt.whole_path = os.sep.join(opt.whole_path)
        elif opt.dataset == 'nv':
            # TODO: fix bad dependency on fixed-depth file locations
            opt.whole_path = path.split(os.sep, 3)
            opt.whole_path = opt.whole_path[-1]
        videoidx += 1
        active_index = 0
        passive_count = 999
        active = False
        prev_active = False
        finished_prediction = None
        pre_predict = False

        cum_sum = np.zeros(opt.n_classes_clf, )
        cum_sum_unweighted = np.zeros(opt.n_classes_clf, )
        clf_selected_queue = np.zeros(opt.n_classes_clf, )
        det_selected_queue = np.zeros(opt.n_classes_det, )
        myqueue_det = Queue(opt.det_queue_size, n_classes=opt.n_classes_det)
        myqueue_clf = Queue(opt.clf_queue_size, n_classes=opt.n_classes_clf)

        print('[{}/{}]============'.format(videoidx, len(test_paths)))
        print(path)
        opt.sample_duration = max(opt.sample_duration_clf, opt.sample_duration_det)
        if opt.model_clf == 'ssar':
            test_data = get_online_data(opt,
                                        [spatial_transform, spatial_transform_clf],
                                        None, target_transform, modality='RGB')
        else:
            test_data = get_online_data(opt, spatial_transform, None,
                                        target_transform)
        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)

        results = []
        prev_best1 = opt.n_classes_clf
        if opt.model_clf == 'ssar':
            # Initialize the recurrent state to zero
            lstm_hidden = [None, None, None, None]
        for i, (inputs, targets) in enumerate(test_loader):
            if opt.model_clf == 'ssar':
                inputs, inputs_clf = inputs
            if not opt.no_cuda:
                targets = targets.cuda(non_blocking=True)
            ground_truth_array = np.zeros(opt.n_classes_clf + 1, )
            with torch.no_grad():
                inputs = Variable(inputs)
                targets = Variable(targets)
                if opt.modality_det == 'RGB':
                    inputs_det = inputs[:, :3, -opt.sample_duration_det:, :, :]
                elif opt.modality_det == 'Depth':
                    inputs_det = inputs[:, -1, -opt.sample_duration_det:, :, :].unsqueeze(1)
                elif opt.modality_det == 'RGB-D':
                    inputs_det = inputs[:, :, -opt.sample_duration_det:, :, :]
                # print(inputs_det[0, :, -1, 0:4, 0:4])
                outputs_det = detector(inputs_det)
                outputs_det = F.softmax(outputs_det, dim=1)
                outputs_det = outputs_det.cpu().numpy()[0].reshape(-1, )
                # enqueue the probabilities to the detector queue
                myqueue_det.enqueue(outputs_det.tolist())
                if opt.det_strategy == 'raw':
                    det_selected_queue = outputs_det
                elif opt.det_strategy == 'median':
                    det_selected_queue = myqueue_det.median
                elif opt.det_strategy == 'ma':
                    det_selected_queue = myqueue_det.ma
                elif opt.det_strategy == 'ewma':
                    det_selected_queue = myqueue_det.ewma
                prediction_det = np.argmax(det_selected_queue)
                prob_det = det_selected_queue[1]

                #### The detector state is checked here: it acts as a switch for the classifier
                if prediction_det == 1:
                    if opt.model_clf == 'ssar':
                        inputs_clf = Variable(inputs_clf)
                        if not opt.no_cuda:
                            inputs_clf = inputs_clf.cuda()
                        if opt.modality_clf == 'RGB':
                            inputs_clf = inputs_clf[:, :3, -1, :, :]
                        elif opt.modality_clf == 'Depth':
                            inputs_clf = inputs_clf[:, -1, -1, :, :].unsqueeze(1)
                        elif opt.modality_clf == 'RGB-D':
                            inputs_clf = inputs_clf[:, :, -1, :, :]
                        outputs_clf, lstm_hidden = classifier(inputs_clf,
                                                              lstm_hidden,
                                                              get_lstm_state=True)
                    else:
                        if opt.modality_clf == 'RGB':
                            inputs_clf = inputs[:, :3, :, :, :]
                        elif opt.modality_clf == 'Depth':
                            inputs_clf = inputs[:, -1, :, :, :].unsqueeze(1)
                        elif opt.modality_clf == 'RGB-D':
                            inputs_clf = inputs[:, :, :, :, :]
                        outputs_clf = classifier(inputs_clf)
                    outputs_clf = F.softmax(outputs_clf, dim=1)
                    outputs_clf = outputs_clf.cpu().numpy()[0].reshape(-1, )
                    # Push the probabilities to queue
                    myqueue_clf.enqueue(outputs_clf.tolist())
                    passive_count = 0
                    if opt.clf_strategy == 'raw':
                        clf_selected_queue = outputs_clf
                    elif opt.clf_strategy == 'median':
                        clf_selected_queue = myqueue_clf.median
                    elif opt.clf_strategy == 'ma':
                        clf_selected_queue = myqueue_clf.ma
                    elif opt.clf_strategy == 'ewma':
                        clf_selected_queue = myqueue_clf.ewma
                else:
                    if opt.model_clf == 'ssar':
                        # Reset the recurrent state
                        lstm_hidden = [None, None, None, None]
                    outputs_clf = np.zeros(opt.n_classes_clf, )
                    # Push the probabilities to queue
                    myqueue_clf.enqueue(outputs_clf.tolist())
                    passive_count += 1

            if passive_count >= opt.det_counter:
                active = False
            else:
                active = True

            # one of the two cum_sum updates below needs to be commented out !!!!
            if active:
                active_index += 1
                cum_sum = ((cum_sum * (active_index - 1)) +
                           (weighting_func(active_index) * clf_selected_queue)) / active_index  # weighted approach
                cum_sum_unweighted = ((cum_sum_unweighted * (active_index - 1)) +
                                      (1.0 * clf_selected_queue)) / active_index  # unweighted approach
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if float(cum_sum[best1] - cum_sum[best2]) > clf_threshold_pre:
                    finished_prediction = True
                    pre_predict = True
            else:
                active_index = 0

            # Visualize
            # x_data.append(i)
            # y_datas[1][0].append(prob_det)
            # lines[1][0].set_xdata(x_data)
            # lines[1][0].set_ydata(y_datas[1][0])
            # for j in range(opt.n_classes_clf):
            #     y_datas[2][j].append(cum_sum[j])
            #     y_datas[3][j].append(cum_sum_unweighted[j])
            #     y_datas[4][j].append(clf_selected_queue[j] if active else 0)
            #     for k in range(2, 5):
            #         lines[k][j].set_xdata(x_data)
            #         lines[k][j].set_ydata(y_datas[k][j])
            # for k in range(1, 6):
            #     ax[k].set_xlim(i - 400, i)
            # mean = np.array(opt.mean, dtype=np.float32).reshape(1, 1, -1)
            # img = inputs_det[0, :, -1].permute(1, 2, 0).cpu().numpy() + mean
            # img = img.astype(int)
            # if i == 0:
            #     im_plt = ax[0].imshow(img)
            # else:
            #     im_plt.set_data(img)
            if i % 10 == 0:
                plt.draw()
                plt.pause(0.001)

            if active == False and prev_active == True:
                finished_prediction = True
            elif active == True and prev_active == False:
                finished_prediction = False

            if finished_prediction == True:
                detection_frame = (i * opt.stride_len) + opt.sample_duration_clf
                best2, best1 = tuple(cum_sum.argsort()[-2:][::1])
                if cum_sum[best1] > opt.clf_threshold_final:
                    if pre_predict == True:
                        if best1 != prev_best1:
                            if cum_sum[best1] > opt.clf_threshold_final:
                                results.append((detection_frame, best1))
                                print('Early Detected - class : {} with prob : {} at frame {}'
                                      .format(best1, cum_sum[best1], detection_frame))
                    else:
                        # raw_best = clf_selected_queue.argsort()[-1]
                        # results.append((detection_frame, raw_best))
                        # print('Late Detected - class : {} with prob : {} at frame {}'.format(raw_best, clf_selected_queue[raw_best], detection_frame))
                        if cum_sum[best1] > opt.clf_threshold_final:
                            if best1 == prev_best1:
                                # note: softmax probabilities never exceed 1, so this branch never fires
                                if cum_sum[best1] > 5:
                                    results.append((detection_frame, best1))
                                    print('Late Detected - class : {} with prob : {} at frame {}'
                                          .format(best1, cum_sum[best1], detection_frame))
                            else:
                                results.append((detection_frame, best1))
                                print('Late Detected - class : {} with prob : {} at frame {}'
                                      .format(best1, cum_sum[best1], detection_frame))
                    prev_best1 = best1
                finished_prediction = False
                # prev_best1 = best1
                # finished_prediction = False
                cum_sum = np.zeros(opt.n_classes_clf, )
                cum_sum_unweighted = np.zeros(opt.n_classes_clf, )

            if active == False and prev_active == True:
                pre_predict = False

            prev_active = active

        if opt.dataset == 'egogesture':
            opt.video_path = os.path.normpath(opt.video_path)
            opt.whole_path = os.path.normpath(opt.whole_path)
            target_csv_path = os.path.join(
                opt.video_path, 'labels-final-revised1',
                opt.whole_path.rsplit(os.sep, 2)[0],
                'Group' + opt.whole_path.rsplit('.', 1)[0][-1] + '.csv').replace('Subject', 'subject')
            true_classes = []
            end_frames = []
            with open(target_csv_path) as csvfile:
                readCSV = csv.reader(csvfile, delimiter=',')
                for row in readCSV:
                    true_classes.append(int(row[0]) - 1)
                    end_frames.append(int(row[2]))
        elif opt.dataset == 'nv':
            true_classes = []
            with open('./annotation_nvGesture/vallistall.txt') as csvfile:
                readCSV = csv.reader(csvfile, delimiter=' ')
                for row in readCSV:
                    if row[0] == opt.whole_path:
                        if row[1] != '26':
                            true_classes.append(int(row[1]) - 1)

        predicted = np.array(results)[:, 1]
        detection_frames = np.array(results)[:, 0]
        true_classes = np.array(true_classes)
        levenshtein_distance, avg_frames_early = LevenshteinDistancePlusAvgFramesEarly(
            true_classes, predicted, end_frames, detection_frames)
        levenshtein_accuracy = 1 - (levenshtein_distance / len(true_classes))
        if levenshtein_distance < 0:  # distance cannot be less than 0
            levenshtein_accuracies.update(0, len(true_classes))
        else:
            levenshtein_accuracies.update(levenshtein_accuracy, len(true_classes))
        frames_early_meter.update(avg_frames_early)

        print('predicted classes: \t', predicted)
        print('True classes :\t\t', true_classes)
        print('Levenshtein Accuracy = {} ({})'.format(levenshtein_accuracies.val,
                                                      levenshtein_accuracies.avg))
        print(f'Average frames early = {frames_early_meter.val} ({frames_early_meter.avg})')

    print('Average Levenshtein Accuracy= {}'.format(levenshtein_accuracies.avg))
    print('-----Evaluation is finished------')

    early_x_data.append(clf_threshold_pre)
    early_y_data.append(frames_early_meter.avg)
    early_plot.set_xdata(early_x_data)
    early_plot.set_ydata(early_y_data)
    plt.annotate(f'{levenshtein_accuracies.avg * 100:.2f}',
                 (clf_threshold_pre, frames_early_meter.avg),
                 textcoords='offset pixels', xytext=(10, 10))
    plt.gca().relim()
    plt.gca().autoscale_view()
    plt.pause(0.0001)
    plt.draw()
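# LevenshteinDistance (and the ...PlusAvgFramesEarly variant) compare the
# predicted gesture sequence against the ground-truth sequence. A standard
# edit-distance sketch over class-index sequences; the frames-early bookkeeping
# of the variant is not reproduced here.
import numpy as np

def LevenshteinDistance(a, b):
    """Edit distance between two sequences of class indices."""
    n, m = len(a), len(b)
    d = np.zeros((n + 1, m + 1), dtype=int)
    d[:, 0] = np.arange(n + 1)
    d[0, :] = np.arange(m + 1)
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i, j] = min(d[i - 1, j] + 1,         # deletion
                          d[i, j - 1] + 1,         # insertion
                          d[i - 1, j - 1] + cost)  # substitution
    return d[n, m]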
def main():
    opt = parse_opts()
    ecd_name, cls_name = opt.model_name.split('-')
    ecd_model = get_encoder_net(ecd_name)
    cls_model = get_end_net(cls_name)
    cfg.encoder_model = ecd_name
    cfg.classification_model = cls_name
    if opt.debug:
        cfg.debug = opt.debug
    else:
        if opt.tensorboard == 'TEST':
            cfg.tensorboard = opt.model_name
        else:
            cfg.tensorboard = opt.tensorboard
        cfg.flag = opt.flag
    model = cls_model(cfg,
                      encoder=CNNencoder(cfg,
                                         ecd_model(pretrained=True,
                                                   path=opt.encoder_model)))
    cfg.video_path = os.path.join(cfg.root_path, cfg.video_path)
    cfg.annotation_path = os.path.join(cfg.root_path, cfg.annotation_path)
    cfg.list_all_member()
    torch.manual_seed(cfg.manual_seed)
    print('##########################################')
    print('####### model (single-GPU only)')
    print('##########################################')
    model = model.cuda()
    print(model)
    criterion = nn.CrossEntropyLoss()
    if cfg.cuda:
        criterion = criterion.cuda()
    norm_method = Normalize([0, 0, 0], [1, 1, 1])

    print('##########################################')
    print('####### train')
    print('##########################################')
    assert cfg.train_crop in ['random', 'corner', 'center']
    if cfg.train_crop == 'random':
        # was `crop_method = (cfg.scales, cfg.sample_size)`, which dropped the transform
        crop_method = MultiScaleRandomCrop(cfg.scales, cfg.sample_size)
    elif cfg.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(cfg.scales, cfg.sample_size)
    elif cfg.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(cfg.scales, cfg.sample_size,
                                           crop_positions=['c'])
    spatial_transform = Compose([
        crop_method,
        RandomHorizontalFlip(),
        ToTensor(cfg.norm_value),
        norm_method
    ])
    temporal_transform = TemporalRandomCrop(cfg.sample_duration)
    target_transform = ClassLabel()
    training_data = get_training_set(cfg, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=cfg.batch_size,
                                               shuffle=True,
                                               num_workers=cfg.n_threads,
                                               drop_last=False,
                                               pin_memory=True)
    optimizer = model.get_optimizer(lr1=cfg.lr, lr2=cfg.lr2)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=cfg.lr_patience)

    print('##########################################')
    print('####### val')
    print('##########################################')
    spatial_transform = Compose([
        Scale(cfg.sample_size),
        CenterCrop(cfg.sample_size),
        ToTensor(cfg.norm_value),
        norm_method
    ])
    temporal_transform = LoopPadding(cfg.sample_duration)
    target_transform = ClassLabel()
    validation_data = get_validation_set(cfg, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=cfg.batch_size,
                                             shuffle=False,
                                             num_workers=cfg.n_threads,
                                             drop_last=False,
                                             pin_memory=True)

    print('##########################################')
    print('####### run')
    print('##########################################')
    if cfg.debug:
        logger = None
    else:
        path = get_log_dir(cfg.logdir, name=cfg.tensorboard, flag=cfg.flag)
        logger = Logger(logdir=path)
        cfg.save_config(path)
    for i in range(cfg.begin_epoch, cfg.n_epochs + 1):
        train_epoch(i, train_loader, model, criterion, optimizer, cfg, logger)
        validation_loss = val_epoch(i, val_loader, model, criterion, cfg, logger)
        scheduler.step(validation_loss)
train_loader = None
val_loader = None

# Register each image-group path and its channel count in a dictionary
paths = {opt.video_path: '3ch'}
if opt.add_gray_image_paths:
    for one_ch in opt.add_gray_image_paths:
        paths[one_ch] = '1ch'
if opt.add_RGB_image_paths:
    for three_ch in opt.add_RGB_image_paths:
        paths[three_ch] = '3ch'

spatial_transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize([0, 0, 0], [1, 1, 1])])
target_transform = ClassLabel(True)
training_data = data_set[opt.data_set](
    paths,
    opt.annotation_path,
    'training',
    spatial_transform=spatial_transform,
    target_transform=target_transform,
)
train_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True,
                                           worker_init_fn=worker_init_fn)
dampening = 0 if opt.nesterov else opt.dampening
def get_ucf_data(opt):
    mean = get_mean(opt.norm_value, dataset='kinetics')
    std = get_std(opt.norm_value)
    norm_method = Normalize(mean, [1, 1, 1])
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CornerCrop(opt.sample_size, 'c'),
        ToTensor(opt.norm_value),
        norm_method
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = ClassLabel()  # VideoID()
    # get training data
    training_data = UCF101(opt.video_path,
                           opt.annotation_path,
                           'training',
                           0,
                           spatial_transform=spatial_transform,
                           temporal_transform=temporal_transform,
                           target_transform=target_transform,
                           sample_duration=16)
    # wrap training data
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=False,
                                               num_workers=opt.n_threads,
                                               pin_memory=False)  # True
    # get validation data
    val_data = UCF101(opt.video_path,
                      opt.annotation_path,
                      'validation',
                      0,
                      spatial_transform=spatial_transform,
                      temporal_transform=temporal_transform,
                      target_transform=target_transform,
                      sample_duration=16)
    # wrap validation data
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=False)
    target_transform = VideoID()
    # get test data
    test_data = UCF101(opt.video_path,
                       opt.annotation_path,
                       'testing',
                       0,
                       spatial_transform=spatial_transform,
                       temporal_transform=temporal_transform,
                       target_transform=target_transform,
                       sample_duration=16)
    # wrap test data
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=False)
    return train_loader, val_loader, test_loader, test_data
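# A sketch of how get_ucf_data would be wired up, assuming a hypothetical
# opts.parse_opts argparse wrapper with the fields referenced above
# (video_path, annotation_path, sample_size, sample_duration, and so on):
from opts import parse_opts  # hypothetical: the shared argparse wrapper

opt = parse_opts()
train_loader, val_loader, test_loader, test_data = get_ucf_data(opt)
for clips, labels in train_loader:
    # clips: (batch, 3, sample_duration, sample_size, sample_size)
    print(clips.shape, labels[:4])
    break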
assert opt.train_crop in ['random', 'corner', 'center']
if opt.train_crop == 'random':
    crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
elif opt.train_crop == 'corner':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
elif opt.train_crop == 'center':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                       crop_positions=['c'])
spatial_transform = Compose([
    crop_method,
    RandomHorizontalFlip(opt.dataset),
    ToTensor(),
    norm_method
])
temporal_transform = TemporalSampling(opt.sample_duration)
target_transform = ClassLabel()
training_data = get_training_set(opt, spatial_transform, temporal_transform,
                                 target_transform)
train_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True,
                                           drop_last=True)
train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                      ['epoch', 'loss', 'acc', 'lr'])
train_batch_logger = Logger(
    os.path.join(opt.result_path, 'train_batch.log'),
    ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
if opt.nesterov:
    ToTensor(opt.norm_value),
    norm_method,
])
kinetics_transform = transforms.Compose([
    transforms.Resize(128),
    transforms.CenterCrop(112),
    ToTensor(opt.norm_value),
    norm_method,
])
spatial_transform.append(ucf_transform)
spatial_transform.append(kinetics_transform)
temporal_transform.append(TemporalRandomCrop(opt.sample_duration, opt.downsample))
temporal_transform.append(TemporalCenterCrop(opt.sample_duration, opt.downsample))
target_transform.append(ClassLabel())
target_transform.append(ClassLabel_fromarray(labels))
kinetics_clustered = Kinetics_clustered(opt.ul_vids_path,
                                        opt.ul_annotation_path,
                                        'training',
                                        1,
                                        spatial_transform=spatial_transform[0],
                                        temporal_transform=temporal_transform[0],
                                        target_transform=target_transform[1])
train_loader = torch.utils.data.DataLoader(dataset=kinetics_clustered,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True)
assert opt.train_crop in ['random', 'corner', 'center']
if opt.train_crop == 'random':
    crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
elif opt.train_crop == 'corner':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
elif opt.train_crop == 'center':
    crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                       crop_positions=['c'])
spatial_transform = Compose([
    crop_method,
    RandomHorizontalFlip(),
    ToTensor(opt.norm_value), norm_method
])
temporal_transform = TemporalRandomCrop(opt.sample_duration)
target_transform = ClassLabel()
training_data = get_training_set(opt, spatial_transform, temporal_transform,
                                 target_transform)
train_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True)
train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                      ['epoch', 'loss', 'acc', 'lr'])
train_batch_logger = Logger(
    os.path.join(opt.result_path, 'train_batch.log'),
    ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
if opt.nesterov:
    dampening = 0
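# The fragment stops inside the Nesterov branch. A hedged sketch of the usual
# continuation (the else branch and the SGD construction mirror the optimizer
# setup in main() below, with nesterov passed through, and `parameters` coming
# from generate_model(opt) -- an assumption, not the original code):
else:
    dampening = opt.dampening
optimizer = optim.SGD(parameters,
                      lr=opt.learning_rate,
                      momentum=opt.momentum,
                      dampening=dampening,
                      weight_decay=opt.weight_decay,
                      nesterov=opt.nesterov)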
def main():
    opt = parse_opts()

    # Path configurations
    opt.annotation_path = os.path.join(opt.annotation_directory,
                                       opt.annotation_path)
    save_result_dir_name = os.path.join(
        opt.result_path,
        get_prefix() + '_{}{}_{}_epochs'.format(opt.model, opt.model_depth,
                                                opt.n_epochs))
    if not os.path.exists(save_result_dir_name):
        os.mkdir(save_result_dir_name)
    opt.result_path = save_result_dir_name

    # For the data generator: build the multi-scale pyramid
    opt.scales = [opt.initial_scale]
    for _ in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)

    # Model
    model, parameters = generate_model(opt)
    # print(model)

    # Loss function
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    # Normalization
    if not opt.no_mean_norm:
        opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
        opt.std = get_std(opt.norm_value, dataset=opt.std_dataset)
        norm_method = Normalize(opt.mean, opt.std)
    else:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])

    print(opt)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    # **************************** TRAINING CONFIGURATIONS ************************************
    assert opt.train_crop in ['corner', 'center']
    if opt.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
    elif opt.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size,
                                           crop_positions=['c'])
    # Spatial transform
    spatial_transform = Compose([
        crop_method,
        # RandomHorizontalFlip(),
        ToTensor(opt.norm_value), norm_method
    ])
    # Temporal transform
    temporal_transform = TemporalRandomCrop(opt.sample_duration)
    # Target transform
    target_transform = ClassLabel()

    train_loader_list = []
    if not opt.no_cross_validation:
        annotation_list = os.listdir(opt.annotation_directory)
        for annotation in annotation_list:
            opt.annotation_path = os.path.join(opt.annotation_directory,
                                               annotation)
            training_data = get_training_set(opt, spatial_transform,
                                             temporal_transform,
                                             target_transform)
            train_loader = torch.utils.data.DataLoader(
                training_data,
                batch_size=opt.batch_size,
                shuffle=True,
                num_workers=opt.n_threads,
                pin_memory=True)
            train_loader_list.append(train_loader)
    else:
        training_data = get_training_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
        train_loader = torch.utils.data.DataLoader(training_data,
                                                   batch_size=opt.batch_size,
                                                   shuffle=True,
                                                   num_workers=opt.n_threads,
                                                   pin_memory=True)
        train_loader_list.append(train_loader)

    train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                          ['epoch', 'loss', 'acc', 'lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])

    optimizer = optim.SGD(parameters,
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          dampening=opt.dampening,
                          weight_decay=opt.weight_decay)
    # NOTE: the scheduler is created here but never stepped in the loop below
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=opt.lr_patience)

    # ***************************** VALIDATION CONFIGURATIONS *********************************
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value), norm_method
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = ClassLabel()

    val_loader_list = []
    if not opt.no_cross_validation:
        annotation_list = os.listdir(opt.annotation_directory)
        for annotation in annotation_list:
            opt.annotation_path = os.path.join(opt.annotation_directory,
                                               annotation)
            validation_data = get_validation_set(opt, spatial_transform,
                                                 temporal_transform,
                                                 target_transform)
            val_loader = torch.utils.data.DataLoader(validation_data,
                                                     batch_size=opt.batch_size,
                                                     shuffle=False,
                                                     num_workers=opt.n_threads,
                                                     pin_memory=True)
            val_loader_list.append(val_loader)
    else:
        validation_data = get_validation_set(opt, spatial_transform,
                                             temporal_transform,
                                             target_transform)
        val_loader = torch.utils.data.DataLoader(validation_data,
                                                 batch_size=opt.batch_size,
                                                 shuffle=False,
                                                 num_workers=opt.n_threads,
                                                 pin_memory=True)
        val_loader_list.append(val_loader)

    val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                        ['epoch', 'loss', 'acc'])

    # **************************************** TRAINING ****************************************
    epoch_avg_time = AverageMeter()
    train_loss_list = []
    train_acc_list = []
    valid_acc_list = []
    best_accuracy = 0
    current_train_data = 0
    current_valid_data = 0
    # switch folds every ceil(n_epochs / n_cross_validation_sets) epochs
    opt.frequence_cross_validation = round(
        opt.n_epochs / opt.n_cross_validation_sets + 0.5)

    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        epoch_start_time = time.time()
        print('Epoch #' + str(epoch))

        # optimizer = regulate_learning_rate(optimizer, epoch, opt.frequence_regulate_lr)

        train_loader = train_loader_list[current_train_data]
        if not opt.no_cross_validation and \
                epoch % opt.frequence_cross_validation == 0:
            print('\t##### Cross-validation: switch training data #####')
            current_train_data = (current_train_data + 1) % len(train_loader_list)
            train_loader = train_loader_list[current_train_data]
        train_loss, train_acc = train_epoch(epoch, train_loader, model,
                                            criterion, optimizer, opt,
                                            train_logger, train_batch_logger)

        val_loader = val_loader_list[current_valid_data]
        if not opt.no_cross_validation and \
                epoch % opt.frequence_cross_validation == 0:
            print('\t##### Cross-validation: switch validation data #####')
            current_valid_data = (current_valid_data + 1) % len(val_loader_list)
            val_loader = val_loader_list[current_valid_data]
        validation_acc = val_epoch(epoch, val_loader, model, criterion, opt,
                                   val_logger)

        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        valid_acc_list.append(validation_acc)

        # Save the model with the best validation accuracy
        if validation_acc > best_accuracy:
            best_accuracy = validation_acc
            save_file_path = os.path.join(opt.result_path, 'best_model.pth')
            states = {
                'epoch': epoch + 1,
                'arch': opt.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            torch.save(states, save_file_path)

        epoch_end_time = time.time() - epoch_start_time
        epoch_avg_time.update(epoch_end_time)
        print('\tTime left: ' +
              str(round(epoch_avg_time.avg * (opt.n_epochs - epoch) / 60, 1)) +
              ' minutes')

    # ******************************* SAVING RESULTS OF TRAINING ******************************
    save_pictures(np.linspace(1, opt.n_epochs, opt.n_epochs), train_loss_list,
                  'red', 'Loss',
                  os.path.join(opt.result_path, 'train_loss.png'))
    save_pictures(np.linspace(1, opt.n_epochs, opt.n_epochs), train_acc_list,
                  'blue', 'Accuracy',
                  os.path.join(opt.result_path, 'train_accuracy.png'))
    save_pictures(np.linspace(1, opt.n_epochs, opt.n_epochs), valid_acc_list,
                  'blue', 'Accuracy',
                  os.path.join(opt.result_path, 'validation_accuracy.png'))
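# `save_pictures` is called above but not defined in this file. A minimal
# matplotlib sketch matching the call sites (x values, y values, line color,
# y-axis label, output path) might look like this -- an assumed
# implementation, not the original helper:
import matplotlib
matplotlib.use('Agg')  # render off-screen so no display is needed
import matplotlib.pyplot as plt

def save_pictures(x, y, color, ylabel, save_path):
    """Plot a per-epoch curve and save it to disk."""
    plt.figure()
    plt.plot(x, y, color=color)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()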