def run(init_lr=0.0001, max_steps=5e3, frames_per_clip=64,
        dataset_path='/media/sitzikbs/6TB/ANU_ikea_dataset/',
        train_filename='train_cross_env.txt', testset_filename='test_cross_env.txt',
        db_filename='../ikea_dataset_frame_labeler/ikea_annotation_db', logdir='',
        frame_skip=1, batch_size=8, camera='dev3', refine=False, refine_epoch=0,
        load_mode='img', pose_path='predictions/pose2d/openpose', arch='HCN',
        steps_per_update=1):
    """Train a skeleton-based action classifier (HCN or ST-GCN) on the IKEA ASM dataset.

    Runs an epoch-style loop (one `steps` increment per pass over the train loader),
    interleaving test-set batches so that test progress tracks train progress.
    Checkpoints every 100 epochs into `logdir` and keeps the best-accuracy model
    as 'best_classifier.pth'. TensorBoard scalars are written under
    `logdir`/train and `logdir`/test.

    Note: `max_steps=5e3` is a float; it is only compared with `<`, so this works,
    but an int would be cleaner.
    """
    os.makedirs(logdir, exist_ok=True)
    # setup dataset
    train_transforms = None
    test_transforms = None
    train_dataset = Dataset(dataset_path, db_filename=db_filename,
                            train_filename=train_filename,
                            transform=train_transforms, set='train', camera=camera,
                            frame_skip=frame_skip, frames_per_clip=frames_per_clip,
                            mode=load_mode, pose_path=pose_path, arch=arch)
    print("Number of clips in the dataset:{}".format(len(train_dataset)))
    # Class-balanced sampling: rare action classes are drawn more often.
    weights = utils.make_weights_for_balanced_classes(
        train_dataset.clip_set, train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   sampler=sampler, num_workers=6,
                                                   pin_memory=False)
    test_dataset = Dataset(dataset_path, db_filename=db_filename,
                           train_filename=train_filename,
                           test_filename=testset_filename, transform=test_transforms,
                           set='test', camera=camera, frame_skip=frame_skip,
                           frames_per_clip=frames_per_clip, mode=load_mode,
                           pose_path=pose_path, arch=arch)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  shuffle=True, num_workers=6,
                                                  pin_memory=False)
    # setup the model
    num_classes = train_dataset.num_classes
    if arch == 'HCN':
        # in_channel=2 -> 2D joint coordinates; num_joint=19 matches the OpenPose layout used here.
        model = HCN.HCN(in_channel=2, num_joint=19, num_person=1, out_channel=64,
                        window_size=frames_per_clip, num_class=num_classes)
    elif arch == 'ST_GCN':
        graph_args = {'layout': 'openpose', 'strategy': 'spatial'}  # ntu-rgb+d
        model = st_gcn.Model(in_channels=2, num_class=num_classes, graph_args=graph_args,
                             edge_importance_weighting=True,
                             dropout=0.5)
    else:
        raise ValueError("Unsupported architecture: please select HCN | ST_GCN")
    if refine:
        # Resume fine-tuning from an earlier checkpoint saved by this function.
        if refine_epoch == 0:
            raise ValueError(
                "You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        model.load_state_dict(checkpoint["model_state_dict"])
    model.cuda()
    # model = nn.DataParallel(model)
    lr = init_lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [1000, 2000, 3000, 4000])
    # criterion = nn.CrossEntropyLoss() # standard crossentropy loss for classification
    if refine:
        # Restore optimizer/scheduler state so fine-tuning continues seamlessly.
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))
    num_steps_per_update = steps_per_update  # accum gradient - try to have number of examples per update match original code 8*5*4
    # eval_steps = 5
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True
    best_acc = 0
    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        if steps <= refine_epoch and refine and refine_flag:
            # Fast-forward the scheduler/step counters up to the resume epoch
            # without touching the data.
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_set)
            continue
        else:
            refine_flag = False
        # Each epoch has a training and validation phase
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)
        tot_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()
        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):
            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data
            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())
            # Collapse per-frame one-hot labels to class indices for CrossEntropyLoss.
            labels = torch.argmax(labels, dim=1)
            logits = model(inputs)
            t = inputs.size(2)
            # Clip-level logits -> per-frame logits by linear interpolation along time.
            per_frame_logits = torch.nn.functional.interpolate(
                logits.unsqueeze(-1), t, mode='linear', align_corners=True)
            probs = torch.nn.functional.softmax(per_frame_logits, dim=1)  # NOTE(review): unused
            loss = nn.CrossEntropyLoss()(per_frame_logits, labels)
            tot_loss += loss.item()
            loss.backward()  # gradients accumulate until the update below
            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                steps, acc.item(), loss.item(), train_batchind, len(train_dataloader)))
            if (num_iter == num_steps_per_update
                    or train_batchind == len(train_dataloader) - 1):
                # Apply the accumulated gradients (partial window on the last batch).
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                # Evaluate one test batch whenever test progress lags train progress.
                model.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), requires_grad=True)
                labels = Variable(labels.cuda())
                labels = torch.argmax(labels, dim=1)
                with torch.no_grad():
                    logits = model(inputs)
                    t = inputs.size(2)
                    per_frame_logits = torch.nn.functional.interpolate(
                        logits.unsqueeze(-1), t, mode='linear', align_corners=True)
                    probs = torch.nn.functional.softmax(per_frame_logits, dim=1)  # NOTE(review): unused
                    loss = nn.CrossEntropyLoss()(per_frame_logits, labels)
                    acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)
                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                    steps, acc.item(), loss.item(), test_batchind, len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                model.train(True)
        if steps % 100 == 0:
            # save model
            # NOTE(review): plain string concat (not os.path.join) — relies on logdir
            # ending with a path separator; 'best_classifier.pth' below uses join.
            torch.save({"model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')
        # remember best prec@1 and save checkpoint
        # NOTE(review): `acc` is the last mid-epoch test-batch accuracy (a tensor),
        # not an epoch-level average — "best" is judged on a single batch.
        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        if (is_best):
            # NOTE(review): deepcopy-then-load of the same state_dict is a no-op round trip.
            model_tmp = copy.deepcopy(model.state_dict())
            model.load_state_dict(model_tmp)
            torch.save({"model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       os.path.join(logdir, 'best_classifier.pth'))
        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
def run(dataset_path, db_filename, model_path, output_path, frames_per_clip=16,
        testset_filename='test_cross_env.txt', trainset_filename='train_cross_env.txt',
        frame_skip=1, batch_size=8, device='dev3', arch='HCN',
        pose_path='predictions/pose2d/openpose'):
    """Run inference with a trained skeleton model (HCN or ST-GCN) on the test set.

    Accumulates per-frame predictions and softmax scores per video, then writes
    'pred.npy' (dict of per-video label/logit arrays) and 'action_segments.json'
    (segment-level annotations) into `output_path`.

    Fixes vs. original: the whole evaluation now runs under torch.no_grad()
    with eval mode set once before the loop (the original tracked gradients on
    requires_grad=True inputs and re-set train(False) every batch).
    """
    pred_output_filename = os.path.join(output_path, 'pred.npy')
    json_output_filename = os.path.join(output_path, 'action_segments.json')
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    test_dataset = Dataset(dataset_path, db_filename=db_filename,
                           test_filename=testset_filename,
                           train_filename=trainset_filename,
                           transform=test_transforms, set='test', camera=device,
                           frame_skip=frame_skip, frames_per_clip=frames_per_clip,
                           mode='img', pose_path=pose_path, arch=arch)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  shuffle=False, num_workers=6,
                                                  pin_memory=True)
    # setup the model
    num_classes = test_dataset.num_classes
    if arch == 'HCN':
        model = HCN.HCN(in_channel=2, num_joint=19, num_person=1, out_channel=64,
                        window_size=frames_per_clip, num_class=num_classes)
    elif arch == 'ST_GCN':
        graph_args = {'layout': 'openpose', 'strategy': 'spatial'}  # layout:'ntu-rgb+d'
        model = st_gcn.Model(in_channels=2, num_class=num_classes,
                             graph_args=graph_args,
                             edge_importance_weighting=True, dropout=0.5)
    else:
        raise ValueError("Unsupported architecture: please select HCN | ST_GCN")

    checkpoints = torch.load(model_path)
    model.load_state_dict(checkpoints["model_state_dict"])  # load trained model
    model.cuda()
    # model = nn.DataParallel(model)
    model.train(False)  # evaluate mode, set once (was re-set every batch)

    n_examples = 0
    # Iterate over data.
    avg_acc = []
    pred_labels_per_video = [[] for i in range(len(test_dataset.video_list))]
    logits_per_video = [[] for i in range(len(test_dataset.video_list))]
    with torch.no_grad():  # pure inference: no autograd graph / gradient memory
        for test_batchind, data in enumerate(test_dataloader):
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data
            inputs = inputs.cuda()
            labels = labels.cuda()

            t = inputs.size(2)
            logits = model(inputs)
            # Clip-level logits -> per-frame logits via linear interpolation along time.
            logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t,
                                                     mode='linear', align_corners=True)

            acc = i3d_utils.accuracy_v2(torch.argmax(logits, dim=1),
                                        torch.argmax(labels, dim=1))
            avg_acc.append(acc.item())
            n_examples += batch_size
            print('batch Acc: {}, [{} / {}]'.format(acc.item(), test_batchind,
                                                    len(test_dataloader)))
            # [batch, classes, frames] -> [batch, frames, classes] -> flat per-frame rows
            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(inputs.shape[0] * frames_per_clip, -1)
            pred_labels = torch.argmax(logits, 1).detach().cpu().numpy().tolist()
            logits = torch.nn.functional.softmax(logits, dim=1).detach().cpu().numpy().tolist()

            pred_labels_per_video, logits_per_video = \
                utils.accume_per_video_predictions(vid_idx, frame_pad,
                                                   pred_labels_per_video,
                                                   logits_per_video, pred_labels,
                                                   logits, frames_per_clip)

    pred_labels_per_video = [np.array(pred_video_labels)
                             for pred_video_labels in pred_labels_per_video]
    logits_per_video = [np.array(pred_video_logits)
                        for pred_video_logits in logits_per_video]
    np.save(pred_output_filename, {'pred_labels': pred_labels_per_video,
                                   'logits': logits_per_video})
    utils.convert_frame_logits_to_segment_json(logits_per_video, json_output_filename,
                                               test_dataset.video_list,
                                               test_dataset.action_list)
def run(
    dataset_path,
    annotation_path,
    model_path,
    output_path,
    frames_per_clip,
    frame_skip,
    mode,
    batch_size,
):
    """Run inference with a trained I3D model ('rgb' or 'flow') on the test set.

    Accumulates per-frame predictions/softmax scores per video and writes
    'predictions.npy' and 'action_segments.json' into `output_path`.

    Fixes vs. original: evaluation now runs under torch.no_grad() with eval
    mode set once before the loop (the original tracked gradients on
    requires_grad=True inputs and re-set train(False) every batch).
    """
    pred_output_filename = os.path.join(output_path, 'predictions.npy')
    json_output_filename = os.path.join(output_path, 'action_segments.json')

    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    test_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=test_transforms,
        index_filename="test_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=6,
                                                  pin_memory=True)

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(157, in_channels=3)
    num_classes = len(test_dataset.action_name_list)
    i3d.replace_logits(num_classes)

    checkpoints = torch.load(model_path)
    i3d.load_state_dict(checkpoints["model_state_dict"])  # load trained model
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    i3d.train(False)  # evaluate mode, set once (was re-set every batch)

    n_examples = 0
    # Iterate over data.
    avg_acc = []
    pred_labels_per_video = [[] for i in range(len(test_dataset.video_list))]
    logits_per_video = [[] for i in range(len(test_dataset.video_list))]
    # last_vid_idx = 0
    bar = tqdm(test_dataloader)
    with torch.no_grad():  # pure inference: no autograd graph / gradient memory
        for test_batchind, data in enumerate(bar):
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data
            inputs = inputs.cuda()
            labels = labels.cuda()

            t = inputs.size(2)
            logits = i3d(inputs)
            # b x classes x frames: upsample clip logits to one score per frame
            logits = F.interpolate(logits, t, mode='linear', align_corners=True)

            acc = i3d_utils.accuracy_v2(torch.argmax(logits, dim=1),
                                        torch.argmax(labels, dim=1))
            avg_acc.append(acc.item())
            n_examples += batch_size
            bar.set_postfix({"Batch Acc": acc.item()})
            # print('batch Acc: {}, [{} / {}]'.format(acc.item(), test_batchind + 1, len(test_dataloader)))
            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(inputs.shape[0] * frames_per_clip, -1)
            pred_labels = torch.argmax(logits, 1).detach().cpu().numpy()
            logits = torch.nn.functional.softmax(
                logits, dim=1).detach().cpu().numpy().tolist()

            pred_labels_per_video, logits_per_video = \
                utils.accume_per_video_predictions(vid_idx, frame_pad,
                                                   pred_labels_per_video,
                                                   logits_per_video, pred_labels,
                                                   logits, frames_per_clip)

    pred_labels_per_video = [
        np.array(pred_video_labels)
        for pred_video_labels in pred_labels_per_video
    ]
    logits_per_video = [
        np.array(pred_video_logits) for pred_video_logits in logits_per_video
    ]
    np.save(pred_output_filename, {
        'pred_labels': pred_labels_per_video,
        'logits': logits_per_video
    })
    utils.convert_frame_logits_to_segment_json(
        logits_per_video, json_output_filename,
        [video[0] for video in test_dataset.video_list],
        test_dataset.action_name_list)
def run(
    dataset_path,
    annotation_path,
    init_lr,
    frames_per_clip,
    mode,
    logdir,
    frame_skip,
    batch_size,
    refine,
    refine_epoch,
    pretrained_model,
    max_steps,
):
    """Fine-tune a pretrained I3D model ('rgb' or 'flow') on the dataset.

    Only the logits and Mixed_5c layers are trained; everything else is frozen.
    Uses gradient accumulation (num_steps_per_update batches per optimizer step),
    interleaves test-set batches with training, writes TensorBoard scalars to
    `logdir`/train and `logdir`/test, and checkpoints every 2 epochs.

    Fixes vs. original:
    - test-writer 'cls loss'/'loc loss' scalars were swapped (each logged the other);
    - the test branch reused `t` from the last *train* batch; it is now recomputed
      from the test inputs before interpolation;
    - tot_loc_loss/tot_cls_loss are reset after each optimizer update like tot_loss,
      so the per-update train scalars no longer grow over the epoch.
    """
    os.makedirs(logdir, exist_ok=True)
    # setup dataset
    train_transforms = transforms.Compose(
        [
            videotransforms.RandomCrop(224),
            videotransforms.RandomHorizontalFlip(),
        ]
    )
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    train_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=train_transforms,
        index_filename="train_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )
    print("Number of clips in the train dataset:{}".format(len(train_dataset)))
    test_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=test_transforms,
        index_filename="test_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )
    print("Number of clips in the test dataset:{}".format(len(test_dataset)))

    # Class-balanced sampling so rare action classes are drawn more often.
    weights = utils.make_weights_for_balanced_classes(train_dataset.clip_list,
                                                      train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=sampler, num_workers=3,
        pin_memory=True
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, num_workers=3,
        pin_memory=True
    )

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_' + pretrained_model + '.pt'))
    else:
        i3d = InceptionI3d(157, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_' + pretrained_model + '.pt'))
    num_classes = len(train_dataset.action_name_list)
    i3d.replace_logits(num_classes)

    for name, param in i3d.named_parameters():  # freeze i3d parameters
        # Only the classification head and the last inception block stay trainable.
        if 'logits' in name:
            param.requires_grad = True
        elif 'Mixed_5c' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

    if refine:
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        i3d.load_state_dict(checkpoint["model_state_dict"])

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [15, 30, 45, 60])
    if refine:
        # Restore optimizer/scheduler state so fine-tuning continues seamlessly.
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    num_steps_per_update = 4 * 5  # accum gradient - try to have number of examples per update match original code 8*5*4
    # eval_steps = 5
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True
    while steps <= max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        if steps <= refine_epoch and refine and refine_flag:
            # Fast-forward scheduler/step counters up to the resume epoch.
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_list)
            continue
        else:
            refine_flag = False
        # Each epoch has a training and validation phase
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader)
        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()
        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):
            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data

            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())

            t = inputs.size(2)
            per_frame_logits = i3d(inputs)
            per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear',
                                             align_corners=True)

            # compute localization loss
            loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
            tot_loc_loss += loc_loss.item()

            # compute classification loss (with max-pooling along time B x C x T)
            cls_loss = F.binary_cross_entropy_with_logits(
                torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
            tot_cls_loss += cls_loss.item()

            # Scale by the accumulation window so the effective step is an average.
            loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
            tot_loss += loss.item()
            loss.backward()

            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1),
                                    torch.argmax(labels, dim=1))
            # acc = utils.accuracy(per_frame_logits, labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                steps, acc.item(), loss.item(), train_batchind, len(train_dataloader)))

            if num_iter == num_steps_per_update or train_batchind == len(train_dataloader) - 1:
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('cls loss', tot_cls_loss / n_steps, n_examples)
                train_writer.add_scalar('loc loss', tot_loc_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.
                # BUGFIX: reset the per-update accumulators too; previously they
                # grew over the whole epoch, inflating the logged averages.
                tot_loc_loss = 0.
                tot_cls_loss = 0.

            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                i3d.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data

                inputs = inputs.cuda()
                labels = labels.cuda()

                with torch.no_grad():
                    per_frame_logits = i3d(inputs)
                # BUGFIX: recompute the temporal length from the *test* inputs;
                # previously `t` was reused from the last train batch.
                t = inputs.size(2)
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear',
                                                 align_corners=True)

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1),
                                        torch.argmax(labels, dim=1))
                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                    steps, acc.item(), loss.item(), test_batchind, len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                # BUGFIX: these two scalars were swapped in the original.
                test_writer.add_scalar('cls loss', cls_loss.item(), n_examples)
                test_writer.add_scalar('loc loss', loc_loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                i3d.train(True)

        if steps % 2 == 0:
            # save model
            torch.save({"model_state_dict": i3d.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')
        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
def run(dataset_path, db_filename, model_path, output_path, frames_per_clip=16,
        mode='rgb', testset_filename='test_cross_env.txt',
        trainset_filename='train_cross_env.txt', frame_skip=1, batch_size=8,
        device='dev3', model_name='p3d'):
    """Run inference with a trained C3D or P3D model on the test set.

    Accumulates per-frame predictions/softmax scores per video and writes
    'pred.npy' and 'action_segments.json' into `output_path`.

    Fixes vs. original: evaluation now runs under torch.no_grad() with eval
    mode set once before the loop (the original tracked gradients on
    requires_grad=True inputs and re-set train(False) every batch).
    """
    pred_output_filename = os.path.join(output_path, 'pred.npy')
    json_output_filename = os.path.join(output_path, 'action_segments.json')

    # setup dataset — C3D expects 112x112 crops, P3D 160x160.
    img_size = 112 if model_name == 'c3d' else 160
    test_transforms = transforms.Compose([videotransforms.CenterCrop(img_size)])

    test_dataset = Dataset(dataset_path, db_filename=db_filename,
                           test_filename=testset_filename,
                           train_filename=trainset_filename,
                           transform=test_transforms, set='test', camera=device,
                           frame_skip=frame_skip, frames_per_clip=frames_per_clip,
                           resize=None, mode='img', input_type=mode)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False, num_workers=6,
                                                  pin_memory=True)

    # setup the model
    num_classes = test_dataset.num_classes
    if model_name == 'c3d':
        model = c3d.C3D()
        model.load_state_dict(torch.load('c3d.pickle'))
        model.replace_logits(num_classes)
    elif model_name == 'p3d':
        model = p3d.P3D199(pretrained=False, modality='RGB', num_classes=num_classes)
    else:
        raise ValueError("unsupported model")

    checkpoints = torch.load(model_path)
    model.load_state_dict(checkpoints["model_state_dict"])  # load trained model
    model.cuda()
    model = nn.DataParallel(model)
    model.train(False)  # evaluate mode, set once (was re-set every batch)

    n_examples = 0
    # Iterate over data.
    avg_acc = []
    pred_labels_per_video = [[] for i in range(len(test_dataset.video_list))]
    logits_per_video = [[] for i in range(len(test_dataset.video_list))]
    with torch.no_grad():  # pure inference: no autograd graph / gradient memory
        for test_batchind, data in enumerate(test_dataloader):
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data
            inputs = inputs.cuda()
            labels = labels.cuda()

            t = inputs.size(2)
            logits = model(inputs)
            # Clip-level logits -> per-frame logits via linear interpolation along time.
            logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t,
                                                     mode='linear', align_corners=True)

            acc = i3d_utils.accuracy_v2(torch.argmax(logits, dim=1),
                                        torch.argmax(labels, dim=1))
            avg_acc.append(acc.item())
            n_examples += batch_size
            print('batch Acc: {}, [{} / {}]'.format(acc.item(), test_batchind,
                                                    len(test_dataloader)))
            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(inputs.shape[0] * frames_per_clip, -1)
            pred_labels = torch.argmax(logits, 1).detach().cpu().numpy()
            logits = torch.nn.functional.softmax(logits, dim=1).detach().cpu().numpy().tolist()

            pred_labels_per_video, logits_per_video = \
                utils.accume_per_video_predictions(vid_idx, frame_pad,
                                                   pred_labels_per_video,
                                                   logits_per_video, pred_labels,
                                                   logits, frames_per_clip)

    pred_labels_per_video = [np.array(pred_video_labels)
                             for pred_video_labels in pred_labels_per_video]
    logits_per_video = [np.array(pred_video_logits)
                        for pred_video_logits in logits_per_video]
    np.save(pred_output_filename, {'pred_labels': pred_labels_per_video,
                                   'logits': logits_per_video})
    utils.convert_frame_logits_to_segment_json(logits_per_video, json_output_filename,
                                               test_dataset.video_list,
                                               test_dataset.action_list)
def run(init_lr=0.0001, max_steps=64e3, frames_per_clip=16,
        dataset_path='/media/sitzikbs/6TB/ANU_ikea_dataset/',
        train_filename='train_cross_env.txt', testset_filename='test_cross_env.txt',
        db_filename='../ikea_dataset_frame_labeler/ikea_annotation_db', logdir='',
        frame_skip=1, batch_size=8, camera='dev3', refine=False, refine_epoch=0,
        load_mode='vid', input_type='rgb', model_name='c3d'):
    """Train a C3D or P3D video action classifier on the IKEA ASM dataset.

    Runs an epoch-style loop (one `steps` increment per pass over the train
    loader) with gradient accumulation, interleaving test-set batches so that
    test progress tracks train progress. Checkpoints every 2 epochs into
    `logdir`; TensorBoard scalars go under `logdir`/train and `logdir`/test.

    Note: `max_steps=64e3` is a float; it is only compared with `<`, so this
    works, but an int would be cleaner.
    """
    os.makedirs(logdir, exist_ok=True)
    # setup dataset — C3D expects 112x112 crops, P3D 160x160.
    img_size = 112 if model_name == 'c3d' else 160  # 224
    train_transforms = transforms.Compose([videotransforms.RandomCrop(img_size),
                                           videotransforms.RandomHorizontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(img_size)])
    train_dataset = Dataset(dataset_path, db_filename=db_filename,
                            train_filename=train_filename,
                            transform=train_transforms, set='train', camera=camera,
                            frame_skip=frame_skip, frames_per_clip=frames_per_clip,
                            resize=None, mode=load_mode, input_type=input_type)
    print("Number of clips in the dataset:{}".format(len(train_dataset)))
    # Class-balanced sampling: rare action classes are drawn more often.
    weights = utils.make_weights_for_balanced_classes(train_dataset.clip_set,
                                                      train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   sampler=sampler, num_workers=6,
                                                   pin_memory=False)
    test_dataset = Dataset(dataset_path, db_filename=db_filename,
                           train_filename=train_filename,
                           test_filename=testset_filename, transform=test_transforms,
                           set='test', camera=camera, frame_skip=frame_skip,
                           frames_per_clip=frames_per_clip, resize=None,
                           mode=load_mode, input_type=input_type)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  shuffle=True, num_workers=6,
                                                  pin_memory=False)
    # setup the model
    num_classes = train_dataset.num_classes
    if model_name == 'c3d':
        # Start from the Sports-1M pretrained C3D weights, swap the head.
        model = c3d.C3D()
        model.load_state_dict(torch.load('c3d.pickle'))
        model.replace_logits(num_classes)
    elif model_name == 'p3d':
        model = p3d.P3D199(pretrained=True, modality='RGB', num_classes=num_classes)
    else:
        raise ValueError("unsupported model")
    if refine:
        # Resume fine-tuning from an earlier checkpoint saved by this function.
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6)+'.pt')
        checkpoint = torch.load(refine_model_filename)
        model.load_state_dict(checkpoint["model_state_dict"])
    model.cuda()
    model = nn.DataParallel(model)
    lr = init_lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 20, 30, 40])
    criterion = nn.CrossEntropyLoss()  # standard crossentropy loss for classification
    if refine:
        # Restore optimizer/scheduler state so fine-tuning continues seamlessly.
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))
    num_steps_per_update = 4  # accum gradient - try to have number of examples per update match original code 8*5*4
    # eval_steps = 5
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True
    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        if steps <= refine_epoch and refine and refine_flag:
            # Fast-forward the scheduler/step counters up to the resume epoch
            # without touching the data.
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_set)
            continue
        else:
            refine_flag = False
        # Each epoch has a training and validation phase
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)
        tot_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()
        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):
            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data
            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())
            # Collapse per-frame one-hot labels to class indices for CrossEntropyLoss.
            labels = torch.argmax(labels, dim=1)
            logits = model(inputs)
            t = inputs.size(2)
            # Clip-level logits -> per-frame logits by linear interpolation along time.
            per_frame_logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t,
                                                               mode='linear',
                                                               align_corners=True)
            probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
            preds = torch.max(probs, 1)[1]  # NOTE(review): probs/preds are unused below
            loss = criterion(per_frame_logits, labels)
            tot_loss += loss.item()
            loss.backward()  # gradients accumulate until the update below
            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                steps, acc.item(), loss.item(), train_batchind, len(train_dataloader)))
            if (num_iter == num_steps_per_update or train_batchind == len(train_dataloader)-1) :
                # Apply the accumulated gradients (partial window on the last batch).
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader)-1:
                    n_steps = num_iter
                n_examples += batch_size*n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                # Evaluate one test batch whenever test progress lags train progress.
                model.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), requires_grad=True)
                labels = Variable(labels.cuda())
                labels = torch.argmax(labels, dim=1)
                with torch.no_grad():
                    logits = model(inputs)
                    t = inputs.size(2)
                    per_frame_logits = torch.nn.functional.interpolate(
                        logits.unsqueeze(-1), t, mode='linear', align_corners=True)
                    probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
                    preds = torch.max(probs, 1)[1]  # NOTE(review): unused
                    loss = criterion(per_frame_logits, labels)
                    acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)
                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                    steps, acc.item(), loss.item(), test_batchind, len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                model.train(True)
        if steps % 2 == 0:
            # save model
            # NOTE(review): plain string concat (not os.path.join) — relies on
            # logdir ending with a path separator.
            torch.save({"model_state_dict": model.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')
        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()