def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument('--model', '-m', choices=['resnet50', 'resnet101'], default='resnet50', help='Base model of Mask R-CNN.') parser.add_argument('--pooling-func', '-p', choices=['pooling', 'align', 'resize'], default='align', help='Pooling function.') parser.add_argument('--gpu', '-g', type=int, help='GPU id.') parser.add_argument('--multi-node', action='store_true', help='use multi node') default_max_epoch = 120 parser.add_argument('--max-epoch', type=float, default=default_max_epoch, help='epoch') parser.add_argument('--pretrained-model', help='pretrained model') parser.add_argument( '--notrain', choices=['pix', 'ins'], help='not training pixel or instance segmentation', ) parser.add_argument( '--lr-base', default=0.00125, type=float, help='learning rate per batch size 1', ) parser.add_argument( '--noaugmentation', action='store_true', help='not apply data augmentation', ) parser.add_argument( '--pix-loss-scale', default=1., type=float, help='scale of pixel loss', ) parser.add_argument( '--dataset', default='occlusion', choices=['occlusion', 'occlusion+synthetic'], help='dataset', ) args = parser.parse_args() if args.multi_node: import chainermn comm = chainermn.create_communicator('pure_nccl') device = comm.intra_rank args.n_node = comm.inter_size args.n_gpu = comm.size chainer.cuda.get_device_from_id(device).use() else: if args.gpu is None: print( '--gpu option is required if --multi-node is not specified.', file=sys.stderr, ) sys.exit(1) args.n_node = 1 args.n_gpu = 1 chainer.cuda.get_device_from_id(args.gpu).use() device = args.gpu args.seed = 0 now = datetime.datetime.now() args.timestamp = now.isoformat() if not args.multi_node or comm.rank == 0: out = osp.join(here, 'logs', now.strftime('%Y%m%d_%H%M%S.%f')) else: out = None if args.multi_node: args.out = comm.bcast_obj(out) else: args.out = out del out # 0.00125 * 8 = 0.01 in original args.batch_size = 1 * args.n_gpu args.lr = args.lr_base * args.batch_size args.weight_decay = 0.0001 # lr / 10 at 120k iteration with # 160k iteration * 16 batchsize in original args.step_size = [(120e3 / 180e3) * args.max_epoch, (160e3 / 180e3) * args.max_epoch] random.seed(args.seed) np.random.seed(args.seed) # Default Config # args.min_size = 800 # args.max_size = 1333 # args.anchor_scales = (2, 4, 8, 16, 32) args.min_size = 600 args.max_size = 1000 args.anchor_scales = (4, 8, 16, 32) args.rpn_dim = 512 # ------------------------------------------------------------------------- # Dataset train_data = \ instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset( 'train', augmentation=not args.noaugmentation ) if args.dataset == 'occlusion+synthetic': synthetic_data = \ instance_occlsegm.datasets.\ PanopticOcclusionSegmentationSyntheticDataset( do_aug=not args.noaugmentation, size=len(train_data), ) train_data = chainer.datasets.ConcatenatedDataset( train_data, synthetic_data) test_data = \ instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset( 'test' ) fg_class_names = test_data.class_names args.class_names = fg_class_names.tolist() test_data_list = test_data.get_video_datasets() del test_data # ------------------------------------------------------------------------- # Model + Optimizer. 
if args.pooling_func == 'align': pooling_func = cmr.functions.roi_align_2d elif args.pooling_func == 'pooling': pooling_func = chainer.functions.roi_pooling_2d elif args.pooling_func == 'resize': pooling_func = cmr.functions.crop_and_resize else: raise ValueError args.mask_loss = 'softmax' assert args.model in ['resnet50', 'resnet101'] n_layers = int(args.model.lstrip('resnet')) mask_rcnn = instance_occlsegm.models.MaskRCNNPanopticResNet( n_layers=n_layers, n_fg_class=len(fg_class_names), pretrained_model=args.pretrained_model, pooling_func=pooling_func, anchor_scales=args.anchor_scales, min_size=args.min_size, max_size=args.max_size, rpn_dim=args.rpn_dim, ) mask_rcnn.nms_thresh = 0.3 mask_rcnn.score_thresh = 0.05 model = instance_occlsegm.models.MaskRCNNPanopticTrainChain( mask_rcnn, notrain=args.notrain, pix_loss_scale=args.pix_loss_scale, ) if args.multi_node or args.gpu >= 0: model.to_gpu() optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9) if args.multi_node: optimizer = chainermn.create_multi_node_optimizer(optimizer, comm) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay)) mask_rcnn.extractor.conv1.disable_update() mask_rcnn.extractor.bn1.disable_update() mask_rcnn.extractor.res2.disable_update() for link in mask_rcnn.links(): if isinstance(link, cmr.links.AffineChannel2D): link.disable_update() # ------------------------------------------------------------------------- # Iterator. train_data = chainer.datasets.TransformDataset( train_data, instance_occlsegm.datasets.MaskRCNNPanopticTransform(mask_rcnn), ) test_data_list = [ chainer.datasets.TransformDataset( td, instance_occlsegm.datasets.MaskRCNNPanopticTransform( mask_rcnn, train=False, )) for td in test_data_list ] test_concat_data = chainer.datasets.ConcatenatedDataset(*test_data_list) if args.multi_node: if comm.rank != 0: train_data = None train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True) # for training train_iter = chainer.iterators.MultiprocessIterator( train_data, batch_size=1, n_processes=14, shared_mem=10**9, ) # for evaluation test_iters = { i: chainer.iterators.SerialIterator(td, batch_size=1, repeat=False, shuffle=False) for i, td in enumerate(test_data_list) } # for visualization test_concat_iter = chainer.iterators.SerialIterator(test_concat_data, batch_size=1, repeat=False, shuffle=False) # ------------------------------------------------------------------------- converter = functools.partial( cmr.datasets.concat_examples, padding=0, # img, bboxes, labels, masks, scales, lbls_vis, lbls_occ indices_concat=[0, 2, 3, 4, 5, 6], indices_to_device=[0, 1, 5, 6], ) updater = chainer.training.updater.StandardUpdater(train_iter, optimizer, device=device, converter=converter) trainer = training.Trainer(updater, (args.max_epoch, 'epoch'), out=args.out) trainer.extend(extensions.FailOnNonNumber()) trainer.extend(extensions.ExponentialShift('lr', 0.1), trigger=training.triggers.ManualScheduleTrigger( args.step_size, 'epoch')) eval_interval = 1, 'epoch' log_interval = 10, 'iteration' plot_interval = 0.1, 'epoch' print_interval = log_interval if not args.multi_node or comm.rank == 0: evaluator = \ instance_occlsegm.extensions.PanopticSegmentationVOCEvaluator( test_iters, model.mask_rcnn, device=device, use_07_metric=False, label_names=fg_class_names, ) trainer.extend(evaluator, trigger=eval_interval) trainer.extend(extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'), trigger=training.triggers.MaxValueTrigger( 'validation/main/mpq', 
                           eval_interval))
    args.git_hash = cmr.utils.git_hash()
    args.hostname = socket.gethostname()
    trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
    trainer.extend(
        instance_occlsegm.extensions.PanopticSegmentationVisReport(
            test_concat_iter, model.mask_rcnn,
            label_names=fg_class_names),
        trigger=eval_interval,
    )
    trainer.extend(chainer.training.extensions.observe_lr(),
                   trigger=log_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(
        extensions.PrintReport([
            'iteration',
            'epoch',
            'elapsed_time',
            'lr',
            'main/loss',
            'main/roi_loc_loss',
            'main/roi_cls_loss',
            'main/roi_mask_loss',
            'main/rpn_loc_loss',
            'main/rpn_cls_loss',
            'main/pix_vis_loss',
            'main/pix_occ_loss',
            'validation/main/miou',
            'validation/main/mpq',
        ]),
        trigger=print_interval,
    )
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # plot
    assert extensions.PlotReport.available()
    trainer.extend(
        extensions.PlotReport(
            [
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
                'main/ins_loss',
                'main/pix_vis_loss',
                'main/pix_occ_loss',
                'main/pix_loss',
                'main/loss',
            ],
            file_name='loss.png',
            trigger=plot_interval,
        ),
        trigger=plot_interval,
    )
    trainer.extend(
        extensions.PlotReport([
            'validation/main/miou/vis',
            'validation/main/miou/occ',
            'validation/main/miou',
            'validation/main/map',
            'validation/main/msq',
            'validation/main/mdq',
            'validation/main/mpq',
        ],
            file_name='accuracy.png',
            trigger=plot_interval),
        trigger=eval_interval,
    )
    trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
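
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the training script above): how the linear
# learning-rate scaling and the step schedule derived in main() relate to the
# reference settings its comments mention (batch size 16, lr drops at 120k and
# 160k of 180k iterations). The helper name and the example values below are
# assumptions introduced only for this sketch.
def _sketch_lr_schedule(lr_base=0.00125, n_gpu=8, max_epoch=120):
    """Return (lr, step_epochs) the same way main() derives them."""
    batch_size = 1 * n_gpu            # one image per GPU
    lr = lr_base * batch_size         # 0.00125 * 8 = 0.01
    step_epochs = [(120e3 / 180e3) * max_epoch,   # first x0.1 drop
                   (160e3 / 180e3) * max_epoch]   # second x0.1 drop
    return lr, step_epochs


# Example: with 8 GPUs and 120 epochs this yields lr=0.01 and drops at roughly
# epochs [80.0, 106.7], matching the ManualScheduleTrigger registered above.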
def main():
    parser = argparse.ArgumentParser(description='Train Deblur Network')
    parser.add_argument('--seed', '-s', type=int, default=0,
                        help='seed for random values')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.1,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=50,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print(args)
    print('')

    set_random_seed(args.seed)

    predictor = srcnn.create_srcnn()
    model = L.Classifier(predictor, lossfun=F.mean_squared_error, accfun=psnr)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    base_dir = 'data/blurred_sharp'
    train_data = pairwise_dataset.PairwiseDataset(
        blur_image_list=str(Path(base_dir).joinpath('train_blur_images.txt')),
        sharp_image_list=str(
            Path(base_dir).joinpath('train_sharp_images.txt')),
        root=base_dir)
    train_data = chainer.datasets.TransformDataset(train_data,
                                                   transform.Transform())
    test_data = pairwise_dataset.PairwiseDataset(
        blur_image_list=str(Path(base_dir).joinpath('test_blur_images.txt')),
        sharp_image_list=str(Path(base_dir).joinpath('test_sharp_images.txt')),
        root=base_dir)
    # Normally the test set would not be transformed, but without the
    # transform the resolution changes, so it is applied here as well.
    test_data = chainer.datasets.TransformDataset(test_data,
                                                  transform.Transform())

    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.FailOnNonNumber())

    # Evaluate the model with the test dataset for each epoch
    eval_trigger = (1, 'epoch')
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=eval_trigger)

    # Drop the learning rate by a factor of 10 at 50% and 75% of the total
    # number of epochs.
    lr_drop_epoch = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_ratio = 0.1
    print('lr schedule: {}, timing: {}'.format(lr_drop_ratio, lr_drop_epoch))

    def lr_drop(trainer):
        trainer.updater.get_optimizer('main').lr *= lr_drop_ratio

    trainer.extend(lr_drop,
                   trigger=chainer.training.triggers.ManualScheduleTrigger(
                       lr_drop_epoch, 'epoch'))
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a snapshot of the predictor at each epoch
    trainer.extend(extensions.snapshot_object(model.predictor,
                                              'model_{.updater.epoch}.npz'),
                   trigger=(1, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called
    # by either the updater or the evaluator.
    trainer.extend(extensions.PrintReport([
        'epoch', 'lr', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]), trigger=(100, 'iteration'))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # interact with chainerui
    trainer.extend(CommandsExtension(), trigger=(100, 'iteration'))
    # save args
    save_args(args, args.out)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
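
# -----------------------------------------------------------------------------
# The script above passes ``accfun=psnr`` to L.Classifier, but the psnr helper
# itself is defined elsewhere in the repository. The sketch below is only a
# minimal example of such a metric; it assumes images scaled to [0, 1], which
# may differ from the actual implementation.
import numpy as np

import chainer.functions as F


def psnr_sketch(y, t):
    """Peak signal-to-noise ratio in dB for predictions ``y`` vs targets ``t``."""
    mse = F.mean_squared_error(y, t)
    # PSNR = 20 * log10(MAX) - 10 * log10(MSE); MAX is 1.0 for [0, 1] images.
    return -10.0 * F.log(mse) / np.log(10.0)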
def create_trainer( config: Config, output: Path, ): assert_config(config) if output.exists(): raise Exception(f"output directory {output} already exists.") # model predictor = create_predictor(config.model) if config.train.trained_model is not None: chainer.serializers.load_npz( config.train.trained_model["predictor_path"], predictor) model = Model( loss_config=config.loss, predictor=predictor, local_padding_size=config.dataset.local_padding_size, ) model.to_gpu(config.train.gpu[0]) cuda.get_device_from_id(config.train.gpu[0]).use() # dataset dataset = create_dataset(config.dataset) batchsize_devided = config.train.batchsize // len(config.train.gpu) train_iter = MultiprocessIterator(dataset["train"], config.train.batchsize) test_iter = MultiprocessIterator(dataset["test"], batchsize_devided, repeat=False, shuffle=True) train_test_iter = MultiprocessIterator(dataset["train_test"], batchsize_devided, repeat=False, shuffle=True) if dataset["test_eval"] is not None: test_eval_iter = MultiprocessIterator(dataset["test_eval"], batchsize_devided, repeat=False, shuffle=True) else: test_eval_iter = None # optimizer def create_optimizer(model): cp: Dict[str, Any] = copy(config.train.optimizer) n = cp.pop("name").lower() if n == "adam": optimizer = optimizers.Adam(**cp) elif n == "sgd": optimizer = optimizers.SGD(**cp) else: raise ValueError(n) optimizer.setup(model) if config.train.optimizer_gradient_clipping is not None: optimizer.add_hook( optimizer_hooks.GradientClipping( config.train.optimizer_gradient_clipping)) return optimizer optimizer = create_optimizer(model) if config.train.trained_model is not None: chainer.serializers.load_npz( config.train.trained_model["optimizer_path"], optimizer) # updater if len(config.train.gpu) <= 1: updater = StandardUpdater( iterator=train_iter, optimizer=optimizer, converter=concat_optional, device=config.train.gpu[0], ) else: updater = ParallelUpdater( iterator=train_iter, optimizer=optimizer, converter=concat_optional, devices={ "main" if i == 0 else f"gpu{gpu}": gpu for i, gpu in enumerate(config.train.gpu) }, ) if config.train.trained_model is not None: updater.iteration = optimizer.t # trainer output.mkdir() config.save_as_json((output / "config.json").absolute()) trigger_log = (config.train.log_iteration, "iteration") trigger_snapshot = (config.train.snapshot_iteration, "iteration") trigger_stop = ((config.train.stop_iteration, "iteration") if config.train.stop_iteration is not None else None) trainer = training.Trainer(updater, stop_trigger=trigger_stop, out=output) tb_writer = SummaryWriter(Path(output)) shift_ext = None if config.train.linear_shift is not None: shift_ext = extensions.LinearShift(**config.train.linear_shift) if config.train.step_shift is not None: shift_ext = extensions.StepShift(**config.train.step_shift) if shift_ext is not None: if config.train.trained_model is not None: shift_ext._t = optimizer.t trainer.extend(shift_ext) if config.train.ema_decay is not None: train_predictor = predictor predictor = deepcopy(predictor) ext = ExponentialMovingAverage(target=train_predictor, ema_target=predictor, decay=config.train.ema_decay) trainer.extend(ext, trigger=(1, "iteration")) ext = extensions.Evaluator(test_iter, model, concat_optional, device=config.train.gpu[0]) trainer.extend(ext, name="test", trigger=trigger_log) ext = extensions.Evaluator(train_test_iter, model, concat_optional, device=config.train.gpu[0]) trainer.extend(ext, name="train", trigger=trigger_log) if test_eval_iter is not None: generator = Generator(config=config, 
model=predictor, max_batch_size=config.train.batchsize) generate_evaluator = GenerateEvaluator( generator=generator, time_length=config.dataset.time_length_evaluate, local_padding_time_length=config.dataset. local_padding_time_length_evaluate, ) ext = extensions.Evaluator( test_eval_iter, generate_evaluator, concat_optional, device=config.train.gpu[0], ) trainer.extend(ext, name="eval", trigger=trigger_snapshot) ext = extensions.snapshot_object(predictor, filename="main_{.updater.iteration}.npz") trainer.extend(ext, trigger=trigger_snapshot) # ext = extensions.snapshot_object( # optimizer, filename="optimizer_{.updater.iteration}.npz" # ) # trainer.extend(ext, trigger=trigger_snapshot) trainer.extend(extensions.FailOnNonNumber(), trigger=trigger_log) trainer.extend(extensions.observe_lr(), trigger=trigger_log) trainer.extend(extensions.LogReport(trigger=trigger_log)) trainer.extend( extensions.PrintReport(["iteration", "main/loss", "test/main/loss"]), trigger=trigger_log, ) trainer.extend(TensorBoardReport(writer=tb_writer), trigger=trigger_log) trainer.extend(extensions.dump_graph(root_name="main/loss")) if trigger_stop is not None: trainer.extend(extensions.ProgressBar(trigger_stop)) return trainer
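
# -----------------------------------------------------------------------------
# create_trainer() keeps an exponential moving average of the predictor when
# config.train.ema_decay is set. The ExponentialMovingAverage extension is
# project code and is not shown in this file; the sketch below only
# illustrates the usual EMA update rule it is expected to apply (an
# assumption, not the extension's actual source).
import numpy as np


def ema_update_sketch(ema_param, param, decay=0.999):
    """One EMA step: keep most of the running average, blend in the new value."""
    return decay * ema_param + (1.0 - decay) * param


# Example: the EMA copy trails the raw weights as they change.
_w_ema, _w = np.zeros(3), np.array([1.0, 2.0, 3.0])
for _ in range(10):
    _w_ema = ema_update_sketch(_w_ema, _w)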
def main(): parser = argparse.ArgumentParser(description='Chainer example: VAE') parser.add_argument('--gpu', default=0, type=int, help='GPU ID (negative value indicates CPU)') parser.add_argument('--output_dir', '-o', default='result_mvae/', help='Directory to output the result') parser.add_argument('--epochs', '-e', default=100, type=int, help='Number of epochs') parser.add_argument('--dimz', '-z', default=8, type=int, help='Dimention of encoded vector') parser.add_argument('--batchsize', '-batch', type=int, default=32, help='Learning minibatch size') parser.add_argument('--beta', '-b', default=1, help='Beta coefficient for the KL loss') parser.add_argument( '--gamma_obj', '-gO', default=1, help='Gamma coefficient for the OBJECT classification loss') parser.add_argument( '--gamma_rel', '-gR', default=1, help='Gamma coefficient for the RELATIONAL classification loss') parser.add_argument('--alpha', '-a', default=1, help='Alpha coefficient for the reconstruction loss') parser.add_argument( '--freq', '-f', default=1000, help='Frequency at which snapshots of the model are saved.') parser.add_argument('--augment_counter', type=int, default=0, help='Number ot times to augment the train data') parser.add_argument('--objects_n', default=2, type=int, help='# of objects to be used') args = parser.parse_args() if not osp.isdir(osp.join(args.output_dir)): os.makedirs(args.output_dir) if not osp.isdir(osp.join(args.output_dir, 'models')): os.makedirs(osp.join(args.output_dir, 'models')) print('\n###############################################') print('# GPU: \t\t\t{}'.format(args.gpu)) print('# dim z: \t\t{}'.format(args.dimz)) print('# Minibatch-size: \t{}'.format(args.batchsize)) print('# Epochs: \t\t{}'.format(args.epochs)) print('# Beta: \t\t{}'.format(args.beta)) print('# Gamma OBJ: \t\t{}'.format(args.gamma_obj)) print('# Gamma REL: \t\t{}'.format(args.gamma_rel)) print('# Frequency: \t\t{}'.format(args.freq)) print('# Out Folder: \t\t{}'.format(args.output_dir)) print('###############################################\n') stats = { 'train_loss': [], 'train_rec_loss': [], 'train_kl': [], 'train_label_obj_acc': [], 'train_label_obj_loss': [], 'train_label_rel_acc': [], 'train_label_rel_loss': [], 'valid_loss': [], 'valid_rec_loss': [], 'valid_kl': [], 'valid_label_obj_acc': [], 'valid_label_obj_loss': [], 'valid_label_rel_acc': [], 'valid_label_rel_loss': [] } models_folder = os.path.join(args.output_dir, "models") folder_names = [ 'yordan_experiments/off-on', 'yordan_experiments/nonfacing-facing', 'yordan_experiments/out-in' ] # folder_names = ['yordan_experiments/off-on'] generator = data_generator.DataGenerator(folder_names=folder_names,\ data_split=0.8) train, train_labels, train_concat, train_vectors, test, test_labels, test_concat, test_vectors,\ unseen, unseen_labels, unseen_concat, unseen_vectors,\ groups_obj, groups_rel = generator.generate_dataset(args=args) data_dimensions = train.shape print('\n###############################################') print("DATA_LOADED") print("# Training Images: \t\t{0}".format(train.shape)) print("# Testing Images: \t\t{0}".format(test.shape)) print("# Unseen Images: \t\t{0}".format(unseen.shape)) print("# Training Rel Labels: \t\t{0}".format(train_labels.shape)) print("# Testing Rel Labels: \t\t{0}".format(test_labels.shape)) print("# Unseen Rel Labels: \t\t{0}".format(unseen_labels.shape)) print("# Training Rel Vectors: \t\t{0}".format(train_vectors.shape)) print("# Testing Rel Vectors: \t\t{0}".format(test_vectors.shape)) 
print('###############################################\n') if len(train_concat[1]) > 0: print("# Relation Label Stats:") for group_idx, group in groups_rel.items(): print("# Group: \t\t{0} : {1}".format(group_idx, group)) for label_idx, label in enumerate(group + ["unlabelled"]): print("#{0} Train: \t\t{1}".format( label, len(filter(lambda x: label == x[group_idx], train_labels)))) print("#{0} Test: \t\t{1}".format( label, len(filter(lambda x: label == x[group_idx], test_labels)))) print('###############################################\n') if len(train_concat[3]) > 0: print("# Object Label Stats:") train_object_vectors = np.array([ train_concat[i][3][j] for i in range(len(train_concat)) for j in range(args.objects_n) ]) test_object_vectors = np.array([ test_concat[i][3][j] for i in range(len(test_concat)) for j in range(args.objects_n) ]) train_object_vector_masks = np.array([ train_concat[i][4][j] for i in range(len(train_concat)) for j in range(args.objects_n) ]) test_object_vector_masks = np.array([ test_concat[i][4][j] for i in range(len(test_concat)) for j in range(args.objects_n) ]) for group_idx, group in groups_obj.items(): print("# Group: \t\t{0} : {1}".format(group_idx, group)) for label_idx, label in enumerate(group): print("#{0} Train: \t\t{1}".format( label, len( filter( lambda (x, y): label_idx == x[group_idx] and y[ group_idx] != 0, zip(train_object_vectors, train_object_vector_masks))))) print("#{0} Test: \t\t{1}".format( label, len( filter( lambda (x, y): label_idx == x[group_idx] and y[ group_idx] != 0, zip(test_object_vectors, test_object_vector_masks))))) for label_idx, label in enumerate(["unlabelled"]): print("#{0} Train: \t\t{1}".format( label, len( filter( lambda (x, y): label_idx == x[group_idx] and y[ group_idx] == 0, zip(train_object_vectors, train_object_vector_masks))))) print("#{0} Test: \t\t{1}".format( label, len( filter( lambda (x, y): label_idx == x[group_idx] and y[ group_idx] == 0, zip(test_object_vectors, test_object_vector_masks))))) print('###############################################\n') train_iter = chainer.iterators.SerialIterator(train_concat, args.batchsize) test_iter = chainer.iterators.SerialIterator(test_concat, args.batchsize, repeat=False, shuffle=False) model = net.Conv_MVAE(train.shape[1], latent_n=args.dimz, groups_obj=groups_obj, groups_rel=groups_rel, alpha=args.alpha, beta=args.beta, gamma_obj=args.gamma_obj, gamma_rel=args.gamma_rel, objects_n=args.objects_n) if args.gpu >= 0: # Make a specified GPU current chainer.cuda.get_device_from_id(args.gpu).use() model.to_gpu() # Setup an optimizer optimizer = chainer.optimizers.Adam() # optimizer = chainer.optimizers.RMSprop() optimizer.setup(model) optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0005)) # optimizer.add_hook(chainer.optimizer_hooks.GradientClipping(0.00001)) updater = training.StandardUpdater(train_iter, optimizer, loss_func=model.lf, device=args.gpu) trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.output_dir) trainer.extend(extensions.Evaluator(test_iter, model, eval_func=model.lf, device=args.gpu), name="val", trigger=(1, 'epoch')) trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) trainer.extend(extensions.PrintReport([ 'epoch', \ 'main/rec_l', 'val/main/rec_l', \ 'val/main/kl', \ 'main/obj_a','val/main/obj_a', \ 'main/rel_a','val/main/rel_a', \ 'main/obj_l', \ 'val/main/obj_l', \ 'main/rel_l',\ 'val/main/rel_l'])) trainer.extend(extensions.PlotReport(['main/rec_l', \ 'val/main/rec_l'], \ x_key='epoch', file_name='rec_loss.png', 
marker=None)) trainer.extend(extensions.PlotReport(['main/kl', \ 'val/main/kl'], \ x_key='epoch', file_name='kl.png', marker=None)) trainer.extend(extensions.PlotReport(['main/obj_a', \ 'val/main/obj_a'], \ x_key='epoch', file_name='object_acc.png', marker=None)) trainer.extend(extensions.PlotReport(['main/obj_l', \ 'val/main/obj_l'], \ x_key='epoch', file_name='object_loss.png', marker=None)) trainer.extend(extensions.PlotReport(['main/rel_a', \ 'val/main/rel_a'], \ x_key='epoch', file_name='relation_acc.png', marker=None)) trainer.extend(extensions.PlotReport(['main/rel_l', \ 'val/main/rel_l'], \ x_key='epoch', file_name='relation_loss.png', marker=None)) # trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.extend(extensions.FailOnNonNumber()) trainer.extend(extensions.snapshot( filename='snapshot_epoch_{.updater.epoch}.trainer'), trigger=(args.epochs, 'epoch')) trainer.extend(extensions.snapshot_object( model, filename='snapshot_epoch_{.updater.epoch}.model'), trigger=(10, 'epoch')) trainer.extend(extensions.snapshot_object(model, 'final.model'), trigger=(args.epochs, 'epoch')) trainer.extend(model.check_loss_coefficients(), trigger=(1, 'epoch')) trainer.extend(extensions.ExponentialShift('alpha', 0.5, init=1e-3, target=1e-8), trigger=(args.epochs / 2, 'epoch')) # For Adam trainer.run()
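
# -----------------------------------------------------------------------------
# The loss function used above (model.lf) lives in net.Conv_MVAE and is not
# shown here. The sketch below is only a generic example of how the alpha and
# beta coefficients parsed by the argument parser typically weight a VAE
# objective; the function name and the Bernoulli reconstruction term are
# assumptions, not the repository's actual loss.
import chainer.functions as F


def weighted_vae_loss_sketch(x, recon_logits, mu, ln_var, alpha=1.0, beta=1.0):
    """alpha * reconstruction NLL + beta * KL(q(z|x) || N(0, I)), per sample."""
    batch_size = x.shape[0]
    # bernoulli_nll expects pre-sigmoid logits for the reconstruction.
    rec = F.bernoulli_nll(x, recon_logits) / batch_size
    kl = F.gaussian_kl_divergence(mu, ln_var) / batch_size
    return alpha * rec + beta * kl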
def main3(): parser = argparse.ArgumentParser() parser.add_argument('--gpu_id', '-g', type=int, default=0) parser.add_argument('--batch_size', '-b', type=int, default=60) parser.add_argument('--test_split', type=float, default=0.2) parser.add_argument( '--real_test', dest='real_test', action='store_true', help='Whether to split the data or use a complete new trial.') parser.add_argument('--max_epoch', '-e', type=int, default=110) parser.add_argument('--resume', '-r', type=int, default=None) parser.add_argument( '--out_dir', '-o', type=str, default= '/mnt/7ac4c5b9-8c05-451f-9e6d-897daecb7442/gears/results_gsm/result_right_arm2' ) args = parser.parse_args() model = GoalScoreModel() frames, labels = load_all_data(prep_f=model.prepare) frames, labels = igp.unison_shuffled_copies(frames, labels) print('Frames shape: ', frames.shape, ' Labels shape: ', labels.shape) data = chainer.datasets.TupleDataset(frames, labels) #.to_device(gpu_id) print('Dataset length: ', data._length) print('Frame size: ', data[0][0].shape, data[0][0].dtype) if args.real_test: print('Using test trial.') train_iter = iterators.SerialIterator(data, args.batch_size, shuffle=True) # Load the test data test_frames, test_labels = load_frames_labels( ids=[11], filestype=''.join((args.data_base_dir, args.data_file_pattern)), blackout=args.blackout) data_test = chainer.datasets.TupleDataset(test_frames, test_labels) test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) else: data_test, data_train = split_dataset(data, int(args.test_split * len(data))) train_iter = iterators.SerialIterator(data_train, args.batch_size, shuffle=True) test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) if args.gpu_id >= 0: chainer.backends.cuda.get_device_from_id(args.gpu_id).use() model.to_gpu(args.gpu_id) # Create the optimizer for the model optimizer = optimizers.Adam().setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(rate=1e-6)) updater = training.StandardUpdater(train_iter, optimizer, loss_func=model.calc_loss, device=args.gpu_id) # Full training print('Full model training ...') trainer = training.Trainer(updater, (args.max_epoch, 'epoch'), out=args.out_dir) trainer.extend(extensions.Evaluator(test_iter, model, eval_func=model.calc_loss, device=args.gpu_id), name='val', trigger=(1, 'epoch')) trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) trainer.extend( extensions.PrintReport([ 'epoch', 'iteration', 'main/loss', 'main/mae', 'main/gnll', 'main/weighted', 'main/VAE', 'main/VAE_REC', 'main/VAE_KL', 'val/main/loss', 'val/main/mae', 'val/main/weighted', 'elapsed_time' ]) ) #, 'val/main/VAE', 'main/loss', 'validation/main/loss', 'elapsed_time'], )) trainer.extend( extensions.PlotReport( ['main/mae', 'val/main/mae', 'main/VAE', 'val/main/VAE'], x_key='epoch', file_name='loss.png', marker=None)) trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.extend(extensions.FailOnNonNumber()) # Save every X epochs trainer.extend(extensions.snapshot( filename='snapshot_epoch_{.updater.epoch}.trainer'), trigger=(200, 'epoch')) trainer.extend(extensions.snapshot_object( model, '%s_model_epoch_{.updater.epoch}.model' % (model.__class__.__name__)), trigger=(10, 'epoch')) trainer.extend(utils.display_image(model.vae_image, data_test, args.out_dir, args.gpu_id, n=3), trigger=(1, 'epoch')) trainer.extend(extensions.ExponentialShift('alpha', 0.5, init=1e-3, target=1e-8), trigger=(100, 'epoch')) # 
    # Resume from a specified snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
    print('Done.')
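
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of main3() above): the ExponentialShift
# extension registered with trigger=(100, 'epoch') multiplies Adam's ``alpha``
# by 0.5 each time it fires, starting from ``init`` and never going below
# ``target``. The helper below just reproduces that arithmetic for inspection;
# its name is an assumption.
def _sketch_alpha_after(n_triggers, init=1e-3, rate=0.5, target=1e-8):
    """Value of the shifted hyperparameter after ``n_triggers`` firings."""
    return max(init * rate ** n_triggers, target)


# Example: after 5 firings alpha is 1e-3 * 0.5 ** 5 = 3.125e-05.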
def main(hpt):
    logger.info('load dataset')
    train, valid, test = dataset.get_dataset(hpt.dataset.type, **hpt.dataset)
    assert valid is None
    assert test is None
    if hpt.general.test:
        train, _ = chainer.datasets.split_dataset(train, 100)
        chainer.set_debug(True)
    train_iter = chainer.iterators.SerialIterator(train,
                                                  hpt.training.batch_size)

    logger.info('build model')
    loss = get_model(hpt)
    if hpt.general.gpu >= 0:
        loss.to_gpu(hpt.general.gpu)

    logger.info('setup optimizer')
    if hpt.optimizer.type == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=hpt.optimizer.lr)
    elif hpt.optimizer.type == 'adagrad':
        optimizer = chainer.optimizers.AdaGrad(lr=hpt.optimizer.lr)
    else:
        raise AttributeError
    optimizer.setup(loss)

    logger.info('setup updater/trainer')
    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                device=hpt.general.gpu,
                                                loss_func=loss)
    trainer = training.Trainer(updater,
                               (hpt.training.iteration, 'iteration'),
                               out=po.namedir(output='str'))
    lr_name = 'alpha' if hpt.optimizer.type == 'adam' else 'lr'
    trainer.extend(
        Burnin(lr_name, burnin_step=hpt.training.burnin_step,
               c=hpt.training.c))
    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(extensions.snapshot_object(
        loss, 'loss_snapshot_iter_{.updater.iteration}'),
        trigger=(int(hpt.training.iteration / 5), 'iteration'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'main/kl_target',
            'main/kl_negative', 'lr', 'main/bound', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    # Save plot images to the result dir
    if (not hpt.general.noplot) and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss'], 'epoch',
                                  file_name=(po.imagesdir() /
                                             'loss.png').as_posix()))
        trainer.extend(
            extensions.PlotReport(['main/kl_target', 'main/kl_negative'],
                                  'epoch',
                                  file_name=(po.imagesdir() /
                                             'kldiv.png').as_posix()))

    # Run the training
    logger.info('run training')
    trainer.run()

    logger.info('evaluate')
    metrics = evaluate(hpt, train, test, loss)
    for metric_name, metric in metrics.items():
        logger.info('{}: {:.4f}'.format(metric_name, metric))
    return metrics
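
# -----------------------------------------------------------------------------
# ``Burnin`` above is a project-specific extension that is not shown in this
# file. The sketch below is only a generic example of how such a warm-up
# extension can be written with chainer.training.make_extension; the ramp
# schedule, parameter names, and factory name are assumptions, not the actual
# Burnin implementation.
import chainer


def make_burnin_sketch(attr='lr', warmup_iters=1000, scale0=0.1):
    """Linearly ramp an optimizer hyperparameter from scale0 * x up to x."""

    base_value = {}

    @chainer.training.make_extension(trigger=(1, 'iteration'))
    def _burnin(trainer):
        optimizer = trainer.updater.get_optimizer('main')
        if 'value' not in base_value:
            base_value['value'] = getattr(optimizer, attr)
        frac = min(1.0, trainer.updater.iteration / float(warmup_iters))
        setattr(optimizer, attr,
                base_value['value'] * (scale0 + (1.0 - scale0) * frac))

    return _burnin


# Usage (hypothetical): trainer.extend(make_burnin_sketch('alpha'))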
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("--multi-node", action="store_true", help="multi node") parser.add_argument("--out", help="output directory") parser.add_argument("--debug", action="store_true", help="debug mode") parser.add_argument("--gpu", type=int, default=0, help="gpu id") parser.add_argument("--seed", type=int, default=0, help="random seed") parser.add_argument( "--lr", type=float, default=0.0001, help="learning rate", ) parser.add_argument( "--max-epoch", type=int, default=30, help="max epoch", ) parser.add_argument( "--call-evaluation-before-training", action="store_true", help="call evaluation before training", ) def argparse_type_class_ids(string): if string == "all": n_class = len(morefusion.datasets.ycb_video.class_names) class_ids = np.arange(n_class)[1:].tolist() elif string == "asymmetric": class_ids = ( morefusion.datasets.ycb_video.class_ids_asymmetric.tolist()) elif string == "symmetric": class_ids = ( morefusion.datasets.ycb_video.class_ids_symmetric.tolist()) else: class_ids = [int(x) for x in string.split(",")] return class_ids parser.add_argument( "--class-ids", type=argparse_type_class_ids, default="all", help="class id (e.g., 'all', 'asymmetric', 'symmetric', '1,6,9')", ) parser.add_argument( "--pretrained-model", help="pretrained model", ) parser.add_argument( "--with-occupancy", action="store_true", help="with occupancy", ) parser.add_argument( "--note", help="note", ) parser.add_argument( "--pretrained-resnet18", action="store_true", help="pretrained resnet18", ) parser.add_argument( "--resume", help="resume", ) parser.add_argument( "--loss", choices=[ "add/add_s", "add/add_s+occupancy", "add->add_s|1", "add->add/add_s|1", "add->add/add_s|1+occupancy", ], default="add->add/add_s|1", help="loss", ) parser.add_argument( "--loss-scale", type=json.loads, default={"occupancy": 1.0}, help="loss scale", ) args = parser.parse_args() chainer.global_config.debug = args.debug # ------------------------------------------------------------------------- # device initialization if args.multi_node: import chainermn comm = chainermn.create_communicator("pure_nccl") device = comm.intra_rank n_gpu = comm.size else: device = args.gpu n_gpu = 1 if not args.multi_node or comm.rank == 0: now = datetime.datetime.now(datetime.timezone.utc) args.timestamp = now.isoformat() args.hostname = socket.gethostname() args.githash = morefusion.utils.githash(__file__) termcolor.cprint("==> Started training", attrs={"bold": True}) if args.out is None: if not args.multi_node or comm.rank == 0: args.out = osp.join(here, "logs", now.strftime("%Y%m%d_%H%M%S.%f")) else: args.out = None if args.multi_node: args.out = comm.bcast_obj(args.out) if device >= 0: chainer.cuda.get_device_from_id(device).use() # seed initialization random.seed(args.seed) np.random.seed(args.seed) if device >= 0: chainer.cuda.cupy.random.seed(args.seed) # dataset initialization data_train = None data_valid = None if not args.multi_node or comm.rank == 0: termcolor.cprint("==> Dataset size", attrs={"bold": True}) data_ycb_trainreal = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed( # NOQA "trainreal", class_ids=args.class_ids, augmentation=True) data_ycb_syn = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed( # NOQA "syn", class_ids=args.class_ids, augmentation=True) data_ycb_syn = morefusion.datasets.RandomSamplingDataset( data_ycb_syn, len(data_ycb_trainreal)) data_my_train = 
morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed( # NOQA "train", class_ids=args.class_ids, augmentation=True) data_train = chainer.datasets.ConcatenatedDataset( data_ycb_trainreal, data_ycb_syn, data_my_train) print(f"ycb_trainreal={len(data_ycb_trainreal)}, " f"ycb_syn={len(data_ycb_syn)}, my_train={len(data_my_train)}") del data_ycb_trainreal, data_ycb_syn, data_my_train data_ycb_val = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed( # NOQA "val", class_ids=args.class_ids) data_my_val = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed( # NOQA "val", class_ids=args.class_ids) data_valid = chainer.datasets.ConcatenatedDataset( data_ycb_val, data_my_val, ) print(f"ycb_val={len(data_ycb_val)}, my_val={len(data_my_val)}") del data_ycb_val, data_my_val data_train = chainer.datasets.TransformDataset( data_train, Transform(train=True, with_occupancy=args.with_occupancy), ) data_valid = chainer.datasets.TransformDataset( data_valid, Transform(train=False, with_occupancy=args.with_occupancy), ) print(f"train={len(data_train)}, valid={len(data_valid)}") if args.multi_node: data_train = chainermn.scatter_dataset(data_train, comm, shuffle=True, seed=args.seed) data_valid = chainermn.scatter_dataset(data_valid, comm, shuffle=False, seed=args.seed) args.class_names = morefusion.datasets.ycb_video.class_names.tolist() loss = args.loss if loss in ["add->add_s|1", "add->add/add_s|1"]: loss = "add" elif loss == "add->add/add_s|1+occupancy": loss = "add+occupancy" # model initialization model = singleview_3d.models.Model( n_fg_class=len(args.class_names[1:]), pretrained_resnet18=args.pretrained_resnet18, with_occupancy=args.with_occupancy, loss=loss, loss_scale=args.loss_scale, ) if args.pretrained_model is not None: chainer.serializers.load_npz(args.pretrained_model, model) if device >= 0: model.to_gpu() # optimizer initialization optimizer = chainer.optimizers.Adam(alpha=args.lr) if args.multi_node: optimizer = chainermn.create_multi_node_optimizer(optimizer, comm) optimizer.setup(model) if args.pretrained_resnet18: model.resnet_extractor.init_block.disable_update() model.resnet_extractor.res2.disable_update() for link in model.links(): if isinstance(link, chainer.links.BatchNormalization): link.disable_update() if not args.multi_node or comm.rank == 0: termcolor.cprint("==> Link update rules", attrs={"bold": True}) for name, link in model.namedlinks(): print(name, link.update_enabled) # iterator initialization iter_train = chainer.iterators.MultithreadIterator( data_train, batch_size=16 // n_gpu, repeat=True, shuffle=True, ) iter_valid = chainer.iterators.MultithreadIterator( data_valid, batch_size=48, repeat=False, shuffle=False, ) updater = chainer.training.StandardUpdater( iterator=iter_train, optimizer=optimizer, device=device, ) if not args.multi_node or comm.rank == 0: writer = tensorboardX.SummaryWriter(log_dir=args.out) writer_with_updater = morefusion.training.SummaryWriterWithUpdater( writer) writer_with_updater.setup(updater) # ------------------------------------------------------------------------- trainer = chainer.training.Trainer(updater, (args.max_epoch, "epoch"), out=args.out) trainer.extend(E.FailOnNonNumber()) @chainer.training.make_extension(trigger=(1, "iteration")) def update_loss(trainer): updater = trainer.updater optimizer = updater.get_optimizer("main") target = optimizer.target assert trainer.stop_trigger.unit == "epoch" if args.loss == "add->add/add_s|1": if updater.epoch_detail < 1: assert target._loss 
== "add" else: target._loss = "add/add_s" elif args.loss == "add->add_s|1": if updater.epoch_detail < 1: assert target._loss == "add" else: target._loss = "add_s" elif args.loss == "add->add/add_s|1+occupancy": if updater.epoch_detail < 1: assert target._loss == "add+occupancy" else: target._loss = "add/add_s+occupancy" else: assert args.loss in ["add/add_s", "add/add_s+occupancy"] return trainer.extend(update_loss) log_interval = 10, "iteration" eval_interval = 0.25, "epoch" # evaluate evaluator = morefusion.training.extensions.PoseEstimationEvaluator( iterator=iter_valid, target=model, device=device, progress_bar=True, ) if args.multi_node: evaluator.comm = comm trainer.extend( evaluator, trigger=eval_interval, call_before_training=args.call_evaluation_before_training, ) if not args.multi_node or comm.rank == 0: # print arguments msg = pprint.pformat(args.__dict__) msg = textwrap.indent(msg, prefix=" " * 2) termcolor.cprint("==> Arguments", attrs={"bold": True}) print(f"\n{msg}\n") trainer.extend( morefusion.training.extensions.ArgsReport(args), call_before_training=True, ) # snapshot trigger_best_add = chainer.training.triggers.MinValueTrigger( key="validation/main/add_or_add_s", trigger=eval_interval, ) trigger_best_auc = chainer.training.triggers.MaxValueTrigger( key="validation/main/auc/add_or_add_s", trigger=eval_interval, ) trainer.extend( E.snapshot(filename="snapshot_trainer_latest.npz"), trigger=eval_interval, ) trainer.extend( E.snapshot_object(model, filename="snapshot_model_latest.npz"), trigger=eval_interval, ) trainer.extend( E.snapshot_object(model, filename="snapshot_model_best_add.npz"), trigger=trigger_best_add, ) trainer.extend( E.snapshot_object(model, filename="snapshot_model_best_auc.npz"), trigger=trigger_best_auc, ) # log trainer.extend( morefusion.training.extensions.LogTensorboardReport( writer=writer, trigger=log_interval, ), call_before_training=True, ) trainer.extend( E.PrintReport( [ "epoch", "iteration", "elapsed_time", "main/loss", "main/add_or_add_s", "validation/main/auc/add_or_add_s", ], log_report="LogTensorboardReport", ), trigger=log_interval, call_before_training=True, ) trainer.extend(E.ProgressBar(update_interval=1)) # ------------------------------------------------------------------------- if args.resume: chainer.serializers.load_npz(args.resume, trainer) trainer.run()
def train( lr=0.001, device=0, epoch=10, h_size=1000, b_size=100, weight_decay=0.0005, margin=1., saveto='output/checkpoint/', text_net='cnn', ocr_type='cloudvision', model_name='ocr', san_check=False, early_stopping=False, remove_stopwords=False, ): chainer.config.remove_stopwords = remove_stopwords args = locals() if not os.path.exists(saveto): os.makedirs(saveto) json.dump(args, open(os.path.join(saveto, 'args'), 'w')) log_interval = (10, 'iteration') val_interval = (1, 'epoch') dataset = DatasetOCR('train', ocr_type=ocr_type, san_check=san_check) train, val = chainer.datasets.split_dataset_random(dataset, first_size=int( len(dataset) * .9), seed=1234) print('train: %i, val: %i' % (len(train), len(val))) train_itr = chainer.iterators.SerialIterator(train, batch_size=b_size) val_itr = chainer.iterators.SerialIterator(val, batch_size=b_size, repeat=False, shuffle=False) if remove_stopwords: wvec_f = 'data/wordvec_wo_stopwords.npy' else: wvec_f = 'data/wordvec.npy' word_vec = np.load(wvec_f) if text_net == 'cnn': lng_net = TextCNN(len(dataset.tokenizer.word_index) + 1, word_vec) elif text_net == 'lstm': lng_net = TextLSTM(len(dataset.tokenizer.word_index) + 1, word_vec) else: raise RuntimeError('invalid text_net') if model_name == 'ocr': model = NonVisualNet(lng_net, h_size=h_size, margin=margin) elif model_name == 'ocr+vis': att_net = AttentionNetWTL(h_size=100) model = Net(lng_net, att_net) else: raise RuntimeError if device is not None: chainer.cuda.get_device_from_id(device).use() model.to_gpu() optimizer = chainer.optimizers.Adam(alpha=lr) optimizer.use_cleargrads() optimizer.setup(model) if text_net == 'lstm': optimizer.add_hook(chainer.optimizer.GradientClipping(5), name='grad_clip') if weight_decay is not None: optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay), name='weight_decay') updater = training.StandardUpdater(train_itr, optimizer, converter=my_converter, device=device) stop_trigger = (epoch, 'epoch') if early_stopping: stop_trigger = training.triggers.EarlyStoppingTrigger( monitor='validation/main/r@1', patients=2, mode='max', verbose=True, max_trigger=(epoch, 'epoch')) trainer = training.Trainer(updater, stop_trigger, saveto) trainer.extend(extensions.FailOnNonNumber()) trainer.extend(extensions.Evaluator(val_itr, model, converter=my_converter, device=device), trigger=val_interval) if not san_check: trainer.extend(extensions.ExponentialShift('alpha', 0.5), trigger=(5, 'epoch')) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.ProgressBar(update_interval=10)) best_val_trigger = training.triggers.MaxValueTrigger( 'validation/main/ranking_score', trigger=val_interval) trainer.extend(extensions.snapshot_object(model, 'model'), trigger=best_val_trigger) trainer.run() return best_val_trigger._best_value
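
# -----------------------------------------------------------------------------
# The ``margin`` argument above is forwarded to NonVisualNet, whose loss is
# defined elsewhere in the repository. Purely as a generic illustration (not
# the repository's actual objective), a margin-based ranking loss in Chainer
# usually looks like the sketch below.
import chainer.functions as F


def margin_ranking_loss_sketch(pos_score, neg_score, margin=1.0):
    """Hinge loss pushing positive scores above negatives by ``margin``."""
    return F.mean(F.relu(margin - pos_score + neg_score))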
def set_event_handler(self):
    self.set_target()
    # (Not Implemented)Evaluator(train)
    self.trainer.extend(extensions.Evaluator(
        self.valid_loader,
        self.target,
        converter=self.converter,
        device=self.device,
    ),
        trigger=(self.eval_interval, 'epoch'),
        call_before_training=self.call_before_training)
    self.trainer.extend(extensions.ProgressBar())
    self.trainer.extend(extensions.observe_lr())
    # self.trainer.extend(extensions.MicroAverage('loss', 'lr', 'mav'))
    self.trainer.extend(
        extensions.LogReport(trigger=(self.log_interval, 'epoch')),
        call_before_training=self.call_before_training)
    self.trainer.extend(extensions.FailOnNonNumber())
    # self.trainer.extend(extensions.ExponentialShift('lr', rate=0.9))
    self.trainer.extend(
        extensions.ExponentialShift('lr', rate=0.99, init=self.lr * 10.0))
    # (Not Implemented)InverseShift
    # (Not Implemented)LinearShift
    # (Not Implemented)MultistepShift
    # (Not Implemented)PolynomialShift
    # (Not Implemented)StepShift
    # (Not Implemented)WarmupShift
    self.trainer.extend(
        extensions.ParameterStatistics(self.model,
                                       trigger=(self.eval_interval, 'epoch')))
    self.trainer.extend(extensions.VariableStatisticsPlot(self.model))
    self.trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
        'validation/main/accuracy', 'elapsed_time'
    ]),
        call_before_training=self.call_before_training)
    self.trainer.extend(extensions.PlotReport(
        ['main/loss', 'validation/main/loss'], 'epoch', file_name='loss.png'),
        call_before_training=self.call_before_training)
    self.trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'validation/main/accuracy'], 'epoch',
        file_name='accuracy.png'),
        call_before_training=self.call_before_training)
    self.trainer.extend(extensions.snapshot(n_retains=self.retain_num),
                        trigger=(self.log_interval, 'epoch'))
    self.set_additonal_event_handler()
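
# -----------------------------------------------------------------------------
# set_event_handler() passes ``self.converter`` to the Evaluator; the
# converter itself is defined elsewhere in the class. A minimal custom
# converter usually just wraps chainer.dataset.concat_examples, as in the
# sketch below. The name and the assumption that each example is an (x, t)
# pair are illustrative only.
import chainer


def converter_sketch(batch, device=None):
    """Stack a list of (x, t) pairs into batched arrays on ``device``."""
    # concat_examples pads and stacks each component of the example tuples.
    x, t = chainer.dataset.concat_examples(batch, device=device)
    return x, t


# Usage (hypothetical):
#     extensions.Evaluator(loader, target, converter=converter_sketch)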
def main3(): parser = argparse.ArgumentParser() parser.add_argument('--gpu_id', '-g', type=int, default=1) parser.add_argument('--batch_size', '-b', type=int, default=100) parser.add_argument('--test_split', type=float, default=0.2) parser.add_argument( '--real_test', dest='real_test', action='store_true', help='Whether to split the data or use a complete new trial.') parser.add_argument('--mdn_hidden-units', '-u', type=int, default=24) parser.add_argument('--mdn_gaussian-mixtures', '-m', type=int, default=24) parser.add_argument('--max_epoch', '-e', type=int, default=250) parser.add_argument('--resume', '-r', type=int, default=None) parser.add_argument('--out_dir', '-o', type=str, default='results/result_test') parser.add_argument('--data_base_dir', type=str, default='/media/daniel/data/hhc/') parser.add_argument('--data_file_pattern', '-f', type=str, default='trial{}.avi') args = parser.parse_args() # frames, labels = load_frames_labels(filestype='/media/daniel/data/hhc/trial{}_r_forearm.avi') frames, labels = load_frames_labels(filestype=''.join( (args.data_base_dir, args.data_file_pattern)), verbose=0) frames, labels = unison_shuffled_copies(frames, labels) print('Frames shape: ', frames.shape, ' Labels shape: ', labels.shape) data = chainer.datasets.TupleDataset(frames, labels) #.to_device(gpu_id) print('Dataset length: ', data._length) print('Frame size: ', data[0][0].shape, data[0][0].dtype) if args.real_test: print('Using test trial.') train_iter = iterators.SerialIterator(data, args.batch_size, shuffle=True) # Load the test data test_frames, test_labels = load_frames_labels( ids=[11], filestype=''.join((args.data_base_dir, args.data_file_pattern))) test_data = chainer.datasets.TupleDataset(test_frames, test_labels) test_iter = iterators.SerialIterator(test_data, args.batch_size, repeat=False, shuffle=False) else: data_test, data_train = split_dataset(data, int(args.test_split * len(data))) train_iter = iterators.SerialIterator(data_train, args.batch_size, shuffle=True) test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) model = GoalScoreModel() if args.gpu_id >= 0: chainer.backends.cuda.get_device_from_id(args.gpu_id).use() model.to_gpu(args.gpu_id) # labels = chainer.dataset.to_device(args.gpu_id, labels) # frames = chainer.dataset.to_device(args.gpu_id, frames) # Create the optimizer for the model optimizer = optimizers.Adam().setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(rate=1e-6)) # optimizer.add_hook(chainer.optimizer_hooks.GradientHardClipping(-.1, .1)) # xp = chainer.backend.get_array_module(data_train) # optimizer.update(model.calc_loss, xp.asarray([data_train[0][0]]), xp.asarray([data_train[0][1]])) # import chainer.computational_graph as c # g = c.build_computational_graph(model.calc_loss) # with open('results/graph.dot', 'w') as o: # o.write(g.dump()) updater = training.StandardUpdater(train_iter, optimizer, loss_func=model.calc_loss, device=args.gpu_id) # updater = training.ParallelUpdater(train_iter, optimizer, # loss_func=model.calc_loss, # devices={'main': args.gpu_id, 'second': 1}) # Pre-training print('Pretraining started.') trainer = training.Trainer(updater, (3, 'epoch'), out=args.out_dir) # Disable update for the head model print('Disabling training of head model.') model.head_model.disable_update() trainer.extend(extensions.ProgressBar()) trainer.extend(extensions.FailOnNonNumber()) trainer.run() # Full training print('Full model training ...') trainer = training.Trainer(updater, (args.max_epoch, 
'epoch'), out=args.out_dir) trainer.extend(extensions.Evaluator(test_iter, model, eval_func=model.calc_loss, device=args.gpu_id), trigger=(1, 'epoch')) trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) trainer.extend( extensions.PrintReport([ 'epoch', 'main/loss', 'main/nll', 'main/mae', 'main/sigma', 'validation/main/loss', 'validation/main/mae', 'validation/main/sigma', 'elapsed_time' ])) #, 'main/loss', 'validation/main/loss', 'elapsed_time'], )) trainer.extend( extensions.PlotReport(['main/mae', 'validation/main/mae'], x_key='epoch', file_name='loss.png', marker=None)) trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.ProgressBar()) trainer.extend(extensions.FailOnNonNumber()) trainer.extend( extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'), trigger=(20, 'epoch')) trainer.extend(extensions.snapshot_object( model, 'model_epoch_{.updater.epoch}.model'), trigger=(20, 'epoch')) # Disable/Enable update for the head model model.head_model.enable_update() # Resume from a specified snapshot if args.resume: chainer.serializers.load_npz(args.resume, trainer) trainer.run() print('Done.')
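
# -----------------------------------------------------------------------------
# GoalScoreModel.calc_loss is defined elsewhere; the reported 'main/nll' and
# 'main/sigma' entries above suggest a heteroscedastic Gaussian objective.
# Purely as a generic illustration (an assumption, not the model's actual
# loss), such a term can be written with chainer.functions.gaussian_nll:
import chainer.functions as F


def gaussian_nll_sketch(t, mean, ln_var):
    """Average negative log-likelihood of targets under N(mean, exp(ln_var))."""
    return F.gaussian_nll(t, mean, ln_var) / t.shape[0]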
def main(): parser = argparse.ArgumentParser() parser.add_argument('--gpu_id', '-g', type=int, default=1) parser.add_argument('--batch_size', '-b', type=int, default=30) parser.add_argument('--test_split', type=float, default=0.2) parser.add_argument('--real_test', dest='real_test', action='store_true', help='Whether to split the data or use a complete new trial.') parser.add_argument('--aug', type=int, default=2, help='How many times to increase the dataset with augmented images.') parser.add_argument('--subset', type=int, default=-1, help='Should we read just `x` number of folders.') parser.add_argument('--skipcount', type=int, default=1, help='Take every `x`-th frame from a sequence.') # parser.add_argument('--blackout', dest='blackout', action='store_true', # help='Whether to blackout part of the image or not.') # parser.add_argument('--mdn_hidden-units', '-u', type=int, default=24) # parser.add_argument('--mdn_gaussian-mixtures', '-m', type=int, default=24) parser.add_argument('--max_epoch', '-e', type=int, default=2500) parser.add_argument('--resume', '-r', type=int, default=None) parser.add_argument('--out_dir', '-o', type=str, default='results/result_test') # parser.add_argument('--data_base_dir', type=str, default='/media/daniel/data/hhc/') # parser.add_argument('--data_file_pattern', '-f', type=str, default='trial{}.avi') args = parser.parse_args() model = InsertGearPolicy() # frames, joints = load_data(prep_f=model.prepare, prepare_joints=model.prepare_joints) # Scale it all frames, joints = load_data(prep_f=model.prepare, prepare_joints=None, subset=args.subset, skipcount=args.skipcount) # Scale only images print('Frames shape: ', frames.shape, ' joints shape: ', joints.shape) from sklearn.model_selection import train_test_split frames_train, frames_test, joints_train, joints_test = train_test_split( frames, joints, test_size=args.test_split, random_state=42) if args.aug > 1: frames_train, joints_train = colour_agumentations(frames_train, joints_train, n=args.aug) print('After augmentation. 
Frames shape: ', frames.shape, ' joints shape: ', joints.shape) data_train = chainer.datasets.TupleDataset(frames_train, joints_train) data_test = chainer.datasets.TupleDataset(frames_test, joints_test) train_iter = iterators.SerialIterator(data_train, args.batch_size, shuffle=True) test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) # frames, joints = unison_shuffled_copies(frames, joints) # data = chainer.datasets.TupleDataset(frames, joints) # print('Dataset length: ', data._length) # print('Frame size: ', data[0][0].shape, data[0][0].dtype) # if args.real_test: # print('Using test trial.') # train_iter = iterators.SerialIterator(data, args.batch_size, shuffle=True) # # Load the test data # print('Not done.') # exit(0) # # test_frames, test_joints = load_frames_labels(ids=[11], filestype=''.join((args.data_base_dir, args.data_file_pattern)), blackout=args.blackout) # data_test = chainer.datasets.TupleDataset(test_frames, test_joints) # test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) # else: # print('Splitting data at ratio: ', args.test_split) # data_test, data_train = split_dataset(data, int(args.test_split*len(data))) # train_iter = iterators.SerialIterator(data_train, args.batch_size, shuffle=True) # test_iter = iterators.SerialIterator(data_test, args.batch_size, repeat=False, shuffle=False) if args.gpu_id >= 0: print('Loading model to gpu', args.gpu_id) chainer.backends.cuda.get_device_from_id(args.gpu_id).use() model.to_gpu(args.gpu_id) # Create the optimizer for the model optimizer = optimizers.Adam().setup(model) # optimizer = optimizers.SGD().setup(model) # optimizer.add_hook(chainer.optimizer.WeightDecay(rate=1e-6)) # optimizer.add_hook(chainer.optimizer_hooks.GradientHardClipping(-.1, .1)) # xp = chainer.backend.get_array_module(data_train) # optimizer.update(model.calc_loss, xp.asarray([data_train[0][0]]), xp.asarray([data_train[0][1]])) # import chainer.computational_graph as c # g = c.build_computational_graph(model.calc_loss) # with open('results/graph.dot', 'w') as o: # o.write(g.dump()) updater = training.StandardUpdater(train_iter, optimizer, loss_func=model.calc_loss, device=args.gpu_id) # Resume from a specified snapshot if args.resume: print('Loading from resume snapshot: ', args.resume, '{}/snapshot_epoch_{}.trainer'.format(args.out_dir, args.resume)) chainer.serializers.load_npz('{}/snapshot_epoch_{}.trainer'.format(args.out_dir, args.resume), trainer) # Pre-training # print('Pretraining started.') # trainer = training.Trainer(updater, (3, 'epoch'), out=args.out_dir) # # Disable update for the head model # print('Disabling training of head model.') # model.encode_model.disable_update() # trainer.extend(extensions.ProgressBar()) # trainer.extend(extensions.FailOnNonNumber()) # trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'main/mae', 'main/VAE', 'validation/main/loss', 'validation/main/mae', 'validation/main/VAE', 'elapsed_time'])) # trainer.extend(utils.display_image(model.vae_image, data_test, args.out_dir, args.gpu_id), trigger=(1, 'epoch')) # trainer.run() # Full training print('Full model training ...') trainer = training.Trainer(updater, (args.max_epoch, 'epoch'), out=args.out_dir) trainer.extend(extensions.Evaluator(test_iter, model, eval_func=model.calc_loss, device=args.gpu_id), name='val', trigger=(1, 'epoch')) # trainer.extend(extensions.Evaluator(test_iter, {'m':model}, eval_func=model.calc_loss, device=args.gpu_id), trigger=(1, 'epoch')) 
trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) trainer.extend(extensions.PrintReport(['epoch', 'iteration', 'main/loss', 'main/mae', 'main/gnll', 'main/weighted', 'main/VAE', 'main/VAE_REC','main/VAE_KL', 'val/main/loss', 'val/main/mae', 'val/main/weighted', 'elapsed_time']))#, 'val/main/VAE', 'main/loss', 'validation/main/loss', 'elapsed_time'], )) trainer.extend(extensions.PlotReport(['main/mae', 'val/main/mae', 'main/VAE', 'val/main/VAE'], x_key='epoch', file_name='loss.png', marker=None)) trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.extend(extensions.FailOnNonNumber()) # Save every X epochs trainer.extend(extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}.trainer'), trigger=(200, 'epoch')) trainer.extend(extensions.snapshot_object(model, '%s_model_epoch_{.updater.epoch}.model' % (model.__class__.__name__)), trigger=(10, 'epoch')) # # Take a best snapshot # record_trigger = training.triggers.MinValueTrigger('validation/main/mae', (1, 'epoch')) # trainer.extend(extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'), trigger=record_trigger) # trainer.extend(extensions.snapshot_object(model, '%s_best_model.npz' % (model.__class__.__name__)), trigger=record_trigger) trainer.extend(utils.display_image(model.vae_image, data_test, args.out_dir, args.gpu_id, n=3), trigger=(1, 'epoch')) # FOR SGD trainer.extend(extensions.ExponentialShift('lr', 0.5, init=1e-4, target=1e-8), trigger=(200, 'epoch')) # trainer.extend(extensions.ExponentialShift('alpha', 0.5, init=1e-3, target=1e-8), trigger=(100, 'epoch')) # Disable/Enable update for the head model model.encode_model.enable_update() trainer.run() print('Done.')
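
# -----------------------------------------------------------------------------
# colour_agumentations() above extends the training set with colour-jittered
# copies; its implementation is not shown in this file. The sketch below is a
# generic brightness/contrast jitter over float image arrays, given only as an
# assumption of what such a helper typically does, not the repository's code.
import numpy as np


def colour_jitter_sketch(frames, labels, n=2, rng=None):
    """Append (n - 1) jittered copies of ``frames`` (N, C, H, W, values in [0, 1])."""
    rng = rng or np.random.RandomState(0)
    out_frames, out_labels = [frames], [labels]
    for _ in range(n - 1):
        scale = rng.uniform(0.8, 1.2, size=(len(frames), 1, 1, 1))  # contrast
        shift = rng.uniform(-0.1, 0.1, size=(len(frames), 1, 1, 1))  # brightness
        out_frames.append(np.clip(frames * scale + shift, 0.0, 1.0))
        out_labels.append(labels)
    return np.concatenate(out_frames), np.concatenate(out_labels)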