def create(self):
    """Build and return the training optimizer.

    Returns a ``CorrectedMomentumSGD`` optimizer with a fixed learning
    rate of 0.1 (momentum uses the Chainer default).
    """
    base_lr = 0.1
    return optimizers.CorrectedMomentumSGD(base_lr)
# NOTE(review): fragment of a longer if/elif optimizer-selection chain — the
# opening `if` branch and the body of the trailing `if args.lasso:` lie outside
# this view; indentation below is reconstructed from the mangled source.
elif args.adabound:
    optimizer = optimizers.AdaBound()
elif args.rmsprop:
    optimizer = optimizers.RMSprop()
elif args.adam:
    optimizer = optimizers.Adam()
elif args.sgd:
    optimizer = optimizers.SGD()
elif args.adagrad:
    optimizer = optimizers.AdaGrad()
elif args.amsgrad:
    optimizer = optimizers.AMSGrad()
elif args.amsbound:
    optimizer = optimizers.AMSBound()
elif args.correctedmomentsgd:
    optimizer = optimizers.CorrectedMomentumSGD()
elif args.nesterovag:
    optimizer = optimizers.NesterovAG()
elif args.msvag:
    optimizer = optimizers.MSVAG()
elif args.rmspropgraves:
    optimizer = optimizers.RMSpropGraves()
elif args.smorms3:
    optimizer = optimizers.SMORMS3()
else:
    # Default when no optimizer flag is supplied.
    optimizer = optimizers.AdaDelta()
optimizer.setup(net)
if args.lasso:  # Sparsify weights via Lasso (L1) regression
def main():
    """Distributed (ChainerMN) training entry point with adaptive loss scaling.

    Sets up the multi-node communicator, loads a directory-parsing image
    dataset, wraps the network with AdaLoss scaling, and runs a Chainer
    Trainer. Snapshots and CSV exports of recorded loss scales / gradient
    stats are produced only on rank 0.
    """
    # Start the multiprocessing environment
    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    # The dummy Process start/join forces initialization of the forkserver
    # before CUDA/MPI state exists in the parent (per the FAQ above).
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    # Set up workspace: 16 GiB cuDNN workspace cap
    # (NOTE(review): an older comment said 12 GB; the code sets 16 * 1024**3).
    chainer.cuda.set_max_workspace_size(16 * 1024 * 1024 * 1024)

    # Setup the multi-node environment; each process drives one GPU,
    # selected by its intra-node rank.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    print(
        '==> Successfully setup communicator: "{}" rank: {} device: {} size: {}'
        .format(args.communicator, comm.rank, device, comm.size))
    set_random_seed(args, device)

    # Setup LR: explicit --lr wins; otherwise scale 0.1 by the global batch
    # size relative to 256.
    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        # TODO: why? — presumably the linear scaling rule of
        # Goyal et al. 2017 (https://arxiv.org/abs/1706.02677); confirm.
        if comm.rank == 0:
            print(
                'LR = {} is selected based on the linear scaling rule'.format(
                    lr))

    # Setup dataset: ImageNet-style directory layout with train/ and val/.
    train_dir = os.path.join(args.dataset_dir, 'train')
    val_dir = os.path.join(args.dataset_dir, 'val')
    label_names = datasets.directory_parsing_label_names(train_dir)
    train_data = datasets.DirectoryParsingLabelDataset(train_dir)
    val_data = datasets.DirectoryParsingLabelDataset(val_dir)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(_mean, args))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(_mean, args))
    print('==> [{}] Successfully finished loading dataset'.format(comm.rank))

    # Initializing dataset iterators: rank 0 builds the index arrays and
    # scatters a disjoint slice to every rank.
    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None
    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]

    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    # Create the model
    kwargs = {}
    if args.first_bn_mixed16 and args.dtype == 'float16':
        print('==> Setting the first BN layer to mixed16')
        kwargs['first_bn_mixed16'] = True

    # Initialize the model (architecture looked up by name in `models`).
    net = models.__dict__[args.arch](n_class=len(label_names), **kwargs)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for l in net.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    # Apply ada loss transform: records sampled loss-scale statistics.
    recorder = AdaLossRecorder(sample_per_n_iter=100)
    # Update the model to support AdaLoss (per-layer adaptive loss scaling).
    net = AdaLossScaled(net,
                        init_scale=args.init_scale,
                        cfg={
                            'loss_scale_method': args.loss_scale_method,
                            'scale_upper_bound': args.scale_upper_bound,
                            'accum_upper_bound': args.accum_upper_bound,
                            'update_per_n_iteration':
                            args.update_per_n_iteration,
                            'recorder': recorder,
                        },
                        transforms=[
                            AdaLossTransformLinear(),
                            AdaLossTransformBottleneck(),
                            AdaLossTransformBasicBlock(),
                            AdaLossTransformConv2DBNActiv(),
                        ],
                        verbose=args.verbose)

    if comm.rank == 0:
        # print network only in the 1-rank machine
        print(net)

    net = L.Classifier(net)
    hook = AdaLossMonitor(sample_per_n_iter=100,
                          verbose=args.verbose,
                          includes=['Grad', 'Deconvolution'])

    # Setup optimizer: gradients are all-reduced across ranks.
    optim = chainermn.create_multi_node_optimizer(
        optimizers.CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    if args.dtype == 'mixed16':
        print('==> Using FP32 update for dtype=mixed16')
        optim.use_fp32_update()  # by default use fp32 update

    # HACK: support skipping update by existing loss scaling functionality
    if args.dynamic_interval is not None:
        optim.loss_scaling(interval=args.dynamic_interval, scale=None)
    else:
        optim.loss_scaling(interval=float('inf'), scale=None)
        optim._loss_scale_max = 1.0  # to prevent actual loss scaling
    optim.setup(net)

    # setup weight decay on every parameter except BN beta/gamma.
    for param in net.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    # allocate model to multiple GPUs (one GPU per process).
    if device >= 0:
        chainer.cuda.get_device(device).use()
        net.to_gpu()

    # Create an updater that implements how to update based on one train_iter
    # input.
    updater = chainer.training.StandardUpdater(train_iter,
                                               optim,
                                               device=device)

    # Setup Trainer: --iter (iterations) overrides --epoch as stop trigger.
    stop_trigger = (args.epoch, 'epoch')
    if args.iter is not None:
        stop_trigger = (args.iter, 'iteration')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        """ LR schedule for training ResNet especially.

        Linear warmup for 5 epochs (only when lr > 0.1), then step decay
        at epochs 30/60/80. NOTE: lr should be within the context.
        """
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5  # NOTE: mentioned the original ResNet paper.
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, net, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    # Reporting / snapshot extensions are registered on rank 0 only
    # (NOTE(review): nesting reconstructed from mangled source — confirm).
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)

        # NOTE: may take snapshot every iteration now
        snapshot_label = 'epoch' if args.iter is None else 'iteration'
        snapshot_trigger = (args.snapshot_freq, snapshot_label)
        snapshot_filename = ('snapshot_' + snapshot_label + '_{.updater.'
                             + snapshot_label + '}.npz')
        trainer.extend(extensions.snapshot(filename=snapshot_filename),
                       trigger=snapshot_trigger)

        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_value(
            'loss_scale',
            lambda trainer: trainer.updater.get_optimizer('main')._loss_scale),
            trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'loss_scale',
            'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]), trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    # Give the recorder/monitor access to the trainer so they can tag
    # samples with iteration counts.
    recorder.trainer = trainer
    hook.trainer = trainer
    with ExitStack() as stack:
        if comm.rank == 0:
            stack.enter_context(hook)
        trainer.run()

    # store recorded results
    if comm.rank == 0:
        # NOTE: only export in the first rank
        recorder.export().to_csv(os.path.join(args.out, 'loss_scale.csv'))
        hook.export_history().to_csv(os.path.join(args.out,
                                                  'grad_stats.csv'))
def main():
    """Parse CLI options, build the model/optimizer, and run training.

    Spawns multiprocessing batch generators feeding bounded queues, then
    calls ``train(...)``; generators and queues are torn down on any exit
    path. Raises ``ValueError`` for an unknown --optim name or when only
    one of --val-data / --val-labels is given.
    """
    parser = ArgumentParser()
    parser.add_argument('train_data', help='train data')
    parser.add_argument('train_labels', help='train labels')
    parser.add_argument('--val-data', default=None, help='val data')
    parser.add_argument('--val-labels', default=None, help='val labels')
    parser.add_argument('-b', '--batch-size', type=int, default=5,
                        help='mini-batch size (default=5)')
    parser.add_argument('--beta2', type=float, default=0.999,
                        help='beta2 of Adam (default=0.999)')
    parser.add_argument('-g', '--gpu-id', type=int, default=-1,
                        help='GPU ID (default=-1, indicates CPU)')
    parser.add_argument('--ignore-labels', type=int, default=[], nargs='+',
                        help='labels to ignore (default=[])')
    parser.add_argument('-l', '--learning-rate', type=float, default=0.1,
                        help='learning rate (default=0.1)')
    parser.add_argument('--max-iter', type=int, default=160000,
                        help='train model up to max-iter (default=160000)')
    parser.add_argument(
        '--mean-interval', type=int, default=1000,
        help='calculate mean of train/loss (and validation loss) ' +
        'every mean-interval iters (default=1000)')
    parser.add_argument('--model', default=None,
                        help='resume to train the model')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='momentum rate (default=0.9)')
    parser.add_argument('--n-classes', type=int, default=5,
                        help='number of classes (default=5)')
    parser.add_argument('--noise', default='no',
                        help='noise injection method. \'no\', \'patch\', ' +
                        'and \'permutation\' are available (default=\'no\')')
    parser.add_argument('--optim', default='nesterov',
                        help='optimization method. \'sgd\', \'nesterov\', ' +
                        'and \'adam\' are available (default=\'nesterov\')')
    parser.add_argument(
        '-o', '--outdir', default='./',
        help='trained models and optimizer states are stored in outdir ' +
        '(default=\'./\')')
    parser.add_argument(
        '--queue-maxsize', type=int, default=10,
        help='maxsize of queues for training and validation (default=10)')
    parser.add_argument(
        '--save-interval', type=int, default=10000,
        help='save model & optimizer every save-interval iters (default=10000)'
    )
    parser.add_argument(
        '--state', default=None,
        help='optimizer state. resume to train the model with the optimizer')
    parser.add_argument('-w', '--weight-decay', type=float, default=1e-4,
                        help='weight decay factor (default=1e-4)')
    args = parser.parse_args()

    # Echo the command line and every parsed option for reproducibility.
    print(argv2string(sys.argv) + '\n')
    for arg in dir(args):
        if arg[:1] == '_':
            continue
        print('{} = {}'.format(arg, getattr(args, arg)))
    print()

    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)
        print('mkdir ' + args.outdir + '\n')

    model = Model(in_ch=3, out_ch=args.n_classes)
    if args.model is not None:
        S.load_npz(args.model, model)
    loss_func = Loss(model)

    # BUG FIX: the original tested `args.optim.lower() in 'sgd'` etc., which
    # is a *substring* check — e.g. '', 'g', 'ag', or 'dam' silently selected
    # an optimizer. Compare against the documented names explicitly instead.
    optim_name = args.optim.lower()
    if optim_name == 'sgd':
        if args.momentum > 0:
            optim = optims.CorrectedMomentumSGD(lr=args.learning_rate,
                                                momentum=args.momentum)
        else:
            optim = optims.SGD(lr=args.learning_rate)
    elif optim_name in ('nesterov', 'nesterovag'):
        optim = optims.NesterovAG(lr=args.learning_rate,
                                  momentum=args.momentum)
    elif optim_name == 'adam':
        # Adam handles weight decay itself (weight_decay_rate), so the
        # WeightDecay hook below is skipped for it.
        optim = optims.Adam(alpha=args.learning_rate,
                            beta1=args.momentum,
                            beta2=args.beta2,
                            weight_decay_rate=args.weight_decay,
                            amsgrad=True)
    else:
        raise ValueError('Please specify an available optimizer name.\n' +
                         'SGD, NesterovAG, and Adam are available.')
    print('{}\n'.format(type(optim)))
    optim.setup(model)
    if args.state is not None:
        S.load_npz(args.state, optim)
    if (args.weight_decay > 0) and not isinstance(optim, optims.Adam):
        optim.add_hook(WeightDecay(args.weight_decay))
    optim.add_hook(GradientClipping(1))

    # Decay the LR by 10x at 5/8 and 7/8 of the training schedule.
    lr_decay_iter_dict = {
        int(5 * args.max_iter / 8): 0.1,
        int(7 * args.max_iter / 8): 0.1,
    }

    # File lists: one path per line; data and label lists must align.
    with open(args.train_data, 'r') as f:
        train_data_path_list = [line.strip() for line in f.readlines()]
    with open(args.train_labels, 'r') as f:
        train_labels_path_list = [line.strip() for line in f.readlines()]
    assert len(train_data_path_list) == len(train_labels_path_list)

    if (args.val_data is not None) or (args.val_labels is not None):
        if (args.val_data is not None) and (args.val_labels is not None):
            with open(args.val_data, 'r') as f:
                val_data_path_list = [line.strip() for line in f.readlines()]
            with open(args.val_labels, 'r') as f:
                val_labels_path_list = [line.strip() for line in f.readlines()]
            assert len(val_data_path_list) == len(val_labels_path_list)
        else:
            raise ValueError('Either val_data or val_labels is not specified.')

    train_queue = mp.Queue(maxsize=args.queue_maxsize)
    train_generator = BatchGenerator(args.batch_size,
                                     train_data_path_list,
                                     train_labels_path_list,
                                     train_queue,
                                     train=True,
                                     noise_injection=args.noise,
                                     out_height=512,
                                     out_width=512,
                                     max_height=1216,
                                     max_width=1216,
                                     min_height=832,
                                     min_width=832)
    train_generator.start()

    if args.val_data is None:
        val_queue = None
    else:
        val_queue = mp.Queue(maxsize=args.queue_maxsize)
        try:
            val_generator = BatchGenerator(1,
                                           val_data_path_list,
                                           val_labels_path_list,
                                           val_queue,
                                           train=False,
                                           out_height=608,
                                           out_width=968)
            val_generator.start()
        except Exception:
            # Validation generator failed to start: tear down the training
            # side too before propagating.
            train_generator.terminate()
            train_queue.close()
            val_queue.close()
            raise

    try:
        train(loss_func, optim, train_queue, args.max_iter,
              args.mean_interval, args.save_interval, val_queue,
              lr_decay_iter_dict, args.gpu_id, args.ignore_labels,
              args.outdir)
    except BaseException:
        # Includes KeyboardInterrupt/SystemExit: always stop the worker
        # processes and close the queues before re-raising.
        train_generator.terminate()
        train_queue.close()
        if val_queue is not None:
            val_generator.terminate()
            val_queue.close()
        raise

    # Normal shutdown path.
    train_generator.terminate()
    train_queue.close()
    if val_queue is not None:
        val_generator.terminate()
        val_queue.close()