def get_data_iterators(data_dir, batch_size, num_workers, num_classes,
                       input_image_size=224, resize_inv_factor=0.875):
    """Build train/val MultiprocessIterators over a preprocessed dataset.

    Args:
        data_dir: dataset root holding ``train/`` and ``val/`` sub-dirs,
            one sub-directory per class inside each.
        batch_size: samples per batch for both iterators.
        num_workers: worker processes per iterator.
        num_classes: expected number of class directories (asserted).
        input_image_size: side length of the square crop.
        resize_inv_factor: crop/scale ratio; images are scaled to
            ``ceil(input_image_size / resize_inv_factor)`` before cropping.

    Returns:
        Tuple ``(train_iterator, val_iterator)``; the train iterator
        shuffles, the val iterator does not, and neither repeats.
    """
    assert resize_inv_factor > 0.0
    scale_size = int(math.ceil(float(input_image_size) / resize_inv_factor))

    split_datasets = {}
    for split in ('train', 'val'):
        split_dir = os.path.join(data_dir, split)
        split_datasets[split] = PreprocessedDataset(
            root=split_dir,
            scale_size=scale_size,
            crop_size=input_image_size)
        # The directory layout must expose exactly num_classes label dirs.
        assert len(directory_parsing_label_names(split_dir)) == num_classes

    train_iterator = iterators.MultiprocessIterator(
        dataset=split_datasets['train'],
        batch_size=batch_size,
        repeat=False,
        shuffle=True,
        n_processes=num_workers)
    val_iterator = iterators.MultiprocessIterator(
        dataset=split_datasets['val'],
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers)
    return train_iterator, val_iterator
def get_data_iterators(data_dir, batch_size, num_workers, num_classes):
    """Build train/val MultiprocessIterators over a preprocessed dataset.

    Args:
        data_dir: dataset root holding ``train/`` and ``val/`` sub-dirs.
        batch_size: samples per batch for both iterators.
        num_workers: worker processes per iterator.
        num_classes: expected number of class directories (asserted).

    Returns:
        Tuple ``(train_iterator, val_iterator)``; the train iterator
        shuffles, the val iterator does not, and neither repeats.
    """
    def make_dataset(split):
        # Build the per-split dataset and sanity-check the class count.
        split_dir = os.path.join(data_dir, split)
        dataset = PreprocessedDataset(root=split_dir)
        assert len(directory_parsing_label_names(split_dir)) == num_classes
        return dataset

    train_dataset = make_dataset('train')
    val_dataset = make_dataset('val')

    train_iterator = iterators.MultiprocessIterator(
        dataset=train_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=True,
        n_processes=num_workers)
    val_iterator = iterators.MultiprocessIterator(
        dataset=val_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers)
    return train_iterator, val_iterator
def test_directory_parsing_label_dataset(self):
    """DirectoryParsingLabelDataset reports correct length, labels and paths."""
    dataset = DirectoryParsingLabelDataset(
        self.tmp_dir, color=self.color)

    # Depth 1 layout: class_*/img*; depth 2: class_*/nested_*/img*.
    if self.depth == 1:
        expected_length = self.n_class * self.n_img_per_class
    elif self.depth == 2:
        expected_length = (self.n_class * self.n_sub_directory
                           * self.n_img_per_class)
    self.assertEqual(len(dataset), expected_length)

    assert_is_label_dataset(dataset, self.n_class, color=self.color)

    self.assertEqual(
        directory_parsing_label_names(self.tmp_dir),
        ['class_{}'.format(i) for i in range(self.n_class)])

    if self.depth == 1:
        expected_paths = [
            '{}/class_{}/img{}.{}'.format(self.tmp_dir, i, j, self.suffix)
            for i in range(self.n_class)
            for j in range(self.n_img_per_class)]
    elif self.depth == 2:
        expected_paths = [
            '{}/class_{}/nested_{}/img{}.{}'.format(
                self.tmp_dir, i, j, k, self.suffix)
            for i in range(self.n_class)
            for j in range(self.n_sub_directory)
            for k in range(self.n_img_per_class)]
    self.assertEqual(dataset.img_paths, expected_paths)
def test_directory_parsing_classification_dataset(self):
    """DirectoryParsingClassificationDataset: length, labels and paths are correct."""
    dataset = DirectoryParsingClassificationDataset(
        self.tmp_dir, color=self.color)

    # Depth 1 layout: class_*/img*; depth 2: class_*/nested_*/img*.
    if self.depth == 1:
        expected_length = self.n_class * self.n_img_per_class
    elif self.depth == 2:
        expected_length = (self.n_class * self.n_sub_directory
                           * self.n_img_per_class)
    self.assertEqual(len(dataset), expected_length)

    assert_is_classification_dataset(
        dataset, self.n_class, color=self.color)

    self.assertEqual(
        directory_parsing_label_names(self.tmp_dir),
        ['class_{}'.format(i) for i in range(self.n_class)])

    if self.depth == 1:
        expected_paths = [
            '{}/class_{}/img{}.{}'.format(self.tmp_dir, i, j, self.suffix)
            for i in range(self.n_class)
            for j in range(self.n_img_per_class)]
    elif self.depth == 2:
        expected_paths = [
            '{}/class_{}/nested_{}/img{}.{}'.format(
                self.tmp_dir, i, j, k, self.suffix)
            for i in range(self.n_class)
            for j in range(self.n_sub_directory)
            for k in range(self.n_img_per_class)]
    self.assertEqual(dataset.img_paths, expected_paths)
def test_numerical_sort(self):
    """With numerical_sort=True, numeric label dirs sort numerically, not lexically."""
    dataset = DirectoryParsingLabelDataset(
        self.tmp_dir, numerical_sort=True)
    assert_is_label_dataset(dataset, self.n_class)
    names = directory_parsing_label_names(
        self.tmp_dir, numerical_sort=True)
    self.assertEqual(names, [str(i) for i in range(self.n_class)])
def test_numerical_sort(self):
    """With numerical_sort=True, numeric label dirs sort numerically, not lexically."""
    dataset = DirectoryParsingClassificationDataset(
        self.tmp_dir, numerical_sort=True)
    assert_is_classification_dataset(dataset, self.n_class)
    names = directory_parsing_label_names(
        self.tmp_dir, numerical_sort=True)
    self.assertEqual(names, [str(i) for i in range(self.n_class)])
def main():
    """Evaluate a pretrained ImageNet classifier on a validation directory.

    Parses CLI options, runs the selected model over the directory-parsed
    validation set (center or 10-crop), and prints the top-1 error.
    """
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    # required=True: previously --model had no default, so omitting it left
    # `extractor` unbound below and the script died with a NameError instead
    # of a clear usage error.
    parser.add_argument(
        '--model', required=True,
        choices=('vgg16', 'resnet50', 'resnet101', 'resnet152'))
    parser.add_argument('--pretrained_model', default='imagenet')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--crop', choices=('center', '10'), default='center')
    parser.add_argument('--resnet_mode', default='he')
    args = parser.parse_args()

    dataset = DirectoryParsingLabelDataset(args.val)
    label_names = directory_parsing_label_names(args.val)
    n_class = len(label_names)

    iterator = iterators.MultiprocessIterator(
        dataset, args.batchsize, repeat=False, shuffle=False,
        n_processes=6, shared_mem=300000000)

    if args.model == 'vgg16':
        extractor = VGG16(n_class, args.pretrained_model)
    elif args.model == 'resnet50':
        extractor = ResNet50(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    elif args.model == 'resnet101':
        extractor = ResNet101(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    elif args.model == 'resnet152':
        extractor = ResNet152(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    model = FeaturePredictor(
        extractor, crop_size=224, scale_size=256, crop=args.crop)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    print('Model has been prepared. Evaluation starts.')
    in_values, out_values, rest_values = apply_to_iterator(
        model.predict, iterator, hook=ProgressHook(len(dataset)))
    # Inputs are not needed for scoring; free them early.
    del in_values

    pred_probs, = out_values
    gt_labels, = rest_values

    accuracy = F.accuracy(
        np.array(list(pred_probs)), np.array(list(gt_labels))).data

    print()
    print('Top 1 Error {}'.format(1. - accuracy))
def get_val_data_iterator(data_dir, batch_size, num_workers, num_classes):
    """Create a non-shuffling validation iterator over ``data_dir/val``.

    Args:
        data_dir: dataset root containing a ``val`` sub-directory with one
            directory per class.
        batch_size: number of samples per batch.
        num_workers: worker processes for the MultiprocessIterator.
        num_classes: expected label count (asserted against the layout).

    Returns:
        Tuple ``(val_iterator, val_dataset_len)``.
    """
    val_dir = os.path.join(data_dir, 'val')
    val_dataset = DirectoryParsingLabelDataset(val_dir)
    dataset_len = len(val_dataset)
    # Layout sanity check: one sub-directory per class is expected.
    assert len(directory_parsing_label_names(val_dir)) == num_classes
    val_iterator = iterators.MultiprocessIterator(
        dataset=val_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers,
        shared_mem=300000000)
    return val_iterator, dataset_len
def setup(dataset, model, pretrained_model, batchsize, val, crop, resnet_arch):
    """Resolve the dataset, eval callback, model and batch size for evaluation.

    Args:
        dataset: dataset name (currently only 'imagenet' is handled).
        model: key into the module-level ``models`` registry.
        pretrained_model: weights spec, or None to pick the registry default.
        batchsize: batch size, or None to pick the registry default.
        val: path to the validation directory (imagenet only).
        crop: crop mode, or None to pick the registry default.
        resnet_arch: ResNet arch variant, or None for the registry default.

    Returns:
        Tuple ``(dataset, eval_, model, batchsize)`` where ``eval_`` prints
        the top-1 error from predicted probabilities and ground-truth labels.

    NOTE(review): for any dataset other than 'imagenet', ``label_names`` and
    ``eval_`` are never bound and this raises NameError — confirm intended.
    """
    dataset_name = dataset
    if dataset_name == 'imagenet':
        dataset = DirectoryParsingLabelDataset(val)
        label_names = directory_parsing_label_names(val)

        def eval_(out_values, rest_values):
            # Both are 1-tuples of per-sample iterables.
            pred_probs, = out_values
            gt_labels, = rest_values
            accuracy = F.accuracy(
                np.array(list(pred_probs)),
                np.array(list(gt_labels))).data
            print()
            print('Top 1 Error {}'.format(1. - accuracy))

    model_entry = models[model]
    cls, pretrained_models, default_batchsize = model_entry[:3]
    if pretrained_model is None:
        pretrained_model = pretrained_models.get(dataset_name, dataset_name)
    if crop is None:
        crop = model_entry[3]

    kwargs = {
        'n_class': len(label_names),
        'pretrained_model': pretrained_model,
    }
    if model in ('resnet50', 'resnet101', 'resnet152'):
        # ResNets additionally take an arch variant.
        kwargs['arch'] = (resnet_arch if resnet_arch is not None
                          else model_entry[4])
    extractor = cls(**kwargs)
    model = FeaturePredictor(
        extractor, crop_size=224, scale_size=256, crop=crop)

    if batchsize is None:
        batchsize = default_batchsize
    return dataset, eval_, model, batchsize
# Distributed ImageNet (ILSVRC2012) classification training entry point.
# Pipeline: parse CLI args -> warm up multiprocessing with the 'forkserver'
# start method (per the Chainer MultiprocessIterator FAQ linked inline) ->
# create a ChainerMN communicator, one device per intra-rank -> derive the
# learning rate from the linear scaling rule (0.1 * global_batch / 256)
# when --lr is not given -> build a ResNet-50/101/152 extractor wrapped in
# Classifier, zero-initialize the last BN gamma of each Bottleneck
# (arXiv:1706.02677) -> scatter train/val index arrays across ranks and
# slice the TransformDataset-wrapped data -> CorrectedMomentumSGD with
# weight decay on non-beta/gamma params -> trainer with a 5-epoch LR
# warmup then step decay (x0.1 at epochs 30/60/80), a multi-node evaluator
# every epoch, and rank-0-only logging, reporting and model snapshotting.
# NOTE(review): source formatting is collapsed; code below kept verbatim.
def main(): model_cfgs = { 'resnet50': { 'class': ResNet50, 'score_layer_name': 'fc6', 'kwargs': { 'arch': 'fb' } }, 'resnet101': { 'class': ResNet101, 'score_layer_name': 'fc6', 'kwargs': { 'arch': 'fb' } }, 'resnet152': { 'class': ResNet152, 'score_layer_name': 'fc6', 'kwargs': { 'arch': 'fb' } } } parser = argparse.ArgumentParser( description='Learning convnet from ILSVRC2012 dataset') parser.add_argument('train', help='Path to root of the train dataset') parser.add_argument('val', help='Path to root of the validation dataset') parser.add_argument('--model', '-m', choices=model_cfgs.keys(), default='resnet50', help='Convnet models') parser.add_argument('--communicator', type=str, default='pure_nccl', help='Type of communicator') parser.add_argument('--loaderjob', type=int, default=4) parser.add_argument('--batchsize', type=int, default=32, help='Batch size for each worker') parser.add_argument('--lr', type=float) parser.add_argument('--momentum', type=float, default=0.9) parser.add_argument('--weight_decay', type=float, default=0.0001) parser.add_argument('--out', type=str, default='result') parser.add_argument('--epoch', type=int, default=90) args = parser.parse_args() # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator if hasattr(multiprocessing, 'set_start_method'): multiprocessing.set_start_method('forkserver') p = multiprocessing.Process() p.start() p.join() comm = chainermn.create_communicator(args.communicator) device = comm.intra_rank if args.lr is not None: lr = args.lr else: lr = 0.1 * (args.batchsize * comm.size) / 256 if comm.rank == 0: print('lr={}: lr is selected based on the linear ' 'scaling rule'.format(lr)) label_names = directory_parsing_label_names(args.train) model_cfg = model_cfgs[args.model] extractor = model_cfg['class'](n_class=len(label_names), **model_cfg['kwargs']) extractor.pick = model_cfg['score_layer_name'] model = Classifier(extractor) # Following https://arxiv.org/pdf/1706.02677.pdf, 
# the gamma of the last BN of each resblock is initialized by zeros. for l in model.links(): if isinstance(l, Bottleneck): l.conv3.bn.gamma.data[:] = 0 train_data = DirectoryParsingLabelDataset(args.train) val_data = DirectoryParsingLabelDataset(args.val) train_data = TransformDataset(train_data, ('img', 'label'), TrainTransform(extractor.mean)) val_data = TransformDataset(val_data, ('img', 'label'), ValTransform(extractor.mean)) print('finished loading dataset') if comm.rank == 0: train_indices = np.arange(len(train_data)) val_indices = np.arange(len(val_data)) else: train_indices = None val_indices = None train_indices = chainermn.scatter_dataset(train_indices, comm, shuffle=True) val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True) train_data = train_data.slice[train_indices] val_data = val_data.slice[val_indices] train_iter = chainer.iterators.MultiprocessIterator( train_data, args.batchsize, n_processes=args.loaderjob) val_iter = iterators.MultiprocessIterator(val_data, args.batchsize, repeat=False, shuffle=False, n_processes=args.loaderjob) optimizer = chainermn.create_multi_node_optimizer( CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm) optimizer.setup(model) for param in model.params(): if param.name not in ('beta', 'gamma'): param.update_rule.add_hook(WeightDecay(args.weight_decay)) if device >= 0: chainer.cuda.get_device(device).use() model.to_gpu() updater = chainer.training.StandardUpdater(train_iter, optimizer, device=device) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) @make_shift('lr') def warmup_and_exponential_shift(trainer): epoch = trainer.updater.epoch_detail warmup_epoch = 5 if epoch < warmup_epoch: if lr > 0.1: warmup_rate = 0.1 / lr rate = warmup_rate \ + (1 - warmup_rate) * epoch / warmup_epoch else: rate = 1 elif epoch < 30: rate = 1 elif epoch < 60: rate = 0.1 elif epoch < 80: rate = 0.01 else: rate = 0.001 return rate * lr trainer.extend(warmup_and_exponential_shift) evaluator = 
chainermn.create_multi_node_evaluator( extensions.Evaluator(val_iter, model, device=device), comm) trainer.extend(evaluator, trigger=(1, 'epoch')) log_interval = 0.1, 'epoch' print_interval = 0.1, 'epoch' if comm.rank == 0: trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend(extensions.snapshot_object( extractor, 'snapshot_model_{.updater.epoch}.npz'), trigger=(args.epoch, 'epoch')) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.PrintReport([ 'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy' ]), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.run()
# Distributed mixed-precision ImageNet training with adaptive loss scaling
# (AdaLoss). Pipeline: forkserver multiprocessing warm-up -> 16 GiB cuDNN
# workspace -> ChainerMN communicator + per-rank device + seeded RNG ->
# linear-scaling-rule LR when --lr is absent -> directory-parsed train/val
# datasets wrapped in Train/Val transforms -> scattered per-rank indices ->
# model chosen via models.__dict__[args.arch] with last-BN gamma
# zero-init (arXiv:1706.02677), then wrapped in AdaLossScaled with the
# configured scale bounds/update interval/transform set and an
# AdaLossRecorder/AdaLossMonitor for diagnostics -> CorrectedMomentumSGD
# (FP32 update for mixed16; Chainer's loss-scaling machinery is repurposed
# only to skip bad updates, with _loss_scale_max pinned to 1.0) -> weight
# decay on non-beta/gamma params -> trainer with 5-epoch warmup + step LR
# decay, multi-node evaluator, rank-0 logging/snapshots (per epoch or per
# iteration), optional resume, and post-run CSV export of loss-scale and
# gradient statistics on rank 0.
# NOTE(review): `args` is referenced throughout but never parsed inside
# this function — presumably a module-level `args`; confirm before reuse.
# NOTE(review): source formatting is collapsed; code below kept verbatim.
def main(): # Start the multiprocessing environment # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator if hasattr(multiprocessing, 'set_start_method'): multiprocessing.set_start_method('forkserver') p = multiprocessing.Process() p.start() p.join() # Set up workspace # 12 GB GPU RAM for workspace chainer.cuda.set_max_workspace_size(16 * 1024 * 1024 * 1024) # Setup the multi-node environment comm = chainermn.create_communicator(args.communicator) device = comm.intra_rank print( '==> Successfully setup communicator: "{}" rank: {} device: {} size: {}' .format(args.communicator, comm.rank, device, comm.size)) set_random_seed(args, device) # Setup LR if args.lr is not None: lr = args.lr else: lr = 0.1 * (args.batchsize * comm.size) / 256 # TODO: why? if comm.rank == 0: print( 'LR = {} is selected based on the linear scaling rule'.format( lr)) # Setup dataset train_dir = os.path.join(args.dataset_dir, 'train') val_dir = os.path.join(args.dataset_dir, 'val') label_names = datasets.directory_parsing_label_names(train_dir) train_data = datasets.DirectoryParsingLabelDataset(train_dir) val_data = datasets.DirectoryParsingLabelDataset(val_dir) train_data = TransformDataset(train_data, ('img', 'label'), TrainTransform(_mean, args)) val_data = TransformDataset(val_data, ('img', 'label'), ValTransform(_mean, args)) print('==> [{}] Successfully finished loading dataset'.format(comm.rank)) # Initializing dataset iterators if comm.rank == 0: train_indices = np.arange(len(train_data)) val_indices = np.arange(len(val_data)) else: train_indices = None val_indices = None train_indices = chainermn.scatter_dataset(train_indices, comm, shuffle=True) val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True) train_data = train_data.slice[train_indices] val_data = val_data.slice[val_indices] train_iter = chainer.iterators.MultiprocessIterator( train_data, args.batchsize, n_processes=args.loaderjob) val_iter = 
iterators.MultiprocessIterator(val_data, args.batchsize, repeat=False, shuffle=False, n_processes=args.loaderjob) # Create the model kwargs = {} if args.first_bn_mixed16 and args.dtype == 'float16': print('==> Setting the first BN layer to mixed16') kwargs['first_bn_mixed16'] = True # Initialize the model net = models.__dict__[args.arch](n_class=len(label_names), **kwargs) # Following https://arxiv.org/pdf/1706.02677.pdf, # the gamma of the last BN of each resblock is initialized by zeros. for l in net.links(): if isinstance(l, Bottleneck): l.conv3.bn.gamma.data[:] = 0 # Apply ada loss transform recorder = AdaLossRecorder(sample_per_n_iter=100) # Update the model to support AdaLoss net = AdaLossScaled(net, init_scale=args.init_scale, cfg={ 'loss_scale_method': args.loss_scale_method, 'scale_upper_bound': args.scale_upper_bound, 'accum_upper_bound': args.accum_upper_bound, 'update_per_n_iteration': args.update_per_n_iteration, 'recorder': recorder, }, transforms=[ AdaLossTransformLinear(), AdaLossTransformBottleneck(), AdaLossTransformBasicBlock(), AdaLossTransformConv2DBNActiv(), ], verbose=args.verbose) if comm.rank == 0: # print network only in the 1-rank machine print(net) net = L.Classifier(net) hook = AdaLossMonitor(sample_per_n_iter=100, verbose=args.verbose, includes=['Grad', 'Deconvolution']) # Setup optimizer optim = chainermn.create_multi_node_optimizer( optimizers.CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm) if args.dtype == 'mixed16': print('==> Using FP32 update for dtype=mixed16') optim.use_fp32_update() # by default use fp32 update # HACK: support skipping update by existing loss scaling functionality if args.dynamic_interval is not None: optim.loss_scaling(interval=args.dynamic_interval, scale=None) else: optim.loss_scaling(interval=float('inf'), scale=None) optim._loss_scale_max = 1.0 # to prevent actual loss scaling optim.setup(net) # setup weight decay for param in net.params(): if param.name not in ('beta', 'gamma'): 
param.update_rule.add_hook(WeightDecay(args.weight_decay)) # allocate model to multiple GPUs if device >= 0: chainer.cuda.get_device(device).use() net.to_gpu() # Create an updater that implements how to update based on one train_iter input updater = chainer.training.StandardUpdater(train_iter, optim, device=device) # Setup Trainer stop_trigger = (args.epoch, 'epoch') if args.iter is not None: stop_trigger = (args.iter, 'iteration') trainer = training.Trainer(updater, stop_trigger, out=args.out) @make_shift('lr') def warmup_and_exponential_shift(trainer): """ LR schedule for training ResNet especially. NOTE: lr should be within the context. """ epoch = trainer.updater.epoch_detail warmup_epoch = 5 # NOTE: mentioned the original ResNet paper. if epoch < warmup_epoch: if lr > 0.1: warmup_rate = 0.1 / lr rate = warmup_rate \ + (1 - warmup_rate) * epoch / warmup_epoch else: rate = 1 elif epoch < 30: rate = 1 elif epoch < 60: rate = 0.1 elif epoch < 80: rate = 0.01 else: rate = 0.001 return rate * lr trainer.extend(warmup_and_exponential_shift) evaluator = chainermn.create_multi_node_evaluator( extensions.Evaluator(val_iter, net, device=device), comm) trainer.extend(evaluator, trigger=(1, 'epoch')) log_interval = 0.1, 'epoch' print_interval = 0.1, 'epoch' if comm.rank == 0: print('==========================================') print('Num process (COMM_WORLD): {}'.format(comm.size)) print('Using {} communicator'.format(args.communicator)) print('Num Minibatch-size: {}'.format(args.batchsize)) print('Num epoch: {}'.format(args.epoch)) print('==========================================') trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) # NOTE: may take snapshot every iteration now snapshot_label = 'epoch' if args.iter is None else 'iteration' snapshot_trigger = (args.snapshot_freq, snapshot_label) snapshot_filename = ('snapshot_' + snapshot_label + '_{.updater.' 
+ snapshot_label + '}.npz') trainer.extend(extensions.snapshot(filename=snapshot_filename), trigger=snapshot_trigger) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.observe_value( 'loss_scale', lambda trainer: trainer.updater.get_optimizer('main')._loss_scale), trigger=log_interval) trainer.extend(extensions.PrintReport([ 'iteration', 'epoch', 'elapsed_time', 'lr', 'loss_scale', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy' ]), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=10)) if args.resume: serializers.load_npz(args.resume, trainer) recorder.trainer = trainer hook.trainer = trainer with ExitStack() as stack: if comm.rank == 0: stack.enter_context(hook) trainer.run() # store recorded results if comm.rank == 0: # NOTE: only export in the first rank recorder.export().to_csv(os.path.join(args.out, 'loss_scale.csv')) hook.export_history().to_csv(os.path.join(args.out, 'grad_stats.csv'))
# Single-GPU ImageNet training entry point for the DetNAS-small (COCO)
# backbone. Pipeline: forkserver multiprocessing warm-up -> CLI args (note
# several hyper-parameters such as --lr/--momentum/--epoch have no
# defaults) -> extractor from model_cfgs with its score layer picked,
# wrapped in Classifier -> directory-parsed train/val datasets with
# Train/Val transforms -> CorrectedMomentumSGD with weight decay on
# non-beta/gamma params -> LinearShift LR schedule from args.lr down to
# 0.0 over len(train_indices)/batchsize iterations -> per-epoch
# evaluation, periodic logging, and a final-epoch extractor snapshot.
# NOTE(review): the train_data/val_data slicing by the computed indices is
# commented out, so --trial only shortens the LR schedule (via
# train_indices) and does NOT shrink the dataset — looks unintentional;
# confirm against the original script.
# NOTE(review): source formatting is collapsed; code below kept verbatim.
def main(): model_cfgs = { 'detnas_small_coco': { 'class': DetNASSmallCOCO, 'score_layer_name': 'fc', 'kwargs': { #'n_class': 1000 } }, } parser = argparse.ArgumentParser( description='Learning convnet from ILSVRC2012 dataset') parser.add_argument('train', help='Path to root of the train dataset') parser.add_argument('val', help='Path to root of the validation dataset') parser.add_argument('--trial', action='store_true') parser.add_argument('--gpu', type=int, default=0) parser.add_argument( '--model', '-m', choices=model_cfgs.keys(), default='detnas_small_coco', help='Convnet models') parser.add_argument('--loaderjob', type=int, default=4) parser.add_argument( '--batchsize', type=int, help='Batch size for each worker') parser.add_argument('--lr', type=float) parser.add_argument('--momentum', type=float) parser.add_argument('--weight_decay', type=float) parser.add_argument('--out', type=str, default='result') parser.add_argument('--epoch', type=int) args = parser.parse_args() # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator if hasattr(multiprocessing, 'set_start_method'): multiprocessing.set_start_method('forkserver') p = multiprocessing.Process() p.start() p.join() label_names = directory_parsing_label_names(args.train) model_cfg = model_cfgs[args.model] extractor = model_cfg['class']( n_class=len(label_names), **model_cfg['kwargs']) extractor.pick = model_cfg['score_layer_name'] model = Classifier(extractor) train_data = DirectoryParsingLabelDataset(args.train) val_data = DirectoryParsingLabelDataset(args.val) train_data = TransformDataset(train_data, TrainTransform(extractor.mean)) val_data = TransformDataset(val_data, ValTransform(extractor.mean)) print('finished loading dataset') train_indices = np.arange(len(train_data)//(100 if args.trial else 1)) val_indices = np.arange(len(val_data)) """ train_data = train_data.slice[train_indices] val_data = val_data.slice[val_indices] """ train_iter = 
chainer.iterators.MultiprocessIterator( train_data, args.batchsize, n_processes=args.loaderjob) val_iter = iterators.MultiprocessIterator( val_data, args.batchsize, repeat=False, shuffle=False, n_processes=args.loaderjob) optimizer = CorrectedMomentumSGD(lr=args.lr, momentum=args.momentum) optimizer.setup(model) for param in model.params(): if param.name not in ('beta', 'gamma'): param.update_rule.add_hook(WeightDecay(args.weight_decay)) if args.gpu != -1: model.to_gpu(args.gpu) updater = chainer.training.StandardUpdater( train_iter, optimizer, device=args.gpu) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) trainer.extend(LinearShift('lr', (args.lr, 0.0), (0, len(train_indices) / args.batchsize))) evaluator = extensions.Evaluator(val_iter, model) trainer.extend(evaluator, trigger=(1, 'epoch')) log_interval = 0.1, 'epoch' print_interval = 0.1, 'epoch' trainer.extend( chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend( extensions.snapshot_object(extractor, 'snapshot_model_{.updater.epoch}.npz'), trigger=(args.epoch, 'epoch')) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend( extensions.PrintReport([ 'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy' ]), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.run()
# Distributed ImageNet training with optional chainer-compiler (ONNX)
# integration. Three modes: (1) --export dumps the extractor to an ONNX
# file and returns immediately; (2) --compile loads an ONNX model —
# optionally rewriting its batch dimension via input_rewriter when
# --overwrite_batchsize is set — compiles it with the selected
# compiler/runtime flags (tracing, verbosity, type-check skip, memory-usage
# dumps, computation order) and trains the compiled model; (3) otherwise
# the vanilla Chainer model is trained. The training recipe mirrors the
# standard distributed script: forkserver warm-up, ChainerMN communicator,
# linear-scaling-rule LR, last-BN gamma zero-init per Bottleneck
# (arXiv:1706.02677), scattered per-rank indices (with optional
# FixedBatchDataset wrapping when use_fixed_batch_dataset is enabled),
# CorrectedMomentumSGD + weight decay on non-beta/gamma params, 5-epoch LR
# warmup then step decay (x0.1 at 30/60/80 epochs), multi-node evaluator
# each epoch, and rank-0-only logging/snapshotting; stops after
# --iterations iterations when given, else after --epoch epochs.
# NOTE(review): source formatting is collapsed; code below kept verbatim.
def main(): model_cfgs = { 'resnet50': {'class': ResNet50, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}}, 'resnet101': {'class': ResNet101, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}}, 'resnet152': {'class': ResNet152, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}} } parser = argparse.ArgumentParser( description='Learning convnet from ILSVRC2012 dataset') parser.add_argument('train', help='Path to root of the train dataset') parser.add_argument('val', help='Path to root of the validation dataset') parser.add_argument('--export', type=str, default=None, help='Export the model to ONNX') parser.add_argument('--compile', type=str, default=None, help='Compile the model') parser.add_argument('--computation_order', type=str, default=None, help='Computation order in backpropagation') parser.add_argument('--model', '-m', choices=model_cfgs.keys(), default='resnet50', help='Convnet models') parser.add_argument('--communicator', type=str, default='pure_nccl', help='Type of communicator') parser.add_argument('--loaderjob', type=int, default=4) parser.add_argument('--batchsize', type=int, default=32, help='Batch size for each worker') parser.add_argument('--lr', type=float) parser.add_argument('--momentum', type=float, default=0.9) parser.add_argument('--weight-decay', type=float, default=0.0001) parser.add_argument('--out', type=str, default='result') parser.add_argument('--epoch', type=int, default=90) parser.add_argument('--iterations', '-I', type=int, default=None, help='Number of iterations to train') parser.add_argument('--no_use_fixed_batch_dataset', dest='use_fixed_batch_dataset', action='store_false', help='Disable the use of FixedBatchDataset') parser.add_argument('--compiler-log', action='store_true', help='Enables compile-time logging') parser.add_argument('--trace', action='store_true', help='Enables runtime tracing') parser.add_argument('--verbose', action='store_true', help='Enables runtime verbose log') 
parser.add_argument('--skip_runtime_type_check', action='store_true', help='Skip runtime type check') parser.add_argument('--dump_memory_usage', type=int, default=0, help='Dump memory usage (0-2)') parser.add_argument('--quiet_period', type=int, default=0, help='Quiet period after runtime report') parser.add_argument('--overwrite_batchsize', action='store_true', help='Overwrite batch size') args = parser.parse_args() # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator if hasattr(multiprocessing, 'set_start_method'): multiprocessing.set_start_method('forkserver') p = multiprocessing.Process() p.start() p.join() comm = chainermn.create_communicator(args.communicator) device = comm.intra_rank if args.lr is not None: lr = args.lr else: lr = 0.1 * (args.batchsize * comm.size) / 256 if comm.rank == 0: print('lr={}: lr is selected based on the linear ' 'scaling rule'.format(lr)) label_names = directory_parsing_label_names(args.train) model_cfg = model_cfgs[args.model] extractor = model_cfg['class']( n_class=len(label_names), **model_cfg['kwargs']) extractor.pick = model_cfg['score_layer_name'] # Following https://arxiv.org/pdf/1706.02677.pdf, # the gamma of the last BN of each resblock is initialized by zeros. 
for l in extractor.links(): if isinstance(l, Bottleneck): l.conv3.bn.gamma.data[:] = 0 if args.export is not None: chainer_compiler.use_unified_memory_allocator() extractor.to_device(device) x = extractor.xp.zeros((args.batchsize, 3, 224, 224)).astype('f') chainer_compiler.export(extractor, [x], args.export) return if args.compile is not None: print('run compiled model') chainer_compiler.use_chainerx_shared_allocator() extractor.to_device(device) # init params with chainer.using_config('enable_backprop', False),\ chainer.using_config('train', False): x = extractor.xp.zeros((1, 3, 224, 224)).astype('f') extractor(x) compiler_kwargs = {} if args.compiler_log: compiler_kwargs['compiler_log'] = True runtime_kwargs = {} if args.trace: runtime_kwargs['trace'] = True if args.verbose: runtime_kwargs['verbose'] = True if args.skip_runtime_type_check: runtime_kwargs['check_types'] = False if args.dump_memory_usage >= 1: runtime_kwargs['dump_memory_usage'] = args.dump_memory_usage free, total = cupy.cuda.runtime.memGetInfo() used = total - free runtime_kwargs['base_memory_usage'] = used onnx_filename = args.compile if args.overwrite_batchsize: new_onnx_filename = ('/tmp/overwrite_batchsize_' + os.path.basename(onnx_filename)) new_input_types = [ input_rewriter.Type(shape=(args.batchsize, 3, 224, 224)) ] input_rewriter.rewrite_onnx_file(onnx_filename, new_onnx_filename, new_input_types) onnx_filename = new_onnx_filename extractor_cc = chainer_compiler.compile_onnx( extractor, onnx_filename, 'onnx_chainer', computation_order=args.computation_order, compiler_kwargs=compiler_kwargs, runtime_kwargs=runtime_kwargs, quiet_period=args.quiet_period) model = Classifier(extractor_cc) else: print('run vanilla chainer model') model = Classifier(extractor) train_data = DirectoryParsingLabelDataset(args.train) val_data = DirectoryParsingLabelDataset(args.val) train_data = TransformDataset( train_data, ('img', 'label'), TrainTransform(extractor.mean)) val_data = TransformDataset( val_data, 
('img', 'label'), ValTransform(extractor.mean)) print('finished loading dataset') if comm.rank == 0: train_indices = np.arange(len(train_data)) val_indices = np.arange(len(val_data)) else: train_indices = None val_indices = None train_indices = chainermn.scatter_dataset( train_indices, comm, shuffle=True) val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True) train_data = train_data.slice[train_indices] val_data = val_data.slice[val_indices] if args.use_fixed_batch_dataset: train_data = FixedBatchDataset(train_data, args.batchsize) val_data = FixedBatchDataset(val_data, args.batchsize) train_iter = chainer.iterators.MultiprocessIterator( train_data, args.batchsize, n_processes=args.loaderjob) val_iter = iterators.MultiprocessIterator( val_data, args.batchsize, repeat=False, shuffle=False, n_processes=args.loaderjob) optimizer = chainermn.create_multi_node_optimizer( CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm) optimizer.setup(model) for param in model.params(): if param.name not in ('beta', 'gamma'): param.update_rule.add_hook(WeightDecay(args.weight_decay)) if device >= 0: chainer.cuda.get_device(device).use() model.to_gpu() updater = chainer.training.StandardUpdater( train_iter, optimizer, device=device) if args.iterations: stop_trigger = (args.iterations, 'iteration') else: stop_trigger = (args.epoch, 'epoch') trainer = training.Trainer( updater, stop_trigger, out=args.out) @make_shift('lr') def warmup_and_exponential_shift(trainer): epoch = trainer.updater.epoch_detail warmup_epoch = 5 if epoch < warmup_epoch: if lr > 0.1: warmup_rate = 0.1 / lr rate = warmup_rate \ + (1 - warmup_rate) * epoch / warmup_epoch else: rate = 1 elif epoch < 30: rate = 1 elif epoch < 60: rate = 0.1 elif epoch < 80: rate = 0.01 else: rate = 0.001 return rate * lr trainer.extend(warmup_and_exponential_shift) evaluator = chainermn.create_multi_node_evaluator( extensions.Evaluator(val_iter, model, device=device), comm) trainer.extend(evaluator, trigger=(1, 
'epoch')) log_interval = 0.1, 'epoch' print_interval = 0.1, 'epoch' if comm.rank == 0: trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend( extensions.snapshot_object( extractor, 'snapshot_model_{.updater.epoch}.npz'), trigger=(args.epoch, 'epoch')) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.PrintReport( ['iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'] ), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.run()
# Elastic/fault-tolerant distributed ImageNet training built on echainer:
# cluster membership is coordinated through etcd (--min/--start/--bind),
# optimizer and iterator state are registered with the communicator so
# that joining or recovering nodes can fetch them, and the run loop
# retries after CommException / ClusterUpdatedException by saving all
# states, re-syncing the cluster, and continuing. Rank-0 additionally
# drives Lineage logging, PrintReport and loss/accuracy PlotReports.
# NOTE(review): part of this function was redacted at ingestion — the
# '--etcd' default ('etcd://*****:*****@...') and the code between it and
# the @make_shift('lr') decorator (remaining CLI args, communicator/model/
# iterator/optimizer construction, the enclosing retry loop header that
# the trailing `continue` statements belong to) are missing, so the block
# below is not runnable as-is; recover the original before changing logic.
# NOTE(review): source formatting is collapsed; code below kept verbatim.
def main(): archs = { 'resnet50': {'class': ResNet50, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}}, 'resnet101': {'class': ResNet101, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}}, 'resnet152': {'class': ResNet152, 'score_layer_name': 'fc6', 'kwargs': {'arch': 'fb'}} } parser = argparse.ArgumentParser( description='Learning convnet from ILSVRC2012 dataset') parser.add_argument('train', help='Path to root of the train dataset') parser.add_argument('val', help='Path to root of the validation dataset') parser.add_argument('--arch', '-a', choices=archs.keys(), default='resnet50', help='Convnet architecture') parser.add_argument('--loaderjob', type=int, default=4) parser.add_argument('--batchsize', type=int, default=32, help='Batch size for each worker') parser.add_argument('--lr', type=float) parser.add_argument('--momentum', type=float, default=0.9) parser.add_argument('--weight_decay', type=float, default=0.0001) parser.add_argument('--out', type=str, default='result') parser.add_argument('--epoch', type=int, default=90) parser.add_argument('--min', type=int, required=True, help='Minimum number of processes') parser.add_argument('--start', type=int, required=True, help='Number of processes to start') parser.add_argument('--bind', '-p', type=str, required=True, help='address to bind gRPC server') parser.add_argument('--etcd', '-c', type=str, default='etcd://*****:*****@make_shift('lr') def warmup_and_exponential_shift(trainer): epoch = trainer.updater.epoch_detail warmup_epoch = 5 if epoch < warmup_epoch: if lr > 0.1: warmup_rate = 0.1 / lr rate = warmup_rate \ + (1 - warmup_rate) * epoch / warmup_epoch else: rate = 1 elif epoch < 30: rate = 1 elif epoch < 60: rate = 0.1 elif epoch < 80: rate = 0.01 else: rate = 0.001 return rate * lr trainer.extend(warmup_and_exponential_shift) evaluator = chainermn.create_multi_node_evaluator( extensions.Evaluator(val_iter, model, device=device), comm) trainer.extend(evaluator, trigger=(1, 'epoch')) 
trainer.extend(comm.get_uninitializer(), trigger=(1, 'iteration')) log_interval = 0.1, 'epoch' print_interval = 0.5, 'epoch' plot_interval = 1, 'epoch' if comm.intra_rank == 0: # TODO: lr is not properly controlled for accuracy trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend(echainer.extension.Lineage(comm, trigger=log_interval)) trainer.extend(extensions.PrintReport( ['iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'], log_report='Lineage'), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=10)) if extensions.PlotReport.available(): trainer.extend( extensions.PlotReport( ['main/loss', 'validation/main/loss'], file_name='loss.png', trigger=plot_interval ), trigger=plot_interval ) trainer.extend( extensions.PlotReport( ['main/accuracy', 'validation/main/accuracy'], file_name='accuracy.png', trigger=plot_interval ), trigger=plot_interval ) # Optimizer includes model parameters and other params in optimizer comm.register_state('optimizer', optimizer) comm.register_state('iterator', train_iter) if retry or not comm.initial: (iteration, epoch) = comm.fetch_state('optimizer', optimizer) # train_iter.epoch = epoch comm.fetch_state('iterator', train_iter) updater.iteration = iteration optimizers = trainer.updater.get_all_optimizers() for name in optimizers.keys(): optimizers[name].reset_prev_params() try: print('start trainer.run(), ', trainer.updater.iteration, trainer.updater.epoch) trainer.run() done = trainer._done except CommException as ce: print("Comm exception >>>>>>>>>>>", ce, updater.iteration, updater.epoch) comm.save_all_states(updater.iteration, updater.epoch) # Here comm will be ready to accept fetch state calls and once all # nodes got catched up it'll return and continue to run: TODO comm.sync_cluster(trainer.updater.get_all_optimizers()) retry = True continue except ClusterUpdatedException as ce: 
print("Cluster updated: >>>>>>>>>>>", ce) comm.save_all_states(updater.iteration, updater.epoch) comm.sync_cluster(trainer.updater.get_all_optimizers()) retry = True continue except Exception as e: print("Unexpected >>>>>>>>>>>", e) break comm.leave()