def make_batch_script(trainer_params, model_params, script_params):
    """Assemble the training experiment and return its batch script.

    Builds the trainer, model, data reader and optimizer, registers the
    learning-rate, checkpoint and weight-dump callbacks, writes the
    experiment to ``experiment.prototext`` inside
    ``script_params['work_dir']`` and creates a batch script that runs
    the LBANN executable on that file.

    Args:
        trainer_params: Object whose ``mini_batch_size`` attribute is
            passed to ``lbann.Trainer``.
        model_params: Keyword arguments forwarded to ``make_model``.
        script_params: Mapping forwarded as keyword arguments to
            ``lbann.contrib.launcher.make_batch_script``; must contain
            the key ``'work_dir'``.

    Returns:
        The script object created by
        ``lbann.contrib.launcher.make_batch_script``.
    """
    # Core LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer plus step-wise learning rate schedule.
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    lr_schedule = (
        lbann.CallbackDropFixedLearningRate(drop_epoch=[1], amt=2),
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ),
    )
    for lr_callback in lr_schedule:
        model.callbacks.append(lr_callback)

    work_dir = script_params['work_dir']

    # Checkpoint after every epoch
    checkpoint_callback = lbann.CallbackCheckpoint(
        checkpoint_dir=os.path.join(work_dir, 'checkpoint'),
        checkpoint_epochs=1,
    )
    trainer.callbacks.append(checkpoint_callback)

    # Dump weights after every epoch
    dump_callback = lbann.CallbackDumpWeights(
        basename=os.path.join(work_dir, 'weights'),
        epoch_interval=1,
    )
    model.callbacks.append(dump_callback)

    # Serialize the experiment to a prototext file
    protobuf_file = os.path.join(work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Batch script: timestamp, run LBANN, timestamp, propagate exit status
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    for shell_line in (
        'status=$?',
        'echo "Finished training at $(date)"',
        'exit ${status}',
    ):
        script.add_command(shell_line)
    return script
def make_batch_script(trainer_params, model_params, script_params):
    """Run the model through the LBANN inference executable.

    Locates ``lbann_inf`` next to the regular LBANN executable, builds
    the trainer, model and data reader, and launches the run directly
    via ``lbann.contrib.launcher.run`` without an optimizer. The status
    returned by the launcher is printed; nothing is returned.

    Args:
        trainer_params: Mapping; its ``'mini_batch_size'`` entry is
            passed to ``lbann.Trainer``.
        model_params: Keyword arguments forwarded to ``make_model``.
        script_params: Mapping providing ``'nodes'`` and
            ``'procs_per_node'`` for the launcher.
    """
    # The inference executable sits in the same directory as lbann_exe
    exe_dir = dirname(abspath(lbann.lbann_exe()))
    inference_exe = join(exe_dir, 'lbann_inf')

    # Core LBANN objects
    trainer = lbann.Trainer(
        mini_batch_size=trainer_params['mini_batch_size'],
    )
    model = make_model(**model_params)
    reader = make_data_reader()

    # No optimizer is used for this run
    opt = lbann.NoOptimizer()

    # NOTE(review): the learning-rate callbacks are still attached even
    # though no optimizer is used -- confirm they are needed here.
    lr_schedule = (
        lbann.CallbackDropFixedLearningRate(drop_epoch=[1], amt=2),
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ),
    )
    for lr_callback in lr_schedule:
        model.callbacks.append(lr_callback)

    # Checkpointing and weight-dump callbacks are disabled for this run.

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        inference_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
    )
    print(status)
def set_up_experiment(args, input_, probs, labels):
    """Create the model, data reader and optimizer for an experiment.

    Args:
        args: Parsed arguments. The attributes read here are
            ``mini_batch_size``, ``num_epochs``, ``data_reader``,
            ``optimizer``, ``optimizer_learning_rate`` and ``prototext``.
        input_: Layer from which the layer graph is traversed.
        probs: Layer holding the predicted probabilities.
        labels: Layer holding the ground-truth labels.

    Returns:
        Tuple ``(model, data_reader_proto, optimizer)``.
    """
    # Objective function: cross entropy plus L2 weight regularization.
    # The cross-entropy layer is created before the graph traversal, as
    # in the original ordering (presumably so the traversal reaches it --
    # confirm against lbann.traverse_layer_graph).
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    weights = {w for layer in layers for w in layer.weights}
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Metrics and callbacks
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [
        lbann.Metric(top1, name='top-1 accuracy', unit='%'),
        lbann.Metric(top5, name='top-5 accuracy', unit='%'),
    ]
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60], amt=0.1),
    ]
    model = lbann.Model(
        args.mini_batch_size,
        args.num_epochs,
        layers=layers,
        weights=weights,
        objective_function=objective_function,
        metrics=metrics,
        callbacks=callbacks,
    )

    # Data reader: parse the prototext file and keep its data_reader field
    pb_message = lbann.lbann_pb2.LbannPB()
    with open(args.data_reader, 'r') as f:
        txtf.Merge(f.read(), pb_message)
    data_reader_proto = pb_message.data_reader

    # Optimizer: explicit Nesterov SGD, otherwise built from the arguments
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True,
        )
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Optionally write the experiment to args.prototext
    if args.prototext:
        lbann.proto.save_prototext(
            args.prototext,
            model=model,
            optimizer=optimizer,
            data_reader=data_reader_proto,
        )

    return model, data_reader_proto, optimizer
def set_up_experiment(args, input_, probs, labels):
    """Create the trainer, model, data reader and optimizer.

    Args:
        args: Parsed arguments. The attributes read here are
            ``num_epochs``, ``num_classes``, ``optimizer``,
            ``optimizer_learning_rate`` and ``mini_batch_size``.
        input_: Layer from which the layer graph is traversed.
        probs: Layer holding the predicted probabilities.
        labels: Layer holding the ground-truth labels.

    Returns:
        Tuple ``(trainer, model, data_reader, optimizer)``.
    """
    # Objective function: cross entropy plus L2 regularization. The
    # cross-entropy layer is created before the graph traversal, as in
    # the original ordering.
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    # Only layers whose exact type is Convolution or FullyConnected
    # contribute weights to the regularization term.
    l2_reg_weights = {
        w
        for layer in layers
        if type(layer) in (lbann.Convolution, lbann.FullyConnected)
        for w in layer.weights
    }
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Metrics and callbacks
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [
        lbann.Metric(top1, name='top-1 accuracy', unit='%'),
        lbann.Metric(top5, name='top-5 accuracy', unit='%'),
    ]
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60], amt=0.1),
    ]
    model = lbann.Model(
        args.num_epochs,
        layers=layers,
        objective_function=objective_function,
        metrics=metrics,
        callbacks=callbacks,
    )

    # Data reader
    data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes)

    # Optimizer: explicit Nesterov SGD, otherwise built from the arguments
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True,
        )
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Trainer
    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

    return trainer, model, data_reader, optimizer
def setup(data_reader_file,
          name='classifier',
          num_labels=200,
          mini_batch_size=128,
          num_epochs=1000,
          learning_rate=0.1,
          bn_statistics_group_size=2,
          fc_data_layout='model_parallel',
          warmup_epochs=50,
          learning_rate_drop_interval=50,
          learning_rate_drop_factor=0.25,
          checkpoint_interval=None):
    """Build the classification model, data reader and optimizer.

    Args:
        data_reader_file: Path to a prototext file that is parsed into an
            ``LbannPB`` message; its ``data_reader`` field is returned.
        name: Prefix of the fully-connected module name (``{name}_fc``).
        num_labels: Output size passed to the fully-connected module.
        mini_batch_size: Used only to scale the warm-up target learning
            rate (``learning_rate * mini_batch_size / 128``).
        num_epochs: Number of epochs passed to ``lbann.Model``; also the
            upper bound of the learning-rate drop schedule.
        learning_rate: Learning rate of the SGD optimizer.
        bn_statistics_group_size: Forwarded to ``modules.ResNet``.
        fc_data_layout: ``data_layout`` of the fully-connected module.
        warmup_epochs: If truthy, a linear-growth learning-rate callback
            spanning this many epochs is added.
        learning_rate_drop_interval: Step of the drop-epoch range.
        learning_rate_drop_factor: If truthy, a fixed learning-rate drop
            callback with this ``amt`` is added.
        checkpoint_interval: If truthy, a checkpoint callback writing to
            ``'ckpt'`` is added with this value as ``checkpoint_epochs``.

    Returns:
        Tuple ``(model, data_reader_proto, opt)``.
    """
    # Setup input data (the two Identity layers are created in this
    # order: images first, labels second)
    input_layer = lbann.Input(target_mode = 'classification')
    images = lbann.Identity(input_layer)
    labels = lbann.Identity(input_layer)

    # Classification network
    head_cnn = modules.ResNet(
        bn_statistics_group_size=bn_statistics_group_size)
    class_fc = lbann.modules.FullyConnectedModule(
        num_labels,
        activation=lbann.Softmax,
        name=f'{name}_fc',
        data_layout=fc_data_layout)
    x = head_cnn(images)
    probs = class_fc(x)

    # Setup objective function: cross entropy plus L2 regularization of
    # layers whose exact type is Convolution or FullyConnected
    cross_entropy = lbann.CrossEntropy([probs, labels])
    l2_reg_weights = set()
    for layer in lbann.traverse_layer_graph(input_layer):
        if type(layer) in (lbann.Convolution, lbann.FullyConnected):
            l2_reg_weights.update(layer.weights)
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights,
                                          scale=0.0002)
    obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Setup model
    metrics = [
        lbann.Metric(lbann.CategoricalAccuracy([probs, labels]),
                     name='accuracy',
                     unit='%')
    ]
    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]
    if checkpoint_interval:
        # Honour the caller-supplied interval; a fixed value here would
        # silently ignore the argument.
        callbacks.append(
            lbann.CallbackCheckpoint(
                checkpoint_dir='ckpt',
                checkpoint_epochs=checkpoint_interval,
            )
        )

    # Learning rate schedules
    if warmup_epochs:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(
                target=learning_rate * mini_batch_size / 128,
                num_epochs=warmup_epochs,
            )
        )
    if learning_rate_drop_factor:
        callbacks.append(
            lbann.CallbackDropFixedLearningRate(
                drop_epoch=list(
                    range(0, num_epochs, learning_rate_drop_interval)),
                amt=learning_rate_drop_factor,
            )
        )

    # Construct model
    model = lbann.Model(num_epochs,
                        layers=lbann.traverse_layer_graph(input_layer),
                        objective_function=obj,
                        metrics=metrics,
                        callbacks=callbacks)

    # Setup optimizer
    opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9)

    # Load data reader from prototext
    pb_message = lbann.lbann_pb2.LbannPB()
    with open(data_reader_file, 'r') as f:
        google.protobuf.text_format.Merge(f.read(), pb_message)
    data_reader_proto = pb_message.data_reader

    # Point every reader's Python module directory at this file's
    # directory (computed once; it does not depend on the reader)
    module_dir = os.path.dirname(os.path.realpath(__file__))
    for reader_proto in data_reader_proto.reader:
        reader_proto.python.module_dir = module_dir

    # Return experiment objects
    return model, data_reader_proto, opt
# Setup objective function: cross entropy plus L2 regularization of the
# layers whose exact type is Convolution or FullyConnected
l2_reg_weights = {
    w
    for layer in layers
    if type(layer) in (lbann.Convolution, lbann.FullyConnected)
    for w in layer.weights
}
l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

# Setup model
metrics = [
    lbann.Metric(top1, name='top-1 accuracy', unit='%'),
    lbann.Metric(top5, name='top-5 accuracy', unit='%'),
]
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60, 80], amt=0.1),
]
if args.warmup:
    # Linear warm-up over 5 epochs towards a batch-size-scaled target
    warmup_target = 0.1 * args.mini_batch_size / 256
    callbacks.append(
        lbann.CallbackLinearGrowthLearningRate(
            target=warmup_target,
            num_epochs=5,
        )
    )
model = lbann.Model(
    args.num_epochs,
    layers=layers,
    objective_function=obj,
    metrics=metrics,
    callbacks=callbacks,
)

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)
# Setup objective function weights = set() for l in layers: weights.update(l.weights) l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4) obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) # Setup model metrics = [ lbann.Metric(top1, name='top-1 accuracy', unit='%'), lbann.Metric(top5, name='top-5 accuracy', unit='%') ] callbacks = [ lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackDropFixedLearningRate(drop_epoch=[20, 40, 60], amt=0.1) ] model = lbann.Model(args.mini_batch_size, args.num_epochs, layers=layers, weights=weights, objective_function=obj, metrics=metrics, callbacks=callbacks) # Setup optimizer opt = lbann.contrib.args.create_optimizer(args) # Load data reader from prototext data_reader_proto = lbann.lbann_pb2.LbannPB() with open(args.data_reader, 'r') as f:
def setup(num_patches=3,
          mini_batch_size=512,
          num_epochs=75,
          learning_rate=0.005,
          bn_statistics_group_size=2,
          fc_data_layout='model_parallel',
          warmup=True,
          checkpoint_interval=None):
    """Build the siamese patch-classification model, reader and optimizer.

    Args:
        num_patches: Number of patches sliced out of each data sample;
            also passed to ``patch_generator.num_labels`` and
            ``make_data_reader``.
        mini_batch_size: Used only to scale the warm-up target learning
            rate (``learning_rate * mini_batch_size / 128``).
        num_epochs: Number of epochs passed to ``lbann.Model``.
        learning_rate: Learning rate of the SGD optimizer.
        bn_statistics_group_size: Forwarded to the ResNet head and the
            ``FcBnRelu`` modules.
        fc_data_layout: ``data_layout`` of the fully-connected modules.
        warmup: If truthy, a 5-epoch linear-growth learning-rate callback
            is added.
        checkpoint_interval: If truthy, a checkpoint callback writing to
            ``'ckpt'`` is added with this value as ``checkpoint_epochs``.

    Returns:
        Tuple ``(model, data_reader, opt)``.
    """
    # Data dimensions
    patch_dims = patch_generator.patch_dims
    num_labels = patch_generator.num_labels(num_patches)

    # Extract tensors from data sample: num_patches equally sized patch
    # segments followed by one segment of num_labels entries
    input_layer = lbann.Input()
    patch_size = functools.reduce(operator.mul, patch_dims)
    slice_points = [0] + [patch_size * (i + 1) for i in range(num_patches)]
    slice_points.append(slice_points[-1] + num_labels)
    sample = lbann.Slice(input_layer, slice_points=str_list(slice_points))
    # The patch layers are created before the label layer, matching the
    # order of the slice segments
    patches = [
        lbann.Reshape(sample, dims=str_list(patch_dims))
        for _ in range(num_patches)
    ]
    labels = lbann.Identity(sample)

    # Siamese network: the same ResNet module is applied to every patch
    head_cnn = modules.ResNet(
        bn_statistics_group_size=bn_statistics_group_size)
    heads = [head_cnn(patch) for patch in patches]
    heads_concat = lbann.Concatenation(heads)

    # Classification network
    class_fc1 = modules.FcBnRelu(
        4096,
        statistics_group_size=bn_statistics_group_size,
        name='siamese_class_fc1',
        data_layout=fc_data_layout)
    class_fc2 = modules.FcBnRelu(
        4096,
        statistics_group_size=bn_statistics_group_size,
        name='siamese_class_fc2',
        data_layout=fc_data_layout)
    class_fc3 = lbann.modules.FullyConnectedModule(
        num_labels,
        activation=lbann.Softmax,
        name='siamese_class_fc3',
        data_layout=fc_data_layout)
    x = class_fc1(heads_concat)
    x = class_fc2(x)
    probs = class_fc3(x)

    # Setup objective function: cross entropy plus L2 regularization of
    # layers whose exact type is Convolution or FullyConnected
    cross_entropy = lbann.CrossEntropy([probs, labels])
    l2_reg_weights = set()
    for layer in lbann.traverse_layer_graph(input_layer):
        if type(layer) in (lbann.Convolution, lbann.FullyConnected):
            l2_reg_weights.update(layer.weights)
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights,
                                          scale=0.0002)
    obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Setup model
    metrics = [
        lbann.Metric(lbann.CategoricalAccuracy([probs, labels]),
                     name='accuracy',
                     unit='%')
    ]
    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]
    if checkpoint_interval:
        # Honour the caller-supplied interval; a fixed value here would
        # silently ignore the argument.
        callbacks.append(
            lbann.CallbackCheckpoint(checkpoint_dir='ckpt',
                                     checkpoint_epochs=checkpoint_interval))

    # Learning rate schedules
    if warmup:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(
                target=learning_rate * mini_batch_size / 128,
                num_epochs=5))
    callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=list(range(0, 100, 15)),
            amt=0.25))

    # Construct model
    model = lbann.Model(num_epochs,
                        layers=lbann.traverse_layer_graph(input_layer),
                        objective_function=obj,
                        metrics=metrics,
                        callbacks=callbacks)

    # Setup optimizer
    opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9)

    # Setup data reader
    data_reader = make_data_reader(num_patches)

    # Return experiment objects
    return model, data_reader, opt