def make_batch_script(trainer_params, model_params, script_params):

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Dump weights after every epoch
    model.callbacks.append(
        lbann.CallbackDumpWeights(
            basename=os.path.join(script_params['work_dir'], 'weights'),
            epoch_interval=1,
        ))

    # Create Protobuf file
    protobuf_file = os.path.join(script_params['work_dir'], 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Create batch script
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
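# A minimal sketch of the exact schedule that the fixed-drop callbacks above
# approximate (the Transformer "Noam" schedule from Vaswani et al., 2017).
# The function name and defaults below are illustrative, not part of the
# original script.
def transformer_lr(step, embed_dim=512, warmup=4000):
    """Return embed_dim^-0.5 * min(step^-0.5, step * warmup^-1.5)."""
    return embed_dim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Example: the schedule peaks at step == warmup, e.g.
#   transformer_lr(4000) ~= 7.0e-4,
# and decays proportionally to step^-0.5 afterwards.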
def make_batch_script(trainer_params, model_params, script_params):

    # Inference executable (lives alongside the main LBANN binary)
    lbann_exe = abspath(lbann.lbann_exe())
    lbann_exe = join(dirname(lbann_exe), 'lbann_inf')

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params['mini_batch_size'])
    model = make_model(**model_params)
    # model.eval()
    reader = make_data_reader()

    # No optimizer is needed for inference
    # (training used Adam with a rough approximation of
    #  embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5),
    #  embed_dim=512, warmup=4000)
    # opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    opt = lbann.NoOptimizer()

    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpointing and weight dumps from the training script are disabled
    # for inference.

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        lbann_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
        # **kwargs,
    )
    print(status)
def setup_experiment(lbann):
    """Construct LBANN experiment.

    Args:
        lbann (module): Module for LBANN Python frontend

    """
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
    model = make_model(NUM_NODES,
                       NUM_EDGES,
                       NUM_NODES_FEATURES,
                       NUM_EDGE_FEATURES,
                       EMBEDDING_DIM,
                       EDGE_EMBEDDING_DIM,
                       NUM_OUT_FEATURES,
                       num_epochs)
    reader = LSC_PPQM4M.make_data_reader("LSC_100K", validation_percent=0)
    optimizer = lbann.Adam(learn_rate=0.01,
                           beta1=0.9,
                           beta2=0.99,
                           eps=1e-8)
    return trainer, model, reader, optimizer
def setup_experiment(lbann):
    """Construct LBANN experiment.

    Args:
        lbann (module): Module for LBANN Python frontend

    """
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackGPUMemoryUsage()
    ]
    model = Sparse_Graph_Trainer.make_model(kernel_type='GatedGraph',
                                            num_epochs=num_epochs,
                                            callbacks=callbacks)
    reader = data.PROTEINS.make_data_reader()  # No validation set
    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
def setup_embeddings(script, config):

    # Get parameters
    num_vertices = config.getint('Graph', 'num_vertices')
    motif_size = config.getint('Motifs', 'motif_size')
    walk_length = config.getint('Walks', 'walk_length')
    embeddings_dir = config.get('Embeddings', 'embeddings_dir')
    embed_dim = config.getint('Embeddings', 'embed_dim')
    learn_rate = config.getfloat('Embeddings', 'learn_rate')
    mini_batch_size = config.getint('Embeddings', 'mini_batch_size')
    sgd_steps = config.getint('Embeddings', 'sgd_steps')
    sgd_steps_per_epoch = config.getint('Embeddings', 'sgd_steps_per_epoch')
    assert (num_vertices > 0
            and motif_size > 0
            and walk_length >= motif_size
            and embeddings_dir
            and embed_dim > 0
            and mini_batch_size > 0
            and sgd_steps >= 0
            and sgd_steps_per_epoch > 0), \
        'invalid configuration for training embeddings'

    # Construct LBANN objects
    num_epochs = (sgd_steps + sgd_steps_per_epoch - 1) // sgd_steps_per_epoch
    trainer = lbann.Trainer(
        mini_batch_size=mini_batch_size,
        num_parallel_readers=0,
    )
    model_ = make_model(
        motif_size,
        walk_length,
        num_vertices,
        embed_dim,
        learn_rate,
        num_epochs,
        embeddings_dir,
    )
    optimizer = lbann.SGD(learn_rate=learn_rate)
    data_reader = make_data_reader()

    # Add LBANN invocation to batch script
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=model_,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    script.add_body_line('')
    script.add_body_line('# Train embeddings')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={prototext_file}',
        '--num_io_threads=1',
    ])
def set_up_experiment(args, input_, probs, labels):

    # Set up objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    l2_reg_weights = set()
    for l in layers:
        if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
            l2_reg_weights.update(l.weights)
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Set up model
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [
        lbann.Metric(top1, name='top-1 accuracy', unit='%'),
        lbann.Metric(top5, name='top-5 accuracy', unit='%')
    ]
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60], amt=0.1)
    ]
    model = lbann.Model(args.num_epochs,
                        layers=layers,
                        objective_function=objective_function,
                        metrics=metrics,
                        callbacks=callbacks)

    # Set up data reader
    data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes)

    # Set up optimizer
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True)
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Setup trainer
    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

    return trainer, model, data_reader, optimizer
                    metrics=metrics,
                    callbacks=callbacks)

# Setup optimizer
optimizer = lbann.contrib.args.create_optimizer(args)

# Setup data reader
data_reader = create_cosmoflow_data_reader(
    args.train_dir,
    args.val_dir,
    args.test_dir,
    num_responses=args.num_secrets)

# Setup trainer
random_seed_arg = {'random_seed': args.random_seed} \
    if args.random_seed is not None else {}
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size,
                        **random_seed_arg)

# Runtime parameters/arguments
environment = lbann.contrib.args.get_distconv_environment(
    num_io_partitions=args.depth_groups)
if args.dynamically_reclaim_error_signals:
    environment['LBANN_KEEP_ERROR_SIGNALS'] = 0
else:
    environment['LBANN_KEEP_ERROR_SIGNALS'] = 1
lbann_args = ['--use_data_store']

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer, model, data_reader,
def main():
    run_args = construct_lc_launcher_args()

    # Add data_config values to the run arguments, but do not overwrite
    # them if data_reader_prototext is enabled
    if os.path.isfile(run_args.data_config) and not run_args.data_reader_prototext:
        with open(run_args.data_config, "r") as f:
            config = json.load(f)
        for k, v in config.items():
            setattr(run_args, k, v)

    trainer = lbann.Trainer(
        run_args.batch_size,
        # name=None,
    )

    # Define data reader
    if run_args.data_reader_prototext:
        print("Using data_reader_prototext")
        assert run_args.sequence_length is not None
        assert run_args.vocab is not None
        data_reader_proto = lbann.lbann_pb2.LbannPB()
        with open(run_args.data_reader_prototext, "r") as f:
            txtf.Merge(f.read(), data_reader_proto)
        data_reader = data_reader_proto.data_reader
    else:
        data_reader = construct_data_reader(run_args)

    if "LBANN_EXPERIMENT_DIR" in os.environ:
        work_dir = os.environ["LBANN_EXPERIMENT_DIR"]
    else:
        work_dir = os.getcwd()
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(
        work_dir, "{}_{}".format(timestamp, run_args.job_name))
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    # Model and optimizer
    model = construct_model(run_args)
    opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8)

    # Dump the config to experiment_dir so it can be used to load the
    # model in PyTorch (MOSES codebase)
    ppn = 4 if run_args.scheduler == "lsf" else 2
    print("args:\n" + str(run_args))
    if run_args.scheduler == 'slurm':
        import torch
        torch.save(run_args,
                   "{}/{}_config.pt".format(experiment_dir, run_args.job_name))

    m_lbann_args = (
        "--load_model_weights_dir_is_complete"
        f" --load_model_weights_dir={run_args.dump_model_dir}"
        f" --vocab={run_args.vocab}"
        f" --num_samples={run_args.num_samples}"
        f" --sequence_length={run_args.sequence_length}"
        f" --num_io_threads={run_args.num_io_threads}"
        f" --no_header={run_args.no_header}"
        f" --delimiter={run_args.delimiter}")
    if run_args.data_reader_prototext:
        m_lbann_args = " ".join(
            (m_lbann_args, " --use_data_store --preload_data_store "))
    if run_args.procs_per_trainer:
        m_lbann_args = " ".join(
            (m_lbann_args, f"--procs_per_trainer={run_args.procs_per_trainer}"))

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader,
        opt,
        lbann_exe,
        partition=run_args.partition,
        scheduler=run_args.scheduler,
        account=run_args.account,
        time_limit=run_args.time_limit,
        nodes=run_args.nodes,
        procs_per_node=ppn,
        batch_job=True,
        # setup_only=True,
        job_name=run_args.job_name,
        experiment_dir=experiment_dir,
        lbann_args=m_lbann_args,
        # Turn on tensor cores
        environment={
            'LBANN_USE_CUBLAS_TENSOR_OPS': 1,
            'LBANN_USE_CUDNN_TENSOR_OPS': 1,
        },
    )
    print("LBANN launcher status:\n" + str(status))
num_epochs = 20
model = lbann.Model(num_epochs,
                    layers=lbann.traverse_layer_graph(input_),
                    objective_function=loss,
                    metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
                    callbacks=[
                        lbann.CallbackPrintModelDescription(),
                        lbann.CallbackPrint(),
                        lbann.CallbackTimer()
                    ])

# Setup optimizer
opt = lbann.SGD(learn_rate=0.01, momentum=0.9)

# Setup data reader
data_reader = data.mnist.make_data_reader()

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=mini_batch_size)

# ----------------------------------
# Run experiment
# ----------------------------------

kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer, model, data_reader, opt,
                           job_name=args.job_name,
                           **kwargs)
                    objective_function=obj,
                    metrics=metrics,
                    callbacks=callbacks,
                    summary_dir=".")

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)

# Setup data reader
num_classes = min(args.num_classes, num_labels)
if dataset == "cifar10":
    data_reader = cifar10.make_data_reader(num_classes=num_classes)
else:
    data_reader = imagenet.make_data_reader(num_classes=num_classes)

# Setup trainer
trainer = lbann.Trainer(random_seed=args.random_seed,
                        mini_batch_size=args.mini_batch_size)

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
kwargs['lbann_args'] = '--data_reader_percent=' + str(args.data_reader_percent)
lbann.contrib.launcher.run(trainer, model, data_reader, opt,
                           job_name=args.job_name,
                           **kwargs)
print("Random seed",random_seed) if mcr: print("Using Multi-channel rescaling") ### Create prefix for foldername now=datetime.now() fldr_name=now.strftime('%Y%m%d_%H%M%S') ## time format data_pct,val_ratio=1.0,0.1 # Percentage of data to use, % of data for validation batchsize=args.batchsize step_interval=args.step_interval print('Step interval',step_interval) work_dir="/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/512square/{0}_bsize{1}_{2}".format(fldr_name,batchsize,args.suffix) ##################### ### Run lbann trainer = lbann.Trainer(mini_batch_size=batchsize,random_seed=random_seed,callbacks=lbann.CallbackCheckpoint(checkpoint_dir='chkpt', checkpoint_epochs=10)) # checkpoint_steps=step_interval)) spectral_loss=args.spec_loss print("Spectral loss: ",spectral_loss) model = construct_model(num_epochs,mcr,spectral_loss=spectral_loss,save_batch_interval=int(step_interval)) #'step_interval*val_ratio' is the step interval for validation set. # Setup optimizer opt = lbann.Adam(learn_rate=args.learn_rate,beta1=0.5,beta2=0.99,eps=1e-8) # Load data reader from prototext data_reader = construct_data_reader(data_pct,val_ratio) status = lbann.run(trainer,model, data_reader, opt, nodes=num_nodes, procs_per_node=num_procs, work_dir=work_dir, scheduler='slurm', time_limit=1440, setup_only=False)
# ==============================================
# Setup and launch experiment
# ==============================================

# Default data reader
model_zoo_dir = dirname(dirname(abspath(__file__)))
data_reader_prototext = join(model_zoo_dir,
                             'data',
                             'jag_100Kdata.prototext')

if __name__ == '__main__':
    import lbann

    y_dim = 16399  # Image + scalar shape
    z_dim = 20     # Latent space dimension
    num_epochs = 100
    mini_batch_size = 128

    trainer = lbann.Trainer(mini_batch_size=mini_batch_size,
                            serialize_io=True)
    model = jag_models.construct_jag_wae_model(y_dim=y_dim,
                                               z_dim=z_dim,
                                               num_epochs=num_epochs)

    # Setup optimizer
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.99, eps=1e-8)

    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    with open(data_reader_prototext, 'r') as f:
        txtf.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader

    status = lbann.run(trainer, model, data_reader_proto, opt,
metrics = [lbann.Metric(obj, name="loss")]

model = lbann.Model(
    lbann_params.epochs,
    layers=lbann.traverse_layer_graph(input_),
    objective_function=obj,
    metrics=metrics,
    callbacks=[
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
    ],
)

# Setup trainer, optimizer, data_reader
trainer = lbann.Trainer(
    mini_batch_size=lbann_params.mini_batch_size,
    num_parallel_readers=1,
)
optimizer = lbann.Adam(
    learn_rate=0.01,
    beta1=0.9,
    beta2=0.99,
    eps=1e-8,
)
data_reader = make_data_reader()

# Launch LBANN
kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params)
kwargs["environment"] = {}
lbann.contrib.launcher.run(
    trainer,
    model,
fldr_name = now.strftime('%Y%m%d_%H%M%S')  # timestamp format
data_pct, val_ratio = 1.0, 0.1  # Fraction of data to use, fraction for validation
batchsize = gdict['batchsize']
work_dir = gdict['op_loc'] + "{0}_bsize{1}_{2}".format(
    fldr_name, batchsize, gdict['run_suffix'])
print(work_dir)
print(gdict)

#####################
### Run lbann
trainer = lbann.Trainer(
    mini_batch_size=batchsize,
    random_seed=random_seed,
    callbacks=lbann.CallbackCheckpoint(
        checkpoint_dir='chkpt',
        # checkpoint_epochs=10))
        checkpoint_steps=gdict['step_interval']))

spectral_loss = gdict['lambda_spec']
if spectral_loss:
    print("Using spectral loss with coupling", spectral_loss)

model = construct_model(
    num_epochs,
    gdict['mcr'],
    spectral_loss=spectral_loss,
    save_batch_interval=int(gdict['step_interval'])
)  # 'step_interval*val_ratio' is the step interval for the validation set.

# Setup optimizer
opt = lbann.Adam(learn_rate=gdict['learn_rate'],
print('Save interval', save_interval)

### Create folder name inside initial folder
now = datetime.now()
d1 = now.strftime('%Y%m%d_%H%M%S')  # timestamp format
strg = args.pretrained_dir
top_dir = strg.split('chkpt')[0]  # parent directory
# Add epoch-step info as folder suffix
suffix = strg.split('chkpt')[-1].split('/')[2].split('training')[-1][1:]
work_dir = top_dir + 'gen_imgs_chkpt/{0}_{1}'.format(suffix, d1)
print("Generating images at ", work_dir)
# os.makedirs(work_dir)

#####################
### Run lbann
trainer = lbann.Trainer(mini_batch_size=batchsize, random_seed=random_seed)
model = construct_model(num_epochs, mcr, save_batch_interval=save_interval)

# Setup optimizer
opt = lbann.Adam(learn_rate=0.0002, beta1=0.5, beta2=0.99, eps=1e-8)

# Load data reader from prototext
data_reader = construct_data_reader(data_pct, val_ratio)

### Initialize LBANN inference executable
lbann_exe = abspath(lbann.lbann_exe())
lbann_exe = join(dirname(lbann_exe), 'lbann_inf')
print('Loading model from :', args.pretrained_dir)

status = lbann.run(
    trainer,
    model,
    data_reader,
# Initialize LBANN executable and command-line arguments
lbann_exe = os.path.realpath(lbann.lbann_exe())
lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2')
lbann_command = [lbann_exe]

# Construct experiment directory
experiment_dir = util.make_experiment_dir(args.job_name)

# Export model prototext files
# Note: The lbann2 driver doesn't have a command-line argument to get
# the trainer.
file1 = os.path.join(experiment_dir, 'model1.prototext')
file2 = os.path.join(experiment_dir, 'model2.prototext')
lbann.proto.save_prototext(file1,
                           model=model1,
                           trainer=lbann.Trainer(mini_batch_size=512))
lbann.proto.save_prototext(file2,
                           model=model2,
                           trainer=lbann.Trainer(mini_batch_size=512))
lbann_command.append(f'--model={{{file1},{file2}}}')

# Export data reader prototext files
file1 = os.path.join(experiment_dir, 'reader1.prototext')
file2 = os.path.join(experiment_dir, 'reader2.prototext')
lbann.proto.save_prototext(file1, data_reader=reader1)
lbann.proto.save_prototext(file2, data_reader=reader2)
lbann_command.append(f'--reader={{{file1},{file2}}}')

# Export optimizer prototext files
file1 = os.path.join(experiment_dir, 'opt1.prototext')
file2 = os.path.join(experiment_dir, 'opt2.prototext')
            low_score_wins=True,
            exchange_hyperparameters=True))

    # Construct model
    return lbann.Model(args.num_epochs,
                       weights=weights,
                       serialize_io=True,
                       layers=layers,
                       metrics=metrics,
                       objective_function=obj,
                       callbacks=callbacks)


if __name__ == '__main__':
    import lbann

    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size,
                            procs_per_trainer=args.procs_per_trainer)
    model = construct_model()

    # Setup optimizer
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.99, eps=1e-8)

    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    with open(data_reader_prototext, 'r') as f:
        txtf.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader

    kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
    status = lbann.contrib.launcher.run(
        trainer, model, data_reader_proto, opt,
callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]
layers = list(lbann.traverse_layer_graph([images, responses]))
model = lbann.Model(args.num_epochs,
                    layers=layers,
                    metrics=metrics,
                    objective_function=mse,
                    callbacks=callbacks)

# Load data reader from prototext
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(data_reader_prototext, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

# Setup optimizer
opt = lbann.Adam(learn_rate=0.0002, beta1=0.9, beta2=0.99, eps=1e-8)

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer, model, data_reader_proto, opt,
                           lbann_args=" --use_data_store --preload_data_store",
                           job_name=args.job_name,
                           **kwargs)
# ----------------------------------
# Set up Experiment
# ----------------------------------

# Generate model
model = lbann.Model(num_epochs,
                    layers=layers,
                    objective_function=img_loss,
                    metrics=metrics,
                    callbacks=[print_model, training_output,
                               gpu_usage, encoded_output])

# Optimizer
opt = lbann.Adam(learn_rate=1e-2, beta1=0.9, beta2=0.99, eps=1e-8)

data_reader = MOFae.make_data_reader()

# Trainer
trainer = lbann.Trainer(mini_batch_size=mini_batch_size, name="MOF_AE_1")

# ----------------------------------
# Run Experiment
# ----------------------------------

kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer, model, data_reader, opt, **kwargs)
    l.datatype = lbann.DataType.DOUBLE
    for w in l.weights:
        w.datatype = lbann.DataType.DOUBLE

# ----------------------------------
# Run LBANN
# ----------------------------------

# Create optimizer
opt = lbann.SGD(learn_rate=args.learning_rate)

# Create LBANN objects
iterations_per_epoch = utils.ceildiv(epoch_size, args.mini_batch_size)
num_epochs = utils.ceildiv(args.num_iterations, iterations_per_epoch)
trainer = lbann.Trainer(
    mini_batch_size=args.mini_batch_size,
    num_parallel_readers=0,
)
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDumpWeights(
        directory='embeddings',
        epoch_interval=num_epochs,
        format='distributed_binary',
    ),
]
model = lbann.Model(
    num_epochs,
    layers=lbann.traverse_layer_graph(input_),
    objective_function=obj,
    metrics=metrics,
kfac_args["update_interval_steps"] = args.kfac_update_interval_steps kfac_args["compute_interval"] = args.kfac_compute_interval_steps algo = lbann.KFAC("kfac", algo, **kfac_args) # Setup model model = lbann.Model(args.num_epochs, layers=lbann.traverse_layer_graph(input_), objective_function=obj, metrics=metrics, callbacks=callbacks) # Setup optimizer opt = lbann.contrib.args.create_optimizer(args) # Setup trainer trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, training_algo=algo) # Setup environment variables environment = {"LBANN_KEEP_ERROR_SIGNALS": 1} # ---------------------------------- # Run experiment # ---------------------------------- kwargs = lbann.contrib.args.get_scheduler_kwargs(args) lbann.contrib.launcher.run(trainer, model, data_reader, opt, job_name=args.job_name, environment=environment, batch_job=args.batch_job,