def make_batch_script(trainer_params, model_params, script_params):
    """Assemble an LBANN experiment and return the batch script that runs it.

    The trainer, model, data reader and optimizer are created, serialized to
    ``experiment.prototext`` inside ``script_params['work_dir']``, and a
    batch script that launches LBANN on that prototext file is built.

    Args:
        trainer_params: Object exposing a ``mini_batch_size`` attribute.
        model_params: Mapping expanded into keyword arguments for
            ``make_model``.
        script_params: Mapping expanded into keyword arguments for
            ``lbann.contrib.launcher.make_batch_script``. Must contain a
            ``'work_dir'`` entry.

    Returns:
        The batch script object created by the LBANN launcher.
    """
    # LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer plus a stepwise learning-rate schedule.
    # Note: the schedule is a rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    lr_schedule = (
        ([1], 2),
        ([2, 4, 8, 12], 0.75),
    )
    for epochs, factor in lr_schedule:
        model.callbacks.append(
            lbann.CallbackDropFixedLearningRate(
                drop_epoch=epochs,
                amt=factor,
            ))

    work_dir = script_params['work_dir']

    # Checkpoint the trainer every epoch
    checkpoint_cb = lbann.CallbackCheckpoint(
        checkpoint_dir=os.path.join(work_dir, 'checkpoint'),
        checkpoint_epochs=1,
    )
    trainer.callbacks.append(checkpoint_cb)

    # Dump the model weights every epoch
    dump_cb = lbann.CallbackDumpWeights(
        basename=os.path.join(work_dir, 'weights'),
        epoch_interval=1,
    )
    model.callbacks.append(dump_cb)

    # Serialize the experiment to a Protobuf text file
    protobuf_file = os.path.join(work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Batch script: run LBANN, then exit with LBANN's own status so the
    # trailing "Finished" echo does not mask a failure.
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    launch_cmd = [
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ]
    script.add_parallel_command(launch_cmd)
    for shell_line in (
        'status=$?',
        'echo "Finished training at $(date)"',
        'exit ${status}',
    ):
        script.add_command(shell_line)
    return script
def make_model(
    motif_size,
    walk_length,
    num_vertices,
    embed_dim,
    learn_rate,
    num_epochs,
    embeddings_dir,
):
    """Build the CommunityGAN LBANN model.

    The ``samples`` input is sliced at ``[0, motif_size,
    motif_size + walk_length]`` and fed to ``model.gan.CommunityGAN`` as
    motif indices and walk indices. Every layer and weight reachable from
    the input is switched to double precision.

    Args:
        motif_size: Position of the first slice boundary; also forwarded to
            the GAN.
        walk_length: Width of the second slice; also forwarded to the GAN.
        num_vertices: Forwarded to ``CommunityGAN``.
        embed_dim: Forwarded to ``CommunityGAN``.
        learn_rate: Forwarded to ``CommunityGAN``.
        num_epochs: Epoch count of the model; also used as the weight-dump
            interval.
        embeddings_dir: Directory handed to ``CallbackDumpWeights``.

    Returns:
        The assembled ``lbann.Model``.
    """
    # Layer graph
    samples = lbann.Input(data_field='samples')
    boundaries = [0, motif_size, motif_size + walk_length]
    input_ = lbann.Slice(samples, slice_points=str_list(boundaries))
    motif_indices = lbann.Identity(input_)
    walk_indices = lbann.Identity(input_)
    gan = model.gan.CommunityGAN(
        num_vertices,
        motif_size,
        embed_dim,
        learn_rate,
    )
    loss, real_disc_prob, fake_disc_prob, gen_prob = gan(
        motif_indices,
        motif_size,
        walk_indices,
        walk_length,
    )

    # Metrics
    metric_specs = (
        (real_disc_prob, 'D(real)'),
        (fake_disc_prob, 'D(fake)'),
        (gen_prob, 'G'),
    )
    metrics = [lbann.Metric(layer, name=label) for layer, label in metric_specs]

    # Callbacks
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDumpWeights(
            directory=embeddings_dir,
            epoch_interval=num_epochs,
        ),
    ]

    # Run every layer and every weight in double precision
    for layer in lbann.traverse_layer_graph(input_):
        layer.datatype = lbann.DataType.DOUBLE
        for weight in layer.weights:
            weight.datatype = lbann.DataType.DOUBLE

    # Construct model
    return lbann.Model(
        num_epochs,
        layers=lbann.traverse_layer_graph(input_),
        objective_function=loss,
        metrics=metrics,
        callbacks=callbacks,
    )
def make_model(
    data_dim,
    latent_dim,
    num_epochs,
):
    """Build a fully-connected autoencoder LBANN model.

    The ``samples`` input is passed through
    ``model.autoencoder.FullyConnectedAutoencoder``; the mean squared error
    between the input and its reconstruction serves both as the objective
    function and as the ``loss`` metric.

    Args:
        data_dim: First argument of ``FullyConnectedAutoencoder``.
        latent_dim: Second argument of ``FullyConnectedAutoencoder``.
        num_epochs: Epoch count of the model; also used as the weight-dump
            interval.

    Returns:
        The assembled ``lbann.Model``.
    """
    # Layer graph
    data = lbann.Input(data_field='samples')
    autoencoder = model.autoencoder.FullyConnectedAutoencoder(
        data_dim,
        latent_dim,
    )
    reconstructed = autoencoder(data)
    loss = lbann.MeanSquaredError(data, reconstructed)

    # Metrics
    metrics = [lbann.Metric(loss, name='loss')]

    # Callbacks
    weight_dump = lbann.CallbackDumpWeights(
        directory='weights',
        epoch_interval=num_epochs,
    )
    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer(), weight_dump]

    # Construct model
    return lbann.Model(
        num_epochs,
        layers=lbann.traverse_layer_graph(loss),
        objective_function=loss,
        metrics=metrics,
        callbacks=callbacks,
    )
# Create optimizer opt = lbann.SGD(learn_rate=args.learning_rate) # Create LBANN objects iterations_per_epoch = utils.ceildiv(epoch_size, args.mini_batch_size) num_epochs = utils.ceildiv(args.num_iterations, iterations_per_epoch) trainer = lbann.Trainer( mini_batch_size=args.mini_batch_size, num_parallel_readers=0, ) callbacks = [ lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackDumpWeights( directory='embeddings', epoch_interval=num_epochs, format='distributed_binary', ), ] model = lbann.Model( num_epochs, layers=lbann.traverse_layer_graph(input_), objective_function=obj, metrics=metrics, callbacks=callbacks, ) # Create batch script kwargs = lbann.contrib.args.get_scheduler_kwargs(args) script = lbann.contrib.launcher.make_batch_script( job_name=args.job_name,
def construct_model(run_args):
    """Construct LBANN model.

    Initial model for ATOM molecular VAE.

    Args:
        run_args: Namespace-like object. The attributes read here are
            ``pad_index``, ``sequence_length``, ``embedding_dim``,
            ``num_embeddings``, ``dump_weights_interval``,
            ``dump_weights_dir``, ``ltfb``, ``weights_to_send``,
            ``ltfb_batch_interval``, ``warmup``, ``lr``, ``batch_size`` and
            ``num_epochs``.

    Returns:
        The assembled ``lbann.Model``.
    """
    import lbann

    pad_index = run_args.pad_index
    assert pad_index is not None

    sequence_length = run_args.sequence_length
    assert sequence_length is not None

    print("sequence length is {}".format(sequence_length))

    # NOTE(review): never read in this function.
    data_layout = "data_parallel"

    # Layer graph
    raw_input = lbann.Input(name='inp', target_mode="N/A")
    input_ = lbann.Identity(raw_input, name='inp1')
    input_feature_dims = sequence_length

    embedding_size = run_args.embedding_dim
    dictionary_size = run_args.num_embeddings
    assert embedding_size is not None
    assert dictionary_size is not None

    vae = molvae.MolVAE(
        input_feature_dims,
        dictionary_size,
        embedding_size,
        pad_index,
    )
    kl, recon = vae(input_)
    vae_loss = [kl, recon]

    print("LEN vae loss ", len(vae_loss))

    layers = list(lbann.traverse_layer_graph(input_))

    # Setup objective function
    weights = {w for layer in layers for w in layer.weights}
    # NOTE(review): this regularization term is built but is not part of
    # the objective below -- confirm whether that is intentional.
    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4)
    obj = lbann.ObjectiveFunction(vae_loss)

    # Metrics reported for the two loss terms
    metrics = [
        lbann.Metric(kl, name='kl_loss'),
        lbann.Metric(recon, name='recon'),
    ]

    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]

    if run_args.dump_weights_interval > 0:
        callbacks.append(
            lbann.CallbackDumpWeights(
                directory=run_args.dump_weights_dir,
                epoch_interval=run_args.dump_weights_interval,
            ))

    if run_args.ltfb:
        # Hack for Merlin empty string: 'All' maps to '', which is a
        # substring of every weight name, so every weight is selected.
        if run_args.weights_to_send == 'All':
            send_name = ''
        else:
            send_name = run_args.weights_to_send
        weights_to_ex = [w.name for w in weights if send_name in w.name]
        print("LTFB Weights to exchange ", weights_to_ex)
        callbacks.append(
            lbann.CallbackLTFB(
                batch_interval=run_args.ltfb_batch_interval,
                metric='recon',
                weights=list2str(weights_to_ex),
                low_score_wins=True,
                exchange_hyperparameters=True,
            ))

    if run_args.warmup:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(
                target=run_args.lr / 512 * run_args.batch_size,
                num_epochs=5,
            ))

    # Construct model
    return lbann.Model(
        run_args.num_epochs,
        weights=weights,
        layers=layers,
        objective_function=obj,
        metrics=metrics,
        callbacks=callbacks,
    )
# ----------------------------------
# Run LBANN
# ----------------------------------

# Optimizer.
# A negative ``args.learning_rate`` selects the default: the learning rate
# of the original word2vec (0.025) multiplied by the mini-batch size.
learning_rate = (
    0.025 * args.mini_batch_size
    if args.learning_rate < 0
    else args.learning_rate
)
opt = lbann.SGD(learn_rate=learning_rate)

# LBANN objects.
# The weight-dump interval equals the epoch count of the model.
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDumpWeights(
        basename='embeddings',
        epoch_interval=args.num_epochs,
    ),
]
model = lbann.Model(
    args.num_epochs,
    layers=lbann.traverse_layer_graph(input_),
    objective_function=obj,
    callbacks=callbacks,
)

# Launch the experiment; scheduler options are derived from the
# command-line arguments.
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(
    trainer,
    model,
    reader,
    opt,
    job_name=args.job_name,
    work_dir=args.work_dir,
    overwrite_script=True,
    **kwargs,
)
# ---------------------------------- # Create optimizer opt = lbann.SGD(learn_rate=args.learning_rate) # Create LBANN objects iterations_per_epoch = utils.ceildiv(epoch_size, args.mini_batch_size) num_epochs = utils.ceildiv(args.num_iterations, iterations_per_epoch) trainer = lbann.Trainer( mini_batch_size=args.mini_batch_size, num_parallel_readers=0, ) callbacks = [ lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackDumpWeights(directory='weights', epoch_interval=num_epochs), ] model = lbann.Model( num_epochs, layers=lbann.traverse_layer_graph(input_), objective_function=obj, metrics=metrics, callbacks=callbacks, ) # Create batch script kwargs = lbann.contrib.args.get_scheduler_kwargs(args) script = lbann.contrib.launcher.make_batch_script( job_name=args.job_name, work_dir=args.work_dir, **kwargs,
# ---------------------------------- # Create optimizer opt = lbann.SGD(learn_rate=args.learning_rate) # Create LBANN objects iterations_per_epoch = utils.ceildiv(epoch_size, args.mini_batch_size) num_epochs = utils.ceildiv(args.num_iterations, iterations_per_epoch) trainer = lbann.Trainer( mini_batch_size=args.mini_batch_size, num_parallel_readers=0, ) callbacks = [ lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackDumpWeights(directory='embeddings', epoch_interval=num_epochs), ] model = lbann.Model( num_epochs, layers=lbann.traverse_layer_graph(input_), objective_function=obj, metrics=metrics, callbacks=callbacks, ) # Create batch script kwargs = lbann.contrib.args.get_scheduler_kwargs(args) script = lbann.contrib.launcher.make_batch_script( job_name=args.job_name, work_dir=args.work_dir, **kwargs,
def construct_model(run_args):
    """Construct LBANN model.

    Initial model for ATOM molecular SMILES generation.
    Network architecture and training hyperparameters from
    https://github.com/samadejacobs/moses/tree/master/moses/char_rnn

    The input is sliced into ``sequence_length`` positions. For each
    position ``i`` but the last, position ``i`` is embedded, passed through
    a GRU and a fully-connected module, and compared with position
    ``i + 1`` by cross entropy. Steps whose input equals ``pad_index`` are
    masked out of the loss.

    Args:
        run_args: Namespace-like object. The attributes read here are
            ``pad_index``, ``sequence_length``, ``embedding_dim``,
            ``num_embeddings``, ``hidden``, ``step_size``, ``gamma``,
            ``dump_weights_dir`` and ``num_epochs``.

    Returns:
        The assembled ``lbann.Model``.
    """
    pad_index = run_args.pad_index
    assert pad_index is not None

    sequence_length = run_args.sequence_length
    assert sequence_length is not None

    print("sequence length is {}".format(sequence_length))

    data_layout = "data_parallel"

    # Layer graph
    _input = lbann.Input(name="inp_tensor", data_field='samples')
    print(sequence_length)
    slice_points = str_list(range(sequence_length + 1))
    x_slice = lbann.Slice(
        _input,
        axis=0,
        slice_points=slice_points,
        name="inp_slice",
    )

    # Embedding layer
    # NOTE(review): `emb` is never read in this function.
    emb = []
    embedding_dim = run_args.embedding_dim
    num_embeddings = run_args.num_embeddings
    assert embedding_dim is not None
    assert num_embeddings is not None

    # One embedding matrix shared by every time step
    emb_weights = lbann.Weights(
        initializer=lbann.NormalInitializer(mean=0, standard_deviation=1),
        name="emb_matrix",
    )

    gru = lbann.modules.GRU(size=run_args.hidden, data_layout=data_layout)
    fc = lbann.modules.FullyConnectedModule(
        size=num_embeddings,
        data_layout=data_layout,
    )

    # Initial recurrent state
    initial_state = lbann.Constant(
        value=0.0,
        num_neurons="{}".format(run_args.hidden),
        data_layout=data_layout,
        name="lstm_init_output",
    )
    prev_state = [initial_state]

    # One identity layer per sequence position
    steps = [
        lbann.Identity(x_slice, name="slice_idl_" + str(i), device="CPU")
        for i in range(sequence_length)
    ]

    loss = []
    for i, (current, following) in enumerate(zip(steps, steps[1:])):
        tag = str(i)
        emb_l = lbann.Embedding(
            current,
            name="emb_" + tag,
            weights=emb_weights,
            embedding_dim=embedding_dim,
            num_embeddings=num_embeddings,
        )

        x, prev_state = gru(emb_l, prev_state)
        fc_l = fc(x)
        y_soft = lbann.Softmax(fc_l, name="soft_" + tag)
        gt = lbann.OneHot(following, size=num_embeddings)
        ce = lbann.CrossEntropy([y_soft, gt], name="loss_" + tag)

        # Mask padding in input
        pad_value = lbann.Constant(value=pad_index, num_neurons="1")
        pad_mask = lbann.NotEqual([current, pad_value])
        ce_mask = lbann.Multiply([pad_mask, ce], name="loss_mask_" + tag)

        # Each step contributes an equal share of the objective
        loss.append(lbann.LayerTerm(ce_mask, scale=1 / (sequence_length - 1)))

    layers = list(lbann.traverse_layer_graph(_input))

    # Setup objective function
    weights = {w for layer in layers for w in layer.weights}
    obj = lbann.ObjectiveFunction(loss)

    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackStepLearningRate(
            step=run_args.step_size,
            amt=run_args.gamma,
        ),
        lbann.CallbackDumpWeights(
            directory=run_args.dump_weights_dir,
            epoch_interval=1,
        ),
    ]

    # Construct model
    return lbann.Model(
        run_args.num_epochs,
        layers=layers,
        weights=weights,
        objective_function=obj,
        callbacks=callbacks,
    )
def construct_model(run_args):
    """Construct LBANN model.

    Initial model for ATOM molecular VAE.

    The network is built by ``molwae.MolWAE`` from the input layer and a
    128-wide Gaussian noise layer. The objective combines the
    reconstruction term, three sigmoid binary cross-entropy terms on the
    discriminator outputs and an L2 weight regularization term.

    Args:
        run_args: Namespace-like object. The attributes read here are
            ``pad_index``, ``sequence_length``, ``embedding_dim``,
            ``num_embeddings``, ``dump_outputs_dir``,
            ``dump_outputs_interval``, ``dump_weights_interval``,
            ``dump_weights_dir``, ``dump_model_dir``, ``ltfb``,
            ``weights_to_send``, ``ltfb_batch_interval``, ``warmup``,
            ``lr``, ``batch_size`` and ``num_epochs``.

    Returns:
        The assembled ``lbann.Model``.
    """
    import lbann

    pad_index = run_args.pad_index
    assert pad_index is not None

    sequence_length = run_args.sequence_length
    assert sequence_length is not None

    print("sequence length is {}".format(sequence_length))

    # NOTE(review): never read in this function.
    data_layout = "data_parallel"

    # Layer graph
    raw_input = lbann.Input(name='inp', target_mode="N/A")
    input_ = lbann.Identity(raw_input, name='inp1')
    input_feature_dims = sequence_length

    embedding_size = run_args.embedding_dim
    dictionary_size = run_args.num_embeddings
    assert embedding_size is not None
    assert dictionary_size is not None

    save_output = bool(run_args.dump_outputs_dir)
    print("save output? ", save_output, "out dir ", run_args.dump_outputs_dir)

    z = lbann.Gaussian(mean=0.0, stdev=1.0, neuron_dims="128")
    wae = molwae.MolWAE(
        input_feature_dims,
        dictionary_size,
        embedding_size,
        pad_index,
        save_output,
    )
    recon, d1_real, d1_fake, d_adv, arg_max = wae(input_, z)

    # Discriminator targets: 1 for real and adversarial outputs, 0 for fake
    zero = lbann.Constant(value=0.0, num_neurons='1', name='zero')
    one = lbann.Constant(value=1.0, num_neurons='1', name='one')

    d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real, one],
                                                  name='d1_real_bce')
    d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake, zero],
                                                  name='d1_fake_bce')
    d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv, one],
                                                name='d_adv_bce')

    layers = list(lbann.traverse_layer_graph(input_))

    # Setup objective function
    weights = set()
    src_layers = []
    dst_layers = []
    for layer in layers:
        if layer.weights:
            if "disc0" in layer.name and "instance1" in layer.name:
                src_layers.append(layer.name)
            # Freeze the weights of every layer whose name contains "disc1".
            # NOTE(review): the original comment said "disc2" while the
            # code matches "disc1" -- confirm which one is intended.
            if "disc1" in layer.name:
                dst_layers.append(layer.name)
                for frozen in layer.weights:
                    frozen.optimizer = lbann.NoOptimizer()
        weights.update(layer.weights)

    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4)

    vae_loss = [recon, d1_real_bce, d_adv_bce, d1_fake_bce, l2_reg]
    print("LEN vae loss ", len(vae_loss))

    obj = lbann.ObjectiveFunction(vae_loss)

    # Metrics
    metrics = [
        lbann.Metric(d_adv_bce, name='adv_loss'),
        lbann.Metric(recon, name='recon'),
    ]

    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]

    if run_args.dump_weights_interval > 0:
        callbacks.append(
            lbann.CallbackDumpWeights(
                directory=run_args.dump_weights_dir,
                epoch_interval=run_args.dump_weights_interval,
            ))

    if run_args.ltfb:
        # Hack for Merlin empty string: 'All' maps to '', which is a
        # substring of every weight name, so every weight is selected.
        if run_args.weights_to_send == 'All':
            send_name = ''
        else:
            send_name = run_args.weights_to_send
        weights_to_ex = [w.name for w in weights if send_name in w.name]
        print("LTFB Weights to exchange ", weights_to_ex)
        callbacks.append(
            lbann.CallbackLTFB(
                batch_interval=run_args.ltfb_batch_interval,
                metric='recon',
                weights=list2str(weights_to_ex),
                low_score_wins=True,
                exchange_hyperparameters=True,
            ))

    # Copy weights from the collected source layers to the frozen
    # destination layers every second batch.
    callbacks.append(
        lbann.CallbackReplaceWeights(
            source_layers=list2str(src_layers),
            destination_layers=list2str(dst_layers),
            batch_interval=2,
        ))

    # Dump final weight for inference
    if run_args.dump_model_dir:
        callbacks.append(lbann.CallbackSaveModel(dir=run_args.dump_model_dir))

    # Dump output (activation) for post processing
    if run_args.dump_outputs_dir:
        pred_tensor = lbann.Concatenation(arg_max, name='pred_tensor')
        callbacks.append(
            lbann.CallbackDumpOutputs(
                batch_interval=run_args.dump_outputs_interval,
                execution_modes='test',
                directory=run_args.dump_outputs_dir,
                layers='inp pred_tensor',
            ))

    if run_args.warmup:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(
                target=run_args.lr / 512 * run_args.batch_size,
                num_epochs=5,
            ))

    # Construct model
    return lbann.Model(
        run_args.num_epochs,
        weights=weights,
        layers=layers,
        objective_function=obj,
        metrics=metrics,
        callbacks=callbacks,
    )