def train_discrete_voxel_vae(config, kwargs, num_epochs=100):
    """Train the discrete voxel VAE and return ``(model, checkpoint_dir)``."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    graph_dataset = build_dataset()
    # The model consumes graphs only, so strip the paired image here.
    graph_dataset = graph_dataset.map(lambda graphs, images: (graphs, ))

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(graph_dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=graph_dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          debug=False)
    return train_one_epoch.model, checkpoint_dir
def main(data_dir, config):
    """Build train/test datasets from *data_dir* and run the vanilla training loop
    under a distribution strategy built from *config*."""
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    # Debug aid: print a single example graph from the test set.
    for (graph, img, c) in iter(test_dataset):
        print(graph)
        break

    with strategy.scope():
        train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    # Fix: create the checkpoint directory before writing config.json.
    # Every sibling entry point in this file calls os.makedirs first; without
    # it, open() raises FileNotFoundError on a fresh run.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=3,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def main(data_dir, config, kwargs):
    """Train with batch size 4 on data under *data_dir*; logs/checkpoints go to
    the ``new_im_16_*`` directories."""
    # Make strategy at the start of your main before any other tf code is run.
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1,
    #                                      memory_limit=None)
    train_dataset = build_dataset(os.path.join(data_dir, 'train'), batch_size=4)
    test_dataset = build_dataset(os.path.join(data_dir, 'test'), batch_size=4)

    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)
    # Starting temperature for the model (annealing schedule handled elsewhere,
    # presumably — confirm against the model implementation).
    train_one_epoch.model.set_temperature(10.)

    checkpoint_dir = build_checkpoint_dir('new_im_16_checkpointing', config)
    log_dir = build_log_dir('new_im_16_log_dir', config)
    save_model_dir = os.path.join('new_im_16_saved_models')

    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=100,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
def train_disc_img_vae(data_dir, config, kwargs):
    """Train the discrete image VAE on tfrecord files found under *data_dir*."""
    # strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))
    print(f'Number of training tfrecord files : {len(train_records)}')
    print(f'Number of test tfrecord files : {len(test_records)}')
    print(f'Total : {len(train_records) + len(test_records)}')

    train_dataset = build_dataset(train_records, batch_size=4)
    test_dataset = build_dataset(test_records, batch_size=4)

    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    # log_dir = build_log_dir('test_log_dir', config)
    # checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = 'test_log_dir'
    checkpoint_dir = 'test_checkpointing'
    save_dir = 'saved_model'

    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_dir,
                          debug=True)
def train_identify_medium(data_dir, config):
    """Train the identify-medium model on graph tuples batched to 32."""
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))
    print(f'Number of training tfrecord files : {len(train_records)}')
    print(f'Number of test tfrecord files : {len(test_records)}')
    print(f'Total : {len(train_records) + len(test_records)}')

    train_dataset = build_dataset(train_records)
    test_dataset = build_dataset(test_records)
    # GraphTuple-aware batching; all graphs are assumed the same size.
    train_dataset = batch_dataset_set_graph_tuples(all_graphs_same_size=True,
                                                   dataset=train_dataset,
                                                   batch_size=32)
    test_dataset = batch_dataset_set_graph_tuples(all_graphs_same_size=True,
                                                  dataset=test_dataset,
                                                  batch_size=32)

    train_one_epoch = build_training(**config)
    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=20,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def main(data_dir, config, kwargs):
    """Train under a CPU distribution strategy with beta pinned to 0."""
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'), batch_size=4)
    test_dataset = build_dataset(os.path.join(data_dir, 'test'), batch_size=4)
    # for (graph, positions) in iter(test_dataset):
    #     print(graph)
    #     break

    with strategy.scope():
        train_one_epoch = build_training(**config, **kwargs)
    # Disable the KL term at the start of training (beta = 0).
    train_one_epoch.model.set_beta(0.)

    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = build_log_dir('test_log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)
    save_model_dir = os.path.join('saved_models')

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
def train_auto_regressive_prior(config, kwargs, num_epochs=100):
    """Train the autoregressive prior with the two pretrained VAEs frozen.

    Returns ``(model, checkpoint_dir)``.
    """
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)
    dataset = build_dataset()

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    # Freeze both pretrained VAEs: optimise only variables that do not belong
    # to either of them (matched by variable name).
    frozen_names = set(
        variable.name for variable in kwargs['discrete_image_vae'].trainable_variables)
    frozen_names.update(
        variable.name for variable in kwargs['discrete_voxel_vae'].trainable_variables)
    trainable_variables = [variable
                           for variable in train_one_epoch.model.trainable_variables
                           if variable.name not in frozen_names]

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
    return train_one_epoch.model, checkpoint_dir
def main(data_dir, batch_size, config, kwargs):
    """Entry point: build (optionally distributed) datasets and train the model.

    *batch_size* is the global batch size when a strategy is used.
    """
    import contextlib  # local: only needed for the optional strategy scope

    # Make strategy at the start of your main before any other tf code is run.
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1,
    #                                      memory_limit=None)
    strategy = None

    if strategy is not None:
        train_dataset = build_distributed_dataset(os.path.join(data_dir, 'train'),
                                                  global_batch_size=batch_size,
                                                  strategy=strategy)
        test_dataset = build_distributed_dataset(os.path.join(data_dir, 'test'),
                                                 global_batch_size=batch_size,
                                                 strategy=strategy)
    else:
        train_dataset = build_dataset(os.path.join(data_dir, 'train'), batch_size=batch_size)
        test_dataset = build_dataset(os.path.join(data_dir, 'test'), batch_size=batch_size)

    # for (graph, positions) in iter(test_dataset):
    #     print(graph)
    #     break

    # Fix: the original duplicated the identical build_training(...) call in both
    # branches of `if strategy is not None`; a null context removes the duplication
    # while preserving the strategy scope when one exists.
    scope = strategy.scope() if strategy is not None else contextlib.nullcontext()
    with scope:
        train_one_epoch = build_training(**config, **kwargs, strategy=strategy)

    train_one_epoch.model.set_temperature(10.)
    train_one_epoch.model.set_beta(6.6)

    log_dir = build_log_dir('simple_complete_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('simple_complete_checkpointing', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as f:
        json.dump(config, f)
    save_model_dir = os.path.join('simple_complete_saved_models')

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=100,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=save_model_dir,
                          debug=False)
def train_VQVAE(data_dir):
    """Train a VQ-VAE on images from tfrecords under *data_dir*.

    NOTE(review): a later function in this file re-defines ``train_VQVAE``,
    shadowing this one — confirm which is intended.
    """
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    train_dataset = build_dataset(train_records)
    test_dataset = build_dataset(test_records)
    # Keep only the image channel and batch.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    # with strategy.scope():
    autoencoder_depth = 6
    model = VectorQuantizerVariationalAutoEncoder(embedding_dim=2 * 2**autoencoder_depth,
                                                  num_embeddings=1024,
                                                  kernel_size=4,
                                                  num_layers=autoencoder_depth,
                                                  num_residual_layers=2)
    learning_rate = 1e-6
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        # Reconstruction MSE against a blurred target, plus the VQ commitment loss.
        (img, ) = batch
        vq_loss, decoded_img = model_outputs
        print('im shape', img.shape)
        print('dec im shape', decoded_img.shape)
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        # ))
        # The 12-pixel crop presumably matches the decoder's padding — confirm.
        reconstruction_loss = tf.reduce_mean(
            (gaussian_filter2d(img, filter_shape=[6, 6])
             - decoded_img[:, 12:-12, 12:-12, :])**2)
        return reconstruction_loss + vq_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)
    log_dir = 'vqvae2_log_dir'
    checkpoint_dir = 'vqvae2_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_VQVAE(data_dir):
    """Train a VQ-VAE (32-dim embeddings, 1024 codes) on images under *data_dir*.

    NOTE(review): this re-defines ``train_VQVAE`` and shadows the earlier
    definition in this file — confirm which is intended.
    """
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))
    # Keep only the image channel and batch.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    # with strategy.scope():
    model = VectorQuantizerVariationalAutoEncoder(embedding_dim=32,
                                                  num_embeddings=1024,
                                                  kernel_size=4)
    learning_rate = 1e-4
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        # Plain reconstruction MSE plus the VQ loss; both logged to summaries.
        (img, ) = batch
        vq_loss, decoded_img = model_outputs
        print('im shape', img.shape)
        print('dec im shape', decoded_img.shape)
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        # ))
        reconstruction_loss = tf.reduce_mean((img - decoded_img[:, :, :, :])**2)
        tf.summary.scalar('reconstruction loss', reconstruction_loss, step=model.step)
        tf.summary.scalar('vq_loss', vq_loss, step=model.step)
        return reconstruction_loss + vq_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)
    log_dir = 'VQVAE_log_dir_16_1024'
    checkpoint_dir = 'VQVAE_checkpointing_16_1024'
    model_dir = 'trained_VAE_model_16_1024'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=10000,
                          early_stop_patience=10000,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False,
                          save_model_dir=model_dir)
def train_ae_3d(data_dir, config):
    """Train the 3D autoencoder on tfrecord graphs under *data_dir*.

    NOTE(review): a later function in this file re-defines ``train_ae_3d``,
    shadowing this one — confirm which is intended. A commented-out
    checkpoint-restore / evaluation-dump block was removed here; recover it
    from version control if needed.
    """
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))
    print(f'Number of training tfrecord files : {len(train_records)}')
    print(f'Number of test tfrecord files : {len(test_records)}')
    print(f'Total : {len(train_records) + len(test_records)}')

    train_dataset = build_dataset(train_records)
    test_dataset = build_dataset(test_records)

    train_one_epoch = build_training(**config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = build_log_dir('test_log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=100,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_variational_autoencoder(data_dir):
    """Train a (Gaussian) VAE on images from tfrecords under *data_dir*."""
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))

    train_dataset = build_dataset(train_records)
    test_dataset = build_dataset(test_records)
    # Keep only the image channel and batch.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    # with strategy.scope():
    model = VariationalAutoEncoder(n_latent=4, kernel_size=4)
    learning_rate = 1e-3
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        # ELBO-style loss: blurred-target reconstruction MSE + KL divergence.
        # NOTE(review): the KL formula treats `std` as a log-variance — confirm
        # against the encoder's output convention.
        (img, ) = batch
        mn, std, z, decoded_img = model_outputs
        # reconstruction_loss = tf.reduce_mean(tf.reduce_sum(
        #     keras.losses.binary_crossentropy(img, decoded_img), axis=(1, 2)
        # ))
        reconstruction_loss = tf.reduce_mean(
            (gaussian_filter2d(img, filter_shape=[6, 6])
             - decoded_img[:, 12:-12, 12:-12, :])**2)
        kl_loss = -0.5 * (1 + std - tf.square(mn) - tf.exp(std))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
        total_loss = reconstruction_loss + kl_loss
        print(f"recon_loss = {reconstruction_loss}")
        print(f"kl_loss = {kl_loss}")
        return total_loss

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)
    log_dir = 'VAE_log_dir'
    checkpoint_dir = 'VAE_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_auto_regressive_prior(data_dir, batch_size, config, kwargs, num_epochs=100):
    """Train the autoregressive prior on *data_dir*, freezing both VAEs."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    # dataset = build_example_dataset(1000, batch_size=2, num_blobs=5, num_nodes=64 ** 3, image_dim=256)
    dataset = build_dataset(data_dir, batch_size)
    # The model calls grid_graphs internally to learn the 3D autoencoder; see
    # train_discrete_voxel_vae for a visualisation of what that produces.

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    # Freeze both pretrained VAEs: optimise only variables that do not belong
    # to either of them (matched by variable name).
    frozen_names = set(
        variable.name for variable in kwargs['discrete_image_vae'].trainable_variables)
    frozen_names.update(
        variable.name for variable in kwargs['discrete_voxel_vae'].trainable_variables)
    trainable_variables = [variable
                           for variable in train_one_epoch.model.trainable_variables
                           if variable.name not in frozen_names]

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
def train_discrete_voxel_vae(config, kwargs):
    """Smoke-train the discrete voxel VAE for one epoch on a synthetic dataset,
    visualising one example image and its voxelisation first."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    dataset = build_example_dataset(100, batch_size=2, num_blobs=3, num_nodes=64**3, image_dim=256)

    # The model will call grid_graphs internally to learn the 3D autoencoder;
    # show here what that produces from one batch of graphs.
    for graphs, image in iter(dataset):
        assert image.numpy().shape == (2, 256, 256, 1)
        plt.imshow(image[0].numpy())
        plt.colorbar()
        plt.show()
        voxels = grid_graphs(graphs, 64)
        assert voxels.numpy().shape == (2, 64, 64, 64, 1)
        plt.imshow(tf.reduce_mean(voxels[0], axis=-2))
        plt.colorbar()
        plt.show()
        break

    # The model consumes graphs only, so strip the paired image.
    dataset = dataset.map(lambda graphs, images: (graphs, ))

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=1,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          debug=False)
def train_discrete_image_vae(data_dirs, config, model_kwargs, batch_size=1, num_epochs=100):
    """Train the discrete image VAE; returns ``(model, checkpoint_dir)``."""
    print('\n')
    train_one_epoch = build_training(**config, **model_kwargs)

    train_dataset = build_dataset(data_dirs, batch_size=batch_size, train_test_dir='train')
    test_dataset = build_dataset(data_dirs, batch_size=batch_size, train_test_dir='test')
    print(f'Number of epochs: {num_epochs}')
    print('Training discrete image VAE\n')

    # The model consumes images only, so strip the paired voxels.
    train_dataset = train_dataset.map(lambda voxels, images: (images, ))
    test_dataset = test_dataset.map(lambda voxels, images: (images, ))

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(train_dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=20,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=train_one_epoch.model.trainable_variables,
                          debug=False)
    return train_one_epoch.model, checkpoint_dir
def train_auto_regressive_prior(data_dirs, config, model_kwargs, batch_size=1, num_epochs=100):
    """Train the autoregressive prior over the two frozen pretrained VAEs."""
    print('\n')
    train_one_epoch = build_training(**config, **model_kwargs)

    train_dataset = build_dataset(data_dirs, batch_size=batch_size, train_test_dir='train')
    test_dataset = build_dataset(data_dirs, batch_size=batch_size, train_test_dir='test')
    print(f'Number of epochs: {num_epochs}')
    print('Training autoregressive prior\n')

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(train_dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    # Freeze both pretrained VAEs: optimise only variables that do not belong
    # to either of them (matched by variable name).
    frozen_names = set(
        variable.name for variable in model_kwargs['discrete_image_vae'].trainable_variables)
    frozen_names.update(
        variable.name for variable in model_kwargs['discrete_voxel_vae'].trainable_variables)
    trainable_variables = [variable
                           for variable in train_one_epoch.model.trainable_variables
                           if variable.name not in frozen_names]

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=num_epochs,
                          early_stop_patience=40,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=trainable_variables,
                          debug=False)
def train_autoencoder(data_dir):
    """Train a plain autoencoder on images under *data_dir*.

    NOTE(review): a later function in this file re-defines ``train_autoencoder``,
    shadowing this one — confirm which is intended.
    """
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))
    # Keep only the image channel and batch.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    # with strategy.scope():
    model = AutoEncoder()
    learning_rate = 1.0e-5
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        # Scaled reconstruction MSE. The factor 100 presumably compensates for
        # small pixel magnitudes — confirm against the data scaling.
        (img, ) = batch
        decoded_img = model_outputs
        # return tf.reduce_mean((gaussian_filter2d(img, filter_shape=[6, 6]) - decoded_img[:, :, :, :]) ** 2)
        return 100 * tf.reduce_mean((img - decoded_img[:, :, :, :])**2)

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)
    log_dir = 'autoencoder_log_dir'
    checkpoint_dir = 'autoencoder_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=1000,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_discrete_image_vae(config, kwargs):
    """Smoke-train the discrete image VAE for one epoch on a synthetic dataset,
    visualising one example image first."""
    # with strategy.scope():
    train_one_epoch = build_training(**config, **kwargs)

    dataset = build_example_dataset(10, batch_size=2, num_blobs=3, num_nodes=64**3, image_dim=256)

    # Show one example image from the synthetic dataset.
    for graphs, image in iter(dataset):
        assert image.numpy().shape == (2, 256, 256, 1)
        plt.imshow(image[0].numpy())
        plt.colorbar()
        plt.show()
        break

    # The model consumes images only, so strip the paired graph.
    dataset = dataset.map(lambda graphs, images: (images, ))

    # One forward pass so every variable gets a concrete shape before training.
    for first_batch in iter(dataset):
        train_one_epoch.model(*first_batch)
        break

    checkpoint_dir = build_checkpoint_dir('checkpointing', config)
    log_dir = build_log_dir('log_dir', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=dataset,
                          num_epochs=1,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=checkpoint_dir,
                          variables=train_one_epoch.model.trainable_variables,
                          debug=False)
def test_vanillia_training_loop():
    """Smoke test: a tiny MLP regression run through the vanilla training loop."""
    import sonnet as snt

    class Model(AbstractModule):
        def __init__(self, name=None):
            super(Model, self).__init__(name=name)
            self.net = snt.nets.MLP([10, 1], activate_final=False)

        def _build(self, batch):
            (inputs, _) = batch
            return self.net(inputs)

    def loss(model_output, batch):
        # Mean squared error against the random regression target.
        (_, target) = batch
        return tf.reduce_mean((target - model_output)**2)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.random.normal((100, 5)), tf.random.normal((100, 1)))).batch(10)
    training = TrainOneEpoch(Model(), loss, snt.optimizers.Adam(1e-4))

    # Fix: the original passed (dataset, training, 100) positionally — the
    # opposite order from every other call site in this file, which all pass
    # train_one_epoch first / by keyword. Use keywords so the argument binding
    # is unambiguous and consistent.
    vanilla_training_loop(train_one_epoch=training,
                          training_dataset=dataset,
                          num_epochs=100,
                          debug=False)
def train_autoencoder(data_dir):
    """Train an autoencoder (kernel size 4) on images from tfrecords.

    NOTE(review): this re-defines ``train_autoencoder`` and shadows the earlier
    definition in this file — confirm which is intended.
    """
    train_records = glob.glob(os.path.join(data_dir, 'train', '*.tfrecords'))
    test_records = glob.glob(os.path.join(data_dir, 'test', '*.tfrecords'))
    print(f'Number of training tfrecord files : {len(train_records)}')
    print(f'Number of test tfrecord files : {len(test_records)}')
    print(f'Total : {len(train_records) + len(test_records)}')

    train_dataset = build_dataset(train_records)
    test_dataset = build_dataset(test_records)
    # Keep only the image channel and batch.
    train_dataset = train_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)
    test_dataset = test_dataset.map(lambda graph, img, c: (img, )).batch(batch_size=32)

    model = AutoEncoder(kernel_size=4)
    learning_rate = 1e-5
    opt = snt.optimizers.Adam(learning_rate)

    def loss(model_outputs, batch):
        # MSE against a blurred target; the 12-pixel crop presumably matches
        # the decoder's padding — confirm against the model.
        (img, ) = batch
        decoded_img = model_outputs
        return tf.reduce_mean(
            (gaussian_filter2d(img, filter_shape=[6, 6])
             - decoded_img[:, 12:-12, 12:-12, :])**2)

    train_one_epoch = TrainOneEpoch(model, loss, opt, strategy=None)
    log_dir = 'autoencoder_log_dir'
    checkpoint_dir = 'autoencoder_checkpointing'

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=50,
                          early_stop_patience=5,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_identify_medium(data_dir, config):
    """Train the identify-medium model under a GPU distribution strategy.

    NOTE(review): this re-defines ``train_identify_medium`` and shadows the
    earlier definition in this file — confirm which is intended.
    """
    # Make strategy at the start of your main before any other tf code is run.
    strategy = get_distribution_strategy(use_cpus=False,
                                         logical_per_physical_factor=1,
                                         memory_limit=11000)

    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))

    print('\nEXAMPLE FROM TEST DATASET:')
    for (graph, img, c) in iter(train_dataset):
        print(img)
        print('max: ', tf.math.reduce_max(img))
        break

    with strategy.scope():
        train_one_epoch = build_training(**config)

    log_dir = build_log_dir('test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    if checkpoint_dir is not None:
        # originally from vanilla_training_loop
        os.makedirs(checkpoint_dir, exist_ok=True)
        # checkpoint_dir not yet created
        with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
            json.dump(config, config_file)

    print('\nvanilla training loop...')
    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_ae_3d(data_dir, config):
    """Train the 3D autoencoder, restoring from the latest checkpoint if present.

    NOTE(review): this re-defines ``train_ae_3d`` and shadows the earlier
    definition in this file — confirm which is intended. A commented-out
    evaluation-dump block was removed here; recover it from version control
    if needed.
    """
    # strategy = get_distribution_strategy(use_cpus=False, logical_per_physical_factor=1, memory_limit=10000)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))
    # train_dataset = train_dataset.map(lambda graph, img, c: (graph,))
    # test_dataset = test_dataset.map(lambda graph, img, c: (graph,))

    train_one_epoch = build_training(**config)

    log_dir = build_log_dir('new_test_log_dir', config)
    checkpoint_dir = build_checkpoint_dir('new_test_checkpointing', config)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    # Resume from the most recent checkpoint when one exists.
    checkpoint = tf.train.Checkpoint(module=train_one_epoch)
    manager = tf.train.CheckpointManager(checkpoint,
                                         checkpoint_dir,
                                         max_to_keep=3,
                                         checkpoint_name=train_one_epoch.model.__class__.__name__)
    if manager.latest_checkpoint is not None:
        checkpoint.restore(manager.latest_checkpoint)
        print(f"Restored from {manager.latest_checkpoint}")

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=1000,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          debug=False)
def train_disc_graph_vae(data_dir, config):
    """Train the discrete graph VAE with a small, hard-coded model configuration
    (reduced sizes — see inline comments for the presumably intended values)."""
    # strategy = get_distribution_strategy(use_cpus=True, logical_per_physical_factor=1)

    # lists containing tfrecord files
    train_dataset = build_dataset(os.path.join(data_dir, 'train'))
    test_dataset = build_dataset(os.path.join(data_dir, 'test'))
    # train_dataset = train_dataset.map(lambda graph, img, c: (graph,))
    # test_dataset = test_dataset.map(lambda graph, img, c: (graph,))

    encoder_kwargs = dict(inter_graph_connect_prob=0.01,
                          reducer=tf.math.unsorted_segment_mean,
                          starting_global_size=4,
                          node_size=4,  # 64
                          edge_size=4,
                          crossing_steps=1,
                          name=None)
    decode_kwargs = dict(inter_graph_connect_prob=0.01,
                         reducer=tf.math.unsorted_segment_mean,
                         starting_global_size=4,
                         node_size=4,  # 64
                         edge_size=4,
                         crossing_steps=1,
                         name=None)
    model_parameters = dict(encoder_fn=EncoderNetwork3D,
                            decode_fn=DecoderNetwork3D,
                            embedding_dim=4,  # 64
                            num_embedding=4,  # 64
                            num_gaussian_components=4,  # 128
                            num_token_samples=1,
                            num_properties=8,
                            temperature=50.,
                            beta=1.,
                            encoder_kwargs=encoder_kwargs,
                            decode_kwargs=decode_kwargs,
                            name=None)
    train_one_epoch = build_training(model_parameters=model_parameters, **config)

    # log_dir = build_log_dir('test_log_dir', config)
    # checkpoint_dir = build_checkpoint_dir('test_checkpointing', config)
    log_dir = 'dVAE_log_dir'
    checkpoint_dir = 'dVAE_checkpointing'
    model_dir = 'dVAE_model'

    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'config.json'), 'w') as config_file:
        json.dump(config, config_file)

    vanilla_training_loop(train_one_epoch=train_one_epoch,
                          training_dataset=train_dataset,
                          test_dataset=test_dataset,
                          num_epochs=100,
                          early_stop_patience=10,
                          checkpoint_dir=checkpoint_dir,
                          log_dir=log_dir,
                          save_model_dir=model_dir,
                          debug=False)