Example #1

import os

import numpy as np
import pandas as pd
import tensorflow as tf

# The project-specific components used below (AutoencoderFwdInv, FilePaths,
# load_thermal_fin_data, form_train_val_test_batches, optimize,
# optimize_distributed, and the loss_autoencoder, loss_encoder,
# loss_model_augmented and relative_error callables) are assumed to be
# imported at module level from the surrounding codebase.

def trainer(hyperp, run_options, file_paths):
    #=== GPU Settings ===# Set this first: TF2 claims every visible GPU by
    # default and may clash with GPUs already in use if CUDA_VISIBLE_DEVICES
    # has not yet been restricted
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if run_options.use_distributed_training == 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = run_options.which_gpu
        GLOBAL_BATCH_SIZE = hyperp.batch_size
    elif run_options.use_distributed_training == 1:
        os.environ["CUDA_VISIBLE_DEVICES"] = run_options.dist_which_gpus
        gpus = tf.config.experimental.list_physical_devices('GPU')
        # Scale by the number of visible GPUs; len(gpus) is used because the
        # strategy is not yet constructed at this point, and it sidesteps the
        # core-dump issue seen with dist_strategy.num_replicas_in_sync
        GLOBAL_BATCH_SIZE = hyperp.batch_size * len(gpus)
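        # e.g. a per-GPU batch of 64 on 4 visible GPUs gives a global batch of
        # 256, which tf.distribute splits back into 64 examples per replica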

    #=== Load Data ===#
    obs_indices, parameter_train, state_obs_train,\
    parameter_test, state_obs_test,\
    data_input_shape, parameter_dimension\
    = load_thermal_fin_data(file_paths, run_options.num_data_train, run_options.num_data_test, run_options.parameter_dimensions)

    #=== Construct Validation Set and Batches ===#
    parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
    run_options.num_data_train, num_data_val, run_options.num_data_test,\
    num_batches_train, num_batches_val, num_batches_test\
    = form_train_val_test_batches(parameter_train, state_obs_train, parameter_test, state_obs_test, GLOBAL_BATCH_SIZE, run_options.random_seed)

    #=== Data and Latent Dimensions of Autoencoder ===#
    data_dimension = parameter_dimension
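    # The latent dimension matches the observation dimension: the full state
    # for 'full' data, or only the obs_indices entries for 'bnd' (presumably
    # boundary) data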
    if hyperp.data_type == 'full':
        latent_dimension = run_options.full_domain_dimensions
    elif hyperp.data_type == 'bnd':
        latent_dimension = len(obs_indices)

    #=== Non-distributed Training ===#
    if run_options.use_distributed_training == 0:
        #=== Neural Network ===#
        NN = AutoencoderFwdInv(hyperp, data_dimension, latent_dimension)

        #=== Training ===#
        storage_array_loss_train, storage_array_loss_train_autoencoder, storage_array_loss_train_forward_problem, storage_array_loss_train_model_augmented,\
        storage_array_loss_val, storage_array_loss_val_autoencoder, storage_array_loss_val_forward_problem, storage_array_loss_val_model_augmented,\
        storage_array_loss_test, storage_array_loss_test_autoencoder, storage_array_loss_test_forward_problem, storage_array_loss_test_model_augmented,\
        storage_array_relative_error_parameter_autoencoder, storage_array_relative_error_state_obs, storage_array_relative_error_parameter_inverse_problem\
        = optimize(hyperp, run_options, file_paths, NN, obs_indices, loss_autoencoder, loss_encoder, loss_model_augmented, relative_error,\
                   parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
                   parameter_dimension, num_batches_train)

    #=== Distributed Training ===#
    elif run_options.use_distributed_training == 1:
        dist_strategy = tf.distribute.MirroredStrategy()
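        # Model variables must be created inside the strategy's scope so that
        # the network weights are mirrored across all participating GPUs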
        with dist_strategy.scope():
            #=== Neural Network ===#
            NN = AutoencoderFwdInv(hyperp, data_dimension, latent_dimension)

        #=== Training ===#
        storage_array_loss_train, storage_array_loss_train_autoencoder, storage_array_loss_train_forward_problem, storage_array_loss_train_model_augmented,\
        storage_array_loss_val, storage_array_loss_val_autoencoder, storage_array_loss_val_forward_problem, storage_array_loss_val_model_augmented,\
        storage_array_loss_test, storage_array_loss_test_autoencoder, storage_array_loss_test_forward_problem, storage_array_loss_test_model_augmented,\
        storage_array_relative_error_parameter_autoencoder, storage_array_relative_error_state_obs, storage_array_relative_error_parameter_inverse_problem\
        = optimize_distributed(dist_strategy, GLOBAL_BATCH_SIZE,
                               hyperp, run_options, file_paths, NN, obs_indices, loss_autoencoder, loss_model_augmented, relative_error,\
                               parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
                               parameter_dimension, num_batches_train)

    #=== Saving Metrics ===#
    metrics_dict = {}
    metrics_dict['loss_train'] = storage_array_loss_train
    metrics_dict['loss_train_autoencoder'] = storage_array_loss_train_autoencoder
    metrics_dict['loss_train_forward_problem'] = storage_array_loss_train_forward_problem
    metrics_dict['loss_train_model_augmented'] = storage_array_loss_train_model_augmented
    metrics_dict['loss_val'] = storage_array_loss_val
    metrics_dict['loss_val_autoencoder'] = storage_array_loss_val_autoencoder
    metrics_dict['loss_val_forward_problem'] = storage_array_loss_val_forward_problem
    metrics_dict['loss_val_model_augmented'] = storage_array_loss_val_model_augmented
    metrics_dict['relative_error_parameter_autoencoder'] = storage_array_relative_error_parameter_autoencoder
    metrics_dict['relative_error_state_obs'] = storage_array_relative_error_state_obs
    metrics_dict['relative_error_parameter_inverse_problem'] = storage_array_relative_error_parameter_inverse_problem
    df_metrics = pd.DataFrame(metrics_dict)
    df_metrics.to_csv(file_paths.NN_savefile_name + '_metrics.csv', index=False)
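
    #=== Objective Functional (for hyperparameter search) ===#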
    def objective_functional(**hyperp_of_interest_objective_args_tuple):
        #=== Assign Hyperparameters of Interest ===#
        for key, val in hyperp_of_interest_objective_args_tuple.items():
            setattr(hyperp, key, val)
        hyperp.truncation_layer = int(np.ceil(hyperp.num_hidden_layers / 2))
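        # i.e. the latent (truncation) layer sits at the middle hidden layer;
        # e.g. 7 hidden layers place it at layer 4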

        #=== Update File Paths with New Hyperparameters ===#
        file_paths = FilePaths(hyperp, run_options)

        #=== Construct Validation Set and Batches ===#
        if run_options.use_distributed_training == 0:
            GLOBAL_BATCH_SIZE = hyperp.batch_size
        elif run_options.use_distributed_training == 1:
            # As above, len(gpus) sidesteps the core-dump issue seen with
            # dist_strategy.num_replicas_in_sync
            GLOBAL_BATCH_SIZE = hyperp.batch_size * len(gpus)
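        # Batches are re-formed on each trial since batch_size may itself be
        # one of the hyperparameters being tuned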
        parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
        run_options.num_data_train, num_data_val, run_options.num_data_test,\
        num_batches_train, num_batches_val, num_batches_test\
        = form_train_val_test_batches(parameter_train, state_obs_train, parameter_test, state_obs_test, GLOBAL_BATCH_SIZE, run_options.random_seed)

        #=== Non-distributed Training ===#
        if run_options.use_distributed_training == 0:
            #=== Neural Network ===#
            NN = AutoencoderFwdInv(hyperp, parameter_dimension,
                                   run_options.full_domain_dimensions,
                                   obs_indices)

            #=== Training ===#
            storage_array_loss_train, storage_array_loss_train_autoencoder, storage_array_loss_train_forward_problem,\
            storage_array_loss_val, storage_array_loss_val_autoencoder, storage_array_loss_val_forward_problem,\
            storage_array_loss_test, storage_array_loss_test_autoencoder, storage_array_loss_test_forward_problem,\
            storage_array_relative_error_parameter_autoencoder, storage_array_relative_error_state_obs, storage_array_relative_error_parameter_inverse_problem\
            = optimize(hyperp, run_options, file_paths, NN, loss_autoencoder, loss_encoder, relative_error,\
                       parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
                       parameter_dimension, num_batches_train)

        #=== Distributed Training ===#
        elif run_options.use_distributed_training == 1:
            dist_strategy = tf.distribute.MirroredStrategy()
            GLOBAL_BATCH_SIZE = hyperp.batch_size * dist_strategy.num_replicas_in_sync
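            # For MirroredStrategy over the visible devices,
            # num_replicas_in_sync equals len(gpus), so this agrees with the
            # global batch size used to form the batches above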
            with dist_strategy.scope():
                #=== Neural Network ===#
                NN = AutoencoderFwdInv(hyperp, parameter_dimension,
                                       run_options.full_domain_dimensions,
                                       obs_indices)

                #=== Training ===#
                storage_array_loss_train, storage_array_loss_train_autoencoder, storage_array_loss_train_forward_problem,\
                storage_array_loss_val, storage_array_loss_val_autoencoder, storage_array_loss_val_forward_problem,\
                storage_array_loss_test, storage_array_loss_test_autoencoder, storage_array_loss_test_forward_problem,\
                storage_array_relative_error_parameter_autoencoder, storage_array_relative_error_state_obs, storage_array_relative_error_parameter_inverse_problem\
                = optimize_distributed(dist_strategy, GLOBAL_BATCH_SIZE, hyperp, run_options, file_paths, NN, loss_autoencoder, loss_encoder, relative_error,\
                                       parameter_and_state_obs_train, parameter_and_state_obs_val, parameter_and_state_obs_test,\
                                       parameter_dimension, num_batches_train)

        #=== Saving Metrics ===#
        metrics_dict = {}
        metrics_dict['loss_train'] = storage_array_loss_train
        metrics_dict['loss_train_autoencoder'] = storage_array_loss_train_autoencoder
        metrics_dict['loss_train_forward_problem'] = storage_array_loss_train_forward_problem
        metrics_dict['loss_val'] = storage_array_loss_val
        metrics_dict['loss_val_autoencoder'] = storage_array_loss_val_autoencoder
        metrics_dict['loss_val_forward_problem'] = storage_array_loss_val_forward_problem
        metrics_dict['relative_error_parameter_autoencoder'] = storage_array_relative_error_parameter_autoencoder
        metrics_dict['relative_error_state_obs'] = storage_array_relative_error_state_obs
        metrics_dict['relative_error_parameter_inverse_problem'] = storage_array_relative_error_parameter_inverse_problem
        df_metrics = pd.DataFrame(metrics_dict)
        df_metrics.to_csv(file_paths.NN_savefile_name + '_metrics.csv', index=False)

        return storage_array_loss_val[-1]
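
The keyword-only signature of objective_functional matches the calling convention of scikit-optimize's use_named_args decorator, and its return value is the final validation loss, so it can serve as the target of a Bayesian hyperparameter search. Below is a minimal, hypothetical sketch of such a driver: the search dimensions, their ranges, and the name skopt_objective are illustrative assumptions (only num_hidden_layers and batch_size appear in the example above), and objective_functional is assumed to be visible in the calling scope.

from skopt import gp_minimize
from skopt.space import Integer
from skopt.utils import use_named_args

# Hypothetical search space; the ranges are placeholders
hyperp_of_interest = [Integer(3, 10, name='num_hidden_layers'),
                      Integer(16, 256, name='batch_size')]

@use_named_args(hyperp_of_interest)
def skopt_objective(**hyperp_of_interest_objective_args_tuple):
    # Each call retrains the network with the proposed hyperparameters and
    # returns the final validation loss
    return objective_functional(**hyperp_of_interest_objective_args_tuple)

result = gp_minimize(skopt_objective, hyperp_of_interest,
                     n_calls=20, random_state=0)
print(result.x, result.fun)  # best hyperparameters and their validation loss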