def run_model(data_set, kmer_size, norm_input, encoding_dim_1, encoding_dim_2,
              encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
              batch_size, n_splits, n_repeats, compute_informative_features,
              plot_iteration, graph_dir, outFile):
    """Train and evaluate a two-layer supervised model on every fold of a run.

    For each fold listed in the loader's fold structure a fresh model is
    built, fitted on the fold's training rows, evaluated on its held-out
    rows, and the per-fold statistics are collected. After the last fold the
    statistics are aggregated by ``stats_utils``.

    Args:
        data_set: Forwarded to ``load_kmer_cnts_jf.load_single_disease`` and
            to the parameter-summary formatter.
        kmer_size: Forwarded to the loader and to the summary formatter.
        norm_input: When truthy, each fold's train/test matrices are passed
            through ``stats_utils.standardize_data`` before fitting.
        encoding_dim_1: Forwarded to
            ``deep_learning_models.create_supervised_model_2layers``.
        encoding_dim_2: Forwarded to the model factory.
        encoded_activation: Forwarded to the model factory.
        input_dropout_pct: Forwarded to the model factory.
        dropout_pct: Forwarded to the model factory.
        num_epochs: Passed to ``model.fit`` as ``epochs``.
        batch_size: Passed to ``model.fit`` as ``batch_size``.
        n_splits: Forwarded to the loader and to the fold aggregation.
        n_repeats: Forwarded to the loader.
        compute_informative_features: When truthy, SHAP values are computed
            for every fold and aggregated at the end.
        plot_iteration: Only forwarded to the summary formatter here.
        graph_dir: Not used by this function.
        outFile: Forwarded to
            ``stats_utils.aggregate_statistics_across_folds`` (presumably the
            output destination -- confirm against that helper).

    Returns:
        None. Results are emitted by the ``stats_utils`` aggregation helpers.
    """
    # Format strings for outputting the parameters associated with this run.
    summary_string, plotting_string = stats_utils.format_input_parameters_printing_2layers(
        data_set, kmer_size, norm_input, encoding_dim_1, encoding_dim_2,
        encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
        batch_size, n_splits, n_repeats, compute_informative_features,
        plot_iteration)

    outFile_header = 'data_set\tkmer_size\tnorm_input\tencoding_dim_1\tencoding_dim_2\tencoded_activation\tinput_dropout_pct\tdropout_pct\tnum_epochs\tbatch_size\tn_splits\tn_repeats\t'

    #################
    # Load the data #
    #################
    print('Loading data...')

    # rskf = repeated stratified k fold. rskf[0][i] / rskf[1][i] are the
    # train / test indices of the i-th fold-by-iteration combination.
    data_normalized, labels, rskf = load_kmer_cnts_jf.load_single_disease(
        data_set, kmer_size, n_splits, n_repeats, precomputed_kfolds=False)

    # Number of input features (kmers); identical for every fold, so it is
    # computed once instead of once per iteration.
    input_dim = len(data_normalized[0])

    ##################################################
    # Iterate through the data kfolds and iterations #
    ##################################################

    # key = n_repeat, value = dictionary with the statistics of that fold
    aggregated_statistics = {}

    for n_repeat in range(len(rskf[0])):

        print('Iteration %s...' % n_repeat)

        aggregated_statistics[n_repeat] = {}

        train_idx = rskf[0][n_repeat]
        test_idx = rskf[1][n_repeat]
        x_train, y_train = data_normalized[train_idx], labels[train_idx]
        x_test, y_test = data_normalized[test_idx], labels[test_idx]

        # Standardize the data (original note: mean=0, std=1).
        if norm_input:
            x_train, x_test = stats_utils.standardize_data(x_train, x_test)

        ########################################
        # Set up a model (supervised learning) #
        ########################################
        # The model has to be instantiated for every fold, otherwise the
        # weights would not start from scratch.
        model = deep_learning_models.create_supervised_model_2layers(
            input_dim, encoding_dim_1, encoding_dim_2, encoded_activation,
            input_dropout_pct, dropout_pct)

        ##################################################
        # Fit the model with the train data of this fold #
        ##################################################
        # The callback records per-epoch metrics; inspect them with
        # history.history.keys().
        history = History()

        # Train with the caller-supplied batch_size so the run matches the
        # batch_size reported in summary_string and outFile_header.
        model.fit(x_train,
                  y_train,
                  epochs=num_epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  validation_data=(x_test, y_test),
                  verbose=0,
                  callbacks=[history])

        # Predict using the held-out data.
        y_pred = model.predict(x_test)

        # TODO: save the weights of this model.

        ##############################
        # Compute summary statistics #
        ##############################
        # Store the results of this fold in aggregated_statistics.
        aggregated_statistics = stats_utils.compute_summary_statistics(
            y_test, y_pred, history, aggregated_statistics, n_repeat)

        # Per-fold plots (roc, accuracy/loss vs epoch, confusion matrix,
        # precision-recall) are intentionally not produced here: that would
        # generate a lot of graphs.
        if compute_informative_features:
            shap_values, shap_values_summed = stats_utils.compute_shap_values_deeplearning(
                input_dim, model, x_test)
            aggregated_statistics[n_repeat][
                'shap_values_summed'] = shap_values_summed
            aggregated_statistics[n_repeat]['shap_values'] = shap_values

    ##############################################
    # Aggregate the results from all the k-folds #
    # Print and plot                             #
    ##############################################
    print('Aggregating statistics across iterations and printing/plotting...')

    stats_utils.aggregate_statistics_across_folds(aggregated_statistics, rskf,
                                                  n_splits, outFile,
                                                  summary_string,
                                                  plotting_string,
                                                  outFile_header)

    ###################
    # Aggregate shap: #
    ###################

    if compute_informative_features:
        print('Computing informative features with Shap...')
        stats_utils.aggregate_shap(aggregated_statistics, rskf)
# Example no. 2
# 0
def run_model(data_set, kmer_size, norm_input, encoding_dim,
              encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
              batch_size, n_splits, n_repeats, compute_informative_features,
              plot_iteration, graph_dir, outFile):
    """Fit a dropout autoencoder on every fold and aggregate its statistics.

    A new autoencoder is created for each fold in the loader's fold
    structure, trained to reconstruct the fold's training rows (validated on
    the held-out rows), and the training history is summarised per fold. The
    per-fold summaries are aggregated by ``stats_utils`` at the end.

    Args:
        data_set: Only forwarded to the parameter-summary formatter here.
        kmer_size: Forwarded to ``load_kmer_cnts_jf.load_all_autoencoder``
            and to the summary formatter.
        norm_input: When truthy, each fold's train/test matrices go through
            ``stats_utils.standardize_data`` before fitting.
        encoding_dim: Forwarded to
            ``deep_learning_models.create_autoencoder_dropout``.
        encoded_activation: Forwarded to the model factory.
        input_dropout_pct: Forwarded to the model factory.
        dropout_pct: Forwarded to the model factory.
        num_epochs: Passed to ``model.fit`` as ``epochs``.
        batch_size: Passed to ``model.fit`` as ``batch_size``.
        n_splits: Forwarded to the loader and to the fold aggregation.
        n_repeats: Forwarded to the loader.
        compute_informative_features: Only forwarded to the summary
            formatter here.
        plot_iteration: Only forwarded to the summary formatter here.
        graph_dir: Not used by this function.
        outFile: Forwarded to
            ``stats_utils.aggregate_statistics_across_folds_autoencoder``.

    Returns:
        None.
    """
    # Strings describing the parameters of this run, used when reporting.
    summary_string, plotting_string = stats_utils.format_input_parameters_printing(
        data_set, kmer_size, norm_input, encoding_dim, encoded_activation,
        input_dropout_pct, dropout_pct, num_epochs, batch_size, n_splits,
        n_repeats, compute_informative_features, plot_iteration)

    # ---- Load the data ----
    print('Loading data...')

    # rskf (repeated stratified k fold): rskf[0][i] and rskf[1][i] are the
    # train and test indices of the i-th fold-by-iteration combination.
    data_normalized, labels, rskf = load_kmer_cnts_jf.load_all_autoencoder(
        kmer_size, n_splits, n_repeats, precomputed_kfolds=False)

    # Activation handed to the model factory for the decoding side.
    decoded_activation = 'softmax'

    # fold index -> dictionary with that fold's statistics
    aggregated_statistics = {}

    # ---- Iterate over every fold / repeat combination ----
    for fold in range(len(rskf[0])):

        print('Iteration %s...' % fold)

        aggregated_statistics[fold] = {}

        train_idx = rskf[0][fold]
        test_idx = rskf[1][fold]

        # Labels are sliced alongside the features but are not fed to the
        # autoencoder, which is trained to reproduce its own input.
        x_train = data_normalized[train_idx]
        y_train = labels[train_idx]
        x_test = data_normalized[test_idx]
        y_test = labels[test_idx]

        # Optional standardisation (original note: mean=0, std=1).
        if norm_input:
            x_train, x_test = stats_utils.standardize_data(x_train, x_test)

        # Number of input kmers.
        input_dim = len(data_normalized[0])

        # A fresh model per fold, so the weights start from scratch each time.
        model = deep_learning_models.create_autoencoder_dropout(
            encoding_dim, input_dim, encoded_activation, decoded_activation,
            input_dropout_pct, dropout_pct)

        # The callback collects per-epoch metrics; its keys can be listed
        # with history.history.keys().
        history = History()
        held_out_pair = (x_test, x_test)

        model.fit(x_train, x_train,
                  validation_data=held_out_pair,
                  epochs=num_epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  callbacks=[history],
                  verbose=0)

        # TODO: save the weights of this model.

        # Record this fold's summary statistics.
        aggregated_statistics = stats_utils.compute_summary_statistics_autoencoder(
            history, aggregated_statistics, fold)

    # ---- Aggregate across all folds, then print and plot ----
    print('Aggregating statistics across iterations and printing/plotting...')

    stats_utils.aggregate_statistics_across_folds_autoencoder(
        aggregated_statistics, rskf, n_splits, outFile, summary_string,
        plotting_string)