Example #1
            range(len(df_plot.index)), df_plot.accuracy, df_plot.accuracy)
    ]

    fig: plt.Figure

    if save_fig:
        fig.tight_layout()
        fig.savefig(f"paper/figures/test_set_score_{mname}")
    else:
        plt.show()


if __name__ == '__main__':
    datasets = ['GLYLIP', 'DNA_Rec_Int', 'halo', 'ANIPLA']
    use_mmseqs_cluster = True
    y, y_test, data, data_seq, X = enct.load_data(
        datasets[0], use_mmseqs_cluster=use_mmseqs_cluster)
    use_mmseqs_cluster = 'mmseqs' if use_mmseqs_cluster else ""
    maxlen = data['length'].max()
    Mname = f'{"_".join(sorted(data.type.unique()))}_{len(data)}' \
            f'{use_mmseqs_cluster}'
    model_results = load_trained_model(
        model_path=f'code_modules/saved_models/gridsearchmodel070319{Mname}')
    plot_cv_results(in_cvresult=model_results, mname=Mname, save_fig=True)

    # Get results on test set
    X_test = data_seq.loc[y_test.index]
    x_dump_location = f"data/uniprot/X_{Mname}"
    if (os.path.exists(f"{x_dump_location}.dump")
            and os.path.exists(f"{x_dump_location}_test.dump")):
        encoded_xs = joblib.load(f'{x_dump_location}.dump')
        encoded_xs_test = joblib.load(f'{x_dump_location}_test.dump')
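The excerpt ends inside the caching branch. For context, a minimal sketch of
the load-or-encode pattern such a branch typically completes (the encode
callable below is a hypothetical stand-in, not the thesis code):

import joblib


def load_or_encode(x, x_test, dump_location, encode):
    # Reuse cached encodings when both dumps exist; otherwise compute
    # and cache them for the next run
    try:
        encoded = joblib.load(f"{dump_location}.dump")
        encoded_test = joblib.load(f"{dump_location}_test.dump")
    except FileNotFoundError:
        encoded, encoded_test = encode(x), encode(x_test)
        joblib.dump(encoded, f"{dump_location}.dump")
        joblib.dump(encoded_test, f"{dump_location}_test.dump")
    return encoded, encoded_test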
Example #2
import importlib
import random

import numpy as np

import code_modules.encoding.encoding_testing_functions as enc
import code_modules.word2vec_imputation.impute as imp

importlib.reload(imp)
_, _, data, _, _ = enc.load_data(dataset_to_use='DNA_Rec_Int',
                                 max_length=np.inf,
                                 allow_splitsave=False,
                                 min_length=0)

# Drop rows already containing X
data = data[~data['Sequence'].str.contains('X')]
seqs = data['Sequence'].tolist()
lengths = data['length'].tolist()


def mutate_sequences(in_sequences, in_lengths):
    random.seed(648732)
    mutations_per_seq = [
        random.sample(range(l), k=l // 10) for l in in_lengths
    ]

    out_seqs_mutated = []
    for seqi, mutations in enumerate(mutations_per_seq):
        mutated_seq = in_sequences[seqi]
        for mutation in mutations:
            mutated_seq = (f'{mutated_seq[:mutation]}X'
                           f'{mutated_seq[mutation + 1:]}')
        out_seqs_mutated.append(mutated_seq)

    return out_seqs_mutated
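A quick check of the masking rate on a toy input (not part of the original
script): about every tenth residue is replaced with X.

toy_seqs = ['ACDEFGHIKLMNPQRSTVWY' * 3]  # one 60-residue sequence
toy_lengths = [len(s) for s in toy_seqs]
mutated = mutate_sequences(toy_seqs, toy_lengths)
assert mutated[0].count('X') == 60 // 10  # six positions masked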
Example #3
import importlib
import os

import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

import code_modules.encoding.aa_encoding_functions as enc
import code_modules.encoding.encoding_testing_functions as enct

os.chdir("/home/wogie/Documents/KU/Bioinformatics_Master/Block_3/Master Thesis")
# %% Load data sets
importlib.reload(enct)
importlib.reload(enc)

datasets = ['GLYLIP', 'DNA_Rec_Int', 'halo', 'ANIPLA']
dataset_i = 0
use_mmseqs_cluster = True
y, y_test, data, data_seq, X = enct.load_data(
    dataset_to_use=datasets[dataset_i], use_mmseqs_cluster=use_mmseqs_cluster)
use_mmseqs_cluster = 'mmseqs' if use_mmseqs_cluster else ""
# %% Cross validation setup
importlib.reload(enct)
importlib.reload(enc)

Max_length = data.length.max()

Model_name = f'{"_".join(sorted(data.type.unique()))}_{len(data)}{use_mmseqs_cluster}'

make_encoding_models = False
if make_encoding_models:
    enc.atchley_encode(in_df=data_seq, save_model=True, model_name=Model_name + "_atchley",
                       get_fractions=False)
    enc.w2v_embedding_cluster_encode(data_seq, save_model=True,
                                     model_name=Model_name + "_WE")
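The excerpt stops before the grid search itself. As a rough sketch of the
pattern, a cross-validated grid search over an SGDClassifier could look like
this (the parameter grid is illustrative, not the one used in the thesis):

sgd_param_grid = {'alpha': [1e-5, 1e-4, 1e-3],
                  'loss': ['hinge', 'modified_huber']}
grid_search = GridSearchCV(SGDClassifier(max_iter=1000, tol=1e-3),
                           sgd_param_grid, cv=5, return_train_score=True)
# grid_search.fit(x_encoded, y)  # x_encoded: one of the encodings above
# joblib.dump(grid_search, f"code_modules/saved_models/{Model_name}.dump")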
Example #4

from math import floor

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import code_modules.encoding.encoding_testing_functions as enct
import code_modules.miscellaneous.csv2fasta as csv2fasta

matplotlib.use('module://backend_interagg')  # Allennlp changes backend
#################################Generate fasta#################################
# Load data
y, y_test, data, data_seq, x_raw = enct.load_data(dataset_to_use='hamid',
                                                  use_mmseqs_cluster=False,
                                                  max_length=9999999,
                                                  min_length=0)

df = data.loc[y_test.index, ['Sequence', 'Entry name', 'Entry', 'type']]
df['description'] = df['Entry name'] + df['Entry']

# Save fasta
with open("code_modules/ampep/ampep-matlab-code/bacteriocin.fasta", 'w') as f:
    f.write(
        csv2fasta.make_fasta_str(df.drop('type', axis=1), ['description'],
                                 'Sequence'))
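make_fasta_str is a project helper; for reference, a FASTA record is just a
'>' header line followed by the sequence. A minimal hand-rolled equivalent
(hypothetical, not the project's implementation):

def fasta_str(in_df, header_col, seq_col):
    # One '>header' line followed by the sequence, per row
    return "".join(f">{row[header_col]}\n{row[seq_col]}\n"
                   for _, row in in_df.iterrows())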

##############################Load MATLAB results###############################
MATLAB_COMMAND_LINES = """
test_fasta_path = 'bacteriocin.fasta'
[predict_result] = main_function(test_fasta_path)
"""
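These commands are meant to be run inside MATLAB from the ampep-matlab-code
directory. A hypothetical way to drive them from Python (assumes a MATLAB
R2019a+ binary on PATH; not part of the original script):

import subprocess

subprocess.run(['matlab', '-batch',
                "test_fasta_path='bacteriocin.fasta';"
                "[predict_result]=main_function(test_fasta_path)"],
               cwd='code_modules/ampep/ampep-matlab-code')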
Example #5

def train_final_model(nn_architechture='CNNPAR',
                      in_kwargs=None,
                      epochs=68,
                      batch_size=1024,
                      just_test_model=False,
                      weights_path=None,
                      show_nn_summary=False):
    if in_kwargs is None:
        # Fall back to the best hyperparameters
        in_kwargs = {
            'conv_filters1': 75,
            'conv_filters2': 300,
            'dense_units': 120,
            'dropout1': 0.0,
            'dropout2': 0.0,
            'kernel_sizes': [6, 8, 10],
            'lr': 0.001,
            'maps_per_kernel': 2,
            'pool_size': 3
        }

    # Load data
    y, y_test, data, data_seq, x_raw = enct.load_data(dataset_to_use='hamid',
                                                      use_mmseqs_cluster=False,
                                                      max_length=inf,
                                                      min_length=0)
    x_test = data_seq.loc[y_test.index]

    x_type = 'test' if just_test_model else 'train'
    dump_path = (f"code_modules/nn_training/BAC_UNI_len2006/encoding_dumps/"
                 f"x_{x_type}_elmo_encoded_final")

    if just_test_model:
        encoded_x = encode_elmo_dump(x_raw=x_test,
                                     data=data,
                                     dump_path=dump_path)
    else:
        encoded_x = encode_elmo_dump(x_raw=x_raw,
                                     data=data,
                                     dump_path=dump_path)
    if encoded_x is None:
        print("Shutting down")
        return

    # Transform y values to binary, with most frequent value as 0
    # y_mapper = {k: v for v, k in enumerate(y.value_counts().index)}
    y_mapper = {'UNI': 0, 'BAC': 1}  # This has been verified

    if just_test_model:
        y_int = y_test.map(y_mapper)

    else:
        y_int = y.map(y_mapper)

    # Specify embedding dimension
    encoding_dimension = 1024

    # Convert each AA to an embedding matrix
    encoded_x = encoded_x.reshape(len(encoded_x), -1, encoding_dimension)
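    # The flat per-sequence vector becomes (n_sequences, seq_len, 1024):
    # one 1024-d ELMo vector per residue position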

    if just_test_model:
        # Assert that test data set has been chosen
        assert len(encoded_x) == len(x_test)
        assert len(y_int) == len(y_test)

    # Set up and compile model
    nn_model = get_nn_model(x_shape=encoded_x.shape,
                            in_kwargs=in_kwargs,
                            architecture=nn_architechture,
                            use_tpu=False,
                            show_nn_summary=show_nn_summary)

    if just_test_model:
        nn_model.load_weights(weights_path)

        evaluate_test_results(nn_model=nn_model,
                              x_test=encoded_x,
                              y_test=y_int)

        return

    # Make folder for saving model weights
    model_path = "code_modules/nn_training/saved_models/"
    if not os.path.exists(model_path):
        os.makedirs(model_path, exist_ok=True)

    # Make unique model name
    model_path = (f'{model_path}final_elmo_{nn_architechture}'
                  f'_{"_".join(sorted(data.type.unique()))}_len{len(data)}'
                  f'_{int(time.time())}')

    print(model_path)

    # Fit model
    nn_model.fit(encoded_x, y_int, epochs=epochs, batch_size=batch_size)

    # Save weights
    nn_model.save_weights(model_path)

    # Load model and validate that it's correctly saved
    del nn_model
    nn_model = get_nn_model(x_shape=encoded_x.shape,
                            in_kwargs=in_kwargs,
                            architecture=nn_architechture,
                            use_tpu=False)
    nn_model.evaluate(encoded_x, y_int)  # Untrained model
    nn_model.load_weights(model_path)
    nn_model.evaluate(encoded_x, y_int)  # Trained model
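A hypothetical invocation (the weights path is illustrative):

# Train from scratch with the default hyperparameters
train_final_model(nn_architechture='CNNPAR', epochs=68)

# Or score a previously saved model on the held-out test set
# train_final_model(just_test_model=True,
#                   weights_path='code_modules/nn_training/saved_models/'
#                                'final_elmo_CNNPAR_BAC_UNI_len2006')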
Example #6

def run_cross_validation(nn_architechture: str,
                         in_kwargs=None,
                         n_folds=10,
                         do_elmo=True,
                         do_w2v=False,
                         epochs=100,
                         batch_size=1024,
                         dataset_to_use='hamid',
                         save_logs=True,
                         return_evaluation=False,
                         fold_i_to_skip=tuple(),
                         use_tpu=False):
    """
    :param use_tpu:
    :param fold_i_to_skip:
    :param return_evaluation:
    :param save_logs:
    :param in_kwargs:
    :param do_elmo:
    :param nn_architechture: 'BIDGRU', 'DNN', 'CNN', 'CNNPAR', 'CNNPARLSTM' or
    'RNNCNN'
    :param n_folds:
    :param do_w2v:
    :param epochs:
    :param batch_size:
    :param dataset_to_use: 'hamid' or 'GLYLIP'
    """
    # Load data sets
    if dataset_to_use == 'hamid':
        kwargs = dict(dataset_to_use='hamid',
                      use_mmseqs_cluster=False,
                      max_length=inf,
                      min_length=0)
    elif dataset_to_use == 'GLYLIP':
        kwargs = dict(dataset_to_use='GLYLIP', use_mmseqs_cluster=True)
    else:
        raise AssertionError

    y, y_test, data, data_seq, x_raw = enct.load_data(**kwargs)

    # Set up model
    max_length = data['length'].max()
    # The DNN variant takes flat input and one-hot (categorical) targets
    flat_input = nn_architechture == 'DNN'
    output_is_categorical = nn_architechture == 'DNN'

    # Make unique model name
    model_name = (f'{nn_architechture}_{"_".join(sorted(data.type.unique()))}_'
                  f'len{len(data)}')

    x_w2v_encoded = enc.w2v_embedding_encode(in_df=x_raw,
                                             input_max_length=max_length)

    # Set up path for dumping pretrained ELMo embeddings
    elmo_dump_path = (f'code_modules/nn_training/'
                      f'{"_".join(model_name.split("_")[1:])}'
                      f'/encoding_dumps/x_train_elmo_encoded_n')
    if do_elmo:
        if not all(os.path.exists(f"{elmo_dump_path}{i}")
                   for i in range(n_folds)):
            for f_i in range(n_folds):
                start_time = time.time()
                print(f"ELMo encoding {f_i}")
                elmo_encode_n_dump(from_dump=False,
                                   save_dump=True,
                                   model_name=model_name,
                                   max_len=max_length,
                                   in_x=x_raw,
                                   cuda_device=0)
                end_time = time.time()
                print(end_time - start_time)
        else:
            print("ELMo already encoded")

    # Transform y values to binary, with most frequent value as 0
    y_mapper = {k: v for v, k in enumerate(y.value_counts().index)}
    y_int = y.map(y_mapper)
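    # With the 'hamid' data this yields {'UNI': 0, 'BAC': 1}, matching the
    # mapping hard-coded in train_final_model above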

    # Stratify K folds by y value ratios
    kfold = StratifiedKFold(n_splits=n_folds)
    folds = list(kfold.split(x_raw, y_int.values))
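    # Each element of `folds` is a (train_indices, val_indices) pair; with
    # n_folds=10 roughly 90% of the samples land in fold[0] and 10% in
    # fold[1], with class ratios preserved in both splits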

    # Specify which encodings to use and their embedding dimensions
    encoding_styles = []
    encoding_dimensions = []
    if do_w2v:
        encoding_styles.append('w2v')
        encoding_dimensions.append(200)
    if do_elmo:
        encoding_styles.append('elmo')
        encoding_dimensions.append(1024)

    # Run Cross Validation
    for encoding, embedding_size in zip(encoding_styles, encoding_dimensions):

        for fold_i, fold in enumerate(folds):
            if fold_i in fold_i_to_skip:
                continue

            tensorboard_entry_name = (f'{encoding}_{model_name}_'
                                      f'{int(time.time())}')

            # Select encoding method
            if encoding == 'w2v':
                x = x_w2v_encoded
            else:
                x = elmo_encode_n_dump(
                    load_from_dump_path=f"{elmo_dump_path}{fold_i}")

            if not flat_input:
                # Convert each AA to an embedding matrix
                x = x.reshape(len(x), -1, embedding_size)

            x_fold_train = x[fold[0]]
            x_fold_val = x[fold[1]]

            y_fold_train = y_int.iloc[fold[0]]
            y_fold_val = y_int.iloc[fold[1]]

            # Make a unique log for each run
            tensorboard_entry_name_i = f"{tensorboard_entry_name}_fold{fold_i}"

            tensorboard = TensorBoard(log_dir=f"code_modules/nn_training/logs/"
                                      f"{tensorboard_entry_name_i}")

            if output_is_categorical:
                # Convert to one-hot output instead of 0-1
                y_fold_train = to_categorical(y_fold_train)
                y_fold_val = to_categorical(y_fold_val)

            # Compile model
            nn_model = get_nn_model(x_shape=x.shape,
                                    in_kwargs=in_kwargs,
                                    architecture=nn_architechture,
                                    use_tpu=use_tpu)

            print(tensorboard_entry_name_i)

            callbacks = [tensorboard] if save_logs else None
            steps_per_epoch = 1 if use_tpu else None

            # Fit model
            nn_model.fit(x_fold_train,
                         y_fold_train,
                         validation_data=(x_fold_val, y_fold_val),
                         epochs=epochs,
                         callbacks=callbacks,
                         batch_size=batch_size,
                         steps_per_epoch=steps_per_epoch)

            if return_evaluation:
                # Return the error and accuracy and abandon rest of folds
                return (nn_model.evaluate(x_fold_val, y_fold_val),
                        nn_model.evaluate(x_fold_train, y_fold_train))

            # Delete model to clean up memory
            del nn_model
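A hypothetical invocation, e.g. a short word2vec-only run for smoke testing
(argument values chosen for illustration):

run_cross_validation(nn_architechture='CNNPAR',
                     n_folds=10,
                     do_elmo=False,
                     do_w2v=True,
                     epochs=10,
                     save_logs=False)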