Beispiel #1
0
def KNN_classification(dataset, filename):
    """
    Fit a k-nearest-neighbors classifier and evaluate it on the
    test split, saving ROC and precision-recall plots.

    Parameters
    ---
    dataset: the input dataset; it is passed unchanged to
       ``prepare_data``, which yields the train/test features
       and the corresponding labels.

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    Returns
    ---
    stats: the output of ``calc_stat`` for the true and predicted
        test labels (classification accuracy, precision and recall)
    """

    # Obtain train/test features and labels
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # 100 neighbours; Minkowski metric with p=2 is the Euclidean distance
    knn = KNeighborsClassifier(n_neighbors=100, metric='minkowski', p=2)
    knn.fit(X_train, y_train)

    # Hard labels feed the statistics, class probabilities feed the curves
    y_pred = knn.predict(X_test)
    class_probabilities = knn.predict_proba(X_test)

    # ROC curve, scored with the second probability column
    roc_title = 'KNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/KNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, class_probabilities[:, 1],
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve, scored with the same column
    pr_title = 'KNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/KNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, class_probabilities[:, 1],
                  plot_title=pr_title, plot_dir=pr_path)

    # Summary statistics from the hard predictions
    return calc_stat(y_test, y_pred)
Beispiel #2
0
def LogReg_classification(dataset, filename):
    """
    Fit a logistic regression classifier and evaluate it on the
    test split, saving ROC and precision-recall plots.

    Parameters
    ---
    dataset: the input dataset; it is passed unchanged to
       ``prepare_data``, which yields the train/test features
       and the corresponding labels.

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    Returns
    ---
    stats: the output of ``calc_stat`` for the true and predicted
        test labels (classification accuracy, precision and recall)
    """

    # Obtain train/test features and labels
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Fixed random_state keeps the fit reproducible
    log_reg = LogisticRegression(random_state=0)
    log_reg.fit(X_train, y_train)

    # Hard labels feed the statistics, class probabilities feed the curves
    y_pred = log_reg.predict(X_test)
    class_probabilities = log_reg.predict_proba(X_test)

    # ROC curve, scored with the second probability column
    roc_title = 'Logistic Regression ROC curve (Train={})'.format(filename)
    roc_path = 'figures/LR_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(
        y_test, class_probabilities[:, 1],
        plot_title=roc_title, plot_dir=roc_path
    )

    # Precision-recall curve, scored with the same column
    pr_title = 'Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    pr_path = 'figures/LR_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(
        y_test, class_probabilities[:, 1],
        plot_title=pr_title, plot_dir=pr_path
    )

    # Summary statistics from the hard predictions
    return calc_stat(y_test, y_pred)
Beispiel #3
0
def SVM_classification(dataset, filename):
    """
    Fit an RBF-kernel support vector classifier and evaluate it on
    the test split, saving ROC and precision-recall plots.

    Parameters
    ---
    dataset: the input dataset; it is passed unchanged to
       ``prepare_data``, which yields the train/test features
       and the corresponding labels.

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    Returns
    ---
    stats: the output of ``calc_stat`` for the true and predicted
        test labels (classification accuracy, precision and recall)
    """

    # Obtain train/test features and labels
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Support vector classifier with a radial basis function kernel
    svm = SVC(kernel='rbf')
    svm.fit(X_train, y_train)

    # Hard labels feed the statistics; the curves are drawn from the
    # decision-function values rather than from class probabilities
    y_pred = svm.predict(X_test)
    decision_values = svm.decision_function(X_test)

    # ROC curve
    roc_title = 'SVM ROC curve (Train={})'.format(filename)
    roc_path = 'figures/SVM_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, decision_values,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'SVM Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/SVM_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, decision_values,
                  plot_title=pr_title, plot_dir=pr_path)

    # Summary statistics from the hard predictions
    return calc_stat(y_test, y_pred)
Beispiel #4
0
def _pairwise_one_hot_features(encoded):
    """
    Build the flattened "2D" feature vector for one encoded sequence.

    For every column ``pos`` except the last, and for every row ``row``,
    the scalar ``encoded[row, pos]`` is multiplied with the sub-matrix of
    all later columns (``encoded[:, pos + 1:]``). The resulting pieces are
    joined side by side and flattened in column-major ('F') order.

    Parameters
    ---
    encoded: 2-D array holding the encoding of a single sequence
       (presumably residues along the rows and sequence positions
       along the columns -- confirm against ``one_hot_encoder``).

    Returns
    ---
    features: 1-D array with the flattened pairwise products
    """
    n_rows = encoded.shape[0]
    n_cols = encoded.shape[1]

    # The empty float seed keeps the result dtype and the degenerate
    # (fewer than two columns) output identical to growing the matrix
    # step by step from an empty array.
    pieces = [np.empty([n_rows, 0])]
    for pos in range(n_cols - 1):
        later_columns = encoded[:, pos + 1:]
        for row in range(n_rows):
            pieces.append(encoded[row, pos] * later_columns)

    # Concatenate once instead of re-copying the growing matrix on every
    # inner iteration.
    return np.concatenate(pieces, axis=1).flatten('F')


def LogReg2D_classification(dataset, filename):
    """
    Classification of data with 2D logistic regression,
    followed by plotting of ROC and PR curves.

    Each sequence is one hot encoded and then expanded into pairwise
    products between every entry of a position and the entries of all
    later positions; logistic regression is fitted on these features.

    Parameters
    ---
    dataset: the input dataset; its ``train`` and ``test``
       attributes must support ``.loc`` and provide the columns
       'AASeq' (sequences) and 'AgClass' (labels).

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    Returns
    ---
    stats: the output of ``calc_stat`` for the true and predicted
        test labels (classification accuracy, precision and recall)
    """

    # Import training/test set
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences, then expand them to pairwise features
    X_train = [
        _pairwise_one_hot_features(
            one_hot_encoder(s=seq, alphabet=IUPAC.protein))
        for seq in train_sequences
    ]
    X_test = [
        _pairwise_one_hot_features(
            one_hot_encoder(s=seq, alphabet=IUPAC.protein))
        for seq in test_sequences
    ]

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve, scored with the second probability column
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test,
                   y_score[:, 1],
                   plot_title=title,
                   plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve, scored with the same column
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    plot_PR_curve(y_test,
                  y_score[:, 1],
                  plot_title=title,
                  plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename))

    # Calculate and return statistics from the hard predictions
    return calc_stat(y_test, y_pred)
Beispiel #5
0
def train_and_evaluate(model, model_name='CNN', save_folder='./', nr_epochs=10,
                       pred_threshold=0.5):
    """
    DESCRIPTION: Train a compiled model on the generator data and score
    it on the validation data.

    Parameters:
    ----------
    model:          A fully constructed and compiled model;
    model_name:     Used for the filenames of results and the TensorBoard logs
    save_folder:    Folder in which the model .json, the checkpoint weights and
                    the result files are written (excl. TensorBoard logs!)
    nr_epochs:      Number of epochs passed to ``model.fit``
    pred_threshold: Predictions strictly above this value are counted as
                    positive when computing F1 and accuracy

    Returns:
    -------
    model:       The fitted model is returned.
    """

    # Training, validation and non-shuffled validation generators
    train_gen, val_gen, val_gen_no_shuffle = get_generators(DATA_PATH)

    # Output locations for the architecture and the checkpoint weights
    architecture_path = os.path.join(save_folder, model_name + '.json')
    best_weights_path = os.path.join(save_folder, model_name + '_weights.hdf5')

    # Store the architecture as .json before training starts
    architecture_json = model.to_json()
    with open(architecture_path, 'w') as json_file:
        json_file.write(architecture_json)

    # Checkpoint only on an improved (lower) validation loss, and log to
    # TensorBoard
    callbacks_list = [
        ModelCheckpoint(best_weights_path,
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=True,
                        mode='min'),
        TensorBoard(os.path.join(TB_LOG_PATH, model_name)),
    ]

    # Whole batches per epoch; integer division drops a trailing partial batch
    batches_per_epoch = train_gen.n // train_gen.batch_size
    val_batches_per_epoch = val_gen.n // val_gen.batch_size

    # Train the model
    history = model.fit(train_gen,
                        steps_per_epoch=batches_per_epoch,
                        validation_data=val_gen,
                        validation_steps=val_batches_per_epoch,
                        epochs=nr_epochs,
                        callbacks=callbacks_list)

    # Predict on the non-shuffled generator and binarise at the threshold.
    # NOTE(review): the scores below pair these predictions with
    # ``val_gen.labels``; presumably both are in the same order -- confirm
    # against ``get_generators``.
    y_pred = model.predict(val_gen_no_shuffle, verbose=0)
    y_pred_bin = (y_pred > pred_threshold).astype(int)
    true_labels = val_gen.labels

    # Calculate scores
    fpr, tpr, _ = roc_curve(true_labels, y_pred)
    auc_score = roc_auc_score(true_labels, y_pred)
    f1 = f1_score(true_labels, y_pred_bin)
    acc = accuracy_score(true_labels, y_pred_bin)

    # Save training history, ROC plot and the scalar scores
    save_history(history, model_name, save_folder)
    plot_ROC_curve(fpr, tpr, save_folder, model_name)
    scores_path = os.path.join(save_folder, model_name + '_scores.txt')
    with open(scores_path, 'w') as result_file:
        result_file.writelines([
            'AUC score      = {}\n'.format(auc_score),
            'F1 score       = {}\n'.format(f1),
            'Accuracy score = {}\n'.format(acc),
        ])
    return model
Beispiel #6
0
def _steps_for_fraction(generator, fraction):
    """
    Return the integer number of batches to draw from ``generator`` per epoch.

    The number of whole batches (``generator.n // generator.batch_size``) is
    scaled by ``fraction`` with the same floor division as before, but the
    result is converted to ``int`` (the division by ``1 / fraction`` yields a
    float) and clamped to at least one step.
    """
    whole_batches = generator.n // generator.batch_size
    return max(1, int(whole_batches // (1 / fraction)))


def train_and_evaluate(model, model_name='CNN', save_folder='./', nr_epochs=25,
                       train_fraction=1, val_fraction=1, adaptive_LR=False,
                       pred_threshold=0.5):
    """
    DESCRIPTION: This function trains and evaluates the CNN.

    Parameters:
    ----------
    model:          A fully constructed and compiled model;
    model_name:     Used for the filenames of results and the TensorBoard logs.
                    Results are written to a sub-folder of ``save_folder``
                    with this name, which is created when missing.
    save_folder:    Used to determine where results should be saved (excl. Tensorboard
                    logs!)
    nr_epochs:      The number of epochs to be used for training the model
    train_fraction: The fraction of training steps to be used. A lower fraction
                    results in a faster training model, but this model will be trained
                    on less data.
    val_fraction:   The fraction of validation steps to be used during model training.
                    A lower fraction results in a faster evaluation of the model,
                    but this evaluation will be less accurate.
    adaptive_LR:    If True, a ``LearningRateScheduler`` driven by
                    ``adaptive_LR_schedule`` is added to the training callbacks.
    pred_threshold: The confidence after which a prediction is assumed to be positive.
                    Defaults to 0.5; everything above 0.5 will be regarded as a
                    positive identification.

    Returns:
    -------
    model:       The fitted model is returned.
    """
    # Every model gets its own result folder; exist_ok avoids the race
    # between checking for the folder and creating it.
    save_folder = os.path.join(save_folder, model_name)
    os.makedirs(save_folder, exist_ok=True)

    # Get the data generators
    train_gen, val_gen, val_gen_no_shuffle = get_generators(DATA_PATH)

    # Define filepaths to save the model and weights
    model_filepath = os.path.join(save_folder, model_name + '.json')
    weights_filepath = os.path.join(save_folder, model_name + '_weights.hdf5')

    # Save the model to a .json file
    model_json = model.to_json()
    with open(model_filepath, 'w') as json_file:
        json_file.write(model_json)

    # Define the model checkpoint and TensorBoard callbacks; the checkpoint
    # only keeps the weights with the lowest validation loss.
    checkpoint = ModelCheckpoint(weights_filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')
    tensorboard = TensorBoard(os.path.join(TB_LOG_PATH, model_name))
    callbacks_list = [checkpoint, tensorboard]
    if adaptive_LR:
        # The scheduler is only built when it is actually used
        callbacks_list.append(
            LearningRateScheduler(adaptive_LR_schedule, verbose=0))

    # Define the number of training batches to use during a single epoch, and
    # the number of validation batches validated on per epoch. Keras expects
    # integer step counts here.
    train_steps = _steps_for_fraction(train_gen, train_fraction)
    val_steps = _steps_for_fraction(val_gen, val_fraction)

    # Train the model
    history = model.fit(train_gen,
                        steps_per_epoch=train_steps,
                        validation_data=val_gen,
                        validation_steps=val_steps,
                        epochs=nr_epochs,
                        callbacks=callbacks_list)

    # Evaluate model on the non-shuffled generator and binarise the
    # predictions at the threshold.
    # NOTE(review): the scores below pair these predictions with
    # ``val_gen.labels``; presumably both are in the same order -- confirm
    # against ``get_generators``.
    y_pred = model.predict(val_gen_no_shuffle, verbose=0)
    y_pred_bin = (y_pred > pred_threshold).astype(int)

    # Calculate scores
    fpr, tpr, _ = roc_curve(val_gen.labels, y_pred)
    auc_score = roc_auc_score(val_gen.labels, y_pred)
    f1 = f1_score(val_gen.labels, y_pred_bin)
    acc = accuracy_score(val_gen.labels, y_pred_bin)

    # Save results
    save_history(history.history, model_name, save_folder)
    plot_history(history.history, model_name, save_folder)
    plot_ROC_curve(fpr, tpr, save_folder, model_name)
    with open(os.path.join(save_folder, model_name + '_scores.txt'),
              'w') as result_file:
        result_file.write('AUC score      = {}\n'.format(auc_score))
        result_file.write('F1 score       = {}\n'.format(f1))
        result_file.write('Accuracy score = {}\n'.format(acc))
    return model
Beispiel #7
0
def RNN_classification(dataset, filename, save_model=False):
    """
    Train a recurrent neural network on the training split, plot
    ROC and PR curves for the test split, and either save the
    model or return test statistics.

    Parameters
    ---
    dataset: the input dataset; its ``train``, ``test`` and
       ``val`` attributes must support ``.loc`` and provide the
       columns 'AASeq' (sequences) and 'AgClass' (labels).

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    save_model: optional; if provided, should specify the directory
       to save the model summary and the model itself.

    Returns
    ---
    The trained classification model when ``save_model`` is truthy;
    otherwise the output of ``calc_stat`` for the test labels and
    the predictions thresholded at 0.5 (classification accuracy,
    precision and recall).
    """

    def _encode(sequences):
        # One hot encode every sequence, stack the encodings and swap
        # the last two axes of the stacked array
        stacked = np.asarray(
            [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
             for seq in sequences])
        return np.transpose(stacked, (0, 2, 1))

    # Raw sequences of the training/test/validation set
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values
    val_sequences = dataset.val.loc[:, 'AASeq'].values

    # Encoded model inputs
    X_train = _encode(train_sequences)
    X_test = _encode(test_sequences)
    X_val = _encode(val_sequences)

    # Labels of the training/test/validation set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Build and compile the network
    rnn = create_rnn()
    rnn.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy'])

    # Train on the training set, monitoring the validation set
    rnn.fit(x=X_train,
            y=y_train,
            shuffle=True,
            validation_data=(X_val, y_val),
            epochs=20,
            batch_size=32,
            verbose=2)

    # Predicted scores for the test set
    y_pred = rnn.predict(x=X_test)

    # ROC curve
    roc_title = 'RNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/RNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, y_pred, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'RNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/RNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, y_pred, plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Scores strictly above 0.5 count as positive predictions
        return calc_stat(y_test, y_pred > 0.5)

    # Write the printed model summary into a text file
    summary_path = os.path.join(save_model, 'RNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        rnn.summary()

    # Persist the trained model and hand it back to the caller
    rnn.save(os.path.join(save_model, 'RNN_HER2'))
    return rnn
Beispiel #8
0
def CNN_classification(dataset, filename, save_model=False, params=None):
    """
    Train a convolutional neural network on the training split,
    plot ROC and PR curves for the test split, and either save
    the model or return test statistics.

    Parameters
    ---
    dataset: the input dataset; its ``train``, ``test`` and
       ``val`` attributes must support ``.loc`` and provide the
       columns 'AASeq' (sequences) and 'AgClass' (labels).

    filename: an identifier inserted into the plot titles and
       the names of the saved figure files.

    save_model: optional; if provided, should specify the directory
       to save the model summary and the model itself.

    params: optional; layer specification handed to ``create_cnn``,
        e.g. as determined in a separate model tuning step. Any
        falsy value (such as None) selects the hard-coded default.

    Returns
    ---
    The trained classification model when ``save_model`` is truthy;
    otherwise the output of ``calc_stat`` for the test labels and
    the predictions thresholded at 0.5 (classification accuracy,
    precision and recall).
    """

    def _encode(sequences):
        # One hot encode every sequence, stack the encodings and swap
        # the last two axes of the stacked array
        stacked = np.asarray(
            [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
             for seq in sequences])
        return np.transpose(stacked, (0, 2, 1))

    # Raw sequences of the training/test/validation set
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values
    val_sequences = dataset.val.loc[:, 'AASeq'].values

    # Encoded model inputs
    X_train = _encode(train_sequences)
    X_test = _encode(test_sequences)
    X_val = _encode(val_sequences)

    # Labels of the training/test/validation set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Fall back to the hard-coded layer specification
    params = params or [
        ['CONV', 400, 3, 1],
        ['DROP', 0.5],
        ['POOL', 2, 1],
        ['FLAT'],
        ['DENSE', 50],
    ]

    # Build the network for the fixed (10, 20) input shape
    cnn = create_cnn(params, (10, 20), 'relu', None)

    # Compile with Adam at a small learning rate
    cnn.compile(optimizer=Adam(learning_rate=0.000075),
                loss='binary_crossentropy',
                metrics=['accuracy'])

    # Train on the training set, monitoring the validation set
    cnn.fit(
        x=X_train, y=y_train, shuffle=True, validation_data=(X_val, y_val),
        epochs=20, batch_size=16, verbose=2
    )

    # Predicted scores for the test set
    y_pred = cnn.predict(x=X_test)

    # ROC curve
    roc_title = 'CNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/CNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, y_pred, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'CNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/CNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, y_pred, plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Scores strictly above 0.5 count as positive predictions
        return calc_stat(y_test, y_pred > 0.5)

    # Write the printed model summary into a text file
    summary_path = os.path.join(save_model, 'CNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        cnn.summary()

    # Persist the trained model and hand it back to the caller
    cnn.save(os.path.join(save_model, 'CNN_HER2'))
    return cnn