Esempio n. 1
0
def KNN_classification(dataset, filename):
    """
    Train a k-nearest-neighbors classifier, plot its ROC and
    precision-recall curves on the test split, and compute
    classification statistics.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. It is passed
       unchanged to `prepare_data`.

    filename: an identifier embedded in the plot titles and
       in the names of the saved figures.

    Returns
    ---
    stats: the value returned by `calc_stat` for the true and
        predicted test labels (accuracy, precision and recall)
    """

    # Features and labels of the training/test split
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # 100 neighbors; Minkowski metric with p=2 is the Euclidean distance
    model = KNeighborsClassifier(n_neighbors=100, metric='minkowski', p=2)
    model.fit(X_train, y_train)

    # Hard labels feed the statistics. Column 1 of the probability
    # matrix is the score used for both curves (presumably the
    # binding class -- confirm against the label encoding).
    y_pred = model.predict(X_test)
    class1_score = model.predict_proba(X_test)[:, 1]

    # ROC curve
    roc_title = 'KNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/KNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(
        y_test, class1_score, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'KNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/KNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(
        y_test, class1_score, plot_title=pr_title, plot_dir=pr_path)

    # Statistics on the hard predictions
    return calc_stat(y_test, y_pred)
Esempio n. 2
0
def LogReg_classification(dataset, filename):
    """
    Train a logistic regression classifier, plot its ROC and
    precision-recall curves on the test split, and compute
    classification statistics.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. It is passed
       unchanged to `prepare_data`.

    filename: an identifier embedded in the plot titles and
       in the names of the saved figures.

    Returns
    ---
    stats: the value returned by `calc_stat` for the true and
        predicted test labels (accuracy, precision and recall)
    """

    # Features and labels of the training/test split
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Fixed random_state keeps the fit reproducible
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    # Hard labels feed the statistics. Column 1 of the probability
    # matrix is the score used for both curves (presumably the
    # binding class -- confirm against the label encoding).
    y_pred = model.predict(X_test)
    class1_score = model.predict_proba(X_test)[:, 1]

    # ROC curve
    roc_title = 'Logistic Regression ROC curve (Train={})'.format(filename)
    roc_path = 'figures/LR_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(
        y_test, class1_score, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    pr_path = 'figures/LR_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(
        y_test, class1_score, plot_title=pr_title, plot_dir=pr_path)

    # Statistics on the hard predictions
    return calc_stat(y_test, y_pred)
Esempio n. 3
0
def SVM_classification(dataset, filename):
    """
    Train a support vector classifier, plot its ROC and
    precision-recall curves on the test split, and compute
    classification statistics.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. It is passed
       unchanged to `prepare_data`.

    filename: an identifier embedded in the plot titles and
       in the names of the saved figures.

    Returns
    ---
    stats: the value returned by `calc_stat` for the true and
        predicted test labels (accuracy, precision and recall)
    """

    # Features and labels of the training/test split
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Support vector classifier with an RBF kernel
    model = SVC(kernel='rbf')
    model.fit(X_train, y_train)

    # Hard labels feed the statistics; the decision-function values
    # (not probabilities) are the scores used for both curves
    y_pred = model.predict(X_test)
    margin = model.decision_function(X_test)

    # ROC curve
    roc_title = 'SVM ROC curve (Train={})'.format(filename)
    roc_path = 'figures/SVM_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, margin, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'SVM Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/SVM_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, margin, plot_title=pr_title, plot_dir=pr_path)

    # Statistics on the hard predictions
    return calc_stat(y_test, y_pred)
Esempio n. 4
0
def _pairwise_one_hot(encoded):
    """
    Expand a 2D one-hot matrix into a flat vector of pairwise products.

    For every column `c` except the last and every row `r`, the block
    `encoded[r, c] * encoded[:, c + 1:]` is appended along axis 1; the
    resulting matrix is flattened in column-major ('F') order. Each
    entry is therefore the product of one entry at column `c` with one
    entry at a later column.

    Parameters
    ---
    encoded: 2D array (rows x columns) as returned by
       `one_hot_encoder` (presumably alphabet x sequence length --
       confirm against the encoder).

    Returns
    ---
    1D float array of the flattened pairwise features; empty when
    `encoded` has fewer than two columns.
    """
    n_rows, n_cols = encoded.shape[0], encoded.shape[1]

    # Zero-width float seed: keeps the float dtype and the empty-result
    # shape that seeding the accumulation with np.empty would give.
    pieces = [np.empty([n_rows, 0])]
    for col in range(n_cols - 1):
        later_cols = encoded[:, col + 1:]
        for row in range(n_rows):
            pieces.append(encoded[row, col] * later_cols)

    # Single concatenation instead of re-copying the growing matrix on
    # every inner iteration.
    return np.concatenate(pieces, axis=1).flatten('F')


def LogReg2D_classification(dataset, filename):
    """
    Classification of data with 2D logistic regression,
    followed by plotting of ROC and PR curves.

    "2D" refers to the features: each sequence is one hot encoded
    and then expanded into products of entries at pairs of
    positions (see `_pairwise_one_hot`).

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. Sequences are
       read from the 'AASeq' column and labels from the
       'AgClass' column of `dataset.train` / `dataset.test`.

    filename: an identifier to distinguish different
       plots from each other.

    Returns
    ---
    stats: array containing classification accuracy, precision
        and recall
    """

    # Import training/test set
    train_seqs = dataset.train.loc[:, 'AASeq'].values
    test_seqs = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences and expand them into pairwise features
    X_train = [
        _pairwise_one_hot(one_hot_encoder(s=seq, alphabet=IUPAC.protein))
        for seq in train_seqs
    ]
    X_test = [
        _pairwise_one_hot(one_hot_encoder(s=seq, alphabet=IUPAC.protein))
        for seq in test_seqs
    ]

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results; column 1 of the probability
    # matrix is the score used for both curves
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test,
                   y_score[:, 1],
                   plot_title=title,
                   plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    plot_PR_curve(y_test,
                  y_score[:, 1],
                  plot_title=title,
                  plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename))

    # Calculate and return statistics
    return calc_stat(y_test, y_pred)
Esempio n. 5
0
def RNN_classification(dataset, filename, save_model=False):
    """
    Classification of data with a recurrent neural
    network, followed by plotting of ROC and PR curves.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. Sequences are
       read from the 'AASeq' column and labels from the
       'AgClass' column of `dataset.train`, `dataset.test`
       and `dataset.val`.

    filename: an identifier to distinguish different
       plots from each other.

    save_model: optional; if provided, should specify the directory
       to save model summary and weights. The directory is created
       if it does not exist yet. The classification model
       will be returned in this case.
       If False, an array containing classification accuracy,
       precision and recall will be returned instead.

    Returns
    ---
    The fitted model when `save_model` is given, otherwise the
    statistics computed by `calc_stat` on the test predictions
    thresholded at 0.5.
    """

    def encode(frame):
        # One hot encode every sequence, then swap the last two axes
        # of the stacked (sample, row, column) array so each sample
        # becomes (column, row).
        sequences = frame.loc[:, 'AASeq'].values
        one_hot = [
            one_hot_encoder(s=seq, alphabet=IUPAC.protein)
            for seq in sequences
        ]
        return np.transpose(np.asarray(one_hot), (0, 2, 1))

    # Import and one hot encode training/test/validation set
    X_train = encode(dataset.train)
    X_test = encode(dataset.test)
    X_val = encode(dataset.val)

    # Extract labels of training/test/validation set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Building the RNN
    RNN_classifier = create_rnn()

    # Compiling the RNN
    RNN_classifier.compile(
        optimizer='rmsprop', loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Fit the RNN to the training set (training history is not used)
    RNN_classifier.fit(
        x=X_train, y=y_train, shuffle=True, validation_data=(X_val, y_val),
        epochs=20, batch_size=32, verbose=2
    )

    # Predicting the test set results (continuous scores)
    y_pred = RNN_classifier.predict(x=X_test)

    # ROC curve
    title = 'RNN ROC curve (Train={})'.format(filename)
    plot_ROC_curve(
        y_test, y_pred, plot_title=title,
        plot_dir='figures/RNN_ROC_Test_{}.png'.format(filename)
    )

    # Precision-recall curve
    title = 'RNN Precision-Recall curve (Train={})'.format(filename)
    plot_PR_curve(
        y_test, y_pred, plot_title=title,
        plot_dir='figures/RNN_P-R_Test_{}.png'.format(filename)
    )

    if not save_model:
        # Probabilities larger than 0.5 are significant
        y_pred_stand = (y_pred > 0.5)

        # Calculate and return statistics
        return calc_stat(y_test, y_pred_stand)

    # Make sure the target directory exists so a missing folder does not
    # discard the freshly trained model with a FileNotFoundError.
    os.makedirs(save_model, exist_ok=True)

    # Model summary: summary() prints to stdout, so redirect it to file
    summary_path = os.path.join(save_model, 'RNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        RNN_classifier.summary()

    # Model weights
    RNN_classifier.save(
        os.path.join(save_model, 'RNN_HER2')
    )

    # Return classification model
    return RNN_classifier
Esempio n. 6
0
def CNN_classification(dataset, filename, save_model=False, params=None):
    """
    Classification of data with a convolutional neural
    network, followed by plotting of ROC and PR curves.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences. Sequences are
       read from the 'AASeq' column and labels from the
       'AgClass' column of `dataset.train`, `dataset.test`
       and `dataset.val`.

    filename: an identifier to distinguish different
       plots from each other.

    save_model: optional; if provided, should specify the directory
       to save model summary and weights. The directory is created
       if it does not exist yet. The classification model
       will be returned in this case.
       If False, an array containing classification accuracy,
       precision and recall will be returned instead.

    params: optional; if provided, should specify the optimized
        model parameters that were determined in a separate model
        tuning step. If None (or empty), model parameters are
        hard-coded.

    Returns
    ---
    The fitted model when `save_model` is given, otherwise the
    statistics computed by `calc_stat` on the test predictions
    thresholded at 0.5.
    """

    def encode(frame):
        # One hot encode every sequence, then swap the last two axes
        # of the stacked (sample, row, column) array so each sample
        # becomes (column, row).
        sequences = frame.loc[:, 'AASeq'].values
        one_hot = [
            one_hot_encoder(s=seq, alphabet=IUPAC.protein)
            for seq in sequences
        ]
        return np.transpose(np.asarray(one_hot), (0, 2, 1))

    # Import and one hot encode training/test/validation set
    X_train = encode(dataset.train)
    X_test = encode(dataset.test)
    X_val = encode(dataset.val)

    # Extract labels of training/test/validation set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Set parameters for CNN
    if not params:
        params = [['CONV', 400, 3, 1], ['DROP', 0.5], ['POOL', 2, 1], ['FLAT'],
                  ['DENSE', 50]]

    # Create the CNN with above-specified parameters.
    # NOTE(review): the input shape (10, 20) is hard-coded; presumably
    # 10 sequence positions x 20 amino acids -- confirm against the data.
    CNN_classifier = create_cnn(params, (10, 20), 'relu', None)

    # Compiling the CNN
    opt = Adam(learning_rate=0.000075)
    CNN_classifier.compile(optimizer=opt,
                           loss='binary_crossentropy',
                           metrics=['accuracy'])

    # Fit the CNN to the training set (training history is not used)
    CNN_classifier.fit(x=X_train,
                       y=y_train,
                       shuffle=True,
                       validation_data=(X_val, y_val),
                       epochs=20,
                       batch_size=16,
                       verbose=2)

    # Predicting the test set results (continuous scores)
    y_pred = CNN_classifier.predict(x=X_test)

    # ROC curve
    title = 'CNN ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test,
                   y_pred,
                   plot_title=title,
                   plot_dir='figures/CNN_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve
    title = 'CNN Precision-Recall curve (Train={})'.format(filename)
    plot_PR_curve(y_test,
                  y_pred,
                  plot_title=title,
                  plot_dir='figures/CNN_P-R_Test_{}.png'.format(filename))

    if not save_model:
        # Probabilities larger than 0.5 are significant
        y_pred_stand = (y_pred > 0.5)

        # Calculate and return statistics
        return calc_stat(y_test, y_pred_stand)

    # Make sure the target directory exists so a missing folder does not
    # discard the freshly trained model with a FileNotFoundError.
    os.makedirs(save_model, exist_ok=True)

    # Model summary: summary() prints to stdout, so redirect it to file
    summary_path = os.path.join(save_model, 'CNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        CNN_classifier.summary()

    # Model weights
    CNN_classifier.save(os.path.join(save_model, 'CNN_HER2'))

    # Return classification model
    return CNN_classifier