def KNN_classification(dataset, filename):
    """
    Fit a k-nearest-neighbours classifier, plot its ROC and PR curves on the
    test split, and return summary statistics.

    Parameters
    ---
    dataset: input data passed to prepare_data, which yields the train/test
        features and labels.
    filename: identifier inserted into the plot titles and into the names of
        the figure files written under 'figures/'.

    Returns
    ---
    stats: output of calc_stat for the true and predicted test labels
        (accuracy, precision and recall according to the original
        documentation).
    """
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Minkowski distance with p=2 is the Euclidean distance
    model = KNeighborsClassifier(n_neighbors=100, metric='minkowski', p=2)
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # The second column of predict_proba is used as the ranking score
    # (presumably the probability of the binding class - TODO confirm)
    scores = model.predict_proba(X_test)[:, 1]

    # ROC curve
    roc_title = 'KNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/KNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, scores, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'KNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/KNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, scores, plot_title=pr_title, plot_dir=pr_path)

    return calc_stat(y_test, predicted_labels)
def LogReg_classification(dataset, filename):
    """
    Fit a logistic regression classifier, plot its ROC and PR curves on the
    test split, and return summary statistics.

    Parameters
    ---
    dataset: input data passed to prepare_data, which yields the train/test
        features and labels.
    filename: identifier inserted into the plot titles and into the names of
        the figure files written under 'figures/'.

    Returns
    ---
    stats: output of calc_stat for the true and predicted test labels
        (accuracy, precision and recall according to the original
        documentation).
    """
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    # Fixed random_state keeps the fit reproducible between runs
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # The second column of predict_proba is used as the ranking score
    # (presumably the probability of the binding class - TODO confirm)
    scores = model.predict_proba(X_test)[:, 1]

    # ROC curve
    roc_title = 'Logistic Regression ROC curve (Train={})'.format(filename)
    roc_path = 'figures/LR_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, scores, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    pr_path = 'figures/LR_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, scores, plot_title=pr_title, plot_dir=pr_path)

    return calc_stat(y_test, predicted_labels)
def SVM_classification(dataset, filename):
    """
    Fit an RBF-kernel support vector classifier, plot its ROC and PR curves
    on the test split, and return summary statistics.

    Parameters
    ---
    dataset: input data passed to prepare_data, which yields the train/test
        features and labels.
    filename: identifier inserted into the plot titles and into the names of
        the figure files written under 'figures/'.

    Returns
    ---
    stats: output of calc_stat for the true and predicted test labels
        (accuracy, precision and recall according to the original
        documentation).
    """
    X_train, X_test, y_train, y_test = prepare_data(dataset)

    model = SVC(kernel='rbf')
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # The curves are drawn from decision_function values rather than from
    # probabilities
    scores = model.decision_function(X_test)

    # ROC curve
    roc_title = 'SVM ROC curve (Train={})'.format(filename)
    roc_path = 'figures/SVM_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, scores, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'SVM Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/SVM_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, scores, plot_title=pr_title, plot_dir=pr_path)

    return calc_stat(y_test, predicted_labels)
def _pairwise_one_hot(encoded):
    """
    Build the flattened pairwise ("2D") feature vector for one sequence.

    For every column y except the last, and for every row z, the block
    encoded[z, y] * encoded[:, y + 1:] is appended column-wise. The combined
    matrix is then flattened in column-major ('F') order.

    Parameters
    ---
    encoded: 2-D NumPy array holding the one-hot encoding of one sequence
        (assumed layout: one row per alphabet symbol, one column per
        sequence position - TODO confirm against one_hot_encoder).

    Returns
    ---
    1-D float array with the flattened pairwise features.
    """
    n_rows = encoded.shape[0]
    n_cols = encoded.shape[1]

    # The leading zero-width float block keeps the result dtype float and
    # makes a sequence with fewer than two columns yield an empty vector.
    blocks = [np.empty([n_rows, 0])]
    for y in range(n_cols - 1):
        # All columns to the right of y; shared by every row z
        following = encoded[:, y + 1:]
        for z in range(n_rows):
            blocks.append(encoded[z, y] * following)

    # Concatenate once instead of re-copying the growing array on every step
    return np.concatenate(blocks, axis=1).flatten('F')


def LogReg2D_classification(dataset, filename):
    """
    Classification of data with 2D logistic regression, followed by
    plotting of ROC and PR curves.

    Each sequence is one-hot encoded and expanded into pairwise features
    (see _pairwise_one_hot) before a logistic regression is fitted.

    Parameters
    ---
    dataset: the input dataset; dataset.train and dataset.test must offer
        the columns 'AASeq' (sequences) and 'AgClass' (labels) via .loc.
    filename: an identifier to distinguish different plots from each other;
        it is inserted into the plot titles and the figure file names under
        'figures/'.

    Returns
    ---
    stats: output of calc_stat for the true and predicted test labels
        (accuracy, precision and recall according to the original
        documentation).
    """
    # Import training/test set
    X_train = dataset.train.loc[:, 'AASeq'].values
    X_test = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences, then expand them into pairwise features
    X_train = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_train]
    X_train = [_pairwise_one_hot(x) for x in X_train]

    X_test = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_test]
    X_test = [_pairwise_one_hot(x) for x in X_test]

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(
        y_test, y_score[:, 1], plot_title=title,
        plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename)
    )

    # Precision-recall curve
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    plot_PR_curve(
        y_test, y_score[:, 1], plot_title=title,
        plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename)
    )

    # Calculate and return statistics
    stats = calc_stat(y_test, y_pred)
    return stats
def train_and_evaluate(model, model_name='CNN', save_folder='./', nr_epochs=10,
                       pred_threshold=0.5):
    """
    DESCRIPTION: Fit the given model on the training generator, score it on
    the validation data and write the results to disk.

    Parameters:
    ----------
    model: A fully constructed and compiled model;
    model_name: Used for the filenames of results and the TensorBoard logs
    save_folder: Folder that receives the model JSON, the best weights and
        the score file (TensorBoard logs go to TB_LOG_PATH instead)
    nr_epochs: Number of epochs passed to model.fit
    pred_threshold: Predictions strictly above this value are counted as
        positive when computing F1 and accuracy

    Returns:
    -------
    model: The fitted model is returned.
    """
    train_gen, val_gen, val_gen_no_shuffle = get_generators(DATA_PATH)

    # Output locations for the architecture and the best weights
    architecture_path = os.path.join(save_folder, model_name + '.json')
    best_weights_path = os.path.join(save_folder, model_name + '_weights.hdf5')

    # Store the architecture as JSON before training starts
    architecture = model.to_json()
    with open(architecture_path, 'w') as json_file:
        json_file.write(architecture)

    # Keep only the weights with the lowest validation loss, and log to
    # TensorBoard
    callbacks_list = [
        ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min'),
        TensorBoard(os.path.join(TB_LOG_PATH, model_name)),
    ]

    # Whole batches per epoch for training and for validation
    train_steps = train_gen.n // train_gen.batch_size
    val_steps = val_gen.n // val_gen.batch_size

    history = model.fit(
        train_gen,
        steps_per_epoch=train_steps,
        validation_data=val_gen,
        validation_steps=val_steps,
        epochs=nr_epochs,
        callbacks=callbacks_list,
    )

    # Predict on the non-shuffled validation generator
    y_pred = model.predict(val_gen_no_shuffle, verbose=0)
    y_pred_bin = (y_pred > pred_threshold).astype(int)

    # NOTE(review): labels are read from val_gen while the predictions come
    # from val_gen_no_shuffle - presumably both expose the labels in the same
    # order; confirm against get_generators.
    true_labels = val_gen.labels
    fpr, tpr, _ = roc_curve(true_labels, y_pred)
    auc_score = roc_auc_score(true_labels, y_pred)
    f1 = f1_score(true_labels, y_pred_bin)
    acc = accuracy_score(true_labels, y_pred_bin)

    # Persist training history, ROC plot and scalar scores
    save_history(history, model_name, save_folder)
    plot_ROC_curve(fpr, tpr, save_folder, model_name)

    scores_path = os.path.join(save_folder, model_name + '_scores.txt')
    with open(scores_path, 'w') as result_file:
        result_file.writelines([
            'AUC score = {}\n'.format(auc_score),
            'F1 score = {}\n'.format(f1),
            'Accuracy score = {}\n'.format(acc),
        ])

    return model
def train_and_evaluate(model, model_name='CNN', save_folder='./', nr_epochs=25,
                       train_fraction=1, val_fraction=1, adaptive_LR=False,
                       pred_threshold=0.5):
    """
    DESCRIPTION: This function trains and evaluates the CNN.

    Parameters:
    ----------
    model: A fully constructed and compiled model;
    model_name: Used for the filenames of results and the TensorBoard logs.
        A sub-folder with this name is created inside save_folder.
    save_folder: Used to determine where results should be saved (excl.
        Tensorboard logs!)
    nr_epochs: The number of epochs to be used for training the model
    train_fraction: The fraction of training steps to be used. A lower
        fraction results in a faster training model, but this model will be
        trained on less data. Must be non-zero.
    val_fraction: The fraction of validation steps to be used during model
        training. A lower fraction results in a faster evaluation of the
        model, but this evaluation will be less accurate. Must be non-zero.
    adaptive_LR: If True, a LearningRateScheduler callback driven by
        adaptive_LR_schedule is added to the training callbacks.
    pred_threshold: The confidence after which a prediction is assumed to
        be positive. Defaults to 0.5; everything above 0.5 will be regarded
        as a positive identification.

    Returns:
    -------
    model: The fitted model is returned.
    """
    # All outputs of this run go into <save_folder>/<model_name>/
    save_folder = os.path.join(save_folder, model_name)
    os.makedirs(save_folder, exist_ok=True)

    # Get the data generators
    train_gen, val_gen, val_gen_no_shuffle = get_generators(DATA_PATH)

    # Define filepaths to save the model and weights
    model_filepath = os.path.join(save_folder, model_name + '.json')
    weights_filepath = os.path.join(save_folder, model_name + '_weights.hdf5')

    # Save the model to a .json file
    model_json = model.to_json()
    with open(model_filepath, 'w') as json_file:
        json_file.write(model_json)

    # Define the model checkpoint and TensorBoard callbacks
    checkpoint = ModelCheckpoint(weights_filepath, monitor='val_loss',
                                 verbose=1, save_best_only=True, mode='min')
    tensorboard = TensorBoard(os.path.join(TB_LOG_PATH, model_name))
    callbacks_list = [checkpoint, tensorboard]
    if adaptive_LR:
        # The scheduler is only needed (and only built) when requested
        callbacks_list.append(
            LearningRateScheduler(adaptive_LR_schedule, verbose=0))

    # Define the number of training steps to use during a single epoch, and
    # the number of validation steps validated on per epoch.
    # Floor-dividing by the float (1 / fraction) yields a float, while
    # steps_per_epoch / validation_steps are documented as integers, so the
    # result is converted explicitly (the numeric value is unchanged).
    train_steps = int(
        train_gen.n // train_gen.batch_size // (1 / train_fraction))
    val_steps = int(val_gen.n // val_gen.batch_size // (1 / val_fraction))

    # Train the model
    history = model.fit(train_gen, steps_per_epoch=train_steps,
                        validation_data=val_gen,
                        validation_steps=val_steps,
                        epochs=nr_epochs,
                        callbacks=callbacks_list)

    # Evaluate model on the non-shuffled validation generator
    y_pred = model.predict(val_gen_no_shuffle, verbose=0)
    y_pred_bin = (y_pred > pred_threshold).astype(int)

    # Calculate scores
    # NOTE(review): labels are read from val_gen while the predictions come
    # from val_gen_no_shuffle - presumably both expose the labels in the same
    # order; confirm against get_generators.
    fpr, tpr, _ = roc_curve(val_gen.labels, y_pred)
    auc_score = roc_auc_score(val_gen.labels, y_pred)
    f1 = f1_score(val_gen.labels, y_pred_bin)
    acc = accuracy_score(val_gen.labels, y_pred_bin)

    # Save results
    save_history(history.history, model_name, save_folder)
    plot_history(history.history, model_name, save_folder)
    plot_ROC_curve(fpr, tpr, save_folder, model_name)

    scores_filepath = os.path.join(save_folder, model_name + '_scores.txt')
    with open(scores_filepath, 'w') as result_file:
        result_file.write('AUC score = {}\n'.format(auc_score))
        result_file.write('F1 score = {}\n'.format(f1))
        result_file.write('Accuracy score = {}\n'.format(acc))

    return model
def RNN_classification(dataset, filename, save_model=False):
    """
    Train a recurrent neural network on one-hot encoded sequences, plot ROC
    and PR curves for the test split, and return either the model or
    summary statistics.

    Parameters
    ---
    dataset: the input dataset; dataset.train, dataset.test and dataset.val
        must offer the columns 'AASeq' (sequences) and 'AgClass' (labels)
        via .loc.
    filename: identifier inserted into the plot titles and into the names of
        the figure files written under 'figures/'.
    save_model: optional; if truthy, it is used as the directory in which
        the model summary ('RNN_summary.txt') and the model ('RNN_HER2')
        are saved, and the trained model is returned. If falsy, the output
        of calc_stat (accuracy, precision and recall according to the
        original documentation) is returned instead.
    """
    def _encode(sequences):
        # One-hot encode every sequence, stack them, and swap the last two
        # axes of the resulting array
        encoded = [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
                   for seq in sequences]
        return np.transpose(np.asarray(encoded), (0, 2, 1))

    # Raw sequences of the three splits
    train_seqs = dataset.train.loc[:, 'AASeq'].values
    test_seqs = dataset.test.loc[:, 'AASeq'].values
    val_seqs = dataset.val.loc[:, 'AASeq'].values

    X_train = _encode(train_seqs)
    X_test = _encode(test_seqs)
    X_val = _encode(val_seqs)

    # Labels of the three splits
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Build and compile the network
    network = create_rnn()
    network.compile(optimizer='rmsprop', loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Train, monitoring performance on the validation split
    network.fit(x=X_train, y=y_train, shuffle=True,
                validation_data=(X_val, y_val),
                epochs=20, batch_size=32, verbose=2)

    # Scores predicted for the test split
    y_pred = network.predict(x=X_test)

    # ROC curve
    roc_title = 'RNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/RNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, y_pred, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'RNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/RNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, y_pred, plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Scores strictly above 0.5 count as positive predictions
        return calc_stat(y_test, y_pred > 0.5)

    # Capture the printed model summary in a text file
    summary_path = os.path.join(save_model, 'RNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        network.summary()

    # Save the trained model next to the summary
    network.save(os.path.join(save_model, 'RNN_HER2'))
    return network
def CNN_classification(dataset, filename, save_model=False, params=None):
    """
    Train a convolutional neural network on one-hot encoded sequences, plot
    ROC and PR curves for the test split, and return either the model or
    summary statistics.

    Parameters
    ---
    dataset: the input dataset; dataset.train, dataset.test and dataset.val
        must offer the columns 'AASeq' (sequences) and 'AgClass' (labels)
        via .loc.
    filename: identifier inserted into the plot titles and into the names of
        the figure files written under 'figures/'.
    save_model: optional; if truthy, it is used as the directory in which
        the model summary ('CNN_summary.txt') and the model ('CNN_HER2')
        are saved, and the trained model is returned. If falsy, the output
        of calc_stat (accuracy, precision and recall according to the
        original documentation) is returned instead.
    params: optional; layer specification handed to create_cnn, e.g. the
        result of a separate tuning step. If None (or otherwise falsy), a
        hard-coded default architecture is used.
    """
    def _encode(sequences):
        # One-hot encode every sequence, stack them, and swap the last two
        # axes of the resulting array
        encoded = [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
                   for seq in sequences]
        return np.transpose(np.asarray(encoded), (0, 2, 1))

    # Raw sequences of the three splits
    train_seqs = dataset.train.loc[:, 'AASeq'].values
    test_seqs = dataset.test.loc[:, 'AASeq'].values
    val_seqs = dataset.val.loc[:, 'AASeq'].values

    X_train = _encode(train_seqs)
    X_test = _encode(test_seqs)
    X_val = _encode(val_seqs)

    # Labels of the three splits
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values
    y_val = dataset.val.loc[:, 'AgClass'].values

    # Fall back to the built-in architecture when none was supplied
    if not params:
        params = [
            ['CONV', 400, 3, 1],
            ['DROP', 0.5],
            ['POOL', 2, 1],
            ['FLAT'],
            ['DENSE', 50],
        ]

    # Build the network; (10, 20) is presumably the per-sequence input
    # shape (positions x alphabet size) - TODO confirm against create_cnn
    network = create_cnn(params, (10, 20), 'relu', None)

    # Compile with Adam at a fixed learning rate
    optimizer = Adam(learning_rate=0.000075)
    network.compile(optimizer=optimizer, loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Train, monitoring performance on the validation split
    network.fit(x=X_train, y=y_train, shuffle=True,
                validation_data=(X_val, y_val),
                epochs=20, batch_size=16, verbose=2)

    # Scores predicted for the test split
    y_pred = network.predict(x=X_test)

    # ROC curve
    roc_title = 'CNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/CNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(y_test, y_pred, plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'CNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/CNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(y_test, y_pred, plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Scores strictly above 0.5 count as positive predictions
        return calc_stat(y_test, y_pred > 0.5)

    # Capture the printed model summary in a text file
    summary_path = os.path.join(save_model, 'CNN_summary.txt')
    with open(summary_path, 'w') as f, redirect_stdout(f):
        network.summary()

    # Save the trained model next to the summary
    network.save(os.path.join(save_model, 'CNN_HER2'))
    return network