def _first_score_lookup(accessions, scores):
    '''Map each accession to the score paired with its first occurrence.

    Keeping the first occurrence mirrors a `list.index` based lookup.
    Accessions that have no score at the same position are left out.
    '''
    lookup = {}
    for accession, score in zip(accessions, scores):
        lookup.setdefault(accession, score)
    return lookup


def compare_hmmscores(fasta, hmm1, hmm2):
    '''Return a dataframe whose columns are the accession numbers in fasta and
    the corresponding hmm scores from the hmm output files, hmm1 and hmm2,
    respectively.

    Parameters
    ----------
    fasta : passed unchanged to `bioinf.get_accession` to obtain the
        accessions that make up the rows of the result.
    hmm1, hmm2 : passed unchanged to `get_acc_and_scores`, which must return
        a pair (accessions, scores) for each file.

    Returns
    -------
    pandas.DataFrame with columns 'accession', 'hmm1_scores', 'hmm2_scores',
    one row per accession in fasta, in the same order.

    Notes
    -----
    An accession that is missing from an hmm output gets a score of 0
    (i.e. it is treated as below the threshold). If an accession occurs more
    than once in an hmm output, the score of its first occurrence is used.
    '''
    acc_all = bioinf.get_accession(fasta)
    acc1, score1 = get_acc_and_scores(hmm1)
    acc2, score2 = get_acc_and_scores(hmm2)

    # Build each lookup once instead of scanning the accession list for
    # every sequence in the fasta file.
    lookup1 = _first_score_lookup(acc1, score1)
    lookup2 = _first_score_lookup(acc2, score2)

    # Assign a score of 0 if it's below the threshold (absent from output)
    hmm1_scores = [lookup1.get(acc, 0) for acc in acc_all]
    hmm2_scores = [lookup2.get(acc, 0) for acc in acc_all]

    store = pd.DataFrame([acc_all, hmm1_scores, hmm2_scores]).transpose()
    store.columns = ['accession', 'hmm1_scores', 'hmm2_scores']
    return store
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")
import bioinformatics as bioinf


# Prepare sequences and data
#=====================================================#
GH13_df = pd.read_csv('results_final/ncbi_subtypes.csv')
# Rows whose predicted class is 0 define the positive ("SH") set.
GH13_SH = GH13_df[(GH13_df.ncbi_pred_class == 0)]
accession_SH = GH13_SH.Accession.tolist()
accession_all = bioinf.get_accession('fasta/initial_blast/nrblast_all.fasta')
# Build the set once so each membership test below is O(1) rather than a
# scan of the whole accession_SH list.
_accession_SH_lookup = set(accession_SH)
# class labels: 1 if the accession is in the SH set, otherwise 0
GH13 = [1 if x in _accession_SH_lookup else 0 for x in accession_all]
y = pd.Series(GH13)
GH13_not_SH = y[y == 0]
GH13_yes_SH = y[y == 1]


# Derive features for machine learning with one-hot encoding
#============================================================#
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
X_features = pd.DataFrame()  # empty dataframe for storing features
# One iteration per column of sequence_df.
# NOTE(review): only the first statement of this loop body is visible in
# this chunk; the rest of the body lies outside the reviewed range.
for i in range(len(sequence_df.columns)):
    # Convert amino acids to integers
    X_resid = list(sequence_df.iloc[:, i])
# NOTE(review): this chunk starts part-way through the import section.
# LabelBinarizer, to_categorical, Tokenizer, sequence, pd and bioinf are used
# below but imported outside the visible range.
from keras.callbacks import ReduceLROnPlateau
import os
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# NOTE(review): confusion_matrix is already imported above; this repeat is
# harmless but redundant.
from sklearn.metrics import confusion_matrix
from keras.models import load_model

# load train and test dataset
# The same fasta file is read three times below: for the sequences, for the
# accessions, and (via fasta_to_df) for the column count.
h, sequences = bioinf.split_fasta('fasta/GH13_positions_only/GH13_cat.fasta')
heads=bioinf.get_accession('fasta/GH13_positions_only/GH13_cat.fasta')
# Class labels come from the 'ncbi_pred_class' column of the subtypes table.
subtype = list(pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class'])
lb = LabelBinarizer()
y = lb.fit_transform(subtype)
# Binarized labels are passed through to_categorical; presumably this yields
# one column per class for the network output -- TODO confirm.
y = to_categorical(y)
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
# Padding length is the number of columns in sequence_df.
max_length = len(sequence_df.columns)
# NOTE(review): embedding_dim and top_classes are not used in the visible
# chunk; presumably they are consumed by model code further down.
embedding_dim = 11
top_classes=2
# Tokenizer is created with char_level=True and fitted on the sequences,
# which are then converted to integer sequences and padded to max_length.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
X_seq = tokenizer.texts_to_sequences(sequences)
X_seq = sequence.pad_sequences(X_seq, maxlen=max_length)
# Disabled hold-out split, kept as in the original:
#X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=.2)
store.append(looplength) # Save results as DataFrame result = pd.DataFrame(store) result.columns = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4'] return result # Calculate loop lengths msafile = 'fasta/structure_based_alignment/cel7_nr99_structaln.fasta' looplength = get_gh7looplength(msafile, trecel7a_pos=0) # Write results to spreadsheet looplength.index = range(1, len(looplength)+1) looplength['accession'] = bioinf.get_accession(msafile) looplength.to_csv('results_final/looplength.csv') # Data preprocessing: prepare data for machine learning #================================================================# # Retrieve data looplength = pd.read_csv('results_final/looplength.csv', index_col=0) subtype = pd.read_csv('results_final/cel7_subtypes.csv', index_col=0) looplength.index = range(len(looplength)) subtype.index = range(len(subtype))
# Run blastp with the trecel7a_cbm query against the cel7_cbm database and
# write the hits to a text file.
output = 'results_final/cbm_blast_output.txt'
blast_cline = NcbiblastpCommandline(cmd=blastp_exe, query=trecel7a_cbm,
                                    db='cbm_database/cel7_cbm', evalue=1e-3,
                                    outfmt=7, num_alignments=2000, out=output)
stdout, stderr = blast_cline()
# evalue of 1e-3 or less corresponds to bit score of 30 or more


# CBM data for all sequences
# NOTE(review): the csv read here is not produced by the code above; the
# txt -> csv conversion happens outside this script.
ex = pd.read_csv(
    'results_final/cbm_blast_output.csv')  # csv derived from txt file output
accession_cbm = list(ex['subject'])
accession_all = bioinf.get_accession(
    'fasta/initial_blast/cel7_nr99_full_length.fasta')
# Build the set once so each membership test is O(1) instead of a list scan.
_accession_cbm_lookup = set(accession_cbm)
# 1 if the sequence appears among the BLAST subjects, otherwise 0
has_cbm = [1 if x in _accession_cbm_lookup else 0 for x in accession_all]
df = pd.DataFrame([accession_all, has_cbm],
                  index=['accession', 'has_cbm']).transpose()
df.to_csv('results_final/has_cbm.csv')


# CBM distribution
df['subtype'] = pd.read_excel(
    'results_final/cel7_subtypes.xlsx')['ncbi_pred_class']
df_cbh = df[df['subtype'] == 1]
df_egl = df[df['subtype'] == 0]
# Count each subtype once. .get(..., 0) reports an empty category as 0
# instead of raising KeyError when a subtype has no sequences in it.
_cbh_counts = df_cbh.has_cbm.value_counts()
_egl_counts = df_egl.has_cbm.value_counts()
cbh_cbm = _cbh_counts.get(1, 0)
cbh_nocbm = _cbh_counts.get(0, 0)
egl_cbm = _egl_counts.get(1, 0)
egl_nocbm = _egl_counts.get(0, 0)