Esempio n. 1
0
def compare_hmmscores(fasta, hmm1, hmm2):
    '''Return a dataframe of HMM scores for every accession in fasta.

    The accession numbers are read from fasta with bioinf.get_accession,
    and the accession and score lists of the two HMM output files, hmm1
    and hmm2, are read with get_acc_and_scores.  An accession that has no
    entry in an HMM output is given a score of 0 for that HMM.

    Returns a dataframe with one row per accession in fasta and the
    columns 'accession', 'hmm1_scores' and 'hmm2_scores'.
    '''

    def first_score_by_accession(accessions, scores):
        # Map each accession to the score of its first occurrence, which is
        # the entry a list.index() lookup would have selected.
        lookup = {}
        for acc, score in zip(accessions, scores):
            lookup.setdefault(acc, score)
        return lookup

    acc_all = bioinf.get_accession(fasta)
    acc1, score1 = get_acc_and_scores(hmm1)
    acc2, score2 = get_acc_and_scores(hmm2)
    lookup1 = first_score_by_accession(acc1, score1)
    lookup2 = first_score_by_accession(acc2, score2)

    # Assign a score of 0 if the accession is missing from the HMM output
    # (original author's note: the hit is below the threshold).
    hmm1_scores = [lookup1.get(acc, 0) for acc in acc_all]
    hmm2_scores = [lookup2.get(acc, 0) for acc in acc_all]

    store = pd.DataFrame([acc_all, hmm1_scores, hmm2_scores]).transpose()
    store.columns = ['accession', 'hmm1_scores', 'hmm2_scores']
    return store
Esempio n. 2
0
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import bioinformatics as bioinf

# Prepare sequences and data
#=====================================================#
# Accessions whose predicted class is 0 in the subtype table
GH13_df = pd.read_csv('results_final/ncbi_subtypes.csv')
GH13_SH = GH13_df[(GH13_df.ncbi_pred_class == 0)]
accession_SH = GH13_SH.Accession.tolist()
accession_all = bioinf.get_accession('fasta/initial_blast/nrblast_all.fasta')
# Build the lookup set once so each membership test is a constant-time hash
# lookup instead of a scan through the whole accession_SH list
_accession_SH_lookup = set(accession_SH)
GH13 = [1 if x in _accession_SH_lookup else 0 for x in accession_all]
# class labels: 1 if the accession is in accession_SH, otherwise 0
y = pd.Series(GH13)
GH13_not_SH = y[y == 0]
GH13_yes_SH = y[y == 1]

# Derive features for machine learning with one-hot encoding
#============================================================#
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
X_features = pd.DataFrame()  # empty dataframe for storing features

for i in range(len(sequence_df.columns)):
    # Convert amino acids to integers
    X_resid = list(sequence_df.iloc[:, i])
Esempio n. 3
0
from keras.callbacks import ReduceLROnPlateau
import os

import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from keras.models import load_model

# Load the sequences, their accessions and the alignment table, all from the
# same fasta file
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
h, sequences = bioinf.split_fasta(cat_domain_fasta)
heads = bioinf.get_accession(cat_domain_fasta)
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)

# Class labels: binarize the predicted subtypes, then convert them to
# categorical form
subtype = list(
    pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class']
)
lb = LabelBinarizer()
y = to_categorical(lb.fit_transform(subtype))

# Model settings; the sequence length is the number of columns in sequence_df
max_length = len(sequence_df.columns)
embedding_dim = 11
top_classes = 2

# Character-level tokenizer fitted on the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)

# Encode each sequence as integers and pass the result through
# pad_sequences with maxlen=max_length
X_seq = sequence.pad_sequences(
    tokenizer.texts_to_sequences(sequences),
    maxlen=max_length,
)
#X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=.2)
Esempio n. 4
0
        store.append(looplength)
        
    # Save results as DataFrame
    result = pd.DataFrame(store)
    result.columns = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']
    return result


# Compute the loop lengths from the structure-based alignment
msafile = 'fasta/structure_based_alignment/cel7_nr99_structaln.fasta'
looplength = get_gh7looplength(msafile, trecel7a_pos=0)


# Write results to spreadsheet: rows are numbered from 1 and the accessions
# read from the same alignment file are added as an extra column
looplength.index = range(1, 1 + len(looplength))
looplength['accession'] = bioinf.get_accession(msafile)
looplength.to_csv('results_final/looplength.csv')






# Data preprocessing: prepare data for machine learning
#================================================================#

# Retrieve the saved loop lengths and subtypes; the first csv column is
# read as the index
looplength = pd.read_csv('results_final/looplength.csv', index_col=0)
subtype = pd.read_csv('results_final/cel7_subtypes.csv', index_col=0)

# Renumber the rows of both tables from 0
for _frame in (looplength, subtype):
    _frame.index = range(len(_frame))
Esempio n. 5
0
# Build the blastp command line for the trecel7a_cbm query against the
# cel7_cbm database, then run it; the results go to the output file
output = 'results_final/cbm_blast_output.txt'
blast_cline = NcbiblastpCommandline(
    cmd=blastp_exe,
    query=trecel7a_cbm,
    db='cbm_database/cel7_cbm',
    # evalue of 1e-3 or less corresponds to bit score of 30 or more
    evalue=1e-3,
    outfmt=7,
    num_alignments=2000,
    out=output,
)
stdout, stderr = blast_cline()

# CBM data for all sequences
ex = pd.read_csv(
    'results_final/cbm_blast_output.csv')  # csv derived from txt file output
accession_cbm = list(ex['subject'])
accession_all = bioinf.get_accession(
    'fasta/initial_blast/cel7_nr99_full_length.fasta')
# Build the lookup set once so each membership test is a constant-time hash
# lookup instead of a scan through the whole accession_cbm list
_accession_cbm_lookup = set(accession_cbm)
# 1 if the accession appears in the 'subject' column of the csv, otherwise 0
has_cbm = [1 if x in _accession_cbm_lookup else 0 for x in accession_all]
df = pd.DataFrame([accession_all, has_cbm], index=['accession',
                                                   'has_cbm']).transpose()
df.to_csv('results_final/has_cbm.csv')

# CBM distribution
# The subtype labels are attached to df by index alignment with the
# 'ncbi_pred_class' column of the spreadsheet.
df['subtype'] = pd.read_excel(
    'results_final/cel7_subtypes.xlsx')['ncbi_pred_class']
# NOTE(review): subtype 1 is presumably CBH and subtype 0 EGL, judging by
# the variable names; confirm against the subtype table.
df_cbh = df[df['subtype'] == 1]
df_egl = df[df['subtype'] == 0]
# Count rows with a boolean mask rather than value_counts()[k]: the mask sum
# is 0 when no row has the value, whereas value_counts() omits the value
# and indexing it raises KeyError.
cbh_cbm = (df_cbh.has_cbm == 1).sum()
cbh_nocbm = (df_cbh.has_cbm == 0).sum()
egl_cbm = (df_egl.has_cbm == 1).sum()
egl_nocbm = (df_egl.has_cbm == 0).sum()