Esempio n. 1
0
    os.makedirs(dir_results)

# Score the distance matrices with Assess_Performance_KNN (defined elsewhere
# in the project), requesting only the AUC metric.
df_metrics = Assess_Performance_KNN(
    distances_list, names, DTCRU.class_id, dir_results, metrics=['AUC'])

# Copy the three result columns under the Class/AUC/Method names, then tag
# every row as coming from the unsupervised methods.
df_u = pd.DataFrame({
    'Class': df_metrics['Classes'],
    'AUC': df_metrics['Value'],
    'Method': df_metrics['Algorithm'],
})
df_u['Type'] = 'Unsupervised'

# Run the supervised sequence classifier.
DTCRS = DeepTCR_SS('Sequence_C')

# Column positions forwarded to Get_Data for the murine antigen files.
murine_columns = {
    'aa_column_beta': 0,
    'count_column': 1,
    'v_beta_column': 2,
    'j_beta_column': 3,
}
DTCRS.Get_Data(directory='../../Data/Murine_Antigens',
               Load_Prev_Data=True,
               aggregate_by_aa=True,
               **murine_columns)

# Accumulators for the results of the repeated runs; in the visible code they
# are only initialised.
AUC = []
Class = []
Method = []
# Ten repeats: each one calls Get_Train_Valid_Test() and then Train().
# NOTE(review): the loop body appears to continue beyond this excerpt.
for i in range(10):
    DTCRS.Get_Train_Valid_Test()

    # presumably restricts the model to the sequence input only — TODO confirm
    # against the DeepTCR documentation.
    DTCRS.Train(use_only_seq=True)
Esempio n. 2
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np

# Epitope of interest and the McPAS-TCR column names used further down.
epitope = 'ELAGIGILTV'
cdr3_beta_col, cdr3_alpha_col = 'CDR3.beta.aa', 'CDR3.alpha.aa'
epitope_col = 'Epitope.peptide'

df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg_mart1', device=2)

# Check performance on sequences not in McPAS: build an "<alpha>_<beta>" id for
# every row of the regression table so those pairs can be excluded later.
df_train_pep = pd.DataFrame({
    chain: np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta')
})
df_train_pep['seq_id'] = df_train_pep['alpha'] + '_' + df_train_pep['beta']

# Load McPAS-TCR and drop rows missing either CDR3 chain.
df_tcr = pd.read_csv('../../../Data/McPAS-TCR.csv')
df_tcr = df_tcr.dropna(subset=[cdr3_alpha_col, cdr3_beta_col])

# Collapse to one row per (alpha, beta) pair, keeping the first epitope found.
df_tcr = (df_tcr.groupby([cdr3_alpha_col, cdr3_beta_col])[epitope_col]
          .first()
          .reset_index())

# Exclude every pair whose "<alpha>_<beta>" id also occurs in df_train_pep.
df_tcr['seq_id'] = df_tcr[cdr3_alpha_col] + '_' + df_tcr[cdr3_beta_col]
unseen = ~df_tcr['seq_id'].isin(df_train_pep['seq_id'])
df_tcr = df_tcr[unseen]

# Character class matching punctuation, a space or a digit; any CDR3 string
# containing such a character is discarded, alpha chain first and then beta.
remove = ["""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ 1234567890]"""]
invalid_chars = '|'.join(remove)
for cdr3_col in (cdr3_alpha_col, cdr3_beta_col):
    df_tcr = df_tcr[~df_tcr[cdr3_col].str.contains(invalid_chars, regex=True)]
Esempio n. 3
0
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib
matplotlib.rc('font', family='Arial')

# Instantiate the training object.
DTCRU = DeepTCR_SS('Murine_Sup')

# Load the data, restricted to the five classes listed here.
selected_classes = ['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1']
DTCRU.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=selected_classes,
)
Esempio n. 4
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import shutil
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Load the regression table; the code below reads its 'alpha' and 'beta'
# columns plus one antigen column selected by name.
df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg',device=2)
# Worker pool handed to Load_Data below.
# NOTE(review): no close()/join() is visible in this excerpt — confirm the
# pool is released later in the script.
p = Pool(40)

#Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

antigen = 'A0201_GILGFVFTL_Flu-MP_Influenza'
# Positional index of the antigen column; the trailing [0][0] raises
# IndexError when no column carries exactly this name.
i = np.where(df.columns==antigen)[0][0]
sel = df.iloc[:, i]
# Regression target: log2(value + 1) of the selected column.
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y, p=p)
DTCRS.K_Fold_CrossVal(split_by_sample=False, folds=5)
DTCRS.Representative_Sequences(top_seq=100,motif_seq=10,color_scheme='hydrophobicity')

# NOTE(review): this name shadows the built-in dir() for the rest of the
# module; rename once all later uses are visible.
dir = 'Reg_Rep_Sequences'
Esempio n. 5
0
import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
from sklearn.metrics import roc_auc_score, roc_curve

DTCRS = DeepTCR_SS('reg_flu', device=2)

# The alpha/beta sequence pair examined below.
alpha = 'CAGAGSQGNLIF'
beta = 'CASSSRSSYEQYF'

# Per-residue 0/1 labels, one entry per character of the matching sequence:
# four zeros, four ones, then zeros to the end.
contacts_alpha = [0] * 4 + [1] * 4 + [0] * 4
contacts_beta = [0] * 4 + [1] * 4 + [0] * 5

# Each sequence is supplied twice, giving arrays of length two.
input_alpha = np.array([alpha] * 2)
input_beta = np.array([beta] * 2)
# Draw the residue sensitivity logo for the duplicated alpha/beta pair on a
# black background; the call returns a figure and its axes.
fig_rsl, ax_rsl = DTCRS.Residue_Sensitivity_Logo(input_alpha,
                                                 input_beta,
                                                 background_color='black',
                                                 Load_Prev_Data=False)

# One row per alpha residue: the residue letter, the value read from
# DTCRS.mag_alpha and the 0/1 contact label.
# assumes mag_alpha holds one entry per residue of `alpha` — TODO confirm.
df_alpha = pd.DataFrame()
df_alpha['seq'] = list(alpha)
df_alpha['mag'] = DTCRS.mag_alpha
df_alpha['label'] = contacts_alpha

# Same layout for the beta chain (the excerpt ends before a label column is
# added).
df_beta = pd.DataFrame()
df_beta['seq'] = list(beta)
df_beta['mag'] = DTCRS.mag_beta
Esempio n. 6
0
"""This script runs regression for the 10x Dataset where alpha/beta TCR's are
regressed against the quantitative evaluation of antigen-specificity via
dCODE Dextramer reagents"""

import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Load the regression table; columns 'alpha' and 'beta' hold the sequences
# and every column from position 2 onward is regressed in the loop below.
df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg',device='/gpu:2')
# Worker pool passed to Load_Data on every iteration.
# NOTE(review): no close()/join() is visible in this excerpt.
p = Pool(40)

#Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

# Per-antigen result accumulators; only y_pred is filled in the visible code.
y_pred = []
y_test = []
antigen = []
#Iterate through all antigens
for i in range(2,df.columns.shape[0]):
    print(df.iloc[:,i].name)
    sel = df.iloc[:,i]
    # Regression target: log2(value + 1) of the current column.
    Y = np.log2(np.asarray(sel.tolist()) + 1)
    DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y,p=p)
    DTCRS.K_Fold_CrossVal(split_by_sample=False,folds=5)
    # NOTE(review): the loop body appears to continue beyond this excerpt.
    y_pred.append(DTCRS.y_pred)
Esempio n. 7
0
    os.makedirs(dir_results)

# Evaluate the distance matrices through Assess_Performance_KNN (a helper
# defined elsewhere in the project), asking for the AUC metric only.
knn_metrics = ['AUC']
df_metrics = Assess_Performance_KNN(distances_list, names, DTCRU.class_id,
                                    dir_results, metrics=knn_metrics)

# Re-expose Classes/Value/Algorithm as Class/AUC/Method and mark the rows as
# belonging to the unsupervised methods.
df_u = pd.DataFrame({
    'Class': df_metrics['Classes'],
    'AUC': df_metrics['Value'],
    'Method': df_metrics['Algorithm'],
})
df_u['Type'] = 'Unsupervised'

#Run Supervised Sequence Classifier
DTCRS = DeepTCR_SS('Sequence_C', device=1)
# Load_Prev_Data=True is passed through as-is; presumably it reuses data
# cached by an earlier run — TODO confirm against the DeepTCR documentation.
DTCRS.Get_Data(directory='../../Data/Murine_Antigens',
               Load_Prev_Data=True,
               aggregate_by_aa=True,
               aa_column_beta=0,
               count_column=1,
               v_beta_column=2,
               j_beta_column=3)

# Accumulators for the per-fold results; only initialised in the visible code.
AUC = []
Class = []
Method = []
folds = 100
# One seed per fold: 0, 1, ..., folds - 1.
seeds = np.array(range(folds))
for i in range(folds):
    # Re-seed NumPy's global RNG with this fold's seed.
    # NOTE(review): the loop body appears to continue beyond this excerpt.
    np.random.seed(seeds[i])
Esempio n. 8
0
import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')

DTCRS = DeepTCR_SS('reg_mart1', device=2)

# The alpha/beta sequence pair examined below.
alpha = 'CAVNFGGGKLIF'
beta = 'CASSWSFGTEAFF'

# Each sequence is supplied twice, giving arrays of length two.
input_alpha, input_beta = (np.array([chain] * 2) for chain in (alpha, beta))
# Run inference on the duplicated alpha/beta pair, then draw its residue
# sensitivity logo on a black background.
pred = DTCRS.Sequence_Inference(input_alpha, input_beta)
fig_rsl, ax_rsl = DTCRS.Residue_Sensitivity_Logo(input_alpha,
                                                 input_beta,
                                                 background_color='black',
                                                 Load_Prev_Data=False)

# Save the logo at 1200 dpi, keeping the black figure background.
fig_rsl.savefig('mart1_rsl.png', dpi=1200, facecolor='black')

# Two side-by-side panels; the left one is a swarm plot of the 'high' column
# against 'pos' from the first entry of DTCRS.df_alpha_list.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.swarmplot(data=DTCRS.df_alpha_list[0], x='pos', y='high', ax=ax[0])
# Style panel 0: no axis titles, residue letters of `alpha` as x tick labels,
# larger y tick labels, and the top/right spines hidden.
i = 0
ax[i].set_xlabel('')
ax[i].set_ylabel('')
ax[i].set_xticklabels(list(alpha), size=24)
ax[i].tick_params(axis='y', labelsize=18)
ax[i].spines['right'].set_visible(False)
ax[i].spines['top'].set_visible(False)
Esempio n. 9
0
from DeepTCR.DeepTCR import DeepTCR_SS
from multiprocessing import Pool
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import os

p = Pool(80)
# Name of the results directory used by this script.
dir_results = 'alpha_v_beta_results'
# exist_ok=True creates the directory only when it is missing, avoiding the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir_results, exist_ok=True)

# Antigen names; each one is also the name of a sub-directory of Data/Zhang.
antigens = [
    'GANAB-S5F',
    'ATP6AP1-KLG_G3W',
    'CMV-MLN',
    'GNL3L-R4C',
    'MART1-A2L',
    'YFV-LLW',
]

# For every antigen: build a model named "<antigen>Rep", load that antigen's
# alpha (column 0) and beta (column 1) data, cross-validate, then extract the
# representative sequences.
for a in antigens:
    DTCR = DeepTCR_SS(a + 'Rep')
    antigen_dir = '../../Data/Zhang/' + a
    DTCR.Get_Data(directory=antigen_dir,
                  aa_column_alpha=0,
                  aa_column_beta=1,
                  p=p)
    DTCR.Monte_Carlo_CrossVal(folds=50, weight_by_class=True)
    DTCR.Representative_Sequences()
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

# Cross-validation settings shared by the two models below.
folds = 100
LOO = 4
epochs_min = 100

# Both models load the same Rudqvist files with the same column layout.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# Train the sequence classifier.
DTCR_SS = DeepTCR_SS('Rudqvist_SS')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=folds, test_size=0.25)
DTCR_SS.AUC_Curve(filename='AUC.eps')

# Train the repertoire classifier without on-graph clustering.
DTCR_WF = DeepTCR_WF('Rudqvist_WF')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=folds, LOO=LOO, epochs_min=epochs_min)
DTCR_WF.AUC_Curve(filename='Rep_AUC.eps')
Esempio n. 11
0
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

# Both models load the same Rudqvist files with the same column layout.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# Train the sequence classifier.
# NOTE(review): this model and the repertoire model below share the name
# 'Rudqvist' and both call AUC_Curve() with default arguments; if DeepTCR keys
# its output location on the name, the second plot may overwrite the first —
# confirm.
DTCR_SS = DeepTCR_SS('Rudqvist')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=100, test_size=0.25)
DTCR_SS.AUC_Curve()

# Train the repertoire classifier without on-graph clustering.
DTCR_WF = DeepTCR_WF('Rudqvist')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=100, LOO=4, epochs_min=50)
DTCR_WF.AUC_Curve()

#Train Repertoire Classifier with on-graph clustering
Esempio n. 12
0
"""
"""This script was used to train the supervised TCR sequence classifier
and generate the top representative sequences for each class and derive the 
motifs that were learned by the network."""

from DeepTCR.DeepTCR import DeepTCR_SS
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import numpy as np
import os
import shutil

# Run the supervised sequence classifier on device 6.
DTCRS = DeepTCR_SS('Sequence_C', device=6)
murine_columns = {
    'aa_column_beta': 0,
    'count_column': 1,
    'v_beta_column': 2,
    'j_beta_column': 3,
}
DTCRS.Get_Data(directory='../../Data/Murine_Antigens',
               Load_Prev_Data=True,
               aggregate_by_aa=True,
               **murine_columns)

# 100 Monte-Carlo folds with one fixed seed per fold (0..99) and a fixed
# graph seed.
folds = 100
seeds = np.arange(folds)
graph_seed = 0
DTCRS.Monte_Carlo_CrossVal(folds=folds, graph_seed=graph_seed, seeds=seeds)

# Extract representative sequences, coloured by hydrophobicity.
DTCRS.Representative_Sequences(
    top_seq=25, motif_seq=10, color_scheme='hydrophobicity')
Esempio n. 13
0
# From each sample keep only the entries whose predicted value exceeds the
# threshold, collecting sequences, class labels and counts in parallel.
# assumes every zipped item is an array accepting boolean-mask indexing —
# TODO confirm against where these inputs are built.
thresh = 0.99
seq_train, label_train, count_train = [], [], []
for s, seq_cl, p, c in zip(sequences, seq_class_labels, predicted, counts):
    sel_idx = p > thresh
    for kept, values in ((seq_train, s),
                         (label_train, seq_cl),
                         (count_train, c)):
        kept.append(values[sel_idx])

# Join the per-sample pieces into one array each.
seq_train = np.hstack(seq_train)
label_train = np.hstack(label_train)
count_train = np.hstack(count_train)

#Train Sequence Classifier
# `gpu`, `folds`, `graph_seed` and `seeds` are defined outside this excerpt.
DTCR = DeepTCR_SS('tw10_seq', device=gpu)
DTCR.Load_Data(beta_sequences=seq_train, class_labels=label_train)
DTCR.Monte_Carlo_CrossVal(folds=folds,
                          graph_seed=graph_seed,
                          seeds=seeds,
                          convergence='training')
# Predictions and labels read back from the model; column ii of each is paired
# with class ii in the loop below.
y_pred = DTCR.predicted
y_test = DTCR.Y
# Prepare an ROC figure: both axes start at 0, with the y axis extended to 1.05.
plt.figure(figsize=(6, 5))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# One ROC curve and AUC score per class.
# NOTE(review): the loop body appears to continue beyond this excerpt.
for ii, cl in enumerate(DTCR.lb.classes_, 0):
    fpr, tpr, _ = roc_curve(y_test[:, ii], y_pred[:, ii])
    roc_score = roc_auc_score(y_test[:, ii], y_pred[:, ii])
Esempio n. 14
0
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib

matplotlib.rc('font', family='Arial')

# Instantiate the training object.
DTCRU = DeepTCR_SS('Murine_Sup')
# Data loading and cross-validation for this object are disabled below.
# DTCRU.Get_Data(directory='../../Data/Murine_Antigens',Load_Prev_Data=False,
#                aa_column_beta=0,count_column=1,v_beta_column=2,j_beta_column=3,
#                classes=['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1'])
# DTCRU.Monte_Carlo_CrossVal(folds=5)

# Second object, used here only to load the four Kb classes and read back
# their beta sequences.
DTCR_inf = DeepTCR_SS('load')
kb_classes = ['Kb-M38', 'Kb-SIY', 'Kb-TRP2', 'Kb-m139']
DTCR_inf.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=kb_classes,
)

beta_sequences = DTCR_inf.beta_sequences
Esempio n. 15
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
import pickle

# Regression table and the antigen column used as the target.
df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
antigen = 'A0201_ELAGIGILTV_MART-1_Cancer'

DTCRS = DeepTCR_SS('reg_mart1', device=2)

# Alpha/beta sequences as NumPy arrays.
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

# Locate the antigen column by name and use log2(value + 1) as the target.
i = np.where(df.columns == antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y)

# 5-fold cross-validation with one fixed seed per fold and a fixed graph seed.
folds = 5
seeds = np.arange(folds)
graph_seed = 0
DTCRS.K_Fold_CrossVal(folds=folds,
                      seeds=seeds,
                      graph_seed=graph_seed,
                      split_by_sample=False)

# Persist [antigen name, squeezed predictions, targets] with pickle protocol 4.
payload = [antigen, np.squeeze(DTCRS.predicted), Y]
with open('mart1_preds.pkl', 'wb') as f:
    pickle.dump(payload, f, protocol=4)
Esempio n. 16
0
"""
Fig 2C
"""
"""This script was used to train the supervised TCR sequence classifier
and generate the top representative sequences for each class and derive the 
motifs that were learned by the network."""

from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_U
import numpy as np
import seaborn as sns

# Run the supervised sequence classifier.
DTCRS = DeepTCR_SS('Sequence_C')
murine_columns = {
    'aa_column_beta': 0,
    'count_column': 1,
    'v_beta_column': 2,
    'j_beta_column': 3,
}
DTCRS.Get_Data(directory='../../Data/Murine_Antigens',
               Load_Prev_Data=False,
               aggregate_by_aa=True,
               **murine_columns)

# Ten Monte-Carlo folds, then the top 10 representative sequences with
# unique=True.
DTCRS.Monte_Carlo_CrossVal(stop_criterion=0.01, folds=10)
DTCRS.Representative_Sequences(unique=True, top_seq=10)
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Iterate over the entries of DTCRS.Rep_Seq.
# NOTE(review): `break` is the first statement of the body, so the loop exits
# on its first iteration and the assignment below never executes; `item` is
# left bound to the first key. This looks like a debugging leftover — confirm
# before removing. The body may also continue beyond this excerpt.
for item in DTCRS.Rep_Seq:
    break
    t = DTCRS.Rep_Seq[item]
Esempio n. 17
0
"""Figure 2B"""
"""This script is used to create the ROC curves for assessing the ability
of supervised sequence classifier to correctly predict the antigen-specificity of 
the 9 murine antigens in the manuscript.."""

from DeepTCR.DeepTCR import DeepTCR_SS

# Run the supervised sequence classifier.
DTCRS = DeepTCR_SS('Sequence_C')
data_args = dict(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)
DTCRS.Get_Data(**data_args)

# Ten Monte-Carlo folds, then draw the AUC curves with default settings.
DTCRS.Monte_Carlo_CrossVal(folds=10)
DTCRS.AUC_Curve()
Esempio n. 18
0
"""
Supplementary Figure 17
"""
"""This script is used to benchmark DeepTCR's Sequence Classifier
against an SVM and RF where the inputs for those latter machine learning
algorithms are the outputs of a K-mer search"""

import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
from NN_Assessment_utils import *
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

# Load the murine antigen data into the supervised sequence classifier.
DTCRS = DeepTCR_SS('Sequence_C')
data_args = dict(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)
DTCRS.Get_Data(**data_args)

# K-mer features of the beta sequences (kmer_search comes from the wildcard
# NN_Assessment_utils import) and the two baseline classifiers.
kmer_features = kmer_search(DTCRS.beta_sequences)
clf_svm = SVC(probability=True)
clf_rf = RandomForestClassifier(n_estimators=100)

# Accumulators for the true labels and each model's predictions.
y_test_list, y_pred_list_dtcr = [], []
y_pred_list_svm, y_pred_list_rf = [], []
Esempio n. 19
0
"""Figure 3B"""
"""This script is used to train both the sequence and repertoire classifier on the
Rudqvist_2017 dataset and compare their performances."""

from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
from matplotlib import pyplot as plt

# Train the sequence classifier with 5-fold cross-validation.
DTCR_SS = DeepTCR_SS('Rudqvist_SS', device='/device:GPU:0')
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.K_Fold_CrossVal(folds=5)

# Settings for training the repertoire classifier.
folds = 100
LOO = 4
epochs_min = 10
size_of_net = 'small'
num_concepts = 64
hinge_loss_t = 0.1
train_loss_min = 0.1
# One fixed seed per fold (0..folds-1) plus a fixed graph seed.
seeds = np.arange(folds)
graph_seed = 0
Esempio n. 20
0
"""Figure 2B"""
"""This script is used to create the ROC curves for assessing the ability
of supervised sequence classifier to correctly predict the antigen-specificity of 
the 9 murine antigens in the manuscript.."""

from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')

# Run the supervised sequence classifier on device 2.
DTCRS = DeepTCR_SS('Sequence_C', device=2)

data_args = dict(
    directory='../../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)
DTCRS.Get_Data(**data_args)

# Ten Monte-Carlo folds with one fixed seed per fold (0..9) and a fixed
# graph seed.
folds = 10
seeds = np.arange(folds)
graph_seed = 0
DTCRS.Monte_Carlo_CrossVal(folds=folds, graph_seed=graph_seed, seeds=seeds)
DTCRS.AUC_Curve(xlabel_size=24,
                ylabel_size=24,
                xtick_size=18,
                ytick_size=18,
                legend_font_size=14,
Esempio n. 21
0
"""Figure 2D"""
"""This script is used to benchmark the performance of the Supervised Sequence Classifier
with either the alpha chain, beta chain, or both provided to the model."""

from DeepTCR.DeepTCR import DeepTCR_SS
from multiprocessing import Pool
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import os

p = Pool(80)
# Name of the results directory used by this script.
dir_results = 'alpha_v_beta_results'
# exist_ok=True creates the directory only when it is missing, avoiding the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir_results, exist_ok=True)

DTCR = DeepTCR_SS('alpha_v_beta_SS')

antigens = ['ATP6AP1-KLG_G3W', 'GNL3L-R4C', 'MART1-A2L', 'YFV-LLW']

opt = ['alpha', 'beta', 'alpha_beta']

for a in antigens:
    y_pred_list = []
    y_test_list = []
    for o in opt:
        if o == 'alpha':
            DTCR = DeepTCR_SS('alpha_v_beta_SS')
            DTCR.Get_Data(directory='../../Data/Zhang/' + a,
                          aa_column_alpha=0,
                          p=p)
        elif o == 'beta':
Esempio n. 22
0
"""Figure 2E"""
"""This script runs regression for the 10x Dataset where alpha/beta TCR's are
regressed against the quantitative evaluation of antigen-specificity via
dCODE Dextramer reagents"""

import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Load the regression table; columns 'alpha' and 'beta' hold the sequences
# and every column from position 2 onward is visited by the loop below.
df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg', device=2)
# NOTE(review): no close()/join() for this pool is visible in this excerpt.
p = Pool(40)

#Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

# Per-antigen result accumulators; only initialised in the visible code.
y_pred = []
y_test = []
antigen = []
folds = 5
# One seed per fold: 0, 1, ..., folds - 1.
seeds = np.array(range(folds))
graph_seed = 0
#Iterate through all antigens
# NOTE(review): the loop body appears to continue beyond this excerpt.
for i in range(2, df.columns.shape[0]):
    print(df.iloc[:, i].name)
    sel = df.iloc[:, i]