Ejemplo n.º 1
0
"""This script is used to train the TCR+HLA model on the CheckMate-038 Clinical Trial Data."""

from DeepTCR.DeepTCR import DeepTCR_WF
import numpy as np

# Create the DeepTCR_WF object and load the CheckMate-038 files together with
# the HLA reference table.
DTCR = DeepTCR_WF('Human_TIL', device='/device:GPU:1')
DTCR.Get_Data(
    directory='../../Data/CheckMate_038',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
    data_cut=1.0,
    hla='../../Data/CheckMate_038/HLA_Ref_sup_AB.csv',
)

# Settings forwarded to Monte_Carlo_CrossVal below.
folds = 100
LOO = 6
epochs_min = 10
size_of_net = 'small'
num_concepts = 64
hinge_loss_t = 0.3
train_loss_min = 0.1
seeds = np.array(range(folds))  # one seed value per fold: 0 .. folds-1
graph_seed = 0

DTCR.Monte_Carlo_CrossVal(
    folds=folds,
    LOO=LOO,
    epochs_min=epochs_min,
    size_of_net=size_of_net,
    num_concepts=num_concepts,
    combine_train_valid=True,
    hinge_loss_t=hinge_loss_t,
    train_loss_min=train_loss_min,
    seeds=seeds,
    graph_seed=graph_seed,
)

# Write the 'crpr' prediction table to CSV (without the index column).
DTCR.DFs_pred['crpr'].to_csv('sample_tcr_hla.csv', index=False)

# Persist DTCR.features and DTCR.predicted as a two-element list.
import pickle
with open('cm038_ft_pred.pkl', 'wb') as f:
    pickle.dump([DTCR.features, DTCR.predicted], f, protocol=4)
Ejemplo n.º 2
0
                 j_beta_column=21)

# 5-fold cross-validation on DTCR_SS (the object is created above this block).
DTCR_SS.K_Fold_CrossVal(folds=5)

# Train Repertoire Classifier: settings used by the cross-validation call
# that follows this block.
folds, LOO, epochs_min = 100, 4, 10
size_of_net = 'small'
num_concepts = 64
hinge_loss_t, train_loss_min = 0.1, 0.1
seeds = np.array(range(folds))  # one seed value per fold: 0 .. folds-1
graph_seed = 0

# Create the DeepTCR_WF object on GPU 0 and load the Rudqvist files.
DTCR_WF = DeepTCR_WF('Rudqvist_WF', device='/device:GPU:0')
DTCR_WF.Get_Data(
    directory='../../Data/Rudqvist', Load_Prev_Data=False,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
)

DTCR_WF.Monte_Carlo_CrossVal(folds=folds,
                             LOO=LOO,
                             epochs_min=epochs_min,
                             num_concepts=num_concepts,
                             size_of_net=size_of_net,
                             train_loss_min=train_loss_min,
                             hinge_loss_t=hinge_loss_t,
from DeepTCR.DeepTCR import DeepTCR_WF
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu

# Set up the Topalian run.
# NOTE(review): this header originally read "Train Sequence Classifier", but
# the object built here is DeepTCR_WF, which the other scripts in this file
# use for the repertoire classifier -- confirm which wording is intended.
DTCR = DeepTCR_WF('Human_TIL', device='/gpu:0')
# NOTE(review): `dir` shadows the builtin dir() and is not read in the visible
# part of the script; kept unchanged because later code may depend on it.
dir = 'Topalian/beta/pre_crpr_sdpd'
DTCR.Get_Data(
    directory='../../Data/Topalian',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
    data_cut=0.25,
    hla='../../Data/Topalian/HLA_Ref.csv',
)

# Run settings.
folds, LOO, epochs_min = 500, 6, 50
weight_by_class = True
size_of_net = 'small'
stop_criterion = 0.25

# Empty accumulators, presumably filled once per feature combination below.
y_pred_list, y_test_list = [], []
auc_list, names_list = [], []

# Labels for the feature combinations.
names = [
    'Seq', 'VDJ', 'HLA',
    'Seq+VDJ', 'Seq+HLA', 'VDJ+HLA',
    'Seq+VDJ+HLA',
]

# Just train w/ Sequence Information
DTCR.use_hla = False
Ejemplo n.º 4
0
# Cross-validation settings: one seed value per fold (0 .. folds-1).
folds = 100
graph_seed = 0
seeds = np.arange(folds)

# Collect the HIV .tsv files, dropping the last entry of the glob result.
# NOTE(review): glob order is not guaranteed, so which file gets dropped
# depends on the filesystem -- confirm this is intended.
files = glob.glob('../../../Data/HIV/*.tsv')[:-1]

# Sample name = path component after the last '/'.
samples = [path.split('/')[-1] for path in files]
# Label = second '_'-separated token of the sample name.
labels = [name.split('_')[1] for name in samples]

# Map each sample file name to its label.
label_dict = {name: label for name, label in zip(samples, labels)}

# Load the HIV directory through a DeepTCR_WF object named 'load'.
DTCR = DeepTCR_WF('load')
DTCR.Get_Data(
    '../../../Data/HIV',
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
    type_of_data_cut='Read_Cut', data_cut=10,
)

# Boolean mask: True for each element of DTCR.sample_id that is a key of
# label_dict (i.e. one of the labelled files).
idx = np.isin(DTCR.sample_id, np.array(list(label_dict)))

# Apply the same mask to every per-sequence array.
beta_sequences, v_beta, d_beta, j_beta, sample_labels = (
    DTCR.beta_sequences[idx],
    DTCR.v_beta[idx],
    DTCR.d_beta[idx],
    DTCR.j_beta[idx],
    DTCR.sample_id[idx],
)
Ejemplo n.º 5
0
# Train Sequence Classifier on the Rudqvist data.
DTCR_SS = DeepTCR_SS('Rudqvist')
DTCR_SS.Get_Data(
    directory='../../Data/Rudqvist', Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
)

# 100 Monte-Carlo cross-validation folds with test_size=0.25, followed by
# AUC_Curve() with its default arguments.
DTCR_SS.Monte_Carlo_CrossVal(
    folds=100,
    test_size=0.25,
)
DTCR_SS.AUC_Curve()

# Train Repertoire Classifier without on-graph clustering (Rudqvist data).
DTCR_WF = DeepTCR_WF('Rudqvist')
DTCR_WF.Get_Data(
    directory='../../Data/Rudqvist', Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
)

# 100 Monte-Carlo cross-validation folds with LOO=4 and epochs_min=50,
# followed by AUC_Curve() with its default arguments.
DTCR_WF.Monte_Carlo_CrossVal(
    folds=100,
    LOO=4,
    epochs_min=50,
)
DTCR_WF.AUC_Curve()

#Train Repertoire Classifier with on-graph clustering
DTCR_WF.Monte_Carlo_CrossVal(folds=100,
                             LOO=4,
Ejemplo n.º 6
0
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn as sns
import pandas as pd

# All HIV .tsv files found by the glob pattern.
files = glob.glob('../../../Data/HIV/*.tsv')

# Sample name is whatever follows the last '/' in the path; the label is the
# second '_'-separated token of that name.
samples = [path.rpartition('/')[2] for path in files]
labels = [sample.split('_')[1] for sample in samples]

# Sample file name -> label.
label_dict = dict(zip(samples, labels))

# Load the HIV directory through a DeepTCR_WF object named 'load'.
DTCR = DeepTCR_WF('load')
DTCR.Get_Data(
    '../../../Data/HIV',
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
    type_of_data_cut='Read_Cut', data_cut=10,
)

# Keep only entries whose sample id is a key of label_dict.
idx = np.isin(DTCR.sample_id, np.array(list(label_dict)))
beta_sequences, v_beta, d_beta, j_beta, sample_labels = (
    values[idx]
    for values in (
        DTCR.beta_sequences,
        DTCR.v_beta,
        DTCR.d_beta,
        DTCR.j_beta,
        DTCR.sample_id,
    )
)
# Train Sequence Classifier on the Rudqvist data.
DTCR_SS = DeepTCR_SS('Rudqvist_SS')
DTCR_SS.Get_Data(
    directory='../../Data/Rudqvist', Load_Prev_Data=False,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
)

# `folds` is defined earlier in the script (outside this block).
DTCR_SS.Monte_Carlo_CrossVal(
    folds=folds,
    test_size=0.25,
)
# AUC_Curve is given the file name 'AUC.eps'.
DTCR_SS.AUC_Curve(filename='AUC.eps')

# Train Repertoire Classifier without on-graph clustering (Rudqvist data).
DTCR_WF = DeepTCR_WF('Rudqvist_WF')
DTCR_WF.Get_Data(
    directory='../../Data/Rudqvist', Load_Prev_Data=False,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
)

# `folds`, `LOO` and `epochs_min` are defined earlier in the script
# (outside this block).
DTCR_WF.Monte_Carlo_CrossVal(
    folds=folds,
    LOO=LOO,
    epochs_min=epochs_min,
)
# AUC_Curve is given the file name 'Rep_AUC.eps'.
DTCR_WF.AUC_Curve(filename='Rep_AUC.eps')

#Train Repertoire Classifier with on-graph clustering
DTCR_WF.Monte_Carlo_CrossVal(folds=folds,
                             LOO=LOO,
                             on_graph_clustering=True,
Ejemplo n.º 8
0
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import fisher_exact, ranksums, spearmanr
from sklearn.model_selection import StratifiedShuffleSplit
from umap import UMAP
from scipy import ndimage as ndi
from matplotlib.patches import Circle
import pickle

# Select the GPU for this process. These variables must be set before any
# CUDA-using library initialises the device.
# CUDA_DEVICE_ORDER=PCI_BUS_ID makes CUDA enumerate devices by PCI bus id, so
# the index in CUDA_VISIBLE_DEVICES refers to that ordering. The key was
# previously written with spaces ("CUDA DEVICE ORDER"), which CUDA does not
# read, so the ordering request was silently ignored.
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# Create the DeepTCR_WF object and load the CheckMate-038 files together with
# the HLA reference table.
DTCR = DeepTCR_WF('Human_TIL', device='/device:GPU:0')
DTCR.Get_Data(
    directory='../../Data/CheckMate_038', Load_Prev_Data=False,
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
    data_cut=1.0,
    hla='../../Data/CheckMate_038/HLA_Ref_sup_AB.csv',
)

# Read back a three-item pickle: features, predicted, perc.
# NOTE: unpickling runs code embedded in the file; only load pickles that were
# produced by a trusted run.
with open('cm038_ft_pred_perc.pkl', 'rb') as f:
    features, predicted, perc = pickle.load(f)

# `win`-th percentile of the first column of `predicted`.
win = 10
cut_bottom = np.percentile(predicted[:, 0], q=win)
Ejemplo n.º 9
0
# Restrict CUDA to the device index held in `gpu` (defined earlier in the
# script, outside this block).
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

# Cross-validation settings: one seed value per fold (0 .. folds-1).
folds = 100
graph_seed = 0
seeds = np.arange(folds)

# All HIV .tsv files found by the glob pattern.
files = glob.glob('../../../Data/HIV/*.tsv')

# Strip everything up to the last '/' to get the sample file name, then take
# the second '_'-separated token of that name as the label.
samples = [path.split('/')[-1] for path in files]
labels = [sample.split('_')[1] for sample in samples]

# Sample file name -> label.
label_dict = {sample: label for sample, label in zip(samples, labels)}

# Load the HIV directory through a DeepTCR_WF object named 'load'.
DTCR = DeepTCR_WF('load')
DTCR.Get_Data(
    '../../../Data/HIV',
    aa_column_beta=1, count_column=2,
    v_beta_column=7, d_beta_column=14, j_beta_column=21,
    type_of_data_cut='Read_Cut', data_cut=10,
)

# Boolean mask: True for each element of DTCR.sample_id that is a key of
# label_dict.
idx = np.isin(DTCR.sample_id, np.array(list(label_dict)))

# Apply the same mask to every per-sequence array.
beta_sequences, v_beta, d_beta, j_beta, sample_labels = (
    DTCR.beta_sequences[idx],
    DTCR.v_beta[idx],
    DTCR.d_beta[idx],
    DTCR.j_beta[idx],
    DTCR.sample_id[idx],
)