def test_pgen_1():
    """
    How to add pgen estimates to human alpha/beta CDR3s.

    Builds a TCRrep from five sampled rows of dash_human.csv, then uses one
    OlgaModel per chain to attach a pgen column for the beta and the alpha
    CDR3 amino-acid sequences of the clone table.
    """
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    from tcrdist.repertoire import TCRrep

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df.sample(5, random_state=3),
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False)

    # One model per chain: VDJ recombination for beta, VJ for alpha.
    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

    tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_b_aa)
    tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_a_aa)

    # Previously a bare expression; pin the selected columns instead.
    report_cols = ['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa', 'pgen_cdr3_a_aa']
    report = tr.clone_df[report_cols]
    assert list(report.columns) == report_cols
def test_ex5():
    """
    Run tcrdist2 with the hamming metric and explicit replacement weights on
    the 'PA' epitope rows of dash.csv, then check that the stored matrices
    equal the weighted sums of their per-region components.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv('dash.csv')
    cells = cells[cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=cells, chains=['alpha', 'beta'], organism='mouse')

    # CDR3 is weighted 3, every other region 1.
    region_weights = {
        'cdr3_a_aa': 3,
        'pmhc_a_aa': 1,
        'cdr2_a_aa': 1,
        'cdr1_a_aa': 1,
        'cdr3_b_aa': 3,
        'pmhc_b_aa': 1,
        'cdr2_b_aa': 1,
        'cdr1_b_aa': 1
    }
    tr.tcrdist2(processes=1,
                metric='hamming',
                reduce=True,
                dump=False,
                save=False,
                replacement_weights=region_weights)

    expected_beta = (3 * tr.cdr3_b_aa_pw + tr.pmhc_b_aa_pw
                     + tr.cdr2_b_aa_pw + tr.cdr1_b_aa_pw)
    expected_alpha = (3 * tr.cdr3_a_aa_pw + tr.pmhc_a_aa_pw
                      + tr.cdr2_a_aa_pw + tr.cdr1_a_aa_pw)

    assert np.all(tr.pw_tcrdist == tr.pw_alpha + tr.pw_beta)
    assert np.all(tr.pw_beta == expected_beta)
    assert np.all(tr.pw_alpha == expected_alpha)
def test_repertoire_full_use_case(self):
    """
    This is not a unit test per se: it walks through a use case of a
    TCRrep() instance used for pairwise sequence comparison and compares
    the summed alpha + beta CDR3 matrices with a stored 9x9 result.
    """
    testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])  # (1)
    for extra_col in ("epitope", "subject"):  # (2)
        testrep.index_cols.append(extra_col)
    testrep.deduplicate()  # (3)
    testrep.cdr3_a_aa_smat = 'blosum62'  # (4)
    testrep.cdr3_b_aa_smat = 'blosum62'
    for chain_name in ("alpha", "beta"):  # (5), (6)
        testrep.compute_pairwise(chain=chain_name)
    observed = testrep.cdr3_a_aa_pw + testrep.cdr3_b_aa_pw  # (7)

    expected = np.array([
        [0., 222., 210., 223., 231., 239., 219., 231., 175.],
        [222., 0., 116., 175., 173., 185., 131., 209., 205.],
        [210., 116., 0., 175., 169., 183., 145., 201., 221.],
        [223., 175., 175., 0., 154., 200., 162., 234., 202.],
        [231., 173., 169., 154., 0., 152., 120., 182., 192.],
        [239., 185., 183., 200., 152., 0., 146., 112., 192.],
        [219., 131., 145., 162., 120., 146., 0., 178., 172.],
        [231., 209., 201., 234., 182., 112., 178., 0., 220.],
        [175., 205., 221., 202., 192., 192., 172., 220., 0.],
    ])
    matches = (observed == expected)
    self.assertTrue(matches.all())
def test_integration_TCRrep_with_TCRMotif():
    """
    Integration path from a vdjDB export to a TCRMotif instance: convert the
    file with mappers.vdjdb_to_tcrdist2, keep the "MusMusculus" rows, build
    and deduplicate a TCRrep, and hand its motif clones table to TCRMotif.
    """
    import pandas as pd
    import tcrdist as td
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif

    vdjdb_path = os.path.join("tcrdist", "test_files", "vdjDB_PMID28636592.tsv")
    raw_df = pd.read_csv(vdjdb_path, sep="\t")  # 1
    converted = td.mappers.vdjdb_to_tcrdist2(pd_df=raw_df)  # 2
    # 3: attribute lookup only -- the method is referenced, not called.
    converted.organism.value_counts
    is_mouse = converted.organism == "MusMusculus"  # 4
    mouse_df = converted.loc[is_mouse, :].copy()  # 5

    tr = TCRrep(cell_df=mouse_df, organism="mouse")  # 6
    for chain_name in ('alpha', 'beta'):  # 7, 8
        tr.infer_cdrs_from_v_gene(chain=chain_name)

    tr.index_cols = [
        'subject', 'epitope',                            # subject and epitope
        'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',  # gene usage
        'cdr3_a_aa', 'cdr3_b_aa',                        # CDR 3
        'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',           # alpha CDR 1, 2, and 2.5
        'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',           # beta CDR 1, 2, and 2.5
    ]
    tr.deduplicate()  # 10

    motif_clones = tr.tcr_motif_clones_df()
    motif = TCRMotif(clones_df=motif_clones,
                     organism="mouse",
                     chains=["A", "B"],
                     epitopes=["PA"])  # 11
    assert isinstance(motif.clones_df, pd.DataFrame)
def test_repertoire_full_use_case_hamming(self):
    """
    This is not a unit test per se: it walks through a use case of a
    TCRrep() instance used for pairwise sequence comparison with the
    hamming metric and compares the summed alpha + beta CDR3 matrices
    with a stored 9x9 result.
    """
    testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])  # (1)
    for extra_col in ("epitope", "subject"):  # (2)
        testrep.index_cols.append(extra_col)
    testrep.deduplicate()  # (3)
    testrep.cdr3_a_aa_smat = 'blosum62'  # (4)
    testrep.cdr3_b_aa_smat = 'blosum62'
    for chain_name in ("alpha", "beta"):  # (5), (6)
        testrep.compute_pairwise_all(chain=chain_name, metric="hamming")
    observed = testrep.cdr3_a_aa_pw + testrep.cdr3_b_aa_pw  # (7)

    expected = np.array([
        [0., 18., 17., 18., 19., 22., 19., 18., 16.],
        [18., 0., 11., 15., 15., 17., 10., 18., 18.],
        [17., 11., 0., 18., 15., 17., 13., 18., 20.],
        [18., 15., 18., 0., 14., 19., 14., 20., 18.],
        [19., 15., 15., 14., 0., 14., 11., 17., 16.],
        [22., 17., 17., 19., 14., 0., 14., 13., 18.],
        [19., 10., 13., 14., 11., 14., 0., 17., 15.],
        [18., 18., 18., 20., 17., 13., 17., 0., 19.],
        [16., 18., 20., 18., 16., 18., 15., 19., 0.],
    ])
    matches = (observed == expected)
    self.assertTrue(matches.all())
def test_ex11():
    """
    Create a TCRrep around an empty DataFrame and call rebuild() on the
    archive "some_archive.tar.gz" (the archive must already exist on disk).
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    empty_cells = pd.DataFrame()
    tr = TCRrep(cell_df=empty_cells,
                chains=['alpha', 'beta'],
                organism='mouse')
    archive_name = "some_archive.tar.gz"
    tr.rebuild(dest_tar_name=archive_name)
def run_TCRrep_func_tcrdist2(chains=None, metric="nw"):
    """
    Build a TCRrep from the 'NP' epitope rows of dash.csv and run tcrdist2.

    Parameters
    ----------
    chains : list of str, optional
        Chains handed to TCRrep. Defaults to ['beta', 'alpha']; a fresh list
        is created on every call so no list is shared between calls.
    metric : str
        Metric name forwarded to TCRrep.tcrdist2.

    Returns
    -------
    TCRrep
        The instance after tcrdist2() has been called with dump=True,
        reduce=True and save=False, using one process per CPU.
    """
    if chains is None:
        chains = ['beta', 'alpha']
    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes=cpu,
                metric=metric,
                dump=True,
                reduce=True,
                save=False)
    return tr
def test_example_9():
    """
    If you already have a clones file and want to compute 'tcrdistances' on
    a DataFrame with custom column names:

    1. Assign TCRrep.clone_df
    2. Set infer_cdrs = False
    3. Set compute_distances = False
    4. Set deduplicate = False
    5. Re-key the metrics, weights, and kargs dictionaries so they use the
       custom column names
    6. Call .compute_distances()
    """
    import pwseqdist as pw
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    new_cols = {
        'cdr3_a_aa': 'c3a',
        'pmhc_a_aa': 'pa',
        'cdr2_a_aa': 'c2a',
        'cdr1_a_aa': 'c1a',
        'cdr3_b_aa': 'c3b',
        'pmhc_b_aa': 'pb',
        'cdr2_b_aa': 'c2b',
        'cdr1_b_aa': 'c1b'
    }
    renamed = pd.read_csv("dash2.csv").rename(columns=new_cols)

    tr = TCRrep(cell_df=renamed,
                clone_df=renamed,         # (1)
                organism='mouse',
                chains=['alpha', 'beta'],
                infer_all_genes=True,
                infer_cdrs=False,         # (2)
                compute_distances=False,  # (3)
                deduplicate=False,        # (4)
                db_file='alphabeta_gammadelta_db.tsv')

    def _rekey(settings):
        """Return a copy of *settings* keyed by the custom column names."""
        return {new_cols[key]: value for key, value in settings.items()}

    # (5) Re-key every per-chain settings dictionary, in the original order.
    for attr in ('metrics_a', 'metrics_b',
                 'weights_a', 'weights_b',
                 'kargs_a', 'kargs_b'):
        setattr(tr, attr, _rekey(getattr(tr, attr)))

    tr.compute_distances()  # (6)

    # Notice that pairwise results now have custom names; each lookup raises
    # AttributeError if the attribute is missing.
    for result_name in ('pw_c3b', 'pw_c3a', 'pw_alpha', 'pw_beta'):
        getattr(tr, result_name)
def test_ex2():
    """
    Run tcrdist2 with the 'nw' metric on the 'PA' epitope rows of dash.csv
    and access the resulting pw_tcrdist attribute.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv('dash.csv')
    pa_cells = cells[cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_cells, chains=['alpha', 'beta'], organism='mouse')
    tr.tcrdist2(processes=1,
                metric='nw',
                reduce=True,
                dump=False,
                save=False)
    # Attribute lookup only: raises AttributeError if no matrix was stored.
    tr.pw_tcrdist
def test_TCRpublic():
    """
    Build a beta-chain TCRrep from the first 200 'PA' rows of dash.csv,
    give every clone a radius of 40, and produce a TCRpublic report.
    """
    pa_rows = pd.read_csv("dash.csv").query('epitope == "PA"')
    df = pa_rows.reset_index().copy()
    first_200 = df.head(200).copy()
    tr = TCRrep(cell_df=first_200,
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)
    tr.clone_df['radius'] = 40

    from tcrdist.public import TCRpublic
    tp = TCRpublic(tcrrep=tr, organism='mouse', chain='beta')
    tp.report()
def test_current_example():
    """
    End-to-end example on the MIRA epitope 60 beta-chain data set.

    Loads the epitope-specific clones and a background set, computes
    rectangular distances between them, calls bkgd_cntl_nn2 to obtain a
    centers table, and writes the centers table (CSV) and a thresholded
    sparse copy of the rectangular distance matrix (NPZ) to the working
    directory.
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    data_dir = os.path.join('tcrdist', 'data', 'covid19')

    fn_mira_background = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.olga100K_brit100K_bkgd.csv'
    fn_mira_background = os.path.join(data_dir, fn_mira_background)
    df_background = pd.read_csv(fn_mira_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    fn_mira = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv'
    fn_mira = os.path.join(data_dir, fn_mira)
    df_mira = pd.read_csv(fn_mira)
    df_mira = df_mira[['v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)

    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=10**-6,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=list(range(0, 50, 2)),
                               generate_regex=True,
                               test_regex=True)

    out_fn_center_df = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.centers_df.csv'
    out_fn_rw_beta_sparse_matrix = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    centers_df.to_csv(out_fn_center_df, index=False)

    # Order matters: true zero distances are bumped to 1 first so they are
    # not confused with the zeros written by the thresholding step below.
    tr.rw_beta[tr.rw_beta == 0] = 1
    # Drop every distance greater than 50 by zeroing it before sparsifying.
    tr.rw_beta[tr.rw_beta > 50] = 0
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(out_fn_rw_beta_sparse_matrix, rw_beta_sparse)
def test_TCRrep_show_incomplete_entries():
    """
    show_incomplete() should return the rows that are missing a value in
    one of the index columns (here: row 2, whose v_a_gene is None).
    """
    cells = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", None],
        "count": [1, 1, 1]
    })
    tr = TCRrep(cell_df=cells, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']

    incomplete = tr.show_incomplete()
    expected = {'cdr3_a_aa': {2: 'C'}, 'v_a_gene': {2: None}}
    assert isinstance(incomplete, pd.DataFrame)
    assert incomplete.to_dict() == expected
def test_calc_radii():
    """
    Obtain one radius per 'PA' beta-chain clone from calc_radii, using the
    non-'PA' rows of dash.csv as background, then add neighbor lists and
    publicity columns (nsubject, qpublic) to the clone table.
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    foreground = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop=True)
    tr = TCRrep(cell_df=foreground.copy(),
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    background = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop=True)
    tr_bkgd = TCRrep(cell_df=background.copy(),
                     organism='mouse',
                     chains=['beta'],
                     db_file='alphabeta_gammadelta_db.tsv',
                     compute_distances=False)

    from tcrdist.centers import calc_radii
    radii, thresholds, ecdfs = calc_radii(tr=tr,
                                          tr_bkgd=tr_bkgd,
                                          chain='beta',
                                          ctrl_bkgd=10**-3,
                                          use_sparse=False,
                                          max_radius=50)

    from tcrdist.public import _neighbors_variable_radius
    # Neighbors within each clone's own radius, in the foreground matrix
    # (pw_beta) and in the rectangular matrix (rw_beta) respectively.
    fg_neighbors = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list=radii)
    bg_neighbors = _neighbors_variable_radius(pwmat=tr.rw_beta, radius_list=radii)

    tr.clone_df['radius'] = radii
    tr.clone_df['neighbors'] = fg_neighbors
    tr.clone_df['background_neighbors'] = bg_neighbors

    def _count_subjects(neighbor_idx):
        """Number of distinct subjects among the rows at *neighbor_idx*."""
        return tr.clone_df['subject'].iloc[neighbor_idx].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
    tr.clone_df
def test_ex9():
    """
    Run tcrdist2 with the hamming metric on the 'PA' epitope rows of
    dash.csv with save=True, targeting "some_archive" and
    "some_archive.tar.gz".
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv('dash.csv')
    pa_cells = cells[cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_cells, chains=['alpha', 'beta'], organism='mouse')

    archive_options = {
        'save': True,
        'dest': "some_archive",
        'dest_tar_name': "some_archive.tar.gz",
    }
    tr.tcrdist2(processes=1,
                metric='hamming',
                reduce=True,
                dump=False,
                **archive_options)
def test_TCRrep_func_tcrdist2_save_auto_rebuild(chains=None, metric="nw"):
    """
    Save a TCRrep through tcrdist2(save=True), rebuild it from the tar
    archive, and compare it with a freshly computed instance.

    Parameters
    ----------
    chains : list of str, optional
        Chains handed to TCRrep. Defaults to ['beta', 'alpha'].
    metric : str
        Metric name forwarded to tcrdist2.

    The comparison instance is built with the same ``chains`` and ``metric``
    as the archived one. DataFrame attributes are skipped, dict attributes
    are compared by key set, and everything else element-wise.
    """
    import shutil

    if chains is None:
        chains = ['beta', 'alpha']

    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes=cpu,
                metric=metric,
                dump=True,
                reduce=True,
                save=True,
                dest="default_archive",
                dest_tar_name="default_archive.tar.gz")

    # Cleanup folder that you just made (portable replacement for `rm -rf`).
    # NOTE(review): dest above is "default_archive" but the folder removed
    # here is "myTCRrep_archive" -- confirm which name is intended.
    shutil.rmtree("myTCRrep_archive", ignore_errors=True)

    # Rebuild from the archive, starting from an empty cell table.
    tr = TCRrep(cell_df=df.iloc[0:0, :], chains=chains, organism='mouse')
    tr.rebuild(dest_tar_name="default_archive.tar.gz")

    tr_compare = run_TCRrep_func_tcrdist2(chains=chains, metric=metric)

    # For every attribute of the fresh instance: is it on the rebuilt one?
    attendance = {k: k in vars(tr) for k in vars(tr_compare)}
    for required in ('pw_tcrdist', 'pw_alpha', 'pw_beta', 'clone_df', 'cell_df'):
        assert attendance[required]

    # only compare things in common
    shared_attributes = [k for k, present in attendance.items() if present]
    for k in shared_attributes:
        rebuilt = getattr(tr, k)
        fresh = getattr(tr_compare, k)
        if isinstance(rebuilt, pd.DataFrame):
            continue
        if isinstance(rebuilt, dict):
            assert set(rebuilt.keys()) == set(fresh.keys())
        else:
            assert np.all(rebuilt == fresh)

    for organism in ('mouse', 'human'):
        assert set(tr.all_genes[organism].keys()) == set(tr_compare.all_genes[organism].keys())
def test_example_10_sparse():
    """
    If you just want 'tcrdistances' of some target seqs against another set,
    computed sparsely.

    (1) cell_df is assigned the first 10 cells in dash.csv
    (2) a TCRrep is built with default settings and df2 set to dash2.csv
    (3) dense rectangular distances between clone_df and df2 are computed
        and kept as the reference
    (4) sparse rectangular distances are computed at radius 100; after
        pw2dense they must agree with the reference on entries <= radius
    (5) the same is repeated at radius 5000, where the converted matrices
        must equal the reference everywhere
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    df = pd.read_csv("dash.csv")
    df2 = pd.read_csv("dash2.csv")
    df = df.head(10)  # (1)
    tr = TCRrep(cell_df=df,  # (2)
                df2=df2,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    tr.compute_rect_distances(df=tr.clone_df, df2=df2)  # (3)
    assert tr.rw_alpha.shape == (10, 1924)
    assert tr.rw_beta.shape == (10, 1924)

    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    # (4) small radius: compare only the entries within the radius
    radius = 100
    tr.cpus = 1
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)
    recovered = pw2dense(tr.rw_alpha, radius)
    print(dense_alpha[:2, :10])
    print(tr.rw_alpha.todense()[:2, :10])
    print(recovered[:2, :10])
    assert np.all(dense_alpha[dense_alpha <= radius] == recovered[recovered <= radius])
    recovered = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] == recovered[recovered <= radius])

    # (5) large radius: the whole matrices are compared
    radius = 5000
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)
    recovered = pw2dense(tr.rw_alpha, radius)
    print(dense_alpha[:2, :10])
    print(tr.rw_alpha.todense()[:2, :10])
    assert np.all(dense_alpha == recovered)
    recovered = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta == recovered)
def test_TCRrep_deduplicate_gives_warning_above_complete_values():
    """
    Warn the user if there is a missing value in an index column, because
    the cell count will then not match the clone count.

    Warnings are recorded with warnings.catch_warnings(record=True) rather
    than pytest.warns(None), which is deprecated since pytest 7 and is
    rejected by pytest 8. Any warning category is accepted, as before.
    """
    import warnings

    df = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", None],
        "count": [1, 1, 1]
    })
    tr = TCRrep(cell_df=df, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']

    with warnings.catch_warnings(record=True) as record:
        # "always" prevents once-per-location filters from hiding the warning.
        warnings.simplefilter("always")
        tr.deduplicate()

    messages = [str(w.message) for w in record]
    # Expected text starts "Not all cells/sequences could be grouped into
    # clones."; scanning all messages tolerates unrelated earlier warnings.
    assert any(msg.startswith("Not all") for msg in messages), messages
def test_TCRrep_show_incomplete_entries_when_there_are_none():
    """
    show_incomplete() should return an empty two-column DataFrame when no
    row is missing a value in the index columns.
    """
    cells = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", "TRAV1*01"],
        "count": [1, 1, 1]
    })
    tr = TCRrep(cell_df=cells, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']

    incomplete = tr.show_incomplete()
    assert isinstance(incomplete, pd.DataFrame)
    n_rows, n_cols = incomplete.shape[0], incomplete.shape[1]
    assert n_rows == 0
    assert n_cols == 2
def test_repertoire_infer_cdrs_from_v_gene(self):
    """
    After infer_cdrs_from_v_gene() on both chains and deduplicate(), the
    CDR1 and CDR2 columns of clone_df must be retrievable as pandas Series.
    """
    testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])
    for chain_name in ("alpha", "beta"):
        testrep.infer_cdrs_from_v_gene(chain=chain_name)
    testrep.index_cols = [
        'epitope', 'subject',
        'cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
        'cdr3_b_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
    ]
    testrep.deduplicate()

    inferred_cols = ['cdr1_a_aa', 'cdr1_b_aa', 'cdr2_a_aa', 'cdr2_b_aa']
    is_series = [isinstance(testrep.clone_df[col], pd.Series)
                 for col in inferred_cols]
    self.assertTrue(np.all(is_series))
def test_import_vdjtools_beta_w_validation():
    """
    Import a vdjtools beta-chain clonotype file with validate=True, check
    the resulting column names, and load a column subset into a TCRrep.
    """
    import pandas as pd
    import numpy as np
    import os
    from tcrdist.paths import path_to_base
    from tcrdist.vdjtools_funcs import import_vdjtools
    from tcrdist.repertoire import TCRrep

    # Reformat vdj_tools input format for tcrdist3
    vdj_tools_file_beta = os.path.join(
        path_to_base, 'tcrdist', 'data', 'formats',
        'vdj.M_15_CD8_beta.clonotypes.TRB.txt.gz')
    df_beta = import_vdjtools(vdj_tools_file=vdj_tools_file_beta,
                              chain='beta',
                              organism='human',
                              db_file='alphabeta_gammadelta_db.tsv',
                              validate=True)

    expected_columns = [
        'count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene',
        'cdr3_b_nucseq', 'valid_v', 'valid_j', 'valid_cdr3'
    ]
    assert np.all(df_beta.columns == expected_columns)

    # Can be directly imported into a TCRrep instance.
    tcrrep_columns = ['count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene']
    tr = TCRrep(cell_df=df_beta[tcrrep_columns],
                chains=['beta'],
                organism='human',
                compute_distances=False)
def test_introduction_3():
    """
    Plot V/J gene pairings for the 'PA' and 'NP' epitopes of dash.csv,
    write both plots as SVG files into the current directory, and run
    fishersapi.fishers_frame on four gene-column pairs of the 'NP' clones.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=cells,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

    from tcrdist.plotting import plot_pairings, _write_svg

    pa_clones = tr.clone_df.loc[tr.clone_df.epitope == "PA"]
    svg_PA = plot_pairings(pa_clones,
                           cols=['v_b_gene', 'j_b_gene', 'j_a_gene', 'v_a_gene'],
                           count_col='count')
    np_clones = tr.clone_df.loc[tr.clone_df.epitope == "NP"]
    svg_NP = plot_pairings(np_clones,
                           cols=['v_b_gene', 'j_b_gene', 'j_a_gene', 'v_a_gene'],
                           count_col='count')

    for svg, svg_name in ((svg_PA, "PA_gene_usage_plot.svg"),
                          (svg_NP, "NP_gene_usage_plot.svg")):
        _write_svg(svg, name=svg_name, dest=".")

    import fishersapi
    gene_pairs = [('v_b_gene', 'j_b_gene'),
                  ('v_a_gene', 'j_a_gene'),
                  ('v_a_gene', 'v_b_gene'),
                  ('j_a_gene', 'j_b_gene')]
    fishersapi.fishers_frame(tr.clone_df.loc[tr.clone_df.epitope == "NP"],
                             col_pairs=gene_pairs)
def test_validate_chains():
    """An unrecognized chain name ("ALPHA") must raise ValueError with the documented message."""
    expected_message = "TCRrep chains arg can be one or more of the following ['alpha', 'beta', 'gamma', 'delta'] case-sensitive"
    with pytest.raises(ValueError) as info:
        TCRrep(organism="mouse", chains=["ALPHA", "beta"])
    raised_message = str(info.value)
    assert raised_message == expected_message
def test_olga_sample_alphas_for_a_human_repertoire():
    """
    For every clone of dash_human.csv, call gen_cdr3s(n=1) on the beta and
    then on the alpha OlgaModel with that clone's V and J genes (each passed
    through allele_01).
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel

    olga_model_alpha = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    olga_model_beta = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    cells = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=cells,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    def _sample_per_clone(model, v_col, j_col):
        """Return one gen_cdr3s(n=1) result per clone_df row."""
        sampled = []
        for _, row in tr.clone_df[[v_col, j_col]].iterrows():
            sampled.append(model.gen_cdr3s(V=allele_01(row[v_col]),
                                           J=allele_01(row[j_col]),
                                           n=1))
        return sampled

    rb = _sample_per_clone(olga_model_beta, 'v_b_gene', 'j_b_gene')
    ra = _sample_per_clone(olga_model_alpha, 'v_a_gene', 'j_a_gene')
def test_workflow_1():
    """
    Load all the TCRs associated with a particular epitope in the Adaptive
    Biotechnology COVID19 Data Release 2, keep the columns of interest, add
    a count of 1 per row, and build a human beta-chain TCRrep.
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    csv_name = 'mira_epitope_9_2477_FLQSINFVR_FLQSINFVRI_FLYLYALVYF_GLEAPFLYLY_INFVRIIMR_LQSINFVRI_LQSINFVRII_QSINFVRII_SINFVRIIMR_VYFLQSINF_VYFLQSINFV_YFLQSINFVR_YLYALVYFL.tcrdist3.csv'
    filename = os.path.join('tcrdist', 'data', 'covid19', csv_name)

    keep_columns = [
        'cell_type', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
        'epitope', 'age', 'sex', 'race', 'cohort',
        'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1', 'hla-c', 'hla-c_1',
        'dpa1_1', 'dpb1', 'dpb1_1', 'dqa1', 'dqa1_1', 'dqb1', 'dqb1_1',
        'drb1', 'drb1_1', 'drb3'
    ]
    df = pd.read_csv(filename, sep=",")
    df = df[keep_columns]
    df['count'] = 1

    tr = TCRrep(cell_df=df, organism='human', chains=['beta'])
def test_validate_imgt_aligned():
    """A non-boolean imgt_aligned argument ("align") must raise ValueError with the documented message."""
    expected_message = "TCRrep imgt_aligned argument must be a boolean"
    with pytest.raises(ValueError) as info:
        TCRrep(organism="mouse",
               chains=["alpha", "beta"],
               imgt_aligned="align")
    raised_message = str(info.value)
    assert raised_message == expected_message
def test_TCRpublic_with_neighborhood_dif():
    """
    Use values from neighborhood_diff: append selected result columns to
    clone_df, then build a TCRpublic report sorted by ascending p-value.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic

    fn = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    df = pd.read_csv(fn)
    input_columns = [
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]
    tr = TCRrep(cell_df=df[input_columns], organism="human", chains=["beta"])

    from tcrdist.rep_diff import neighborhood_diff
    ndif = neighborhood_diff(clone_df=tr.clone_df,
                             pwmat=tr.pw_beta,
                             count_col='count',
                             x_cols=['cohort'],
                             knn_radius=25,
                             test_method="chi2")

    # Add neighbors and other columns of interest
    # from the neighborhood_diff result to the clone_df.
    ndif_columns = ['neighbors', 'K_neighbors', 'val_0', 'ct_0', 'pvalue']
    tr.clone_df = pd.concat([tr.clone_df, ndif[ndif_columns]], axis=1)

    # The 'neighbors' and 'K_neighbors' columns are now part of clone_df
    # when the TCRpublic instance is created.
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones_with_ndif.html")

    # Add any neighborhood_diff columns
    # that you want to display in the final report.
    for label in ('val_0', 'ct_0', 'pvalue'):
        tp.labels.append(label)

    # Change the sort to be by pvalue rather than publicity ...
    tp.sort_columns = ['pvalue']
    # ... and because you are sorting by pvalue, sort ascending.
    tp.sort_ascending = True
    tp.report()
def motif_creation_human_betas():
    """
    Write palmotif logos for beta-chain CDR3 neighborhoods to test_3.svg.

    For each row of tr.centroids_df (filled in by get_basic_centroids with
    max_dist=75) three logos are written: one computed against OLGA-generated
    reference CDR3s, one without references, and one of the references
    themselves. The references are generated per (V, J) gene pair observed
    in the neighborhood, ``depth`` sequences per observation.

    Fix relative to the previous version: the reference CDR3s are generated
    from ``gene_usages`` (the neighborhood's own V/J beta gene counts).
    It was computed but never used, while the comprehension iterated a name,
    ``combos_alpha``, that this function never defines.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    get_basic_centroids(tr, max_dist=75)

    # Number of reference sequences generated per observed V/J usage.
    depth = 3

    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # Stops at the first centroid with fewer than 5 neighbors.
            # NOTE(review): presumably centroids_df is ordered by
            # neighborhood size -- confirm, otherwise use `continue`.
            if len(r['neighbors']) < 5:
                break

            members = tr.clone_df.iloc[r['neighbors'], ]
            seqs = members['cdr3_b_aa'].to_list()
            # Rows of [v_b_gene, j_b_gene, count] for this neighborhood.
            gene_usages = members[['v_b_gene', 'j_b_gene']] \
                .value_counts().reset_index().to_dict('split')['data']

            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), count * depth)
                for v, j, count in gene_usages
            ])
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # The centroid is appended to the references before they are
            # passed as `seqs` for the background logo.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(seqs=refs,
                                                        centroid=r['cdr3_b_aa'])

            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]
            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')
        oh.write('</body>')
def test_neighbors_and_publicity_directly():
    """
    Instead of enforcing a fixed radius, use a radius specific to each
    centroid, specified in an additional column ('radius'), and derive the
    publicity columns directly from the resulting neighbor lists.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic

    fn = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    df = pd.read_csv(fn)
    input_columns = [
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]
    tr = TCRrep(cell_df=df[input_columns], organism="human", chains=["beta"])

    # NEIGHBORS
    from tcrdist.public import _neighbors_fixed_radius
    from tcrdist.public import _neighbors_variable_radius
    # neighbor lookups at a fixed radius and at per-clone radii
    _neighbors_fixed_radius(pwmat=tr.pw_beta, radius=18)
    _neighbors_variable_radius(pwmat=tr.pw_beta,
                               radius_list=tr.clone_df.radius)

    # the K_ variants, again at a fixed radius and at per-clone radii
    from tcrdist.public import _K_neighbors_fixed_radius
    from tcrdist.public import _K_neighbors_variable_radius
    _K_neighbors_fixed_radius(pwmat=tr.pw_beta, radius=18)
    _K_neighbors_variable_radius(pwmat=tr.pw_beta,
                                 radius_list=tr.clone_df.radius)

    # First find neighbors by your favorite method
    per_clone_neighbors = _neighbors_variable_radius(
        pwmat=tr.pw_beta, radius_list=tr.clone_df.radius)
    tr.clone_df['neighbors'] = per_clone_neighbors

    # Once neighbors are added to a clone_df you can easily determine publicity.
    def _count_subjects(neighbor_idx):
        """Number of distinct subjects among the rows at *neighbor_idx*."""
        return tr.clone_df['subject'].iloc[neighbor_idx].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
def test_mixcr_to_tcrdist_on_clones():
    """
    Convert a MiXCR clones file (delta chain, human) to a tcrdist DataFrame,
    remove entries with an invalid V gene, and load the result into a TCRrep
    that is then deduplicated on delta-chain index columns.

    Fix relative to the previous version: the second isinstance check now
    inspects ``df1`` (the filtered frame it follows) instead of re-checking
    ``df``.
    """
    test_clones = os.path.join('tcrdist', 'test_files_compact',
                               'SRR5130260.1.test.fastq.output.clns.txt')
    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 clones_fn=test_clones)
    assert isinstance(df, pd.DataFrame)

    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    assert isinstance(df1, pd.DataFrame)

    df1['subject'] = 'SRR5130260.1'
    tr = TCRrep(cell_df=df1,
                organism="human",
                chains=['delta'],
                db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])

    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)
    tr.index_cols = [
        'subject', "v_d_gene", 'd_d_gene', 'j_d_gene', 'cdr3_d_nucseq',
        'cdr3_d_aa', 'cdr1_d_aa', 'cdr2_d_aa', 'pmhc_d_aa'
    ]
    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
def test_example_10_sparse_multiprocessing():
    """
    Compute dense rectangular distances of dash2.csv against itself, then
    recompute them sparsely at radius 150 with tr.cpus = 2 and check that,
    after pw2dense, both agree on all entries <= radius.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    df2 = pd.read_csv("dash2.csv")
    tr = TCRrep(cell_df=df2,
                df2=df2,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    tr.compute_rect_distances(df=tr.clone_df, df2=df2)
    expected_shape = (1924, 1924)
    assert tr.rw_alpha.shape == expected_shape
    assert tr.rw_beta.shape == expected_shape

    # Keep the dense results as the reference before they are overwritten.
    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    radius = 150
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)

    recovered = pw2dense(tr.rw_alpha, radius)
    assert np.all(dense_alpha[dense_alpha <= radius] == recovered[recovered <= radius])
    recovered = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] == recovered[recovered <= radius])