def test_TCRrep_func_tcrdist2_save_auto_rebuild(chains = ['beta','alpha'], metric = "nw"):
    """
    Save a TCRrep with .tcrdist2(save=True), restore it with .rebuild(),
    and compare the restored object with a freshly computed reference.

    Parameters
    ----------
    chains : list
        Chains passed to TCRrep for the archived, rebuilt and reference objects.
    metric : str
        Metric passed to .tcrdist2() for the archived and reference objects.
    """
    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes = cpu,
                metric = metric,
                dump = True,
                reduce = True,
                save = True,
                dest = "default_archive",
                dest_tar_name = "default_archive.tar.gz")
    # Cleanup folder that you just made
    # NOTE(review): dest above is "default_archive", but this removes
    # "myTCRrep_archive" -- confirm which folder is meant.
    os.system("rm -rf myTCRrep_archive")
    # Rebuild from the tarball into a TCRrep that starts with an empty cell_df
    tr = TCRrep(cell_df=df.iloc[0:0,:], chains=chains, organism='mouse')
    tr.rebuild(dest_tar_name = "default_archive.tar.gz")
    # The reference must use the same chains/metric as the archived object;
    # previously these were hard-coded and ignored the function arguments.
    tr_compare = run_TCRrep_func_tcrdist2(chains = list(chains), metric = metric)
    attendance = {k : k in tr.__dict__ for k in tr_compare.__dict__}
    for required in ('pw_tcrdist', 'pw_alpha', 'pw_beta', 'clone_df', 'cell_df'):
        assert attendance[required]
    # only compare things in common
    shared_attributes = [k for k, present in attendance.items() if present]
    for k in shared_attributes:
        rebuilt = getattr(tr, k)
        reference = getattr(tr_compare, k)
        if isinstance(rebuilt, pd.DataFrame):
            # DataFrames are not compared here
            continue
        if isinstance(rebuilt, dict):
            # dictionaries are compared by key set only
            assert set(rebuilt.keys()) == set(reference.keys())
        else:
            assert np.all(rebuilt == reference)
    assert set(tr.all_genes['mouse'].keys()) == set(tr_compare.all_genes['mouse'].keys())
    assert set(tr.all_genes['human'].keys()) == set(tr_compare.all_genes['human'].keys())
def test_calc_radii():
    """
    Compute per-clone radii for the "PA" clones against the non-"PA" clones
    with calc_radii, then add neighbor lists, the number of distinct subjects
    among each clone's neighbors, and a 'qpublic' flag to clone_df.
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.centers import calc_radii
    from tcrdist.public import _neighbors_variable_radius

    # Foreground: PA clones, pairwise distances computed
    target_df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop = True)
    tr = TCRrep(cell_df = target_df.copy(),
                organism = 'mouse',
                chains = ['beta'],
                db_file = 'alphabeta_gammadelta_db.tsv',
                compute_distances = True)

    # Background: everything that is not PA, no distances computed up front
    bkgd_df = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop = True)
    tr_bkgd = TCRrep(cell_df = bkgd_df.copy(),
                     organism = 'mouse',
                     chains = ['beta'],
                     db_file = 'alphabeta_gammadelta_db.tsv',
                     compute_distances = False)

    radii, thresholds, ecdfs = calc_radii(tr = tr,
                                          tr_bkgd = tr_bkgd,
                                          chain = 'beta',
                                          ctrl_bkgd = 10**-3,
                                          use_sparse = False,
                                          max_radius = 50)

    # Neighbors within each clone's own radius, in the foreground (pw_beta)
    # and in the background (rw_beta)
    fg_neighbors = _neighbors_variable_radius(pwmat = tr.pw_beta, radius_list = radii)
    bg_neighbors = _neighbors_variable_radius(pwmat = tr.rw_beta, radius_list = radii)

    tr.clone_df['radius'] = radii
    tr.clone_df['neighbors'] = fg_neighbors
    tr.clone_df['background_neighbors'] = bg_neighbors

    def _count_subjects(idx):
        # distinct subjects among the rows listed in idx
        return tr.clone_df['subject'].iloc[idx].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
    tr.clone_df
def do_search2(file, df_search, dest, tag, path):
    """
    Tabulate one bulk file against a set of search clones (delta chain) and
    write the result as a tab-separated file.

    Parameters
    ----------
    file : str
        File name inside `path`; '.tcrdist.tsv' is removed to get the sample name.
    df_search : DataFrame
        Passed as cell_df to the search TCRrep.
    dest : str
        Directory the '<sample>.<tag>.bulk_tabulation.tsv' file is written to.
    tag : str
        Label inserted into the output file name.
    path : str
        Directory containing `file`.

    Returns
    -------
    str
        Elapsed time formatted as '<seconds with 4 decimals>s'.
    """
    sample_name = file.replace('.tcrdist.tsv', '')
    tic = time.perf_counter()

    # TCRrep holding the search clones; distances are computed later
    tr_search = TCRrep(cell_df=df_search,
                       organism='human',
                       chains=['delta'],
                       db_file='alphabeta_gammadelta_db.tsv',
                       compute_distances=False)
    tr_search.cpus = 1

    # The bulk file carries the CDR3 in a beta-named column; rename it to delta
    bulk_path = os.path.join(path, file)
    df_bulk = pd.read_csv(bulk_path, sep='\t')
    df_bulk = df_bulk.rename(columns={'cdr3_b_aa': 'cdr3_d_aa'})
    print(df_bulk)
    wanted = ['cdr3_d_aa', 'v_d_gene', 'j_d_gene', 'templates', 'productive_frequency']
    df_bulk = df_bulk[wanted].rename(columns={'templates': 'count'})
    tr_bulk = TCRrep(cell_df=df_bulk,
                     organism='human',
                     chains=['delta'],
                     db_file='alphabeta_gammadelta_db.tsv',
                     compute_distances=False)

    search_clones = tr_search.clone_df.shape[0]
    bulk_clones = tr_bulk.clone_df.shape[0]
    # Chunk size chosen by get_safe_chunk with a target of 10**7
    # (presumably comparisons per process, to limit memory pressure)
    ideal_chunk_size = get_safe_chunk(search_clones, bulk_clones, target=10**7)
    tr_search.compute_sparse_rect_distances(df=tr_search.clone_df,
                                            df2=tr_bulk.clone_df,
                                            chunk_size=ideal_chunk_size)
    result = tabulate(clone_df1=tr_search.clone_df,
                      clone_df2=tr_bulk.clone_df,
                      pwmat=tr_search.rw_delta,
                      cdr3_name='cdr3_d_aa',
                      v_gene_name='v_d_gene',
                      j_gene_name='j_d_gene')

    outfile = os.path.join(dest, f"{sample_name}.{tag}.bulk_tabulation.tsv")
    print(f"WRITING: {outfile}")
    result.to_csv(outfile, sep='\t', index=False)

    toc = time.perf_counter()
    elapsed = f"{toc - tic:0.4f}"
    print(f"TABULATED IN {elapsed} seconds")
    del tr_search
    del tr_bulk
    return f"{elapsed}s"
def test_current_example():
    """
    Run bkgd_cntl_nn2 on a MIRA epitope set against its background file,
    then write the centers table and a sparse copy of rw_beta to disk.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    mira_name = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv'
    data_dir = os.path.join('tcrdist', 'data', 'covid19')

    # Background repertoire
    fn_mira_background = os.path.join(data_dir, mira_name + '.olga100K_brit100K_bkgd.csv')
    df_background = pd.read_csv(fn_mira_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    # Target repertoire, restricted to the three beta columns
    fn_mira = os.path.join(data_dir, mira_name)
    df_mira = pd.read_csv(fn_mira)
    df_mira = df_mira[['v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)
    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=10**-6,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=list(range(0, 50, 2)),
                               generate_regex=True,
                               test_regex=True)

    out_fn_center_df = mira_name + '.centers_df.csv'
    out_fn_rw_beta_sparse_matrix = mira_name + '.rw_beta.sparse.npz'
    centers_df.to_csv(out_fn_center_df, index=False)

    # Order matters: true zeros become 1 first so they survive the next step,
    # which zeroes every distance greater than 50 before sparse conversion.
    tr.rw_beta[tr.rw_beta == 0] = 1
    tr.rw_beta[tr.rw_beta > 50] = 0
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(out_fn_rw_beta_sparse_matrix, rw_beta_sparse)
def test_example_with_report():
    """
    Example that would work with a large background: compute sparse distances
    between "PA" clones and a sampled background, derive per-clone radii with
    calc_radii (capped at 26), tabulate neighbors, and build a TCRpublic report.
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.background import sample_britanova
    from tcrdist.sample import _default_sampler
    # A useful background for beta chain
    df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop = True)
    tr = TCRrep(cell_df = df.copy(),
                organism = 'mouse',
                chains = ['beta'],
                db_file = 'alphabeta_gammadelta_db.tsv',
                compute_distances = True)
    ts = _default_sampler(organism = "mouse", chain = "beta")()
    # The sampler's reference table uses its own column names; map them to
    # the beta-chain names passed to TCRrep.
    bkgd_df = ts.ref_df.rename(columns = {'v_reps' : 'v_b_gene',
                                          'j_reps': 'j_b_gene',
                                          'cdr3': 'cdr3_b_aa'}).copy()
    trb = TCRrep(cell_df = bkgd_df,
                 organism = 'mouse',
                 chains = ['beta'],
                 db_file = 'alphabeta_gammadelta_db.tsv',
                 compute_distances = False)
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df = tr.clone_df, df2 = trb.clone_df, radius=50, chunk_size=100)
    from tcrdist.centers import calc_radii
    radii, thresholds, ecdfs = calc_radii(tr = tr,
                                          tr_bkgd = trb,
                                          chain = 'beta',
                                          ctrl_bkgd = 10**-5,
                                          use_sparse = True,
                                          max_radius=50)
    # Set a maximum radius of 26.
    # Use .loc for the masked write: chained indexing
    # (clone_df['radius'][mask] = 26) may act on a temporary and does nothing
    # under pandas Copy-on-Write.
    tr.clone_df['radius'] = radii
    tr.clone_df.loc[tr.clone_df['radius'] > 26, 'radius'] = 26
    # Quick access to publicity
    from tcrdist.public import _neighbors_sparse_variable_radius, _neighbors_variable_radius
    tr.clone_df['neighbors'] = _neighbors_variable_radius(
        pwmat = tr.pw_beta,
        radius_list = tr.clone_df['radius'])
    tr.clone_df['background_neighbors'] = _neighbors_sparse_variable_radius(
        csrmat = tr.rw_beta,
        radius_list = tr.clone_df['radius'])
    # Number of distinct subjects among each clone's neighbors
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].\
        apply(lambda x: x > 1)
    tr.clone_df
    # A Report
    from tcrdist.public import TCRpublic
    tp = TCRpublic(
        tcrrep = tr,
        output_html_name = "quasi_public_clones.html")
    tp.fixed_radius = False
    rp = tp.report()
def test_old_example():
    """
    Show how thresholds can be chosen based on a background discovery rate,
    using bkgd_cntl_nn2 on a small target set and a small background set.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.regex import _index_to_regex_str, _index_to_seqs
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets

    data_dir = ('tcrdist', 'data', 'covid19')

    # Background clones, each given a weight of 1
    bkgd_path = os.path.join(*data_dir, "m60_bkgd_test_input.csv")
    df_background = pd.read_csv(bkgd_path)
    tr_background = TCRrep(cell_df=df_background,
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)
    tr_background.clone_df['weights'] = 1

    # Target clones
    target_path = os.path.join(*data_dir, "m60_test_input.csv")
    df = pd.read_csv(target_path)
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=2 * 10**-5,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=list(range(0, 50, 2)),
                               generate_regex=True,
                               test_regex=True)
    # The sorted view is not stored; the call only exercises the sort
    centers_df.sort_values(['target_hits'], ascending=False)
def setUpClass(self):
    """
    Build the shared fixtures: a beta-chain CDR3 pairwise matrix (self.pw)
    and a clone table with random 'Visit' and 'Stim' labels (self.clone_df),
    from the human M1 entries of the bundled vdjDB test file.
    """
    filename = op.join(td.__path__[0], 'test_files', 'vdjDB_PMID28636592.tsv')
    pd_df = pd.read_csv(filename, sep='\t')
    t_df = td.mappers.vdjdb_to_tcrdist2(pd_df=pd_df)
    # Keep only human clones for the M1 epitope
    t_df = t_df.loc[(t_df.organism == 'HomoSapiens') & (t_df.epitope == 'M1')]
    tr = TCRrep(cell_df=t_df, organism='human')
    tr.infer_cdrs_from_v_gene(chain='alpha')
    tr.infer_cdrs_from_v_gene(chain='beta')
    tr.index_cols = ['subject', 'cdr3_b_aa']
    tr.deduplicate()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # The keyword was misspelled 'proceses', so the requested setting
        # never reached the 'processes' option.
        tr.compute_pairwise_all(chain='beta', metric='nw', processes=1)
    self.pw = tr.cdr3_b_aa_pw
    # Fixed seed so the random labels are reproducible
    np.random.seed(110820)
    n_clones = tr.clone_df.shape[0]
    self.clone_df = tr.clone_df.assign(
        Visit=np.random.choice(['Pre', 'Post'], size=n_clones, p=[0.4, 0.6]),
        Stim=np.random.choice(['A', 'B', 'C'], size=n_clones, p=[0.4, 0.1, 0.5]))
def test_introduction_3():
    """
    Draw gene-pairing plots for the "PA" and "NP" epitopes, write them as SVG
    files in the current directory, and run fishers_frame on the NP clones.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    dash = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=dash,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)
    from tcrdist.plotting import plot_pairings, _write_svg

    # Build both plots first, then write them out
    svgs = {}
    for epitope in ("PA", "NP"):
        subset = tr.clone_df.loc[tr.clone_df.epitope == epitope]
        svgs[epitope] = plot_pairings(
            subset,
            cols=['v_b_gene', 'j_b_gene', 'j_a_gene', 'v_a_gene'],
            count_col='count')
    for epitope, svg in svgs.items():
        _write_svg(svg, name=f"{epitope}_gene_usage_plot.svg", dest=".")

    import fishersapi
    np_clones = tr.clone_df.loc[tr.clone_df.epitope == "NP"]
    gene_pairs = [('v_b_gene', 'j_b_gene'),
                  ('v_a_gene', 'j_a_gene'),
                  ('v_a_gene', 'v_b_gene'),
                  ('j_a_gene', 'j_b_gene')]
    fishersapi.fishers_frame(np_clones, col_pairs=gene_pairs)
def test_TCRrep_func_tcrdist2_save_manual_rebuild(chains = ['beta','alpha'], metric = "nw"):
    """
    Save a TCRrep with .tcrdist2(save=True), then restore it by hand through
    Zipdist2._build and check the restored distance matrices.
    """
    n_cpu = multiprocessing.cpu_count()
    # really basic example: NP epitope only
    dash = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    dash = dash[dash.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=dash, chains=chains, organism='mouse')
    tr.tcrdist2(processes = n_cpu,
                metric = metric,
                dump = True,
                reduce = True,
                save = True,
                dest = "default_archive",
                dest_tar_name = "default_archive.tar.gz")
    # Cleanup folder that you just made
    os.system("rm -rf myTCRrep_archive")
    # Rebuild into a TCRrep that starts with an empty cell_df
    empty = dash.iloc[0:0,:]
    tr = TCRrep(cell_df=empty, chains=chains, organism='mouse')
    z = Zipdist2(name = "test_only", target = tr)
    z._build(dest_tar = "default_archive.tar.gz", target = tr)
    for attr in ('paired_tcrdist', 'pw_tcrdist'):
        assert isinstance(getattr(tr, attr), np.ndarray)
    assert np.array_equal(tr.pw_tcrdist, tr.paired_tcrdist)
def test_repertoire_full_use_case(self):
    """
    Use-case test (not a unit test per se) of TCRrep for pairwise sequence
    comparison: the sum of the alpha and beta CDR3 pairwise matrices from
    compute_pairwise must equal a stored expected matrix.
    """
    rep = TCRrep(cell_df = example_df, chains = ["alpha", "beta"])
    # Clones are additionally keyed on epitope and subject
    rep.index_cols.append("epitope")
    rep.index_cols.append("subject")
    rep.deduplicate()
    rep.cdr3_a_aa_smat = 'blosum62'
    rep.cdr3_b_aa_smat = 'blosum62'
    for chain in ("alpha", "beta"):
        rep.compute_pairwise(chain = chain)
    observed = rep.cdr3_a_aa_pw + rep.cdr3_b_aa_pw

    expected = np.array([
        [  0., 222., 210., 223., 231., 239., 219., 231., 175.],
        [222.,   0., 116., 175., 173., 185., 131., 209., 205.],
        [210., 116.,   0., 175., 169., 183., 145., 201., 221.],
        [223., 175., 175.,   0., 154., 200., 162., 234., 202.],
        [231., 173., 169., 154.,   0., 152., 120., 182., 192.],
        [239., 185., 183., 200., 152.,   0., 146., 112., 192.],
        [219., 131., 145., 162., 120., 146.,   0., 178., 172.],
        [231., 209., 201., 234., 182., 112., 178.,   0., 220.],
        [175., 205., 221., 202., 192., 192., 172., 220.,   0.],
    ])
    matches = (observed == expected)
    self.assertTrue(matches.all())
def test_repertoire____use_case_hamming_paired_tcrdist(self):
    """
    Compute hamming distances for both chains, combine them with
    compute_paired_tcrdist, and compare 'paired_tcrdist' with a stored matrix.
    """
    tr = TCRrep(cell_df = example_df.copy(),
                organism = "human",
                chains = ["alpha", "beta"])
    for chain in ("alpha", "beta"):
        tr.infer_cdrs_from_v_gene(chain = chain)
    tr.index_cols = ['cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
                     'cdr3_b_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa']
    tr.deduplicate()
    for chain in ("alpha", "beta"):
        tr.compute_pairwise_all(chain = chain, metric = "hamming")
    r = tr.compute_paired_tcrdist(chains = ['alpha', 'beta'])

    expected_matrix = np.array([
        [ 0., 50., 49., 48., 49., 49., 51., 44., 46.],
        [50.,  0., 21., 29., 29., 47., 40., 45., 44.],
        [49., 21.,  0., 42., 39., 47., 42., 44., 50.],
        [48., 29., 42.,  0., 14., 35., 48., 52., 46.],
        [49., 29., 39., 14.,  0., 30., 45., 49., 44.],
        [49., 47., 47., 35., 30.,  0., 46., 43., 48.],
        [51., 40., 42., 48., 45., 46.,  0., 36., 41.],
        [44., 45., 44., 52., 49., 43., 36.,  0., 47.],
        [46., 44., 50., 46., 44., 48., 41., 47.,  0.],
    ])
    # The weights are recorded alongside the matrix but only the matrix is
    # asserted below.
    expected = {'paired_tcrdist': expected_matrix,
                'paired_tcrdist_weights': {'cdr1_a_aa_pw': 1,
                                           'cdr1_b_aa_pw': 1,
                                           'cdr2_a_aa_pw': 1,
                                           'cdr2_b_aa_pw': 1,
                                           'cdr3_a_aa_pw': 1,
                                           'cdr3_b_aa_pw': 1,
                                           'pmhc_a_aa_pw': 1,
                                           'pmhc_b_aa_pw': 1}}
    same = (r['paired_tcrdist'] == expected['paired_tcrdist'])
    self.assertTrue(same.all())
def test_gamma_delta_manually_step_by_step():
    """
    Test that the user can go through the automatic steps, manually,
    one-by-one for well formatted gamma delta data.

    The TCRrep is created with every automatic step switched off; each step
    is then invoked explicitly before distances are computed.
    """
    df = pd.read_csv("sant.csv")
    tr = TCRrep(cell_df=df,
                organism="human",
                chains=['gamma', "delta"],
                imgt_aligned=False,
                infer_cdrs=False,
                infer_index_cols=False,
                deduplicate=False,
                use_defaults=False,
                store_all_cdr=False,
                compute_distances=False,
                cpus=1,
                db_file='alphabeta_gammadelta_db.tsv')
    tr.infer_cdrs_from_v_gene(chain="gamma")
    tr.infer_cdrs_from_v_gene(chain="delta")
    tr.infer_index_cols()
    tr.show_incomplete()
    tr.deduplicate()
    tr._initialize_chain_specific_attributes()
    # Was misspelled 'stora_all_cdr', which only created a new, unused
    # attribute and left the constructor's store_all_cdr=False in effect.
    tr.store_all_cdr = True
    tr.compute_distances()
    tr.pw_gamma
def test_validate_imgt_aligned():
    """Test that a non-boolean imgt_aligned argument raises ValueError."""
    expected_msg = "TCRrep imgt_aligned argument must be a boolean"
    with pytest.raises(ValueError) as info:
        TCRrep(organism="mouse",
               chains=["alpha", "beta"],
               imgt_aligned="align")
    # Checked after the context exits, once the exception has been captured
    assert str(info.value) == expected_msg
def test_validate_chains():
    """Test that an incorrectly cased chain name raises ValueError."""
    expected_msg = "TCRrep chains arg can be one or more of the following ['alpha', 'beta', 'gamma', 'delta'] case-sensitive"
    with pytest.raises(ValueError) as info:
        TCRrep(organism="mouse", chains=["ALPHA", "beta"])
    # Checked after the context exits, once the exception has been captured
    assert str(info.value) == expected_msg
def test_repertoire_full_use_case_hamming(self):
    """
    Use-case test (not a unit test per se) of TCRrep for pairwise sequence
    comparison: the sum of the alpha and beta CDR3 pairwise matrices from
    compute_pairwise_all with the hamming metric must equal a stored matrix.
    """
    rep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])
    # Clones are additionally keyed on epitope and subject
    rep.index_cols.append("epitope")
    rep.index_cols.append("subject")
    rep.deduplicate()
    rep.cdr3_a_aa_smat = 'blosum62'
    rep.cdr3_b_aa_smat = 'blosum62'
    for chain in ("alpha", "beta"):
        rep.compute_pairwise_all(chain=chain, metric="hamming")
    observed = rep.cdr3_a_aa_pw + rep.cdr3_b_aa_pw

    expected = np.array([
        [ 0., 18., 17., 18., 19., 22., 19., 18., 16.],
        [18.,  0., 11., 15., 15., 17., 10., 18., 18.],
        [17., 11.,  0., 18., 15., 17., 13., 18., 20.],
        [18., 15., 18.,  0., 14., 19., 14., 20., 18.],
        [19., 15., 15., 14.,  0., 14., 11., 17., 16.],
        [22., 17., 17., 19., 14.,  0., 14., 13., 18.],
        [19., 10., 13., 14., 11., 14.,  0., 17., 15.],
        [18., 18., 18., 20., 17., 13., 17.,  0., 19.],
        [16., 18., 20., 18., 16., 18., 15., 19.,  0.],
    ])
    matches = (observed == expected)
    self.assertTrue(matches.all())
def test_mixcr_to_tcrdist_on_clones():
    """
    Convert a MiXCR clones file to a tcrdist2 DataFrame, drop entries with an
    invalid V gene, and check that a TCRrep built from it can infer CDRs and
    deduplicate into a clone_df.
    """
    test_clones = os.path.join('tcrdist',
                               'test_files_compact',
                               'SRR5130260.1.test.fastq.output.clns.txt')
    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 clones_fn=test_clones)
    assert isinstance(df, pd.DataFrame)
    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    # Check the filtered frame; this assertion previously re-checked `df`.
    assert isinstance(df1, pd.DataFrame)
    df1['subject'] = 'SRR5130260.1'
    tr = TCRrep(cell_df=df1,
                organism="human",
                chains=['delta'],
                db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])
    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)
    tr.index_cols = [
        'subject', "v_d_gene", 'd_d_gene', 'j_d_gene', 'cdr3_d_nucseq',
        'cdr3_d_aa', 'cdr1_d_aa', 'cdr2_d_aa', 'pmhc_d_aa'
    ]
    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
def test_import_vdjtools_beta_w_validation():
    """
    Import a vdjtools beta-chain file with validate=True, check the returned
    column names, and load a subset of the columns into a TCRrep.
    """
    import pandas as pd
    import numpy as np
    import os
    from tcrdist.paths import path_to_base
    from tcrdist.vdjtools_funcs import import_vdjtools
    from tcrdist.repertoire import TCRrep

    # Reformat vdj_tools input format for tcrdist3
    vdj_tools_file_beta = os.path.join(
        path_to_base, 'tcrdist', 'data', 'formats',
        'vdj.M_15_CD8_beta.clonotypes.TRB.txt.gz')
    df_beta = import_vdjtools(vdj_tools_file=vdj_tools_file_beta,
                              chain='beta',
                              organism='human',
                              db_file='alphabeta_gammadelta_db.tsv',
                              validate=True)

    # With validation on, three 'valid_*' columns follow the data columns
    expected_columns = [
        'count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene',
        'cdr3_b_nucseq', 'valid_v', 'valid_j', 'valid_cdr3'
    ]
    assert np.all(df_beta.columns == expected_columns)

    # Can be directly imported into a TCRrep instance.
    subset = df_beta[['count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene']]
    tr = TCRrep(cell_df=subset,
                chains=['beta'],
                organism='human',
                compute_distances=False)
def test_example_10_sparse_multiprocessing():
    """
    Check that compute_sparse_rect_distances agrees with the dense
    compute_rect_distances result for every distance within the radius.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    df2 = pd.read_csv("dash2.csv")
    tr = TCRrep(cell_df=df2,
                df2=df2,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Dense rectangular distances serve as the reference
    tr.compute_rect_distances(df=tr.clone_df, df2=df2)
    assert tr.rw_alpha.shape == (1924, 1924)
    assert tr.rw_beta.shape == (1924, 1924)
    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    # Sparse version with 2 cpus; this overwrites rw_alpha / rw_beta
    radius = 150
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)

    from_sparse = pw2dense(tr.rw_alpha, radius)
    assert np.all(dense_alpha[dense_alpha <= radius] == from_sparse[from_sparse <= radius])
    from_sparse = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] == from_sparse[from_sparse <= radius])
def test_ex5():
    """
    Run tcrdist2 with the hamming metric and custom replacement weights, then
    check that the chain-level and combined matrices are the weighted sums of
    the per-region matrices.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    dash = pd.read_csv('dash.csv')
    dash = dash[dash.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=dash, chains=['alpha', 'beta'], organism='mouse')

    # CDR3 gets weight 3, every other region weight 1
    weights = {
        'cdr3_a_aa': 3,
        'pmhc_a_aa': 1,
        'cdr2_a_aa': 1,
        'cdr1_a_aa': 1,
        'cdr3_b_aa': 3,
        'pmhc_b_aa': 1,
        'cdr2_b_aa': 1,
        'cdr1_b_aa': 1
    }
    tr.tcrdist2(processes=1,
                metric='hamming',
                reduce=True,
                dump=False,
                save=False,
                replacement_weights=weights)

    assert np.all(tr.pw_tcrdist == tr.pw_alpha + tr.pw_beta)
    expected_beta = (3 * tr.cdr3_b_aa_pw + tr.pmhc_b_aa_pw
                     + tr.cdr2_b_aa_pw + tr.cdr1_b_aa_pw)
    assert np.all(tr.pw_beta == expected_beta)
    expected_alpha = (3 * tr.cdr3_a_aa_pw + tr.pmhc_a_aa_pw
                      + tr.cdr2_a_aa_pw + tr.cdr1_a_aa_pw)
    assert np.all(tr.pw_alpha == expected_alpha)
def test_olga_sample_alphas_for_a_human_repertoire():
    """
    For every clone in a human alpha/beta repertoire, call gen_cdr3s (n=1) on
    the beta and alpha OlgaModel with that clone's V and J genes passed
    through allele_01.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel

    olga_model_alpha = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    olga_model_beta = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    # One call per clone, beta first and then alpha
    rb = []
    for v_gene, j_gene in zip(tr.clone_df['v_b_gene'], tr.clone_df['j_b_gene']):
        rb.append(olga_model_beta.gen_cdr3s(V=allele_01(v_gene),
                                            J=allele_01(j_gene),
                                            n=1))
    ra = []
    for v_gene, j_gene in zip(tr.clone_df['v_a_gene'], tr.clone_df['j_a_gene']):
        ra.append(olga_model_alpha.gen_cdr3s(V=allele_01(v_gene),
                                             J=allele_01(j_gene),
                                             n=1))
def test_integration_TCRrep_with_TCRMotif():
    """
    Build a mouse TCRrep from the vdjDB test file and pass its
    tcr_motif_clones_df() output to TCRMotif for the "PA" epitope.
    """
    import pandas as pd
    import tcrdist as td
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif

    vdjdb_path = os.path.join("tcrdist", "test_files", "vdjDB_PMID28636592.tsv")
    raw = pd.read_csv(vdjdb_path, sep = "\t")
    t_df = td.mappers.vdjdb_to_tcrdist2(pd_df = raw)
    t_df.organism.value_counts

    # Keep the mouse clones only
    is_mouse = t_df.organism == "MusMusculus"
    t_df_mus = t_df.loc[is_mouse, :].copy()

    tr = TCRrep(cell_df = t_df_mus, organism = "mouse")
    for chain in ('alpha', 'beta'):
        tr.infer_cdrs_from_v_gene(chain = chain)
    tr.index_cols = [
        'subject', 'epitope',                            # subject and epitope
        'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',  # gene usage
        'cdr3_a_aa', 'cdr3_b_aa',                        # CDR 3
        'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',           # alpha CDR 1, 2, and 2.5
        'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',           # beta CDR 1, 2, and 2.5
    ]
    tr.deduplicate()

    motif = TCRMotif(clones_df = tr.tcr_motif_clones_df(),
                     organism = "mouse",
                     chains = ["A", "B"],
                     epitopes = ["PA"])
    assert isinstance(motif.clones_df, pd.DataFrame)
def test_pgen_1(): """ How to add pgen estimates to human alpha/beta CDR3s """ import pandas as pd from tcrdist.pgen import OlgaModel from tcrdist import mappers from tcrdist.repertoire import TCRrep from tcrdist.setup_tests import download_and_extract_zip_file df = pd.read_csv("dash_human.csv") tr = TCRrep(cell_df = df.sample(5, random_state = 3), organism = 'human', chains = ['alpha','beta'], db_file = 'alphabeta_gammadelta_db.tsv', store_all_cdr = False) olga_beta = OlgaModel(chain_folder = "human_T_beta", recomb_type="VDJ") olga_alpha = OlgaModel(chain_folder = "human_T_alpha", recomb_type="VJ") tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens( CDR3_seq = tr.clone_df.cdr3_b_aa) tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens( CDR3_seq = tr.clone_df.cdr3_a_aa) tr.clone_df[['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa','pgen_cdr3_a_aa']] """
def test_workflow_1():
    """
    Load all the TCRs associated with a particular epitope in the
    Adaptive Biotechnology COVID19 Data Release 2 and build a beta-chain
    TCRrep from them, with a count of 1 per row.
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    mira_file = 'mira_epitope_9_2477_FLQSINFVR_FLQSINFVRI_FLYLYALVYF_GLEAPFLYLY_INFVRIIMR_LQSINFVRI_LQSINFVRII_QSINFVRII_SINFVRIIMR_VYFLQSINF_VYFLQSINFV_YFLQSINFVR_YLYALVYFL.tcrdist3.csv'
    filename = os.path.join(os.path.join('tcrdist', 'data', 'covid19'), mira_file)
    df = pd.read_csv(filename, sep=",")

    # Sequence columns plus subject metadata and HLA typing columns
    keep = [
        'cell_type', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
        'epitope', 'age', 'sex', 'race', 'cohort',
        'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1', 'hla-c', 'hla-c_1',
        'dpa1_1', 'dpb1', 'dpb1_1', 'dqa1', 'dqa1_1', 'dqb1', 'dqb1_1',
        'drb1', 'drb1_1', 'drb3'
    ]
    df = df[keep]
    df['count'] = 1
    tr = TCRrep(cell_df=df, organism='human', chains=['beta'])
def test_ex11():
    """
    Rebuild a TCRrep from "some_archive.tar.gz", starting from an instance
    created with an empty DataFrame.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    empty = pd.DataFrame()
    tr = TCRrep(cell_df=empty,
                chains=['alpha', 'beta'],
                organism='mouse')
    tr.rebuild(dest_tar_name="some_archive.tar.gz")
def test_tcr_join_tcrdist():
    """
    Compute sparse beta-chain distances between the v21df and v20df clone
    sets (radius 36) and join the two tables with join_by_dist (radius 24).
    """
    import pandas as pd
    from tcrdist.breadth import get_safe_chunk
    from tcrdist.repertoire import TCRrep
    from tcrdist.join import join_by_dist

    def _beta_rep(frame):
        # Beta-chain TCRrep over the six columns used here, distances deferred
        cols = ['subject', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                'bio_identity', 'protein_coordinate']
        return TCRrep(cell_df=frame[cols].copy(),
                      organism='human',
                      chains=['beta'],
                      compute_distances=False)

    tr20 = _beta_rep(v20df)
    tr21 = _beta_rep(v21df)

    tr21.cpus = 2
    chunk = get_safe_chunk(tr21.clone_df.shape[0], tr20.clone_df.shape[0])
    tr21.compute_sparse_rect_distances(df=tr21.clone_df,
                                       df2=tr20.clone_df,
                                       radius=36,
                                       chunk_size=chunk)

    left_right_comparision = join_by_dist(
        how='inner',
        csrmat=tr21.rw_beta,
        left_df=v21df,
        right_df=v20df,
        left_cols=['cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                   'protein_coordinate', 'bio_identity', 'subject'],
        right_cols=['cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                    'protein_coordinate', 'bio_identity', 'subject'],
        left_suffix='_x',
        right_suffix='_y',
        max_n=10,
        radius=24)
def run_TCRrep_func_tcrdist2(chains = ['beta','alpha'], metric = "nw"):
    """
    Build a mouse TCRrep from the "NP" clones of the dash dataset and run
    .tcrdist2() on it without saving.

    Parameters
    ----------
    chains : list
        Chains passed to TCRrep.
    metric : str
        Metric passed to .tcrdist2().

    Returns
    -------
    TCRrep
        The instance after .tcrdist2() has run.
    """
    n_cpu = multiprocessing.cpu_count()
    # really basic example: NP epitope only
    dash = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    np_only = dash[dash.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=np_only, chains=chains, organism='mouse')
    tr.tcrdist2(processes = n_cpu,
                metric = metric,
                dump = True,
                reduce = True,
                save = False)
    return tr
def motif_creation_human_betas():
    """
    Write beta-chain CDR3 motif logos for each centroid neighborhood of a
    human repertoire to "test_3.svg".

    For each centroid with at least 5 neighbors, three palmotif logos are
    written: neighbors against generated reference CDR3s, neighbors alone,
    and the references (plus the centroid) alone. The loop stops at the
    first centroid with fewer than 5 neighbors.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel
    oma = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    from tcrdist.adpt_funcs import get_basic_centroids
    get_basic_centroids(tr, max_dist=75)
    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            if len(r['neighbors']) < 5:
                break
            neighbor_rows = tr.clone_df.iloc[r['neighbors'], ]
            seqs = neighbor_rows['cdr3_b_aa'].to_list()
            # Rows of [v_b_gene, j_b_gene, count] for this neighborhood
            gene_usages = neighbor_rows[[
                'v_b_gene', 'j_b_gene'
            ]].value_counts().reset_index().to_dict('split')['data']
            depth = 3
            # References are generated from this neighborhood's own beta gene
            # usage; the original iterated `combos_alpha`, a name never
            # defined in this function, and left gene_usages unused.
            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), count * depth)
                for v, j, count in gene_usages
            ])
            refs = [x for x in refs if x is not None]
            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # Background logo: the references themselves, centroid included
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])
            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]
            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')
        oh.write('</body>')
def test_example_9():
    """
    If you already have a clones file and want to compute 'tcrdistances'
    on a DataFrame with custom column names:

    1. Assign TCRrep.clone_df
    2. Set infer_cdrs = False
    3. Set compute_distances = False
    4. Set deduplicate = False
    5. Re-key the metrics, weights, and kargs dictionaries with the new
       column names
    6. Call .compute_distances()
    """
    import pwseqdist as pw
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    new_cols = {
        'cdr3_a_aa': 'c3a',
        'pmhc_a_aa': 'pa',
        'cdr2_a_aa': 'c2a',
        'cdr1_a_aa': 'c1a',
        'cdr3_b_aa': 'c3b',
        'pmhc_b_aa': 'pb',
        'cdr2_b_aa': 'c2b',
        'cdr1_b_aa': 'c1b'
    }
    df = pd.read_csv("dash2.csv").rename(columns=new_cols)
    tr = TCRrep(cell_df=df,
                clone_df=df,              #(1)
                organism='mouse',
                chains=['alpha', 'beta'],
                infer_all_genes=True,
                infer_cdrs=False,         #(2)
                compute_distances=False,  #(3)
                deduplicate=False,        #(4)
                db_file='alphabeta_gammadelta_db.tsv')

    def customize(d):
        # Same values, keyed by the custom column names         #(5)
        return {new_cols[k]: v for k, v in d.items()}

    for attr in ('metrics_a', 'metrics_b',
                 'weights_a', 'weights_b',
                 'kargs_a', 'kargs_b'):
        setattr(tr, attr, customize(getattr(tr, attr)))

    tr.compute_distances()                #(6)
    # Notice that pairwise results now have custom names
    tr.pw_c3b
    tr.pw_c3a
    tr.pw_alpha
    tr.pw_beta
def test_alpha_beta():
    """
    Compare the out-of-memory sparse distance computation and neighborhood
    tally with the in-memory results for a 100-clone alpha/beta sample.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory2
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory2
    from hierdiff.association_testing import cluster_association_test

    dash = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=dash.sample(100, random_state=1),
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True,
                store_all_cdr=False)

    # Reference copies of the in-memory matrices, with zeros replaced by 1
    check_beta = tr.pw_beta.copy()
    check_beta[check_beta == 0] = 1
    check_alpha = tr.pw_alpha.copy()
    check_alpha[check_alpha == 0] = 1
    check_alpha_beta = check_beta + check_alpha

    S, fragments = compute_pw_sparse_out_of_memory2(tr=tr,
                                                    row_size=50,
                                                    pm_processes=1,
                                                    pm_pbar=True,
                                                    max_distance=1000,
                                                    reassemble=True,
                                                    cleanup=False,
                                                    assign=True)
    assert np.all(tr.pw_beta == check_beta)
    assert np.all(tr.pw_alpha == check_alpha)

    ndif1 = compute_n_tally_out_of_memory2(fragments,
                                           to_file=False,
                                           to_memory=True,
                                           pm_processes=2,
                                           x_cols=['epitope'],
                                           count_col='count',
                                           knn_neighbors=None,
                                           knn_radius=100)
    ndif1 = cluster_association_test(res=ndif1,
                                     y_col='cmember',
                                     method='chi2')

    from tcrdist.rep_diff import neighborhood_diff
    combined = np.array(tr.pw_beta.todense() + tr.pw_alpha.todense())
    ndif2 = neighborhood_diff(clone_df=tr.clone_df,
                              pwmat=combined,
                              count_col='count',
                              x_cols=['epitope'],
                              knn_radius=100,
                              test_method="chi2")
    assert ndif1.shape == ndif2.shape
    # NOTE(review): this compares ndif2 with itself and the result is not
    # asserted -- presumably ndif1 vs ndif2 was intended; confirm before
    # turning it into an assertion.
    np.all(ndif2['FDRq'].to_list() == ndif2['FDRq'].to_list())
def test_calc_radii_if_big():
    """
    Sparse variant of the calc_radii example: the foreground pairwise matrix
    comes from compute_sparse_rect_distances, and calc_radii is run with
    use_sparse=True against the non-"PA" background.
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.centers import calc_radii
    from tcrdist.public import _neighbors_sparse_variable_radius

    target_df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop = True)
    tr = TCRrep(cell_df = target_df.copy(),
                organism = 'mouse',
                chains = ['beta'],
                db_file = 'alphabeta_gammadelta_db.tsv',
                compute_distances = False)

    # For a large matrix one can use compute_sparse_rect_distances() instead
    # of .compute_distances() for the pairwise square matrix
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df = tr.clone_df, radius = 50, chunk_size = 100)
    tr.pw_beta = tr.rw_beta.copy()

    bkgd_df = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop = True)
    tr_bkgd = TCRrep(cell_df = bkgd_df.copy(),
                     organism = 'mouse',
                     chains = ['beta'],
                     db_file = 'alphabeta_gammadelta_db.tsv',
                     compute_distances = False)

    # Set rw_beta to none as it will be computed between target and
    # background by calc_radii
    tr.rw_beta = None
    radii, thresholds, ecdfs = calc_radii(tr = tr,
                                          tr_bkgd = tr_bkgd,
                                          chain = 'beta',
                                          ctrl_bkgd = 10**-3,
                                          use_sparse = True,
                                          max_radius = 50)
    tr.clone_df['radius'] = radii

    # Neighbors within each clone's own radius, in the foreground (pw_beta)
    # and in the background (rw_beta)
    fg_neighbors = _neighbors_sparse_variable_radius(csrmat = tr.pw_beta,
                                                     radius_list = tr.clone_df['radius'])
    bg_neighbors = _neighbors_sparse_variable_radius(csrmat = tr.rw_beta,
                                                     radius_list = tr.clone_df['radius'])
    tr.clone_df['neighbors'] = fg_neighbors
    tr.clone_df['background_neighbors'] = bg_neighbors

    def _count_subjects(idx):
        # distinct subjects among the rows listed in idx
        return tr.clone_df['subject'].iloc[idx].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
    tr.clone_df