def _default_tcrsampler_human_beta(default_background=None,
                                   default_background_if_missing=None):
    """
    Build the default human beta-chain TCRsampler.

    Parameters
    ----------
    default_background : str or None
        Background name handed to ``TCRsampler``. When None,
        'britanova_human_beta_t_cb.tsv.sampler.tsv' is used.
    default_background_if_missing : str or None
        Archive name passed to ``download_background_file`` when building
        the sampler raises OSError. When None,
        'britanova_human_beta_t_cb.tsv.sampler.tsv.zip' is used.

    Returns
    -------
    t : tcrsampler.sampler.TCRsampler
    """
    from tcrsampler.sampler import TCRsampler

    background = default_background
    if background is None:
        background = 'britanova_human_beta_t_cb.tsv.sampler.tsv'
    archive = default_background_if_missing
    if archive is None:
        archive = 'britanova_human_beta_t_cb.tsv.sampler.tsv.zip'

    print(background)
    try:
        return TCRsampler(default_background=background)
    except OSError:
        # Construction raised OSError (presumably the background file is not
        # installed): download the archive, then retry once.
        TCRsampler().download_background_file(archive)
        return TCRsampler(default_background=background)
def _default_tcrsampler_mouse_beta(default_background=None,
                                   default_background_if_missing=None):
    """
    Build the default mouse beta-chain TCRsampler.

    Parameters
    ----------
    default_background : str or None
        Background name handed to ``TCRsampler``. When None,
        'ruggiero_mouse_beta_t.tsv.sampler.tsv' is used.
    default_background_if_missing : str or None
        Archive name passed to ``download_background_file`` when building
        the sampler raises OSError. When None, 'ruggiero_mouse_sampler.zip'
        is used.

    Returns
    -------
    t : tcrsampler.sampler.TCRsampler
    """
    from tcrsampler.sampler import TCRsampler

    background = default_background
    if background is None:
        background = 'ruggiero_mouse_beta_t.tsv.sampler.tsv'
    archive = default_background_if_missing
    if archive is None:
        archive = 'ruggiero_mouse_sampler.zip'

    print(background)
    try:
        return TCRsampler(default_background=background)
    except OSError:
        # Construction raised OSError: download the archive, then retry once.
        TCRsampler().download_background_file(archive)
        return TCRsampler(default_background=background)
def test_TCRsampler_init_default():
    """A sampler built from the Britanova background exposes its reference data."""
    background = 'britanova_human_beta_t_cb.tsv.sampler.tsv'
    sampler = TCRsampler(default_background=background)

    assert sampler.default_bkgd == background
    assert isinstance(sampler.ref_df, pd.DataFrame)
    assert isinstance(sampler.ref_dict, dict)
    # The V-gene frequency table must contain this allele.
    v_genes = sampler.v_freq.keys()
    assert 'TRBV2*01' in v_genes
def test_prob_sampler_sample_key_warn():
    """
    Sampling with the V gene key 'TRBV999*01' yields ``[[None]]``.

    Any warnings emitted by ``sample`` are recorded so they do not leak into
    the test run; as before, the test does not require that one is raised.
    """
    import warnings

    t = TCRsampler()
    fn = os.path.join('tcrsampler', 'tests', 'pmbc_mixcr_example_data.txt')
    t.clean_mixcr(filename=fn)
    t.build_background()
    # pytest.warns(None) is deprecated since pytest 7 and rejected by pytest 8.
    # warnings.catch_warnings(record=True) is the documented replacement for
    # "record warnings without asserting that any was emitted".
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        r = t.sample([['TRBV999*01', 'TRBJ2-7*01', 2]])
    assert r == [[None]]
def test_prob_sampler_sample():
    """``sample`` returns the expected CDR3s, nested by request or flattened."""
    data_path = os.path.join('tcrsampler', 'tests',
                             'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=data_path)
    sampler.build_background()

    expected_first = ['CASSRTGSLADEQYF', 'CASSATGVVSAQYF']
    expected_second = [
        'CASSLGQAARGIQYF', 'CASSLGQAARGIQYF', 'CASSLGQAARGIQYF',
        'CASSLGQAARGIQYF'
    ]

    # One request: one inner list of sampled CDR3s.
    nested = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2]])
    assert nested == [expected_first]

    # Same request with flatten=True: a single flat list.
    flat = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2]], flatten=True)
    assert flat == expected_first

    # Two requests: one inner list per request, in request order.
    both = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2],
                           ['TRBV7-7*01', 'TRBJ2-4*01', 4]])
    assert both == [expected_first, expected_second]
def make_flat_vj_background(ts=None,
                            n=200,
                            size=100000,
                            cols=['v_b_gene', 'j_b_gene']):
    """
    Make a flat background in which every V,J pair is equally represented.

    Parameters
    ----------
    ts : TCRsampler
        Sampler used by ``get_gene_frequencies`` to annotate the result.
        When None, one is built from
        'britanova_human_beta_t_cb.tsv.sampler.tsv'.
    n : int
        Default 200, forwarded to ``sim_all_cdr3_gen`` (number of TCRs to
        generate using Olga).
    size : int
        Default 100000, target number of rows; the per-pair sample size is
        ``ceil(size / number_of_VJ_pairs)``, capped at the smallest pair.
    cols : list
        Default ['v_b_gene','j_b_gene'], the grouping columns.

    Returns
    -------
    df : DataFrame
        The sampled rows after ``get_gene_frequencies`` has been applied.

    Raises
    ------
    ValueError
        If ``size / n < 135``.
        NOTE(review): the message suggests a value of ``size/1000``; which
        argument it refers to is not evident from the code - confirm.
    """
    import math

    if size / n < 135:
        raise ValueError(
            f"Based on size = {size}, increase to alteast {size/1000} to have sufficient TCRs per VJ pairing"
        )

    if ts is None:
        from tcrsampler.sampler import TCRsampler
        ts = TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')

    simulated, _ = sim_all_cdr3_gen(n=n)
    pool = pd.concat(simulated)
    pool = pool[pool.cdr3_b_aa.notna()]

    # Rows available per V,J pair; every pair contributes the same number of
    # rows, limited by the rarest pair.
    pair_sizes = pool.groupby(cols).size()
    per_pair = min(pair_sizes.min(), math.ceil(size / pair_sizes.shape[0]))

    sampled = [group.sample(per_pair) for _, group in pool.groupby(cols)]
    df = pd.concat(sampled).reset_index(drop=True)
    return get_gene_frequencies(ts=ts, df=df, cols=cols)
def test_background_generation_in_mira_60(fn=os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv')):
    """
    Build a 200,000-row background for the target TCRs read from ``fn``.

    1. Load the Britanova TCRsampler and replace its gene-occurrence
       frequencies with subject-stratified estimates (replace=True).
    2. Make a V,J gene-usage matched background for the target repertoire.
    3. Take a subject-stratified random draw via ``sample_britanova``.
    4. Concatenate both parts.

    Returns
    -------
    df_bkgd : DataFrame
        The combined background.
    """
    import sys
    import os
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (calculate_adjustment,
                                    get_gene_frequencies,
                                    get_stratified_gene_usage_frequency,
                                    make_flat_vj_background,
                                    make_gene_usage_counter,
                                    make_vj_matched_background,
                                    sample_britanova)

    # 1
    sampler = get_stratified_gene_usage_frequency(
        ts=TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv'),
        replace=True)

    target = pd.read_csv(fn)[['v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    usage = make_gene_usage_counter(target)

    # 2: request more than needed (150000), then keep exactly 100000 rows.
    matched = make_vj_matched_background(
        ts=sampler,
        gene_usage_counter=usage,
        size=150000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    matched = matched.sample(100000).reset_index(drop=True)
    matched['weights'] = calculate_adjustment(df=matched, adjcol="pVJ")
    matched['source'] = "vj_matched"

    # 3
    random_draw = get_gene_frequencies(ts=sampler,
                                       df=sample_britanova(size=100000))
    random_draw['weights'] = 1
    random_draw['source'] = "stratified_random"

    # 4
    df_bkgd = pd.concat([matched, random_draw], axis=0)
    df_bkgd = df_bkgd.reset_index(drop=True)
    assert df_bkgd.shape[0] == 200000
    return df_bkgd
def test_TCRsampler_build():
    """After build_background, ref_dict is a dict whose values are DataFrames."""
    data_path = os.path.join('tcrsampler', 'tests',
                             'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=data_path)
    sampler.build_background()

    assert isinstance(sampler.ref_dict, dict)
    # popitem removes one (key, value) pair; only the value type is checked.
    _, frame = sampler.ref_dict.popitem()
    assert isinstance(frame, pd.DataFrame)
def test_prob_sampler_sample_background():
    """``sample_background`` returns the expected ten CDR3s for one V,J pair."""
    data_path = os.path.join('tcrsampler', 'tests',
                             'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=data_path)
    sampler.build_background()

    expected = [
        'CASSRTGSLADEQYF',
        'CASSATGVVSAQYF',
        'CASSAWGQVYEQYF',
        'CASSVSGSPYEQYF',
        'CASSAWGQVYEQYF',
        'CASSAWGQVYEQYF',
        'CASRWGEQYF',
        'CASSGDDWEQYF',
        'CASSATGTSGPYEQYF',
        'CASSSRTSGSNSEQYF',
    ]
    drawn = sampler.sample_background('TRBV9*01', 'TRBJ2-7*01', n=10)
    assert drawn == expected
def test_ex12():
    """
    Example: build a TCRsampler background from a CSV and draw from it.

    Uses the 5000-row test file; the full data set is
    'britanova_chord_blood.csv'.
    """
    import pandas as pd
    import os
    from tcrsampler.sampler import TCRsampler

    csv_path = os.path.join('tcrdist', 'test_files',
                            'britanova_chord_blood_sample_5000.csv')
    sampler = TCRsampler()
    sampler.ref_df = pd.read_csv(csv_path)
    sampler.build_background()

    # Touch each frequency table; a missing attribute raises AttributeError.
    for table_name in ('v_freq', 'j_freq', 'vj_freq'):
        getattr(sampler, table_name)

    sampler.sample_background(v='TRBV10-1*01',
                              j='TRBJ1-1*01',
                              n=3,
                              depth=1,
                              seed=1,
                              use_frequency=True)
def test_TCRsampler_build_vj_components():
    """Every frequency table built by build_background sums to 1.0."""
    data_path = os.path.join('tcrsampler', 'tests',
                             'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=data_path)
    sampler.build_background()

    tables = ('vj_freq', 'j_freq', 'v_freq', 'vj_occur_freq', 'v_occur_freq',
              'j_occur_freq')
    for table_name in tables:
        frequencies = [
            value for _, value in getattr(sampler, table_name).items()
        ]
        assert np.isclose(np.sum(frequencies), 1.0)
def _default_tcrsampler_human_alpha(default_background=None,
                                    default_background_if_missing=None):
    """
    Build the default human alpha-chain TCRsampler.

    Parameters
    ----------
    default_background : str or None
        Background name handed to ``TCRsampler``. When None,
        'ruggiero_human_alpha_t.tsv.sampler.tsv' is used.
    default_background_if_missing : str or None
        Archive name passed to ``download_background_file`` when building
        the sampler raises OSError. When None,
        'ruggiero_human_alpha_t.tsv.sampler.tsv.zip' is used.

    Returns
    -------
    t : tcrsampler.sampler.TCRsampler
    """
    from tcrsampler.sampler import TCRsampler

    background = default_background
    if background is None:
        background = 'ruggiero_human_alpha_t.tsv.sampler.tsv'
    archive = default_background_if_missing
    if archive is None:
        archive = 'ruggiero_human_alpha_t.tsv.sampler.tsv.zip'

    print(background)
    try:
        return TCRsampler(default_background=background)
    except OSError:
        # Construction raised OSError: download the archive, then retry once.
        TCRsampler().download_background_file(archive)
        return TCRsampler(default_background=background)
# Every row of <df> gets the same strain label.
df['strain'] = 'C57BL6 inbred mouse strain'
print(df)
# NOTE(review): absolute path to a local external volume; this script only
# runs where that file is mounted.
wirasinha = pd.read_csv(
    '/Volumes/Samsung_T5/kmayerbl/tcr_data/wirasinha/Wirasinha.migec.txt',
    sep='\t')
# One output pair per row of <df>: a tab-separated subset file named
# row['filename'], and a '<filename>.sampler.tsv' written from TCRsampler.ref_df.
for i, row in df.iterrows():
    sdf = subset_wirasinha(df=wirasinha,
                           subset=row['subset'],
                           tcr_b=row['tcr_b'],
                           chain=row['chain'])
    # Apply _pick_best element-wise to the 'v' and 'j' columns ...
    sdf[['bestv', 'bestj']] = sdf[['v',
                                   'j']].apply(lambda x: x.apply(_pick_best))
    # ... then apply _strip_allele element-wise to the results.
    sdf[['bestv',
         'bestj']] = sdf[['bestv',
                          'bestj']].apply(lambda x: x.apply(_strip_allele))
    # Rename columns using the wirasinha_to_mixcr_headers mapping.
    sdf = sdf.rename(columns=wirasinha_to_mixcr_headers)
    sys.stdout.write(f"Writing {row['filename']}\n")
    sdf.to_csv(row['filename'], sep="\t")
    sys.stdout.write(
        f"Testing {row['filename']} for import into TCRsampler\t")
    # Round-trip check: read the file just written back through TCRsampler.
    t = TCRsampler()
    t.clean_mixcr(filename=row['filename'])
    t.build_background()
    print("\n")
    print(t.ref_df.head(3))
    name = f"{row['filename']}.sampler.tsv"
    sys.stdout.write(f"Writing {name} \t")
    t.ref_df.to_csv(name, sep="\t", index=False)
mixcr exportClones -cloneId -count -fraction -vGene -jGene -vHit -jHit -vHits -jHits -aaFeature CDR3 -nFeature CDR3 SRR2079522.1.clns SRR2079522.1.clns.best.txt -f mixcr exportAlignments SRR2079522.1.vdjca SRR2079522.1.vdjca.txt -f ``` #### Files Available For Download Beta: [SRR2079522.1.clns.best.txt](https://www.dropbox.com/s/czcewp7x7auwdsu/SRR2079522.1.clns.best.txt?dl=1) Alpha: [SRR2079521.1.clns.best.txt](https://www.dropbox.com/s/k4i0mt0cwhcn1h7/SRR2079521.1.clns.best.txt?dl=1) """ from tcrsampler.sampler import TCRsampler fn = 'SRR2079522.1.clns.best.subject.txt' t = TCRsampler() t.clean_mixcr(fn) t.build_background() t.ref_df t.ref_df.to_csv('ruggiero_mouse_beta_t.tsv.sampler.tsv', sep="\t", index=False) fn = 'SRR2079521.1.clns.best.subject.txt' t = TCRsampler() t.clean_mixcr(fn) t.build_background() t.ref_df t.ref_df.to_csv('ruggiero_mouse_alpha_t.tsv.sampler.tsv', sep="\t", index=False)
# Example: build a subject-stratified TCRsampler background from a CSV in the
# current working directory and draw sequences from it.
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler
t = TCRsampler()
# os.path.join with a single argument returns that argument unchanged.
fn = os.path.join('emerson_cmv_negative.csv')
t.ref_df = pd.read_csv(fn)
t.build_background(max_rows=100, stratify_by_subject=True)
# NOTE(review): the second element of each request repeats the V gene name
# with an extra '*01' suffix rather than naming a TRBJ gene - confirm this is
# the intended key.
t.sample(
    [['TRBV10-2*01', 'TRBV10-2*01*01', 1], ['TRBV27*01', 'TRBV27*01*01', 4]],
    depth=10)
# Print each ref_dict key with the row count of its value.
for k, v in t.ref_dict.items():
    print(k, v.shape[0])
def test_background_generation_toy_example():
    """
    Build a 200,000-row background for a toy target repertoire.

    The target is a set of 25 TCRs sampled from the V,J pairings listed in
    ``ix`` (25 only, because distances are later computed against millions
    of sequences).

    1. Load the Britanova TCRsampler and replace its gene-occurrence
       frequencies with subject-stratified estimates (replace=True).
    2. Make a V,J gene-usage matched background for the target repertoire.
    3. Take a subject-stratified random draw via ``sample_britanova``.
    4. Concatenate both parts, then check that V,J usage in the matched
       background agrees with the target to within 0.001.

    Returns
    -------
    df_bkgd : DataFrame
        The combined background.
    """
    import sys
    import os
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (calculate_adjustment,
                                    get_gene_frequencies,
                                    get_stratified_gene_usage_frequency,
                                    make_flat_vj_background,
                                    make_gene_usage_counter,
                                    make_vj_matched_background,
                                    sample_britanova)

    # 1
    ts = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # [V gene, J gene, number of TCRs to draw]
    ix = [['TRBV19*01', 'TRBJ2-5*01', 3], ['TRBV24-1*01', 'TRBJ2-4*01', 3],
          ['TRBV25-1*01', 'TRBJ2-4*01', 3], ['TRBV30*01', 'TRBJ2-3*01', 2],
          ['TRBV5-4*01', 'TRBJ2-3*01', 2], ['TRBV11-2*01', 'TRBJ2-2*01', 2],
          ['TRBV2*01', 'TRBJ1-5*01', 1], ['TRBV12-5*01', 'TRBJ2-7*01', 1],
          ['TRBV4-1*01', 'TRBJ1-6*01', 1], ['TRBV6-5*01', 'TRBJ1-6*01', 1],
          ['TRBV13*01', 'TRBJ2-3*01', 1], ['TRBV18*01', 'TRBJ2-3*01', 1],
          ['TRBV14*01', 'TRBJ2-7*01', 1], ['TRBV6-6*01', 'TRBJ2-7*01', 1],
          ['TRBV10-3*01', 'TRBJ2-3*01', 1], ['TRBV7-2*01', 'TRBJ2-1*01', 1],
          ['TRBV5-1*01', 'TRBJ2-1*01', 1]]

    def flatten(nested):
        """Concatenate the inner lists of ``nested`` into one list."""
        flat = []
        for inner in nested:
            flat.extend(inner)
        return flat

    target_parts = []
    for v_gene, j_gene, count in ix:
        target_parts.append(
            pd.DataFrame({
                'cdr3_b_aa': flatten(ts.sample([[v_gene, j_gene, count]])),
                'v_b_gene': v_gene,
                'j_b_gene': j_gene
            }))
    df_target = pd.concat(target_parts).reset_index(drop=True)
    usage = make_gene_usage_counter(df_target)

    # 2: request slightly more than needed (101000), then keep 100000 rows.
    matched = make_vj_matched_background(
        ts=ts,
        gene_usage_counter=usage,
        size=101000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    matched = matched.sample(100000).reset_index(drop=True)
    matched['weights'] = calculate_adjustment(df=matched, adjcol="pVJ")
    matched['source'] = "vj_matched"

    # 3
    random_draw = get_gene_frequencies(ts=ts,
                                       df=sample_britanova(size=100000))
    random_draw['weights'] = 1
    random_draw['source'] = "stratified_random"

    # 4
    df_bkgd = pd.concat([matched, random_draw], axis=0)
    df_bkgd = df_bkgd.reset_index(drop=True)
    assert df_bkgd.shape[0] == 200000

    # Compare V,J usage fractions: column 0 is the matched background,
    # column 1 the target.
    pair = ['v_b_gene', 'j_b_gene']
    matched_usage = matched.groupby(pair).size() / matched.shape[0]
    target_usage = df_target.groupby(pair).size() / df_target.shape[0]
    df_check_match = pd.concat([matched_usage, target_usage], axis=1)
    difference = abs(df_check_match[0] - df_check_match[1])
    assert np.all(difference < 0.001)
    return df_bkgd
def test_quick_pipeline_with_fragmented_compute():
    """
    Show how tcrdist3 can be used to look for TCRs that may be HLA restricted.

    Steps performed:

    1. Read the MIRA example CSV under tcrdist/data/covid19 and build a
       beta-chain ``TCRrep`` without computing distances.
    2. Call ``compute_pw_sparse_out_of_memory`` (matrix 'rw_beta',
       max_distance 1000) to obtain the matrix ``S`` and its fragments.
    3. Label every clone "B*07" or "NOTB*07 " from the two hla-b columns and
       tally neighbors per label with ``compute_n_tally_out_of_memory``
       (knn_radius 25).
    4. Run ``cluster_association_test`` (method 'fishers'), sort by p-value
       and keep the rows flagged as mostly unique.
    5. For the first 25 remaining rows build a background-adjusted and a raw
       palmotif logo and write them, with a details table, to
       'svgs_in_line.html' in the current working directory.

    Side effects: may download 'olga_sampler.zip' through TCRsampler and
    writes 'svgs_in_line.html'.
    """
    import ast
    import os
    import pandas as pd
    from hierdiff.association_testing import cluster_association_test
    from palmotif import compute_pal_motif, svg_logo
    from tcrdist.rep_diff import member_summ
    from tcrdist.rep_funcs import (compute_n_tally_out_of_memory,
                                   compute_pw_sparse_out_of_memory)
    from tcrdist.repertoire import TCRrep
    from tcrdist.summarize import _select, filter_is, test_for_almost_subsets
    from tcrsampler.sampler import TCRsampler

    f = 'mira_epitope_67_382_APHGVVFL_APHGVVFLHV_GVVFLHVTY_VVFLHVTYV.tcrdist3.csv'
    f = os.path.join('tcrdist', 'data', 'covid19', f)
    assert os.path.isfile(f)
    df = pd.read_csv(f)
    df = df[[
        'subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
        'cdr3_b_nucseq', 'cohort', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1'
    ]]
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False,
                store_all_cdr=False)

    S, fragments = compute_pw_sparse_out_of_memory(tr=tr,
                                                   row_size=100,
                                                   pm_processes=2,
                                                   pm_pbar=True,
                                                   max_distance=1000,
                                                   matrix_name='rw_beta',
                                                   reassemble=True,
                                                   cleanup=False)

    # A clone is B*07 when either hla-b column starts with "B*07".
    is_b07 = (tr.clone_df['hla-b'].str.startswith("B*07")
              | tr.clone_df['hla-b_1'].str.startswith("B*07"))
    tr.clone_df['B07'] = ["B*07" if x else "NOTB*07 " for x in is_b07]

    nn_tally_df_cohort = compute_n_tally_out_of_memory(fragments,
                                                       matrix_name="rw_beta",
                                                       pm_processes=6,
                                                       to_file=False,
                                                       to_memory=True,
                                                       knn_radius=25,
                                                       x_cols=['B07'])

    nn_associations = cluster_association_test(res=nn_tally_df_cohort,
                                               y_col='cmember',
                                               method='fishers')
    nn_associations = nn_associations.sort_values('pvalue', ascending=True)

    # <neighbors> holds the string form of a list; parse it back to a list.
    nn_associations['neighbors_i'] = nn_associations.neighbors.apply(
        ast.literal_eval)
    nn_associations['mostly_unique'] = test_for_almost_subsets(
        nn_associations['neighbors_i'], thr=5)
    nr_nn_associations = filter_is(nn_associations, 'mostly_unique', 1).copy()

    # MOTIF GENERATION
    t = TCRsampler()
    # Download the OLGA background only if it is not already available.
    if 'olga_human_beta_t.sampler.tsv' not in t.currently_available_backgrounds():
        t.download_background_file('olga_sampler.zip')
    tcrsampler_beta = TCRsampler(
        default_background='olga_human_beta_t.sampler.tsv')
    tcrsampler_beta.build_background(max_rows=1000)

    # SEE PALMOTIF DOCS (https://github.com/agartland/palmotif)
    cdr3_name = 'cdr3_b_aa'
    gene_names = ['v_b_gene', 'j_b_gene']
    svgs_beta = []
    svgs_beta_raw = []
    info_list = []

    summary = member_summ(
        res_df=nr_nn_associations,
        clone_df=tr.clone_df,
        addl_cols=['cohort', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1', 'subject'])
    nr_nn_associations = pd.concat([nr_nn_associations, summary],
                                   axis=1).reset_index()

    for _, r in nr_nn_associations.head(25).iterrows():
        dfnode = tr.clone_df.iloc[r['neighbors_i'], :].copy()
        # <pwnode> Pairwise matrix restricted to the node's sequences.
        pwnode = S[r['neighbors_i'], :][:, r['neighbors_i']].todense()
        if dfnode.shape[0] > 2:
            # Centroid: the sequence whose column of <pwnode> has the
            # smallest sum.
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]
        else:
            centroid = dfnode[cdr3_name].to_list()[0]
        print(f"CENTROID: {centroid}")

        # Sample reference CDR3s matching the node's V,J usage.
        gene_usage_beta = dfnode.groupby(gene_names).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True,
            depth=max(100, 1000 // dfnode.shape[0]))
        sampled_rep = [x for x in sampled_rep if x is not None]

        node_seqs = _select(df=tr.clone_df,
                            iloc_rows=r['neighbors_i'],
                            col=cdr3_name)
        motif, stat = compute_pal_motif(seqs=node_seqs,
                                        refs=sampled_rep,
                                        centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

        # The former `sampled_rep = sampled_rep.append(centroid)` rebound the
        # list to None (list.append returns None) and the value was never
        # read again, so the statement is dropped.
        motif_raw, _ = compute_pal_motif(seqs=_select(
            df=tr.clone_df, iloc_rows=r['neighbors_i'], col=cdr3_name),
                                         centroid=centroid)
        svgs_beta_raw.append(svg_logo(motif_raw, return_str=True))
        info_list.append(r)

    def to_html_table(r,
                      vals=[
                          'ct_columns', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1',
                          'val_0', 'ct_0', 'val_2', 'ct_2', 'K_neighbors',
                          'R_radius', 'pvalue', 'FDRq', 'cdr3_b_aa',
                          'v_b_gene', 'j_b_gene', 'cohort', 'subject'
                      ]):
        """Render the selected fields of row ``r`` as a one-row HTML table."""
        return pd.DataFrame(r[vals]).transpose().to_html()

    def shrink(html_str):
        """Scale an SVG string from 100% to 10% height and width."""
        return html_str.replace('height="100%"', 'height="10%"').\
            replace('width="100%"', 'width="10%"')

    with open('svgs_in_line.html', 'w') as fh:
        fh.write("<html><body>\n")
        for svg, svg_raw, details in zip(svgs_beta, svgs_beta_raw, info_list):
            fh.write(f"{shrink(svg_raw)}{shrink(svg)}")
            try:
                fh.write(to_html_table(details))
            except Exception:
                # Best effort: a row whose details cannot be rendered is
                # skipped, but KeyboardInterrupt/SystemExit now propagate.
                print("F")
            fh.write("<div></div>")
        # Close the tags in the reverse of the order they were opened.
        fh.write("</body></html>\n")
# Example: build a TCRsampler background from a CSV in the current working
# directory and draw sequences from it.
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler
t = TCRsampler()
# os.path.join with a single argument returns that argument unchanged.
fn = os.path.join('britanova_chord_blood.csv')
t.ref_df = pd.read_csv(fn)
t.build_background(max_rows=1000)
# NOTE(review): the second element of each request repeats the V gene name
# with an extra '*01' suffix rather than naming a TRBJ gene - confirm this is
# the intended key.
t.sample(
    [['TRBV10-2*01', 'TRBV10-2*01*01', 1], ['TRBV27*01', 'TRBV27*01*01', 4]],
    depth=10)
# Print each ref_dict key with the row count of its value.
for k, v in t.ref_dict.items():
    print(k, v.shape[0])
how='left', on='subject') dfd['freq'] = dfd['freq_x'] / dfd['freq_y'] print(dfd[['freq', 'subject']].groupby(['subject']).sum()) # Test that these will work with TCRsampler from tcrsampler.sampler import TCRsampler from tcrdist import repertoire_db ref = repertoire_db.RefGeneSet(db_file='alphabeta_gammadelta_db.tsv') ref.generate_all_genes() ref.all_genes ref.all_genes['human'].keys() tsd = TCRsampler() tsd.ref_df = dfd tsd.build_background() # find potential missing: print([x for x in tsd.v_freq.keys()]) print([x for x in tsd.v_freq.keys() if x not in ref.all_genes['human'].keys()]) assert len([ x for x in tsd.v_freq.keys() if x not in ref.all_genes['human'].keys() ]) == 0 print([x for x in tsd.j_freq.keys()]) print([x for x in tsd.j_freq.keys() if x not in ref.all_genes['human'].keys()]) assert len([ x for x in tsd.j_freq.keys() if x not in ref.all_genes['human'].keys() ]) == 0 tsg = TCRsampler()
def test_v_j_freq_estimates():
    """
    V and J occurrence frequencies count each row once.

    Five clones with five distinct V genes and three distinct J genes
    (1, 2 and 2 rows) must give 0.2 per V gene and 0.2 / 0.4 / 0.4 for the
    J genes.
    """
    columns = {
        'Unnamed: 0': [0, 1, 2, 3, 4],
        'v_reps': [
            'TRBV24-1*01', 'TRBV5-1*01', 'TRBV7-2*01', 'TRBV3-1*01',
            'TRBV7-3*01'
        ],
        'j_reps': [
            'TRBJ2-1*01', 'TRBJ2-5*01', 'TRBJ2-3*01', 'TRBJ2-5*01',
            'TRBJ2-3*01'
        ],
        'cdr3': [
            'CATRQDNEQFF', 'CASSLEETQYF', 'CASSLADTQYF', 'CASSQETQYF',
            'CASSLAGGTDTQYF'
        ],
        'count': [252, 166, 113, 98, 89],
        'freq': [
            0.0003726818302818776, 0.0002454967612174273,
            0.00016711526516608003, 0.00014493182288739684,
            0.00013162175752018694
        ],
        'subject': ['A5-S11.txt'] * 5,
    }
    # Same {column: {row_index: value}} mapping as a literal dict of dicts.
    d = {name: dict(enumerate(values)) for name, values in columns.items()}

    sampler = TCRsampler()
    sampler.ref_df = pd.DataFrame(d)
    sampler.build_background()

    expected_v = {
        'TRBV3-1*01': 0.2,
        'TRBV5-1*01': 0.2,
        'TRBV7-2*01': 0.2,
        'TRBV7-3*01': 0.2,
        'TRBV24-1*01': 0.2
    }
    expected_j = {'TRBJ2-1*01': 0.2, 'TRBJ2-3*01': 0.4, 'TRBJ2-5*01': 0.4}
    assert sampler.v_occur_freq == expected_v
    assert sampler.j_occur_freq == expected_j
def test_dash_ecdf():
    """
    An empirical distribution function (ECDF) can be created
    for a target TCR and a reference set of TCRs to show
    the proportion of reference TCRs that are within a distance
    D of the target TCR, over a range of distances.

    A plot of the ECDF as a function of increasing D shows the
    density of TCR space in the reference set in the neighborhood
    around the target TCR. This can be very helpful for
    identifying dense antigen-specific clusters in an antigen
    enriched TCR repertoire, where the "reference" set is
    actually an experimentally enriched repertoire (e.g.
    pMHC:tetramer or AIM sorting). Or the ECDF can be helpful
    for identifying a radius around a TCR that retains high
    antigen specificity, by showing that the neighborhood
    is extremely sparse in a large unsorted/bulk TCR repertoire.

    This example reads 'dash.csv' from the current working directory,
    downloads a sampler archive through TCRsampler and draws three
    matplotlib figures; it makes no assertions.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrsampler.sampler import TCRsampler
    from tcrdist.ecdf import distance_ecdf, make_ecdf_step
    from tcrdist.background import make_gene_usage_counter, make_vj_matched_background, \
        make_flat_vj_background, get_gene_frequencies, calculate_adjustment
    import matplotlib.pyplot as plt

    df = pd.read_csv('dash.csv')
    # Keep only the rows whose epitope is 'PB1'.
    df = df.loc[df['epitope'] == 'PB1']
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    TCRsampler.download_background_file(download_file='wiraninha_sampler.zip')
    cols = ['v_b_gene', 'j_b_gene']
    refs = []
    # i iterates over the characters '4' and '8', giving two sampler files.
    for ts_fn in [f'wirasinha_mouse_beta_s_{i}.tsv.sampler.tsv' for i in '48']:
        ts = TCRsampler(default_background=ts_fn)
        ts.build_background(stratify_by_subject=True, use_frequency=False)
        """Sanitize the alleles to *01 for TCRSampler"""
        tmp = df[cols].applymap(lambda s: s.split('*')[0] + '*01')
        # Number of target rows per (V, J) pair, as (v, j, count) records.
        freqs = tmp.groupby(cols).size()
        freq_records = list(freqs.to_frame().to_records())
        ref = ts.sample(freq_records, depth=10, seed=110820)
        # ref[i] is paired with the i-th (v, j, count) record.
        ref_df = pd.concat([
            pd.DataFrame({
                'cdr3_b_aa': ref[i]
            }).assign(v_b_gene=v, j_b_gene=j)
            for i, (v, j, _) in enumerate(freq_records)
        ])
        """Assigns pV, pJ and pVJ to ref_df"""
        ref_df = get_gene_frequencies(ts=ts, df=ref_df)
        xdf = freqs.reset_index()
        xdf.columns = ['v_b_gene', 'j_b_gene', 'n']
        """For each V,J pairing compute frequency in this reference"""
        xdf = xdf.assign(ref_freq=xdf['n'] / xdf['n'].sum())
        ref_df = ref_df.merge(xdf, how='left', on=cols).reset_index()
        """ Assign weights to ref sequences: Pr_actual / Pr_sampling"""
        ref_df = ref_df.assign(weights=ref_df['pVJ'] / ref_df['ref_freq'])
        refs.append(ref_df)
        """Add uniformly sampled sequences"""
        # NOTE(review): these rows come straight from ts.ref_df; whether they
        # carry the v_b_gene / j_b_gene / cdr3_b_aa / weights columns selected
        # below is not visible here - confirm.
        ref_df = ts.ref_df.sample(100, random_state=1)
        refs.append(ref_df)
    ref_df = pd.concat(refs, axis=0)
    ref_tr = TCRrep(cell_df=ref_df[cols + ['cdr3_b_aa', 'weights']],
                    organism='mouse',
                    chains=['beta'],
                    compute_distances=False,
                    store_all_cdr=False)
    # Distances between target clones and reference clones.
    # presumably this is what populates tr.rw_beta used below - verify.
    tr.compute_rect_distances(df=tr.clone_df, df2=ref_tr.clone_df, store=False)
    # Integer distance thresholds 1..49.
    thresholds = np.arange(1, 50)
    thresholds, ref_ecdf = distance_ecdf(tr.rw_beta,
                                         thresholds=thresholds,
                                         weights=ref_tr.clone_df['weights'] *
                                         ref_tr.clone_df['count'])
    thresholds, target_ecdf = distance_ecdf(tr.pw_beta,
                                            thresholds=thresholds,
                                            weights=None)
    # Figure 1: one gray step curve per row of ref_ecdf, column mean in red.
    figh = plt.figure(figsize=(5, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log')
    plt.ylabel(f'Proportion of reference TCRs')
    plt.xlabel(f'Distance from target TCR clone')
    for tari in range(ref_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, ref_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(ref_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)
    # Figure 2: the same layout for target_ecdf.
    figh = plt.figure(figsize=(5, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log')
    plt.ylabel(f'Proportion of target TCRs')
    plt.xlabel(f'Distance from target TCR clone')
    for tari in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, target_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)
    """Make an "ROC" plot combining the ECDF against the
    target (sensitivity) vs. ECDF against the reference (specificity)"""
    figh = plt.figure(figsize=(7, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log', xscale='log')
    plt.ylabel(f'Proportion of target TCRs')
    plt.xlabel(f'Proportion of reference TCRs')
    for tari in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(ref_ecdf[tari, :], target_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(np.mean(ref_ecdf, axis=0),
                          np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)
    # Capture the axis limits, draw the dashed identity line over the y-range,
    # then restore the limits so the line does not rescale the axes.
    yl = plt.ylim()
    xl = plt.xlim()
    #yl = (1e-6, 0.3)
    plt.plot(yl, yl, '--', color='gray')
    plt.xlim(xl)
    plt.ylim(yl)
def get_stratified_gene_usage_frequency(ts=None, replace=True):
    """
    Add subject-stratified V, J and V,J occurrence frequencies to a TCRsampler.

    Every row of ``ts.ref_df`` is weighted by
    ``1 / (rows of its subject) / (number of subjects)``, so each subject
    contributes equally; the weights are then summed per gene (pair).
    The results are stored on ``ts`` as ``vj_occur_freq_stratified``,
    ``v_occur_freq_stratified`` and ``j_occur_freq_stratified``.

    Parameters
    ----------
    ts : tcrsampler.sampler.TCRsampler
        Must expose ``ref_df`` with 'subject', 'v_reps' and 'j_reps' columns.
        When None, the 'britanova_human_beta_t_cb.tsv.sampler.tsv' sampler
        is loaded.
    replace : bool
        If True, ``ts.vj_occur_freq``, ``ts.v_occur_freq`` and
        ``ts.j_occur_freq`` are overwritten with the stratified dictionaries
        (a warning is issued for each) so other functions work as before.

    Returns
    -------
    ts : tcrsampler.sampler.TCRsampler
        The same instance, modified in place.
    """
    if ts is None:
        ts = TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')

    # (1 / unique TCRs of the subject) / number of subjects
    clones_per_subject = ts.ref_df.subject.value_counts()
    subject_weight = (1 / clones_per_subject) / len(clones_per_subject)
    ws_df = pd.DataFrame({
        'subject': subject_weight.index,
        'sweight': subject_weight
    }).reset_index(drop=True)

    # Left join so that every clone row carries its subject's weight.
    df = ts.ref_df.merge(ws_df, how='left', on='subject').copy()
    # All weights should sum to 1.0, up to rounding error.
    assert np.isclose(df.sweight.sum(), 1.0)

    def _weighted_rows(keys, label):
        """Sum the weights per ``keys`` group; rows are [*keys, weight]."""
        table = df[['sweight'] + keys].groupby(keys).sum().reset_index()
        table = table.rename(columns={'sweight': label})
        assert np.isclose(table[label].sum(), 1.0)
        return table.to_dict('split')['data']

    ts.vj_occur_freq_stratified = {
        (row[0], row[1]): row[2]
        for row in _weighted_rows(['v_reps', 'j_reps'], 'pVJ')
    }
    ts.v_occur_freq_stratified = {
        row[0]: row[1]
        for row in _weighted_rows(['v_reps'], 'pV')
    }
    ts.j_occur_freq_stratified = {
        row[0]: row[1]
        for row in _weighted_rows(['j_reps'], 'pJ')
    }

    if not replace:
        return ts

    notices = (
        "REPLACING ts.vj_occur_freq WITH ts.vj_occur_freq_stratified",
        "REPLACING ts.v_occur_freq WITH ts.v_occur_freq_stratified",
        "REPLACING ts.j_occur_freq WITH ts.j_occur_freq_stratified",
    )
    for notice in notices:
        warnings.warn(notice, stacklevel=2)
    ts.vj_occur_freq = ts.vj_occur_freq_stratified
    ts.v_occur_freq = ts.v_occur_freq_stratified
    ts.j_occur_freq = ts.j_occur_freq_stratified
    return ts
def _get_britanova_human_beta_chord_blood_subject_stratified_background(
        size=100000, random_state=24082020):
    """
    Produce a background, stratified by 8 subjects, of up to 960,000 TCR
    clones. Unique TCRs are returned without consideration of their clonal
    frequency.

    Parameters
    ----------
    size : int
        Size of background. ``ceil(size / 8)`` rows are drawn per subject.
    random_state : int
        Seed passed to ``DataFrame.sample`` for every subject.

    Returns
    -------
    DataFrame
        Columns 'v_b_gene', 'j_b_gene', 'cdr3_b_aa'.

    Raises
    ------
    ValueError
        If more than 120000 rows per subject would be needed. The smallest
        subject (A5-S15.txt) has 120302 rows, so larger draws cannot be made
        without replacement.
    """
    import math

    # Validate before any download / file loading so bad input fails fast.
    nsubject = 8
    per_sample = math.ceil(size / nsubject)
    if per_sample > 120000:
        raise ValueError(
            f"Size: {size} exceed max size (960000) for valid stratification based on smallest sample"
        )

    # Check for the background file. If not present, download it.
    background = 'britanova_human_beta_t_cb.tsv.sampler.tsv'
    if background not in TCRsampler.currently_available_backgrounds():
        TCRsampler.download_background_file(
            'britanova_human_beta_t_cb.tsv.sampler.tsv.zip')

    ts = TCRsampler(default_background=background)
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # ts.ref_df.subject.value_counts() for this background:
    #   A5-S18.txt 1073416    A5-S17.txt  825507
    #   A5-S13.txt  692050    A5-S12.txt  573373
    #   A5-S16.txt  559980    A5-S11.txt  519582
    #   A5-S14.txt  302288    A5-S15.txt  120302 (smallest sample)
    # Every subject is sampled identically, without replacement.
    samples = [
        subject_df.sample(per_sample,
                          replace=False,
                          random_state=random_state).copy().reset_index(
                              drop=True)
        for _, subject_df in ts.ref_df.groupby('subject')
    ]

    bitanova_unique_clones_sampled = pd.concat(samples).reset_index(drop=True)
    bitanova_unique_clones_sampled = bitanova_unique_clones_sampled[[
        'v_reps', 'j_reps', 'cdr3'
    ]].rename(columns={
        'v_reps': 'v_b_gene',
        'j_reps': 'j_b_gene',
        'cdr3': 'cdr3_b_aa'
    })
    return bitanova_unique_clones_sampled
def make_vj_matched_background(gene_usage_counter,
                               ts=None,
                               size=100000,
                               recomb_type="VDJ",
                               chain_folder="human_T_beta",
                               cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa']):
    """
    Generate synthetic CDR3s whose V-J gene usage matches
    `gene_usage_counter`, then annotate them with gene frequencies.

    For every (V, J) key in the counter, ``count * ceil(size / total)``
    CDR3s are requested from an OlgaModel, where ``total`` is the sum of all
    counts. The number of generated sequences is therefore a multiple of the
    total count and can exceed `size`. Rows whose CDR3 is missing are
    dropped before the frequencies are attached.

    Parameters
    ----------
    gene_usage_counter : collections.Counter
        Maps keys whose first two elements are (V gene, J gene) to counts.
    ts : tcrsampler.sampler.TCRsampler or None
        Sampler handed to get_gene_frequencies. If None, the
        'britanova_human_beta_t_cb.tsv.sampler.tsv' background is loaded and
        passed through get_stratified_gene_usage_frequency(replace=True).
    size : int
        Target number of sequences (see note on rounding above).
    recomb_type : str
        Default "VDJ",
    chain_folder : str
        Default is for human beta "human_T_beta",
    cols : list
        [V column, J column, CDR3 column]. Default is for beta
        ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']. The list is only read.

    Returns
    -------
    pandas.DataFrame
        Result of get_gene_frequencies on the generated sequences, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `gene_usage_counter` is empty, or if no V-J pair yielded any
        sequences.

    Example
    -------
    >>> ix =[['TRBV19*01', 'TRBJ2-5*01', 3],['TRBV24-1*01', 'TRBJ2-4*01', 3]]
    >>> df_rare= pd.concat([pd.DataFrame({'cdr3_b_aa' : flatten(ts.sample([[x[0], x[1], x[2]]])) , 'v_b_gene':x[0], 'j_b_gene':x[1]}) for x in ix]).reset_index(drop = True)
    >>> gene_usage_counter = make_gene_usage_counter(df_rare)
    >>> make_vj_matched_background(gene_usage_counter, size = 10)
          v_b_gene    j_b_gene            cdr3_b_aa        pV        pJ       pVJ
    0  TRBV24-1*01  TRBJ2-4*01      CATPVAGVAKNIQYF  0.011942  0.042163  0.000420
    1  TRBV24-1*01  TRBJ2-4*01       CATSPRGSLSIQYF  0.011942  0.042163  0.000420
    2  TRBV24-1*01  TRBJ2-4*01    CATSDLGGGGIHNIQYF  0.011942  0.042163  0.000420
    3    TRBV19*01  TRBJ2-5*01    CASSISDRGKFSETQYF  0.006788  0.089505  0.000394
    4  TRBV24-1*01  TRBJ2-4*01    CATSDLPARTRENIQYF  0.011942  0.042163  0.000420
    5  TRBV24-1*01  TRBJ2-4*01      CATSDPQGAKNIQYF  0.011942  0.042163  0.000420
    6    TRBV19*01  TRBJ2-5*01  CASSISCGRNLGGQETQYF  0.006788  0.089505  0.000394
    7    TRBV19*01  TRBJ2-5*01    CASSCKPSGGYQETQYF  0.006788  0.089505  0.000394
    8    TRBV19*01  TRBJ2-5*01     CASSSGTSHKLETQYF  0.006788  0.089505  0.000394
    9    TRBV19*01  TRBJ2-5*01          CASSDRETQYF  0.006788  0.089505  0.000394
    """
    # Fail early with a clear message; previously an empty counter only
    # surfaced later as pandas' "No objects to concatenate" ValueError.
    if not gene_usage_counter:
        raise ValueError(
            "gene_usage_counter is empty; there are no V-J pairs to match")

    olga_model_beta = OlgaModel(recomb_type=recomb_type,
                                chain_folder=chain_folder)

    total_seqs = np.sum(list(gene_usage_counter.values()))
    adjust_factor = size / total_seqs
    # Loop-invariant: every count is scaled by the same whole-number factor.
    per_count_multiplier = math.ceil(adjust_factor)

    dfs = list()
    skipped_pairs = list()
    for key, count in gene_usage_counter.items():
        v_gene, j_gene = key[0], key[1]
        try:
            cdr3s = olga_model_beta.gen_cdr3s(V=v_gene,
                                              J=j_gene,
                                              n=count * per_count_multiplier)
        except AttributeError:
            # Best-effort: a pair the model cannot generate for is skipped.
            # NOTE(review): presumably raised for V/J genes unknown to the
            # OlgaModel -- confirm against OlgaModel.gen_cdr3s.
            skipped_pairs.append((v_gene, j_gene))
            continue
        df = pd.DataFrame({cols[2]: cdr3s})
        df[cols[0]] = v_gene
        df[cols[1]] = j_gene
        dfs.append(df)

    if not dfs:
        raise ValueError(
            f"No CDR3s could be generated for any V-J pair; skipped: {skipped_pairs}"
        )

    df = pd.concat(dfs).reset_index(drop=True)
    # Keep only rows with a generated CDR3, restricted to the requested columns.
    df = df[df[cols[2]].notna()][cols]

    if ts is None:
        from tcrsampler.sampler import TCRsampler
        ts = TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
        ts = get_stratified_gene_usage_frequency(ts, replace=True)

    df = get_gene_frequencies(ts=ts, df=df, cols=cols)
    df = df.reset_index(drop=True)
    return df
chains=['alpha']) """COMPUTE TCRDISTANCES (SEE DOCS PAGE:https://tcrdist3.readthedocs.io/en/latest/index.html#hierarchical-neighborhoods)""" from tcrdist.rep_diff import hcluster_diff tr.hcluster_df, tr.Z =\ hcluster_diff(clone_df = tr.clone_df, pwmat = tr.pw_alpha, x_cols = ['cohort'], count_col = 'count') """ SEE TCRSAMPLER (https://github.com/kmayerb/tcrsampler/blob/master/docs/tcrsampler.md) Here we used olga human alpha synthetic sequences for best coverage """ from tcrsampler.sampler import TCRsampler t = TCRsampler() #t.download_background_file('olga_sampler.zip') # ONLY IF NOT ALREADY DONE tcrsampler_alpha = TCRsampler(default_background = 'olga_human_alpha_t.sampler.tsv') tcrsampler_alpha.build_background(max_rows = 1000) """SEE PALMOTIF DOCS (https://github.com/agartland/palmotif)""" from palmotif import compute_pal_motif, svg_logo from tcrdist.summarize import _select """GENERATE SVG GRAPHIC FOR EACH NODE OF THE TREE""" pwmat_str = 'pw_alpha' cdr3_name = 'cdr3_a_aa' gene_names = ['v_a_gene','j_a_gene'] svgs_alpha = list() svgs_alpha_raw = list() for i,r in tr.hcluster_df.iterrows():
def test_TCRsampler_clean_mixcr():
    """After clean_mixcr on the example MIXCR file, ref_df is a DataFrame."""
    # Path is relative to the current working directory.
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=mixcr_path)
    assert isinstance(sampler.ref_df, pd.DataFrame)
def test_TCRsampler_build_stratified():
    """
    Smoke test: build a subject-stratified background from the example
    MIXCR file and draw 10 sequences for one V-J pair.

    Only checks that the calls complete; the sampled values are not asserted.
    """
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background(stratify_by_subject=True)
    # The return value is kept only for inspection in a debugger.
    sampled = sampler.sample_background('TRBV9*01', 'TRBJ2-7*01', n=10)
def test_TCRsampler_init():
    """A TCRsampler can be constructed with no arguments."""
    sampler = TCRsampler()
def test_gallery_hdiff():
    """
    Gallery example: hierarchical-cluster differential analysis of paired
    alpha-beta mouse TCRs (PA vs. PB1 epitope), rendered as an HTML tree.

    All imports are provided at the top and are repeated step-wise below,
    for clarity, and so that each step can be cut and pasted on its own.
    This example performs paired alpha-beta analysis, but the code blocks
    can be used for single chain analysis as well.

    The function contains no assertions: it passes when every step runs
    without raising. It reads "dash.csv" from the current working directory,
    calls download_background_file for the mouse backgrounds on every run,
    prints one line per tree node and chain, and writes
    'hierdiff_example_PA_v_PB1.html' to the current working directory.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import hcluster_diff, member_summ
    from tcrsampler.sampler import TCRsampler
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    from hierdiff import plot_hclust_props
    """ Load a subset of data that contains paired alpha-beta chain mouse TCR receptors that recognized the PA or PB1 epitopes (present in mouse influenza). """
    import pandas as pd
    df = pd.read_csv("dash.csv")
    # Boolean mask: True for rows whose epitope is 'PA' or 'PB1'.
    conditional = df['epitope'].apply( lambda x: x in ['PA','PB1'])
    """ For illustrative/testing purposes, randomly subset the data to include only 100 clones. Increase for more informative plot. """
    # Fixed random_state keeps the 100-row subset reproducible between runs.
    df = df[conditional].\
        reset_index(drop = True).\
        sample(100, random_state = 3).\
        reset_index(drop = True).\
        copy()
    """ Load DataFrame into TCRrep instance, which automatically computes attributes: 1. .clone_df DataFrame 2. .pw_beta nd.array 3. .pw_alpha nd.array """
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df = df,
                organism = 'mouse',
                chains = ['beta','alpha'],
                db_file = 'alphabeta_gammadelta_db.tsv')
    """ Apply hcluster_diff, which hierarchically clusters. Note ---- pwmat could easily be tr.pw_beta or tr.pw_alpha if clustering should be done on a single chain. """
    from tcrdist.rep_diff import hcluster_diff
    # The paired-chain distance used here is the sum of the two per-chain matrices.
    tr.hcluster_df, tr.Z =\
        hcluster_diff(clone_df = tr.clone_df,
                      pwmat = tr.pw_beta + tr.pw_alpha,
                      x_cols = ['epitope'],
                      count_col = 'count')
    """ Load a custom background, mouse appropriate dataset to sample CDR3s according to the V and J gene usage frequencies observed in each node. See the tcrsampler package for more details (https://github.com/kmayerb/tcrsampler/blob/master/docs/getting_default_backgrounds.md) """
    from tcrsampler.sampler import TCRsampler
    t = TCRsampler()
    # Called unconditionally, i.e. on every run of this test.
    # NOTE(review): presumably fetches the archive over the network -- confirm.
    t.download_background_file("ruggiero_mouse_sampler.zip")
    tcrsampler_beta = TCRsampler(default_background = 'ruggiero_mouse_beta_t.tsv.sampler.tsv')
    tcrsampler_alpha = TCRsampler(default_background = 'ruggiero_mouse_alpha_t.tsv.sampler.tsv')
    """ Add an SVG graphic to every node of the tree aligned to the cluster centroid. """
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    """Beta Chain"""
    svgs_beta = list()
    for i,r in tr.hcluster_df.iterrows():
        # Clones belonging to this tree node, selected by positional index.
        dfnode = tr.clone_df.iloc[r['neighbors_i'],]
        # Nodes with more than two clones use get_centroid_seq; smaller
        # nodes fall back to the first beta CDR3 in the node.
        if dfnode.shape[0] > 2:
            centroid, *_ = get_centroid_seq(df = dfnode)
        else:
            centroid = dfnode['cdr3_b_aa'].to_list()[0]
        print(f"BETA-CHAIN: {centroid}")
        # Count of clones per (V, J) pair in this node, fed to the sampler
        # as rows of [v_gene, j_gene, count].
        gene_usage_beta = dfnode.groupby(['v_b_gene','j_b_gene']).size()
        sampled_rep = tcrsampler_beta.sample( gene_usage_beta.reset_index().to_dict('split')['data'],
                                              flatten = True,
                                              depth = 10)
        # Drop None entries from the sampled reference set.
        sampled_rep = [x for x in sampled_rep if x is not None]
        motif, stat = compute_pal_motif( seqs = _select(df = tr.clone_df, iloc_rows = r['neighbors_i'], col = 'cdr3_b_aa'),
                                         refs = sampled_rep,
                                         centroid = centroid)
        svgs_beta.append(svg_logo(motif, return_str= True))
    """Add Beta SVG graphics to hcluster_df"""
    tr.hcluster_df['svg_beta'] = svgs_beta
    """Alpha Chain"""
    svgs_alpha = list()
    for i,r in tr.hcluster_df.iterrows():
        dfnode = tr.clone_df.iloc[r['neighbors_i'],]
        # NOTE(review): get_centroid_seq is called exactly as in the beta
        # loop, while the fallback and the motif below use 'cdr3_a_aa' --
        # confirm the helper returns an alpha-chain centroid here.
        if dfnode.shape[0] > 2:
            centroid, *_ = get_centroid_seq(df = dfnode)
        else:
            centroid = dfnode['cdr3_a_aa'].to_list()[0]
        print(f"ALPHA-CHAIN: {centroid}")
        gene_usage_alpha = dfnode.groupby(['v_a_gene','j_a_gene']).size()
        sampled_rep = tcrsampler_alpha.sample( gene_usage_alpha.reset_index().to_dict('split')['data'],
                                               flatten = True,
                                               depth = 10)
        sampled_rep = [x for x in sampled_rep if x is not None]
        motif, stat = compute_pal_motif( seqs = _select(df = tr.clone_df, iloc_rows = r['neighbors_i'], col = 'cdr3_a_aa'),
                                         refs = sampled_rep,
                                         centroid = centroid)
        svgs_alpha.append(svg_logo(motif, return_str= True))
    """Add Alpha SVG graphics to hcluster_df"""
    tr.hcluster_df['svg_alpha'] = svgs_alpha
    """ Produce summary information for tooltips. For instance, describe percentage of TCRs with a given epitope at a given node. """
    res_summary = member_summ( res_df = tr.hcluster_df,
                               clone_df = tr.clone_df,
                               addl_cols=['epitope'])
    # Column-wise join of the per-node results with their summaries.
    tr.hcluster_df_detailed = \
        pd.concat([tr.hcluster_df, res_summary], axis = 1)
    """ Write D3 html for interactive denogram graphic. Specify desired tooltips. """
    from hierdiff import plot_hclust_props
    html = plot_hclust_props(tr.Z,
                             title='PA Epitope Example',
                             res=tr.hcluster_df_detailed,
                             tooltip_cols=['cdr3_b_aa','v_b_gene', 'j_b_gene','svg_alpha','svg_beta'],
                             alpha=0.00001,
                             colors = ['blue','gray'],
                             alpha_col='pvalue')
    # Output path is relative to the current working directory.
    with open('hierdiff_example_PA_v_PB1.html', 'w') as fh:
        fh.write(html)
chains = ['delta'], db_file = 'alphabeta_gammadelta_db.tsv') # Matrix of delta-chain pairwise distances trd.pw_delta # The tcrdist delta-chain matrix is available here and can be easily visualized: gd = sns.clustermap(data= trd.pw_delta, row_cluster=True, col_cluster=True, yticklabels=False, xticklabels=False, ) # FIND METACLONOTYPES from tcrdist.public import _neighbors_fixed_radius tcrsampler_delta = TCRsampler(default_background = 'ravens_human_delta_t.sampler.tsv') trd.clone_df['radius'] = 18 trd.clone_df['neighbors'] = _neighbors_fixed_radius(pwmat = trd.pw_delta, radius = 18) trd.clone_df['K_neighbors'] = trd.clone_df['neighbors'].apply(lambda x : len(x)) trd.clone_df['nsubject'] = trd.clone_df['neighbors'].\ apply(lambda x: trd.clone_df['subject'].iloc[x].nunique()) trd.clone_df['qpublic'] = trd.clone_df['nsubject'].\ apply(lambda x: x > 1) from tcrdist.public import make_motif_logo from tcrdist.public import _quasi_public_meta_clonotypes qpublic_mcs = _quasi_public_meta_clonotypes(clone_df = trd.clone_df, pwmat = trd.pw_delta, tcrsampler = tcrsampler_delta, cdr3_name = 'cdr3_d_aa', v_gene_name = 'v_d_gene',