def sim_all_cdr3_gen(n=100,
                     recomb_type="VDJ",
                     chain_folder="human_T_beta",
                     cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa']):
    """
    Simulate CDR3s for every pairing of the '*01' V and J alleles of an
    OLGA model.

    Parameters
    ----------
    n : int
        Number of CDR3s requested per V/J pair.
    recomb_type : str
        Forwarded to OlgaModel, e.g. "VDJ" or "VJ".
    chain_folder : str
        Forwarded to OlgaModel, e.g. "human_T_beta".
    cols : list
        Output column names in the order [V gene, J gene, CDR3].
        The default list is only read, never mutated.

    Returns
    -------
    results : list of pandas.DataFrame
        One DataFrame per V/J pair holding the generated CDR3s and the
        V and J names used.
    find_nones : list of [V, J]
        Pairs for which the first generated CDR3 was None.
    """
    omb = OlgaModel(recomb_type=recomb_type, chain_folder=chain_folder)
    all_vs = [x for x in omb.pgen_model.V_allele_names if x.endswith('*01')]
    all_js = [x for x in omb.pgen_model.J_allele_names if x.endswith('*01')]

    results = []
    find_nones = []
    # itertools.product walks the pairs V-major, J-minor, so no intermediate
    # grid DataFrame is needed.
    for v, j in itertools.product(all_vs, all_js):
        e = omb.gen_cdr3s(V=v, J=j, n=n)
        results.append(pd.DataFrame({cols[2]: e, cols[0]: v, cols[1]: j}))
        # The length check keeps n == 0 (empty result) from raising IndexError.
        if len(e) > 0 and e[0] is None:
            find_nones.append([v, j])
    return results, find_nones
def test_olga_sample():
    """
    Seeded regression test of OLGA sampling for TRBV20-1*01 / TRBJ1-2*01,
    covering both gen_cdr3() and gen_cdr3s().
    """
    np.random.seed(310)
    from tcrdist.pgen import OlgaModel

    v_allele = 'TRBV20-1*01'
    j_allele = 'TRBJ1-2*01'
    beta_model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    # NOTE: the seed is fixed, so a single standard result is expected.
    # NOTE: .gen_cdr3() returns the full output tuple.
    single = beta_model.gen_cdr3(V=v_allele, J=j_allele)
    recombination_event = {
        'V': 29,
        'D': 2,
        'J': 1,
        'delV': 5,
        'delJ': 15,
        'delDl': 10,
        'delDr': 7,
        'insVD': 7,
        'insDJ': 6
    }
    assert single == ('TGCAGTGCTAGAGTAAGGGAAGCGGGAAGGACCTACACCTTC',
                      'CSARVREAGRTYTF', 29, 1, recombination_event)

    # NOTE: .gen_cdr3s() returns a list of CDR3s (amino acid only).
    np.random.seed(310)
    several = beta_model.gen_cdr3s(V=v_allele, J=j_allele, n=4)
    assert several == [
        'CSARVREAGRTYTF',
        'CSAVPPGLPNYGYTF',
        'CSARGPSQGYVRGLYGYTF',
        'CSAQGLAGYGYTF',
    ]
def motif_creation_human_betas():
    """
    Write beta-chain CDR3 motif logos for the clusters of the human Dash
    dataset to "test_3.svg".

    For each centroid in ``tr.centroids_df`` three logos are written: the
    motif against an OLGA-generated, V/J-matched background, the raw motif
    without a background, and the motif of the background itself.

    Reads "dash_human.csv" from the working directory and returns nothing.
    """
    import pandas as pd
    import palmotif
    from tcrdist.repertoire import TCRrep
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    get_basic_centroids(tr, max_dist=75)

    # Background depth: CDR3s generated per observed clone of a V/J pair.
    depth = 3
    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # NOTE(review): stopping at the first small cluster presumably
            # relies on centroids_df being ordered by cluster size - confirm.
            if len(r['neighbors']) < 5:
                break
            cluster = tr.clone_df.iloc[r['neighbors'], ]
            seqs = cluster['cdr3_b_aa'].to_list()
            # Rows of [v_b_gene, j_b_gene, count] for this cluster.
            gene_usages = cluster[['v_b_gene', 'j_b_gene'
                                   ]].value_counts().reset_index().to_dict(
                                       'split')['data']
            # Build the background from this cluster's own gene usage.
            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), count * depth)
                for v, j, count in gene_usages
            ])
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(
                seqs=seqs, refs=refs, centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(
                seqs=seqs, centroid=r['cdr3_b_aa'])
            # The centroid is appended to the references before the
            # background-only motif is computed.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            for m in (matrix, matrix_raw, matrix_bkgd):
                svg = palmotif.svg_logo(m, 'test.svg', return_str=True)
                oh.write(f"{svg}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')
        oh.write('</body>')
def test_olga_sample_beta():
    """Seeded sampling of five beta CDR3s for TRBV20-1*01 / TRBJ1-2*01."""
    np.random.seed(1)
    from tcrdist.pgen import OlgaModel

    expected_cdr3s = [
        'CSARQGLANYGYTF',
        'CSARPSRGQDGYTF',
        'CSARDQRTGQDGYTF',
        'CSARDVSSSGGYYGYTF',
        'CSAPEPLTSGRACNGYTF',
    ]
    beta_model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    sampled = beta_model.gen_cdr3s(V='TRBV20-1*01', J='TRBJ1-2*01', n=5)

    assert isinstance(sampled, list)
    assert len(sampled) == 5
    assert sampled == expected_cdr3s
def test_olga_sample_alpha():
    """Seeded sampling of five alpha CDR3s for TRAV19*01 / TRAJ37*01."""
    np.random.seed(1)
    from tcrdist.pgen import OlgaModel

    expected_cdr3s = [
        'CALSEAPGNTGKLIF',
        'CAPPSGNTGKLIF',
        'CALAGNTGKLIF',
        'CAQDNTGKLIF',
        'CALRNTGKLIF',
    ]
    alpha_model = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    sampled = alpha_model.gen_cdr3s(V='TRAV19*01', J='TRAJ37*01', n=5)

    assert isinstance(sampled, list)
    assert len(sampled) == 5
    assert sampled == expected_cdr3s
def _auto_pgen(tcrrep=None, organism='human', chain='beta', ncpus=2):
    """
    Automate a pgen estimation of cdr3s alpha/beta given a tcrrep with a
    clone_df attribute.

    Parameters
    ----------
    tcrrep : tcrdist.repertoire.TCRrep
        TCRrep instance with a clone_df
    organism : str
        'human' or 'mouse'
    chain : str
        'beta' or 'alpha'
    ncpus : int
        Number of processes handed to parmap (pm_processes).

    Returns
    -------
    tcrrep : tcrdist.repertoire.TCRrep
        The same instance, with a new column "pgen_cdr3_a_aa" or
        "pgen_cdr3_b_aa" added to clone_df.
    """
    import parmap
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    # Import the class directly rather than relying on `import tcrdist`
    # having loaded the `repertoire` submodule.
    from tcrdist.repertoire import TCRrep

    assert organism in ['human', 'mouse']
    assert chain in ['beta', 'alpha']
    assert isinstance(tcrrep, TCRrep)
    assert isinstance(tcrrep.clone_df, pd.DataFrame)

    cdr3_col = {'alpha': 'cdr3_a_aa', 'beta': 'cdr3_b_aa'}[chain]
    cdr3s = tcrrep.clone_df[cdr3_col]

    # Only constructor arguments are stored here so that a single model is
    # loaded instead of every supported one.
    olga_model_kwargs = {
        ('human', 'beta'): {'chain_folder': "human_T_beta",
                            'recomb_type': "VDJ"},
        ('human', 'alpha'): {'chain_folder': "human_T_alpha",
                             'recomb_type': "VJ"},
        ('mouse', 'beta'): {'chain_folder': "mouse_T_beta",
                            'recomb_type': "VDJ"},
        # NOTE(review): assumes the mouse_T_alpha model folder is available
        # in this installation - confirm.
        ('mouse', 'alpha'): {'chain_folder': "mouse_T_alpha",
                             'recomb_type': "VJ"},
    }
    olga_model = OlgaModel(**olga_model_kwargs[(organism, chain)])

    pgens = parmap.map(olga_model.compute_aa_cdr3_pgen,
                       cdr3s,
                       pm_pbar=True,
                       pm_processes=ncpus)
    tcrrep.clone_df[f"pgen_{cdr3_col}"] = pgens
    return tcrrep
def test_pgen_with_parmap():
    """
    Minimal example of spreading OLGA pgen computation over several cpus
    with parmap.
    """
    import parmap
    from tcrdist.pgen import OlgaModel

    beta_model = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    cdr3_examples = [
        'CASSYRVGTDTQYF',
        'CATSTNRGGTPADTQYF',
        'CASQGDSFNSPLHF',
        'CASSPWTGSMALHF',
    ]
    parmap.map(beta_model.compute_aa_cdr3_pgen, cdr3_examples)
def test_olga_sample_alphas_for_a_human_repertoire():
    """
    Generate one OLGA CDR3 per clone of the human Dash repertoire, first
    for the beta chain and then for the alpha chain, using each clone's
    V/J genes converted with allele_01.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel

    alpha_model = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    beta_model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    dash = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=dash,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Beta chain first, then alpha.
    rb = []
    for _, clone in tr.clone_df[['v_b_gene', 'j_b_gene']].iterrows():
        rb.append(
            beta_model.gen_cdr3s(V=allele_01(clone['v_b_gene']),
                                 J=allele_01(clone['j_b_gene']),
                                 n=1))

    ra = []
    for _, clone in tr.clone_df[['v_a_gene', 'j_a_gene']].iterrows():
        ra.append(
            alpha_model.gen_cdr3s(V=allele_01(clone['v_a_gene']),
                                  J=allele_01(clone['j_a_gene']),
                                  n=1))
def test_pgen_1():
    """
    How to add pgen estimates to human alpha/beta CDR3s
    """
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.setup_tests import download_and_extract_zip_file

    df = pd.read_csv("dash_human.csv")
    # Five clones, sampled with a fixed random_state.
    tr = TCRrep(cell_df=df.sample(5, random_state=3),
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False)

    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

    tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_b_aa)
    tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_a_aa)

    # Column selection kept as a display expression; it also fails if
    # either pgen column is missing.
    tr.clone_df[[
        'cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa', 'pgen_cdr3_a_aa'
    ]]
def longtest_pgen_with_parmap():
    """
    Compare serial and parmap-based computation of many pgens.

    Timings recorded by the original author for 1000 CDR3s:
    Finished 'olga_in_series' in 26.3842 secs with 1 core
    Finished 'olga_in_parmap' in 6.2384 secs with 6 cores
    """
    import numpy as np
    import pandas as pd
    import parmap
    from tcrdist.pgen import OlgaModel
    from tcrdist.speed import timer
    from tcrdist.adpt_funcs import _valid_cdr3
    from tcrdist.setup_tests import download_and_extract_zip_file

    download_and_extract_zip_file('cdr3_beta_500K.zip',
                                  source="dropbox",
                                  dest=".")
    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")

    n = 1000
    cdr3_table = pd.read_csv('cdr3_beta_500K.csv')
    # First column, first n entries, keeping only those _valid_cdr3 accepts.
    first_n = cdr3_table.iloc[:, 0].to_list()[0:n]
    inputlist = list(filter(_valid_cdr3, first_n))

    pgen_fn = olga_beta.compute_aa_cdr3_pgen

    @timer
    def olga_in_series(f=pgen_fn, cdr3s=inputlist):
        return [f(seq) for seq in cdr3s]

    @timer
    def olga_in_parmap(f=pgen_fn, cdr3s=inputlist, **kwargs):
        return parmap.map(f, cdr3s, pm_pbar=True, **kwargs)

    r1 = olga_in_series(f=pgen_fn, cdr3s=inputlist)
    r2 = olga_in_parmap(f=pgen_fn, cdr3s=inputlist)
    assert np.all(r1 == r2)
def sim_all_cdr3_gen():
    """
    Exploratory sweep over every V/J allele pairing of the human beta and
    human alpha OLGA models, printing pairings whose generation result is
    None.

    Returns nothing.
    NOTE(review): the beta results and the final value_counts() tables are
    computed and then discarded - confirm whether they were meant to be
    returned.
    """
    import itertools
    from tcrdist.pgen import OlgaModel

    def vj_grid(model):
        # All V x J allele-name pairings, V-major.
        pairs = itertools.product(model.pgen_model.V_allele_names,
                                  model.pgen_model.J_allele_names)
        return pd.DataFrame(list(pairs), columns=['V', 'J'])

    # Beta chain: three CDR3s per pairing via gen_cdr3s.
    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    for pair in vj_grid(omb).itertuples(index=False):
        e = omb.gen_cdr3s(V=pair.V, J=pair.J, n=3)
        if e is None:
            print((pair.V, pair.J, e))

    # Alpha chain: one full gen_cdr3 result per pairing.
    oma = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    results = []
    find_nones = []
    for pair in vj_grid(oma).itertuples(index=False):
        e = oma.gen_cdr3(V=pair.V, J=pair.J)
        record = [pair.V, pair.J, e]
        results.append(record)
        if e is None:
            find_nones.append(record)
            print((pair.V, pair.J, e))

    # Things we can't find:
    df = pd.DataFrame(results, columns=['v', 'j', 'r'])
    missing = df[df['r'].isna()]
    missing[['v']].value_counts()
    missing[['j']].value_counts()
def make_vj_matched_background(gene_usage_counter,
                               ts=None,
                               size=100000,
                               recomb_type="VDJ",
                               chain_folder="human_T_beta",
                               cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa']):
    """
    Generate an OLGA background whose V/J usage follows gene_usage_counter.

    Parameters
    ----------
    gene_usage_counter : collections.Counter
        Maps (V, J) pairs to observed counts.
    ts : TCRsampler or None
        Sampler used for gene frequencies. When None, a default human beta
        sampler is created and stratified gene-usage frequencies are
        computed for it.
    size : int
        Approximate number of sequences to generate. Each pair receives
        count * ceil(size / total count) draws, so the total can exceed
        size.
    recomb_type : str
        Default "VDJ",
    chain_folder : str
        Default is for human beta "human_T_beta",
    cols : list
        Default is for beta ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']

    Returns
    -------
    pandas.DataFrame
        The generated CDR3s (None entries removed) after being passed
        through get_gene_frequencies.

    Raises
    ------
    ValueError
        If gene_usage_counter has no positive total count, or if no V/J
        pair could be generated.

    Example
    -------
    >>> ix =[['TRBV19*01', 'TRBJ2-5*01', 3],['TRBV24-1*01', 'TRBJ2-4*01', 3]]
    >>> df_rare= pd.concat([pd.DataFrame({'cdr3_b_aa' : flatten(ts.sample([[x[0], x[1], x[2]]])) , 'v_b_gene':x[0], 'j_b_gene':x[1]}) for x in ix]).reset_index(drop = True)
    >>> gene_usage_counter = make_gene_usage_counter(df_rare)
    >>> make_vj_matched_background(gene_usage_counter, size = 10)
          v_b_gene    j_b_gene            cdr3_b_aa        pV        pJ       pVJ
    0  TRBV24-1*01  TRBJ2-4*01      CATPVAGVAKNIQYF  0.011942  0.042163  0.000420
    1  TRBV24-1*01  TRBJ2-4*01       CATSPRGSLSIQYF  0.011942  0.042163  0.000420
    2  TRBV24-1*01  TRBJ2-4*01    CATSDLGGGGIHNIQYF  0.011942  0.042163  0.000420
    3    TRBV19*01  TRBJ2-5*01    CASSISDRGKFSETQYF  0.006788  0.089505  0.000394
    4  TRBV24-1*01  TRBJ2-4*01    CATSDLPARTRENIQYF  0.011942  0.042163  0.000420
    5  TRBV24-1*01  TRBJ2-4*01      CATSDPQGAKNIQYF  0.011942  0.042163  0.000420
    6    TRBV19*01  TRBJ2-5*01  CASSISCGRNLGGQETQYF  0.006788  0.089505  0.000394
    7    TRBV19*01  TRBJ2-5*01    CASSCKPSGGYQETQYF  0.006788  0.089505  0.000394
    8    TRBV19*01  TRBJ2-5*01     CASSSGTSHKLETQYF  0.006788  0.089505  0.000394
    9    TRBV19*01  TRBJ2-5*01          CASSDRETQYF  0.006788  0.089505  0.000394
    """
    import warnings

    # Validate before loading the model: an empty counter would otherwise
    # end in a division by zero and an opaque pd.concat error.
    total_seqs = sum(gene_usage_counter.values())
    if total_seqs <= 0:
        raise ValueError(
            "gene_usage_counter must contain at least one V/J pair with a "
            "positive count")

    olga_model_beta = OlgaModel(recomb_type=recomb_type,
                                chain_folder=chain_folder)
    # Whole-number multiplier applied to every observed count.
    depth_per_count = math.ceil(size / total_seqs)

    dfs = []
    for k, v in gene_usage_counter.items():
        try:
            cdr3s = olga_model_beta.gen_cdr3s(V=k[0],
                                              J=k[1],
                                              n=v * depth_per_count)
        except AttributeError:
            # Best effort: the pair is skipped, but no longer silently.
            warnings.warn(
                f"Skipping V/J pair ({k[0]}, {k[1]}): CDR3 generation "
                f"raised AttributeError")
            continue
        df = pd.DataFrame({cols[2]: cdr3s})
        df[cols[0]] = k[0]
        df[cols[1]] = k[1]
        dfs.append(df)

    if not dfs:
        raise ValueError(
            "No sequences could be generated for any V/J pair in "
            "gene_usage_counter")

    df = pd.concat(dfs).reset_index(drop=True)
    # Drop failed generations (None) and fix the column order.
    df = df[df[cols[2]].notna()][cols]

    if ts is None:
        from tcrsampler.sampler import TCRsampler
        ts = TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
        ts = get_stratified_gene_usage_frequency(ts, replace=True)

    df = get_gene_frequencies(ts=ts, df=df, cols=cols)
    df = df.reset_index(drop=True)
    return df
'cdr3': 'cdr3_b_aa' }.get(c, c) for c in em_ss.columns] em_tr = TCRrep(cell_df=em_ss, organism='human', chains=['beta'], compute_distances=False) dash_fn = opj(_fg_data, 'tcrdist', 'datasets', 'dash_human.csv') df = pd.read_csv(dash_fn) tr = TCRrep(cell_df=df, organism='human', chains=['alpha', 'beta'], compute_distances=False) """Compute pgen of each epitope-specific sequence""" olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ") olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ") tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens( tr.clone_df.cdr3_b_aa) tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens( tr.clone_df.cdr3_a_aa) """Force pgen > 0: there were 7 CDR3 alphas with pgen = 0""" tr.clone_df = tr.clone_df.loc[(tr.clone_df['pgen_cdr3_a_aa'] > 0) & (tr.clone_df['pgen_cdr3_b_aa'] > 0)] norm_pgen = mpl.colors.LogNorm(vmin=1e-10, vmax=1e-6) norm_a = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_a_aa'].min(), vmax=tr.clone_df['pgen_cdr3_a_aa'].max()) norm_b = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_b_aa'].min(),
def test_pgen_mouse():
    """
    Worked example on the mouse Dash dataset: Pgen estimation, V/J-specific
    CDR3 generation, V/J-matched backgrounds and quasi-public neighborhoods.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.pgen import OlgaModel
    import numpy as np

    df = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    # Load OLGA model as a python object
    olga_beta = OlgaModel(chain_folder="mouse_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder="mouse_T_alpha", recomb_type="VJ")

    # An example computing a single Pgen
    olga_beta.compute_aa_cdr3_pgen(tr.clone_df['cdr3_b_aa'][0])
    olga_alpha.compute_aa_cdr3_pgen(tr.clone_df['cdr3_a_aa'][0])

    # An example computing multiple Pgens
    olga_beta.compute_aa_cdr3_pgens(tr.clone_df['cdr3_b_aa'][0:5])
    olga_alpha.compute_aa_cdr3_pgens(tr.clone_df['cdr3_a_aa'][0:5])

    # An example computing 1920 Pgens more quickly with multiple cpus
    import parmap
    tr.clone_df['pgen_cdr3_b_aa'] = \
        parmap.map(
            olga_beta.compute_aa_cdr3_pgen,
            tr.clone_df['cdr3_b_aa'],
            pm_pbar=True,
            pm_processes = 2)

    tr.clone_df['pgen_cdr3_a_aa'] = \
        parmap.map(
            olga_alpha.compute_aa_cdr3_pgen,
            tr.clone_df['cdr3_a_aa'],
            pm_pbar=True,
            pm_processes = 2)

    """
    We can do something else useful. We've tweaked the original
    generative code in OLGA, so that you can generate CDRs,
    given a specific TRV and TRJ.

    Note that unfortunately not all genes are recognized in default OLGA
    models, but many are. This gives you an idea of what you can do.

    Here are 10 CDR3s generated at random given a particular V,J usage
    combination
    """
    np.random.seed(1)
    olga_beta.gen_cdr3s(V='TRBV14*01', J='TRBJ2-5*01', n=10)
    olga_alpha.gen_cdr3s(V='TRAV4-3*02', J='TRAJ31*01', n=10)

    """
    Using this approach, we can synthesize an 100K background,
    with similar gene usage frequency to our actual repertoire.

    Note, however, that given data availability, this is currently
    likely the most reliable for human beta chain.

    After OLGA's publication, a default mouse alpha model (mouse_T_alpha)
    was added to the OLGA GitHub repository. We've included that here
    but it should be used with caution as it is missing a number of
    commonly seen V genes.
    """
    np.random.seed(1)
    tr.synthesize_vj_matched_background(chain='beta')
    """
              v_b_gene    j_b_gene            cdr3_b_aa        pV        pJ       pVJ   weights      source
    0        TRBV14*01  TRBJ2-3*01        CASSLASAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    1      TRBV13-2*01  TRBJ2-3*01    CASGDAPDRTGAETLYF  0.118785  0.092039  0.010331  0.271309  vj_matched
    2      TRBV13-3*01  TRBJ1-1*01  CASSDGFSRTGGVNTEVFF  0.074051  0.106146  0.006923  1.009124  vj_matched
    3      TRBV13-3*01  TRBJ2-1*01       CASSDVQGGAEQFF  0.074051  0.117684  0.008915  1.021244  vj_matched
    4      TRBV13-3*01  TRBJ2-7*01     CASSSGTGGYIYEQYF  0.074051  0.204898  0.015366  1.670224  vj_matched
    ...            ...         ...                  ...       ...       ...       ...       ...         ...
    99995    TRBV14*01  TRBJ2-3*01  CASSPTGGAPYASAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    99996    TRBV17*01  TRBJ2-5*01       CASSRDPTQDTQYF  0.028110  0.124712  0.004930  0.650360  vj_matched
    99997    TRBV14*01  TRBJ2-3*01       CASSSTGGAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    99998  TRBV13-1*01  TRBJ2-1*01      CASSDWGKDYAEQFF  0.106042  0.117684  0.013373  2.622194  vj_matched
    99999     TRBV4*01  TRBJ2-3*01      CASSYDRGSAETLYF  0.040749  0.092039  0.002989  0.068343  vj_matched
    """
    np.random.seed(1)
    tr.synthesize_vj_matched_background(chain='alpha')
    """
               v_a_gene   j_a_gene        cdr3_a_aa        pV        pJ       pVJ   weights      source
    0      TRAV12N-3*01  TRAJ34*02     CAIASNTNKVVF  0.000438  0.000088  0.000088  0.006059  vj_matched
    1       TRAV3D-3*02  TRAJ33*01  CAVSAGADSNYQLIW  0.000088  0.000088  0.000088  0.005122  vj_matched
    2        TRAV3-3*01  TRAJ27*01     CAVSTNTGKLTF  0.014029  0.042964  0.000877  0.277471  vj_matched
    3        TRAV3-3*01  TRAJ26*01    CAVSHNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    4        TRAV3-3*01  TRAJ26*01   CAVSARNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    ...             ...        ...              ...       ...       ...       ...       ...         ...
    99995   TRAV3D-3*02  TRAJ21*01    CAVSVSNYNVLYF  0.000088  0.039982  0.000088  0.003758  vj_matched
    99996    TRAV3-3*01  TRAJ43*01    CAVSENNNNAPRF  0.014029  0.022271  0.000526  0.071093  vj_matched
    99997   TRAV3D-3*02  TRAJ26*01    CAVSGNYAQGLTF  0.000088  0.040947  0.000088  0.000296  vj_matched
    99998    TRAV3-3*01  TRAJ26*01   CAVKGNNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    99999   TRAV9N-2*01  TRAJ15*01      CTYQGGRALIF  0.000088  0.043840  0.000088  0.020438  vj_matched
    """

    """
    tcrdist3's integration of Pgen estimates makes it very easy to look for
    PUBLIC clusters of TCRs (i.e. high number of neighbors) with unlikely
    V(D)J recombinations.
    """
    from tcrdist.public import _neighbors_fixed_radius
    from tcrdist.public import _K_neighbors_fixed_radius

    tr.clone_df['neighbors'] = _neighbors_fixed_radius(pwmat=tr.pw_beta,
                                                       radius=18)
    tr.clone_df['K_neighbors'] = _K_neighbors_fixed_radius(pwmat=tr.pw_beta,
                                                           radius=18)
    tr.clone_df['pgen_cdr3_b_aa_nlog10'] = tr.clone_df['pgen_cdr3_b_aa'].apply(
        lambda x: -1 * np.log10(x))
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(
        lambda x: len(tr.clone_df['subject'][x].unique()))
    # nsubject > 1 implies quasi-publicity
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda x: x > 1)

    # Note one can find neighbors based on paired-chain distances.
    # The columns written above are overwritten by this second pass.
    tr.clone_df['neighbors'] = _neighbors_fixed_radius(pwmat=tr.pw_beta +
                                                       tr.pw_alpha,
                                                       radius=50)
    tr.clone_df['K_neighbors'] = _K_neighbors_fixed_radius(
        pwmat=tr.pw_beta + tr.pw_alpha, radius=50)
    tr.clone_df['pgen_cdr3_b_aa_nlog10'] = tr.clone_df['pgen_cdr3_b_aa'].apply(
        lambda x: -1 * np.log10(x))
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(
        lambda x: len(tr.clone_df['subject'][x].unique()))
    # nsubject > 1 implies quasi-publicity
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda x: x > 1)
def _tcrsampler_svgs(tcrrep,
                     default_background=None,
                     default_background_if_missing=None,
                     cdr3_name='cdr3_b_aa',
                     pwmat_str='pw_cdr3_b_aa',
                     chain='beta',
                     gene_names=['v_b_gene', 'j_b_gene'],
                     combine_olga=False,
                     verbose=True):
    """
    Sample a gene-usage-matched background for every cluster of a TCRrep
    and render palmotif SVG logos.

    For each row of ``tcrrep.hcluster_df`` with ``prune == 0`` a background
    is drawn from a TCRsampler (optionally combined with an OLGA-based
    sampler) and two logos are produced: one against the background and a
    raw one without it. Pruned rows get the placeholder string "PRUNE".

    The function modifies ``tcrrep`` in place: a 'prune' column is added to
    ``hcluster_df`` if missing, and svg and stats columns are added to
    ``hcluster_df_detailed``.

    TODO: could just output a dataframe which would then be concatenated.

    ONLY WORKS WITH _BETA using defaults.

    Parameters
    ----------
    tcrrep : TCRrep
        Instance providing organism, clone_df, hcluster_df,
        hcluster_df_detailed and the pairwise matrix named by pwmat_str.
    default_background, default_background_if_missing
        Forwarded to the sampler returned by _default_sampler.
    cdr3_name : str
        Column of clone_df holding the CDR3 sequences.
    pwmat_str : str
        Attribute name of the pairwise distance matrix on tcrrep.
    chain : str
        Chain label used to pick samplers and to name output columns.
    gene_names : list
        V and J gene columns of clone_df used for gene usage.
    combine_olga : bool
        If True, sequences from the OLGA-based sampler are appended to the
        background. Forced to False for mouse alpha.
    verbose : bool
        If True, print progress messages.

    Returns
    -------
    bool
        Always True.

    Notes
    -----
    TCRSampler.build_background() accepts kwargs; they are fixed here as
    most users won't need to change them.

    max_rows : int
        Maximum clones per v,j pair (per subject)
    stratify_by_subject : bool
        If True, max_rows will apply to v,j,subject. If False, max_rows
        applies to v,j
    use_frequency : bool
        If True, uses frequency for ranking rows. If False, uses raw counts.
    make_singleton : bool
        If True, background is still sorted by frequency or counts, but
        final frequency and counts values are overridden and set to 1.
    """
    from tcrsampler.sampler import TCRsampler
    from palmotif import compute_pal_motif, svg_logo
    import pandas as pd
    from tcrdist.summarize import _select

    if chain == 'alpha' and tcrrep.organism == "mouse":
        # Here we enforce the rule that alpha-mouse cannot use an olga-sampler
        # TODO: This should be removed as soon as TCRsampler can be updated
        # with a valid mouse-alpha simulated background.
        combine_olga = False

    # _default_sampler returns a TCRSampler based on organism and chain
    if verbose:
        print(f"INITIALIZING A TCRSAMPLER")
        print(tcrrep.organism, chain)
    t = _default_sampler(organism=tcrrep.organism, chain=chain)(
        default_background=default_background,
        default_background_if_missing=default_background_if_missing)

    build_kargs = {
        'max_rows': 100,
        'stratify_by_subject': True,
        'use_frequency': True,
        'make_singleton': False
    }
    build_kargs_olga = {
        'max_rows': 1000,
        'stratify_by_subject': False,
        'use_frequency': True,
        'make_singleton': False
    }

    if verbose:
        print(f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs)}")
    t.build_background(**build_kargs)

    # Olga Sampler
    if combine_olga:
        t_olga = _default_sampler_olga(chain=chain,
                                       organism=tcrrep.organism)()
        if verbose:
            print(
                f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs_olga)}"
            )
        # Build the OLGA sampler's own background; the primary sampler `t`
        # keeps the background built above.
        t_olga.build_background(**build_kargs_olga)

    if 'prune' not in tcrrep.hcluster_df.columns:
        if verbose:
            print("NO PRUNE COLUMNS USED ALL SET TO 0")
        tcrrep.hcluster_df['prune'] = 0

    if verbose:
        print("ITERATE THROUGH CLUSTERS")

    svgs = list()
    svgs_raw = list()
    reference_unique = list()
    reference_unique_olga = list()
    reference_size = list()
    reference_size_olga = list()
    percent_missing_sampler = list()
    percent_missing_sampler_olga = list()

    n_rows = tcrrep.hcluster_df.shape[0]
    bar = IncrementalBar(f'Make {chain} SVGs :',
                         max=n_rows,
                         suffix='%(percent)d%%')
    for i, r in tcrrep.hcluster_df.iterrows():
        bar.next()  # exactly one tick per cluster row

        if r['prune'] != 0:
            # Pruned node: skip sampling and logo generation.
            svgs.append("PRUNE")
            svgs_raw.append("PRUNE")
            reference_size.append("PRUNE")
            reference_unique.append("PRUNE")
            percent_missing_sampler.append("PRUNE")
            percent_missing_sampler_olga.append("PRUNE")
            reference_unique_olga.append("PRUNE")
            reference_size_olga.append("PRUNE")
            continue

        neighbors_i = r['neighbors_i']
        # <dfnode> is dataframe with all the clones at a given tree node
        dfnode = tcrrep.clone_df.iloc[neighbors_i, ].copy()
        # <pwnode> Pairwise Matrix for node sequences
        pwnode = getattr(tcrrep, pwmat_str)[neighbors_i, :][:,
                                                            neighbors_i].copy()
        # Centroid: the clone with the smallest summed distance to the rest.
        iloc_idx = pwnode.sum(axis=0).argmin()
        centroid = dfnode[cdr3_name].to_list()[iloc_idx]

        # Compute gene usage at the node, converting to allele_01
        for gene_name in gene_names:
            dfnode[gene_name] = dfnode[gene_name].apply(allele_01)
        gene_usage = dfnode.groupby(
            gene_names).size()  # e.g., ['v_b_gene','j_b_gene']
        gene_usage_tuples = gene_usage.reset_index().to_dict('split')['data']

        # Adjust depth for small nodes, never going below 10.
        adjust_depth = max(10, 10 * round(10 / dfnode.shape[0]))
        sample_depth = adjust_depth * 10
        expected_depth = dfnode.shape[0] * sample_depth

        # Given gene usage use <t>, a TCRsampler instance, to get
        # background seqs; only keep the non-None sequences.
        sampled_rep = t.sample(gene_usage_tuples,
                               flatten=True,
                               depth=sample_depth)
        sampled_rep = [x for x in sampled_rep if x is not None]

        # Percentage missing: the sampler returns None when no v,j pair
        # is present.
        percent_missing = round(
            100 * (1 - (len(sampled_rep) / expected_depth)), 1)
        percent_missing_sampler.append(f"{percent_missing}%")
        reference_unique.append(str(pd.Series(sampled_rep).nunique()))
        reference_size.append(str(pd.Series(sampled_rep).count()))

        if combine_olga:
            # Sequences simulated with the node's V,J gene usage come from
            # the pre-built OLGA sampler. An older approach generated them
            # on the fly with OlgaModel.gen_cdr3s; slower, but it could go
            # much deeper.
            sampled_rep_olga = t_olga.sample(gene_usage_tuples,
                                             flatten=True,
                                             depth=sample_depth)
            sampled_rep_olga = [x for x in sampled_rep_olga if x is not None]
            percent_missing_olga = round(
                100 * (1 - (len(sampled_rep_olga) / expected_depth)), 1)
            percent_missing_sampler_olga.append(f"{percent_missing_olga}%")
            reference_unique_olga.append(
                str(pd.Series(sampled_rep_olga).nunique()))
            reference_size_olga.append(
                str(pd.Series(sampled_rep_olga).count()))
            # HERE WE COMBINE INTO A SINGLE BACKGROUND:
            sampled_rep = sampled_rep + sampled_rep_olga

        node_seqs = _select(df=tcrrep.clone_df,
                            iloc_rows=neighbors_i,
                            col=cdr3_name)
        # Motif against the background
        motif, stat = compute_pal_motif(seqs=node_seqs,
                                        refs=sampled_rep,
                                        centroid=centroid)
        svgs.append(svg_logo(motif, return_str=True))
        # Repeat without references
        raw_motif, raw_stat = compute_pal_motif(seqs=node_seqs,
                                                centroid=centroid)
        svgs_raw.append(svg_logo(raw_motif, return_str=True))
    bar.finish()

    # The standard svg_ includes background, whereas raw has no background
    tcrrep.hcluster_df_detailed[f'svg_{chain}'] = svgs
    tcrrep.hcluster_df_detailed[f'svg_raw_{chain}'] = svgs_raw
    tcrrep.hcluster_df_detailed[f'ref_size_{chain}'] = reference_size
    tcrrep.hcluster_df_detailed[f'ref_unique_{chain}'] = reference_unique
    tcrrep.hcluster_df_detailed[
        f'percent_missing_{chain}'] = percent_missing_sampler
    if combine_olga:
        tcrrep.hcluster_df_detailed[
            f'ref_size_olga_{chain}'] = reference_size_olga
        tcrrep.hcluster_df_detailed[
            f'ref_unique_olga_{chain}'] = reference_unique_olga
        tcrrep.hcluster_df_detailed[
            f'percent_missing_olga_{chain}'] = percent_missing_sampler_olga
    return True