def test_current_example():
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    fn_mira_background = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.olga100K_brit100K_bkgd.csv'
    fn_mira_background = os.path.join('tcrdist', 'data', 'covid19', fn_mira_background)
    df_background = pd.read_csv(fn_mira_background)
    tr_background = TCRrep(
        cell_df=df_background.copy(),
        organism="human",
        chains=['beta'],
        compute_distances=False)

    fn_mira = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv'
    fn_mira = os.path.join('tcrdist', 'data', 'covid19', fn_mira)
    df_mira = pd.read_csv(fn_mira)
    df_mira = df_mira[['v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    tr = TCRrep(
        cell_df=df_mira.copy(),
        organism='human',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv',
        store_all_cdr=False,
        compute_distances=True)

    auto_pgen(tr)
    tr.compute_rect_distances(df=tr.clone_df, df2=tr_background.clone_df, store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=10**-6,
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=2,
        thresholds=[x for x in range(0, 50, 2)],
        generate_regex=True,
        test_regex=True)

    out_fn_center_df = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.centers_df.csv'
    out_fn_rw_beta_sparse_matrix = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    centers_df.to_csv(out_fn_center_df, index=False)

    # Encode the rectangular distance matrix sparsely: recode true zero
    # distances as 1, then zero out everything above the radius of interest
    # so those entries are dropped on conversion to CSR format.
    tr.rw_beta[tr.rw_beta == 0] = 1   # set true zeros to 1 so they survive sparsification
    tr.rw_beta[tr.rw_beta > 50] = 0   # drop distances greater than 50
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(out_fn_rw_beta_sparse_matrix, rw_beta_sparse)
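# A minimal sketch (not part of the original test) showing how the sparse
# matrix saved above could be read back. Because true zero distances were
# recoded to 1 before sparsification, stored values of 1 should be treated
# as distance 0 on load.
def example_load_sparse_rw_beta():
    import scipy.sparse
    fn = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    rw_sparse = scipy.sparse.load_npz(fn)
    # Stored entries are distances <= 50; everything else was zeroed out.
    rw_dense = rw_sparse.toarray()
    rw_dense[rw_dense == 1] = 0  # undo the zero -> 1 recoding
    return rw_dense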
def test_old_example():
    """
    The purpose of this example is to show how neighborhood radii can be
    chosen based on the background discovery rate.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.regex import _index_to_regex_str, _index_to_seqs
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets

    fn = os.path.join('tcrdist', 'data', 'covid19', "m60_bkgd_test_input.csv")
    df_background = pd.read_csv(fn)
    tr_background = TCRrep(
        cell_df=df_background,
        organism="human",
        chains=['beta'],
        compute_distances=False)
    tr_background.clone_df['weights'] = 1

    fn = os.path.join('tcrdist', 'data', 'covid19', "m60_test_input.csv")
    df = pd.read_csv(fn)
    tr = TCRrep(
        cell_df=df,
        organism='human',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv')

    auto_pgen(tr)
    tr.compute_rect_distances(df=tr.clone_df, df2=tr_background.clone_df, store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=2 * 10**-5,
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=2,
        thresholds=[x for x in range(0, 50, 2)],
        generate_regex=True,
        test_regex=True)

    centers_df = centers_df.sort_values(['target_hits'], ascending=False)
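# A minimal sketch of how the background discovery rate controlled by
# <ctrl_bkgd> can be inspected directly, assuming <tr> and <tr_background>
# are built as in test_old_example() and tr.rw_beta has been computed. It
# mirrors the weighted distance_ecdf call used in find_metaclonotypes() below.
def example_background_ecdf(tr, tr_background):
    from tcrdist.ecdf import distance_ecdf
    thresholds, ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds=range(0, 50, 2),
        weights=tr_background.clone_df['weights'],
        absolute_weight=True)
    # Row i gives, for clone i, the weighted fraction of background TCRs
    # within each radius; a radius is acceptable while this rate stays
    # below the chosen ctrl_bkgd (here 2 * 10**-5).
    return thresholds, ecdf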
def test_auto_pgen_human_alpha_beta():
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.automate import auto_pgen

    df = pd.read_csv("dash_human.csv").sample(10, random_state=3)
    tr = TCRrep(
        cell_df=df,
        organism='human',
        chains=['alpha', 'beta'],
        db_file='alphabeta_gammadelta_db.tsv')
    tr = auto_pgen(tr)
    assert isinstance(tr.clone_df.pgen_cdr3_b_aa, pd.Series)
def test_auto_pgen_mouse_alpha_beta_ValueError():
    """
    If auto_pgen is called on a TCRrep with organism == 'mouse' and chains
    including 'alpha', it raises a ValueError.
    """
    import pytest
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.automate import auto_pgen

    df = pd.read_csv("dash.csv").sample(10, random_state=3)
    tr = TCRrep(
        cell_df=df,
        organism='mouse',
        chains=['alpha', 'beta'],
        db_file='alphabeta_gammadelta_db.tsv')
    with pytest.raises(ValueError):
        tr = auto_pgen(tr)
def find_centers_beta(background_filename, target_filename, ncpus, min_nsubject, ctrl_bkgd=10**-5, prefilter=False):
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.rep_diff import neighborhood_diff
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets
    import scipy.sparse

    df_background = pd.read_csv(background_filename)
    print(df_background)
    tr_background = TCRrep(
        cell_df=df_background.copy(),
        organism="human",
        chains=['beta'],
        compute_distances=False)

    df_mira = pd.read_csv(target_filename)
    df_mira = df_mira[['subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    print(df_mira)
    tr = TCRrep(
        cell_df=df_mira.copy(),
        organism='human',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv',
        store_all_cdr=False,
        compute_distances=True)

    if prefilter:
        # We can greatly cut down on the number of searches by dropping
        # centroids that lack a minimum level of publicity.
        nn_df = neighborhood_diff(
            clone_df=tr.clone_df,
            pwmat=tr.pw_beta,
            count_col='count',
            x_cols=['cell_type'],
            knn_radius=37)

        def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
            # Tabulate the number of unique subjects at each node
            neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
                lambda x: len(set(_select(clone_df, iloc_rows=x, col='subject'))))
            return neighbor_df

        print("TABULATING PUBLIC CLUSTERS")
        nn_df = tabulate_publicity(nn_df, tr.clone_df)
        nn_df = filter_gt(nn_df, 'nsubject', min_nsubject)

        if nn_df.shape[0] == 0:
            # No sufficiently public centroids remain; return an empty
            # centers DataFrame with the expected columns.
            centers_df = pd.DataFrame({}, columns=[
                'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'max_radi',
                'target_hits', 'bkgd_hits', 'bkgd_hits_weighted', 'bkgd_total',
                'ctrl', 'ctrl_weighted', 'target_misses', 'TR', 'TR2',
                'BR_weighted', 'RR_weighted', 'OR_weighted', 'chi2dist',
                'target_neighbors', 'target_seqs', 'background_neighbors',
                'background_seqs', 'background_v', 'background_j', 'regex',
                'target_re_hits', 'bkgd_re_hits', 'bkgd_re_weighted_hits',
                'TR_re', 'BR_re_weighted', 'RR_re_weighted', 'OR_re_weighted',
                'chi2re', 'chi2joint', 'nsubject'])
            tr.pw_beta[tr.pw_beta == 0] = 1  # set true zeros to 1 so they survive sparsification
            tr.pw_beta[tr.pw_beta > 50] = 0  # drop distances greater than 50
            pw_beta_sparse = scipy.sparse.csr_matrix(tr.pw_beta)
            return centers_df, pw_beta_sparse

        tr.clone_df = tr.clone_df.loc[nn_df.index, :].reset_index(drop=True)
        del nn_df
        # Compute pairwise distances again with the filtered set
        tr.compute_distances()

    # Compute pgens automatically; note that parmap will max out CPUs on this step
    print("COMPUTING PROBABILITY OF GENERATION")
    auto_pgen(tr)

    print(f"COMPUTING RECT DIST {tr.clone_df.shape[0]}x{tr_background.clone_df.shape[0]}")
    tr.compute_rect_distances(df=tr.clone_df, df2=tr_background.clone_df, store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=ctrl_bkgd,  # e.g., 2 * 10**-5
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=ncpus,
        thresholds=[x for x in range(0, 38, 2)],  # test radii up to a maximum of 36
        generate_regex=True,
        test_regex=True)

    def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
        # Tabulate the number of unique subjects at each node
        neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
            lambda x: len(set(_select(clone_df, iloc_rows=x, col='subject'))))
        return neighbor_df

    centers_df = tabulate_publicity(
        neighbor_df=centers_df,
        clone_df=tr.clone_df,
        col_nn='target_neighbors')

    tr.rw_beta[tr.rw_beta == 0] = 1  # set true zeros to 1 so they survive sparsification
    tr.rw_beta[tr.rw_beta > 50] = 0  # drop distances greater than 50
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    # scipy.sparse.save_npz(output_matrix_filename, rw_beta_sparse)
    return centers_df, rw_beta_sparse
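# A minimal usage sketch for find_centers_beta(), not part of the original
# workflow. The file names are hypothetical placeholders; the background CSV
# must include a 'weights' column, and the target CSV must include 'subject',
# 'cell_type', and beta-chain gene/CDR3 columns.
def example_find_centers_beta():
    import scipy.sparse
    centers_df, rw_beta_sparse = find_centers_beta(
        background_filename='my_background.csv',        # hypothetical input
        target_filename='my_antigen_enriched.csv',      # hypothetical input
        ncpus=2,
        min_nsubject=2,
        ctrl_bkgd=10**-5,
        prefilter=True)
    centers_df.to_csv('my_centers.csv', index=False)    # hypothetical output
    scipy.sparse.save_npz('my_rw_beta.sparse.npz', rw_beta_sparse)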
def run_one(ref_fn, rep_fn, ss=-1, ncpus=1):
    import os
    import numpy as np
    import pandas as pd
    import scipy.spatial.distance
    import scipy.cluster.hierarchy as sch
    from tcrdist.repertoire import TCRrep
    from tcrdist.automate import auto_pgen
    from tcrdist.neighbors import compute_ecdf
    # NOTE: _pwrect (a rectangular pairwise-distance helper) is assumed to be
    # defined elsewhere in this module.

    ref_df = pd.read_csv(ref_fn)
    # Harmonize reference column names with the tcrdist3 convention
    ref_df.columns = [{'v_b_name': 'v_b_gene',
                       'j_b_name': 'j_b_gene',
                       'cdr3_b_aa': 'cdr3_b_aa'}.get(c, c) for c in ref_df.columns]
    ref_df.loc[:, 'count'] = 1
    if ss == -1:
        ref_tr = TCRrep(cell_df=ref_df,
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)
    else:
        ref_tr = TCRrep(cell_df=ref_df.sample(n=ss, replace=False),
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)

    rep_df = pd.read_csv(rep_fn).assign(count=1)
    tr = TCRrep(cell_df=rep_df[['v_b_gene', 'j_b_gene', 'cdr3_b_aa',
                                'epitope', 'experiment', 'subject', 'count']],
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)
    if tr.clone_df.shape[0] > 6000:
        # Limit the size of the MIRA set to 6000 clones
        tr.clone_df = tr.clone_df.sample(n=6000, replace=False, random_state=110820)

    auto_pgen(tr)

    out = []
    print(rep_fn)
    for metric in ['tcrdist', 'tcrdist-cdr3', 'edit']:
        if 'tcr' in metric:
            metric_thresholds = np.arange(76)
            fcluster_thresholds = [0, 25, 50]
        else:
            metric_thresholds = np.arange(9)
            fcluster_thresholds = [0, 1, 2]
        # Enforce no clustering analysis
        fcluster_thresholds = [0]

        epitope_name = os.path.split(rep_fn)[1].split('.')[0]
        epitope_name = epitope_name.replace('mira_epitope_', 'M')
        print(f'\t{metric}')

        # Compute repertoire pairwise distances and create flat clusters
        rep_pwmat = _pwrect(tr, clone_df1=tr.clone_df, metric=metric, ncpus=ncpus)
        print('Computed MIRA set pwrect.')
        ref_pwmat = _pwrect(tr, clone_df1=tr.clone_df, clone_df2=ref_tr.clone_df, metric=metric, ncpus=ncpus)
        print('Computed reference pwrect.')

        for fclust_thresh in fcluster_thresholds:
            if fclust_thresh > 0:
                rep_pwvec = scipy.spatial.distance.squareform(rep_pwmat, force='tovector')
                Z = sch.linkage(rep_pwvec, method='complete')
                labels = sch.fcluster(Z, t=fclust_thresh, criterion='distance')
            else:
                labels = np.arange(1, rep_pwmat.shape[0] + 1)

            # Compute the ECDF for each cluster within the repertoire
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                rep_ecdf = compute_ecdf(
                    np.mean(rep_pwmat[lab_ind, :][:, ~lab_ind], axis=0),
                    thresholds=metric_thresholds)
                tmp_df = pd.DataFrame({'ecdf': rep_ecdf, 'thresholds': metric_thresholds})
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='rep',
                    pgen=np.median(tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)

            # Compute distances to the reference for each cluster and
            # compute the ECDF vs the reference
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                ref_ecdf = compute_ecdf(
                    np.mean(ref_pwmat[lab_ind, :], axis=0),
                    thresholds=metric_thresholds,
                    weights=ref_tr.clone_df['weights'])
                tmp_df = pd.DataFrame({'ecdf': ref_ecdf, 'thresholds': metric_thresholds})
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='ref',
                    pgen=np.median(tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)

    out = pd.concat(out, axis=0)
    return out
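# A minimal usage sketch for run_one(), not part of the original analysis.
# File names are hypothetical placeholders; the reference file must include a
# 'weights' column for the weighted ECDF, and ss=-1 uses the full reference.
def example_run_one():
    ecdf_df = run_one(
        ref_fn='my_reference_background.csv',   # hypothetical input
        rep_fn='my_mira_epitope_set.csv',       # hypothetical input
        ss=-1,
        ncpus=2)
    return ecdf_df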
# Module-level requirements for find_metaclonotypes(); <path_to_base> is the
# tcrdist3 install location, as in the tcrdist3 tutorials.
import os
import numpy as np
import pandas as pd
import scipy.sparse
from tcrdist.paths import path_to_base

def find_metaclonotypes(
        project_path="tutorial48",
        source_path=os.path.join(path_to_base, 'tcrdist', 'data', 'covid19'),
        antigen_enriched_file='mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
        ncpus=4,
        seed=3434):
    """
    This function encapsulates a complete workflow for finding
    meta-clonotypes in antigen-enriched data.
    """
    np.random.seed(seed)
    if not os.path.isdir(project_path):
        os.mkdir(project_path)
    ############################################################################
    # Step 1: Select and load an antigen-enriched (sub)repertoire.         ####
    ############################################################################
    print(f"INITIATING A TCRrep() with {antigen_enriched_file}")
    assert os.path.isfile(os.path.join(source_path, antigen_enriched_file))
    # Read the file into a Pandas DataFrame <df>
    df = pd.read_csv(os.path.join(source_path, antigen_enriched_file))
    # Drop cells without any gene-usage information
    df = df[(df['v_b_gene'].notna()) & (df['j_b_gene'].notna())]
    # Initialize a TCRrep class, using ONLY columns that are complete and that
    # uniquely define a clone. The class provides a 'count' column if none is
    # present. Counts of identical subject:VCDR3 'clones' will be aggregated
    # into a TCRrep.clone_df.
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df=df[['subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa']],
                organism="human",
                chains=['beta'],
                compute_distances=True)
    tr.cpus = ncpus
    ############################################################################
    # Step 1.1: Estimate Probability of Generation                         ####
    ############################################################################
    # It will be useful later to know the pgen of each clone.
    from tcrdist.automate import auto_pgen
    print("COMPUTING PGEN WITH OLGA (Sethna et al 2018)")
    print("FOR ANTIGEN-ENRICHED CLONES TO BE USED FOR SUBSEQUENT ANALYSES")
    auto_pgen(tr)

    # Tip: Users of tcrdist3 should be aware that, by default, a <TCRrep.clone_df>
    # DataFrame is created out of non-redundant cells in the cell_df, and
    # pairwise distance matrices are automatically computed.
    # Notice that the attributes <tr.clone_df>, <tr.pw_beta>, and
    # <tr.pw_cdr3_b_aa> are immediately accessible.
    # Attributes <tr.pw_pmhc_b_aa>, <tr.pw_cdr2_b_aa>, and <tr.pw_cdr1_b_aa>
    # are also available if <TCRrep.store_all_cdr> is set to True.
    # For large datasets, i.e., > 15,000 clones, this approach may consume too
    # much memory, so <TCRrep.compute_distances> is automatically set to False.
    ############################################################################
    # Step 2: Synthesize an Inverse Probability Weighted VJ Matched Background #
    ############################################################################
    # Generating an appropriate set of unenriched reference TCRs is important;
    # for each set of antigen-associated TCRs, discovered by MIRA, we created a
    # two-part background. One part consists of 100,000 synthetic TCRs whose
    # V-gene and J-gene frequencies match those in the antigen-enriched
    # repertoire, generated with the software OLGA (Sethna et al. 2019; Marcou
    # et al. 2018). The other part consists of 100,000 umbilical cord blood
    # TCRs sampled uniformly from 8 subjects (Britanova et al., 2017). This mix
    # balances dense sampling of sequences near the biochemical neighborhoods
    # of interest with broad sampling of TCRs from an antigen-naive repertoire.
    # Importantly, we adjust for the biased sampling by using the V- and J-gene
    # frequencies observed in the cord-blood data (see Methods for details
    # about the inverse probability weighting adjustment). Using this approach
    # we are able to estimate the abundance of TCRs similar to a centroid TCR
    # in an unenriched background repertoire of ~1,000,000 TCRs, using a
    # comparatively modest background dataset of 200,000 TCRs. While this
    # estimate may understate the true specificity, since some of the
    # neighborhood TCRs in the unenriched background repertoire may in fact
    # recognize the antigen of interest, it is useful for prioritizing
    # neighborhoods and for selecting a radius for each neighborhood that
    # balances sensitivity and specificity.
    # Initialize a TCRsampler -- human, beta, umbilical cord blood from 8 people.
    print("USING tcrsampler TO CONSTRUCT A CUSTOM V-J MATCHED BACKGROUND")
    from tcrsampler.sampler import TCRsampler
    ts = TCRsampler(default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    # Stratify the sample so that each subject contributes similarly to the
    # estimate of gene-usage frequency.
    from tcrdist.background import get_stratified_gene_usage_frequency, get_gene_frequencies, sample_britanova
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)
    # Synthesize an inverse probability weighted V,J gene background that
    # matches usage in your enriched repertoire.
    df_vj_background = tr.synthesize_vj_matched_background(ts=ts, chain='beta')
    # Get a randomly drawn stratified sample of beta-chain cord-blood TCRs from
    # Britanova et al. 2016, "Dynamics of Individual T Cell Repertoires: From
    # Cord Blood to Centenarians".
    df_britanova_100K = sample_britanova(size=100000)
    # Append frequency columns using the sampler above
    df_britanova_100K = get_gene_frequencies(ts=ts, df=df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"
    # Combine the two parts of the background into a single DataFrame
    df_bkgd = pd.concat([df_vj_background.copy(), df_britanova_100K.copy()],
                        axis=0).reset_index(drop=True)
    # Assert that the background has the expected number of rows.
    assert df_bkgd.shape[0] == 200000
    # Save the background for future use
    background_outfile = os.path.join(project_path, f"{antigen_enriched_file}.olga100K_brit100K_bkgd.csv")
    print(f'WRITING {background_outfile}')
    df_bkgd.to_csv(background_outfile, index=False)
    # Load the background into a TCRrep without computing pairwise distances
    # (i.e., compute_distances = False)
    tr_bkgd = TCRrep(
        cell_df=df_bkgd,
        organism="human",
        chains=['beta'],
        compute_distances=False)
    # Compute rectangular distances, that is, distances between each clone in
    # the antigen-enriched repertoire and each TCR in the background.
    # With a single CPU and < 10 GB RAM, 5E2 x 2E5 = 100 million pairwise
    # distances across CDR1, CDR2, CDR2.5, and CDR3:
    # 1min 34s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
    # %timeit -r 1 tr.compute_rect_distances(df=tr.clone_df, df2=tr_bkgd.clone_df, store=False)
    ############################################################################
    # Step 4: Calculate Distances                                          ####
    ############################################################################
    print("COMPUTING RECTANGULAR DISTANCE")
    tr.compute_sparse_rect_distances(
        df=tr.clone_df,
        df2=tr_bkgd.clone_df,
        radius=50,
        chunk_size=100)
    scipy.sparse.save_npz(os.path.join(project_path, f"{antigen_enriched_file}.rw_beta.npz"), tr.rw_beta)
    # Tip: For larger datasets you can use the sparse implementation:
    # 30.8 s ± 0 ns per loop; tr.cpus = 6
    # %timeit -r 1 tr.compute_sparse_rect_distances(df=tr.clone_df, df2=tr_bkgd.clone_df, radius=50, chunk_size=85)
    ############################################################################
    # Step 5: Examine Density ECDFS                                        ####
    ############################################################################
    # Investigate the density of neighbors to each TCR, based on an expanding
    # distance radius.
    from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs
    import matplotlib.pyplot as plt
    # Compute the empirical cumulative distribution function (ECDF),
    # comparing Antigen Enriched TCRs against themselves.
    thresholds, antigen_enriched_ecdf = distance_ecdf(
        tr.pw_beta,
        thresholds=range(0, 50, 2))
    # Compute the ECDF comparing Antigen Enriched TCRs against the 200K
    # inverse-probability-weighted background.
    thresholds, background_ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds=range(0, 50, 2),
        weights=tr_bkgd.clone_df['weights'],
        absolute_weight=True)
    # Plot ECDFs similar to those in the tcrdist3 manuscript
    # antigen_enriched_ecdf[antigen_enriched_ecdf == antigen_enriched_ecdf.min()] = 1E-10
    f1 = _plot_manuscript_ecdfs(
        thresholds,
        antigen_enriched_ecdf,
        ylab='Proportion of Antigen Enriched TCRs',
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
        min_freq=1E-10)
    f1.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_AER_plot.png'))
    f2 = _plot_manuscript_ecdfs(
        thresholds,
        background_ecdf,
        ylab='Proportion of Reference TCRs',
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
        min_freq=1E-10)
    f2.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_BUR_plot.png'))
    ############################################################################
    # Step 6: Find optimal radii (theta = 1E5)                             ####
    ############################################################################
    # To ascertain which meta-clonotypes are likely to be most specific,
    # take advantage of the existing function <bkgd_cntl_nn2>.
    # ================================== 1E5 ===================================
    level_tag = '1E5'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_bkgd,
        weights=tr_bkgd.clone_df.weights,
        ctrl_bkgd=10**-5,
        col='cdr3_b_aa',
        add_cols=['v_b_gene', 'j_b_gene'],
        ncpus=4,
        include_seq_info=True,
        thresholds=[x for x in range(0, 50, 2)],
        generate_regex=True,
        test_regex=True,
        forced_max_radius=36)
    ############################################################################
    # Step 6.2: (theta = 1E5) ALL meta-clonotypes .tsv file                  ##
    ############################################################################
    # Save centers to project_path for future use
    centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    # Many meta-clonotypes contain redundant information. We can winnow them
    # down to a less redundant list by ranking clonotypes from most to least
    # specific.
    #   <min_nsubject> is the minimum publicity of the meta-clonotype;
    #   <min_nr> is the minimum non-redundancy.
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta,
        radius_list=centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x: len(x))
    # Determine how many subjects <nsubject> are in each set of neighbors
    centers_df['nsubject'] = centers_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df=centers_df,
        rank_column='chi2joint',
        min_nsubject=2,
        min_nr=1)
    ############################################################################
    # Step 6.3: (theta = 1E5) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    ############################################################################
    # Step 6.4: (theta = 1E5) Output Meta-Clonotypes HTML Summary           ###
    ############################################################################
    # Here we make an SVG logo for each NR meta-clonotype
    if ranked_centers_df.shape[0] > 0:
        from progress.bar import IncrementalBar
        from tcrdist.public import make_motif_logo
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max=ranked_centers_df.shape[0])
        for i, r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene = r[v_gene_name]
            svg, svg_raw = make_motif_logo(
                tcrsampler=ts,
                pwmat=tr.pw_beta,
                clone_df=tr.clone_df,
                centroid=centroid,
                v_gene=v_gene,
                radius=r['radius'],
                pwmat_str='pw_beta',
                cdr3_name='cdr3_b_aa',
                v_name='v_b_gene',
                gene_names=['v_b_gene', 'j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next()
        bar.finish()
        ranked_centers_df['svg'] = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')

        labels = ['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'radius', 'regex',
                  'nsubject', 'K_neighbors', 'bkgd_hits_weighted',
                  'chi2dist', 'chi2re', 'chi2joint']
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # ================================= HTML ===================================
        with open(output_html_name, 'w') as output_handle:
            for i, r in ranked_centers_df.iterrows():
                svg, svg_raw = r['svg'], r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")
    # To ascertain which meta-clonotypes are likely to be most specific,
    # take advantage of the existing function <bkgd_cntl_nn2>.
    # ================================== 1E6 ===================================
    ############################################################################
    # Step 6.5: Find optimal radii (theta = 1E6)                            ###
    ############################################################################
    level_tag = '1E6'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_bkgd,
        weights=tr_bkgd.clone_df.weights,
        ctrl_bkgd=10**-6,
        col='cdr3_b_aa',
        add_cols=['v_b_gene', 'j_b_gene'],
        ncpus=4,
        include_seq_info=True,
        thresholds=[x for x in range(0, 50, 2)],
        generate_regex=True,
        test_regex=True,
        forced_max_radius=36)
    ############################################################################
    # Step 6.6: (theta = 1E6) ALL meta-clonotypes .tsv file                  ##
    ############################################################################
    # Save centers to project_path for future use
    centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    # As above, winnow the meta-clonotypes down to a less redundant list by
    # ranking clonotypes from most to least specific.
    #   <min_nsubject> is the minimum publicity of the meta-clonotype;
    #   <min_nr> is the minimum non-redundancy.
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta,
        radius_list=centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x: len(x))
    # Determine how many subjects <nsubject> are in each set of neighbors
    centers_df['nsubject'] = centers_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df=centers_df,
        rank_column='chi2joint',
        min_nsubject=2,
        min_nr=1)
    ############################################################################
    # Step 6.7: (theta = 1E6) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv(
        os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")
    ############################################################################
    # Step 6.8: (theta = 1E6) Output Meta-Clonotypes HTML Summary           ###
    ############################################################################
    # Here we make an SVG logo for each meta-clonotype
    from progress.bar import IncrementalBar
    from tcrdist.public import make_motif_logo
    if ranked_centers_df.shape[0] > 0:
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max=ranked_centers_df.shape[0])
        for i, r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene = r[v_gene_name]
            svg, svg_raw = make_motif_logo(
                tcrsampler=ts,
                pwmat=tr.pw_beta,
                clone_df=tr.clone_df,
                centroid=centroid,
                v_gene=v_gene,
                radius=r['radius'],
                pwmat_str='pw_beta',
                cdr3_name='cdr3_b_aa',
                v_name='v_b_gene',
                gene_names=['v_b_gene', 'j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next()
        bar.finish()
        ranked_centers_df['svg'] = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')

        labels = ['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'radius', 'regex',
                  'nsubject', 'K_neighbors', 'bkgd_hits_weighted',
                  'chi2dist', 'chi2re', 'chi2joint']
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # ================================= HTML ===================================
        with open(output_html_name, 'w') as output_handle:
            for i, r in ranked_centers_df.iterrows():
                svg, svg_raw = r['svg'], r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")
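# A minimal usage sketch for find_metaclonotypes(), using the default MIRA
# file shipped with tcrdist3. Outputs (.tsv, .html, .npz, .png) are written
# to <project_path>.
def example_find_metaclonotypes():
    find_metaclonotypes(
        project_path="tutorial48",
        antigen_enriched_file='mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
        ncpus=2,
        seed=3434)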