def test_TCRpublic_with_neighborhood_dif():
    """Use values from neighborhood_diff"""
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic
    from tcrdist.rep_diff import neighborhood_diff

    fn = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    df = pd.read_csv(fn)
    tr = TCRrep(cell_df=df[[
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]],
                organism="human",
                chains=["beta"])

    ndif = neighborhood_diff(clone_df=tr.clone_df,
                             pwmat=tr.pw_beta,
                             count_col='count',
                             x_cols=['cohort'],
                             knn_radius=25,
                             test_method="chi2")
    # Add neighbors and other columns of interest
    # from the neighborhood_diff result to the clone_df.
    tr.clone_df = pd.concat([
        tr.clone_df,
        ndif[['neighbors', 'K_neighbors', 'val_0', 'ct_0', 'pvalue']]
    ], axis=1)
    # Because 'neighbors' and 'K_neighbors' are already in the clone_df,
    # TCRpublic.report() uses them instead of finding new ones.
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones_with_ndif.html")
    # Add any neighborhood_diff columns that you want to display
    # in the final report.
    tp.labels.append('val_0')
    tp.labels.append('ct_0')
    tp.labels.append('pvalue')
    # Change the sort column to pvalue rather than publicity.
    tp.sort_columns = ['pvalue']
    # Because you are sorting by pvalue, sort ascending.
    tp.sort_ascending = True
    tp.report()
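# A minimal sketch (not part of the tcrdist3 test suite) showing how the
# neighborhood_diff result can be inspected directly, without generating an
# HTML report. It reuses the same inputs and API calls as the test above;
# the sort/preview at the end is illustrative.
def sketch_inspect_neighborhood_diff():
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import neighborhood_diff

    fn = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    df = pd.read_csv(fn)
    tr = TCRrep(cell_df=df[[
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]],
                organism="human",
                chains=["beta"])
    ndif = neighborhood_diff(clone_df=tr.clone_df,
                             pwmat=tr.pw_beta,
                             count_col='count',
                             x_cols=['cohort'],
                             knn_radius=25,
                             test_method="chi2")
    # Show the ten neighborhoods most strongly enriched by cohort.
    print(ndif.sort_values('pvalue').head(10))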
def find_centers_beta(background_filename,
                      target_filename,
                      ncpus,
                      min_nsubject,
                      ctrl_bkgd=10**-5,
                      prefilter=False):
    import os
    import pandas as pd
    import numpy as np
    import scipy.sparse
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.rep_diff import neighborhood_diff
    from tcrdist.summarize import (_summ, _dist_summ, _select, filter_gt,
                                   filter_is, test_for_subsets,
                                   test_for_almost_subsets)

    df_background = pd.read_csv(background_filename)
    print(df_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    df_mira = pd.read_csv(target_filename)
    df_mira = df_mira[['subject', 'cell_type', 'v_b_gene', 'j_b_gene',
                       'cdr3_b_aa']]
    print(df_mira)
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)

    if prefilter:
        # We can greatly cut down on the number of searches if we drop
        # centroids that lack a minimum level of publicity.
        nn_df = neighborhood_diff(clone_df=tr.clone_df,
                                  pwmat=tr.pw_beta,
                                  count_col='count',
                                  x_cols=['cell_type'],
                                  knn_radius=37)

        def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
            # Tabulate the number of unique subjects at each node.
            neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
                lambda x: len(set(_select(clone_df, iloc_rows=x,
                                          col='subject'))))
            return neighbor_df

        print("TABULATING PUBLIC CLUSTERS")
        nn_df = tabulate_publicity(nn_df, tr.clone_df)
        nn_df = filter_gt(nn_df, 'nsubject', min_nsubject)

        if nn_df.shape[0] == 0:
            # No sufficiently public centroids remain; return an empty
            # DataFrame with the expected columns and a sparse matrix.
            centers_df = pd.DataFrame({}, columns=[
                'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'max_radi',
                'target_hits', 'bkgd_hits', 'bkgd_hits_weighted',
                'bkgd_total', 'ctrl', 'ctrl_weighted', 'target_misses',
                'TR', 'TR2', 'BR_weighted', 'RR_weighted', 'OR_weighted',
                'chi2dist', 'target_neighbors', 'target_seqs',
                'background_neighbors', 'background_seqs', 'background_v',
                'background_j', 'regex', 'target_re_hits', 'bkgd_re_hits',
                'bkgd_re_weighted_hits', 'TR_re', 'BR_re_weighted',
                'RR_re_weighted', 'OR_re_weighted', 'chi2re', 'chi2joint',
                'nsubject'])
            tr.pw_beta[tr.pw_beta == 0] = 1  # preserve true zeros as 1
            tr.pw_beta[tr.pw_beta > 50] = 0  # drop distances greater than 50
            pw_beta_sparse = scipy.sparse.csr_matrix(tr.pw_beta)
            return centers_df, pw_beta_sparse

        tr.clone_df = tr.clone_df.loc[nn_df.index, :].reset_index(drop=True)
        del nn_df
        # Compute pairwise distances again with the filtered set.
        tr.compute_distances()

    # Compute pgens automatically; currently parmap will max out CPUs
    # on this step.
    print("COMPUTING PROBABILITY OF GENERATION")
    auto_pgen(tr)

    print(f"COMPUTING RECT DIST "
          f"{tr.clone_df.shape[0]}x{tr_background.clone_df.shape[0]}")
    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=ctrl_bkgd,  # alternative: ctrl_bkgd = 2 * 10**-5
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=ncpus,
        thresholds=[x for x in range(0, 38, 2)],  # radii 0, 2, ..., 36
        generate_regex=True,
        test_regex=True)

    def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
        # Tabulate the number of unique subjects at each node.
        neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
            lambda x: len(set(_select(clone_df, iloc_rows=x, col='subject'))))
        return neighbor_df

    centers_df = tabulate_publicity(neighbor_df=centers_df,
                                    clone_df=tr.clone_df,
                                    col_nn='target_neighbors')

    tr.rw_beta[tr.rw_beta == 0] = 1  # preserve true zeros as 1
    tr.rw_beta[tr.rw_beta > 50] = 0  # drop distances greater than 50
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    # scipy.sparse.save_npz(output_matrix_filename, rw_beta_sparse)
    return centers_df, rw_beta_sparse
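# A hedged usage sketch for find_centers_beta(); all file paths and
# parameter values below are hypothetical placeholders. The save_npz call
# mirrors the commented-out line in the function body.
if __name__ == "__main__":
    import scipy.sparse
    centers_df, rw_beta_sparse = find_centers_beta(
        background_filename='background_beta.csv',  # hypothetical path
        target_filename='mira_target_beta.csv',     # hypothetical path
        ncpus=4,
        min_nsubject=3,
        ctrl_bkgd=10**-5,
        prefilter=True)
    centers_df.to_csv('centers_beta.csv', index=False)
    scipy.sparse.save_npz('rw_beta.sparse.npz', rw_beta_sparse)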
def run_one(ref_fn, rep_fn, ss=-1, ncpus=1):
    import os
    import numpy as np
    import pandas as pd
    import scipy.spatial.distance
    import scipy.cluster.hierarchy as sch
    from tcrdist.repertoire import TCRrep
    from tcrdist.automate import auto_pgen
    from tcrdist.neighbors import compute_ecdf
    # NOTE: _pwrect (rectangular pairwise distances) is assumed to be
    # defined elsewhere in this script.

    ref_df = pd.read_csv(ref_fn)
    ref_df.columns = [{
        'v_b_name': 'v_b_gene',
        'j_b_name': 'j_b_gene',
        'cdr3_b_aa': 'cdr3_b_aa'
    }.get(c, c) for c in ref_df.columns]
    ref_df.loc[:, 'count'] = 1
    if ss == -1:
        ref_tr = TCRrep(cell_df=ref_df,
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)
    else:
        ref_tr = TCRrep(cell_df=ref_df.sample(n=ss, replace=False),
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)

    rep_df = pd.read_csv(rep_fn).assign(count=1)
    tr = TCRrep(cell_df=rep_df[[
        'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope', 'experiment',
        'subject', 'count'
    ]],
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)
    if tr.clone_df.shape[0] > 6000:
        """Limit size of MIRA set to 6000"""
        tr.clone_df = tr.clone_df.sample(n=6000,
                                         replace=False,
                                         random_state=110820)
    auto_pgen(tr)

    out = []
    print(rep_fn)
    for metric in ['tcrdist', 'tcrdist-cdr3', 'edit']:
        if 'tcr' in metric:
            metric_thresholds = np.arange(76)
            fcluster_thresholds = [0, 25, 50]
        else:
            metric_thresholds = np.arange(9)
            fcluster_thresholds = [0, 1, 2]

        """Enforce no clustering analysis"""
        fcluster_thresholds = [0]

        epitope_name = os.path.split(rep_fn)[1].split('.')[0]
        epitope_name = epitope_name.replace('mira_epitope_', 'M')
        # rep_fn = opj(_fg_data, 'ncov_tcrs/adaptive_bio_r2/tcrs_by_mira_epitope/pw_computed', rep_fn)
        print(f'\t{metric}')
        """with open(rep_fn, 'rb') as fh:
            tr = dill.load(fh)"""

        """Compute repertoire PW distances and create flat clusters"""
        rep_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed MIRA set pwrect.')

        ref_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            clone_df2=ref_tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed reference pwrect.')

        for fclust_thresh in fcluster_thresholds:
            if fclust_thresh > 0:
                rep_pwvec = scipy.spatial.distance.squareform(
                    rep_pwmat, force='tovector')
                Z = sch.linkage(rep_pwvec, method='complete')
                labels = sch.fcluster(Z, t=fclust_thresh,
                                      criterion='distance')
            else:
                labels = np.arange(1, rep_pwmat.shape[0] + 1)

            """Compute ECDF for each cluster within the repertoire"""
            # rep_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                rep_ecdf = compute_ecdf(np.mean(
                    rep_pwmat[lab_ind, :][:, ~lab_ind], axis=0),
                                        thresholds=metric_thresholds)
                tmp_df = pd.DataFrame({
                    'ecdf': rep_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='rep',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)

            """Compute distances to the reference for each cluster and
            compute ECDF vs reference"""
            # ref_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                ref_ecdf = compute_ecdf(np.mean(ref_pwmat[lab_ind, :],
                                                axis=0),
                                        thresholds=metric_thresholds,
                                        weights=ref_tr.clone_df['weights'])
                tmp_df = pd.DataFrame({
                    'ecdf': ref_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='ref',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)

    out = pd.concat(out, axis=0)
    return out
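# A minimal usage sketch for run_one(); the file paths are hypothetical
# placeholders, and run_one additionally depends on the module-level _pwrect
# helper defined elsewhere in this script. ss=-1 keeps the full reference
# set; a positive ss subsamples it.
ecdf_df = run_one(ref_fn='olga_human_beta_reference.csv',  # hypothetical path
                  rep_fn='mira_epitope_55_524.csv',        # hypothetical path
                  ss=-1,
                  ncpus=2)
print(ecdf_df.groupby(['metric', 'versus']).size())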
import numpy as np
import pandas as pd
import matplotlib as mpl
from tcrdist.repertoire import TCRrep
from tcrdist.pgen import OlgaModel

df = pd.read_csv(dash_fn)
tr = TCRrep(cell_df=df,
            organism='human',
            chains=['alpha', 'beta'],
            compute_distances=False)

"""Compute pgen of each epitope-specific sequence"""
olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")
tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_b_aa)
tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_a_aa)

"""Force pgen > 0: there were 7 CDR3 alphas with pgen = 0"""
tr.clone_df = tr.clone_df.loc[(tr.clone_df['pgen_cdr3_a_aa'] > 0)
                              & (tr.clone_df['pgen_cdr3_b_aa'] > 0)]

norm_pgen = mpl.colors.LogNorm(vmin=1e-10, vmax=1e-6)
norm_a = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_a_aa'].min(),
                            vmax=tr.clone_df['pgen_cdr3_a_aa'].max())
norm_b = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_b_aa'].min(),
                            vmax=tr.clone_df['pgen_cdr3_b_aa'].max())

def color_lu(norm, colors, pgen):
    # Map a pgen onto a discrete color list via its log-normalized position.
    i = int(np.floor(norm(pgen) * len(colors)))
    # Clamp the index into the valid range of the color list.
    if i >= len(colors):
        i = len(colors) - 1
    if i < 0:
        i = 0
    return colors[i]
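# A small illustrative sketch of color_lu in use; the viridis palette and
# the example pgen value are assumptions, not part of the original script.
import matplotlib.pyplot as plt
palette = [mpl.colors.rgb2hex(c)
           for c in plt.cm.viridis(np.linspace(0, 1, 10))]
hex_color = color_lu(norm_b, palette, 1e-9)  # color for a pgen of 1e-9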