def _generate(self) -> ReportResult:
    """Cluster the positive examples with TCRdist and look for motifs per cluster.

    Steps, in order:
      1. build the result directory;
      2. extract the positive-example dataset and the reference sequences;
      3. compute TCRdist distances and hierarchically cluster on the summed
         alpha + beta distance matrix, testing against the "epitope" column;
      4. run motif discovery on every cluster with at least
         ``self.min_cluster_size`` members;
      5. write the cluster summary to ``tcrdist_summary.csv``.

    Returns:
        ReportResult holding the per-cluster figures and tables plus the
        summary table.
    """
    from immuneML.util.TCRdistHelper import TCRdistHelper
    from tcrdist.rep_diff import hcluster_diff
    from tcrdist.summarize import member_summ

    PathBuilder.build(self.result_path)

    positive_dataset = self._extract_positive_example_dataset()
    references = self._extract_reference_sequences()

    tcr_rep = TCRdistHelper.compute_tcr_dist(positive_dataset, [self.label.name], self.cores)
    tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(
        clone_df=tcr_rep.clone_df,
        pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta,
        x_cols=["epitope"],
        count_col='count',
    )

    figures = []
    tables = []

    logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

    for cluster_index, cluster_row in tcr_rep.hcluster_df.iterrows():
        # Skip clusters that are too small to search for a motif.
        if len(cluster_row['neighbors_i']) < self.min_cluster_size:
            continue
        cluster_figures, cluster_tables = self._discover_motif_in_cluster(
            tcr_rep, cluster_index, cluster_row, references)
        figures.extend(cluster_figures)
        tables.extend(cluster_tables)

    summary_path = self.result_path / "tcrdist_summary.csv"
    cluster_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
    cluster_summary.to_csv(summary_path)
    tables.append(ReportOutput(path=summary_path, name="TCRdist summary (csv)"))

    return ReportResult(
        name=self.name,
        info="TCRdist motif discovery",
        output_figures=figures,
        output_tables=tables,
    )
def test_introduction_6():
    """Basic specificity neighborhoods from a hierarchical clustering.

    Loads ``dash.csv``, clusters the clones on the beta-chain distance
    matrix, tests every tree node for enrichment of the PA epitope versus
    all other epitopes, and writes the interactive tree to
    ``hierdiff_example.html``.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv("dash.csv")
    rep = TCRrep(
        cell_df=cells,
        organism='mouse',
        chains=['beta', 'alpha'],
        db_file='alphabeta_gammadelta_db.tsv',
    )

    from tcrdist.rep_diff import hcluster_diff, member_summ
    from hierdiff import plot_hclust_props

    # Differential testing is based on a binary comparison, so every epitope
    # other than 'PA' is relabelled 'X'.
    rep.clone_df['PA'] = [
        'PA' if epitope == 'PA' else 'X' for epitope in rep.clone_df.epitope
    ]

    node_results, linkage = hcluster_diff(
        rep.clone_df, rep.pw_beta, x_cols=['PA'], count_col='count')
    node_summary = member_summ(
        res_df=node_results, clone_df=rep.clone_df, addl_cols=['epitope'])
    node_details = pd.concat([node_results, node_summary], axis=1)

    tree_html = plot_hclust_props(
        linkage,
        title='PA Epitope Example',
        res=node_details,
        tooltip_cols=['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'epitope'],
        alpha=0.00001,
        colors=['blue', 'gray'],
        alpha_col='pvalue',
    )

    with open('hierdiff_example.html', 'w') as out:
        out.write(tree_html)
def test_gallery_hdiff():
    """
    Paired alpha-beta hierarchical-neighborhood example with per-node logos.

    All imports are provided here, and are repeated step-wise below,
    for clarity, and for module cut-and-paste. This example
    performs paired alpha-beta analysis, but code blocks can be
    used for single chain analysis as well.

    The function reads ``dash.csv``, downloads the mouse background files
    used by tcrsampler, and writes ``hierdiff_example_PA_v_PB1.html``.
    It returns nothing.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import hcluster_diff, member_summ
    from tcrsampler.sampler import TCRsampler
    from tcrdist.adpt_funcs import get_centroid_seq, get_centroid_seq_alpha
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    from hierdiff import plot_hclust_props

    """
    Load a subset of data that contains paired alpha-beta
    chain mouse TCR receptors that recognized
    the PA or PB1 epitopes (present in mouse influenza).
    """
    import pandas as pd
    df = pd.read_csv("dash.csv")
    conditional = df['epitope'].apply(lambda x: x in ['PA', 'PB1'])

    """
    For illustrative/testing purposes, randomly subset the data to include
    only 100 clones. Increase for more informative plot.
    """
    df = df[conditional].\
        reset_index(drop=True).\
        sample(100, random_state=3).\
        reset_index(drop=True).\
        copy()

    """
    Load DataFrame into TCRrep instance, which
    automatically computes attributes:
    1. .clone_df DataFrame
    2. .pw_beta nd.array
    3. .pw_alpha nd.array
    """
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['beta', 'alpha'],
                db_file='alphabeta_gammadelta_db.tsv')

    """
    Apply hcluster_diff, which hierarchically clusters.

    Note
    ----
    pwmat could easily be tr.pw_beta or tr.pw_alpha if
    clustering should be done on a single chain.
    """
    from tcrdist.rep_diff import hcluster_diff
    tr.hcluster_df, tr.Z =\
        hcluster_diff(clone_df=tr.clone_df,
                      pwmat=tr.pw_beta + tr.pw_alpha,
                      x_cols=['epitope'],
                      count_col='count')

    """
    Load a custom background, mouse appropriate dataset to sample CDR3s
    according to the V and J gene usage frequencies observed in each node.
    See the tcrsampler package for more details
    (https://github.com/kmayerb/tcrsampler/blob/master/docs/getting_default_backgrounds.md)
    """
    from tcrsampler.sampler import TCRsampler
    t = TCRsampler()
    t.download_background_file("ruggiero_mouse_sampler.zip")
    tcrsampler_beta = TCRsampler(default_background='ruggiero_mouse_beta_t.tsv.sampler.tsv')
    tcrsampler_alpha = TCRsampler(default_background='ruggiero_mouse_alpha_t.tsv.sampler.tsv')

    """
    Add an SVG graphic to every node of the tree
    aligned to the cluster centroid.
    """
    from tcrdist.adpt_funcs import get_centroid_seq, get_centroid_seq_alpha
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo

    """Beta Chain"""
    svgs_beta = list()
    for i, r in tr.hcluster_df.iterrows():
        # All clones that belong to this tree node.
        dfnode = tr.clone_df.iloc[r['neighbors_i'], ]
        if dfnode.shape[0] > 2:
            centroid, *_ = get_centroid_seq(df=dfnode)
        else:
            # Too few members to compute a centroid; use the first CDR3.
            centroid = dfnode['cdr3_b_aa'].to_list()[0]
        print(f"BETA-CHAIN: {centroid}")

        # Sample background CDR3s matching this node's V/J gene usage.
        gene_usage_beta = dfnode.groupby(['v_b_gene', 'j_b_gene']).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True, depth=10)
        sampled_rep = [x for x in sampled_rep if x is not None]

        motif, stat = compute_pal_motif(
            seqs=_select(df=tr.clone_df,
                         iloc_rows=r['neighbors_i'],
                         col='cdr3_b_aa'),
            refs=sampled_rep,
            centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

    """Add Beta SVG graphics to hcluster_df"""
    tr.hcluster_df['svg_beta'] = svgs_beta

    """Alpha Chain"""
    svgs_alpha = list()
    for i, r in tr.hcluster_df.iterrows():
        dfnode = tr.clone_df.iloc[r['neighbors_i'], ]
        if dfnode.shape[0] > 2:
            # Use the alpha-chain centroid helper here: the alpha logo must be
            # aligned to an alpha CDR3, not to the node's beta-chain centroid.
            centroid, *_ = get_centroid_seq_alpha(df=dfnode)
        else:
            centroid = dfnode['cdr3_a_aa'].to_list()[0]
        print(f"ALPHA-CHAIN: {centroid}")

        gene_usage_alpha = dfnode.groupby(['v_a_gene', 'j_a_gene']).size()
        sampled_rep = tcrsampler_alpha.sample(
            gene_usage_alpha.reset_index().to_dict('split')['data'],
            flatten=True, depth=10)
        sampled_rep = [x for x in sampled_rep if x is not None]

        motif, stat = compute_pal_motif(
            seqs=_select(df=tr.clone_df,
                         iloc_rows=r['neighbors_i'],
                         col='cdr3_a_aa'),
            refs=sampled_rep,
            centroid=centroid)
        svgs_alpha.append(svg_logo(motif, return_str=True))

    """Add Alpha SVG graphics to hcluster_df"""
    tr.hcluster_df['svg_alpha'] = svgs_alpha

    """
    Produce summary information for tooltips.
    For instance, describe percentage of TCRs with
    a given epitope at a given node.
    """
    res_summary = member_summ(
        res_df=tr.hcluster_df,
        clone_df=tr.clone_df,
        addl_cols=['epitope'])

    tr.hcluster_df_detailed = \
        pd.concat([tr.hcluster_df, res_summary], axis=1)

    """
    Write D3 html for interactive dendrogram graphic.
    Specify desired tooltips.
    """
    from hierdiff import plot_hclust_props
    html = plot_hclust_props(tr.Z,
                             title='PA Epitope Example',
                             res=tr.hcluster_df_detailed,
                             tooltip_cols=['cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                                           'svg_alpha', 'svg_beta'],
                             alpha=0.00001,
                             colors=['blue', 'gray'],
                             alpha_col='pvalue')

    with open('hierdiff_example_PA_v_PB1.html', 'w') as fh:
        fh.write(html)
def test_workflow_2():
    """
    Load all the TCRs associated with a particular epitope in
    the Adaptive Biotechnology COVID19 Data Release 2, cluster them on the
    beta-chain distance matrix, and test each tree node for a difference
    between the 'healthy' and 'covid' cohorts.

    Reads the MIRA epitope file under ``tcrdist/data/covid19`` and writes
    ``hierdiff_example.html``. Returns nothing.
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.adpt_funcs import get_basic_centroids

    filename = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_16_1683_QYIKWPWYI_YEQYIKWPW_YEQYIKWPWY.tcrdist3.csv')

    df = pd.read_csv(filename, sep=",")

    # Copy the column subset so that adding 'count' below writes to an
    # independent frame rather than to a slice of the frame that was read.
    df = df[[
        'cell_type', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope',
        'age', 'sex', 'cohort'
    ]].copy()
    # Every row is treated as a single observed cell.
    df['count'] = 1

    tr = TCRrep(cell_df=df, organism='human', chains=['beta'])

    tr = get_basic_centroids(tr, max_dist=200)
    # Attribute access only; kept from the original example.
    # presumably populated by get_basic_centroids -- confirm.
    tr.centroids_df

    from tcrdist.rep_diff import hcluster_diff, member_summ
    import hierdiff

    # Binary label for the differential test: any cohort whose name contains
    # "Healthy" is 'healthy', everything else is 'covid'.
    tr.clone_df['covid'] = [
        'healthy' if "Healthy" in cohort else "covid"
        for cohort in tr.clone_df.cohort
    ]

    res, Z = hcluster_diff(tr.clone_df,
                           tr.pw_beta,
                           x_cols=['covid'],
                           count_col='count')

    res_summary = member_summ(res_df=res,
                              clone_df=tr.clone_df,
                              addl_cols=['cohort', 'subject'])

    res_detailed = pd.concat([res, res_summary], axis=1)

    # NOTE(review): the title says 'PA Epitope Example' although this is the
    # COVID19 MIRA data set -- looks like a copy-paste label; confirm before
    # changing the emitted HTML.
    html = hierdiff.plot_hclust_props(Z,
                                      title='PA Epitope Example',
                                      res=res_detailed,
                                      tooltip_cols=[
                                          'cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                                          'cohort', 'subject'
                                      ],
                                      alpha=0.05,
                                      alpha_col='pvalue')

    with open('hierdiff_example.html', 'w') as fh:
        fh.write(html)
# Script fragment: split a VDJdb-style table by chain, build an alpha-chain
# TCRrep, cluster it hierarchically, and prepare a background sampler.
# NOTE(review): `selin` is not defined in this excerpt; presumably a DataFrame
# with a 'Gene' column loaded earlier -- confirm against the preceding code.
# `vdjdb_to_tcrdist2` is imported but not used in the lines shown here.
from tcrdist.mappers import vdjdb_to_tcrdist2, vdjdb_to_tcrdist2_mapping_TRA, vdjdb_to_tcrdist2_mapping_TRB
# Keep only the rows of one chain and rename their columns with the
# chain-specific mapping (axis=1 renames columns, not index labels).
selin_a = selin.loc[selin['Gene'] == 'TRA'].rename(vdjdb_to_tcrdist2_mapping_TRA, axis=1)
selin_b = selin.loc[selin['Gene'] == 'TRB'].rename(vdjdb_to_tcrdist2_mapping_TRB, axis=1)
"""COMPUTE TCRDISTANCES (SEE DOCS PAGE: https://tcrdist3.readthedocs.io/en/latest/tcrdistances.html)"""
from tcrdist.repertoire import TCRrep
# Only the alpha-chain table is analysed below; selin_b is not used in this excerpt.
tr = TCRrep(cell_df=selin_a, organism='human', chains=['alpha'])
"""COMPUTE TCRDISTANCES (SEE DOCS PAGE:https://tcrdist3.readthedocs.io/en/latest/index.html#hierarchical-neighborhoods)"""
# Despite the banner above, this step performs hierarchical clustering on the
# alpha-chain distance matrix and tests nodes against the 'cohort' column.
from tcrdist.rep_diff import hcluster_diff
tr.hcluster_df, tr.Z =\
    hcluster_diff(clone_df = tr.clone_df, pwmat = tr.pw_alpha, x_cols = ['cohort'], count_col = 'count')
""" SEE TCRSAMPLER (https://github.com/kmayerb/tcrsampler/blob/master/docs/tcrsampler.md) Here we used olga human alpha synthetic sequences for best coverage """
from tcrsampler.sampler import TCRsampler
# `t` is only needed for the (commented-out) one-time background download.
t = TCRsampler()
#t.download_background_file('olga_sampler.zip') # ONLY IF NOT ALREADY DONE
tcrsampler_alpha = TCRsampler(default_background = 'olga_human_alpha_t.sampler.tsv')
# Rebuild the background from at most 1000 rows of the file.
tcrsampler_alpha.build_background(max_rows = 1000)
"""SEE PALMOTIF DOCS (https://github.com/agartland/palmotif)"""
# Imported for the motif/logo steps that follow this excerpt.
from palmotif import compute_pal_motif, svg_logo
from tcrdist.summarize import _select
def _auto_hdiff2(
        tcrrep,
        html_name='DEFAULT.html',
        pwmat_str_b='pw_beta',
        pwmat_str_a='pw_alpha',
        single=True,
        generate_svgs=True,
        combine_olga=False,
        verbose=True,
        prune=3,
        default_hcluster_diff_kwargs=_get_default_kwargs(chains=['beta'])[0],
        default_member_summ_kwargs=_get_default_kwargs(chains=['beta'])[1],
        default_plot_hclust_props=_get_default_kwargs(chains=['beta'])[2]):
    """
    Automatic Hierarchical Cluster Plotting

    Clusters the clones of `tcrrep`, annotates every tree node with distance,
    membership and diversity summaries plus SVG logos, and writes an
    interactive HTML tree to `html_name`. Results are stored on `tcrrep`
    (``hcluster_df``, ``Z``, ``res_summary``, ``hcluster_df_detailed``) and
    ``tcrrep.clone_df`` gains a 'single' column (and a 'dummy' column when no
    usable `x_cols` is available). Nothing is returned.

    The three kwargs dictionaries are copied on entry, so neither the
    module-level defaults nor dictionaries passed by the caller are modified.

    Parameters
    ----------
    tcrrep : TCRrep
        Repertoire object providing ``chains``, ``clone_df`` and the pairwise
        matrix attributes named by `pwmat_str_b` / `pwmat_str_a`.
    html_name : str
        name for html file output e.g., 'DEFAULT.html'
    pwmat_str_b : str
        name of pairwise matrix attribute to be used for clustering beta
        chains e.g., 'pw_beta'
    pwmat_str_a : str
        name of pairwise matrix attribute to be used for clustering alpha
        chains e.g., 'pw_alpha'. When both chains are present the beta
        matrix is used.
    single : bool
        If True, make summary based on each clone being present in
        single-copy, otherwise, the 'count' column is used when calculating
        percentages. NOTE: 'count_col' in default_member_summ_kwargs can also
        be set to 'single', which forces this to True. If True, diversity
        metrics will also be based on clones rather than clonal abundances.
    generate_svgs : bool
        Intended to toggle SVG logo generation.
        NOTE(review): this flag is currently not consulted; logos are always
        generated. Left unchanged because the plot's tooltip columns may
        depend on the SVG columns -- confirm before wiring it in.
    combine_olga : bool
        Forwarded unchanged to ``_tcrsampler_svgs``.
    verbose : bool
        report on status
    prune : int
        Nodes with fewer than `prune` members ('K_neighbors') are flagged
        with ``hcluster_df['prune'] = 1``.
    default_hcluster_diff_kwargs : dict
        kwargs dictionary for (tcrdist.rep_diff.hcluster_diff)

        clone_df : pd.DataFrame [nclones x metadata]
            Contains metadata for each clone. If None, ``tcrrep.clone_df``.
        pwmat : np.ndarray [nclones x nclones]
            Square distance matrix for defining neighborhoods. If None, the
            attribute of `tcrrep` selected by chain is used.
        x_cols : list
            List of columns to be tested for association with the
            neighborhood. If None, or if the first column has fewer than two
            levels, a random two-level 'dummy' column is used instead.
        count_col : str
            Column in clone_df that specifies counts.
            Default none assumes count of 1 cell for each row.
        subset_ind : None or np.ndarray with partial index of df, optional
            Provides option to tally counts only within a subset of df, but
            to maintain the clustering of all individuals. Allows for one
            clustering of pooled TCRs, but tallying/testing within a subset
            (e.g. participants or conditions)
        hclust_method : str
            Method for hierarchical clustering, passed to the
            scipy.clustering.hierarchy linkage function.
        optimal_ordering : bool
            Flag passed to the scipy.clustering.hierarchy linkage function to
            improve visual tree layout. Can be slow for large trees.
        test_method : str or None
            Specifies Fisher's exact test ("fishers"), Chi-squared ("chi2")
            or Cochran-Mantel-Haenszel test ("chm") for testing. This
            function overwrites it: 'fishers' for two levels, 'chi2' for
            more than two.
    default_member_summ_kwargs : dict
        kwargs dictionary for (tcrdist.rep_diff.member_summ)

        Return additional summary info about each result (row) based on the
        members of the cluster. This is helpful for preparing strings to add
        to the tooltip in hierdiff.plot_hclust_props.

        res_df : pd.DataFrame [nclusters x result cols]
            Returned from neighborhood_diff or hcluster_diff
        clone_df : pd.DataFrame [nclones x metadata]
            Contains metadata for each clone.
        key_col : str
            Column in res_df that specifies the iloc of members in the
            clone_df
        count_col : str
            Column in clone_df that specifies counts.
            Default none assumes count of 1 cell for each row.
        addl_cols : list
            Columns to summarize
        addl_n : int
            Number of top N clones to include in the summary of each cluster.
    default_plot_hclust_props : dict
        kwargs dictionary for (hierdiff.plot_hclust_props)

        Plot tree of linkage-based hierarchical clustering, with nodes
        colored using stacked bars representing proportion of cluster members
        associated with specific conditions. Nodes also optionally annotated
        with pvalue, number of members or cluster ID.

        Z : linkage matrix
            Result of calling sch.linkage on a compressed pair-wise distance
            matrix
        res : pd.DataFrame
            Result from calling hcluster_diff, with observed/frequencies and
            p-values for each node
        alpha_col : str
            Column in res to use for 'alpha' annotation
        alpha : float
            Threshold for plotting the stacked bars and annotation
        colors : tuple of valid colors
            Used for stacked bars of conditions at each node
        prune_col : str/column in res
            Column of res that indicates whether a result/node can be pruned
            from the tree. The tree will not print any pruned nodes that only
            contain other pruned nodes.

    Raises
    ------
    ValueError
        If `tcrrep.chains` contains neither 'alpha' nor 'beta'.
    """
    import warnings

    import numpy as np
    import pandas as pd
    from hierdiff import plot_hclust_props
    from numpy.random import randint
    from tcrdist.diversity import fuzzy_diversity
    from tcrdist.rep_diff import hcluster_diff, member_summ

    # The default dictionaries are created once, when the function is
    # defined. Work on copies so values filled in below (clone_df, pwmat,
    # x_cols, test_method, count_col) never leak into a later call.
    hcluster_diff_kwargs = dict(default_hcluster_diff_kwargs)
    member_summ_kwargs = dict(default_member_summ_kwargs)

    # Load clone_df directly from input object
    if hcluster_diff_kwargs['clone_df'] is None:
        hcluster_diff_kwargs['clone_df'] = getattr(tcrrep, 'clone_df')

    # Get appropriate pwmat_str (pw_beta, pw_alpha, or one of the CDRs,
    # e.g., pw_cdr3_b_aa). Beta takes precedence when both chains are present.
    pwmat_str = None
    if 'alpha' in tcrrep.chains:
        pwmat_str = pwmat_str_a
    if 'beta' in tcrrep.chains:
        pwmat_str = pwmat_str_b
    if pwmat_str is None:
        raise ValueError(
            f"tcrrep.chains must contain 'alpha' or 'beta', got {tcrrep.chains!r}")

    # Get appropriate pairwise matrix
    if hcluster_diff_kwargs['pwmat'] is None:
        if verbose:
            print(f"pwmat set with {pwmat_str}")
        hcluster_diff_kwargs['pwmat'] = getattr(tcrrep, pwmat_str)
    elif verbose:
        print("pwmat was directly provided as a kwarg")

    def _use_dummy_x_col(reason):
        """Install a random two-level 'dummy' column as the tested variable."""
        tcrrep.clone_df['dummy'] = [
            ['X1', 'X2'][randint(2)] for _ in range(tcrrep.clone_df.shape[0])
        ]
        hcluster_diff_kwargs['x_cols'] = ['dummy']
        hcluster_diff_kwargs['test_method'] = 'fishers'
        # stacklevel=3 attributes the warning to the caller of _auto_hdiff2.
        warnings.warn(
            f"Because {reason}, setting random dummy values, and using {hcluster_diff_kwargs['test_method']}\n",
            stacklevel=3)

    # Handle the fact that 2 or more categorical levels are needed to run
    # hierdiff.
    x_cols = hcluster_diff_kwargs['x_cols']
    if x_cols is None:
        _use_dummy_x_col("x_cols was None")
    else:
        # Number of distinct levels in the first tested column; .iloc makes
        # the positional lookup explicit on the label-indexed result.
        n_levels = tcrrep.clone_df[x_cols].nunique().iloc[0]
        if n_levels == 2:
            hcluster_diff_kwargs['test_method'] = 'fishers'
        elif n_levels > 2:
            hcluster_diff_kwargs['test_method'] = 'chi2'
        else:
            _use_dummy_x_col(f"x_cols {x_cols} had fewer than 2 levels")

    # Run hcluster_diff
    bar = IncrementalBar('Run hcluster_diff :', max=2, suffix='%(percent)d%%')
    bar.next()
    tcrrep.hcluster_df, tcrrep.Z = hcluster_diff(**hcluster_diff_kwargs)
    bar.next()
    bar.finish()

    # Flag small nodes so the plot can prune them.
    tcrrep.hcluster_df['prune'] = tcrrep.hcluster_df['K_neighbors'].apply(
        lambda x: 1 if (x < prune) else 0)

    # Basic per-node distance summary.
    # NOTE(review): node statistics always use the chain-selected matrix on
    # tcrrep, even when a different 'pwmat' was supplied for clustering.
    node_pwmat = getattr(tcrrep, pwmat_str)
    thresholds = (25, 50, 75)
    mean_distance_ = []
    pct_within = {threshold: [] for threshold in thresholds}

    n_rows = tcrrep.hcluster_df.shape[0]
    bar = IncrementalBar('Evaluate Clusters :', max=n_rows, suffix='%(percent)d%%')
    for _, node in tcrrep.hcluster_df.iterrows():
        members = node['neighbors_i']
        # Pairwise distances among the clones at this tree node.
        pwnode = node_pwmat[members, :][:, members]
        # Keep only the non-diagonal entries (drop self-distances).
        off_diagonal = pwnode[~np.eye(pwnode.shape[0], dtype=bool)]
        mean_distance_.append(str(round(off_diagonal.mean(), 1)))
        for threshold in thresholds:
            pct = 100 * (off_diagonal < threshold).sum() / off_diagonal.size
            pct_within[threshold].append(f"{round(pct, 1)}%")
        # Advance exactly once per node so the bar ends at 100%.
        bar.next()
    bar.finish()

    tcrrep.hcluster_df['mean_dist'] = mean_distance_
    for threshold in thresholds:
        tcrrep.hcluster_df[f'pct_dist_{threshold}'] = pct_within[threshold]

    # By default, treat each clone as a single entity if 'count_col' is
    # 'single'.
    if member_summ_kwargs['count_col'] == 'single':
        single = True
        print("MAKING 'single' variable")
        tcrrep.hcluster_df['single'] = 1
    if single:
        tcrrep.clone_df['single'] = 1
        member_summ_kwargs['count_col'] = 'single'

    # member_summ
    tcrrep.res_summary = member_summ(res_df=tcrrep.hcluster_df,
                                     clone_df=tcrrep.clone_df,
                                     **member_summ_kwargs)

    tcrrep.hcluster_df_detailed = pd.concat(
        [tcrrep.hcluster_df.copy(), tcrrep.res_summary.copy()], axis=1)

    # Add diversity stats, weighted by clone ('single') or by abundance
    # ('count').
    tcrrep.clone_df['single'] = 1
    weight_col = 'single' if single else 'count'
    node_members = tcrrep.hcluster_df_detailed.neighbors_i.to_list()
    for threshold in thresholds:
        tcrrep.hcluster_df_detailed[f'fuzzy_simpson_diversity_{threshold}'] = [
            str(round(
                fuzzy_diversity(tcrrep.clone_df.iloc[ind, :][weight_col],
                                node_pwmat[ind, :][:, ind],
                                order=2,
                                threshold=threshold), 2))
            for ind in node_members
        ]

    # Add SVGs to hcluster_df_detailed (see generate_svgs note in docstring).
    if 'beta' in tcrrep.chains:
        _tcrsampler_svgs(tcrrep=tcrrep,
                         default_background=None,
                         default_background_if_missing=None,
                         cdr3_name='cdr3_b_aa',
                         pwmat_str=pwmat_str_b,
                         chain='beta',
                         gene_names=['v_b_gene', 'j_b_gene'],
                         combine_olga=combine_olga)

    if 'alpha' in tcrrep.chains:
        _tcrsampler_svgs(tcrrep=tcrrep,
                         default_background=None,
                         default_background_if_missing=None,
                         cdr3_name='cdr3_a_aa',
                         pwmat_str=pwmat_str_a,
                         chain='alpha',
                         gene_names=['v_a_gene', 'j_a_gene'],
                         combine_olga=combine_olga)

    # Plot
    html = plot_hclust_props(tcrrep.Z,
                             res=tcrrep.hcluster_df_detailed,
                             prune_col='prune',
                             **default_plot_hclust_props)

    # Write file
    with open(html_name, 'w') as fh:
        print(f"WRITING {html_name}")
        fh.write(html)