Exemple #1
0
def test_prob_sampler_sample():
    """Check the CDR3s returned by ``TCRsampler.sample`` against fixed values.

    A background is built from the MiXCR example file under
    ``tcrsampler/tests``. The sampler is then queried with
    ``[V gene, J gene, count]`` requests, and both the default nested output
    and the ``flatten=True`` output are compared to the expected sequences.
    """
    example_path = os.path.join('tcrsampler', 'tests',
                                'pmbc_mixcr_example_data.txt')
    sampler = TCRsampler()
    sampler.clean_mixcr(filename=example_path)
    sampler.build_background()

    expected_trbv9 = ['CASSRTGSLADEQYF', 'CASSATGVVSAQYF']
    expected_trbv7 = ['CASSLGQAARGIQYF'] * 4

    # Default output: one inner list per request.
    nested = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2]])
    assert nested == [expected_trbv9]

    # flatten=True: the same sequences as a single flat list.
    flat = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2]], flatten=True)
    assert flat == expected_trbv9

    # Two requests: results come back in request order.
    paired = sampler.sample([['TRBV9*01', 'TRBJ2-7*01', 2],
                             ['TRBV7-7*01', 'TRBJ2-4*01', 4]])
    assert paired == [expected_trbv9, expected_trbv7]
Exemple #2
0
def test_prob_sampler_sample_key_warn():
    """Sampling an unknown V/J gene pair returns ``[[None]]``.

    Warnings emitted by the call are recorded rather than propagated.
    ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by pytest 8,
    so the standard-library recorder is used instead. Like the old form, it
    does not require that a warning is actually emitted.
    """
    import warnings

    t = TCRsampler()
    fn = os.path.join('tcrsampler', 'tests', 'pmbc_mixcr_example_data.txt')
    t.clean_mixcr(filename=fn)
    t.build_background()
    with warnings.catch_warnings(record=True):
        # "always" keeps earlier filters from hiding or escalating warnings.
        warnings.simplefilter("always")
        r = t.sample([['TRBV999*01', 'TRBJ2-7*01', 2]])
    assert r == [[None]]
def test_gallery_hdiff():
    """
    Gallery example: hierarchical clustering with a CDR3 motif logo per node.

    All imports are listed up front and then repeated next to the step that
    uses them, so that individual steps can be copied out on their own. The
    example performs a paired alpha-beta analysis, but each step can be used
    for single-chain analysis as well.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import hcluster_diff, member_summ
    from tcrsampler.sampler import TCRsampler
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    from hierdiff import plot_hclust_props

    # --- Load data -------------------------------------------------------
    # Paired alpha-beta mouse TCRs; keep only clones annotated with the
    # PA or PB1 epitopes.
    import pandas as pd
    dash = pd.read_csv("dash.csv")
    is_pa_or_pb1 = dash['epitope'].isin(['PA', 'PB1'])

    # For illustration/testing, randomly subset to 100 clones.
    # Increase for a more informative plot.
    subset = dash[is_pa_or_pb1].reset_index(drop=True)
    subset = subset.sample(100, random_state=3)
    subset = subset.reset_index(drop=True).copy()

    # --- Build the repertoire ---------------------------------------------
    # The attributes used below are .clone_df, .pw_beta and .pw_alpha.
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df=subset,
                organism='mouse',
                chains=['beta', 'alpha'],
                db_file='alphabeta_gammadelta_db.tsv')

    # --- Hierarchical clustering -------------------------------------------
    # The pairwise matrix could just as well be tr.pw_beta or tr.pw_alpha
    # alone if clustering should be done on a single chain.
    from tcrdist.rep_diff import hcluster_diff
    paired_distances = tr.pw_beta + tr.pw_alpha
    tr.hcluster_df, tr.Z = hcluster_diff(clone_df=tr.clone_df,
                                         pwmat=paired_distances,
                                         x_cols=['epitope'],
                                         count_col='count')

    # --- Background samplers -----------------------------------------------
    # Mouse backgrounds used to sample CDR3s matching each node's V/J usage.
    # See the tcrsampler package for more details
    # (https://github.com/kmayerb/tcrsampler/blob/master/docs/getting_default_backgrounds.md)
    from tcrsampler.sampler import TCRsampler

    downloader = TCRsampler()
    downloader.download_background_file("ruggiero_mouse_sampler.zip")
    tcrsampler_beta = TCRsampler(
        default_background='ruggiero_mouse_beta_t.tsv.sampler.tsv')
    tcrsampler_alpha = TCRsampler(
        default_background='ruggiero_mouse_alpha_t.tsv.sampler.tsv')

    # --- One SVG logo per tree node, aligned to the node centroid ------------
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo

    # Beta chain
    svgs_beta = []
    for _, node in tr.hcluster_df.iterrows():
        members = node['neighbors_i']
        dfnode = tr.clone_df.iloc[members,]
        if dfnode.shape[0] <= 2:
            # Small node: use its first sequence as the centroid.
            centroid = dfnode['cdr3_b_aa'].to_list()[0]
        else:
            centroid, *_ = get_centroid_seq(df=dfnode)
        print(f"BETA-CHAIN: {centroid}")

        # One [v gene, j gene, count] request per V/J pairing in the node.
        vj_counts = dfnode.groupby(['v_b_gene', 'j_b_gene']).size()
        vj_requests = vj_counts.reset_index().to_dict('split')['data']
        background = tcrsampler_beta.sample(vj_requests,
                                            flatten=True,
                                            depth=10)
        background = [seq for seq in background if seq is not None]

        node_seqs = _select(df=tr.clone_df,
                            iloc_rows=members,
                            col='cdr3_b_aa')
        motif, _stat = compute_pal_motif(seqs=node_seqs,
                                         refs=background,
                                         centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

    # Attach the beta logos to hcluster_df.
    tr.hcluster_df['svg_beta'] = svgs_beta

    # Alpha chain
    svgs_alpha = []
    for _, node in tr.hcluster_df.iterrows():
        members = node['neighbors_i']
        dfnode = tr.clone_df.iloc[members,]
        if dfnode.shape[0] <= 2:
            centroid = dfnode['cdr3_a_aa'].to_list()[0]
        else:
            centroid, *_ = get_centroid_seq(df=dfnode)
        print(f"ALPHA-CHAIN: {centroid}")

        vj_counts = dfnode.groupby(['v_a_gene', 'j_a_gene']).size()
        vj_requests = vj_counts.reset_index().to_dict('split')['data']
        background = tcrsampler_alpha.sample(vj_requests,
                                             flatten=True,
                                             depth=10)
        background = [seq for seq in background if seq is not None]

        node_seqs = _select(df=tr.clone_df,
                            iloc_rows=members,
                            col='cdr3_a_aa')
        motif, _stat = compute_pal_motif(seqs=node_seqs,
                                         refs=background,
                                         centroid=centroid)
        svgs_alpha.append(svg_logo(motif, return_str=True))

    # Attach the alpha logos to hcluster_df.
    tr.hcluster_df['svg_alpha'] = svgs_alpha

    # --- Tooltip summaries ---------------------------------------------------
    # For instance, the percentage of TCRs with a given epitope at a node.
    res_summary = member_summ(res_df=tr.hcluster_df,
                              clone_df=tr.clone_df,
                              addl_cols=['epitope'])
    tr.hcluster_df_detailed = pd.concat([tr.hcluster_df, res_summary],
                                        axis=1)

    # --- Write the interactive D3 dendrogram as HTML ---------------------------
    from hierdiff import plot_hclust_props
    tooltip_cols = ['cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                    'svg_alpha', 'svg_beta']
    html = plot_hclust_props(tr.Z,
                             title='PA Epitope Example',
                             res=tr.hcluster_df_detailed,
                             tooltip_cols=tooltip_cols,
                             alpha=0.00001,
                             colors=['blue', 'gray'],
                             alpha_col='pvalue')

    with open('hierdiff_example_PA_v_PB1.html', 'w') as fh:
        fh.write(html)
Exemple #4
0
def test_quick_pipeline_with_fragmented_compute():
    """
    Example: using tcrdist3 to look for TCR neighborhoods that may be
    HLA restricted.

    The pipeline:
      1. Loads a MIRA beta-chain dataset into a ``TCRrep`` without computing
         distances up front.
      2. Computes the ``rw_beta`` distance matrix in row fragments.
      3. Labels each clone as B*07 / not B*07 and tallies neighbors within
         radius 25 by that label.
      4. Runs association tests (``method='fishers'``) and keeps the
         neighborhoods flagged as "mostly unique".
      5. Builds CDR3 motif logos, with and without a sampled background, for
         the first 25 neighborhoods and writes them to ``svgs_in_line.html``.
    """
    import ast
    import os
    import pandas as pd
    import numpy as np
    from scipy import sparse
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory

    f = 'mira_epitope_67_382_APHGVVFL_APHGVVFLHV_GVVFLHVTY_VVFLHVTYV.tcrdist3.csv'
    f = os.path.join('tcrdist', 'data', 'covid19', f)
    assert os.path.isfile(f)

    df = pd.read_csv(f)
    df = df[['subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
             'cdr3_b_nucseq', 'cohort', 'hla-a', 'hla-a_1', 'hla-b',
             'hla-b_1']]
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False,
                store_all_cdr=False)

    # cleanup=False: the fragments are reused below for the neighbor tally.
    S, fragments = compute_pw_sparse_out_of_memory(tr=tr,
                                                   row_size=100,
                                                   pm_processes=2,
                                                   pm_pbar=True,
                                                   max_distance=1000,
                                                   matrix_name='rw_beta',
                                                   reassemble=True,
                                                   cleanup=False)

    # A clone is labelled B*07 if either HLA-B allele column starts with it.
    tr.clone_df['B07'] = (tr.clone_df['hla-b'].str.startswith("B*07")
                          | tr.clone_df['hla-b_1'].str.startswith("B*07"))
    tr.clone_df['B07'] = ["B*07" if x else "NOTB*07 "
                          for x in tr.clone_df['B07']]

    # sparse.save_npz("S.npz", S)  # optional: persist the matrix
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory
    nn_tally_df_cohort = compute_n_tally_out_of_memory(fragments,
                                                       matrix_name="rw_beta",
                                                       pm_processes=6,
                                                       to_file=False,
                                                       to_memory=True,
                                                       knn_radius=25,
                                                       x_cols=['B07'])

    from hierdiff.association_testing import cluster_association_test
    nn_associations = cluster_association_test(res=nn_tally_df_cohort,
                                               y_col='cmember',
                                               method='fishers')
    nn_associations = nn_associations.sort_values('pvalue', ascending=True)
    # 'neighbors' holds string literals; parse them back into Python objects.
    nn_associations['neighbors_i'] = \
        nn_associations.neighbors.apply(ast.literal_eval)

    from tcrdist.summarize import test_for_almost_subsets, filter_is, filter_gt
    nn_associations['mostly_unique'] = test_for_almost_subsets(
        nn_associations['neighbors_i'], thr=5)
    nr_nn_associations = filter_is(nn_associations, 'mostly_unique', 1).copy()
    # nr_nn_associations = filter_gt(nr_nn_associations, 'K_neighbors', 25).copy()

    # MOTIF GENERATION
    from tcrsampler.sampler import TCRsampler
    t = TCRsampler()
    # Download the background only if it is not already available.
    if 'olga_human_beta_t.sampler.tsv' not in t.currently_available_backgrounds():
        t.download_background_file('olga_sampler.zip')
    tcrsampler_beta = TCRsampler(
        default_background='olga_human_beta_t.sampler.tsv')
    tcrsampler_beta.build_background(max_rows=1000)

    # See the palmotif docs (https://github.com/agartland/palmotif)
    from palmotif import compute_pal_motif, svg_logo
    from tcrdist.summarize import _select

    # Generate one SVG graphic per neighborhood.
    cdr3_name = 'cdr3_b_aa'
    gene_names = ['v_b_gene', 'j_b_gene']
    svgs_beta = []
    svgs_beta_raw = []
    info_list = []

    from tcrdist.rep_diff import member_summ
    summary = member_summ(res_df=nr_nn_associations,
                          clone_df=tr.clone_df,
                          addl_cols=['cohort', 'hla-a', 'hla-a_1', 'hla-b',
                                     'hla-b_1', 'subject'])

    nr_nn_associations = pd.concat([nr_nn_associations, summary],
                                   axis=1).reset_index()

    for _, r in nr_nn_associations.head(25).iterrows():
        neighbors = r['neighbors_i']
        dfnode = tr.clone_df.iloc[neighbors, :].copy()
        # Pairwise distances restricted to the sequences of this node.
        pwnode = S[neighbors, :][:, neighbors].todense()
        if dfnode.shape[0] > 2:
            # Centroid: the member with the smallest summed distance.
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]
        else:
            centroid = dfnode[cdr3_name].to_list()[0]

        print(f"CENTROID: {centroid}")

        gene_usage_beta = dfnode.groupby(gene_names).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True,
            depth=max(100, 1000 // dfnode.shape[0]))
        sampled_rep = [x for x in sampled_rep if x is not None]

        # Motif relative to the sampled background.
        motif, stat = compute_pal_motif(
            seqs=_select(df=tr.clone_df,
                         iloc_rows=neighbors,
                         col=cdr3_name),
            refs=sampled_rep,
            centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

        # Raw motif: no reference set is passed. The original rebound
        # sampled_rep to the None returned by list.append here; that value
        # was never used, so the statement has been dropped.
        motif_raw, _ = compute_pal_motif(
            seqs=_select(df=tr.clone_df,
                         iloc_rows=neighbors,
                         col=cdr3_name),
            centroid=centroid)
        svgs_beta_raw.append(svg_logo(motif_raw, return_str=True))
        info_list.append(r)

    def row_to_string(r, vals=('ct_columns', 'val_0', 'ct_0', 'val_1', 'ct_1',
                               'val_2', 'ct_2', 'val_3', 'ct_3', 'levels',
                               'K_neighbors', 'R_radius', 'RR', 'OR',
                               'pvalue', 'FWERp', 'FDRq')):
        """Render the selected fields of row ``r`` as '<br></br>'-joined text."""
        return "<br></br>".join([f"\t{v} : {r[v]}" for v in vals])

    def to_html_table(r, vals=('ct_columns', 'hla-a', 'hla-a_1', 'hla-b',
                               'hla-b_1', 'val_0', 'ct_0', 'val_2', 'ct_2',
                               'K_neighbors', 'R_radius', 'pvalue', 'FDRq',
                               'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'cohort',
                               'subject')):
        """Render the selected fields of row ``r`` as a one-row HTML table."""
        # list(...) so the labels are selected one by one, not as a tuple key.
        return pd.DataFrame(r[list(vals)]).transpose().to_html()

    def shrink(html_str):
        """Scale the 100% height/width attributes of an SVG string to 10%."""
        return html_str.replace('height="100%"', 'height="10%"').\
            replace('width="100%"', 'width="10%"')

    with open('svgs_in_line.html', 'w') as fh:
        fh.write("<html><body>\n")
        for svg, svg_raw, details in zip(svgs_beta, svgs_beta_raw, info_list):
            fh.write(f"{shrink(svg_raw)}{shrink(svg)}")
            try:
                fh.write(to_html_table(details))
            except Exception:
                # Best effort: a row that cannot be tabulated is skipped,
                # without swallowing KeyboardInterrupt/SystemExit.
                print("F")
            fh.write("<div></div>")
        # Close the tags in nesting order.
        fh.write("</body></html>\n")
Exemple #5
0
def test_background_generation_toy_example():
    """
    Build a combined background for a small target repertoire.

    Suppose a repertoire has the V/J gene usage given by ``ix`` (a small set
    of TCRs drawn from rare and semi-rare V,J pairings). The steps are:

    1. Create a TCRsampler and replace its gene-occurrence frequencies with
       subject-stratified estimates (``replace=True``).
    2. Make a V,J gene-usage-matched background matching ``df_target``.
    3. Take a random draw via ``sample_britanova``.
    4. Concatenate both backgrounds.

    Returns the combined background DataFrame (200,000 rows).
    """
    import sys
    import os
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import make_gene_usage_counter, get_gene_frequencies, calculate_adjustment
    from tcrdist.background import make_vj_matched_background, make_flat_vj_background
    from tcrdist.background import get_stratified_gene_usage_frequency
    from tcrdist.background import sample_britanova

    # 1. Sampler with subject-stratified gene usage frequencies.
    ts = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # Target gene usage: [V gene, J gene, number of sequences].
    ix = [['TRBV19*01', 'TRBJ2-5*01', 3], ['TRBV24-1*01', 'TRBJ2-4*01', 3],
          ['TRBV25-1*01', 'TRBJ2-4*01', 3], ['TRBV30*01', 'TRBJ2-3*01', 2],
          ['TRBV5-4*01', 'TRBJ2-3*01', 2], ['TRBV11-2*01', 'TRBJ2-2*01', 2],
          ['TRBV2*01', 'TRBJ1-5*01', 1], ['TRBV12-5*01', 'TRBJ2-7*01', 1],
          ['TRBV4-1*01', 'TRBJ1-6*01', 1], ['TRBV6-5*01', 'TRBJ1-6*01', 1],
          ['TRBV13*01', 'TRBJ2-3*01', 1], ['TRBV18*01', 'TRBJ2-3*01', 1],
          ['TRBV14*01', 'TRBJ2-7*01', 1], ['TRBV6-6*01', 'TRBJ2-7*01', 1],
          ['TRBV10-3*01', 'TRBJ2-3*01', 1], ['TRBV7-2*01', 'TRBJ2-1*01', 1],
          ['TRBV5-1*01', 'TRBJ2-1*01', 1]]

    # Sample each V/J pairing separately and tag the rows with their genes.
    target_frames = []
    for v_gene, j_gene, n_seqs in ix:
        sampled = ts.sample([[v_gene, j_gene, n_seqs]])
        cdr3s = [seq for group in sampled for seq in group]
        target_frames.append(
            pd.DataFrame({
                'cdr3_b_aa': cdr3s,
                'v_b_gene': v_gene,
                'j_b_gene': j_gene
            }))
    df_target = pd.concat(target_frames).reset_index(drop=True)

    # 2. V,J-matched background. Ask for a few extra sequences, since Olga
    #    can return none if it makes too many non-productive CDR3s.
    gene_usage_counter = make_gene_usage_counter(df_target)
    df_vj_bkgd = make_vj_matched_background(
        ts=ts,
        gene_usage_counter=gene_usage_counter,
        size=101000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    df_vj_bkgd = df_vj_bkgd.sample(100000).reset_index(drop=True)
    df_vj_bkgd['weights'] = calculate_adjustment(df=df_vj_bkgd, adjcol="pVJ")
    df_vj_bkgd['source'] = "vj_matched"

    # 3. Random draw, given unit weights.
    df_britanova_100K = sample_britanova(size=100000)
    df_britanova_100K = get_gene_frequencies(ts=ts, df=df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"

    # 4. Combined background.
    stacked = pd.concat([df_vj_bkgd, df_britanova_100K], axis=0)
    df_bkgd = stacked.reset_index(drop=True)
    assert df_bkgd.shape[0] == 200000

    # Compare V/J usage between the vj-matched background and the target.
    vj_cols = ['v_b_gene', 'j_b_gene']
    bkgd_usage = df_vj_bkgd.groupby(vj_cols).size() / df_vj_bkgd.shape[0]
    target_usage = df_target.groupby(vj_cols).size() / df_target.shape[0]
    df_check_match = pd.concat([bkgd_usage, target_usage], axis=1)
    assert np.all(abs(df_check_match[0] - df_check_match[1]) < 0.001)
    return df_bkgd
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler

# Script: build a sampler background from a local CSV and sample from it.
t = TCRsampler()
# Single-argument join returns the name unchanged, so the file is resolved
# relative to the current working directory.
fn = os.path.join('britanova_chord_blood.csv')
# Set the reference table directly from the CSV.
t.ref_df = pd.read_csv(fn)
t.build_background(max_rows=1000)
# NOTE(review): elsewhere in this file the second element of a request is a
# J gene (e.g. 'TRBJ2-7*01'); here it is a V-gene name with an extra '*01'.
# Presumably this matches how the CSV labels its columns — confirm.
t.sample(
    [['TRBV10-2*01', 'TRBV10-2*01*01', 1], ['TRBV27*01', 'TRBV27*01*01', 4]],
    depth=10)

# Print every key of ref_dict together with the row count of its value.
for k, v in t.ref_dict.items():
    print(k, v.shape[0])
def test_dash_ecdf():
    """
    An empirical cumulative distribution function (ECDF) can be created
    for a target TCR and a reference set of TCRs to show
    the proportion of reference TCRs that are within a distance
    D of the target TCR, over a range of distances.

    A plot of the ECDF as a function of increasing D shows the
    density of TCR space in the reference set in the neighborhood
    around the target TCR. This can be very helpful for
    identifying dense antigen-specific clusters in an antigen
    enriched TCR repertoire, where the "reference" set is
    actually an experimentally enriched repertoire (e.g.
    pMHC:tetramer or AIM sorting). Or the ECDF can be helpful
    for identifying a radius around a TCR that retains high
    antigen specificity, by showing that the neighborhood
    is extremely sparse in a large unsorted/bulk TCR repertoire.

    The function draws three matplotlib figures (reference ECDF, target
    ECDF, and a combined "ROC"-style plot) and returns nothing.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrsampler.sampler import TCRsampler
    from tcrdist.ecdf import distance_ecdf, make_ecdf_step
    from tcrdist.background import make_gene_usage_counter, make_vj_matched_background, \
                                    make_flat_vj_background, get_gene_frequencies, calculate_adjustment

    import matplotlib.pyplot as plt

    df = pd.read_csv('dash.csv')
    df = df.loc[df['epitope'] == 'PB1']
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Call on an instance, as the other examples in this file do; this works
    # whether download_background_file is an instance or a static method.
    TCRsampler().download_background_file(download_file='wiraninha_sampler.zip')
    cols = ['v_b_gene', 'j_b_gene']

    refs = []
    for ts_fn in [f'wirasinha_mouse_beta_s_{i}.tsv.sampler.tsv' for i in '48']:
        ts = TCRsampler(default_background=ts_fn)
        ts.build_background(stratify_by_subject=True, use_frequency=False)
        # Sanitize the alleles to *01 for TCRsampler. Per-column Series.map
        # is element-wise like DataFrame.applymap, which is deprecated in
        # recent pandas.
        tmp = df[cols].apply(
            lambda col: col.map(lambda s: s.split('*')[0] + '*01'))
        freqs = tmp.groupby(cols).size()
        # Records of (v gene, j gene, count), used as sampling requests.
        freq_records = list(freqs.to_frame().to_records())
        ref = ts.sample(freq_records, depth=10, seed=110820)
        ref_df = pd.concat([
            pd.DataFrame({
                'cdr3_b_aa': ref[i]
            }).assign(v_b_gene=v, j_b_gene=j)
            for i, (v, j, _) in enumerate(freq_records)
        ])
        # Assigns pV, pJ and pVJ to ref_df.
        ref_df = get_gene_frequencies(ts=ts, df=ref_df)

        xdf = freqs.reset_index()
        xdf.columns = ['v_b_gene', 'j_b_gene', 'n']
        # For each V,J pairing compute its frequency in this reference.
        xdf = xdf.assign(ref_freq=xdf['n'] / xdf['n'].sum())
        ref_df = ref_df.merge(xdf, how='left', on=cols).reset_index()
        # Assign weights to ref sequences: Pr_actual / Pr_sampling.
        ref_df = ref_df.assign(weights=ref_df['pVJ'] / ref_df['ref_freq'])
        refs.append(ref_df)
        # Add uniformly sampled sequences.
        # NOTE(review): no 'weights' are assigned to these rows here; unless
        # ts.ref_df already carries that column they will be NaN after the
        # concat below — confirm this is intended.
        ref_df = ts.ref_df.sample(100, random_state=1)
        refs.append(ref_df)

    ref_df = pd.concat(refs, axis=0)
    ref_tr = TCRrep(cell_df=ref_df[cols + ['cdr3_b_aa', 'weights']],
                    organism='mouse',
                    chains=['beta'],
                    compute_distances=False,
                    store_all_cdr=False)

    # Distances from target clones to reference clones; read from tr.rw_beta.
    tr.compute_rect_distances(df=tr.clone_df, df2=ref_tr.clone_df, store=False)

    thresholds = np.arange(1, 50)
    thresholds, ref_ecdf = distance_ecdf(tr.rw_beta,
                                         thresholds=thresholds,
                                         weights=ref_tr.clone_df['weights'] *
                                         ref_tr.clone_df['count'])

    thresholds, target_ecdf = distance_ecdf(tr.pw_beta,
                                            thresholds=thresholds,
                                            weights=None)

    # Figure 1: ECDF against the reference; one gray curve per row, mean in red.
    figh = plt.figure(figsize=(5, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log')
    plt.ylabel('Proportion of reference TCRs')
    plt.xlabel('Distance from target TCR clone')
    for tari in range(ref_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, ref_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(ref_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)

    # Figure 2: ECDF against the target repertoire itself.
    figh = plt.figure(figsize=(5, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log')
    plt.ylabel('Proportion of target TCRs')
    plt.xlabel('Distance from target TCR clone')
    for tari in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, target_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)

    # Figure 3: an "ROC" plot combining the ECDF against the target
    # (sensitivity) vs. the ECDF against the reference (specificity).
    figh = plt.figure(figsize=(7, 5))
    axh = figh.add_axes([0.15, 0.15, 0.6, 0.7], yscale='log', xscale='log')
    plt.ylabel('Proportion of target TCRs')
    plt.xlabel('Proportion of reference TCRs')
    for tari in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(ref_ecdf[tari, :], target_ecdf[tari, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(np.mean(ref_ecdf, axis=0),
                          np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)
    # Draw the diagonal, then restore the axis limits it may have changed.
    yl = plt.ylim()
    xl = plt.xlim()
    #yl = (1e-6, 0.3)
    plt.plot(yl, yl, '--', color='gray')
    plt.xlim(xl)
    plt.ylim(yl)
svgs_alpha = list()
svgs_alpha_raw = list()
for i,r in tr.hcluster_df.iterrows():
    dfnode   = tr.clone_df.iloc[r['neighbors_i'],].copy()
    # <pwnode> Pairwise Matrix for node sequences
    pwnode   = getattr(tr, pwmat_str)[r['neighbors_i'],:][:,r['neighbors_i']].copy()
    if dfnode.shape[0] > 2:
        iloc_idx = pwnode.sum(axis = 0).argmin()
        centroid = dfnode[cdr3_name].to_list()[iloc_idx]
    else:
        centroid = dfnode[cdr3_name].to_list()[0]
    
    print(f"ALPHA-CHAIN CENTROID: {centroid}")
    
    gene_usage_alpha = dfnode.groupby(gene_names).size()
    sampled_rep = tcrsampler_alpha.sample( gene_usage_alpha.reset_index().to_dict('split')['data'], 
                    flatten = True, depth = 10)
    
    sampled_rep  = [x for x in sampled_rep if x is not None]

    motif, stat = compute_pal_motif(
                    seqs = _select(df = tr.clone_df, 
                                   iloc_rows = r['neighbors_i'], 
                                   col = cdr3_name),
                    refs = sampled_rep, 
                    centroid = centroid)
    svgs_alpha.append(svg_logo(motif, return_str= True))

    sampled_rep = sampled_rep.append(centroid)
    motif_raw, _ = compute_pal_motif(
                seqs =_select(df = tr.clone_df, 
                               iloc_rows = r['neighbors_i'],