コード例 #1
0
ファイル: test_background.py プロジェクト: xzhan50/tcrdist3
def test_background_generation_in_mira_60(fn=os.path.join(
    'tcrdist', 'data', 'covid19',
    'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv')):
    """
    Build a two-part, 200,000-row beta-chain background for the TCRs in <fn>.

    Steps
    -----
    1. Create a TCRsampler from the Britanova cord blood sampler file and
       replace its gene occurrence frequencies with subject-stratified
       estimates (replace = True).
    2. Count the V,J gene usage of the target TCRs read from <fn>.
    3. Generate a V,J gene-usage matched background, keep 100,000 rows of it,
       and attach weights computed by calculate_adjustment on column "pVJ".
    4. Draw 100,000 TCRs with sample_britanova, annotate them with gene
       frequencies, give them weight 1, and stack both parts.

    Parameters
    ----------
    fn : str
        Path to a CSV file containing at least the columns
        'v_b_gene', 'j_b_gene' and 'cdr3_b_aa'.

    Returns
    -------
    df_bkgd : pd.DataFrame
        The concatenated background (asserted to have exactly 200,000 rows),
        with a 'source' column of "vj_matched" or "stratified_random".
    """
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (
        calculate_adjustment,
        get_gene_frequencies,
        get_stratified_gene_usage_frequency,
        make_gene_usage_counter,
        make_vj_matched_background,
        sample_britanova,
    )

    # 1. Sampler whose gene usage frequencies are subject-stratified
    ts = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # 2. Gene usage of the target repertoire
    target_columns = ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']
    df_target = pd.read_csv(fn)[target_columns]
    gene_usage_counter = make_gene_usage_counter(df_target)

    # 3. V,J matched synthetic part.
    # Ask for a few extra as Olga can return none if it makes too many
    # non-productive CDR3s; the surplus is discarded by the sample() below.
    df_vj_bkgd = make_vj_matched_background(
        ts=ts,
        gene_usage_counter=gene_usage_counter,
        size=150000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=target_columns)
    df_vj_bkgd = df_vj_bkgd.sample(100000).reset_index(drop=True)
    df_vj_bkgd['weights'] = calculate_adjustment(df=df_vj_bkgd, adjcol="pVJ")
    df_vj_bkgd['source'] = "vj_matched"

    # 4. Random cord blood part, unit weights
    df_britanova_100K = sample_britanova(size=100000)
    df_britanova_100K = get_gene_frequencies(ts=ts, df=df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"

    df_bkgd = pd.concat([df_vj_bkgd, df_britanova_100K], axis=0)
    df_bkgd = df_bkgd.reset_index(drop=True)

    assert df_bkgd.shape[0] == 200000
    return df_bkgd
コード例 #2
0
ファイル: repertoire.py プロジェクト: xzhan50/tcrdist3
 def synthesize_vj_matched_background(self, ts=None, chain="beta"):
     """
     Synthesize a V-J matched background for the clones in self.clone_df.

     Parameters
     ----------
     ts : TCRsampler or None
         Sampler passed to the organism/chain specific synthesis helper.
         If None, the default sampler for (self.organism, chain) is
         instantiated and passed through
         get_stratified_gene_usage_frequency(replace = True).
     chain : str
         'beta' or 'alpha'

     Returns
     -------
     vj_background
         Whatever the matching _synthesize_<organism>_<chain>_vj_background
         helper returns (presumably a DataFrame; confirm against the helper).

     Raises
     ------
     ValueError
         If <chain> is not 'beta' or 'alpha', or if self.organism is not
         'human' or 'mouse'.
     """
     if chain not in ["beta", "alpha"]:
         raise ValueError("Invalid <chain> argument.")
     # Without this guard an unsupported organism fell through every branch
     # and the final return failed with an UnboundLocalError.
     if self.organism not in ["human", "mouse"]:
         raise ValueError("Invalid organism; only 'human' and 'mouse' are supported.")

     if ts is None:
         ts = _default_sampler(organism=self.organism, chain=chain)()
         ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

     # Helper names are looked up only in the branch that is taken.
     if chain == "beta":
         if self.organism == "human":
             vj_background = _synthesize_human_beta_vj_background(
                 ts=ts, df=self.clone_df)
         else:
             vj_background = _synthesize_mouse_beta_vj_background(
                 ts=ts, df=self.clone_df)
     else:
         if self.organism == "human":
             vj_background = _synthesize_human_alpha_vj_background(
                 ts=ts, df=self.clone_df)
         else:
             vj_background = _synthesize_mouse_alpha_vj_background(
                 ts=ts, df=self.clone_df)
     return vj_background
コード例 #3
0
def test_example_with_report():
    """
    Example of TCR radii defined for each TCR in an
    antigen enriched repertoire, and logo-motif report.

    Reads "dash.csv" from the working directory and writes
    "quasi_public_clones.html", "PA1.png" and "PA2.png".
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.sample import _default_sampler
    from tcrdist.background import get_stratified_gene_usage_frequency
    from tcrdist.centers import calc_radii
    from tcrdist.public import _neighbors_sparse_variable_radius, _neighbors_variable_radius
    from tcrdist.public import TCRpublic
    from tcrdist.ecdf import _plot_manuscript_ecdfs, distance_ecdf

    # ANTIGEN ENRICHED REPERTOIRE
    # Load all TCRs tetramer-sorted for the influenza PA epitope
    df = pd.read_csv("dash.csv").query('epitope == "PA"').\
        reset_index(drop = True)
    # Load <df> into a TCRrep instance, to infer CDR1, CDR2, and CDR2.5 region of each clone
    tr = TCRrep(cell_df=df.copy(),
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    # UN-ENRICHED REPERTOIRE
    # For illustration we pull a default sampler for mouse beta chains.
    # This is used to estimate the gene usage
    # probabilities P(TRBV = V, TRBJ = J)
    ts = _default_sampler(organism="mouse", chain="beta")()
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)
    # Then we synthesize a background using Olga (Sethna et al. 2019),
    # using the P(TRBV = V, TRBJ = J) for inverse probability weighting.
    df_vj_background = tr.synthesize_vj_matched_background(ts=ts, chain='beta')
    # Load <df_vj_background> into a TCRrep instance, to infer CDR1,CDR2,CDR2.5
    trb = TCRrep(cell_df=df_vj_background.copy(),
                 organism='mouse',
                 chains=['beta'],
                 db_file='alphabeta_gammadelta_db.tsv',
                 compute_distances=False)

    # Take advantage of multiple CPUs
    tr.cpus = 4
    # Compute radii for each TCR that controls neighbor-discovery in the background at
    # estimate of 1/10^5 inverse probability weighted TCRs.
    # Note we set <use_sparse> to True, which allows us to take advantage of
    # multiple cpus and only store distances less than or equal to <max_radius>
    radii, thresholds, ecdfs = calc_radii(
        tr=tr,
        tr_bkgd=trb,
        chain='beta',
        ctrl_bkgd=10**-5,
        use_sparse=True,
        max_radius=50)

    # Optional, set a maximum radius.
    # A single .loc assignment is used instead of chained indexing
    # (df[col][mask] = value), which may write to a temporary copy and
    # silently leave the DataFrame unchanged.
    tr.clone_df['radius'] = radii
    tr.clone_df.loc[tr.clone_df['radius'] > 26, 'radius'] = 26

    # Tabulate index of neighboring clones in the ANTIGEN ENRICHED REPERTOIRE,
    # at each TCR-specific radius
    tr.clone_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta, radius_list=tr.clone_df['radius'])
    # Tabulate neighboring sequences in background
    tr.clone_df['background_neighbors'] = _neighbors_sparse_variable_radius(
        csrmat=tr.rw_beta, radius_list=tr.clone_df['radius'])
    # Tabulate number of unique subjects
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    # Score Quasi(Publicity) : True (Quasi-Public), False (private)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].\
        apply(lambda x: x > 1)

    # OPTIONAL: HTML Report
    # Note: you can call TCRpublic() with fixed radius or directly
    # after tr.clone_df['radius'] is defined.
    tp = TCRpublic(tcrrep=tr, output_html_name="quasi_public_clones.html")
    tp.fixed_radius = False
    # Generates the HTML report
    rp = tp.report()

    # OPTIONAL: ECDF Figure, against reference
    cdr3_lengths = tr.clone_df.cdr3_b_aa.str.len()
    f1 = _plot_manuscript_ecdfs(thresholds=thresholds,
                                ecdf_mat=ecdfs,
                                ylab='Proportion of Background TCRs',
                                cdr3_len=cdr3_lengths,
                                min_freq=1E-10)
    f1.savefig("PA1.png")

    # It is straightforward to make a ECDF between antigen enriched TCRs as well.
    # The thresholds returned here are discarded; the plot below keeps using
    # the <thresholds> returned by calc_radii.
    _, antigen_enriched_ecdf = distance_ecdf(pwrect=tr.pw_beta,
                                             thresholds=thresholds,
                                             weights=None,
                                             pseudo_count=0,
                                             skip_diag=False,
                                             absolute_weight=True)
    # Raise the smallest value to the plotting floor used as <min_freq>
    antigen_enriched_ecdf[antigen_enriched_ecdf ==
                          antigen_enriched_ecdf.min()] = 1E-10
    f2 = _plot_manuscript_ecdfs(thresholds=thresholds,
                                ecdf_mat=antigen_enriched_ecdf,
                                ylab='Proportion of Antigen Enriched PA TCRs',
                                cdr3_len=cdr3_lengths,
                                min_freq=1E-10)
    f2.savefig("PA2.png")
コード例 #4
0
ファイル: test_background.py プロジェクト: xzhan50/tcrdist3
def test_background_generation_toy_example():
    """
    Build a two-part, 200,000-row beta-chain background for a toy repertoire.

    The toy target repertoire <df_target> is sampled from rare and semi-rare
    V,J pairings listed in <ix> as [v_gene, j_gene, number_of_draws].

    Steps
    -----
    1. Create a TCRsampler from the Britanova cord blood sampler file and
       replace its gene occurrence frequencies with subject-stratified
       estimates (replace = True).
    2. Sample <df_target> and count its V,J gene usage.
    3. Generate a V,J gene-usage matched background, keep 100,000 rows of it,
       and attach weights computed by calculate_adjustment on column "pVJ".
    4. Draw 100,000 TCRs with sample_britanova, annotate them with gene
       frequencies, give them weight 1, and stack both parts.
    5. Check that V,J usage frequencies in the matched background agree with
       those in <df_target> to within 0.001.

    Returns
    -------
    df_bkgd : pd.DataFrame
        The concatenated background (asserted to have exactly 200,000 rows),
        with a 'source' column of "vj_matched" or "stratified_random".
    """
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (
        calculate_adjustment,
        get_gene_frequencies,
        get_stratified_gene_usage_frequency,
        make_gene_usage_counter,
        make_vj_matched_background,
        sample_britanova,
    )

    # 1. Sampler whose gene usage frequencies are subject-stratified
    ts = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # 2. Toy target repertoire: [v_gene, j_gene, number of CDR3s to draw]
    ix = [['TRBV19*01', 'TRBJ2-5*01', 3], ['TRBV24-1*01', 'TRBJ2-4*01', 3],
          ['TRBV25-1*01', 'TRBJ2-4*01', 3], ['TRBV30*01', 'TRBJ2-3*01', 2],
          ['TRBV5-4*01', 'TRBJ2-3*01', 2], ['TRBV11-2*01', 'TRBJ2-2*01', 2],
          ['TRBV2*01', 'TRBJ1-5*01', 1], ['TRBV12-5*01', 'TRBJ2-7*01', 1],
          ['TRBV4-1*01', 'TRBJ1-6*01', 1], ['TRBV6-5*01', 'TRBJ1-6*01', 1],
          ['TRBV13*01', 'TRBJ2-3*01', 1], ['TRBV18*01', 'TRBJ2-3*01', 1],
          ['TRBV14*01', 'TRBJ2-7*01', 1], ['TRBV6-6*01', 'TRBJ2-7*01', 1],
          ['TRBV10-3*01', 'TRBJ2-3*01', 1], ['TRBV7-2*01', 'TRBJ2-1*01', 1],
          ['TRBV5-1*01', 'TRBJ2-1*01', 1]]
    target_frames = []
    for v_gene, j_gene, n_draws in ix:
        # ts.sample returns one list per request; flatten to a list of CDR3s
        sampled = ts.sample([[v_gene, j_gene, n_draws]])
        target_frames.append(pd.DataFrame({
            'cdr3_b_aa': [cdr3 for batch in sampled for cdr3 in batch],
            'v_b_gene': v_gene,
            'j_b_gene': j_gene,
        }))
    df_target = pd.concat(target_frames).reset_index(drop=True)
    gene_usage_counter = make_gene_usage_counter(df_target)

    # 3. V,J matched synthetic part.
    # Ask for a few extra as Olga can return none if it makes too many
    # non-productive CDR3s; the surplus is discarded by the sample() below.
    df_vj_bkgd = make_vj_matched_background(
        ts=ts,
        gene_usage_counter=gene_usage_counter,
        size=101000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    df_vj_bkgd = df_vj_bkgd.sample(100000).reset_index(drop=True)
    df_vj_bkgd['weights'] = calculate_adjustment(df=df_vj_bkgd, adjcol="pVJ")
    df_vj_bkgd['source'] = "vj_matched"

    # 4. Random cord blood part, unit weights
    df_britanova_100K = sample_britanova(size=100000)
    df_britanova_100K = get_gene_frequencies(ts=ts, df=df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"

    df_bkgd = pd.concat([df_vj_bkgd, df_britanova_100K], axis=0)
    df_bkgd = df_bkgd.reset_index(drop=True)
    assert df_bkgd.shape[0] == 200000

    # 5. Compare gene usage between target seqs and vj-matched background.
    # Both Series are unnamed, so the concatenated columns are labelled 0 and 1.
    gene_pair = ['v_b_gene', 'j_b_gene']
    bkgd_usage = df_vj_bkgd.groupby(gene_pair).size() / df_vj_bkgd.shape[0]
    target_usage = df_target.groupby(gene_pair).size() / df_target.shape[0]
    df_check_match = pd.concat([bkgd_usage, target_usage], axis=1)
    assert np.all(abs(df_check_match[0] - df_check_match[1]) < 0.001)
    return df_bkgd
コード例 #5
0
ファイル: metaclonotypes.py プロジェクト: xzhan50/tcrdist3
def find_metaclonotypes(
    project_path = "tutorial48",
    source_path = os.path.join(path_to_base,'tcrdist','data','covid19'),
    antigen_enriched_file = 'mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
    ncpus = 4, 
    seed = 3434):
    """
    Run a complete workflow for finding meta-clonotypes in antigen-enriched
    beta-chain data and write every result into <project_path>.

    Parameters
    ----------
    project_path : str
        Output directory; created if it does not exist.
    source_path : str
        Directory containing <antigen_enriched_file>.
    antigen_enriched_file : str
        CSV file with at least the columns 'subject', 'cell_type',
        'v_b_gene', 'j_b_gene' and 'cdr3_b_aa'. Its name also prefixes
        every output file.
    ncpus : int
        Number of CPUs, set on the TCRrep and passed to bkgd_cntl_nn2.
    seed : int
        Seed passed to np.random.seed before any sampling.

    Outputs (all named <project_path>/<antigen_enriched_file>.<suffix>)
    -------
    olga100K_brit100K_bkgd.csv                 the 200,000-row background
    rw_beta.npz                                sparse rectangular distances
    ecdf_AER_plot.png, ecdf_BUR_plot.png       ECDF figures
    centers_bkgd_ctlr_<level>.tsv              all meta-clonotypes
    ranked_centers_bkgd_ctlr_<level>.tsv       ranked meta-clonotypes
    ranked_centers_bkgd_ctlr_<level>.html      logo report (only if any ranked)
    where <level> is '1E5' and then '1E6'.

    Returns
    -------
    None
    """
    # Local imports. get_gene_frequencies and scipy.sparse are imported here
    # explicitly so the function does not depend on module-level imports
    # for them.
    import scipy.sparse
    from tcrsampler.sampler import TCRsampler
    from tcrdist.automate import auto_pgen
    from tcrdist.background import get_gene_frequencies
    from tcrdist.background import get_stratified_gene_usage_frequency
    from tcrdist.background import sample_britanova
    from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs
    from tcrdist.repertoire import TCRrep

    np.random.seed(seed)
    os.makedirs(project_path, exist_ok = True)
    # Every output path is <project_path>/<antigen_enriched_file> + suffix
    out_prefix = os.path.join(project_path, antigen_enriched_file)

    ############################################################################
    # Step 1: Select and load an antigen-enriched (sub)repertoire.          ####
    ############################################################################
    print(f"INITIATING A TCRrep() with {antigen_enriched_file}")
    input_file = os.path.join(source_path, antigen_enriched_file)
    assert os.path.isfile(input_file)
    df = pd.read_csv(input_file)
    # Drop cells without any gene usage information
    df = df[( df['v_b_gene'].notna() ) & (df['j_b_gene'].notna()) ]
    # Use ONLY columns that are complete and uniquely define a clone.
    tr = TCRrep(cell_df = df[['subject','cell_type','v_b_gene', 'j_b_gene', 'cdr3_b_aa']], 
                organism = "human", 
                chains = ['beta'], 
                compute_distances = True)
    tr.cpus = ncpus

    ############################################################################
    # Step 1.1: Estimate Probability of Generation                          ####
    ############################################################################
    print("COMPUTING PGEN WITH OLGA (Sethna et al 2018)")
    print("FOR ANTIGEN-ENRICHED CLONES TO BE USED FOR SUBSEQUENT ANALYSES")
    auto_pgen(tr)

    ############################################################################
    # Step 2: Synthesize an Inverse Probability Weighted VJ Matched Background #
    ############################################################################
    # Two-part background: a synthetic part whose V- and J-gene usage matches
    # the antigen-enriched repertoire, plus 100,000 cord blood TCRs drawn by
    # sample_britanova. The combined table is asserted to hold 200,000 rows.
    print("USING tcrsampler TO CONSTRUCT A CUSTOM V-J MATCHED BACKGROUND")
    ts = TCRsampler(default_background = 'britanova_human_beta_t_cb.tsv.sampler.tsv')
    # Stratify so that each subject contributes similarly to the estimate of
    # gene usage frequency
    ts = get_stratified_gene_usage_frequency(ts = ts, replace = True) 
    df_vj_background = tr.synthesize_vj_matched_background(ts = ts, chain = 'beta')
    df_britanova_100K = sample_britanova(size = 100000)
    # Append frequency columns using the sampler above
    df_britanova_100K = get_gene_frequencies(ts = ts, df = df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"
    df_bkgd = pd.concat([df_vj_background.copy(), df_britanova_100K.copy()], axis = 0).\
        reset_index(drop = True)
    assert df_bkgd.shape[0] == 200000
    # Save the background for future use
    background_outfile = f"{out_prefix}.olga100K_brit100K_bkgd.csv"
    print(f'WRITING {background_outfile}')
    df_bkgd.to_csv(background_outfile, index = False)
    # Load the background to a TCRrep without computing pairwise distances
    tr_bkgd = TCRrep(
        cell_df = df_bkgd,
        organism = "human", 
        chains = ['beta'], 
        compute_distances = False)

    ############################################################################
    # Step 4: Calculate Distances                                          #####
    ############################################################################
    # Sparse rectangular distances between each antigen-enriched clone and
    # each background TCR; stored on tr.rw_beta.
    print("COMPUTING RECTANGULARE DISTANCE")
    tr.compute_sparse_rect_distances(
        df = tr.clone_df, 
        df2 = tr_bkgd.clone_df,
        radius=50,
        chunk_size = 100)
    scipy.sparse.save_npz(f"{out_prefix}.rw_beta.npz", tr.rw_beta)

    ############################################################################
    # Step 5: Examine Density ECDFS                                        #####
    ############################################################################
    # Density of neighbors to each TCR, based on expanding distance radius:
    # first against the antigen-enriched TCRs themselves, then against the
    # weighted background.
    thresholds, antigen_enriched_ecdf = distance_ecdf(
        tr.pw_beta,
        thresholds=range(0,50,2))
    thresholds, background_ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds=range(0,50,2),
        weights= tr_bkgd.clone_df['weights'], 
        absolute_weight = True)
    # Raise the smallest value to the plotting floor used as <min_freq>
    antigen_enriched_ecdf[antigen_enriched_ecdf == antigen_enriched_ecdf.min()] = 1E-10
    cdr3_lengths = tr.clone_df.cdr3_b_aa.str.len()
    f1 = _plot_manuscript_ecdfs(
        thresholds, 
        antigen_enriched_ecdf, 
        ylab= 'Proportion of Antigen Enriched TCRs', 
        cdr3_len=cdr3_lengths, 
        min_freq=1E-10)
    f1.savefig(f'{out_prefix}.ecdf_AER_plot.png')
    f2 = _plot_manuscript_ecdfs(
        thresholds,
        background_ecdf,
        ylab= 'Proportion of Reference TCRs',
        cdr3_len=cdr3_lengths,
        min_freq=1E-10)
    f2.savefig(f'{out_prefix}.ecdf_BUR_plot.png')

    ############################################################################
    # Step 6: Find optimal radii and report, at theta = 1E5 and then 1E6   #####
    ############################################################################
    for level_tag, ctrl_bkgd in (('1E5', 10**-5), ('1E6', 10**-6)):
        _find_metaclonotypes_at_level(
            tr = tr,
            tr_bkgd = tr_bkgd,
            ts = ts,
            out_prefix = out_prefix,
            level_tag = level_tag,
            ctrl_bkgd = ctrl_bkgd,
            ncpus = ncpus)


def _find_metaclonotypes_at_level(tr, tr_bkgd, ts, out_prefix, level_tag, ctrl_bkgd, ncpus):
    """Find, rank and write out meta-clonotypes for one background control level."""
    from tcrdist.centers import rank_centers
    from tcrdist.neighbors import bkgd_cntl_nn2
    from tcrdist.public import _neighbors_variable_radius

    centers_df = bkgd_cntl_nn2(
        tr               = tr,
        tr_background    = tr_bkgd,
        weights          = tr_bkgd.clone_df.weights,
        ctrl_bkgd        = ctrl_bkgd, 
        col              = 'cdr3_b_aa',
        add_cols         = ['v_b_gene', 'j_b_gene'],
        ncpus            = ncpus,
        include_seq_info = True,
        thresholds       = list(range(0,50,2)),
        generate_regex   = True,
        test_regex       = True,
        forced_max_radius = 36)

    # Add neighbors, K_neighbors, and nsubject columns
    centers_df['neighbors'] = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list = centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(len)
    # Number of distinct subjects among the neighbors of each center
    centers_df['nsubject']  = centers_df['neighbors'].\
            apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    # ALL meta-clonotypes, saved for future use
    centers_df.to_csv(f'{out_prefix}.centers_bkgd_ctlr_{level_tag}.tsv', sep = "\t" )

    # Many meta-clonotypes contain redundant information, so winnow down to a
    # less-redundant list by ranking.
    # <min_nsubject> is minimum publicity of the meta-clonotype,
    # <min_nr> is minimum non-redundancy
    ranked_centers_df = rank_centers(
        centers_df = centers_df, 
        rank_column = 'chi2joint', 
        min_nsubject = 2, 
        min_nr = 1)
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv(f'{out_prefix}.ranked_centers_bkgd_ctlr_{level_tag}.tsv', sep = "\t" )

    if ranked_centers_df.shape[0] > 0:
        _write_metaclonotype_html(
            ranked_centers_df = ranked_centers_df,
            tr = tr,
            ts = ts,
            output_html_name = f'{out_prefix}.ranked_centers_bkgd_ctlr_{level_tag}.html')


def _write_metaclonotype_html(ranked_centers_df, tr, ts, output_html_name):
    """Make an svg logo pair for each ranked meta-clonotype and write the HTML summary."""
    from progress.bar import IncrementalBar
    from tcrdist.public import make_motif_logo

    svgs = list()
    svgs_raw = list()
    bar = IncrementalBar('Processing', max = ranked_centers_df.shape[0])
    for _, row in ranked_centers_df.iterrows():
        bar.next()
        svg, svg_raw = make_motif_logo( tcrsampler = ts, 
                                        pwmat = tr.pw_beta,
                                        clone_df = tr.clone_df,
                                        centroid = row['cdr3_b_aa'],
                                        v_gene = row['v_b_gene'],
                                        radius = row['radius'],
                                        pwmat_str = 'pw_beta',
                                        cdr3_name = 'cdr3_b_aa',
                                        v_name = 'v_b_gene',
                                        gene_names = ['v_b_gene','j_b_gene'])
        svgs.append(svg)
        svgs_raw.append(svg_raw)
    bar.finish()
    ranked_centers_df['svg']      = svgs
    ranked_centers_df['svg_raw'] = svgs_raw

    def shrink(s):
        # Scale the embedded svg from 100% down to 20% in both dimensions
        return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')

    labels =['cdr3_b_aa','v_b_gene', 'j_b_gene', 'pgen',
            'radius', 'regex','nsubject','K_neighbors', 
            'bkgd_hits_weighted','chi2dist','chi2re','chi2joint']

    with open(output_html_name, 'w') as output_handle:
        for _, row in ranked_centers_df.iterrows():
            output_handle.write("<br></br>")
            output_handle.write(shrink(row['svg']))
            output_handle.write(shrink(row['svg_raw']))
            output_handle.write("<br></br>")
            output_handle.write(pd.DataFrame(row[labels]).transpose().to_html())
            output_handle.write("<br></br>")