Example #1
def test_current_example():
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    fn_mira_background = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.olga100K_brit100K_bkgd.csv'
    fn_mira_background = os.path.join('tcrdist', 'data', 'covid19',
                                      fn_mira_background)
    df_background = pd.read_csv(fn_mira_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    fn_mira = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv'
    fn_mira = os.path.join('tcrdist', 'data', 'covid19', fn_mira)
    df_mira = pd.read_csv(fn_mira)
    df_mira = df_mira[['v_b_gene', 'j_b_gene', 'cdr3_b_aa']]
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)
    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=10**-6,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=[x for x in range(0, 50, 2)],
                               generate_regex=True,
                               test_regex=True)

    out_fn_center_df = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.centers_df.csv'
    out_fn_rw_beta_sparse_matrix = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    centers_df.to_csv(out_fn_center_df, index=False)
    tr.rw_beta[tr.rw_beta == 0] = 1  # re-code true zero distances as 1 so they survive sparsification
    tr.rw_beta[tr.rw_beta > 50] = 0  # drop distances > 50 (become implicit zeros in the sparse matrix)
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(out_fn_rw_beta_sparse_matrix, rw_beta_sparse)
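The re-coding above stores true zero distances as 1 and drops anything above 50, so the saved sparse matrix uses implicit zeros for "far" pairs. A minimal sketch for reloading it (only scipy is assumed):

import scipy.sparse
rw_beta_sparse = scipy.sparse.load_npz(
    'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz')
# Entries equal to 1 are true zero distances; entries absent from the
# sparse matrix had TCRdist > 50.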
Example #2
def test_old_example():
    """
	The purpose of this example is to show the use of 
	chosing thresholds based on background discovery rate
	"""
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.regex import _index_to_regex_str, _index_to_seqs
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets

    fn = os.path.join('tcrdist', 'data', 'covid19', "m60_bkgd_test_input.csv")
    df_background = pd.read_csv(fn)

    tr_background = TCRrep(cell_df=df_background,
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)
    tr_background.clone_df['weights'] = 1
    fn = os.path.join('tcrdist', 'data', 'covid19', "m60_test_input.csv")
    df = pd.read_csv(fn)

    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=2 * 10**-5,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=[x for x in range(0, 50, 2)],
                               generate_regex=True,
                               test_regex=True)

    centers_df = centers_df.sort_values(['target_hits'], ascending=False)
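compute_ecdf is imported above but never called. Continuing from the objects in this test, a hedged sketch of how it could summarize the rectangular distances, mirroring the call pattern used in Example #6 (the threshold grid is illustrative):

# Sketch only: weighted ECDF of background distances from the first
# target clone; compute_ecdf usage copied from Example #6.
ecdf = compute_ecdf(tr.rw_beta[0, :],
                    thresholds=np.array(range(0, 50, 2)),
                    weights=tr_background.clone_df.weights)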
Example #3
def test_auto_pgen_human_alpha_beta():
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    df = pd.read_csv("dash_human.csv").sample(10, random_state=3)
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    from tcrdist.automate import auto_pgen
    tr = auto_pgen(tr)
    assert isinstance(tr.clone_df.pgen_cdr3_b_aa, pd.Series)
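Because chains=['alpha', 'beta'] here, auto_pgen should also populate an alpha-chain column; the name pgen_cdr3_a_aa below is assumed by analogy with the asserted beta column, not checked in the original test.

# Assumed companion check; column name inferred by analogy.
assert isinstance(tr.clone_df.pgen_cdr3_a_aa, pd.Series)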
Example #4
def test_auto_pgen_mouse_alpha_beta_ValueError():
    """ If auto_pgen called on TCRrep with organism == mouse and 
    chain including 'alpha', Raises ValueError"""

    import pytest
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    df = pd.read_csv("dash.csv").sample(10, random_state=3)
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    from tcrdist.automate import auto_pgen
    with pytest.raises(ValueError):
        tr = auto_pgen(tr)
Example #5
def find_centers_beta(background_filename,
                      target_filename,
                      ncpus,
                      min_nsubject,
                      ctrl_bkgd=10**-5,
                      prefilter=False):
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.rep_diff import neighborhood_diff
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets
    import scipy.sparse

    df_background = pd.read_csv(background_filename)
    print(df_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    df_mira = pd.read_csv(target_filename)
    df_mira = df_mira[[
        'subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa'
    ]]
    print(df_mira)
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)

    if prefilter:
        # We can greatly cut down on the number of searches if we drop centroids without minimum publicity
        nn_df = neighborhood_diff(clone_df=tr.clone_df,
                                  pwmat=tr.pw_beta,
                                  count_col='count',
                                  x_cols=['cell_type'],
                                  knn_radius=37)

        def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
            # Tabulate the number of unique subjects at each node
            neighbor_df['nsubject'] = neighbor_df[col_nn].apply(lambda x: len(
                set(_select(clone_df, iloc_rows=x, col='subject'))))
            return neighbor_df

        print("TABULATING PUBLIC CLUSTERS")
        nn_df = tabulate_publicity(nn_df, tr.clone_df)
        nn_df = filter_gt(nn_df, 'nsubject', min_nsubject)

        if nn_df.shape[0] == 0:
            centers_df = pd.DataFrame(
                {},
                columns=[
                    'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'max_radi',
                    'target_hits', 'bkgd_hits', 'bkgd_hits_weighted',
                    'bkgd_total', 'ctrl', 'ctrl_weighted', 'target_misses',
                    'TR', 'TR2', 'BR_weighted', 'RR_weighted', 'OR_weighted',
                    'chi2dist', 'target_neighbors', 'target_seqs',
                    'background_neighbors', 'background_seqs', 'background_v',
                    'background_j', 'regex', 'target_re_hits', 'bkgd_re_hits',
                    'bkgd_re_weighted_hits', 'TR_re', 'BR_re_weighted',
                    'RR_re_weighted', 'OR_re_weighted', 'chi2re', 'chi2joint',
                    'nsubject'
                ])
            tr.pw_beta[tr.pw_beta == 0] = 1  # re-code true zero distances as 1 so they survive sparsification
            tr.pw_beta[tr.pw_beta > 50] = 0  # drop distances > 50 (become implicit zeros in the sparse matrix)
            pw_beta_sparse = scipy.sparse.csr_matrix(tr.pw_beta)
            return centers_df, pw_beta_sparse

        tr.clone_df = tr.clone_df.loc[nn_df.index, :].reset_index(drop=True)
        del nn_df
        # Compute pairwise again with filtered set
        tr.compute_distances()
        # compute pgens automatically, currently parmap will max out cpus on this step

    print("COMPUTING PROBABILITY OF GENERATION")
    auto_pgen(tr)
    print(
        f"COMPUTING RECT DIST {tr.clone_df.shape[0]}x{tr_background.clone_df.shape[0]}"
    )
    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=ctrl_bkgd,  #ctrl_bkgd = 2*10**-5
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=ncpus,
        thresholds=[x for x in range(0, 38, 2)],  # Setting 38 as the max radius
        generate_regex=True,
        test_regex=True)

    def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
        # Tabulate the number of unique subjects at each node
        neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
            lambda x: len(set(_select(clone_df, iloc_rows=x, col='subject'))))
        return neighbor_df

    centers_df = tabulate_publicity(neighbor_df=centers_df,
                                    clone_df=tr.clone_df,
                                    col_nn='target_neighbors')

    tr.rw_beta[tr.rw_beta == 0] = 1  # re-code true zero distances as 1 so they survive sparsification
    tr.rw_beta[tr.rw_beta > 50] = 0  # drop distances > 50 (become implicit zeros in the sparse matrix)
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    # scipy.sparse.save_npz(output_matrix_filename, rw_beta_sparse)
    return centers_df, rw_beta_sparse
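A hypothetical invocation of find_centers_beta; both CSV paths are placeholders for repertoires formatted like the inputs in Examples #1 and #2, not files shipped with tcrdist3.

# Hypothetical usage with placeholder file names.
centers_df, rw_beta_sparse = find_centers_beta(
    background_filename='my_background.csv',
    target_filename='my_antigen_enriched.csv',
    ncpus=2,
    min_nsubject=2,
    ctrl_bkgd=10**-5,
    prefilter=True)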
Example #6
def run_one(ref_fn, rep_fn, ss=-1, ncpus=1):
    ref_df = pd.read_csv(ref_fn)
    ref_df.columns = [{
        'v_b_name': 'v_b_gene',
        'j_b_name': 'j_b_gene',
        'cdr3_b_aa': 'cdr3_b_aa'
    }.get(c, c) for c in ref_df.columns]
    ref_df.loc[:, 'count'] = 1
    if ss == -1:
        ref_tr = TCRrep(cell_df=ref_df,
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)
    else:
        ref_tr = TCRrep(cell_df=ref_df.sample(n=ss, replace=False),
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)

    rep_df = pd.read_csv(rep_fn).assign(count=1)
    tr = TCRrep(cell_df=rep_df[[
        'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope', 'experiment',
        'subject', 'count'
    ]],
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

    if tr.clone_df.shape[0] > 6000:
        """Limit size of MIRA set to 6000 clones"""
        tr.clone_df = tr.clone_df.sample(n=6000,
                                         replace=False,
                                         random_state=110820)

    auto_pgen(tr)

    out = []
    print(rep_fn)
    for metric in ['tcrdist', 'tcrdist-cdr3', 'edit']:
        if 'tcr' in metric:
            metric_thresholds = np.arange(76)
            fcluster_thresholds = [0, 25, 50]
        else:
            metric_thresholds = np.arange(9)
            fcluster_thresholds = [0, 1, 2]
        """Enforce no clustering analysis"""
        fcluster_thresholds = [0]

        epitope_name = os.path.split(rep_fn)[1].split('.')[0]
        epitope_name = epitope_name.replace('mira_epitope_', 'M')

        # rep_fn = opj(_fg_data, 'ncov_tcrs/adaptive_bio_r2/tcrs_by_mira_epitope/pw_computed', rep_fn)
        print(f'\t{metric}')
        """with open(rep_fn, 'rb') as fh:
            tr = dill.load(fh)"""
        """Compute repertoire PW distances and create flat clusters"""
        rep_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed MIRA set pwrect.')

        ref_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            clone_df2=ref_tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed reference pwrect.')

        for fclust_thresh in fcluster_thresholds:
            if fclust_thresh > 0:
                rep_pwvec = scipy.spatial.distance.squareform(rep_pwmat,
                                                              force='tovector')
                Z = sch.linkage(rep_pwvec, method='complete')
                labels = sch.fcluster(Z, t=fclust_thresh, criterion='distance')
            else:
                labels = np.arange(1, rep_pwmat.shape[0] + 1)
            """Compute ECDF for each cluster within the repertoire"""
            # rep_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                rep_ecdf = compute_ecdf(np.mean(
                    rep_pwmat[lab_ind, :][:, ~lab_ind], axis=0),
                                        thresholds=metric_thresholds)
                tmp_df = pd.DataFrame({
                    'ecdf': rep_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='rep',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)
            """Compute distances to the reference for each cluster and compute ECDF vs reference"""
            # ref_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                ref_ecdf = compute_ecdf(np.mean(ref_pwmat[lab_ind, :], axis=0),
                                        thresholds=metric_thresholds,
                                        weights=ref_tr.clone_df['weights'])
                tmp_df = pd.DataFrame({
                    'ecdf': ref_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='ref',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)
    out = pd.concat(out, axis=0)
    return out
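run_one relies on module-level imports assumed to exist in the enclosing script (os, numpy as np, pandas as pd, scipy, scipy.cluster.hierarchy as sch, TCRrep, auto_pgen, compute_ecdf, and a _pwrect helper). A hypothetical call:

# Hypothetical usage; both file paths are placeholders.
ecdf_df = run_one(ref_fn='britanova_reference.csv',
                  rep_fn='mira_epitope_60_436.csv',
                  ss=100000,  # subsample the reference; ss=-1 keeps all rows
                  ncpus=2)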
Example #7
def find_metaclonotypes(
    project_path = "tutorial48",
    source_path = os.path.join(path_to_base,'tcrdist','data','covid19'),
    antigen_enriched_file = 'mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
    ncpus = 4, 
    seed = 3434):
    """
    This functions encapsulates a complete 
    workflow for finding meta-clonotypes in antigen-enriched data.
    """
    np.random.seed(seed)
    if not os.path.isdir(project_path):
        os.mkdir(project_path)
    ############################################################################
    # Step 1: Select and load an antigen-enriched (sub)repertoire.          ####
    ############################################################################
    print(f"INITIATING A TCRrep() with {antigen_enriched_file}")
    assert os.path.isfile(os.path.join(source_path, antigen_enriched_file))
        # Read file into a Pandas DataFrame <df>
    df = pd.read_csv(os.path.join(source_path, antigen_enriched_file))
        # Drop cells without any gene usage information
    df = df[( df['v_b_gene'].notna() ) & (df['j_b_gene'].notna()) ]
        # Initialize a TCRrep class, using ONLY columns that are complete and uniquely define a clone.
        # The class provides a 'count' column if none is present.
        # Counts of identical subject:VCDR3 'clones' will be aggregated into a TCRrep.clone_df.
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df = df[['subject','cell_type','v_b_gene', 'j_b_gene', 'cdr3_b_aa']], 
                organism = "human", 
                chains = ['beta'], 
                compute_distances = True)
    tr.cpus = ncpus
    ############################################################################
    # Step 1.1: Estimate Probability of Generation                          ####
    ############################################################################
    ### It will be useful later to know the pgen of each
    from tcrdist.automate import auto_pgen
    print("COMPUTING PGEN WITH OLGA (Sethna et al. 2019)")
    print("FOR ANTIGEN-ENRICHED CLONES TO BE USED FOR SUBSEQUENT ANALYSES")
    auto_pgen(tr)

    # Tip: Users of tcrdist3 should be aware that by default a <TCRrep.clone_df> 
    # DataFrame is created out of non-redundant cells in the cell_df, and 
    # pairwise distance matrices automatically computed.
    # Notice that attributes <tr.clone_df>  and  <tr.pw_beta> , <tr.pw_cdr3_b_aa>, 
    # are immediately accessible.
    # Attributes <tr.pw_pmhc_b_aa>, <tr.pw_cdr2_b_aa>, and <tr.pw_cdr1_b_aa>  
    # are also available if <TCRrep.store_all_cdr> is set to True.
    # For large datasets, i.e., >15,000 clones, this approach may consume too much 
    # memory so <TCRrep.compute_distances> is automatically set to False. 
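    # A comment-only illustration of the attributes named above (not part of
    # the original workflow):
    #   tr.clone_df.head()   # non-redundant clones aggregated from cell_df
    #   tr.pw_beta           # clone-by-clone paired TCRdist matrix
    #   tr.pw_cdr3_b_aa      # CDR3-beta-only distance matrix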
                                    
    ############################################################################
    # Step 2: Synthesize an Inverse Probability Weighted VJ Matched Background #
    ############################################################################
    # Generating an appropriate set of unenriched reference TCRs is important; for
    # each set of antigen-associated TCRs, discovered by MIRA, we created a two part
    # background. One part consists of 100,000 synthetic TCRs whose V-gene and J-gene
    # frequencies match those in the antigen-enriched repertoire, using the software
    # OLGA (Sethna et al. 2019; Marcou et al. 2018). The other part consists of
    # 100,000 umbilical cord blood TCRs sampled uniformly from 8 subjects (Britanova
    # et al. 2016). This mix balances dense sampling of sequences near the
    # biochemical neighborhoods of interest with broad sampling of TCRs from an
    # antigen-naive repertoire. Importantly, we adjust for the biased sampling by
    # using the V- and J-gene frequencies observed in the cord-blood data (see
    # Methods for details about inverse probability weighting adjustment). Using this
    # approach we are able to estimate the abundance of TCRs similar to a centroid
    # TCR in an unenriched background repertoire of ~1,000,000 TCRs, using a
    # comparatively modest background dataset of 200,000 TCRs. While this estimate
    # may underestimate the true specificity, since some of the neighborhood TCRs in
    # the unenriched background repertoire may in fact recognize the antigen of
    # interest, it is useful for prioritizing neighborhoods and selecting a radius
    # for each neighborhood that balances sensitivity and specificity.
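    # Sketch of the inverse-probability idea (notation ours, not a tcrdist3
    # API): a background TCR with genes (V, J) sampled with probability
    # q(V, J), but occurring with probability p(V, J) in an unenriched
    # repertoire, receives weight w = p(V, J) / q(V, J); weighted neighbor
    # counts then estimate neighbor abundance in the unenriched repertoire.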
    # Initialize a TCRsampler -- human, beta, umbilical cord blood from 8 people.
    print("USING tcrsampler TO CONSTRUCT A CUSTOM V-J MATCHED BACKGROUND")
    from tcrsampler.sampler import TCRsampler
    ts = TCRsampler(default_background = 'britanova_human_beta_t_cb.tsv.sampler.tsv')
    # Stratify the sample so that each subject contributes similarly to the
    # estimate of gene usage frequency.
    from tcrdist.background import get_stratified_gene_usage_frequency
    ts = get_stratified_gene_usage_frequency(ts = ts, replace = True) 
    # Synthesize an inverse probability weighted V,J gene background that matches 
    # usage in your enriched repertoire 
    df_vj_background = tr.synthesize_vj_matched_background(ts = ts, chain = 'beta')
    # Draw a stratified random sample of beta-chain cord blood TCRs from
    # Britanova et al. 2016,
    # "Dynamics of Individual T Cell Repertoires: From Cord Blood to Centenarians".
    from tcrdist.background import sample_britanova, get_gene_frequencies
    df_britanova_100K = sample_britanova(size = 100000)
    # Append frequency columns using the sampler above
    df_britanova_100K = get_gene_frequencies(ts = ts, df = df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"
    # Combine the two parts of the background into a single DataFrame
    df_bkgd = pd.concat([df_vj_background.copy(), df_britanova_100K.copy()], axis = 0).\
        reset_index(drop = True)                                              
    # Assert that the backgrounds have the expected number of rows.
    assert df_bkgd.shape[0] == 200000
    # Save the background for future use
    background_outfile = os.path.join(project_path, f"{antigen_enriched_file}.olga100K_brit100K_bkgd.csv")
    print(f'WRITING {background_outfile}')
    df_bkgd.to_csv(background_outfile, index = False)
    # Load the background to a TCRrep without computing pairwise distances 
    # (i.e., compute_distances = False)
    tr_bkgd = TCRrep(
        cell_df = df_bkgd,
        organism = "human", 
        chains = ['beta'], 
        compute_distances = False)
    # Compute rectangular distances: distances between each clone in
    # the antigen-enriched repertoire and each TCR in the background.
    # With a single CPU and < 10GB RAM, 5E2 x 2E5 = 100 million pairwise distances,
    # across CDR1, CDR2, CDR2.5, and CDR3:
    # 1min 34s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
    # %timeit -r 1 tr.compute_rect_distances(df = tr.clone_df, df2 = tr_bkgd.clone_df, store = False)
    ############################################################################
    # Step 4: Calculate Distances                                          #####
    ############################################################################
    print("COMPUTING RECTANGULAR DISTANCES")
    tr.compute_sparse_rect_distances(
        df = tr.clone_df, 
        df2 = tr_bkgd.clone_df,
        radius=50,
        chunk_size = 100)
    scipy.sparse.save_npz(os.path.join(project_path, f"{antigen_enriched_file}.rw_beta.npz"), tr.rw_beta)
        # Tip: For larger datasets you can use the sparse implementation:
        # 30.8 s ± 0 ns per loop ; tr.cpus = 6
        # %timeit -r 1 tr.compute_sparse_rect_distances(df = tr.clone_df, df2 = tr_bkgd.clone_df, radius=50, chunk_size=85)
    ############################################################################
    # Step 5: Examine Density ECDFS                                        #####
    ############################################################################
        # Investigate the density of neighbors to each TCR, based on an
        # expanding distance radius.
    from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs
    import matplotlib.pyplot as plt
        # Compute the empirical cumulative distribution function (ECDF),
        # comparing antigen-enriched TCRs against themselves.
    thresholds, antigen_enriched_ecdf = distance_ecdf(
        tr.pw_beta,
        thresholds=range(0,50,2))
        # Compute the empirical cumulative distribution function (ECDF),
        # comparing antigen-enriched TCRs against the 200K
        # inverse-probability-weighted background.
    thresholds, background_ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds=range(0,50,2),
        weights= tr_bkgd.clone_df['weights'], 
        absolute_weight = True)
        # plot_ecdf similar to tcrdist3 manuscript #
    antigen_enriched_ecdf[antigen_enriched_ecdf == antigen_enriched_ecdf.min()] = 1E-10
    f1 = _plot_manuscript_ecdfs(
        thresholds, 
        antigen_enriched_ecdf, 
        ylab= 'Proportion of Antigen Enriched TCRs', 
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(), 
        min_freq=1E-10)
    f1.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_AER_plot.png'))
    f2 = _plot_manuscript_ecdfs(
        thresholds,
        background_ecdf,
        ylab= 'Proportion of Reference TCRs',
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
        min_freq=1E-10)
    f2.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_BUR_plot.png'))
    ############################################################################
    # Step 6: Find optimal radii  (theta = 1E5)                            #####
    ############################################################################
    # To ascertain which meta-clonotypes are likely to be most specific,
    # take advantage of an existing function <bkgd_cntl_nn2>.
    #  d888   .d8888b.  8888888888     888888888  
    # d8888  d88P  Y88b 888            888        
    #   888  888    888 888            888        
    #   888  888    888 8888888        8888888b.  
    #   888  888    888 888                 "Y88b 
    #   888  888    888 888      888888       888 
    #   888  Y88b  d88P 888            Y88b  d88P 
    # 8888888 "Y8888P"  8888888888      "Y8888P"                                         
   
    level_tag = '1E5'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df  = bkgd_cntl_nn2(
        tr               = tr,
        tr_background    = tr_bkgd,
        weights          = tr_bkgd.clone_df.weights,
        ctrl_bkgd        = 10**-5, 
        col              = 'cdr3_b_aa',
        add_cols         = ['v_b_gene', 'j_b_gene'],
        ncpus            = 4,
        include_seq_info = True,
        thresholds       = [x for x in range(0,50,2)],
        generate_regex   = True,
        test_regex       = True,
        forced_max_radius = 36)

    ############################################################################
    # Step 6.2: (theta = 1E5) ALL meta-clonotypes .tsv file                   ##
    ############################################################################
    # save center to project_path for future use
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    
    # Many meta-clonotypes contain redundant information.
    # We can winnow these down to a less-redundant list by
    # ranking clonotypes from most to least specific.
        # <min_nsubject> is minimum publicity of the meta-clonotype,  
        # <min_nr> is minimum non-redundancy
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list = centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x : len(x))
    # We determine how many <nsubjects> are in the set of neighbors 
    centers_df['nsubject']  = centers_df['neighbors'].\
            apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df = centers_df, 
        rank_column = 'chi2joint', 
        min_nsubject = 2, 
        min_nr = 1)
    ############################################################################
    # Step 6.3:  (theta = 1E5) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    ############################################################################
    # Step 6.4: (theta = 1E5) Output Meta-Clonotypes HTML Summary            ###
    ############################################################################
    # Here we can make an SVG logo for each NR meta-clonotype
    if ranked_centers_df.shape[0] > 0:
        from progress.bar import IncrementalBar
        from tcrdist.public import make_motif_logo
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max = ranked_centers_df.shape[0])
        for i,r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene   = r[v_gene_name]
            svg, svg_raw = make_motif_logo( tcrsampler = ts, 
                                            pwmat = tr.pw_beta,
                                            clone_df = tr.clone_df,
                                            centroid = centroid ,
                                            v_gene = v_gene ,
                                            radius = r['radius'],
                                            pwmat_str = 'pw_beta',
                                            cdr3_name = 'cdr3_b_aa',
                                            v_name = 'v_b_gene',
                                            gene_names = ['v_b_gene','j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next();bar.finish()
        ranked_centers_df['svg']      = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')
        labels =['cdr3_b_aa','v_b_gene', 'j_b_gene', 'pgen',
                'radius', 'regex','nsubject','K_neighbors', 
                'bkgd_hits_weighted','chi2dist','chi2re','chi2joint']
        
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # 888    888 88888888888 888b     d888 888      
        # 888    888     888     8888b   d8888 888      
        # 888    888     888     88888b.d88888 888      
        # 8888888888     888     888Y88888P888 888      
        # 888    888     888     888 Y888P 888 888      
        # 888    888     888     888  Y8P  888 888      
        # 888    888     888     888   "   888 888      
        # 888    888     888     888       888 88888888
        with open(output_html_name, 'w') as output_handle:
            for i,r in ranked_centers_df.iterrows():
                svg, svg_raw = r['svg'],r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")
    # To ascertain which meta-clonotypes are likely to be most specific,
    # take advantage of an existing function <bkgd_cntl_nn2>.
    #  d888   .d8888b.  8888888888       .d8888b.  
    # d8888  d88P  Y88b 888             d88P  Y88b 
    #   888  888    888 888             888        
    #   888  888    888 8888888         888d888b.  
    #   888  888    888 888             888P "Y88b 
    #   888  888    888 888      888888 888    888 
    #   888  Y88b  d88P 888             Y88b  d88P 
    # 8888888 "Y8888P"  8888888888       "Y8888P" 
    ############################################################################
    # Step 6.5: Find optimal radii  (theta = 1E6)                            ###
    ############################################################################
    level_tag = '1E6'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df  = bkgd_cntl_nn2(
        tr               = tr,
        tr_background    = tr_bkgd,
        weights          = tr_bkgd.clone_df.weights,
        ctrl_bkgd        = 10**-6, 
        col              = 'cdr3_b_aa',
        add_cols         = ['v_b_gene', 'j_b_gene'],
        ncpus            = 4,
        include_seq_info = True,
        thresholds       = [x for x in range(0,50,2)],
        generate_regex   = True,
        test_regex       = True,
        forced_max_radius = 36)
    ############################################################################
    # Step 6.6: (theta = 1E6) ALL meta-clonotypes .tsv file                   ##
    ############################################################################
    # save center to project_path for future use
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    
    # Many meta-clonotypes contain redundant information.
    # We can winnow these down to a less-redundant list by
    # ranking clonotypes from most to least specific.
        # <min_nsubject> is minimum publicity of the meta-clonotype,  
        # <min_nr> is minimum non-redundancy
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list = centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x : len(x))
    # We determine how many <nsubjects> are in the set of neighbors 
    centers_df['nsubject']  = centers_df['neighbors'].\
            apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df = centers_df, 
        rank_column = 'chi2joint', 
        min_nsubject = 2, 
        min_nr = 1)
    ############################################################################
    # Step 6.7:  (theta = 1E6) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    ############################################################################
    # Step 6.8: (theta = 1E6) Output Meta-Clonotypes HTML Summary            ###
    ############################################################################
    # Here we can make an SVG logo for each meta-clonotype
    from progress.bar import IncrementalBar
    from tcrdist.public import make_motif_logo
    if ranked_centers_df.shape[0] > 0:
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max = ranked_centers_df.shape[0])
        for i,r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene   = r[v_gene_name]
            svg, svg_raw = make_motif_logo( tcrsampler = ts, 
                                            pwmat = tr.pw_beta,
                                            clone_df = tr.clone_df,
                                            centroid = centroid ,
                                            v_gene = v_gene ,
                                            radius = r['radius'],
                                            pwmat_str = 'pw_beta',
                                            cdr3_name = 'cdr3_b_aa',
                                            v_name = 'v_b_gene',
                                            gene_names = ['v_b_gene','j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next();bar.finish()
        ranked_centers_df['svg']      = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')
        labels =['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'radius', 'regex','nsubject','K_neighbors', 'bkgd_hits_weighted','chi2dist','chi2re','chi2joint']
        
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # 888    888 88888888888 888b     d888 888      
        # 888    888     888     8888b   d8888 888      
        # 888    888     888     88888b.d88888 888      
        # 8888888888     888     888Y88888P888 888      
        # 888    888     888     888 Y888P 888 888      
        # 888    888     888     888  Y8P  888 888      
        # 888    888     888     888   "   888 888      
        # 888    888     888     888       888 88888888     
        with open(output_html_name, 'w') as output_handle:
            for i,r in ranked_centers_df.iterrows():
                svg, svg_raw = r['svg'],r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")
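A hypothetical driver for the workflow above; path_to_base must already be defined at module level (e.g., path_to_base = os.getcwd()), along with imports of os, numpy as np, pandas as pd, and scipy.sparse.

if __name__ == "__main__":
    # Hypothetical usage; arguments mirror the defaults above.
    find_metaclonotypes(project_path="tutorial48",
                        antigen_enriched_file='mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
                        ncpus=4,
                        seed=3434)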