Beispiel #1
0
def sim_all_cdr3_gen(n=100,
                     recomb_type="VDJ",
                     chain_folder="human_T_beta",
                     cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa']):
    """
    Simulate CDR3s for every pairing of '*01' V and J alleles of an OLGA model.

    Parameters
    ----------
    n : int
        Number of CDR3s requested from ``gen_cdr3s`` for each V/J pairing.
    recomb_type : str
        Passed through to ``OlgaModel``.
    chain_folder : str
        Passed through to ``OlgaModel``.
    cols : list
        Output column names, ordered [V gene, J gene, CDR3].

    Returns
    -------
    results : list of pd.DataFrame
        One frame per V/J pairing, with columns cols[2], cols[0], cols[1].
    find_nones : list
        [V, J] pairs for which the first generated CDR3 was None.
    """
    model = OlgaModel(recomb_type=recomb_type, chain_folder=chain_folder)

    def first_alleles(names):
        # Only the '*01' allele of each gene is simulated
        return [name for name in names if name.endswith('*01')]

    v_alleles = first_alleles(model.pgen_model.V_allele_names)
    j_alleles = first_alleles(model.pgen_model.J_allele_names)

    frames = []
    missing_pairs = []
    # V varies slowest and J fastest
    for v_gene, j_gene in itertools.product(v_alleles, j_alleles):
        seqs = model.gen_cdr3s(V=v_gene, J=j_gene, n=n)
        frame = pd.DataFrame({cols[2]: seqs, cols[0]: v_gene, cols[1]: j_gene})
        frames.append(frame)
        if seqs[0] is None:
            missing_pairs.append([v_gene, j_gene])

    return frames, missing_pairs
def test_olga_sample():
    """
    Check seeded sampling from the human beta OLGA model for one V/J pair.

    ``gen_cdr3`` is compared against a full output tuple and ``gen_cdr3s``
    against a list of amino-acid CDR3s; the numpy seed is reset before each.
    """
    np.random.seed(310)
    from tcrdist.pgen import OlgaModel
    model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    v_gene = 'TRBV20-1*01'
    j_gene = 'TRBJ1-2*01'

    # NOTE: the seed is fixed, so .gen_cdr3() should return this exact tuple
    single = model.gen_cdr3(V=v_gene, J=j_gene)
    recombination_event = {
        'V': 29,
        'D': 2,
        'J': 1,
        'delV': 5,
        'delJ': 15,
        'delDl': 10,
        'delDr': 7,
        'insVD': 7,
        'insDJ': 6,
    }
    assert single == ('TGCAGTGCTAGAGTAAGGGAAGCGGGAAGGACCTACACCTTC',
                      'CSARVREAGRTYTF', 29, 1, recombination_event)

    # NOTE: .gen_cdr3s() returns only the amino-acid CDR3s, as a list
    np.random.seed(310)
    several = model.gen_cdr3s(V=v_gene, J=j_gene, n=4)
    assert several == [
        'CSARVREAGRTYTF',
        'CSAVPPGLPNYGYTF',
        'CSARGPSQGYVRGLYGYTF',
        'CSAQGLAGYGYTF',
    ]
def motif_creation_human_betas():
    """
    Write palmotif logos for beta-chain CDR3 neighborhoods of 'dash_human.csv'.

    For each row of ``tr.centroids_df`` with at least 5 neighbors, three logos
    are written to 'test_3.svg':

    1. the neighborhood's CDR3s against an OLGA-simulated reference set
       generated with the neighborhood's own V/J gene usage,
    2. the neighborhood's CDR3s with no reference set,
    3. the reference set (plus the centroid) on its own.

    Reads 'dash_human.csv' from the working directory; returns None.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    get_basic_centroids(tr, max_dist=75)

    # Simulated reference sequences generated per observed V/J occurrence
    depth = 3

    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # NOTE(review): stops at the first small neighborhood, which
            # presumably relies on centroids_df being ordered by
            # neighborhood size -- confirm against get_basic_centroids.
            if len(r['neighbors']) < 5:
                break

            neighbors = tr.clone_df.iloc[r['neighbors'], ]
            seqs = neighbors['cdr3_b_aa'].to_list()
            # One [v_gene, j_gene, count] entry per V/J pair in the neighborhood
            gene_usages = neighbors[[
                'v_b_gene', 'j_b_gene'
            ]].value_counts().reset_index().to_dict('split')['data']

            # Fix: iterate the gene usage computed above. The previous code
            # looped over an undefined name (combos_alpha).
            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), count * depth)
                for v, j, count in gene_usages
            ])
            # Drop the None entries the generator can return
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # The background logo includes the centroid itself
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            for m in (matrix, matrix_raw, matrix_bkgd):
                svg = palmotif.svg_logo(m, 'test.svg', return_str=True)
                oh.write(f"{svg}<div></div>\n")

            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')

        oh.write('</body>')
def test_olga_sample_beta():
    """Seeded ``gen_cdr3s`` on the human beta model returns 5 known CDR3s."""
    np.random.seed(1)
    from tcrdist.pgen import OlgaModel
    model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    expected = [
        'CSARQGLANYGYTF',
        'CSARPSRGQDGYTF',
        'CSARDQRTGQDGYTF',
        'CSARDVSSSGGYYGYTF',
        'CSAPEPLTSGRACNGYTF',
    ]
    sampled = model.gen_cdr3s(V='TRBV20-1*01', J='TRBJ1-2*01', n=5)
    assert isinstance(sampled, list)
    assert len(sampled) == 5
    assert sampled == expected
def test_olga_sample_alpha():
    """Seeded ``gen_cdr3s`` on the human alpha model returns 5 known CDR3s."""
    np.random.seed(1)
    from tcrdist.pgen import OlgaModel
    model = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    expected = [
        'CALSEAPGNTGKLIF',
        'CAPPSGNTGKLIF',
        'CALAGNTGKLIF',
        'CAQDNTGKLIF',
        'CALRNTGKLIF',
    ]
    sampled = model.gen_cdr3s(V='TRAV19*01', J='TRAJ37*01', n=5)
    assert isinstance(sampled, list)
    assert len(sampled) == 5
    assert sampled == expected
Beispiel #6
0
def _auto_pgen(tcrrep=None, organism='human', chain='beta', ncpus=2):
    """
    Add OLGA pgen estimates for one chain's CDR3s to ``tcrrep.clone_df``.

    Parameters
    ----------
    tcrrep : tcrdist.repertoire.TCRrep
        TCRrep instance with a clone_df
    organism : str
        'human' or 'mouse'
    chain : str
        'beta' or 'alpha'
    ncpus : int
        Number of processes handed to ``parmap.map`` (``pm_processes``).

    Returns
    -------
    tcrrep : tcrdist.repertoire.TCRrep
        The same instance; its clone_df gains a 'pgen_cdr3_a_aa' or
        'pgen_cdr3_b_aa' column, depending on ``chain``.

    Raises
    ------
    AssertionError
        If organism/chain are not among the accepted values, or tcrrep is
        not a TCRrep carrying a DataFrame clone_df.
    """
    import tcrdist
    import parmap
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    assert organism in ['human', 'mouse']
    assert chain in ['beta', 'alpha']
    assert isinstance(tcrrep, tcrdist.repertoire.TCRrep)
    assert isinstance(tcrrep.clone_df, pd.DataFrame)

    cdr3_col = {'alpha': 'cdr3_a_aa', 'beta': 'cdr3_b_aa'}[chain]
    cdr3s = tcrrep.clone_df[cdr3_col]

    # Only the constructor arguments are tabulated, so a single model is
    # loaded instead of one per organism/chain combination.
    olga_model_kwargs = {
        ('human', 'beta'): {
            'chain_folder': "human_T_beta",
            'recomb_type': "VDJ"
        },
        ('human', 'alpha'): {
            'chain_folder': "human_T_alpha",
            'recomb_type': "VJ"
        },
        ('mouse', 'beta'): {
            'chain_folder': "mouse_T_beta",
            'recomb_type': "VDJ"
        },
        # Previously missing: ('mouse', 'alpha') passed the asserts above and
        # then failed with a KeyError.
        ('mouse', 'alpha'): {
            'chain_folder': "mouse_T_alpha",
            'recomb_type': "VJ"
        },
    }
    olga_model = OlgaModel(**olga_model_kwargs[(organism, chain)])

    pgens = parmap.map(olga_model.compute_aa_cdr3_pgen,
                       cdr3s,
                       pm_pbar=True,
                       pm_processes=ncpus)
    tcrrep.clone_df[f"pgen_{cdr3_col}"] = pgens

    return tcrrep
Beispiel #7
0
def test_pgen_with_parmap():
    """
    Minimal example of computing OLGA pgens for several CDR3s
    through parmap, which can spread the work over multiple cpus.
    """
    import parmap
    from tcrdist.pgen import OlgaModel
    model = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    cdr3s = [
        'CASSYRVGTDTQYF',
        'CATSTNRGGTPADTQYF',
        'CASQGDSFNSPLHF',
        'CASSPWTGSMALHF',
    ]
    parmap.map(model.compute_aa_cdr3_pgen, cdr3s)
def test_olga_sample_alphas_for_a_human_repertoire():
    """
    Generate one simulated CDR3 per clone of 'dash_human.csv', using each
    clone's own V/J genes (converted with ``allele_01``), first for the beta
    chain and then for the alpha chain.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel

    alpha_model = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    beta_model = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    tr = TCRrep(cell_df=pd.read_csv("dash_human.csv"),
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    def one_cdr3_per_clone(model, v_col, j_col):
        # A single-sequence gen_cdr3s call for every clone, in row order
        genes = tr.clone_df[[v_col, j_col]]
        return [
            model.gen_cdr3s(V=allele_01(v), J=allele_01(j), n=1)
            for v, j in zip(genes[v_col], genes[j_col])
        ]

    # Beta is sampled before alpha
    rb = one_cdr3_per_clone(beta_model, 'v_b_gene', 'j_b_gene')
    ra = one_cdr3_per_clone(alpha_model, 'v_a_gene', 'j_a_gene')
Beispiel #9
0
def test_pgen_1():
    """
    Show how pgen estimates are attached to human alpha/beta CDR3s,
    using 5 rows sampled from 'dash_human.csv'.
    """
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.setup_tests import download_and_extract_zip_file

    five_cells = pd.read_csv("dash_human.csv").sample(5, random_state=3)
    tr = TCRrep(cell_df=five_cells,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False)

    # Both models are loaded before any pgen is computed
    beta_model = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    alpha_model = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

    beta_pgens = beta_model.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df['cdr3_b_aa'])
    tr.clone_df['pgen_cdr3_b_aa'] = beta_pgens

    alpha_pgens = alpha_model.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df['cdr3_a_aa'])
    tr.clone_df['pgen_cdr3_a_aa'] = alpha_pgens

    # Select the CDR3 columns alongside their new pgen columns
    shown = ['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa', 'pgen_cdr3_a_aa']
    tr.clone_df[shown]
    """
Beispiel #10
0
def longtest_pgen_with_parmap():
    """
    Compare pgen computation in series against parmap (more than one cpu).

    Timings recorded for 1000 CDR3s:

    Finished 'olga_in_series' in 26.3842 secs with 1 core
    Finished 'olga_in_parmap' in 6.2384 secs with 6 cores
    """
    import numpy as np
    import pandas as pd
    import parmap
    from tcrdist.pgen import OlgaModel
    from tcrdist.speed import timer
    from tcrdist.adpt_funcs import _valid_cdr3
    from tcrdist.setup_tests import download_and_extract_zip_file

    download_and_extract_zip_file('cdr3_beta_500K.zip',
                                  source="dropbox",
                                  dest=".")

    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    pgen_of = olga_beta.compute_aa_cdr3_pgen

    # First n entries of the first column, keeping only valid CDR3s
    n = 1000
    first_column = pd.read_csv('cdr3_beta_500K.csv').iloc[:, 0]
    inputlist = [s for s in first_column.to_list()[0:n] if _valid_cdr3(s)]

    @timer
    def olga_in_series(f=pgen_of, cdr3s=inputlist):
        return list(map(f, cdr3s))

    @timer
    def olga_in_parmap(f=pgen_of, cdr3s=inputlist, **kwargs):
        return parmap.map(f, cdr3s, pm_pbar=True, **kwargs)

    r1 = olga_in_series(f=pgen_of, cdr3s=inputlist)
    r2 = olga_in_parmap(f=pgen_of, cdr3s=inputlist)

    assert np.all(r1 == r2)
Beispiel #11
0
def sim_all_cdr3_gen():
    """
    Try CDR3 generation for every V/J allele pairing of the human beta and
    human alpha OLGA models, printing each (V, J, result) as it goes.

    Beta pairings use ``gen_cdr3s`` with n=3; alpha pairings use ``gen_cdr3``.
    Only the alpha results are tabulated at the end. Returns None.
    """
    import itertools
    from tcrdist.pgen import OlgaModel

    def all_vj_pairs(model):
        # V varies slowest, J fastest
        return itertools.product(model.pgen_model.V_allele_names,
                                 model.pgen_model.J_allele_names)

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")
    find_nones = []
    results = []
    for v, j in all_vj_pairs(omb):
        e = omb.gen_cdr3s(V=v, J=j, n=3)
        results.append(e)
        if e is None:
            find_nones.append([v, j, e])
        print((v, j, e))

    oma = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    # The beta bookkeeping is discarded here; only alpha is kept
    find_nones = []
    results = []
    for v, j in all_vj_pairs(oma):
        e = oma.gen_cdr3(V=v, J=j)
        results.append([v, j, e])
        if e is None:
            find_nones.append([v, j, e])
        print((v, j, e))

    # Things we can't find:
    df = pd.DataFrame(results, columns=['v', 'j', 'r'])
    not_found = df[df['r'].isna()]
    not_found[['v']].value_counts()
    not_found[['j']].value_counts()
Beispiel #12
0
def make_vj_matched_background(gene_usage_counter,
                               ts=None,
                               size=100000,
                               recomb_type="VDJ",
                               chain_folder="human_T_beta",
                               cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa']):
    """
    Simulate a background of CDR3s whose V/J usage follows a gene usage counter.

    Parameters
    ----------
    gene_usage_counter : collections.Counter
        Maps (V gene, J gene) keys to counts.
    ts : tcrsampler.sampler.TCRsampler or None
        Sampler handed to ``get_gene_frequencies``. When None, one is built
        from 'britanova_human_beta_t_cb.tsv.sampler.tsv'.
    size : int
        Number of rows requested in the returned background.
    recomb_type : str
        Default "VDJ",
    chain_folder : str
        Default is for human beta "human_T_beta",
    cols : list
        Default is for beta ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']

    Returns
    -------
    df : pd.DataFrame
        At most ``size`` rows, in random order. Fewer rows are returned only
        when not enough non-missing CDR3s could be generated.

    Raises
    ------
    ValueError
        If the counter holds no counts, or no V/J pair could be simulated.

    Example
    -------
    >>> ix =[['TRBV19*01', 'TRBJ2-5*01', 3],['TRBV24-1*01', 'TRBJ2-4*01', 3]]
    >>> df_rare= pd.concat([pd.DataFrame({'cdr3_b_aa' : flatten(ts.sample([[x[0], x[1], x[2]]])) , 'v_b_gene':x[0], 'j_b_gene':x[1]}) for x in ix]).reset_index(drop = True)
    >>> gene_usage_counter = make_gene_usage_counter(df_rare)
    >>> make_vj_matched_background(gene_usage_counter, size = 10)
          v_b_gene    j_b_gene            cdr3_b_aa        pV        pJ       pVJ
    0  TRBV24-1*01  TRBJ2-4*01      CATPVAGVAKNIQYF  0.011942  0.042163  0.000420
    1  TRBV24-1*01  TRBJ2-4*01       CATSPRGSLSIQYF  0.011942  0.042163  0.000420
    2  TRBV24-1*01  TRBJ2-4*01    CATSDLGGGGIHNIQYF  0.011942  0.042163  0.000420
    3    TRBV19*01  TRBJ2-5*01    CASSISDRGKFSETQYF  0.006788  0.089505  0.000394
    4  TRBV24-1*01  TRBJ2-4*01    CATSDLPARTRENIQYF  0.011942  0.042163  0.000420
    5  TRBV24-1*01  TRBJ2-4*01      CATSDPQGAKNIQYF  0.011942  0.042163  0.000420
    6    TRBV19*01  TRBJ2-5*01  CASSISCGRNLGGQETQYF  0.006788  0.089505  0.000394
    7    TRBV19*01  TRBJ2-5*01    CASSCKPSGGYQETQYF  0.006788  0.089505  0.000394
    8    TRBV19*01  TRBJ2-5*01     CASSSGTSHKLETQYF  0.006788  0.089505  0.000394
    9    TRBV19*01  TRBJ2-5*01          CASSDRETQYF  0.006788  0.089505  0.000394
    """

    olga_model_beta = OlgaModel(recomb_type=recomb_type,
                                chain_folder=chain_folder)
    total_seqs = np.sum(list(gene_usage_counter.values()))
    if total_seqs == 0:
        raise ValueError("gene_usage_counter holds no counts")

    # Whole-number multiplier applied to every V/J count. Rounding up
    # over-generates, and the surplus is trimmed by the final sample.
    depth_per_count = math.ceil(size / total_seqs)

    dfs = list()
    for k, v in gene_usage_counter.items():
        try:
            cdr3s = olga_model_beta.gen_cdr3s(V=k[0],
                                              J=k[1],
                                              n=v * depth_per_count)
        except AttributeError:
            # Deliberate best-effort: a V/J pair that raises here is skipped.
            # NOTE(review): presumably raised for genes the OLGA model does
            # not recognize -- confirm against OlgaModel.gen_cdr3s.
            continue
        df = pd.DataFrame({cols[2]: cdr3s})
        df[cols[0]] = k[0]
        df[cols[1]] = k[1]
        dfs.append(df)

    if not dfs:
        raise ValueError("no V/J pair in gene_usage_counter could be simulated")

    df = pd.concat(dfs).reset_index(drop=True)
    # Keep only rows where a CDR3 was actually generated
    df = df[df[cols[2]].notna()][cols]

    if ts is None:
        from tcrsampler.sampler import TCRsampler
        ts = TCRsampler(
            default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
        ts = get_stratified_gene_usage_frequency(ts, replace=True)
    df = get_gene_frequencies(ts=ts, df=df, cols=cols)

    # Fix: honor `size`. Without this step the over-generated frame was
    # returned whole and grouped by V/J pair. Capping n at the available
    # rows avoids an error when too few CDR3s were generated.
    df = df.sample(n=min(size, df.shape[0])).reset_index(drop=True)
    return (df)
    'cdr3': 'cdr3_b_aa'
}.get(c, c) for c in em_ss.columns]
# Beta-chain-only repertoire built from em_ss; distances are not computed.
em_tr = TCRrep(cell_df=em_ss,
               organism='human',
               chains=['beta'],
               compute_distances=False)

# Path to dash_human.csv under the _fg_data directory.
dash_fn = opj(_fg_data, 'tcrdist', 'datasets', 'dash_human.csv')

# Paired alpha/beta repertoire from dash_human.csv, again without distances.
df = pd.read_csv(dash_fn)
tr = TCRrep(cell_df=df,
            organism='human',
            chains=['alpha', 'beta'],
            compute_distances=False)
"""Compute pgen of each epitope-specific sequence"""
olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

# Store one pgen per clone for each chain as new clone_df columns.
tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_b_aa)
tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_a_aa)
"""Force pgen > 0: there were 7 CDR3 alphas with pgen = 0"""
# Keep only clones whose alpha AND beta pgen are strictly positive; the
# data-driven LogNorm bounds below are taken from the filtered columns.
tr.clone_df = tr.clone_df.loc[(tr.clone_df['pgen_cdr3_a_aa'] > 0)
                              & (tr.clone_df['pgen_cdr3_b_aa'] > 0)]

# Log color scales: one with fixed bounds, one spanning the observed
# min/max of the alpha pgens.
norm_pgen = mpl.colors.LogNorm(vmin=1e-10, vmax=1e-6)
norm_a = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_a_aa'].min(),
                            vmax=tr.clone_df['pgen_cdr3_a_aa'].max())

norm_b = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_b_aa'].min(),
Beispiel #14
0
def test_pgen_mouse():
    """
    Walk through the pgen-related features on the mouse data in 'dash.csv'.

    In order, the function: builds a paired alpha/beta TCRrep with distances,
    computes single and multiple pgens with the mouse OLGA models, stores
    per-clone pgens computed through parmap, generates CDR3s for fixed V/J
    pairs, synthesizes V/J-matched backgrounds for both chains, and finally
    derives fixed-radius neighbor columns on clone_df.

    The bare string literals between the steps are narrative text and
    example output; they are evaluated and discarded.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.pgen import OlgaModel
    import numpy as np

    df = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    # Load OLGA model as a python object
    olga_beta = OlgaModel(chain_folder="mouse_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder="mouse_T_alpha", recomb_type="VJ")

    # An example computing a single Pgen
    olga_beta.compute_aa_cdr3_pgen(tr.clone_df['cdr3_b_aa'][0])
    olga_alpha.compute_aa_cdr3_pgen(tr.clone_df['cdr3_a_aa'][0])

    # An example computing multiple Pgens
    olga_beta.compute_aa_cdr3_pgens(tr.clone_df['cdr3_b_aa'][0:5])
    olga_alpha.compute_aa_cdr3_pgens(tr.clone_df['cdr3_a_aa'][0:5])

    # An example computing 1920 Pgens more quickly with multiple cpus
    import parmap
    tr.clone_df['pgen_cdr3_b_aa'] = \
        parmap.map(
            olga_beta.compute_aa_cdr3_pgen,
            tr.clone_df['cdr3_b_aa'],
            pm_pbar=True,
            pm_processes = 2)

    tr.clone_df['pgen_cdr3_a_aa'] = \
        parmap.map(
            olga_alpha.compute_aa_cdr3_pgen,
            tr.clone_df['cdr3_a_aa'],
            pm_pbar=True,
            pm_processes = 2)
    """
    We can do something else useful. We've tweaked the original 
    generative code in OLGA, so that you can generate CDRs,
    given a specific TRV and TRJ. 

    Note that unfortunately not all genes are recognized in default OLGA models, 
    but many are. This gives you an idea of what you can do. Here are 10
    CDR3s generated at random given a particular V,J usage combination
    """
    # The numpy seed is reset before each sampling step below.
    np.random.seed(1)
    olga_beta.gen_cdr3s(V='TRBV14*01', J='TRBJ2-5*01', n=10)
    olga_alpha.gen_cdr3s(V='TRAV4-3*02', J='TRAJ31*01', n=10)
    """
    Using this approach, we can synthesize an 100K background, 
    with similar gene usage frequency to our actual repertoire. 
    Note, however, that given data availability, 
    this is currently likely the most reliable for human beta chain.

    After OLGA's publication, a default mouse alpha model (mouse_T_alpha) 
    was added to the OLGA GitHub repository. We've included that here
    but it should be used with caution as it is missing
    a number of commonly seen V genes.
    """
    np.random.seed(1)
    tr.synthesize_vj_matched_background(chain='beta')
    """
              v_b_gene    j_b_gene            cdr3_b_aa        pV        pJ       pVJ   weights      source
    0        TRBV14*01  TRBJ2-3*01        CASSLASAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    1      TRBV13-2*01  TRBJ2-3*01    CASGDAPDRTGAETLYF  0.118785  0.092039  0.010331  0.271309  vj_matched
    2      TRBV13-3*01  TRBJ1-1*01  CASSDGFSRTGGVNTEVFF  0.074051  0.106146  0.006923  1.009124  vj_matched
    3      TRBV13-3*01  TRBJ2-1*01       CASSDVQGGAEQFF  0.074051  0.117684  0.008915  1.021244  vj_matched
    4      TRBV13-3*01  TRBJ2-7*01     CASSSGTGGYIYEQYF  0.074051  0.204898  0.015366  1.670224  vj_matched
    ...            ...         ...                  ...       ...       ...       ...       ...         ...
    99995    TRBV14*01  TRBJ2-3*01  CASSPTGGAPYASAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    99996    TRBV17*01  TRBJ2-5*01       CASSRDPTQDTQYF  0.028110  0.124712  0.004930  0.650360  vj_matched
    99997    TRBV14*01  TRBJ2-3*01       CASSSTGGAETLYF  0.033721  0.092039  0.002989  0.065742  vj_matched
    99998  TRBV13-1*01  TRBJ2-1*01      CASSDWGKDYAEQFF  0.106042  0.117684  0.013373  2.622194  vj_matched
    99999     TRBV4*01  TRBJ2-3*01      CASSYDRGSAETLYF  0.040749  0.092039  0.002989  0.068343  vj_matched
    """
    np.random.seed(1)
    tr.synthesize_vj_matched_background(chain='alpha')
    """
               v_a_gene   j_a_gene        cdr3_a_aa        pV        pJ       pVJ   weights      source
    0      TRAV12N-3*01  TRAJ34*02     CAIASNTNKVVF  0.000438  0.000088  0.000088  0.006059  vj_matched
    1       TRAV3D-3*02  TRAJ33*01  CAVSAGADSNYQLIW  0.000088  0.000088  0.000088  0.005122  vj_matched
    2        TRAV3-3*01  TRAJ27*01     CAVSTNTGKLTF  0.014029  0.042964  0.000877  0.277471  vj_matched
    3        TRAV3-3*01  TRAJ26*01    CAVSHNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    4        TRAV3-3*01  TRAJ26*01   CAVSARNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    ...             ...        ...              ...       ...       ...       ...       ...         ...
    99995   TRAV3D-3*02  TRAJ21*01    CAVSVSNYNVLYF  0.000088  0.039982  0.000088  0.003758  vj_matched
    99996    TRAV3-3*01  TRAJ43*01    CAVSENNNNAPRF  0.014029  0.022271  0.000526  0.071093  vj_matched
    99997   TRAV3D-3*02  TRAJ26*01    CAVSGNYAQGLTF  0.000088  0.040947  0.000088  0.000296  vj_matched
    99998    TRAV3-3*01  TRAJ26*01   CAVKGNNYAQGLTF  0.014029  0.040947  0.001052  0.009155  vj_matched
    99999   TRAV9N-2*01  TRAJ15*01      CTYQGGRALIF  0.000088  0.043840  0.000088  0.020438  vj_matched
    """
    # NOTE: the literal below opens with four quote characters, so its text
    # begins with a stray '"'.
    """"
    tcrdist3's integration of Pgen estimates makes it very easy to look for
    PUBLIC clusters of TCRs (i.e. high number of neighbors) with unlikely V(D)J 
    recombinations.
    """
    # Neighbor columns from the beta pairwise matrix alone, radius 18.
    from tcrdist.public import _neighbors_fixed_radius
    from tcrdist.public import _K_neighbors_fixed_radius
    tr.clone_df['neighbors'] = _neighbors_fixed_radius(pwmat=tr.pw_beta,
                                                       radius=18)
    tr.clone_df['K_neighbors'] = _K_neighbors_fixed_radius(pwmat=tr.pw_beta,
                                                           radius=18)
    tr.clone_df['pgen_cdr3_b_aa_nlog10'] = tr.clone_df['pgen_cdr3_b_aa'].apply(
        lambda x: -1 * np.log10(x))
    # Number of distinct 'subject' values among each clone's neighbors.
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(
        lambda x: len(tr.clone_df['subject'][x].unique()))
    # nsubject > 1 implies quasi-publicity
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda x: x > 1)

    # Note one can find neighbors based on paired-chain distances.
    # The assignments below overwrite the columns computed above, this time
    # from the summed beta + alpha matrices with radius 50.
    from tcrdist.public import _neighbors_fixed_radius
    from tcrdist.public import _K_neighbors_fixed_radius
    tr.clone_df['neighbors'] = _neighbors_fixed_radius(pwmat=tr.pw_beta +
                                                       tr.pw_alpha,
                                                       radius=50)
    tr.clone_df['K_neighbors'] = _K_neighbors_fixed_radius(pwmat=tr.pw_beta +
                                                           tr.pw_alpha,
                                                           radius=50)
    tr.clone_df['pgen_cdr3_b_aa_nlog10'] = tr.clone_df['pgen_cdr3_b_aa'].apply(
        lambda x: -1 * np.log10(x))
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(
        lambda x: len(tr.clone_df['subject'][x].unique()))
    # nsubject > 1 implies quasi-publicity)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda x: x > 1)
    """
Beispiel #15
0
def _tcrsampler_svgs(tcrrep,
                     default_background=None,
                     default_background_if_missing=None,
                     cdr3_name='cdr3_b_aa',
                     pwmat_str='pw_cdr3_b_aa',
                     chain='beta',
                     gene_names=['v_b_gene', 'j_b_gene'],
                     combine_olga=False,
                     verbose=True):
    """
    Build CDR3 motif logos (SVG) for every cluster of a TCRrep.

    Given a TCRrep instance, this function samples a background repertoire
    with a TCRsampler, matched to each cluster's V/J gene usage, and makes
    svg-logos using palmotif.

    The tcrrep is modified in place: svg and stats columns are added to its
    .hcluster_df_detailed DataFrame (and a 'prune' column of zeros is added
    to .hcluster_df when missing).
    TODO: could just output a dataframe which would then be concatenated.

    The defaults only work for the beta chain; pass cdr3_name, pwmat_str,
    chain and gene_names together for another chain.

    Parameters
    ----------
    tcrrep : TCRrep
        Must provide organism, clone_df, hcluster_df (with 'neighbors_i'),
        hcluster_df_detailed and the pairwise matrix named by pwmat_str.
    default_background : str or None
        Forwarded to the sampler returned by ``_default_sampler``.
    default_background_if_missing : str or None
        Forwarded to the sampler returned by ``_default_sampler``.
    cdr3_name : str
        clone_df column holding the CDR3 sequences.
    pwmat_str : str
        Name of the tcrrep attribute holding the pairwise distance matrix.
    chain : str
        Chain label; also used as the suffix of the added columns.
    gene_names : list
        clone_df columns with the V and J genes.
    combine_olga : bool
        If True, sequences sampled from the OLGA-based sampler are appended
        to each cluster's background. Forced to False for mouse alpha.
    verbose : bool
        If True, print progress messages.

    Returns
    -------
    True

    Notes
    -----
    TCRSampler.build_background() accepts kwargs; these are fixed here, as
    most users won't know what they do and won't need to change them.
            max_rows : int
                Maximum clones per v,j pair (per subject)
            stratify_by_subject : bool
                If True, max_rows will apply to v,j,subject. If False, max_rows applies to v,j
            use_frequency : bool
                If True, uses frequency for ranking rows. If False, uses raw counts.
            make_singleton : bool
                If True, background is still sorted by frequency or counts,
                but final frequency and counts values are overridden
                and set to 1.
    """
    from tcrsampler.sampler import TCRsampler
    from palmotif import compute_pal_motif, svg_logo
    import pandas as pd
    from tcrdist.summarize import _select

    if chain == 'alpha' and tcrrep.organism == "mouse":
        # Here we enforce the rule that alpha-mouse cannot use an olga-sampler
        # TODO: This should be removed as soon as TCRsampler can be updated with a valid
        # mouse-alpha simulated background.
        combine_olga = False

    # _default_sampler returns a TCRSampler based on organism and chain
    if verbose: print(f"INITIALIZING A TCRSAMPLER")
    print(tcrrep.organism, chain)
    t = _default_sampler(organism=tcrrep.organism, chain=chain)(
        default_background=default_background,
        default_background_if_missing=default_background_if_missing)

    build_kargs = {
        'max_rows': 100,
        'stratify_by_subject': True,
        'use_frequency': True,
        'make_singleton': False
    }

    build_kargs_olga = {
        'max_rows': 1000,
        'stratify_by_subject': False,
        'use_frequency': True,
        'make_singleton': False
    }

    if verbose:
        print(f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs)}")

    t.build_background(**build_kargs)

    # Olga Sampler
    if combine_olga:
        t_olga = _default_sampler_olga(chain=chain, organism=tcrrep.organism)()
        if verbose:
            print(
                f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs_olga)}"
            )
        # Fix: build the OLGA sampler's own background. This call was
        # previously made on `t`, which rebuilt the primary sampler with the
        # OLGA settings and left `t_olga` unbuilt.
        t_olga.build_background(**build_kargs_olga)

    if 'prune' not in tcrrep.hcluster_df.columns:
        if verbose: print("NO PRUNE COLUMNS USED ALL SET TO 0")
        tcrrep.hcluster_df['prune'] = 0

    print("ITERATE THROUGH CLUSTERS")
    # One entry per hcluster_df row is appended to each of these lists
    svgs = list()
    svgs_raw = list()
    reference_unique = list()
    reference_unique_olga = list()
    reference_size = list()
    reference_size_olga = list()
    percent_missing_sampler = list()
    percent_missing_sampler_olga = list()
    n_rows = tcrrep.hcluster_df.shape[0]

    bar = IncrementalBar(f'Make {chain} SVGs :',
                         max=n_rows,
                         suffix='%(percent)d%%')
    for i, r in tcrrep.hcluster_df.iterrows():
        bar.next()
        if r['prune'] == 0:
            # <dfnode> is dataframe with all the clones at a given tree node
            dfnode = tcrrep.clone_df.iloc[r['neighbors_i'], ].copy()
            # <pwnode> Pairwise Matrix for node sequences
            pwnode = getattr(
                tcrrep,
                pwmat_str)[r['neighbors_i'], :][:, r['neighbors_i']].copy()
            # Centroid: the node member with the smallest summed distance
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]

            # Compute gene usage at the node
            # Convert to allele_01
            for gene_name in gene_names:
                dfnode[gene_name] = dfnode[gene_name].apply(
                    lambda x: allele_01(x))

            gene_usage = dfnode.groupby(
                gene_names).size()  # e.g., ['v_b_gene','j_b_gene']
            gene_usage_tuples = gene_usage.reset_index().to_dict(
                'split')['data']
            # Given gene usage use the <t> a TCRsampler instance to get background seqs

            # Adjust depth for small nodes; never below 10
            adjust_depth = max(10, 10 * round(10 / dfnode.shape[0]))
            sample_depth = adjust_depth * 10
            # Sequences expected if every v,j pair were present in a sampler
            expected_depth = dfnode.shape[0] * sample_depth

            sampled_rep = t.sample(gene_usage_tuples,
                                   flatten=True,
                                   depth=sample_depth)

            # Only keep the non-none sequences
            sampled_rep = [x for x in sampled_rep if x is not None]
            # Percentage missing: the sampler returns None when no v,j pair is present
            percent_missing = round(
                100 * (1 - (len(sampled_rep) / expected_depth)), 1)

            percent_missing_sampler.append(f"{percent_missing}%")
            reference_unique.append(str(pd.Series(sampled_rep).nunique()))
            reference_size.append(str(pd.Series(sampled_rep).count()))

            if combine_olga:
                # Sequences are drawn from the pre-built OLGA sampler rather
                # than simulated on the fly with a given V,J usage: sampling
                # is faster, at the cost of depth.
                sampled_rep_olga = t_olga.sample(gene_usage_tuples,
                                                 flatten=True,
                                                 depth=sample_depth)

                sampled_rep_olga = [
                    x for x in sampled_rep_olga if x is not None
                ]

                percent_missing_olga = round(
                    100 * (1 - (len(sampled_rep_olga) / expected_depth)), 1)

                percent_missing_sampler_olga.append(f"{percent_missing_olga}%")
                reference_unique_olga.append(
                    str(pd.Series(sampled_rep_olga).nunique()))
                reference_size_olga.append(
                    str(pd.Series(sampled_rep_olga).count()))

                # HERE WE COMBINE INTO A SINGLE BACKGROUND:
                sampled_rep = sampled_rep + sampled_rep_olga

            # Get motif matrix and motif stats
            motif, stat = compute_pal_motif(seqs=_select(
                df=tcrrep.clone_df, iloc_rows=r['neighbors_i'], col=cdr3_name),
                                            refs=sampled_rep,
                                            centroid=centroid)

            svgs.append(svg_logo(motif, return_str=True))

            # Repeat without references
            raw_motif, raw_stat = compute_pal_motif(seqs=_select(
                df=tcrrep.clone_df, iloc_rows=r['neighbors_i'], col=cdr3_name),
                                                    centroid=centroid)
            # Convert the motif matrix into an svg_logo, append to list
            svgs_raw.append(svg_logo(raw_motif, return_str=True))
        else:
            # If prune column is 1 don't go to the trouble of sampling and generating seqs
            svgs.append("PRUNE")
            svgs_raw.append("PRUNE")
            reference_size.append("PRUNE")
            reference_unique.append("PRUNE")
            percent_missing_sampler.append("PRUNE")
            percent_missing_sampler_olga.append("PRUNE")
            reference_unique_olga.append("PRUNE")
            reference_size_olga.append("PRUNE")

    # The bar has already been advanced once per row inside the loop
    bar.finish()

    # The standard svg_ includes background, whereas raw has no background
    tcrrep.hcluster_df_detailed[f'svg_{chain}'] = svgs
    tcrrep.hcluster_df_detailed[f'svg_raw_{chain}'] = svgs_raw
    tcrrep.hcluster_df_detailed[f'ref_size_{chain}'] = reference_size
    tcrrep.hcluster_df_detailed[f'ref_unique_{chain}'] = reference_unique
    tcrrep.hcluster_df_detailed[
        f'percent_missing_{chain}'] = percent_missing_sampler
    if combine_olga:
        tcrrep.hcluster_df_detailed[
            f'ref_size_olga_{chain}'] = reference_size_olga
        tcrrep.hcluster_df_detailed[
            f'ref_unique_olga_{chain}'] = reference_unique_olga
        tcrrep.hcluster_df_detailed[
            f'percent_missing_olga_{chain}'] = percent_missing_sampler_olga

    return True