Example #1
0
def test_pgen_1():
    """
    How to add pgen estimates to human alpha/beta CDR3s

    Builds a TCRrep from five rows sampled (random_state = 3) from
    ``dash_human.csv`` and stores the values returned by
    ``OlgaModel.compute_aa_cdr3_pgens`` for each chain in the new
    ``pgen_cdr3_b_aa`` and ``pgen_cdr3_a_aa`` columns of ``tr.clone_df``.
    """
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    from tcrdist import mappers 
    from tcrdist.repertoire import TCRrep
    from tcrdist.setup_tests import download_and_extract_zip_file

    df = pd.read_csv("dash_human.csv")

    tr = TCRrep(cell_df = df.sample(5, random_state = 3), 
                organism = 'human', 
                chains = ['alpha','beta'], 
                db_file = 'alphabeta_gammadelta_db.tsv', 
                store_all_cdr = False)

    # One generative model per chain: VDJ recombination for beta, VJ for alpha.
    olga_beta  = OlgaModel(chain_folder = "human_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder = "human_T_alpha", recomb_type="VJ")

    tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
        CDR3_seq = tr.clone_df.cdr3_b_aa)
    
    tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
        CDR3_seq = tr.clone_df.cdr3_a_aa)

    # Selecting the columns doubles as a check that both pgen columns exist
    # (a missing column raises KeyError).
    tr.clone_df[['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa','pgen_cdr3_a_aa']]
Example #2
0
def test_ex5():
    """
    Run tcrdist2 with the 'hamming' metric and explicit replacement weights
    on the 'PA' epitope rows of dash.csv, then check that the combined
    matrices equal the weighted sums of the per-region pairwise matrices.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    all_cells = pd.read_csv('dash.csv')
    pa_cells = all_cells[all_cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_cells, chains=['alpha', 'beta'], organism='mouse')

    # CDR3 regions are weighted 3, every other region 1.
    region_weights = {
        'cdr3_a_aa': 3,
        'pmhc_a_aa': 1,
        'cdr2_a_aa': 1,
        'cdr1_a_aa': 1,
        'cdr3_b_aa': 3,
        'pmhc_b_aa': 1,
        'cdr2_b_aa': 1,
        'cdr1_b_aa': 1,
    }
    tr.tcrdist2(processes=1,
                metric='hamming',
                reduce=True,
                dump=False,
                save=False,
                replacement_weights=region_weights)

    # Total distance is the sum of the two chain-level matrices.
    chain_sum = tr.pw_alpha + tr.pw_beta
    assert np.all(tr.pw_tcrdist == chain_sum)

    # Each chain-level matrix is the weighted sum of its region matrices.
    beta_sum = (3 * tr.cdr3_b_aa_pw + tr.pmhc_b_aa_pw +
                tr.cdr2_b_aa_pw + tr.cdr1_b_aa_pw)
    assert np.all(tr.pw_beta == beta_sum)

    alpha_sum = (3 * tr.cdr3_a_aa_pw + tr.pmhc_a_aa_pw +
                 tr.cdr2_a_aa_pw + tr.cdr1_a_aa_pw)
    assert np.all(tr.pw_alpha == alpha_sum)
Example #3
0
    def test_repertoire_full_use_case(self):
        """
        Not a unit test per se: this exercises a full use case of a TCRrep()
        instance for pairwise sequence comparison.

        Steps: (1) build the TCRrep, (2) extend index_cols, (3) deduplicate,
        (4) select 'blosum62' for both CDR3 chains, (5)-(6) compute pairwise
        results per chain, (7) sum the alpha and beta CDR3 matrices and
        compare them with the stored expected values.
        """
        testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])   # (1)
        for extra_col in ("epitope", "subject"):                         # (2)
            testrep.index_cols.append(extra_col)
        testrep.deduplicate()                                            # (3)
        testrep.cdr3_a_aa_smat = 'blosum62'                              # (4)
        testrep.cdr3_b_aa_smat = 'blosum62'
        for chain_name in ("alpha", "beta"):                             # (5), (6)
            testrep.compute_pairwise(chain=chain_name)
        observed = testrep.cdr3_a_aa_pw + testrep.cdr3_b_aa_pw           # (7)

        expected = np.array([
            [0., 222., 210., 223., 231., 239., 219., 231., 175.],
            [222., 0., 116., 175., 173., 185., 131., 209., 205.],
            [210., 116., 0., 175., 169., 183., 145., 201., 221.],
            [223., 175., 175., 0., 154., 200., 162., 234., 202.],
            [231., 173., 169., 154., 0., 152., 120., 182., 192.],
            [239., 185., 183., 200., 152., 0., 146., 112., 192.],
            [219., 131., 145., 162., 120., 146., 0., 178., 172.],
            [231., 209., 201., 234., 182., 112., 178., 0., 220.],
            [175., 205., 221., 202., 192., 192., 172., 220., 0.],
        ])

        matches = (observed == expected)
        self.assertTrue(matches.all())
Example #4
0
def test_integration_TCRrep_with_TCRMotif():
    """
    Integration example: load a vdjDB export, convert it with
    mappers.vdjdb_to_tcrdist2, keep the "MusMusculus" rows, build and
    deduplicate a TCRrep, and hand its tcr_motif_clones_df() to TCRMotif.

    Passes when ``motif.clones_df`` is a pandas DataFrame.
    """
    # `os` is imported here so the example does not depend on a module-level import.
    import os
    import pandas as pd
    import tcrdist as td
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif
    fn = os.path.join("tcrdist","test_files", "vdjDB_PMID28636592.tsv")
    pd_df = pd.read_csv(fn, sep = "\t")        # 1
    t_df = td.mappers.vdjdb_to_tcrdist2(pd_df = pd_df)               # 2
    # Actually call value_counts(); the bare attribute reference did nothing.
    t_df.organism.value_counts()                                     # 3
    index_mus = t_df.organism == "MusMusculus"                       # 4
    t_df_mus = t_df.loc[index_mus,:].copy()                          # 5

    tr = TCRrep(cell_df = t_df_mus, organism = "mouse")              # 6

    tr.infer_cdrs_from_v_gene(chain = 'alpha')                       # 7
    tr.infer_cdrs_from_v_gene(chain = 'beta')                        # 8

    # 9: columns used by deduplicate() below
    tr.index_cols = ['subject', 'epitope',                           # subject and epitope
                     'v_a_gene',  'j_a_gene', 'v_b_gene', 'j_b_gene',# gene usage
                     'cdr3_a_aa', 'cdr3_b_aa',                       # CDR 3
                     'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',          # alpha CDR 1, 2, and 2.5
                     'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa']          # beta CDR 1, 2, and 2.5

    tr.deduplicate()                                                 # 10

    motif = TCRMotif(clones_df = tr.tcr_motif_clones_df(), organism = "mouse", chains = ["A","B"], epitopes = ["PA"]) # 11
    assert isinstance(motif.clones_df, pd.DataFrame)
Example #5
0
    def test_repertoire_full_use_case_hamming(self):
        """
        Not a unit test per se: this exercises a full use case of a TCRrep()
        instance for pairwise sequence comparison, calling
        compute_pairwise_all with metric="hamming" for each chain.
        """
        testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])  # (1)
        for extra_col in ("epitope", "subject"):  # (2)
            testrep.index_cols.append(extra_col)
        testrep.deduplicate()  # (3)
        testrep.cdr3_a_aa_smat = 'blosum62'  # (4)
        testrep.cdr3_b_aa_smat = 'blosum62'
        for chain_name in ("alpha", "beta"):  # (5), (6)
            testrep.compute_pairwise_all(chain=chain_name, metric="hamming")
        observed = testrep.cdr3_a_aa_pw + testrep.cdr3_b_aa_pw  # (7)

        expected = np.array([
            [0., 18., 17., 18., 19., 22., 19., 18., 16.],
            [18., 0., 11., 15., 15., 17., 10., 18., 18.],
            [17., 11., 0., 18., 15., 17., 13., 18., 20.],
            [18., 15., 18., 0., 14., 19., 14., 20., 18.],
            [19., 15., 15., 14., 0., 14., 11., 17., 16.],
            [22., 17., 17., 19., 14., 0., 14., 13., 18.],
            [19., 10., 13., 14., 11., 14., 0., 17., 15.],
            [18., 18., 18., 20., 17., 13., 17., 0., 19.],
            [16., 18., 20., 18., 16., 18., 15., 19., 0.],
        ])

        matches = (observed == expected)
        self.assertTrue(matches.all())
Example #6
0
def test_ex11():
    """
    Create a TCRrep from an empty DataFrame and call rebuild() on the
    archive "some_archive.tar.gz".
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    empty_cells = pd.DataFrame()
    tr = TCRrep(cell_df=empty_cells, chains=['alpha', 'beta'], organism='mouse')

    archive_name = "some_archive.tar.gz"
    tr.rebuild(dest_tar_name=archive_name)
Example #7
0
def run_TCRrep_func_tcrdist2(chains = ['beta','alpha'], metric = "nw"):
    """
    Build a TCRrep from the 'NP' epitope rows of dash.csv and run tcrdist2.

    Parameters
    ----------
    chains : list of str
        Chains passed to TCRrep.
    metric : str
        Metric name passed to ``tr.tcrdist2``.

    Returns
    -------
    The TCRrep instance after ``tcrdist2`` has been called.
    """
    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    # Pass a copy: the default `chains` list is one shared object across calls,
    # so it must not be handed to an object that may keep or modify it.
    tr = TCRrep(cell_df=df, chains=list(chains), organism='mouse')
    tr.tcrdist2(processes = cpu, metric = metric, dump = True, reduce = True, save=False)
    return tr
Example #8
0
def test_example_9():
    """
    If you already have a clones file and want
    to compute 'tcrdistances' on a DataFrame with
    custom column names.

    Set:
    1. Assign TCRrep.clone_df
    2. set infer_cdrs = False,
    3. compute_distances = False
    4. deduplicate = False
    5. re-key the metrics, weights, and kargs dicts with the custom
       column names ({new_cols[k]: v for k, v in d.items()})
    6. call .compute_distances()
    """
    import pwseqdist as pw
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    # Default column name -> custom column name.
    new_cols = {
        'cdr3_a_aa': 'c3a',
        'pmhc_a_aa': 'pa',
        'cdr2_a_aa': 'c2a',
        'cdr1_a_aa': 'c1a',
        'cdr3_b_aa': 'c3b',
        'pmhc_b_aa': 'pb',
        'cdr2_b_aa': 'c2b',
        'cdr1_b_aa': 'c1b',
    }

    renamed_df = pd.read_csv("dash2.csv").rename(columns=new_cols)

    tr = TCRrep(
        cell_df=renamed_df,
        clone_df=renamed_df,  #(1)
        organism='mouse',
        chains=['alpha', 'beta'],
        infer_all_genes=True,
        infer_cdrs=False,  #(2)
        compute_distances=False,  #(3)
        deduplicate=False,  #(4)
        db_file='alphabeta_gammadelta_db.tsv')

    def _rekey(settings):
        """Return a copy of `settings` keyed by the custom column names."""
        return {new_cols[key]: value for key, value in settings.items()}

    #(5) re-key every per-chain settings dict
    for attr_name in ('metrics_a', 'metrics_b',
                      'weights_a', 'weights_b',
                      'kargs_a', 'kargs_b'):
        setattr(tr, attr_name, _rekey(getattr(tr, attr_name)))

    tr.compute_distances()  #(6)

    # Notice that pairwise results now have custom names; reading each
    # attribute raises AttributeError if it is missing.
    for result_name in ('pw_c3b', 'pw_c3a', 'pw_alpha', 'pw_beta'):
        getattr(tr, result_name)
Example #9
0
def test_ex2():
    """
    Run tcrdist2 with the 'nw' metric on the 'PA' epitope rows of dash.csv
    and read the resulting ``pw_tcrdist`` attribute.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    all_cells = pd.read_csv('dash.csv')
    pa_cells = all_cells[all_cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_cells, chains=['alpha', 'beta'], organism='mouse')

    run_options = dict(processes=1, metric='nw', reduce=True, dump=False, save=False)
    tr.tcrdist2(**run_options)

    # Attribute access fails loudly if tcrdist2 did not set the result.
    tr.pw_tcrdist
Example #10
0
def test_TCRpublic():
    """
    Build a beta-chain TCRrep from the first 200 'PA' rows of dash.csv,
    assign every clone a radius of 40, and produce a TCRpublic report.
    """
    pa_rows = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index().copy()
    first_rows = pa_rows.head(200).copy()
    tr = TCRrep(
        cell_df=first_rows,
        organism='mouse',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv',
        compute_distances=True,
    )
    # Same radius for every clone.
    tr.clone_df['radius'] = 40

    from tcrdist.public import TCRpublic
    tp = TCRpublic(tcrrep=tr, organism='mouse', chain='beta')
    tp.report()
Example #11
0
def test_current_example():
    """
    End-to-end example on a MIRA epitope set with a background file.

    Builds a background TCRrep (no distances) and a foreground TCRrep
    (with distances), runs auto_pgen, computes rectangular distances of
    the foreground against the background, calls bkgd_cntl_nn2, then
    writes the centers table to CSV and a thresholded copy of rw_beta
    to a sparse .npz file.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    # Background repertoire.
    background_name = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.olga100K_brit100K_bkgd.csv'
    background_path = os.path.join('tcrdist', 'data', 'covid19', background_name)
    df_background = pd.read_csv(background_path)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    # Foreground (MIRA) repertoire, restricted to the beta-chain columns.
    mira_name = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv'
    mira_path = os.path.join('tcrdist', 'data', 'covid19', mira_name)
    beta_columns = ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']
    df_mira = pd.read_csv(mira_path)[beta_columns]
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)
    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    # One row of rw_beta per foreground clone.
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=10**-6,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=list(range(0, 50, 2)),
                               generate_regex=True,
                               test_regex=True)

    centers_out = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.centers_df.csv'
    sparse_out = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    centers_df.to_csv(centers_out, index=False)

    # Order matters: true zeros become 1 first, then every distance above 50
    # is set to 0. Presumably 0 then means "not stored" in the sparse matrix;
    # confirm against the reader of this file.
    tr.rw_beta[tr.rw_beta == 0] = 1
    tr.rw_beta[tr.rw_beta > 50] = 0
    sparse_rw_beta = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(sparse_out, sparse_rw_beta)
Example #12
0
def test_TCRrep_show_incomplete_entries():
    """
    Test that TCRrep.show_incomplete() returns the rows that are missing
    a value in one of the index columns.
    """
    cells = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", None],
        "count": [1, 1, 1],
    })
    tr = TCRrep(cell_df=cells, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']

    incomplete = tr.show_incomplete()

    # Only the third row (index 2) lacks a v_a_gene.
    expected = {'cdr3_a_aa': {2: 'C'}, 'v_a_gene': {2: None}}
    assert isinstance(incomplete, pd.DataFrame)
    assert incomplete.to_dict() == expected
Example #13
0
def test_calc_radii():
    """
    Compute per-clone radii for the 'PA' beta-chain clones against a
    background of all non-'PA' clones, then use those radii to add
    neighbor, background-neighbor and publicity columns to tr.clone_df.
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    # Foreground: 'PA' clones, with pairwise distances computed.
    foreground = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop=True)
    tr = TCRrep(
        cell_df=foreground.copy(),
        organism='mouse',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv',
        compute_distances=True,
    )

    # Background: everything else, no pairwise distances.
    background = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop=True)
    tr_bkgd = TCRrep(
        cell_df=background.copy(),
        organism='mouse',
        chains=['beta'],
        db_file='alphabeta_gammadelta_db.tsv',
        compute_distances=False,
    )

    from tcrdist.centers import calc_radii
    radii, thresholds, ecdfs = calc_radii(
        tr=tr,
        tr_bkgd=tr_bkgd,
        chain='beta',
        ctrl_bkgd=10**-3,
        use_sparse=False,
        max_radius=50,
    )

    from tcrdist.public import _neighbors_variable_radius
    # Compute neighbors <= variable radius in the foreground set (pw_beta)
    # and in the background set (rw_beta).
    fg_neighbors = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list=radii)
    bg_neighbors = _neighbors_variable_radius(pwmat=tr.rw_beta, radius_list=radii)

    tr.clone_df['radius'] = radii
    tr.clone_df['neighbors'] = fg_neighbors
    tr.clone_df['background_neighbors'] = bg_neighbors

    def _count_subjects(neighbor_positions):
        """Number of distinct subjects among the given clone positions."""
        return tr.clone_df['subject'].iloc[neighbor_positions].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    # True when the neighborhood spans more than one subject.
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
    tr.clone_df
Example #14
0
def test_ex9():
    """
    Run tcrdist2 with the 'hamming' metric on the 'PA' epitope rows of
    dash.csv with save=True, using dest "some_archive" and
    dest_tar_name "some_archive.tar.gz".
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    all_cells = pd.read_csv('dash.csv')
    pa_cells = all_cells[all_cells.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_cells, chains=['alpha', 'beta'], organism='mouse')

    save_options = dict(
        processes=1,
        metric='hamming',
        reduce=True,
        dump=False,
        save=True,
        dest="some_archive",
        dest_tar_name="some_archive.tar.gz",
    )
    tr.tcrdist2(**save_options)
Example #15
0
def test_TCRrep_func_tcrdist2_save_auto_rebuild(chains = ['beta','alpha'], metric = "nw"):
    """
    Save a tcrdist2 run to an archive, rebuild a TCRrep from that archive,
    and compare it with a freshly computed TCRrep.

    Parameters
    ----------
    chains : list of str
        Chains used for the saved run, the rebuilt object and the
        comparison run.
    metric : str
        Metric used for the saved run and the comparison run.
    """
    import shutil

    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    # list(chains): never hand the shared default list to another object.
    tr = TCRrep(cell_df=df, chains=list(chains), organism='mouse')
    tr.tcrdist2(processes = cpu, metric = metric, dump = True, reduce = True, save= True, dest = "default_archive", dest_tar_name = "default_archive.tar.gz" )
    # Cleanup folder; shutil.rmtree avoids spawning a shell and works on
    # platforms without `rm`.
    # NOTE(review): the run above writes to dest "default_archive" but this
    # removes "myTCRrep_archive" -- confirm which folder was intended.
    shutil.rmtree("myTCRrep_archive", ignore_errors=True)
    # Rebuild from the archive, starting with an empty cell_df
    tr = TCRrep(cell_df=df.iloc[0:0,:], chains=list(chains), organism='mouse')
    tr.rebuild(dest_tar_name = "default_archive.tar.gz")

    # Build the comparison object with the SAME chains/metric as the saved
    # run; these were previously hard-coded, ignoring the arguments.
    tr_compare = run_TCRrep_func_tcrdist2(chains = list(chains), metric = metric)

    # For every attribute of the fresh object: is it present on the rebuilt one?
    attendance = {k: k in vars(tr) for k in vars(tr_compare)}
    assert attendance['pw_tcrdist']
    assert attendance['pw_alpha']
    assert attendance['pw_beta']
    assert attendance['clone_df']
    assert attendance['cell_df']
    # only compare things in common
    shared_attributes = [k for k, present in attendance.items() if present]
    for k in shared_attributes:
        rebuilt_value = getattr(tr, k)
        if isinstance(rebuilt_value, pd.DataFrame):
            # DataFrames are not compared here.
            continue
        if isinstance(rebuilt_value, dict):
            # Dicts are compared by key set only.
            assert set(rebuilt_value.keys()) == set(getattr(tr_compare, k).keys())
        else:
            assert np.all(rebuilt_value == getattr(tr_compare, k))

    assert set(tr.all_genes['mouse'].keys()) == set(tr_compare.all_genes['mouse'].keys())
    assert set(tr.all_genes['human'].keys()) == set(tr_compare.all_genes['human'].keys())
Example #16
0
def test_example_10_sparse():
    """
    If you just want the 'tcrdistances' of some target seqs against another
    set, computed sparsely.

    (1) cell_df is assigned the first 10 cells in dash.csv
    (2) a TCRrep is built with default settings and dash2.csv as df2
    (3) dense rectangular distances between clone_df and df2 are computed
        and kept as the reference
    (4) sparse rectangular distances are computed at radius 100 and must
        agree with the reference for every distance <= radius
    (5) sparse rectangular distances at radius 5000 must equal the
        reference everywhere
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    first_cells = pd.read_csv("dash.csv")
    df2 = pd.read_csv("dash2.csv")
    first_cells = first_cells.head(10)  #(1)
    tr = TCRrep(
        cell_df=first_cells,  #(2)
        df2=df2,
        organism='mouse',
        chains=['alpha', 'beta'],
        db_file='alphabeta_gammadelta_db.tsv')

    # Dense reference matrices.
    tr.compute_rect_distances(df=tr.clone_df, df2=df2)
    expected_shape = (10, 1924)
    assert tr.rw_alpha.shape == expected_shape
    assert tr.rw_beta.shape == expected_shape

    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    # Small radius: only entries within the radius are comparable.
    radius = 100
    tr.cpus = 1
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)
    restored = pw2dense(tr.rw_alpha, radius)
    print(dense_alpha[:2, :10])
    print(tr.rw_alpha.todense()[:2, :10])
    print(restored[:2, :10])
    assert np.all(dense_alpha[dense_alpha <= radius] == restored[restored <= radius])

    restored = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] == restored[restored <= radius])

    # Large radius: the restored matrices must equal the dense reference.
    radius = 5000
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)
    restored = pw2dense(tr.rw_alpha, radius)
    print(dense_alpha[:2, :10])
    print(tr.rw_alpha.todense()[:2, :10])
    assert np.all(dense_alpha == restored)

    restored = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta == restored)
Example #17
0
def test_TCRrep_deduplicate_gives_warning_above_complete_values():
    """
    Warn user if there is missing value in an index column and cell count will not match clone count

    The third row has no ``v_a_gene``, so deduplicate() is expected to
    emit a warning whose message starts with "Not all".
    """
    df = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", None],
        "count": [1, 1, 1]
    })
    tr = TCRrep(cell_df=df, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']
    # `pytest.warns(None)` is deprecated and rejected by current pytest.
    # Matching on the message also removes the dependence on the expected
    # warning being the first one recorded.
    with pytest.warns(Warning, match=r"^Not all"):  # cells/sequences could be grouped into clones.\n"
        tr.deduplicate()
Example #18
0
def test_TCRrep_show_incomplete_entries_when_there_are_none():
    """
    Test that TCRrep.show_incomplete() returns an empty DataFrame (zero
    rows, two columns) when no index column has a missing value.
    """
    cells = pd.DataFrame({
        "cdr3_a_aa": ["A", "B", "C"],
        "v_a_gene": ["TRAV1*01", "TRAV1*01", "TRAV1*01"],
        "count": [1, 1, 1],
    })
    tr = TCRrep(cell_df=cells, organism="human", chains=["alpha"])
    tr.index_cols = ['cdr3_a_aa', 'v_a_gene']

    incomplete = tr.show_incomplete()

    assert isinstance(incomplete, pd.DataFrame)
    n_rows = incomplete.shape[0]
    assert n_rows == 0
    n_cols = incomplete.shape[1]
    assert n_cols == 2
Example #19
0
 def test_repertoire_infer_cdrs_from_v_gene(self):
     """
     After infer_cdrs_from_v_gene() on both chains and deduplicate(), the
     CDR1 and CDR2 columns of clone_df must each be a pandas Series.
     """
     testrep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])
     for chain_name in ("alpha", "beta"):
         testrep.infer_cdrs_from_v_gene(chain=chain_name)
     testrep.index_cols = [
         'epitope', 'subject',
         'cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
         'cdr3_b_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
     ]
     testrep.deduplicate()

     # A list (not a generator) so every column is looked up even if an
     # earlier check is False.
     inferred_cols = ('cdr1_a_aa', 'cdr1_b_aa', 'cdr2_a_aa', 'cdr2_b_aa')
     is_series = [isinstance(testrep.clone_df[col], pd.Series)
                  for col in inferred_cols]
     self.assertTrue(np.all(is_series))
Example #20
0
def test_import_vdjtools_beta_w_validation():
    """
    Import a vdjtools beta-chain clonotype file with validate=True, check
    the resulting column names, and load a subset of the columns into a
    TCRrep without computing distances.
    """
    import pandas as pd
    import numpy as np
    import os
    from tcrdist.paths import path_to_base
    from tcrdist.vdjtools_funcs import import_vdjtools
    from tcrdist.repertoire import TCRrep

    # Reformat vdj_tools input format for tcrdist3
    beta_path = os.path.join(path_to_base, 'tcrdist', 'data', 'formats',
                             'vdj.M_15_CD8_beta.clonotypes.TRB.txt.gz')
    df_beta = import_vdjtools(vdj_tools_file=beta_path,
                              chain='beta',
                              organism='human',
                              db_file='alphabeta_gammadelta_db.tsv',
                              validate=True)

    # validate=True is expected to add the three trailing valid_* columns.
    expected_columns = [
        'count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'cdr3_b_nucseq',
        'valid_v', 'valid_j', 'valid_cdr3',
    ]
    assert np.all(df_beta.columns == expected_columns)

    # Can be directly imported into a TCRrep instance.
    tcrrep_columns = ['count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene']
    tr = TCRrep(cell_df=df_beta[tcrrep_columns],
                chains=['beta'],
                organism='human',
                compute_distances=False)
Example #21
0
def test_introduction_3():
    """
    Build a TCRrep from dash.csv without computing distances, write
    gene-pairing SVG plots for the 'PA' and 'NP' epitopes, and run
    fishersapi.fishers_frame on gene-column pairs of the 'NP' clones.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    cells = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=cells,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

    from tcrdist.plotting import plot_pairings, _write_svg

    def _clones_for(epitope):
        """Rows of tr.clone_df belonging to the given epitope."""
        return tr.clone_df.loc[tr.clone_df.epitope == epitope]

    def _pairing_svg(epitope):
        """Gene-pairing plot for one epitope, weighted by 'count'."""
        return plot_pairings(
            _clones_for(epitope),
            cols=['v_b_gene', 'j_b_gene', 'j_a_gene', 'v_a_gene'],
            count_col='count')

    # Both plots are built before either file is written.
    svg_PA = _pairing_svg("PA")
    svg_NP = _pairing_svg("NP")

    _write_svg(svg_PA, name="PA_gene_usage_plot.svg", dest=".")
    _write_svg(svg_NP, name="NP_gene_usage_plot.svg", dest=".")

    import fishersapi
    gene_pairs = [('v_b_gene', 'j_b_gene'),
                  ('v_a_gene', 'j_a_gene'),
                  ('v_a_gene', 'v_b_gene'),
                  ('j_a_gene', 'j_b_gene')]
    fishersapi.fishers_frame(_clones_for("NP"), col_pairs=gene_pairs)
Example #22
0
def test_validate_chains():
    """Test that an invalid chain name (wrong case) raises ValueError."""
    expected_message = "TCRrep chains arg can be one or more of the following ['alpha', 'beta', 'gamma', 'delta'] case-sensitive"
    with pytest.raises(ValueError) as info:
        # "ALPHA" is upper-case; the expected message says names are case-sensitive.
        TCRrep(organism="mouse", chains=["ALPHA", "beta"])
    raised_message = str(info.value)
    assert raised_message == expected_message
Example #23
0
def test_olga_sample_alphas_for_a_human_repertoire():
    """
    For every clone in a human alpha/beta TCRrep built from dash_human.csv,
    generate one CDR3 per chain with the matching OlgaModel, using the
    clone's V and J genes passed through allele_01.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel

    olga_model_alpha = OlgaModel(recomb_type="VJ",
                                 chain_folder="human_T_alpha")
    olga_model_beta = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    cells = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=cells,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    def _sample_per_clone(model, v_col, j_col):
        """One gen_cdr3s(n=1) result per clone row, for the given gene columns."""
        gene_rows = tr.clone_df[[v_col, j_col]].iterrows()
        return [
            model.gen_cdr3s(V=allele_01(row[v_col]),
                            J=allele_01(row[j_col]),
                            n=1)
            for _, row in gene_rows
        ]

    # Beta first, then alpha.
    rb = _sample_per_clone(olga_model_beta, 'v_b_gene', 'j_b_gene')
    ra = _sample_per_clone(olga_model_alpha, 'v_a_gene', 'j_a_gene')
Example #24
0
def test_workflow_1():
    """
    Load all the TCRs associated with a particular epitope in 
    the Adaptive Biotechnology COVID19 Data Release 2

    Reads the epitope CSV, keeps the listed metadata and beta-chain
    columns, adds a ``count`` column of 1 to every row, and builds a
    human beta-chain TCRrep.
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    path = os.path.join('tcrdist', 'data', 'covid19')
    file = 'mira_epitope_9_2477_FLQSINFVR_FLQSINFVRI_FLYLYALVYF_GLEAPFLYLY_INFVRIIMR_LQSINFVRI_LQSINFVRII_QSINFVRII_SINFVRIIMR_VYFLQSINF_VYFLQSINFV_YFLQSINFVR_YLYALVYFL.tcrdist3.csv'
    filename = os.path.join(path, file)

    df = pd.read_csv(filename, sep=",")

    # .copy() makes the column subset an independent frame, so the
    # assignment below is not a chained assignment on a slice of the
    # original (which pandas reports with SettingWithCopyWarning).
    df = df[[
        'cell_type', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope',
        'age', 'sex', 'race', 'cohort', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1',
        'hla-c', 'hla-c_1', 'dpa1_1', 'dpb1', 'dpb1_1', 'dqa1', 'dqa1_1',
        'dqb1', 'dqb1_1', 'drb1', 'drb1_1', 'drb3'
    ]].copy()

    # Every row gets a count of 1.
    df['count'] = 1

    tr = TCRrep(cell_df=df, organism='human', chains=['beta'])
Example #25
0
def test_validate_imgt_aligned():
    """Test that a non-boolean imgt_aligned argument raises ValueError."""
    expected_message = "TCRrep imgt_aligned argument must be a boolean"
    with pytest.raises(ValueError) as info:
        # "align" is a string, not a bool.
        TCRrep(organism="mouse",
               chains=["alpha", "beta"],
               imgt_aligned="align")
    raised_message = str(info.value)
    assert raised_message == expected_message
Example #26
0
def test_TCRpublic_with_neighborhood_dif():
    """
    Use values from neighborhood_diff in a TCRpublic report.

    Columns from the neighborhood_diff result are concatenated onto
    tr.clone_df, three of them are added to the report labels, and the
    report is sorted by ascending pvalue.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic

    radius_csv = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    cells = pd.read_csv(radius_csv)
    kept_columns = [
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]
    tr = TCRrep(cell_df=cells[kept_columns],
                organism="human",
                chains=["beta"])

    from tcrdist.rep_diff import neighborhood_diff
    ndif = neighborhood_diff(clone_df=tr.clone_df,
                             pwmat=tr.pw_beta,
                             count_col='count',
                             x_cols=['cohort'],
                             knn_radius=25,
                             test_method="chi2")

    # Add neighbors and other columns of interest
    # from the neighborhood_diff result to the clone_df
    ndif_columns = ndif[['neighbors', 'K_neighbors', 'val_0', 'ct_0', 'pvalue']]
    tr.clone_df = pd.concat([tr.clone_df, ndif_columns], axis=1)

    # Because neighbors and K_neighbors are already added to the clone_df,
    # TCRpublic.report() uses those instead of finding new ones.
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones_with_ndif.html")

    # Add any neighborhood_diff columns
    # that you want to display in the final report
    for extra_label in ('val_0', 'ct_0', 'pvalue'):
        tp.labels.append(extra_label)

    # Sort by pvalue rather than publicity; smallest pvalue first.
    tp.sort_columns = ['pvalue']
    tp.sort_ascending = True
    tp.report()
Example #27
0
def motif_creation_human_betas():
    """
    Write beta-chain CDR3 motif logos for the centroids of a human
    alpha/beta repertoire to "test_3.svg".

    For each row of ``tr.centroids_df`` (stopping at the first centroid
    with fewer than 5 neighbors) three palmotif logos are written: the
    neighbor sequences against generated reference sequences, the neighbor
    sequences alone, and the reference sequences alone.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif

    from tcrdist.pgen import OlgaModel
    oma = OlgaModel(recomb_type="VJ", chain_folder="human_T_alpha")
    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    from tcrdist.adpt_funcs import get_basic_centroids
    get_basic_centroids(tr, max_dist=75)
    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # NOTE(review): `break` (not `continue`) presumably relies on
            # centroids_df being ordered by neighborhood size -- confirm.
            if len(r['neighbors']) < 5:
                break
            neighbor_rows = tr.clone_df.iloc[r['neighbors'], ]
            seqs = neighbor_rows['cdr3_b_aa'].to_list()
            # [v_gene, j_gene, count] triples for the neighborhood.
            gene_usages = neighbor_rows[[
                'v_b_gene', 'j_b_gene'
            ]].value_counts().reset_index().to_dict('split')['data']
            depth = 3

            # Generate `count * depth` reference CDR3s per (V, J) pair used in
            # this neighborhood. This iterates `gene_usages`; the previous
            # code iterated `combos_alpha`, which is not defined in this
            # function and left `gene_usages` unused.
            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), count * depth)
                for v, j, count in gene_usages
            ])
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # The centroid itself is added to the references before the
            # background-only motif is computed.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]

            # Plain loop: the writes are side effects, not list construction.
            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')

        oh.write('</body>')
Example #28
0
def test_neighbors_and_publicity_directly():
    """
    Instead of enforcing a fixed radius,
    use a radius specific to each
    centroid, specified in an additional
    column.

    Calls the fixed- and variable-radius neighbor helpers of
    tcrdist.public directly, then derives 'nsubject' and 'qpublic'
    columns from the variable-radius neighbors.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic

    radius_csv = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    cells = pd.read_csv(radius_csv)
    kept_columns = [
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]
    tr = TCRrep(cell_df=cells[kept_columns],
                organism="human",
                chains=["beta"])

    # NEIGHBORS
    from tcrdist.public import _neighbors_fixed_radius
    from tcrdist.public import _neighbors_variable_radius
    # returns lists of lists of all neighbors at fixed or variable radii
    _neighbors_fixed_radius(pwmat=tr.pw_beta, radius=18)
    _neighbors_variable_radius(pwmat=tr.pw_beta,
                               radius_list=tr.clone_df.radius)

    # returns the number (K) of neighbors at fixed or variable radii
    from tcrdist.public import _K_neighbors_fixed_radius
    from tcrdist.public import _K_neighbors_variable_radius
    _K_neighbors_fixed_radius(pwmat=tr.pw_beta, radius=18)
    _K_neighbors_variable_radius(pwmat=tr.pw_beta,
                                 radius_list=tr.clone_df.radius)

    # First find neighbors by your favorite method
    tr.clone_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta, radius_list=tr.clone_df.radius)

    # Once neighbors are added to a clone_df you can easily determine publicity.
    def _count_subjects(neighbor_positions):
        """Number of distinct subjects among the given clone positions."""
        return tr.clone_df['subject'].iloc[neighbor_positions].nunique()

    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_count_subjects)
    # True when the neighborhood spans more than one subject.
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda n: n > 1)
Example #29
0
def test_mixcr_to_tcrdist_on_clones():
    """
    Convert a MiXCR clones file (human delta chain) with
    mixcr.mixcr_to_tcrdist2, drop entries with an invalid V gene, and run
    the result through TCRrep: infer CDRs, set index columns, deduplicate.

    Passes when the converted frame, the filtered frame and the final
    ``clone_df`` are all pandas DataFrames.
    """
    test_clones = os.path.join('tcrdist', 'test_files_compact',
                               'SRR5130260.1.test.fastq.output.clns.txt')
    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 clones_fn=test_clones)

    assert isinstance(df, pd.DataFrame)
    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    # Check the FILTERED frame; this assert previously re-checked `df`.
    assert isinstance(df1, pd.DataFrame)
    # All rows are labelled with the same subject.
    df1['subject'] = 'SRR5130260.1'

    tr = TCRrep(cell_df=df1,
                organism="human",
                chains=['delta'],
                db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])

    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)

    # Columns used by deduplicate() below.
    tr.index_cols = [
        'subject', "v_d_gene", 'd_d_gene', 'j_d_gene', 'cdr3_d_nucseq',
        'cdr3_d_aa', 'cdr1_d_aa', 'cdr2_d_aa', 'pmhc_d_aa'
    ]

    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
Example #30
0
def test_example_10_sparse_multiprocessing():
    """
    Compare dense and sparse rectangular distances of dash2.csv against
    itself, with tr.cpus set to 2 for the sparse computation.

    For both chains, every dense distance <= radius must equal the
    corresponding entry restored from the sparse result by pw2dense.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    df2 = pd.read_csv("dash2.csv")
    tr = TCRrep(
        cell_df=df2,  #(2)
        df2=df2,
        organism='mouse',
        chains=['alpha', 'beta'],
        db_file='alphabeta_gammadelta_db.tsv')

    # Dense reference matrices.
    tr.compute_rect_distances(df=tr.clone_df, df2=df2)
    expected_shape = (1924, 1924)
    assert tr.rw_alpha.shape == expected_shape
    assert tr.rw_beta.shape == expected_shape

    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    radius = 150
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)

    # Alpha chain: compare only entries within the radius.
    restored = pw2dense(tr.rw_alpha, radius)
    assert np.all(dense_alpha[dense_alpha <= radius] == restored[restored <= radius])

    # Beta chain: same comparison.
    restored = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] == restored[restored <= radius])