Example #1
0
def test_TCRrep_func_tcrdist2_save_auto_rebuild(chains=None, metric="nw"):
    """Save a TCRrep via ``tcrdist2(save=True)`` and restore it with ``rebuild()``.

    The rebuilt object is compared against a reference produced by
    ``run_TCRrep_func_tcrdist2`` using the same ``chains`` and ``metric``.

    Parameters
    ----------
    chains : list of str, optional
        Chains passed to TCRrep; defaults to ``['beta', 'alpha']``.
    metric : str
        Metric name forwarded to ``tcrdist2``.
    """
    import shutil

    # Avoid a shared mutable default argument.
    if chains is None:
        chains = ['beta', 'alpha']

    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes=cpu, metric=metric, dump=True, reduce=True, save=True,
                dest="default_archive", dest_tar_name="default_archive.tar.gz")
    # Cleanup folder that you just made (portable replacement for `rm -rf`).
    # NOTE(review): tcrdist2 is given dest="default_archive" but the folder
    # removed here is "myTCRrep_archive" -- confirm which one is created.
    shutil.rmtree("myTCRrep_archive", ignore_errors=True)
    # Rebuild into an empty TCRrep (zero-row frame with the same columns).
    tr = TCRrep(cell_df=df.iloc[0:0, :], chains=chains, organism='mouse')
    tr.rebuild(dest_tar_name="default_archive.tar.gz")

    # Build the reference with the SAME chains/metric this test was given.
    tr_compare = run_TCRrep_func_tcrdist2(chains=chains, metric=metric)

    # For every attribute of the reference, record whether the rebuilt
    # object carries it too.
    attendance = {k: k in vars(tr) for k in vars(tr_compare)}
    assert attendance['pw_tcrdist']
    assert attendance['pw_alpha']
    assert attendance['pw_beta']
    assert attendance['clone_df']
    assert attendance['cell_df']

    # only compare things in common
    shared_attributes = [k for k, present in attendance.items() if present]
    for k in shared_attributes:
        rebuilt, reference = getattr(tr, k), getattr(tr_compare, k)
        if isinstance(rebuilt, pd.DataFrame):
            # DataFrames are not compared here.
            continue
        if isinstance(rebuilt, dict):
            # Dicts are compared by key set only.
            assert set(rebuilt.keys()) == set(reference.keys())
        else:
            assert np.all(rebuilt == reference)

    assert set(tr.all_genes['mouse'].keys()) == set(tr_compare.all_genes['mouse'].keys())
    assert set(tr.all_genes['human'].keys()) == set(tr_compare.all_genes['human'].keys())
Example #2
0
def test_calc_radii():
    """Example: per-clone radii from ``calc_radii`` and the neighbor columns built from them.

    The "PA" rows of dash.csv form the foreground TCRrep (distances computed);
    all other rows form the background TCRrep (distances not computed).
    """
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    def _mouse_beta_rep(frame, with_distances):
        # Both repertoires share every setting except compute_distances.
        return TCRrep(cell_df=frame.copy(),
                      organism='mouse',
                      chains=['beta'],
                      db_file='alphabeta_gammadelta_db.tsv',
                      compute_distances=with_distances)

    def _n_subjects(neighbor_idx):
        # Distinct subjects among the clones at the given positions.
        return tr.clone_df['subject'].iloc[neighbor_idx].nunique()

    def _more_than_one(n):
        return n > 1

    df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop=True)
    tr = _mouse_beta_rep(df, True)

    df = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop=True)
    tr_bkgd = _mouse_beta_rep(df, False)

    from tcrdist.centers import calc_radii
    radii, thresholds, ecdfs = calc_radii(tr=tr,
                                          tr_bkgd=tr_bkgd,
                                          chain='beta',
                                          ctrl_bkgd=10**-3,
                                          use_sparse=False,
                                          max_radius=50)

    from tcrdist.public import _neighbors_variable_radius
    # Neighbors within each clone's own radius, first in the foreground
    # matrix (pw_beta) and then in the rectangular matrix (rw_beta).
    neighbors = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list=radii)
    background_neighbors = _neighbors_variable_radius(pwmat=tr.rw_beta, radius_list=radii)

    tr.clone_df['radius'] = radii
    tr.clone_df['neighbors'] = neighbors
    tr.clone_df['background_neighbors'] = background_neighbors
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(_n_subjects)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(_more_than_one)
    tr.clone_df
Example #3
0
def do_search2(file, df_search, dest, tag, path):
    """Compare search clones against one bulk file and write the tabulation as TSV.

    Parameters
    ----------
    file : str
        Bulk file name inside ``path``; '.tcrdist.tsv' is stripped from it to
        form the sample name.
    df_search : DataFrame
        Input for the search TCRrep (human, delta chain).
    dest : str
        Directory that receives ``<sample>.<tag>.bulk_tabulation.tsv``.
    tag : str
        Label inserted into the output file name.
    path : str
        Directory containing ``file``.

    Returns
    -------
    str
        Elapsed wall-clock time formatted as ``"<seconds>s"``.
    """
    tic = time.perf_counter()
    sample_name = file.replace('.tcrdist.tsv', '')

    # <tr_search> tcrdist.repertoire.TCRrep object for computing distances
    tr_search = TCRrep(cell_df=df_search,
                       organism='human',
                       chains=['delta'],
                       db_file='alphabeta_gammadelta_db.tsv',
                       compute_distances=False)
    # single cpu for this search object
    tr_search.cpus = 1

    # The bulk file's beta-named CDR3 column is relabelled as delta.
    bulk_path = os.path.join(path, file)
    df_bulk = pd.read_csv(bulk_path, sep='\t')
    df_bulk = df_bulk.rename(columns={'cdr3_b_aa': 'cdr3_d_aa'})
    print(df_bulk)
    wanted_columns = [
        'cdr3_d_aa', 'v_d_gene', 'j_d_gene', 'templates',
        'productive_frequency'
    ]
    df_bulk = df_bulk[wanted_columns].rename(columns={'templates': 'count'})

    tr_bulk = TCRrep(cell_df=df_bulk,
                     organism='human',
                     chains=['delta'],
                     db_file='alphabeta_gammadelta_db.tsv',
                     compute_distances=False)

    search_clones = tr_search.clone_df.shape[0]
    bulk_clones = tr_bulk.clone_df.shape[0]
    # To avoid memory pressure on the system we set a target that tcrdist
    # doesn't do more than 10M comparisons per process
    ideal_chunk_size = get_safe_chunk(search_clones, bulk_clones, target=10**7)
    tr_search.compute_sparse_rect_distances(df=tr_search.clone_df,
                                            df2=tr_bulk.clone_df,
                                            chunk_size=ideal_chunk_size)
    r1 = tabulate(clone_df1=tr_search.clone_df,
                  clone_df2=tr_bulk.clone_df,
                  pwmat=tr_search.rw_delta,
                  cdr3_name='cdr3_d_aa',
                  v_gene_name='v_d_gene',
                  j_gene_name='j_d_gene')

    outfile = os.path.join(dest, f"{sample_name}.{tag}.bulk_tabulation.tsv")
    print(f"WRITING: {outfile}")
    r1.to_csv(outfile, sep='\t', index=False)

    toc = time.perf_counter()
    elapsed = toc - tic
    print(f"TABULATED IN {elapsed:0.4f} seconds")
    # Drop the references to the two repertoire objects before returning.
    del tr_search, tr_bulk
    return f"{elapsed:0.4f}s"
Example #4
0
def test_current_example():
    """Example: background-controlled centers for a MIRA epitope set.

    Computes rectangular beta-chain distances between the MIRA clones and a
    background set, runs ``bkgd_cntl_nn2``, then writes the centers table as
    CSV and a thresholded copy of ``tr.rw_beta`` as a sparse ``.npz`` file.
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    import scipy.sparse

    # --- background repertoire (distances not computed up front) ---
    fn_mira_background = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.olga100K_brit100K_bkgd.csv')
    df_background = pd.read_csv(fn_mira_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    # --- target repertoire, restricted to the three beta-chain columns ---
    fn_mira = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv')
    beta_columns = ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']
    df_mira = pd.read_csv(fn_mira)[beta_columns]
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)
    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    # One row of rw_beta per target clone.
    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    candidate_thresholds = list(range(0, 50, 2))
    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=10**-6,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=candidate_thresholds,
                               generate_regex=True,
                               test_regex=True)

    out_fn_center_df = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.centers_df.csv'
    out_fn_rw_beta_sparse_matrix = 'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv.rw_beta.sparse.npz'
    centers_df.to_csv(out_fn_center_df, index=False)

    # Order matters: true zeros become 1 first, so that the zeros written
    # next (for every distance above 50) are the only zeros left when the
    # matrix is converted to CSR.
    tr.rw_beta[tr.rw_beta == 0] = 1
    tr.rw_beta[tr.rw_beta > 50] = 0
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    scipy.sparse.save_npz(out_fn_rw_beta_sparse_matrix, rw_beta_sparse)
Example #5
0
def test_example_with_report():
    """Example: variable-radius neighbors against a sampled background, plus a report.

    Builds a mouse beta-chain TCRrep from the "PA" rows of dash.csv, computes
    sparse rectangular distances to the reference frame of
    ``_default_sampler``, derives per-clone radii with ``calc_radii`` (capped
    at 26), adds neighbor columns to ``tr.clone_df`` and calls
    ``TCRpublic.report()``.
    """
    # Example that would work with a large background
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.background import sample_britanova
    from tcrdist.sample import _default_sampler

    df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop=True)

    tr = TCRrep(cell_df=df.copy(),
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    # A useful background for beta chain: the sampler's reference frame,
    # with its columns renamed to the names TCRrep is given here.
    ts = _default_sampler(organism="mouse", chain="beta")()
    trb = TCRrep(cell_df=ts.ref_df.rename(columns={'v_reps': 'v_b_gene',
                                                   'j_reps': 'j_b_gene',
                                                   'cdr3': 'cdr3_b_aa'}).copy(),
                 organism='mouse',
                 chains=['beta'],
                 db_file='alphabeta_gammadelta_db.tsv',
                 compute_distances=False)

    tr.cpus = 2
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=trb.clone_df, radius=50, chunk_size=100)

    from tcrdist.centers import calc_radii
    radii, thresholds, ecdfs = calc_radii(tr=tr, tr_bkgd=trb, chain='beta',
                                          ctrl_bkgd=10**-5, use_sparse=True, max_radius=50)

    # Set a maximum radius of 26.  A single .loc assignment is used instead
    # of chained indexing (df[col][mask] = value), which may write to a
    # temporary copy and leave clone_df unchanged.
    tr.clone_df['radius'] = radii
    tr.clone_df.loc[tr.clone_df['radius'] > 26, 'radius'] = 26

    # Quick access to publicity
    from tcrdist.public import _neighbors_sparse_variable_radius, _neighbors_variable_radius
    tr.clone_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta, radius_list=tr.clone_df['radius'])
    tr.clone_df['background_neighbors'] = _neighbors_sparse_variable_radius(
        csrmat=tr.rw_beta, radius_list=tr.clone_df['radius'])
    # Number of distinct subjects among each clone's neighbors.
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].apply(
        lambda x: tr.clone_df['subject'].iloc[x].nunique())
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].apply(lambda x: x > 1)

    # A Report
    from tcrdist.public import TCRpublic
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones.html")
    # Use the per-clone radii rather than one fixed radius.
    tp.fixed_radius = False
    rp = tp.report()
0
def test_old_example():
    """
    The purpose of this example is to show the use of
    choosing thresholds based on background discovery rate
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.regex import _index_to_regex_str, _index_to_seqs
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets

    # Background repertoire; every background clone gets weight 1.
    background_csv = os.path.join('tcrdist', 'data', 'covid19', "m60_bkgd_test_input.csv")
    tr_background = TCRrep(cell_df=pd.read_csv(background_csv),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)
    tr_background.clone_df['weights'] = 1

    # Target repertoire.
    target_csv = os.path.join('tcrdist', 'data', 'covid19', "m60_test_input.csv")
    tr = TCRrep(cell_df=pd.read_csv(target_csv),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    auto_pgen(tr)

    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    # One row of rw_beta per target clone.
    n_target_clones = tr.clone_df.shape[0]
    assert tr.rw_beta.shape[0] == n_target_clones

    even_thresholds = list(range(0, 50, 2))
    centers_df = bkgd_cntl_nn2(tr=tr,
                               tr_background=tr_background,
                               ctrl_bkgd=2 * 10**-5,
                               weights=tr_background.clone_df.weights,
                               col='cdr3_b_aa',
                               ncpus=2,
                               thresholds=even_thresholds,
                               generate_regex=True,
                               test_regex=True)

    # The sorted frame is not kept; this only exercises the call.
    centers_df.sort_values(['target_hits'], ascending=False)
Example #7
0
    def setUpClass(self):
        """Build the shared fixtures ``self.pw`` and ``self.clone_df``.

        ``self.pw`` is the beta-chain CDR3 pairwise matrix computed with the
        'nw' metric for the human M1-epitope clones; ``self.clone_df`` is the
        de-duplicated clone table with randomly drawn 'Visit' and 'Stim'
        columns (fixed seed, so the draw is reproducible).

        NOTE(review): unittest invokes setUpClass on the class, so this is
        presumably decorated with @classmethod above this definition --
        the decorator is not visible here; confirm.
        """
        filename = op.join(td.__path__[0], 'test_files',
                           'vdjDB_PMID28636592.tsv')
        pd_df = pd.read_csv(filename, sep='\t')
        t_df = td.mappers.vdjdb_to_tcrdist2(pd_df=pd_df)

        # Keep only human clones for the M1 epitope.
        t_df = t_df.loc[(t_df.organism == 'HomoSapiens')
                        & (t_df.epitope == 'M1')]

        tr = TCRrep(cell_df=t_df, organism='human')
        tr.infer_cdrs_from_v_gene(chain='alpha')
        tr.infer_cdrs_from_v_gene(chain='beta')
        tr.index_cols = ['subject', 'cdr3_b_aa']
        tr.deduplicate()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The keyword was previously misspelled "proceses", so the
            # intended single-process setting was never passed under the
            # right name.
            tr.compute_pairwise_all(chain='beta', metric='nw', processes=1)
        self.pw = tr.cdr3_b_aa_pw

        # Seed before drawing; the Visit draw must stay ahead of the Stim
        # draw to keep the generated labels unchanged.
        np.random.seed(110820)
        self.clone_df = tr.clone_df.assign(
            Visit=np.random.choice(['Pre', 'Post'],
                                   size=tr.clone_df.shape[0],
                                   p=[0.4, 0.6]),
            Stim=np.random.choice(['A', 'B', 'C'],
                                  size=tr.clone_df.shape[0],
                                  p=[0.4, 0.1, 0.5]))
Example #8
0
def test_introduction_3():
    """Example: gene-pairing plots for the PA and NP epitopes, then a Fisher test frame.

    Two SVGs are produced with ``plot_pairings`` and written to the current
    directory; ``fishersapi.fishers_frame`` is then run on the NP clones.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    tr = TCRrep(cell_df=pd.read_csv("dash.csv"),
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

    from tcrdist.plotting import plot_pairings, _write_svg

    def _pairing_svg(epitope):
        # Plot gene pairings for the clones of a single epitope.
        subset = tr.clone_df.loc[tr.clone_df.epitope == epitope]
        return plot_pairings(
            subset,
            cols=['v_b_gene', 'j_b_gene', 'j_a_gene', 'v_a_gene'],
            count_col='count')

    # Both figures are rendered before either is written to disk.
    svg_PA = _pairing_svg("PA")
    svg_NP = _pairing_svg("NP")

    outputs = ((svg_PA, "PA_gene_usage_plot.svg"),
               (svg_NP, "NP_gene_usage_plot.svg"))
    for svg, svg_name in outputs:
        _write_svg(svg, name=svg_name, dest=".")

    import fishersapi
    gene_pairs = [('v_b_gene', 'j_b_gene'),
                  ('v_a_gene', 'j_a_gene'),
                  ('v_a_gene', 'v_b_gene'),
                  ('j_a_gene', 'j_b_gene')]
    np_clones = tr.clone_df.loc[tr.clone_df.epitope == "NP"]
    fishersapi.fishers_frame(np_clones, col_pairs=gene_pairs)
Example #9
0
def test_TCRrep_func_tcrdist2_save_manual_rebuild(chains=None, metric="nw"):
    """Save a TCRrep via ``tcrdist2(save=True)`` and restore it with ``Zipdist2._build``.

    After the manual rebuild, ``paired_tcrdist`` and ``pw_tcrdist`` must both
    be numpy arrays and must be equal.

    Parameters
    ----------
    chains : list of str, optional
        Chains passed to TCRrep; defaults to ``['beta', 'alpha']``.
    metric : str
        Metric name forwarded to ``tcrdist2``.
    """
    import shutil

    # Avoid a shared mutable default argument.
    if chains is None:
        chains = ['beta', 'alpha']

    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes=cpu, metric=metric, dump=True, reduce=True, save=True,
                dest="default_archive", dest_tar_name="default_archive.tar.gz")
    # Cleanup folder that you just made (portable replacement for `rm -rf`).
    # NOTE(review): tcrdist2 is given dest="default_archive" but the folder
    # removed here is "myTCRrep_archive" -- confirm which one is created.
    shutil.rmtree("myTCRrep_archive", ignore_errors=True)
    # Rebuild into an empty TCRrep (zero-row frame with the same columns).
    tr = TCRrep(cell_df=df.iloc[0:0, :], chains=chains, organism='mouse')
    z = Zipdist2(name="test_only", target=tr)
    z._build(dest_tar="default_archive.tar.gz", target=tr)
    assert isinstance(tr.paired_tcrdist, np.ndarray)
    assert isinstance(tr.pw_tcrdist, np.ndarray)
    assert np.array_equal(tr.pw_tcrdist, tr.paired_tcrdist)
Example #10
0
    def test_repertoire_full_use_case(self):
        """
        Not a unit test per se: a use-case walk-through of a TCRrep()
        instance used for pairwise sequence comparison. The summed alpha and
        beta CDR3 pairwise matrices must equal a fixed 9x9 matrix.
        """
        rep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])

        # Clones are additionally keyed by epitope and subject.
        for extra_col in ("epitope", "subject"):
            rep.index_cols.append(extra_col)
        rep.deduplicate()

        # Both CDR3 regions use the blosum62 substitution matrix.
        rep.cdr3_a_aa_smat = 'blosum62'
        rep.cdr3_b_aa_smat = 'blosum62'

        for chain_name in ("alpha", "beta"):
            rep.compute_pairwise(chain=chain_name)

        observed = rep.cdr3_a_aa_pw + rep.cdr3_b_aa_pw

        expected = np.array([
            [0., 222., 210., 223., 231., 239., 219., 231., 175.],
            [222., 0., 116., 175., 173., 185., 131., 209., 205.],
            [210., 116., 0., 175., 169., 183., 145., 201., 221.],
            [223., 175., 175., 0., 154., 200., 162., 234., 202.],
            [231., 173., 169., 154., 0., 152., 120., 182., 192.],
            [239., 185., 183., 200., 152., 0., 146., 112., 192.],
            [219., 131., 145., 162., 120., 146., 0., 178., 172.],
            [231., 209., 201., 234., 182., 112., 178., 0., 220.],
            [175., 205., 221., 202., 192., 192., 172., 220., 0.],
        ])

        matches = observed == expected
        self.assertTrue(matches.all())
Example #11
0
    def test_repertoire____use_case_hamming_paired_tcrdist(self):
        """
        Use case: hamming distances over every CDR of both chains, combined
        by compute_paired_tcrdist, must reproduce a fixed 9x9 matrix.
        """
        tr = TCRrep(cell_df=example_df.copy(), organism="human", chains=["alpha", "beta"])
        for chain_name in ("alpha", "beta"):
            tr.infer_cdrs_from_v_gene(chain=chain_name)

        tr.index_cols = [
            'cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
            'cdr3_b_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
        ]
        tr.deduplicate()

        for chain_name in ("alpha", "beta"):
            tr.compute_pairwise_all(chain=chain_name, metric="hamming")
        r = tr.compute_paired_tcrdist(chains=['alpha', 'beta'])

        expected_paired = np.array([
            [0., 50., 49., 48., 49., 49., 51., 44., 46.],
            [50., 0., 21., 29., 29., 47., 40., 45., 44.],
            [49., 21., 0., 42., 39., 47., 42., 44., 50.],
            [48., 29., 42., 0., 14., 35., 48., 52., 46.],
            [49., 29., 39., 14., 0., 30., 45., 49., 44.],
            [49., 47., 47., 35., 30., 0., 46., 43., 48.],
            [51., 40., 42., 48., 45., 46., 0., 36., 41.],
            [44., 45., 44., 52., 49., 43., 36., 0., 47.],
            [46., 44., 50., 46., 44., 48., 41., 47., 0.],
        ])
        # Kept as a record of the expected weights; only the distance matrix
        # is asserted below.
        expected = {
            'paired_tcrdist': expected_paired,
            'paired_tcrdist_weights': {
                'cdr1_a_aa_pw': 1,
                'cdr1_b_aa_pw': 1,
                'cdr2_a_aa_pw': 1,
                'cdr2_b_aa_pw': 1,
                'cdr3_a_aa_pw': 1,
                'cdr3_b_aa_pw': 1,
                'pmhc_a_aa_pw': 1,
                'pmhc_b_aa_pw': 1,
            },
        }

        agreement = r['paired_tcrdist'] == expected['paired_tcrdist']
        self.assertTrue(agreement.all())
Example #12
0
def test_gamma_delta_manually_step_by_step():
    """
    Test that the user can go through the automatic
    steps, manually, one-by-one for well formatted
    gamma delta data.
    """
    df = pd.read_csv("sant.csv")
    # Every automatic step is switched off so each can be invoked by hand.
    tr = TCRrep(cell_df=df,
                organism="human",
                chains=['gamma', "delta"],
                imgt_aligned=False,
                infer_cdrs=False,
                infer_index_cols=False,
                deduplicate=False,
                use_defaults=False,
                store_all_cdr=False,
                compute_distances=False,
                cpus=1,
                db_file='alphabeta_gammadelta_db.tsv')

    tr.infer_cdrs_from_v_gene(chain="gamma")
    tr.infer_cdrs_from_v_gene(chain="delta")
    tr.infer_index_cols()
    tr.show_incomplete()
    tr.deduplicate()
    tr._initialize_chain_specific_attributes()
    # Previously misspelled as "stora_all_cdr", which created an unrelated
    # attribute and left the constructor's store_all_cdr=False in effect.
    tr.store_all_cdr = True
    tr.compute_distances()
    # Attribute access only: raises AttributeError if pw_gamma was not set.
    tr.pw_gamma
Example #13
0
def test_validate_imgt_aligned():
    """Test that a non-boolean imgt_aligned argument raises ValueError"""
    expected_message = "TCRrep imgt_aligned argument must be a boolean"
    with pytest.raises(ValueError) as excinfo:
        TCRrep(organism="mouse",
               chains=["alpha", "beta"],
               imgt_aligned="align")
    raised_message = str(excinfo.value)
    assert raised_message == expected_message
Example #14
0
def test_validate_chains():
    """Test that an incorrectly cased chain name raises ValueError"""
    expected_message = "TCRrep chains arg can be one or more of the following ['alpha', 'beta', 'gamma', 'delta'] case-sensitive"
    with pytest.raises(ValueError) as excinfo:
        TCRrep(organism="mouse", chains=["ALPHA", "beta"])
    raised_message = str(excinfo.value)
    assert raised_message == expected_message
Example #15
0
    def test_repertoire_full_use_case_hamming(self):
        """
        Not a unit test per se: a use-case walk-through of a TCRrep()
        instance used for pairwise sequence comparison with the hamming
        metric. The summed alpha and beta CDR3 pairwise matrices must equal
        a fixed 9x9 matrix.
        """
        rep = TCRrep(cell_df=example_df, chains=["alpha", "beta"])

        # Clones are additionally keyed by epitope and subject.
        for extra_col in ("epitope", "subject"):
            rep.index_cols.append(extra_col)
        rep.deduplicate()

        rep.cdr3_a_aa_smat = 'blosum62'
        rep.cdr3_b_aa_smat = 'blosum62'

        for chain_name in ("alpha", "beta"):
            rep.compute_pairwise_all(chain=chain_name, metric="hamming")

        observed = rep.cdr3_a_aa_pw + rep.cdr3_b_aa_pw

        expected = np.array([
            [0., 18., 17., 18., 19., 22., 19., 18., 16.],
            [18., 0., 11., 15., 15., 17., 10., 18., 18.],
            [17., 11., 0., 18., 15., 17., 13., 18., 20.],
            [18., 15., 18., 0., 14., 19., 14., 20., 18.],
            [19., 15., 15., 14., 0., 14., 11., 17., 16.],
            [22., 17., 17., 19., 14., 0., 14., 13., 18.],
            [19., 10., 13., 14., 11., 14., 0., 17., 15.],
            [18., 18., 18., 20., 17., 13., 17., 0., 19.],
            [16., 18., 20., 18., 16., 18., 15., 19., 0.],
        ])

        matches = observed == expected
        self.assertTrue(matches.all())
Example #16
0
def test_mixcr_to_tcrdist_on_clones():
    """Convert a MiXCR clones file to a tcrdist frame and de-duplicate it in a TCRrep.

    Checks that both the converted frame and the frame returned by
    ``remove_entries_with_invalid_vgene`` are DataFrames, and that
    ``deduplicate()`` leaves a DataFrame in ``tr.clone_df``.
    """
    test_clones = os.path.join('tcrdist', 'test_files_compact',
                               'SRR5130260.1.test.fastq.output.clns.txt')
    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 clones_fn=test_clones)

    assert isinstance(df, pd.DataFrame)
    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    # Check the filtered frame itself; this assertion used to re-check `df`.
    assert isinstance(df1, pd.DataFrame)
    df1['subject'] = 'SRR5130260.1'

    tr = TCRrep(cell_df=df1,
                organism="human",
                chains=['delta'],
                db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])

    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)

    tr.index_cols = [
        'subject', "v_d_gene", 'd_d_gene', 'j_d_gene', 'cdr3_d_nucseq',
        'cdr3_d_aa', 'cdr1_d_aa', 'cdr2_d_aa', 'pmhc_d_aa'
    ]

    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
Example #17
0
def test_import_vdjtools_beta_w_validation():
    """Import a VDJtools beta-chain file with validation and load it into a TCRrep.

    With ``validate=True`` the imported frame is expected to carry the three
    ``valid_*`` columns after the six data columns.
    """
    import pandas as pd
    import numpy as np
    import os
    from tcrdist.paths import path_to_base
    from tcrdist.vdjtools_funcs import import_vdjtools
    from tcrdist.repertoire import TCRrep

    # Reformat vdj_tools input format for tcrdist3
    vdj_tools_file_beta = os.path.join(
        path_to_base, 'tcrdist', 'data', 'formats',
        'vdj.M_15_CD8_beta.clonotypes.TRB.txt.gz')
    df_beta = import_vdjtools(vdj_tools_file=vdj_tools_file_beta,
                              chain='beta',
                              organism='human',
                              db_file='alphabeta_gammadelta_db.tsv',
                              validate=True)

    expected_columns = [
        'count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'cdr3_b_nucseq',
        'valid_v', 'valid_j', 'valid_cdr3'
    ]
    assert np.all(df_beta.columns == expected_columns)

    # Can be directly imported into a TCRrep instance (only the five
    # columns selected here are passed on).
    rep_columns = ['count', 'freq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene']
    tr = TCRrep(cell_df=df_beta[rep_columns],
                chains=['beta'],
                organism='human',
                compute_distances=False)
Example #18
0
def test_example_10_sparse_multiprocessing():
    """Sparse rectangular distances must agree with the dense ones up to the radius.

    Dense ``rw_alpha``/``rw_beta`` are computed first and copied; the sparse
    computation then overwrites them, and after ``pw2dense`` all entries at
    or below the radius are compared.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import pw2dense
    import numpy as np

    df2 = pd.read_csv("dash2.csv")
    tr = TCRrep(cell_df=df2,
                df2=df2,
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Dense reference computation.
    tr.compute_rect_distances(df=tr.clone_df, df2=df2)
    expected_shape = (1924, 1924)
    assert tr.rw_alpha.shape == expected_shape
    assert tr.rw_beta.shape == expected_shape

    dense_alpha = tr.rw_alpha.copy()
    dense_beta = tr.rw_beta.copy()

    # Sparse computation on two cpus.
    radius = 150
    tr.cpus = 2
    tr.compute_sparse_rect_distances(df=tr.clone_df, df2=df2, radius=radius)

    from_sparse_alpha = pw2dense(tr.rw_alpha, radius)
    assert np.all(dense_alpha[dense_alpha <= radius] ==
                  from_sparse_alpha[from_sparse_alpha <= radius])

    from_sparse_beta = pw2dense(tr.rw_beta, radius)
    assert np.all(dense_beta[dense_beta <= radius] ==
                  from_sparse_beta[from_sparse_beta <= radius])
Example #19
0
def test_ex5():
    """Example: ``tcrdist2`` with custom replacement weights (CDR3 weighted 3, others 1).

    Verifies that ``pw_tcrdist`` is the sum of the two chain matrices and
    that each chain matrix is the weighted sum of its per-region matrices.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    df = pd.read_csv('dash.csv')
    pa_only = df[df.epitope.isin(['PA'])]
    tr = TCRrep(cell_df=pa_only, chains=['alpha', 'beta'], organism='mouse')

    region_weights = {
        'cdr3_a_aa': 3,
        'pmhc_a_aa': 1,
        'cdr2_a_aa': 1,
        'cdr1_a_aa': 1,
        'cdr3_b_aa': 3,
        'pmhc_b_aa': 1,
        'cdr2_b_aa': 1,
        'cdr1_b_aa': 1
    }
    tr.tcrdist2(processes=1,
                metric='hamming',
                reduce=True,
                dump=False,
                save=False,
                replacement_weights=region_weights)

    # Total distance is alpha plus beta.
    chain_sum = tr.pw_alpha + tr.pw_beta
    assert np.all(tr.pw_tcrdist == chain_sum)

    # Each chain: CDR3 counted three times, the other regions once.
    weighted_beta = (3 * tr.cdr3_b_aa_pw + tr.pmhc_b_aa_pw +
                     tr.cdr2_b_aa_pw + tr.cdr1_b_aa_pw)
    assert np.all(tr.pw_beta == weighted_beta)

    weighted_alpha = (3 * tr.cdr3_a_aa_pw + tr.pmhc_a_aa_pw +
                      tr.cdr2_a_aa_pw + tr.cdr1_a_aa_pw)
    assert np.all(tr.pw_alpha == weighted_alpha)
Example #20
0
def test_olga_sample_alphas_for_a_human_repertoire():
    """Example: generate one CDR3 per clone with OLGA, for the beta and then the alpha chain.

    For every row of ``tr.clone_df`` the V and J gene names are passed
    through ``allele_01`` and handed to ``OlgaModel.gen_cdr3s`` with ``n=1``.
    """
    import re
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif

    from tcrdist.pgen import OlgaModel
    olga_model_alpha = OlgaModel(recomb_type="VJ",
                                 chain_folder="human_T_alpha")
    olga_model_beta = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    def _one_cdr3_per_clone(model, v_col, j_col):
        # One gen_cdr3s call per clone row, in clone_df order.
        generated = []
        for _, row in tr.clone_df[[v_col, j_col]].iterrows():
            generated.append(
                model.gen_cdr3s(V=allele_01(row[v_col]),
                                J=allele_01(row[j_col]),
                                n=1))
        return generated

    # Beta is sampled before alpha.
    rb = _one_cdr3_per_clone(olga_model_beta, 'v_b_gene', 'j_b_gene')
    ra = _one_cdr3_per_clone(olga_model_alpha, 'v_a_gene', 'j_a_gene')
Example #21
0
def test_integration_TCRrep_with_TCRMotif():
    """Integration: build a mouse TCRrep from a VDJdb export and hand its clones to TCRMotif."""
    import pandas as pd
    import tcrdist as td
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif

    # Load the VDJdb export and map it to the tcrdist2 column layout.
    vdjdb_path = os.path.join("tcrdist", "test_files", "vdjDB_PMID28636592.tsv")
    raw_df = pd.read_csv(vdjdb_path, sep="\t")
    mapped_df = td.mappers.vdjdb_to_tcrdist2(pd_df=raw_df)
    # Attribute lookup only; the method is not called here.
    mapped_df.organism.value_counts

    # Keep the mouse rows.
    is_mouse = mapped_df.organism == "MusMusculus"
    mouse_df = mapped_df.loc[is_mouse, :].copy()

    tr = TCRrep(cell_df=mouse_df, organism="mouse")

    for chain_name in ('alpha', 'beta'):
        tr.infer_cdrs_from_v_gene(chain=chain_name)

    tr.index_cols = [
        # subject and epitope
        'subject', 'epitope',
        # gene usage
        'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
        # CDR 3
        'cdr3_a_aa', 'cdr3_b_aa',
        # alpha CDR 1, 2, and 2.5
        'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
        # beta CDR 1, 2, and 2.5
        'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
    ]

    tr.deduplicate()

    motif_input = tr.tcr_motif_clones_df()
    motif = TCRMotif(clones_df=motif_input, organism="mouse", chains=["A", "B"], epitopes=["PA"])
    assert isinstance(motif.clones_df, pd.DataFrame)
Example #22
0
def test_pgen_1():
    """
    How to add pgen estimates to human alpha/beta CDR3s
    """
    import pandas as pd
    from tcrdist.pgen import OlgaModel
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.setup_tests import download_and_extract_zip_file

    df = pd.read_csv("dash_human.csv")

    # Five rows sampled with a fixed seed keep the example small and
    # reproducible.
    tr = TCRrep(cell_df=df.sample(5, random_state=3),
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False)

    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

    tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_b_aa)

    tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
        CDR3_seq=tr.clone_df.cdr3_a_aa)

    # Column selection only; raises KeyError if a pgen column is missing.
    tr.clone_df[['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa', 'pgen_cdr3_a_aa']]
Example #23
0
def test_workflow_1():
    """
    Load all the TCRs associated with a particular epitope in
    the Adaptive Biotechnology COVID19 Data Release 2
    """
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    data_dir = os.path.join('tcrdist', 'data', 'covid19')
    epitope_file = 'mira_epitope_9_2477_FLQSINFVR_FLQSINFVRI_FLYLYALVYF_GLEAPFLYLY_INFVRIIMR_LQSINFVRI_LQSINFVRII_QSINFVRII_SINFVRIIMR_VYFLQSINF_VYFLQSINFV_YFLQSINFVR_YLYALVYFL.tcrdist3.csv'
    filename = os.path.join(data_dir, epitope_file)

    # Columns carried into the TCRrep: clone fields first, then subject
    # metadata and HLA fields.
    keep_columns = [
        'cell_type', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope',
        'age', 'sex', 'race', 'cohort', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1',
        'hla-c', 'hla-c_1', 'dpa1_1', 'dpb1', 'dpb1_1', 'dqa1', 'dqa1_1',
        'dqb1', 'dqb1_1', 'drb1', 'drb1_1', 'drb3'
    ]
    df = pd.read_csv(filename, sep=",")
    df = df[keep_columns]

    # Every row counts as a single observation.
    df['count'] = 1

    tr = TCRrep(cell_df=df, organism='human', chains=['beta'])
Example #24
0
def test_ex11():
    """Example: rebuild a TCRrep from a saved tar.gz archive, starting from an empty frame."""
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep

    empty_cells = pd.DataFrame()
    tr = TCRrep(cell_df=empty_cells,
                chains=['alpha', 'beta'],
                organism='mouse')
    archive_name = "some_archive.tar.gz"
    tr.rebuild(dest_tar_name=archive_name)
Example #25
0
def test_tcr_join_tcrdist():
    """
    Join two beta-chain repertoires by sparse rectangular TCR distance.

    Two TCRrep objects are built (without computing distances) from the
    DataFrames v20df and v21df, which are not defined in this function and
    must be available in the enclosing scope. Rectangular sparse distances
    from the v21 clones to the v20 clones are computed with radius 36, and
    the resulting tr21.rw_beta matrix is handed to join_by_dist with
    radius 24 and max_n 10.
    """
    import pandas as pd
    from tcrdist.breadth import get_safe_chunk
    from tcrdist.repertoire import TCRrep
    from tcrdist.join import join_by_dist

    # Columns passed to TCRrep and columns carried through the join
    # (same names, different order).
    rep_cols = [
        'subject', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'bio_identity',
        'protein_coordinate'
    ]
    join_cols = [
        'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'protein_coordinate',
        'bio_identity', 'subject'
    ]

    def _beta_rep(frame):
        # Human beta-chain TCRrep on a copy of the selected columns.
        return TCRrep(cell_df=frame[rep_cols].copy(),
                      organism='human',
                      chains=['beta'],
                      compute_distances=False)

    tr20 = _beta_rep(v20df)
    tr21 = _beta_rep(v21df)

    tr21.cpus = 2
    n_rows = tr21.clone_df.shape[0]
    n_cols = tr20.clone_df.shape[0]
    tr21.compute_sparse_rect_distances(df=tr21.clone_df,
                                       df2=tr20.clone_df,
                                       radius=36,
                                       chunk_size=get_safe_chunk(n_rows, n_cols))

    # Each side receives its own list object, as in a literal-per-argument call.
    joined = join_by_dist(how='inner',
                          csrmat=tr21.rw_beta,
                          left_df=v21df,
                          right_df=v20df,
                          left_cols=list(join_cols),
                          right_cols=list(join_cols),
                          left_suffix='_x',
                          right_suffix='_y',
                          max_n=10,
                          radius=24)
Example #26
0
def run_TCRrep_func_tcrdist2(chains = ['beta','alpha'], metric = "nw"):
    """
    Build a mouse TCRrep from the 'NP' epitope rows of dash.csv, run
    tcrdist2 on it without saving, and return it.

    Parameters
    ----------
    chains : list of str
        Chains passed to TCRrep.
    metric : str
        Metric name passed to TCRrep.tcrdist2.

    Returns
    -------
    TCRrep
        The instance after tcrdist2 has been called with dump=True,
        reduce=True and save=False.
    """
    # The default list is created once, at definition time. Hand TCRrep a
    # private copy so nothing done to the instance's chains can leak into
    # later calls that rely on the default.
    chains = list(chains)

    cpu = multiprocessing.cpu_count()
    # really basic example
    df = pd.read_csv(opj('tcrdist', 'datasets', 'dash.csv'))
    df = df[df.epitope.isin(['NP'])]
    tr = TCRrep(cell_df=df, chains=chains, organism='mouse')
    tr.tcrdist2(processes=cpu, metric=metric, dump=True, reduce=True, save=False)
    return tr
Example #27
0
def motif_creation_human_betas():
    """
    Write CDR3-beta motif logos for human TCR neighborhoods to "test_3.svg".

    A human alpha/beta TCRrep is built from dash_human.csv and
    get_basic_centroids(tr, max_dist=75) is called on it. For each row of
    tr.centroids_df, three palmotif logos are written, each followed by an
    empty <div>, and then the string form of the centroid row:

    1. neighbors' CDR3s against generated reference CDR3s,
    2. neighbors' CDR3s with no references,
    3. the reference CDR3s (plus the centroid) on their own.

    Reference CDR3s are generated with the OLGA beta model, one batch per
    (V gene, J gene) pair seen among the neighbors, sized at `depth` times
    the number of neighbors using that pair.

    The helpers `flatten` and `allele_01` are not defined in this function
    and must be available in the enclosing scope.
    """
    import pandas as pd
    import palmotif
    from tcrdist.repertoire import TCRrep
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    get_basic_centroids(tr, max_dist=75)

    # Number of generated reference CDR3s per observed neighbor.
    depth = 3

    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # Stops at the first centroid with fewer than 5 neighbors.
            # NOTE(review): this presumably relies on centroids_df being
            # ordered by neighborhood size - confirm.
            if len(r['neighbors']) < 5:
                break

            neighbors = tr.clone_df.iloc[r['neighbors'], ]
            seqs = neighbors['cdr3_b_aa'].to_list()

            # Rows of [v_b_gene, j_b_gene, count] for the neighbors.
            gene_usages = neighbors[[
                'v_b_gene', 'j_b_gene'
            ]].value_counts().reset_index().to_dict('split')['data']

            # Generate references from the observed V/J usage computed above.
            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), n * depth)
                for v, j, n in gene_usages
            ])
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # The centroid is added to the references before the background
            # logo is computed from them.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]

            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')

        oh.write('</body>')
Example #28
0
def test_example_9():
    """
    Compute 'tcrdistances' on an existing clones table whose CDR columns
    carry custom names.

    Steps (numbers match the inline markers):
    1. pass the table to TCRrep as clone_df
    2. set infer_cdrs = False
    3. set compute_distances = False
    4. set deduplicate = False
    5. re-key the metrics, weights, and kargs dictionaries so they use
       the custom column names
    6. call .compute_distances()
    """
    import pwseqdist as pw
    import pandas as pd
    from tcrdist.repertoire import TCRrep

    # Default column name -> custom column name.
    new_cols = dict(
        cdr3_a_aa='c3a',
        pmhc_a_aa='pa',
        cdr2_a_aa='c2a',
        cdr1_a_aa='c1a',
        cdr3_b_aa='c3b',
        pmhc_b_aa='pb',
        cdr2_b_aa='c2b',
        cdr1_b_aa='c1b',
    )

    df = pd.read_csv("dash2.csv")
    df = df.rename(columns=new_cols)

    tr = TCRrep(
        cell_df=df,
        clone_df=df,  #(1)
        organism='mouse',
        chains=['alpha', 'beta'],
        infer_all_genes=True,
        infer_cdrs=False,  #(2)
        compute_distances=False,  #(3)
        deduplicate=False,  #(4)
        db_file='alphabeta_gammadelta_db.tsv')

    def customize(d):  #(5)
        # Same values, keys translated to the custom column names.
        return {new_cols[k]: v for k, v in d.items()}

    for attr in ('metrics_a', 'metrics_b',
                 'weights_a', 'weights_b',
                 'kargs_a', 'kargs_b'):
        setattr(tr, attr, customize(getattr(tr, attr)))

    tr.compute_distances()  #(6)

    # Notice that pairwise results now have custom names
    for result in ('pw_c3b', 'pw_c3a', 'pw_alpha', 'pw_beta'):
        getattr(tr, result)
Example #29
0
def test_alpha_beta():
    """
    Check the out-of-memory sparse distance and tally routines against
    the in-memory results for a paired alpha/beta repertoire.

    A 100-row sample of dash.csv is loaded with distances computed. The
    pairwise beta and alpha matrices are copied with zeros replaced by 1,
    then compute_pw_sparse_out_of_memory2 is run with assign=True and the
    matrices now on `tr` are asserted equal to those copies. Finally the
    neighborhood tallies from compute_n_tally_out_of_memory2 (followed by
    cluster_association_test) are compared with neighborhood_diff run on
    the summed dense matrices.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory2
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory2
    from tcrdist.rep_diff import neighborhood_diff
    from hierdiff.association_testing import cluster_association_test

    df = pd.read_csv("dash.csv")
    tr = TCRrep(cell_df=df.sample(100, random_state=1),
                organism='mouse',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True,
                store_all_cdr=False)

    # Expected matrices: the in-memory distances with every 0 replaced by 1.
    check_beta = tr.pw_beta.copy()
    check_beta[check_beta == 0] = 1
    check_alpha = tr.pw_alpha.copy()
    check_alpha[check_alpha == 0] = 1

    # Only the fragments are needed below; with assign=True the matrices
    # compared in the next two asserts are read back from `tr`.
    _, fragments = compute_pw_sparse_out_of_memory2(tr=tr,
                                                    row_size=50,
                                                    pm_processes=1,
                                                    pm_pbar=True,
                                                    max_distance=1000,
                                                    reassemble=True,
                                                    cleanup=False,
                                                    assign=True)

    assert np.all(tr.pw_beta == check_beta)
    assert np.all(tr.pw_alpha == check_alpha)

    ndif1 = compute_n_tally_out_of_memory2(fragments,
                                           to_file=False,
                                           to_memory=True,
                                           pm_processes=2,
                                           x_cols=['epitope'],
                                           count_col='count',
                                           knn_neighbors=None,
                                           knn_radius=100)
    ndif1 = cluster_association_test(res=ndif1, y_col='cmember', method='chi2')

    ndif2 = neighborhood_diff(clone_df=tr.clone_df,
                              pwmat=np.array(tr.pw_beta.todense() +
                                             tr.pw_alpha.todense()),
                              count_col='count',
                              x_cols=['epitope'],
                              knn_radius=100,
                              test_method="chi2")

    assert ndif1.shape == ndif2.shape
    # Compare the two results with each other and actually assert it.
    # NOTE(review): this assumes both results list neighborhoods in the
    # same row order - confirm if this assertion ever fails.
    assert np.allclose(ndif1['FDRq'].to_numpy(dtype=float),
                       ndif2['FDRq'].to_numpy(dtype=float),
                       equal_nan=True)
Example #30
0
def test_calc_radii_if_big():
	"""
	Compute per-clone radii and neighbor sets using sparse distance matrices.

	Target clones are the dash.csv rows with epitope "PA"; background clones
	are all other rows. Both are loaded as mouse beta-chain TCRrep objects
	without computing distances. The steps are:

	1. compute sparse target-vs-target distances (radius 50) and keep a
	   copy as tr.pw_beta,
	2. call calc_radii with the target and background repertoires and store
	   the returned radii in tr.clone_df['radius'],
	3. collect, per clone, the neighbors within its radius in tr.pw_beta
	   and in tr.rw_beta,
	4. add 'nsubject' (number of distinct subjects among a clone's
	   neighbors) and 'qpublic' (nsubject > 1) columns.
	"""
	import numpy as np
	import pandas as pd
	from tcrdist.repertoire import TCRrep
	from tcrdist.centers import calc_radii
	from tcrdist.public import _neighbors_sparse_variable_radius
	# Target repertoire: clones for the "PA" epitope.
	df = pd.read_csv("dash.csv").query('epitope == "PA"').reset_index(drop = True)
	tr = TCRrep(cell_df = df.copy(), 
	            organism = 'mouse', 
	            chains = ['beta'], 
	            db_file = 'alphabeta_gammadelta_db.tsv',
	            compute_distances = False)
	# For a large matrix one can use compute_sparse_rect_distances() instead of
	# .compute_distances() for the pairwise square matrix
	tr.cpus = 2
	tr.compute_sparse_rect_distances(df = tr.clone_df, radius=50,chunk_size=100)
	# Keep the target-vs-target result under pw_beta; rw_beta is reset below.
	tr.pw_beta = tr.rw_beta.copy()

	# Background repertoire: clones for every epitope other than "PA".
	dfb = pd.read_csv("dash.csv").query('epitope != "PA"').reset_index(drop = True)
	tr_bkgd = TCRrep(cell_df = dfb.copy(), 
	            organism = 'mouse', 
	            chains = ['beta'], 
	            db_file = 'alphabeta_gammadelta_db.tsv',
                compute_distances = False)
	
	# Set rw_beta to None as it will be computed between target and background by calc_radii
	# NOTE(review): tr.rw_beta is read again below, so calc_radii is presumably
	# expected to repopulate it as a side effect - confirm.
	tr.rw_beta = None
	from tcrdist.centers import calc_radii
	radii, thresholds, ecdfs = calc_radii(tr = tr, tr_bkgd = tr_bkgd, chain = 'beta', ctrl_bkgd = 10**-3, use_sparse = True, max_radius=50)
	tr.clone_df['radius'] = radii

	from tcrdist.public import _neighbors_sparse_variable_radius
	# Compute neighbors <= variable radius in the background set and the foreground set
	neighbors     = _neighbors_sparse_variable_radius(csrmat = tr.pw_beta , radius_list = tr.clone_df['radius'])
	background_neighbors = _neighbors_sparse_variable_radius(csrmat = tr.rw_beta , radius_list = tr.clone_df['radius'])
	#tr.clone_df['radius']               = radii
	tr.clone_df['neighbors']            = neighbors
	tr.clone_df['background_neighbors'] = background_neighbors
	# Number of distinct subjects among each clone's neighbor rows.
	tr.clone_df['nsubject']             = tr.clone_df['neighbors'].\
			apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
	# True when the neighbors span more than one subject.
	tr.clone_df['qpublic']              = tr.clone_df['nsubject'].\
			apply(lambda x: x > 1)
	tr.clone_df