def test_mixcr_to_tcrdist_on_clones():
    """Port a MiXCR clones file to a delta-chain TCRrep and deduplicate it.

    Steps exercised:
      1. ``mixcr.mixcr_to_tcrdist2`` on the compact test clones file.
      2. ``mixcr.remove_entries_with_invalid_vgene`` on the result.
      3. ``TCRrep`` construction with ``db_file='gammadelta_db.tsv'``,
         CDR inference from the V gene, and ``deduplicate()``.

    The test passes when both intermediate results are DataFrames and
    ``tr.clone_df`` is a DataFrame after deduplication.
    """
    test_clones = os.path.join(
        'tcrdist',
        'test_files_compact',
        'SRR5130260.1.test.fastq.output.clns.txt',
    )
    df = mixcr.mixcr_to_tcrdist2(chain="delta", organism="human", clones_fn=test_clones)
    assert isinstance(df, pd.DataFrame)

    df1 = mixcr.remove_entries_with_invalid_vgene(df, chain="delta", organism="human")
    # Validate the *filtered* frame; checking `df` a second time would leave
    # the output of remove_entries_with_invalid_vgene unverified.
    assert isinstance(df1, pd.DataFrame)

    # A 'subject' column is added because it is one of the index columns below.
    df1['subject'] = 'SRR5130260.1'

    tr = TCRrep(cell_df=df1, organism="human", chains=['delta'], db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])

    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)
    tr.index_cols = [
        'subject',
        "v_d_gene",
        'd_d_gene',
        'j_d_gene',
        'cdr3_d_nucseq',
        'cdr3_d_aa',
        'cdr1_d_aa',
        'cdr2_d_aa',
        'pmhc_d_aa',
    ]
    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
def test_mixcr_integration_with_wrong_chain():
    """Check that requesting the gamma chain leaves no rows after V-gene filtering.

    The compact test clones file is converted with ``chain="gamma"`` and then
    passed through ``mixcr.remove_entries_with_invalid_vgene``; the filtered
    result is expected to have zero rows.
    """
    requested_chain = "gamma"
    species = "human"
    clones_path = os.path.join(
        'tcrdist',
        'test_files_compact',
        'SRR5130260.1.test.fastq.output.clns.txt',
    )

    converted = mixcr.mixcr_to_tcrdist2(
        chain=requested_chain,
        organism=species,
        seqs_fn=None,
        clones_fn=clones_path,
    )
    surviving = mixcr.remove_entries_with_invalid_vgene(
        converted,
        chain=requested_chain,
        organism=species,
    )

    remaining_rows = surviving.shape[0]
    assert remaining_rows == 0
def test_mixcr_integration_with_correct_chain():
    """Check that the delta chain keeps 89 rows after V-gene filtering.

    The compact test clones file is converted with ``chain="delta"`` and then
    passed through ``mixcr.remove_entries_with_invalid_vgene``. Both the
    converted and the filtered results must be DataFrames, and the filtered
    result must contain exactly 89 rows.
    """
    test_clones_fn = os.path.join(
        'tcrdist',
        'test_files_compact',
        'SRR5130260.1.test.fastq.output.clns.txt',
    )
    df = mixcr.mixcr_to_tcrdist2(
        chain="delta", organism="human", seqs_fn=None, clones_fn=test_clones_fn
    )
    assert isinstance(df, pd.DataFrame)

    df1 = mixcr.remove_entries_with_invalid_vgene(df, chain="delta", organism="human")
    # Validate the *filtered* frame; asserting on `df` again would not cover
    # the value returned by remove_entries_with_invalid_vgene.
    assert isinstance(df1, pd.DataFrame)
    assert df1.shape[0] == 89
def test_convert_minervina_to_mixcr_run_tcrdist(f, my_chain):
    """Convert one Minervina-style clone table to MiXCR layout and run tcrdist.

    Parameters
    ----------
    f : str
        File name of a tab-separated table under ``tcrdist/test_files``.
    my_chain : str
        ``"alpha"`` or ``"beta"``.

    NOTE(review): ``f`` and ``my_chain`` are presumably supplied by a pytest
    parametrization or fixtures defined outside this block - confirm.

    Raises
    ------
    ValueError
        If ``my_chain`` is neither ``"alpha"`` nor ``"beta"``.

    The test renames the input columns to MiXCR names, writes a faux MiXCR
    clones file, ports it with ``mixcr.mixcr_to_tcrdist2``, filters invalid
    V genes and CDR3s, builds a ``TCRrep``, deduplicates, and runs the legacy
    tcrdist method for the chain.
    """
    import tempfile

    # Single-letter chain code used in tcrdist column names (v_a_gene, v_b_gene, ...).
    chain_letter = {"alpha": "a", "beta": "b"}.get(my_chain)
    if chain_letter is None:
        # Fail up front with a clear message instead of hitting an unbound
        # `tr` further down.
        raise ValueError(
            "my_chain must be 'alpha' or 'beta', got {!r}".format(my_chain)
        )

    fn = os.path.join('tcrdist', 'test_files', f)
    df = pd.read_csv(fn, sep="\t")
    # Append the "*00" suffix to the gene names before conversion.
    df['bestVGene'] = df['bestVGene'].apply(lambda s: s + "*00")
    df['bestJGene'] = df['bestJGene'].apply(lambda s: s + "*00")

    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }
    df = df.rename(columns=map_minervina_to_mixcr)

    # CREATE A FAUX MIXCR OUTPUT
    # Written to a private temporary directory so the scratch file is removed
    # afterwards and cannot collide with other tests using the same file name.
    with tempfile.TemporaryDirectory() as scratch_dir:
        faux_clones_fn = os.path.join(scratch_dir, 'dfmix.clns.txt')
        df.to_csv(faux_clones_fn, index=False, sep="\t")
        # USE TCRDIST2 TOOL FOR PORTING MIXCR OUTPUTS
        dfmix = mixcr.mixcr_to_tcrdist2(
            chain=my_chain, organism="human", clones_fn=faux_clones_fn
        )

    # The exact column set is only pinned for the alpha chain.
    if my_chain == "alpha":
        assert set(dfmix.columns.to_list()) == {
            'clone_id',
            'count',
            'v_a_gene',
            'd_a_gene',
            'j_a_gene',
            'cdr3_a_nucseq',
            'cdr3_a_aa',
        }
    assert df.shape[0] == dfmix.shape[0]

    dfmix = mixcr.remove_entries_with_invalid_vgene(dfmix, chain=my_chain, organism="human")
    dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix, chain=my_chain)

    tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
    tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)
    tr.cell_df['subject'] = 'X'
    tr.cell_df['epitope'] = 'X'

    tr.index_cols = [
        'clone_id',
        'subject',
        'epitope',
        'v_{}_gene'.format(chain_letter),
        'j_{}_gene'.format(chain_letter),
        'cdr3_{}_aa'.format(chain_letter),
        'cdr1_{}_aa'.format(chain_letter),
        'cdr2_{}_aa'.format(chain_letter),
        'pmhc_{}_aa'.format(chain_letter),
        'cdr3_{}_nucseq'.format(chain_letter),
    ]
    tr.deduplicate()

    if my_chain == "alpha":
        tr._tcrdist_legacy_method_alpha()
        assert isinstance(tr.cdr3_a_aa_pw, np.ndarray)
    else:
        tr._tcrdist_legacy_method_beta()
        assert isinstance(tr.cdr3_b_aa_pw, np.ndarray)
    assert isinstance(tr.paired_tcrdist, np.ndarray)
def test_combine_betas_and_alphas():
    """Pool several Minervina-style clone tables per chain and run tcrdist.

    Each of the eight hard-coded files under ``tcrdist/test_files`` is renamed
    to MiXCR column names, written out as a faux MiXCR clones file, ported
    with ``mixcr.mixcr_to_tcrdist2``, filtered for invalid V genes and CDR3s,
    and tagged with its source file, subject, epitope and trajectory. The
    alpha tables and the beta tables are then concatenated separately, and
    for each chain a ``TCRrep`` is built, deduplicated, and run through the
    legacy tcrdist method.

    Returns
    -------
    dict
        Maps ``'alpha'`` and ``'beta'`` to the corresponding ``TCRrep``.
    """
    import os
    import tempfile
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist import mixcr
    import multiprocessing

    testfiles = [
        ('contracting_clones_M_alpha.tsv', 'alpha', 'M', 'contracting'),
        ('contracting_clones_M_beta.tsv', 'beta', 'M', 'contracting'),
        ('contracting_clones_W_alpha.tsv', 'alpha', 'W', 'contracting'),
        ('contracting_clones_W_beta.tsv', 'beta', 'W', 'contracting'),
        ('expanding_clones_M_alpha.tsv', 'alpha', 'M', 'expanding'),
        ('expanding_clones_M_beta.tsv', 'beta', 'M', 'expanding'),
        ('expanding_clones_W_alpha.tsv', 'alpha', 'W', 'expanding'),
        ('expanding_clones_W_beta.tsv', 'beta', 'W', 'expanding'),
    ]

    # Column rename map; identical for every file, so built once.
    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }

    betas = []
    alphas = []

    # The faux MiXCR file lives in a private temporary directory so it is
    # removed afterwards and cannot collide with other tests that use the
    # same file name. It is overwritten on each iteration.
    with tempfile.TemporaryDirectory() as scratch_dir:
        faux_clones_fn = os.path.join(scratch_dir, 'dfmix.clns.txt')

        for f, my_chain, sub, group in testfiles:
            fn = os.path.join('tcrdist', 'test_files', f)
            df = pd.read_csv(fn, sep="\t")
            # Append the "*00" suffix to the gene names before conversion.
            df['bestVGene'] = df['bestVGene'].apply(lambda s: s + "*00")
            df['bestJGene'] = df['bestJGene'].apply(lambda s: s + "*00")
            print(df.columns)
            df = df.rename(columns=map_minervina_to_mixcr)

            # CREATE A FAUX MIXCR OUTPUT
            print(df.columns)
            df.to_csv(faux_clones_fn, index=False, sep="\t")
            dfmix = mixcr.mixcr_to_tcrdist2(
                chain=my_chain, organism="human", clones_fn=faux_clones_fn
            )

            # Carry two extra columns over from the input table.
            dfmix['CD'] = df['CD'].copy()
            dfmix['proportion'] = df['cloneFraction'].copy()

            dfmix = mixcr.remove_entries_with_invalid_vgene(
                dfmix, chain=my_chain, organism="human"
            )
            dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix, chain=my_chain)

            dfmix['source'] = f
            dfmix['subject'] = sub
            dfmix['epitope'] = 'X'
            dfmix['trajectory'] = group

            if my_chain == "alpha":
                alphas.append(dfmix)
            elif my_chain == "beta":
                betas.append(dfmix)

    betas_joined = pd.concat(betas)
    alphas_joined = pd.concat(alphas)

    # (pooled frame, chain name, single-letter chain code used in column names)
    testsets = [
        (alphas_joined, 'alpha', 'a'),
        (betas_joined, 'beta', 'b'),
    ]

    # Use at most 10 worker processes, and never more than the CPUs available.
    my_processes = min(10, multiprocessing.cpu_count())

    tcr_rep_results = dict()
    for dfmix, my_chain, chain_letter in testsets:
        tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
        tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)

        tr.index_cols = [
            'clone_id',
            'subject',
            'epitope',
            'trajectory',
            'v_{}_gene'.format(chain_letter),
            'j_{}_gene'.format(chain_letter),
            'cdr3_{}_aa'.format(chain_letter),
            'cdr1_{}_aa'.format(chain_letter),
            'cdr2_{}_aa'.format(chain_letter),
            'pmhc_{}_aa'.format(chain_letter),
            'cdr3_{}_nucseq'.format(chain_letter),
            'CD',
            'source',
        ]
        tr.deduplicate()

        if my_chain == "alpha":
            tr._tcrdist_legacy_method_alpha(processes=my_processes)
            assert isinstance(tr.cdr3_a_aa_pw, np.ndarray)
        else:
            tr._tcrdist_legacy_method_beta(processes=my_processes)
            assert isinstance(tr.cdr3_b_aa_pw, np.ndarray)
        assert isinstance(tr.paired_tcrdist, np.ndarray)

        tcr_rep_results[my_chain] = tr

    return tcr_rep_results