Esempio n. 1
0
def test_convert_minervina_to_mixcr_run_tcrdist(f, my_chain):
    """Convert one clone table to MiXCR-style columns and run legacy tcrdist.

    The tab-separated file ``tcrdist/test_files/<f>`` is read, its columns
    are renamed to the MiXCR-style names listed in ``map_minervina_to_mixcr``,
    and the result is written to a scratch file that is loaded back through
    ``mixcr.mixcr_to_tcrdist2``. Entries with invalid V genes or CDR3s are
    dropped, a single-chain ``TCRrep`` is built and deduplicated, and the
    legacy distance method for the chain is run.

    Parameters
    ----------
    f : str
        File name inside ``tcrdist/test_files`` (resolved relative to the
        current working directory).
    my_chain : str
        Either ``"alpha"`` or ``"beta"``.

    Raises
    ------
    ValueError
        If ``my_chain`` is neither ``"alpha"`` nor ``"beta"``.
    AssertionError
        If any of the checks on the intermediate or final results fails.
    """
    # Local import: the module-level import block is not part of this function.
    import tempfile

    # Column and attribute names embed a one-letter chain code ('v_a_gene', ...).
    chain_letters = {"alpha": "a", "beta": "b"}
    if my_chain not in chain_letters:
        # Fail with a clear message instead of a NameError on `tr` further down.
        raise ValueError(
            "my_chain must be 'alpha' or 'beta', got {!r}".format(my_chain))
    letter = chain_letters[my_chain]

    fn = os.path.join('tcrdist', 'test_files', f)
    df = pd.read_csv(fn, sep="\t")
    # NOTE(review): the "*00" suffix presumably gives the gene names the
    # allele-like form the MiXCR importer expects -- confirm against mixcr.
    for gene_col in ('bestVGene', 'bestJGene'):
        df[gene_col] = df[gene_col].apply(lambda s: s + "*00")

    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }
    df = df.rename(columns=map_minervina_to_mixcr)

    # Create a faux MiXCR output in a throwaway directory so the test leaves
    # nothing behind in the working directory and cannot collide with other
    # tests writing the same file name.
    with tempfile.TemporaryDirectory() as tmp_dir:
        clones_fn = os.path.join(tmp_dir, 'dfmix.clns.txt')
        df.to_csv(clones_fn, index=False, sep="\t")
        # Use the tcrdist2 tool for porting MiXCR outputs.
        dfmix = mixcr.mixcr_to_tcrdist2(chain=my_chain,
                                        organism="human",
                                        clones_fn=clones_fn)

    # The exact column set is only pinned for the alpha chain.
    if my_chain == "alpha":
        assert set(dfmix.columns) == {
            'clone_id', 'count', 'v_a_gene', 'd_a_gene', 'j_a_gene',
            'cdr3_a_nucseq', 'cdr3_a_aa',
        }

    # The importer must not drop or add rows.
    assert df.shape[0] == dfmix.shape[0]

    dfmix = mixcr.remove_entries_with_invalid_vgene(dfmix,
                                                    chain=my_chain,
                                                    organism="human")
    dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix, chain=my_chain)

    tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
    tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)
    # Placeholder metadata columns; both are listed in index_cols below.
    tr.cell_df['subject'] = 'X'
    tr.cell_df['epitope'] = 'X'

    tr.index_cols = ['clone_id', 'subject', 'epitope'] + [
        template.format(letter) for template in (
            'v_{}_gene', 'j_{}_gene', 'cdr3_{}_aa', 'cdr1_{}_aa',
            'cdr2_{}_aa', 'pmhc_{}_aa', 'cdr3_{}_nucseq')
    ]
    tr.deduplicate()

    # Dispatches to _tcrdist_legacy_method_alpha / _tcrdist_legacy_method_beta.
    getattr(tr, '_tcrdist_legacy_method_' + my_chain)()
    assert isinstance(getattr(tr, 'cdr3_{}_aa_pw'.format(letter)), np.ndarray)
    assert isinstance(tr.paired_tcrdist, np.ndarray)
Esempio n. 2
0
def test_combine_betas_and_alphas():
    """Pool several clone tables per chain and run legacy tcrdist on each pool.

    Each of the eight files listed in ``testfiles`` is read from
    ``tcrdist/test_files``, renamed to MiXCR-style columns, loaded through
    ``mixcr.mixcr_to_tcrdist2``, filtered for invalid V genes / CDR3s, and
    tagged with its source file, subject, and trajectory. The alpha tables and
    the beta tables are concatenated separately; one ``TCRrep`` per chain is
    built, deduplicated, and run through the chain's legacy distance method.

    Returns
    -------
    dict
        Maps ``"alpha"`` and ``"beta"`` to the corresponding ``TCRrep``.
        NOTE(review): recent pytest versions warn when a test returns a
        non-None value; the return is kept because callers may rely on it.

    Raises
    ------
    AssertionError
        If a pairwise-distance result is not a ``numpy.ndarray``.
    """
    import os
    import tempfile
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist import mixcr
    import multiprocessing

    # (file name, chain, subject, trajectory)
    testfiles = [
        ('contracting_clones_M_alpha.tsv', 'alpha', 'M', 'contracting'),
        ('contracting_clones_M_beta.tsv', 'beta', 'M', 'contracting'),
        ('contracting_clones_W_alpha.tsv', 'alpha', 'W', 'contracting'),
        ('contracting_clones_W_beta.tsv', 'beta', 'W', 'contracting'),
        ('expanding_clones_M_alpha.tsv', 'alpha', 'M', 'expanding'),
        ('expanding_clones_M_beta.tsv', 'beta', 'M', 'expanding'),
        ('expanding_clones_W_alpha.tsv', 'alpha', 'W', 'expanding'),
        ('expanding_clones_W_beta.tsv', 'beta', 'W', 'expanding'),
    ]

    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }

    # Column and attribute names embed a one-letter chain code ('v_a_gene', ...).
    chain_letters = {'alpha': 'a', 'beta': 'b'}
    frames_by_chain = {'alpha': [], 'beta': []}

    # All scratch files live in one throwaway directory so the test leaves
    # nothing behind in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        clones_fn = os.path.join(tmp_dir, 'dfmix.clns.txt')

        for f, my_chain, sub, group in testfiles:
            fn = os.path.join('tcrdist', 'test_files', f)
            df = pd.read_csv(fn, sep="\t")
            # NOTE(review): the "*00" suffix presumably gives the gene names
            # the allele-like form the MiXCR importer expects -- confirm.
            for gene_col in ('bestVGene', 'bestJGene'):
                df[gene_col] = df[gene_col].apply(lambda s: s + "*00")

            df = df.rename(columns=map_minervina_to_mixcr)

            # Create a faux MiXCR output and port it with the tcrdist2 tool.
            df.to_csv(clones_fn, index=False, sep="\t")
            dfmix = mixcr.mixcr_to_tcrdist2(chain=my_chain,
                                            organism="human",
                                            clones_fn=clones_fn)

            # Carry over columns the importer does not return; this relies on
            # df and dfmix sharing the same row index.
            dfmix['CD'] = df['CD'].copy()
            dfmix['proportion'] = df['cloneFraction'].copy()

            dfmix = mixcr.remove_entries_with_invalid_vgene(dfmix,
                                                            chain=my_chain,
                                                            organism="human")
            dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix,
                                                           chain=my_chain)
            dfmix['source'] = f
            dfmix['subject'] = sub
            dfmix['epitope'] = 'X'
            dfmix['trajectory'] = group

            frames_by_chain[my_chain].append(dfmix)

    betas_joined = pd.concat(frames_by_chain['beta'])
    alpha_joined = pd.concat(frames_by_chain['alpha'])

    testsets = [(alpha_joined, 'alpha'), (betas_joined, 'beta')]

    # Use at most 10 worker processes, fewer on machines with fewer CPUs.
    my_processes = min(10, multiprocessing.cpu_count())

    tcr_rep_results = {}
    for dfmix, my_chain in testsets:
        letter = chain_letters[my_chain]

        tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
        tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)

        tr.index_cols = (
            ['clone_id', 'subject', 'epitope', 'trajectory']
            + [template.format(letter) for template in (
                'v_{}_gene', 'j_{}_gene', 'cdr3_{}_aa', 'cdr1_{}_aa',
                'cdr2_{}_aa', 'pmhc_{}_aa', 'cdr3_{}_nucseq')]
            + ['CD', 'source']
        )
        tr.deduplicate()

        # Dispatches to _tcrdist_legacy_method_alpha / _tcrdist_legacy_method_beta.
        run_legacy = getattr(tr, '_tcrdist_legacy_method_' + my_chain)
        run_legacy(processes=my_processes)
        assert isinstance(getattr(tr, 'cdr3_{}_aa_pw'.format(letter)),
                          np.ndarray)
        assert isinstance(tr.paired_tcrdist, np.ndarray)

        tcr_rep_results[my_chain] = tr
    return tcr_rep_results