Esempio n. 1
0
def test_generic_pandas_mapper():
    """
    Check that generic_pandas_mapper keeps only the mapped column and
    returns it under its new name.
    """
    source = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    rename_map = OrderedDict([('a', 'ardvark')])
    # Column "b" is absent from the mapping, so it must not appear in the result.
    wanted = pd.DataFrame({"ardvark": [1, 2]})
    result = mappers.generic_pandas_mapper(source, rename_map)
    assert result.equals(wanted)
Esempio n. 2
0
def test_generic_pandas_mapper_raises_assertion_error():
    """
    Test that a KeyError is raised if the mapper contains keys not in df.columns

    Note: despite the function name, the exception checked here is KeyError.
    """
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # 'c' is not a column of df, so the mapping cannot be applied.
    mapper = OrderedDict([('c', 'canary')])
    # Keep only the call under test inside the context manager: anything
    # placed after the raising call would never execute.
    with pytest.raises(KeyError):
        mappers.generic_pandas_mapper(df, mapper)
Esempio n. 3
0
def test_data_type_conversion_with_reduce_file_size():
    """
    Test that TCRrep.reduce_file_size() changes the element type of the
    cdr3_a_aa_pw and cdr3_b_aa_pw attributes to np.int16.

    Before the call, element [1, 1] of each attribute is asserted to be an
    instance of the builtin int; after the call it is asserted to be an
    instance of np.int16.
    """
    import pandas as pd
    import numpy as np
    import tcrdist as td

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    # Load the clones table and keep only the PA and F2 epitopes.
    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep = "\t")

    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    # Apply the packaged column mapping to the clones table.
    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df = tcrdist_clone_df,
                                              mapping = mapping)

    # 1: build the repertoire object
    tr = TCRrep(cell_df = tcrdist2_df, organism = "mouse")

    # 2: infer CDRs for both chains
    tr.infer_cdrs_from_v_gene(chain = 'alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain = 'beta',  imgt_aligned=True)

    # 3: columns used as the index for deduplication
    tr.index_cols = ['clone_id', 'subject', 'epitope',
                   'v_a_gene',  'j_a_gene', 'v_b_gene', 'j_b_gene',
                   'cdr3_a_aa', 'cdr3_b_aa',
                   'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
                   'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
                   'cdr3_b_nucseq', 'cdr3_a_nucseq',
                   'va_countreps', 'ja_countreps',
                   'vb_countreps', 'jb_countreps',
                   'va_gene', 'vb_gene',
                   'ja_gene', 'jb_gene']

    # 4
    tr.deduplicate()

    # 5
    tr._tcrdist_legacy_method_alpha_beta()

    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    assert isinstance(tr.cdr3_a_aa_pw[1,1], int)
    assert isinstance(tr.cdr3_b_aa_pw[1,1], int)
    tr.reduce_file_size()
    assert isinstance(tr.cdr3_a_aa_pw[1,1], np.int16)
    assert isinstance(tr.cdr3_b_aa_pw[1,1], np.int16)
Esempio n. 4
0
def generate_tr():
    """
    Build and return a TCRrep for the mouse PA and F2 clones.

    Reads the clones TSV under tcrdist/test_files, keeps the rows whose
    epitope is "PA" or "F2", maps the columns with
    mappers.tcrdist_clone_df_to_tcrdist2_mapping, and then runs
    infer_cdrs_from_v_gene (both chains), deduplicate and
    _tcrdist_legacy_method_alpha_beta on the resulting TCRrep.
    """
    import numpy as np
    import pandas as pd
    import tcrdist as td

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    # Load the clones table and restrict it to the two epitopes of interest.
    clones_path = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    clones = pd.read_csv(clones_path, sep = "\t")
    is_pa = clones.epitope == "PA"
    is_f2 = clones.epitope == "F2"
    clones = clones[is_pa | is_f2].copy()

    # Apply the packaged column mapping and wrap the result in a TCRrep.
    cell_df = mappers.generic_pandas_mapper(
        df = clones,
        mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping,
    )
    tr = TCRrep(cell_df = cell_df, organism = "mouse")

    # Infer CDRs for each chain in turn (alpha first, then beta).
    for chain_name in ('alpha', 'beta'):
        tr.infer_cdrs_from_v_gene(chain = chain_name, imgt_aligned=True)

    # Assemble the index columns group by group; the final order is unchanged.
    identity_cols = ['clone_id', 'subject', 'epitope']
    gene_cols = ['v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene']
    cdr_cols = ['cdr3_a_aa', 'cdr3_b_aa',
                'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
                'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
                'cdr3_b_nucseq', 'cdr3_a_nucseq']
    countrep_cols = ['va_countreps', 'ja_countreps',
                     'vb_countreps', 'jb_countreps']
    legacy_gene_cols = ['va_gene', 'vb_gene', 'ja_gene', 'jb_gene']
    tr.index_cols = (identity_cols + gene_cols + cdr_cols
                     + countrep_cols + legacy_gene_cols)

    tr.deduplicate()
    tr._tcrdist_legacy_method_alpha_beta()
    return tr
Esempio n. 5
0
def test_CompleteExample_with_TCRMotif_Invoked_From_within_TCRsubset():
    """
    End-to-end example going from the clones file to evaluated motifs.

    Steps exercised:
      * build a TCRrep from the PA and F2 clones and compute distances with
        _tcrdist_legacy_method_alpha_beta,
      * check that dist_a + dist_b equals paired_tcrdist,
      * subset the clones and both distance tables to the PA epitope and
        wrap them in a TCRsubset,
      * load motif rows from the PA motif log file into ts.motif_df,
      * call ts.eval_motif and plot_pwm on the first three motif rows.
    """
    import pandas as pd
    import numpy as np
    import tcrdist as td

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.subset import TCRsubset
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy
    from tcrdist.plotting import plot_pwm

    # Load the clones table and keep only the PA and F2 epitopes.
    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope
                                                == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    # Apply the packaged column mapping to the clones table.
    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(
        df=tcrdist_clone_df,
        mapping=mapping)

    # 1: build the repertoire object
    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    # 2: infer CDRs for both chains
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    # 3: columns used as the index for deduplication
    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    # 4
    tr.deduplicate()

    # 5
    tr._tcrdist_legacy_method_alpha_beta()

    # 6: the paired distance must be the sum of the per-chain distances
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # Restrict the clones to the PA epitope.
    criteria = tr.clone_df.epitope == "PA"
    clone_df_subset = tr.clone_df[criteria]

    # Restrict both distance tables to the same clone ids (rows and columns).
    distA_subset = distA.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()
    distB_subset = distB.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()

    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=distA_subset,
                   dist_b=distB_subset)

    # Motifs are read from a log file rather than recomputed with
    # ts.find_motif().
    cnames = [
        "file_type", "count", "expect_random", "expect_nextgen", "chi_squared",
        "nfixed", "showmotif", "num", "othernum", "overlap", "ep", "ab",
        "nseqs", "v_rep_counts", "j_rep_counts"
    ]
    motif_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones_cdr3_motifs_PA.log'
    # Use a context manager so the file handle is closed deterministically.
    with open(motif_fn, "r") as motif_file:
        motif_rows = [line.split() for line in motif_file]
    ts.motif_df = pd.DataFrame(motif_rows, columns=cnames)

    # An empty motif file would make the loop below a silent no-op.
    assert not ts.motif_df.empty

    motif_list = list()
    motif_logo = list()
    # motif_df has a default integer index (0, 1, 2, ...), so breaking once
    # i > 1 evaluates exactly the first three motif rows.
    for i, row in ts.motif_df.iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list.append(StoreIOMotif_instance)
        motif_logo.append(
            plot_pwm(StoreIOMotif_instance,
                     create_file=False,
                     my_height=200,
                     my_width=600))
        if i > 1:
            break
Esempio n. 6
0
def test_Complete_Performance_Example():
    """
    End-to-end example: compute paired distances for all clones, then run a
    leave-one-subject-out k-nearest-neighbour epitope lookup.

    For every clone, the clones belonging to the same subject are held out,
    the remaining clones are ranked by paired distance, and the epitopes and
    distances of the k closest ones are recorded next to the clone's own
    (observed) epitope.
    """
    import pandas as pd
    import numpy as np
    import tcrdist as td
    from collections import namedtuple

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    tr.deduplicate()

    tr._tcrdist_legacy_method_alpha_beta(processes=1)

    # The paired distance must be the sum of the per-chain distances.
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # K NEAREST NEIGHBORS
    pr = namedtuple("perf", ["observed", "predicted", "dist"])
    observed = tr.clone_df.epitope.to_list()
    performance = list()

    k = 5
    # Iterate by position rather than by index label: the row number is used
    # to index tr.paired_tcrdist, which is assumed to follow clone_df row
    # order -- TODO confirm against TCRrep.
    for pos, subject in enumerate(tr.clone_df.subject):
        # Boolean mask selecting clones from every *other* subject.
        other_subjects = (tr.clone_df.subject != subject).to_numpy()
        # Distances from this clone to the retained clones only.
        distances = tr.paired_tcrdist[pos, other_subjects]
        # Epitopes of the retained clones, in the same order as `distances`.
        candidate_epitopes = tr.clone_df.epitope[other_subjects]
        # argsort positions refer to the retained subset, so they must be
        # applied to `candidate_epitopes`, not to the full clone_df.
        nearest = np.argsort(distances)[0:k]
        predicted = candidate_epitopes.iloc[nearest].to_list()
        predicted_distance = distances[nearest]
        performance.append(
            pr(observed[pos], predicted, predicted_distance))

    # One performance record is expected per clone.
    assert len(performance) == len(observed)