Example #1
0
def test_data_type_conversion_with_reduce_file_size():
    """
    Check the element type of the CDR3 pairwise matrices around
    ``TCRrep.reduce_file_size()``.

    The legacy alpha/beta tcrdist computation is run on the "PA" and "F2"
    epitope rows of the mouse clones test file.  Element ``[1, 1]`` of
    ``tr.cdr3_a_aa_pw`` and ``tr.cdr3_b_aa_pw`` must be integer-typed before
    ``reduce_file_size()`` is called and ``np.int16`` afterwards.
    """
    import pandas as pd
    import numpy as np

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    # Load the clones file and keep only the PA and F2 epitopes.
    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    # Rename the columns with the tcrdist -> tcrdist2 mapping.
    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    #1 build the repertoire object
    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    #2 infer CDRs from the V genes of both chains
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    #3 columns used to define unique clones
    tr.index_cols = ['clone_id', 'subject', 'epitope',
                     'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
                     'cdr3_a_aa', 'cdr3_b_aa',
                     'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
                     'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
                     'cdr3_b_nucseq', 'cdr3_a_nucseq',
                     'va_countreps', 'ja_countreps',
                     'vb_countreps', 'jb_countreps',
                     'va_gene', 'vb_gene',
                     'ja_gene', 'jb_gene']

    #4
    tr.deduplicate()

    #5
    tr._tcrdist_legacy_method_alpha_beta()

    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24.  (int, np.integer) accepts everything the alias accepted
    # and additionally NumPy integer scalars such as np.int64.
    integer_types = (int, np.integer)
    assert isinstance(tr.cdr3_a_aa_pw[1, 1], integer_types)
    assert isinstance(tr.cdr3_b_aa_pw[1, 1], integer_types)

    tr.reduce_file_size()
    assert isinstance(tr.cdr3_a_aa_pw[1, 1], np.int16)
    assert isinstance(tr.cdr3_b_aa_pw[1, 1], np.int16)
Example #2
0
def generate_tr():
    """
    Build and return a ``TCRrep`` for the "PA" and "F2" epitope rows of the
    mouse clones test file, with the legacy alpha/beta tcrdist computation
    already run on it.
    """
    import pandas as pd
    import numpy as np
    import tcrdist as td

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    # Read the tab-separated clones table.
    clones = pd.read_csv(
        'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv',
        sep="\t",
    )

    # Restrict to the two epitopes of interest.
    is_pa = clones.epitope == "PA"
    is_f2 = clones.epitope == "F2"
    clones = clones[is_pa | is_f2].copy()

    # Translate column names with the tcrdist -> tcrdist2 mapping.
    cells = mappers.generic_pandas_mapper(
        df=clones,
        mapping=mappers.tcrdist_clone_df_to_tcrdist2_mapping,
    )

    tr = TCRrep(cell_df=cells, organism="mouse")

    # Same inference call for each chain, alpha first.
    for chain_name in ('alpha', 'beta'):
        tr.infer_cdrs_from_v_gene(chain=chain_name, imgt_aligned=True)

    # Columns that identify a unique clone (order preserved).
    tr.index_cols = [
        'clone_id', 'subject', 'epitope',
        'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
        'cdr3_a_aa', 'cdr3_b_aa',
        'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
        'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
        'cdr3_b_nucseq', 'cdr3_a_nucseq',
        'va_countreps', 'ja_countreps',
        'vb_countreps', 'jb_countreps',
        'va_gene', 'vb_gene',
        'ja_gene', 'jb_gene',
    ]

    tr.deduplicate()
    tr._tcrdist_legacy_method_alpha_beta()

    return tr
Example #3
0
def test_Complete_Performance_Example():
    """
    Run the legacy paired tcrdist computation on the full mouse clones test
    file and collect k-nearest-neighbor epitope predictions per clone.

    For every clone, all clones from the same subject are held out, the
    remaining clones are ranked by ``tr.paired_tcrdist`` and the epitopes and
    distances of the ``k`` closest ones are recorded next to the observed
    epitope.  The test asserts that the alpha and beta distance matrices sum
    to the paired matrix and that one record is produced per clone.
    """
    import pandas as pd
    import numpy as np
    from collections import namedtuple

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    tr.deduplicate()

    tr._tcrdist_legacy_method_alpha_beta(processes=1)

    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # K NEAREST NEIGHBORS
    pr = namedtuple("perf", ["observed", "predicted", "dist"])
    observed = tr.clone_df.epitope.to_list()
    epitopes = tr.clone_df.epitope.to_numpy()
    performance = []

    k = 5
    # enumerate() supplies the row *position*; the iterrows() label is only
    # a position when clone_df happens to carry a default RangeIndex.
    for pos, (_, row) in enumerate(tr.clone_df.iterrows()):
        # Hold out all data from the current clone's subject.
        held_out = (tr.clone_df.subject != row.subject).to_numpy()
        # Distances from this clone to the remaining clones.
        distances = tr.paired_tcrdist[pos, held_out]
        # Positions of the k smallest distances *within the held-out subset*.
        nearest = np.argsort(distances)[:k]
        # The positions above index the subset, so the epitopes must be
        # subset with the same mask before they are looked up.
        predicted = epitopes[held_out][nearest].tolist()
        predicted_distance = distances[nearest]
        performance.append(pr(observed[pos], predicted, predicted_distance))

    # One performance record per clone.
    assert len(performance) == tr.clone_df.shape[0]
Example #4
0
def test_CompleteExample_with_TCRMotif_Invoked_From_within_TCRsubset():
    """
    Build a ``TCRsubset`` for the "PA" epitope and evaluate/plot the first
    three motifs read from a pre-computed motif log file.

    The "PA" and "F2" rows of the mouse clones test file go through the
    legacy alpha/beta tcrdist computation; the "PA" clones and the matching
    slices of the alpha and beta distance matrices are handed to
    ``TCRsubset``.  Motifs are loaded from the log file instead of calling
    ``ts.find_motif()``.
    """
    import pandas as pd
    import numpy as np

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    # TCRMotif, StoreIOMotif and StoreIOEntropy are not referenced below;
    # the imports are kept so a failure to import them still fails the test.
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.subset import TCRsubset
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy
    from tcrdist.plotting import plot_pwm

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    # Keep only the PA and F2 epitopes.
    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope
                                                == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    #1
    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    #2
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    #3
    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    #4
    tr.deduplicate()

    #5
    tr._tcrdist_legacy_method_alpha_beta()

    #6 alpha + beta distances must equal the paired distance
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # Subset the clones to the PA epitope.
    criteria = tr.clone_df.epitope == "PA"
    clone_df_subset = tr.clone_df[criteria]

    # Slice both distance matrices down to the same clone ids.
    distA_subset = distA.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()
    distB_subset = distB.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()

    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=distA_subset,
                   dist_b=distB_subset)

    # ts.find_motif() is skipped; motifs come from the log file below.
    cnames = [
        "file_type", "count", "expect_random", "expect_nextgen", "chi_squared",
        "nfixed", "showmotif", "num", "othernum", "overlap", "ep", "ab",
        "nseqs", "v_rep_counts", "j_rep_counts"
    ]
    motif_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones_cdr3_motifs_PA.log'
    # Context manager so the file handle is closed deterministically.
    with open(motif_fn, "r") as motif_file:
        motif_rows = [line.split() for line in motif_file]
    ts.motif_df = pd.DataFrame(motif_rows, columns=cnames)

    # Fail loudly on an empty motif file instead of silently testing nothing.
    assert len(ts.motif_df) > 0, "no motifs were read from the motif log"

    # Evaluate and plot the first three motifs only.
    motif_list = []
    motif_logo = []
    for _, row in ts.motif_df.head(3).iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list.append(StoreIOMotif_instance)
        motif_logo.append(
            plot_pwm(StoreIOMotif_instance,
                     create_file=False,
                     my_height=200,
                     my_width=600))
Example #5
0
def test_hot_start_example_in_full():
    """
    This is the code that makes up the HotStart example in the docs.

    Steps: load ``dash.csv`` and keep the "PA" epitope, build a ``TCRrep``,
    run the legacy alpha/beta tcrdist computation, cluster the paired
    distances (complete linkage, at most 20 clusters), build a ``TCRsubset``
    from cluster 5, load the cluster's motifs from the cached CSV (or compute
    and cache them when the CSV is absent), then evaluate and plot every
    alpha- and beta-chain motif.
    """
    # basic imports
    import os
    import pandas as pd
    import numpy as np

    # tcrdist classes
    # (TCRMotif, StoreIOMotif, StoreIOEntropy, TSNE and MDS are not
    # referenced below; they are imported as part of the documented example.)
    from tcrdist.repertoire import TCRrep
    from tcrdist.subset import TCRsubset
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy

    # tcrdist functions
    from tcrdist import plotting
    from tcrdist.mappers import populate_legacy_fields

    # scipy functions for clustering
    from scipy.spatial import distance
    from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

    # sklearn functions for low-dimensional embeddings
    from sklearn.manifold import TSNE, MDS

    data_dir = os.path.join("tcrdist", "test_files_compact")
    motif_fn = os.path.join(data_dir, "dash_PA_cluster_5_motifs.csv")

    #1 load data, subset to receptors recognizing "PA" epitope
    tcrdist2_df = pd.read_csv(os.path.join(data_dir, "dash.csv"))
    tcrdist2_df = tcrdist2_df[tcrdist2_df.epitope == "PA"].copy()

    #2 create instance of TCRrep class, initializes input as tr.cell_df attribute
    tr = TCRrep(cell_df=tcrdist2_df,
                chains=['alpha', 'beta'],
                organism="mouse")

    #3 Infer CDR1, CDR2, CDR2.5 (a.k.a. pmhc) from germline v-genes
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    #4 Define index columns for determining unique clones.
    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq'
    ]

    #4 Deduplicate based on index cols, creating tr.clone_df attribute
    tr.deduplicate()

    #5 calculate tcrdists by method in Dash et al.
    tr._tcrdist_legacy_method_alpha_beta()

    #6 Check that sum of alpha-chain and beta-chain distance matrices equal paired_tcrdist
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # Cluster
    compressed_dmat = distance.squareform(tr.paired_tcrdist, force="vector")
    Z = linkage(compressed_dmat, method="complete")
    # Result is unused; the call is kept so the example step is still run.
    dendrogram(Z, color_threshold=np.inf, no_plot=True)
    cluster_index = fcluster(Z, t=20, criterion="maxclust")
    assert len(cluster_index) == tr.clone_df.shape[0]
    assert len(cluster_index) == tr.paired_tcrdist.shape[0]
    tr.clone_df['cluster_index'] = cluster_index

    # Subset to Cluster 5
    criteria = (cluster_index == 5)
    clone_df_subset = tr.clone_df[criteria]
    clone_df_subset = clone_df_subset[clone_df_subset.epitope == "PA"].copy()
    dist_a_subset = tr.dist_a.loc[clone_df_subset.clone_id,
                                  clone_df_subset.clone_id].copy()
    dist_b_subset = tr.dist_b.loc[clone_df_subset.clone_id,
                                  clone_df_subset.clone_id].copy()

    clone_df_subset = populate_legacy_fields(df=clone_df_subset,
                                             chains=['alpha', 'beta'])

    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=dist_a_subset,
                   dist_b=dist_b_subset)

    # Find Motifs: reuse the cached CSV when present, otherwise compute them.
    if os.path.isfile(motif_fn):
        ts.motif_df = pd.read_csv(motif_fn)
    else:
        # NOTE(review): the return value is not assigned to ts.motif_df, so
        # this relies on find_motif() setting that attribute itself - confirm.
        ts.find_motif()
        # Save Motifs only when freshly computed, so a run that merely reads
        # the cached fixture does not rewrite it.
        ts.motif_df.to_csv(motif_fn, index=False)

    def _eval_and_plot(chain):
        """Evaluate and plot every motif whose ``ab`` column equals *chain*."""
        motifs = []
        logos = []
        for _, row in ts.motif_df[ts.motif_df.ab == chain].iterrows():
            StoreIOMotif_instance = ts.eval_motif(row)
            motifs.append(StoreIOMotif_instance)
            logos.append(
                plotting.plot_pwm(StoreIOMotif_instance,
                                  create_file=False,
                                  my_height=200,
                                  my_width=600))
        return motifs, logos

    # Preprocess Motifs
    motif_list_a, motif_logos_a = _eval_and_plot("A")
    motif_list_b, motif_logos_b = _eval_and_plot("B")