Exemple #1
0
def test_CompleteExample_with_TCRMotif_Invoked_From_within_TCRsubset():
    import pandas as pd
    import numpy as np
    import tcrdist as td
    #import IPython

    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.subset import TCRsubset
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy
    from tcrdist.plotting import plot_pwm

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")  #1

    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope
                                                == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping  #3
    tcrdist2_df = mappers.generic_pandas_mapper(
        df=tcrdist_clone_df,  #4
        mapping=mapping)

    #1
    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    #2
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    #3
    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    #4
    tr.deduplicate()

    #5
    tr._tcrdist_legacy_method_alpha_beta()

    #6
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # 1
    criteria = tr.clone_df.epitope == "PA"
    clone_df_subset = tr.clone_df[criteria]

    # 2
    distA_subset = distA.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()
    distB_subset = distB.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()

    # 3
    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=distA_subset,
                   dist_b=distB_subset)

    # ts.find_motif()

    cnames = [
        "file_type", "count", "expect_random", "expect_nextgen", "chi_squared",
        "nfixed", "showmotif", "num", "othernum", "overlap", "ep", "ab",
        "nseqs", "v_rep_counts", "j_rep_counts"
    ]
    motif_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones_cdr3_motifs_PA.log'
    x = open(motif_fn, "r").readlines()
    ts.motif_df = pd.DataFrame([l.split() for l in x], columns=cnames)

    i = 0
    row = ts.motif_df.iloc[i, :].to_dict()

    motif_list = list()
    motif_logo = list()
    for i, row in ts.motif_df.iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list.append(StoreIOMotif_instance)
        motif_logo.append(
            plot_pwm(StoreIOMotif_instance,
                     create_file=False,
                     my_height=200,
                     my_width=600))
        if i > 1:
            break
def test_hot_start_example_in_full():
    """
    This is the code that makes up the HotStart example in the docs
    """
    # basic imports
    import os
    import pandas as pd
    import numpy as np
    #import IPython

    # tcrdist classes
    from tcrdist.repertoire import TCRrep
    from tcrdist.subset import TCRsubset
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy

    # tcrdist functions
    from tcrdist import plotting
    from tcrdist.mappers import populate_legacy_fields

    # scipy functions for clustering
    from scipy.spatial import distance
    from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

    # sklearn functions for low-dimensional embeddings
    from sklearn.manifold import TSNE, MDS

    # plotnine to allow grammar of graphics plotting akin to R's ggplot2
    #import plotnine as gg

    #1 load data, subset to receptors recognizing "PA" epitope
    tcrdist2_df = pd.read_csv(
        os.path.join("tcrdist", "test_files_compact", "dash.csv"))
    tcrdist2_df = tcrdist2_df[tcrdist2_df.epitope == "PA"].copy()

    #2 create instance of TCRrep class, initializes input as tr.cell_df attribute
    tr = TCRrep(cell_df=tcrdist2_df,
                chains=['alpha', 'beta'],
                organism="mouse")

    #3 Infer CDR1,CDR2,CDR2.5 (a.k.a. phmc) from germline v-genes
    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    #4 Define index columns for determining unique clones.
    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq'
    ]

    #4 Deduplicate based on index cols, creating tr.clone_df attribute
    tr.deduplicate()

    #5 calculate tcrdists by method in Dash et al.
    tr._tcrdist_legacy_method_alpha_beta()

    #6 Check that sum of alpah-chain and beta-chain distance matrices equal paired_tcrdist
    distA = tr.dist_a
    distB = tr.dist_b
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # Cluster
    from scipy.spatial import distance
    from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
    compressed_dmat = distance.squareform(tr.paired_tcrdist, force="vector")
    Z = linkage(compressed_dmat, method="complete")
    den = dendrogram(Z, color_threshold=np.inf, no_plot=True)
    cluster_index = fcluster(Z, t=20, criterion="maxclust")
    assert len(cluster_index) == tr.clone_df.shape[0]
    assert len(cluster_index) == tr.paired_tcrdist.shape[0]
    tr.clone_df['cluster_index'] = cluster_index

    # Subset to Cluster 5
    criteria = (cluster_index == 5)
    clone_df_subset = tr.clone_df[criteria]
    clone_df_subset = clone_df_subset[clone_df_subset.epitope == "PA"].copy()
    dist_a_subset = tr.dist_a.loc[clone_df_subset.clone_id,
                                  clone_df_subset.clone_id].copy()
    dist_b_subset = tr.dist_b.loc[clone_df_subset.clone_id,
                                  clone_df_subset.clone_id].copy()

    clone_df_subset = populate_legacy_fields(df=clone_df_subset,
                                             chains=['alpha', 'beta'])

    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=dist_a_subset,
                   dist_b=dist_b_subset)

    # Find Motifs
    if os.path.isfile(
            os.path.join("tcrdist", "test_files_compact",
                         "dash_PA_cluster_5_motifs.csv")):
        ts.motif_df = pd.read_csv(
            os.path.join("tcrdist", "test_files_compact",
                         "dash_PA_cluster_5_motifs.csv"))
    else:
        motif_df = ts.find_motif()

    # Save Motifs
    ts.motif_df.to_csv(os.path.join("tcrdist", "test_files_compact",
                                    "dash_PA_cluster_5_motifs.csv"),
                       index=False)

    # Preprocess Motifs
    motif_list_a = list()
    motif_logos_a = list()
    for i, row in ts.motif_df[ts.motif_df.ab == "A"].iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list_a.append(StoreIOMotif_instance)
        motif_logos_a.append(
            plotting.plot_pwm(StoreIOMotif_instance,
                              create_file=False,
                              my_height=200,
                              my_width=600))

    motif_list_b = list()
    motif_logos_b = list()
    for i, row in ts.motif_df[ts.motif_df.ab == "B"].iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list_b.append(StoreIOMotif_instance)
        motif_logos_b.append(
            plotting.plot_pwm(StoreIOMotif_instance,
                              create_file=False,
                              my_height=200,
                              my_width=600))