Beispiel #1
0
 def compute_distances(self, df=None):
     """
     Run `_pws` once for every configured chain and store each result.

     Parameters
     ----------
     df : optional
         Passed to `_pws` as `df`. When None, `self.clone_df` is used.

     Notes
     -----
     Chains are visited in the fixed order alpha, beta, gamma, delta and
     only those found in `self.chains` are processed. For each one the
     matching `metrics_<x>`, `weights_<x>` and `kargs_<x>` attributes
     (x = a, b, g, d) are forwarded to `_pws` together with `self.cpus`
     and `self.store_all_cdr`; the result is handed to
     `self._assign_distance_attributes`. Nothing is returned.
     """
     source_df = self.clone_df if df is None else df
     chain_suffixes = (('alpha', 'a'),
                       ('beta', 'b'),
                       ('gamma', 'g'),
                       ('delta', 'd'))
     for chain_name, suffix in chain_suffixes:
         if chain_name not in self.chains:
             continue
         pairwise = _pws(df=source_df,
                         metrics=getattr(self, 'metrics_' + suffix),
                         weights=getattr(self, 'weights_' + suffix),
                         kargs=getattr(self, 'kargs_' + suffix),
                         cpu=self.cpus,
                         store=self.store_all_cdr)
         self._assign_distance_attributes(d=pairwise, chain=chain_name)
Beispiel #2
0
def test_example_7():
    """
    If you don't want to use OOP, but you still want multi-CDR
    tcrdistances on a single chain, using your own metric:

    def my_own_metric(s1,s2):
        return Levenshtein.distance(s1,s2)

    The same metric, a weight of 1 and `use_numba=False` are applied to
    each of the four beta-chain columns, and the keys of the result
    returned by `_pws` are printed.

    NOTE(review): `my_own_metric` is not defined inside this function; it
    is assumed to exist at module level -- confirm before running.
    """
    import pandas as pd
    from tcrdist.rep_funcs import _pws

    df = pd.read_csv("dash2.csv")

    # Column order is kept as cdr3, pmhc, cdr2, cdr1 so the dicts iterate
    # in the same order as when they were written out by hand.
    regions = ("cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa")

    metrics_b = {region: my_own_metric for region in regions}
    weights_b = {region: 1 for region in regions}
    # A fresh inner dict per region, so no region shares a kargs object.
    kargs_b = {region: {'use_numba': False} for region in regions}

    dmats = _pws(df=df,
                 metrics=metrics_b,
                 weights=weights_b,
                 kargs=kargs_b,
                 cpu=1,
                 uniquify=True,
                 store=True)

    print(dmats.keys())
Beispiel #3
0
def test_example_8():
    """
    You want a 'tcrdistance' but you don't want to bother with the tcrdist3 framework.

    Note that the column names are completely arbitrary under this
    framework, so one can directly compute a tcrdist on an
    AIRR, MIXCR, VDJTools, or other formatted file without any
    reformatting.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    df_airr = pd.read_csv("dash_beta_airr.csv")

    # Choose the metrics you want to apply to each CDR
    metrics = {
        'cdr3_aa': pw.metrics.nb_vector_tcrdist,
        'cdr2_aa': pw.metrics.nb_vector_tcrdist,
        'cdr1_aa': pw.metrics.nb_vector_tcrdist,
    }

    # Choose the weights that are right for you.
    weights = {'cdr3_aa': 3, 'cdr2_aa': 1, 'cdr1_aa': 1}

    def _tcrdist_kargs(ntrim, ctrim, fixed_gappos):
        # Build a fresh argument dict per column; only the trimming and
        # gap-position settings differ between columns.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': fixed_gappos,
        }

    # Provide arguments for the distance metrics
    kargs = {
        'cdr3_aa': _tcrdist_kargs(ntrim=3, ctrim=2, fixed_gappos=False),
        'cdr2_aa': _tcrdist_kargs(ntrim=0, ctrim=0, fixed_gappos=True),
        'cdr1_aa': _tcrdist_kargs(ntrim=0, ctrim=0, fixed_gappos=True),
    }

    # Here are your distance matrices
    dmats = _pws(df=df_airr,
                 metrics=metrics,
                 weights=weights,
                 kargs=kargs,
                 cpu=1,
                 store=True)

    # Bare lookup kept on purpose: it fails loudly if the 'tcrdist' entry
    # is missing from the result.
    dmats['tcrdist']
Beispiel #4
0
def get_centroid_seq_alpha(df, cdr3_name='cdr3_a_aa'):
    """
    Pick the row of `df` whose summed distance to all rows is smallest and
    return its sequence from column `cdr3_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column `cdr3_name`. When it has three or more
        rows it is passed to `_pws` with metrics keyed on "cdr3_a_aa",
        "pmhc_a_aa", "cdr2_a_aa" and "cdr1_a_aa" (presumably these columns
        must then be present as well -- confirm against `_pws`).
    cdr3_name : str
        Column the returned sequence is read from. The columns used for
        the distance computation are fixed and do not follow this
        argument.

    Returns
    -------
    centroid_seq : value from `df[cdr3_name]`, or None
        Entry at the position with the minimum column sum of the distance
        matrix. With fewer than three rows this is simply the first entry,
        or None when `df` has no rows.
    dmat : the 'tcrdist' entry returned by `_pws`, cast to int, or None
    iloc_idx : positional index of the centroid row, or None
    loc_idx : label of the centroid row in `df.index`, or None

    The last three items are None whenever `df` has fewer than three rows,
    because no distance matrix is computed in that case.

    Notes
    -----
    In case of multiple occurrences of the minimum values, the indices
    corresponding to the first occurrence are returned.
    """
    seqs = df[cdr3_name]
    if len(seqs) < 3:
        # Return a single value (not a one-row Series) so that the first
        # element of the tuple has the same kind in both code paths.
        first_seq = seqs.iloc[0] if len(seqs) else None
        return first_seq, None, None, None

    def _tcrdist_kargs(ntrim, ctrim, fixed_gappos):
        # Fresh dict per region; only trimming / gap handling differ.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': fixed_gappos,
        }

    metrics = {
        "cdr3_a_aa": pw.metrics.nb_vector_tcrdist,
        "pmhc_a_aa": pw.metrics.nb_vector_tcrdist,
        "cdr2_a_aa": pw.metrics.nb_vector_tcrdist,
        "cdr1_a_aa": pw.metrics.nb_vector_tcrdist,
    }

    # Define weights
    weights = {"cdr3_a_aa": 3, "pmhc_a_aa": 1, "cdr2_a_aa": 1, "cdr1_a_aa": 1}

    kargs = {
        "cdr3_a_aa": _tcrdist_kargs(ntrim=3, ctrim=2, fixed_gappos=False),
        "pmhc_a_aa": _tcrdist_kargs(ntrim=0, ctrim=0, fixed_gappos=True),
        "cdr2_a_aa": _tcrdist_kargs(ntrim=0, ctrim=0, fixed_gappos=True),
        "cdr1_a_aa": _tcrdist_kargs(ntrim=0, ctrim=0, fixed_gappos=True),
    }

    dmat = _pws(df=df,
                metrics=metrics,
                weights=weights,
                store=False,
                uniquify=False,
                kargs=kargs)['tcrdist'].astype(int)

    # Column sums give each row's total distance to all rows; argmin picks
    # the first position holding the smallest total.
    iloc_idx = dmat.sum(axis=0).argmin()
    centroid_seq = seqs.to_list()[iloc_idx]
    loc_idx = df.index.to_list()[iloc_idx]
    return centroid_seq, dmat, iloc_idx, loc_idx
Beispiel #5
0
def test_dash_nw_metric_fixed_gappos_False():
    """
    Run `_pws` with `pw.metrics.nw_metric` on the first 100 rows of
    dash2.csv and check that the 'tcrdist' result is 100 x 100.

    All eight alpha/beta columns use the same metric with
    `use_numba=False`; cdr3 columns are weighted 3 and all others 1.
    Despite the test name, no 'fixed_gappos' argument is passed.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    # Order kept as alpha (cdr3, pmhc, cdr2, cdr1) then beta so the dicts
    # iterate exactly as the hand-written versions did.
    regions = (
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    )

    # Define metrics for each region
    metrics = {region: pw.metrics.nw_metric for region in regions}

    # Define weights
    weights = {region: 3 if region.startswith("cdr3") else 1
               for region in regions}

    # A fresh inner dict per region, so no region shares a kargs object.
    kargs = {region: {'use_numba': False} for region in regions}

    df = pd.read_csv("dash2.csv")
    df = df.head(100)
    r = _pws(df=df,
             metrics=metrics,
             weights=weights,
             kargs=kargs,
             cpu=1,
             store=False)
    assert r['tcrdist'].shape[0] == 100
    assert r['tcrdist'].shape[1] == 100
Beispiel #6
0
def test_dash_tcrdist_fixed_gappos_True():
    """
    Run `_pws` with `pw.metrics.nb_vector_tcrdist` on every row of
    dash2.csv, with 'fixed_gappos' set to True for all eight columns, and
    check that the 'tcrdist' result is 1924 x 1924.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    def _make_kargs(ntrim, ctrim):
        # One independent argument dict per call; gap position is fixed
        # for every region in this test.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': True
        }

    column_names = (
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    )

    # Define metrics for each region
    metrics = {}
    # Define weights
    weights = {}
    kargs = {}
    for name in column_names:
        is_cdr3 = name in ("cdr3_a_aa", "cdr3_b_aa")
        metrics[name] = pw.metrics.nb_vector_tcrdist
        # cdr3 columns are weighted 3 and trimmed (3, 2); the rest are
        # weighted 1 and left untrimmed.
        weights[name] = 3 if is_cdr3 else 1
        kargs[name] = _make_kargs(3, 2) if is_cdr3 else _make_kargs(0, 0)

    frame = pd.read_csv("dash2.csv")
    result = _pws(df=frame,
                  metrics=metrics,
                  weights=weights,
                  kargs=kargs,
                  cpu=1,
                  store=False)
    assert result['tcrdist'].shape[0] == 1924
    assert result['tcrdist'].shape[1] == 1924
Beispiel #7
0
def test_pws_rectangular_computation():
    """
    Call `_pws` with two inputs of different length (the first 10 rows of
    dash2.csv as `df`, the whole file as `df2`) and check that the
    'tcrdist' result has shape (10, 1924).

    Relies on `pw`, `pd` and `_pws` being available at module level.
    """
    def _make_kargs(ntrim, ctrim, fixed_gappos):
        # One independent argument dict per call.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': fixed_gappos
        }

    column_names = (
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    )

    metrics = {}
    weights = {}
    kargs = {}
    for name in column_names:
        metrics[name] = pw.metrics.nb_vector_tcrdist
        if name in ("cdr3_a_aa", "cdr3_b_aa"):
            # cdr3 columns: weight 3, trimmed (3, 2), gaps not fixed.
            weights[name] = 3
            kargs[name] = _make_kargs(3, 2, False)
        else:
            # all other columns: weight 1, untrimmed, fixed gap position.
            weights[name] = 1
            kargs[name] = _make_kargs(0, 0, True)

    first_ten = pd.read_csv("dash2.csv")
    first_ten = first_ten.head(10).copy()
    everything = pd.read_csv("dash2.csv")
    result = _pws(df=first_ten,
                  df2=everything,
                  metrics=metrics,
                  weights=weights,
                  kargs=kargs,
                  cpu=1,
                  store=False)
    assert result['tcrdist'].shape == (10, 1924)