def compute_distances(self, df=None):
    """
    Compute pairwise distances for each chain listed in ``self.chains``.

    The chains 'alpha', 'beta', 'gamma' and 'delta' are considered in that
    order. For every one found in ``self.chains``, ``_pws`` is called with
    that chain's ``metrics_*``, ``weights_*`` and ``kargs_*`` attributes
    (suffix a, b, g or d), and the result is passed on to
    ``self._assign_distance_attributes`` together with the chain name.

    Parameters
    ----------
    df : object, optional
        Data handed to ``_pws`` as ``df``. When None, ``self.clone_df``
        is used instead.
    """
    if df is None:
        df = self.clone_df

    # (chain name, suffix of the per-chain metrics_/weights_/kargs_ attributes)
    chain_suffixes = (
        ('alpha', 'a'),
        ('beta', 'b'),
        ('gamma', 'g'),
        ('delta', 'd'),
    )

    for chain, suffix in chain_suffixes:
        if chain not in self.chains:
            continue
        pw_chain = _pws(
            df=df,
            metrics=getattr(self, 'metrics_' + suffix),
            weights=getattr(self, 'weights_' + suffix),
            kargs=getattr(self, 'kargs_' + suffix),
            cpu=self.cpus,
            store=self.store_all_cdr,
        )
        self._assign_distance_attributes(d=pw_chain, chain=chain)
def test_example_7():
    """
    If you don't want to use OOP, but you still want multi-CDR
    tcrdistances on a single chain, using your own metric:

    def my_own_metric(s1,s2):
        return Levenshtein.distance(s1,s2)

    NOTE(review): ``my_own_metric`` is not defined inside this function; it
    has to be resolvable from the surrounding module -- confirm.
    """
    import pandas as pd
    from tcrdist.rep_funcs import _pws

    df = pd.read_csv("dash2.csv")

    # The same beta-chain columns key all three dictionaries.
    columns_b = ["cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa"]

    metrics_b = {col: my_own_metric for col in columns_b}
    weights_b = {col: 1 for col in columns_b}
    # A separate kwargs dict per column, as in a hand-written literal.
    kargs_b = {col: {'use_numba': False} for col in columns_b}

    dmats = _pws(df=df,
                 metrics=metrics_b,
                 weights=weights_b,
                 kargs=kargs_b,
                 cpu=1,
                 uniquify=True,
                 store=True)

    print(dmats.keys())
def test_example_8():
    """
    You want a 'tcrdistance' but you don't want to bother with the
    tcrdist3 framework.

    Note that the column names are completely arbitrary under this
    framework, so one can directly compute a tcrdist on an AIRR, MIXCR,
    VDJTools, or other formatted file without any reformatting.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    df_airr = pd.read_csv("dash_beta_airr.csv")

    # Choose the metrics you want to apply to each CDR
    metrics = {
        'cdr3_aa': pw.metrics.nb_vector_tcrdist,
        'cdr2_aa': pw.metrics.nb_vector_tcrdist,
        'cdr1_aa': pw.metrics.nb_vector_tcrdist,
    }

    # Choose the weights that are right for you.
    weights = {'cdr3_aa': 3, 'cdr2_aa': 1, 'cdr1_aa': 1}

    # Provide arguments for the distance metrics.
    # Settings common to every CDR; each entry below copies them and adds
    # its own trimming / gap-position options.
    shared = {
        'use_numba': True,
        'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
        'dist_weight': 1,
        'gap_penalty': 4,
    }
    kargs = {
        'cdr3_aa': dict(shared, ntrim=3, ctrim=2, fixed_gappos=False),
        'cdr2_aa': dict(shared, ntrim=0, ctrim=0, fixed_gappos=True),
        'cdr1_aa': dict(shared, ntrim=0, ctrim=0, fixed_gappos=True),
    }

    # Here are your distance matrices
    dmats = _pws(df=df_airr,
                 metrics=metrics,
                 weights=weights,
                 kargs=kargs,
                 cpu=1,
                 store=True)

    # The lookup itself is the check: it raises KeyError if no combined
    # 'tcrdist' entry was produced.
    dmats['tcrdist']
def get_centroid_seq_alpha(df, cdr3_name='cdr3_a_aa'):
    """
    Find the row of `df` with the smallest summed alpha-chain tcrdist to
    all rows, and report its CDR3 sequence.

    Distances come from ``_pws`` using ``pw.metrics.nb_vector_tcrdist`` on
    the columns 'cdr3_a_aa', 'pmhc_a_aa', 'cdr2_a_aa' and 'cdr1_a_aa'
    (weights 3, 1, 1, 1). These column names are fixed and do not follow
    `cdr3_name`.

    Parameters
    ----------
    df : DataFrame
        Clones to compare.
    cdr3_name : str
        Column from which the reported sequence is read.

    Returns
    -------
    centroid_seq : str or Series
        Value of `cdr3_name` at the centroid row. When `df` has fewer than
        3 rows this is instead ``df.head(1)[cdr3_name]`` (a Series) and the
        three remaining items are None.
    dmat : object or None
        The 'tcrdist' entry returned by ``_pws``, cast with ``astype(int)``.
    iloc_idx : int or None
        Position of the centroid row (argmin of the column sums of `dmat`).
    loc_idx : object or None
        Label in ``df.index`` at that position.

    Notes
    -----
    On ties the first minimum is expected to win -- this assumes `dmat` is
    a NumPy array, whose ``argmin`` behaves that way; confirm.
    """
    # Too few rows for a meaningful centroid: hand back the first entry.
    if len(df[cdr3_name]) < 3:
        return df.head(1)[cdr3_name], None, None, None

    def _tcrdist_kwargs(ntrim, ctrim, fixed_gappos):
        # Keyword arguments for nb_vector_tcrdist for a single column.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': fixed_gappos,
        }

    columns = ["cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa"]

    metrics = {}
    weights = {}
    kargs = {}
    for col in columns:
        metrics[col] = pw.metrics.nb_vector_tcrdist
        if col == "cdr3_a_aa":
            weights[col] = 3
            kargs[col] = _tcrdist_kwargs(3, 2, False)
        else:
            weights[col] = 1
            kargs[col] = _tcrdist_kwargs(0, 0, True)

    pairwise = _pws(df=df,
                    metrics=metrics,
                    weights=weights,
                    store=False,
                    uniquify=False,
                    kargs=kargs)
    dmat = pairwise['tcrdist'].astype(int)

    column_totals = dmat.sum(axis=0)
    iloc_idx = column_totals.argmin()

    centroid_seq = df[cdr3_name].to_list()[iloc_idx]
    loc_idx = df.index.to_list()[iloc_idx]
    return centroid_seq, dmat, iloc_idx, loc_idx
def test_dash_nw_metric_fixed_gappos_False():
    """
    Run ``_pws`` with ``pw.metrics.nw_metric`` on the eight alpha/beta
    columns for the first 100 rows of dash2.csv and check that the
    'tcrdist' result is 100 x 100.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    columns = [
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    ]

    # Define metrics for each region
    metrics = {col: pw.metrics.nw_metric for col in columns}

    # Define weights: the CDR3 columns count three times, all others once.
    weights = {col: 3 if col.startswith("cdr3") else 1 for col in columns}

    # A separate kwargs dict per column.
    kargs = {col: {'use_numba': False} for col in columns}

    df = pd.read_csv("dash2.csv")
    df = df.head(100)

    r = _pws(df=df,
             metrics=metrics,
             weights=weights,
             kargs=kargs,
             cpu=1,
             store=False)

    assert r['tcrdist'].shape[0] == 100
    assert r['tcrdist'].shape[1] == 100
def test_dash_tcrdist_fixed_gappos_True():
    """
    Run ``_pws`` with ``pw.metrics.nb_vector_tcrdist`` on the eight
    alpha/beta columns of dash2.csv, with ``fixed_gappos=True`` for every
    column, and check that the 'tcrdist' result is 1924 x 1924.
    """
    import pandas as pd
    import pwseqdist as pw
    from tcrdist.rep_funcs import _pws

    def _tcrdist_kwargs(ntrim, ctrim):
        # Keyword arguments for one column; only the trimming varies.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': True,
        }

    columns = [
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    ]

    # Metrics, weights and metric kwargs for each region.
    metrics = {}
    weights = {}
    kargs = {}
    for col in columns:
        metrics[col] = pw.metrics.nb_vector_tcrdist
        if col.startswith("cdr3"):
            # CDR3 columns: weight 3, trimmed at both ends.
            weights[col] = 3
            kargs[col] = _tcrdist_kwargs(3, 2)
        else:
            weights[col] = 1
            kargs[col] = _tcrdist_kwargs(0, 0)

    df = pd.read_csv("dash2.csv")
    r = _pws(df=df,
             metrics=metrics,
             weights=weights,
             kargs=kargs,
             cpu=1,
             store=False)

    assert r['tcrdist'].shape[0] == 1924
    assert r['tcrdist'].shape[1] == 1924
def test_pws_rectangular_computation():
    """
    Run ``_pws`` with both ``df`` (first 10 rows of dash2.csv) and ``df2``
    (the full file) and check that the 'tcrdist' result has shape
    (10, 1924).
    """
    def _tcrdist_kwargs(ntrim, ctrim, fixed_gappos):
        # Keyword arguments for nb_vector_tcrdist for a single column.
        return {
            'use_numba': True,
            'distance_matrix': pw.matrices.tcr_nb_distance_matrix,
            'dist_weight': 1,
            'gap_penalty': 4,
            'ntrim': ntrim,
            'ctrim': ctrim,
            'fixed_gappos': fixed_gappos,
        }

    columns = [
        "cdr3_a_aa", "pmhc_a_aa", "cdr2_a_aa", "cdr1_a_aa",
        "cdr3_b_aa", "pmhc_b_aa", "cdr2_b_aa", "cdr1_b_aa",
    ]

    metrics = {col: pw.metrics.nb_vector_tcrdist for col in columns}

    weights = {}
    kargs = {}
    for col in columns:
        if col.startswith("cdr3"):
            # CDR3 columns: weight 3, trimmed, gap position not fixed.
            weights[col] = 3
            kargs[col] = _tcrdist_kwargs(3, 2, False)
        else:
            weights[col] = 1
            kargs[col] = _tcrdist_kwargs(0, 0, True)

    # Rows: first ten clones; columns: every clone in the file.
    df = pd.read_csv("dash2.csv").head(10).copy()
    df2 = pd.read_csv("dash2.csv")

    r = _pws(df=df,
             df2=df2,
             metrics=metrics,
             weights=weights,
             kargs=kargs,
             cpu=1,
             store=False)

    assert r['tcrdist'].shape == (10, 1924)