Ejemplo n.º 1
0
def nw_metric(self, s1, s2):
    """Return a Needleman-Wunsch score-based distance between s1 and s2.

    Three global alignments are scored with ``parasail.nw_stats`` using the
    BLOSUM62 matrix and gap open/extend penalties of 3: s1 against itself,
    s2 against itself, and s1 against s2. The result is
    ``score(s1, s1) + score(s2, s2) - 2 * score(s1, s2)``.

    May or may not produce a true metric. Details in:
        E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
            Detecting protein sequence conservation via metric embeddings.
            Bioinformatics, 19(Suppl. 1):i122–i129, 2003
    """
    def _score(a, b):
        # One global alignment with the fixed scoring scheme.
        return parasail.nw_stats(a, b, open=3, extend=3,
                                 matrix=parasail.blosum62).score

    self_score_1 = _score(s1, s1)
    self_score_2 = _score(s2, s2)
    cross_score = _score(s1, s2)
    return self_score_1 + self_score_2 - 2 * cross_score
Ejemplo n.º 2
0
def nw_metric(self, s1, s2):
    """Compute a distance from reciprocal Needleman-Wunsch alignment scores.

    All alignments use ``parasail.nw_stats`` with BLOSUM62 and gap
    open/extend penalties of 3. With ``xx`` and ``yy`` the self-alignment
    scores of s1 and s2 and ``xy`` their cross-alignment score, the value
    returned is ``xx + yy - 2 * xy``.

    May or may not produce a true metric. Details in:
        E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
            Detecting protein sequence conservation via metric embeddings.
            Bioinformatics, 19(Suppl. 1):i122–i129, 2003
    """
    # Scoring scheme shared by the three alignments below.
    align_kwargs = dict(open=3, extend=3, matrix=parasail.blosum62)

    score_s1_s1 = parasail.nw_stats(s1, s1, **align_kwargs).score
    score_s2_s2 = parasail.nw_stats(s2, s2, **align_kwargs).score
    score_s1_s2 = parasail.nw_stats(s1, s2, **align_kwargs).score

    return score_s1_s1 + score_s2_s2 - 2 * score_s1_s2
Ejemplo n.º 3
0
def nw_hamming_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """Align two sequences with Parasail's Needleman-Wunsch algorithm and
    return a Hamming-style distance: the number of mismatched positions.
    Gaps count as a mismatch. The penalties and matrix are used for
    alignment purposes only, not in the distance calculation.

    Parameters
    ----------
    s1: string
        string containing amino acid letters
    s2: string
        string containing amino acid letters
    matrix : str
        Name of the parasail attribute holding the substitution matrix
        (looked up with ``getattr(parasail, matrix)``)
    open : int
        gap open penalty passed to parasail
    extend : int
        gap extend penalty passed to parasail

    Returns
    -------
    D : number
        length of the traceback comparison string minus the number of
        matches reported by ``nw_stats``

    Notes
    -----
    .. code-block:: python

        xy = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
        xy_t = parasail.nw_trace(s1, s2, open=open, extend=extend, matrix=matrix)
        hamming_distance = len(xy_t.traceback.comp)-xy.matches
        return hamming_distance"""
    substitution = getattr(parasail, matrix)
    align_kwargs = {'open': open, 'extend': extend, 'matrix': substitution}

    stats = parasail.nw_stats(s1, s2, **align_kwargs)
    trace = parasail.nw_trace(s1, s2, **align_kwargs)

    # The comparison string has one character per aligned column.
    aligned_columns = len(trace.traceback.comp)
    return aligned_columns - stats.matches
Ejemplo n.º 4
0
 def _try_cache(self, e):
     """Return the self-alignment score of ``e``, using ``self.sim_cache``.

     The cache is keyed by the pair ``(e, e)``. On a miss the score is
     computed with ``parasail.nw_stats(e, e, **self.paraParams)`` and
     stored before being returned.
     """
     key = (e, e)
     try:
         return self.sim_cache[key]
     except KeyError:
         # Not cached yet: compute once and remember it.
         self_score = parasail.nw_stats(e, e, **self.paraParams).score
         self.sim_cache[key] = self_score
         return self_score
Ejemplo n.º 5
0
 def _try_cache(self, e):
     """Look up (or compute and cache) the alignment score of ``e`` with itself.

     ``self.sim_cache`` maps ``(e, e)`` to the score returned by
     ``parasail.nw_stats`` called with the keyword arguments held in
     ``self.paraParams``.
     """
     cache_key = (e, e)
     try:
         cached_score = self.sim_cache[cache_key]
     except KeyError:
         # Cache miss: align the sequence against itself and store the score.
         cached_score = parasail.nw_stats(e, e, **self.paraParams).score
         self.sim_cache[cache_key] = cached_score
     return cached_score
Ejemplo n.º 6
0
def nw_metric(s1, s2, matrix='blosum62', open=3, extend=3, return_similarity=False):
    """Compute a distance (or similarity) between two sequences using
    Parasail's Needleman-Wunsch alignment scores.

    Parameters
    ----------
    s1: string
        string containing amino acid letters
    s2: string
        string containing amino acid letters
    matrix : str
        Name of the parasail attribute holding the substitution matrix
        (looked up with ``getattr(parasail, matrix)``)
    open : int
        gap open penalty passed to parasail
    extend : int
        gap extend penalty passed to parasail
    return_similarity : bool
        If True, return the raw s1-vs-s2 alignment score instead of the
        distance; the two self-alignments are then skipped.

    Returns
    -------
    D : number
        distance via reciprocal alignment scores, or the s1-vs-s2 score
        when ``return_similarity`` is True.

    Notes
    -----

    .. code-block:: python

      xx = parasail.nw_stats(s1, s1, open=open, extend=extend, matrix=matrix).score
      yy = parasail.nw_stats(s2, s2, open=open, extend=extend, matrix=matrix).score
      xy = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix).score
      D = xx + yy - 2 * xy
      return D


    May or may not produce a true metric. Details in:
    E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
    Detecting protein sequence conservation via metric embeddings.
    Bioinformatics, 19 (sup 1) 2003
    """
    substitution = getattr(parasail, matrix)

    def _score(a, b):
        # One global alignment with the caller's scoring scheme.
        return parasail.nw_stats(a, b, open=open, extend=extend,
                                 matrix=substitution).score

    cross_score = _score(s1, s2)
    if return_similarity:
        return cross_score

    self_score_1 = _score(s1, s1)
    self_score_2 = _score(s2, s2)
    return self_score_1 + self_score_2 - 2 * cross_score
Ejemplo n.º 7
0
 def calculate(self, query_base_sequence, target_base_sequence,
               gap_open=16, gap_extend=4):
     """Return the fraction of similar positions in a global alignment.

     The two sequences are aligned with ``parasail.nw_stats`` using the
     ``parasail.dnafull`` matrix and the given gap penalties. The result
     is the alignment's ``similar`` count divided by its ``length``,
     as a float.
     """
     stats = parasail.nw_stats(query_base_sequence, target_base_sequence,
                               gap_open, gap_extend, parasail.dnafull)
     # Multiply by 1.0 so the division is floating-point.
     similar_count = stats.similar * 1.0
     return similar_count / stats.length
Ejemplo n.º 8
0
    def metric(self, i1, i2):
        """Distance callback: sklearn specifies that the function receives
        two rows as parameters and returns one value as the distance.

        The first element of each row is mapped through ``self.i2e`` to
        the item that gets aligned. The result is
        ``score(a, a) + score(b, b) - 2 * score(a, b)``.
        """
        item_a = self.i2e[i1[0]]
        self_score_a = self._try_cache(item_a)
        item_b = self.i2e[i2[0]]
        self_score_b = self._try_cache(item_b)

        # Don't need to cache the xy similarity because it doesn't have other uses
        cross_score = parasail.nw_stats(item_a, item_b, **self.paraParams).score

        return self_score_a + self_score_b - 2 * cross_score
Ejemplo n.º 9
0
    def metric(self, i1, i2):
        """Compute the alignment-score distance between two rows.

        sklearn specifies that the function will receive two rows as
        parameters and return one value as distance. Each row's first
        element is translated through ``self.i2e``; self-alignment scores
        come from ``self._try_cache`` and the cross score from
        ``parasail.nw_stats`` with ``self.paraParams``.
        """
        first = self.i2e[i1[0]]
        xx = self._try_cache(first)
        second = self.i2e[i2[0]]
        yy = self._try_cache(second)

        # Don't need to cache the xy similarity because it doesn't have other uses
        xy = parasail.nw_stats(first, second, **self.paraParams).score

        distance = xx + yy - 2 * xy
        return distance
Ejemplo n.º 10
0
def hm_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """
    Align two sequences with Parasail's Needleman-Wunsch algorithm and get
    a Hamming Distance between them: number of mismatched positions.

    Parameters
    ----------
    s1: string
        string containing amino acid letters

    s2: string
        string containing amino acid letters

    matrix : str
        Name of the parasail attribute holding the substitution matrix
        (looked up with ``getattr(parasail, matrix)``)

    open : int
        gap open penalty passed to parasail

    extend : int
        gap extend penalty passed to parasail

    Returns
    -------
    D : number
        length of the traceback comparison string minus the number of
        matches reported by ``nw_stats``

    Notes
    -----

    .. code-block:: python

        xy = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
        xy_t = parasail.nw_trace(s1, s2, open=open, extend=extend, matrix=matrix)
        hamming_distance = len(xy_t.traceback.comp)-xy.matches
        return hamming_distance

    """
    substitution = getattr(parasail, matrix)
    scoring = dict(open=open, extend=extend, matrix=substitution)

    stats_result = parasail.nw_stats(s1, s2, **scoring)
    trace_result = parasail.nw_trace(s1, s2, **scoring)

    # One comparison character per aligned column.
    n_columns = len(trace_result.traceback.comp)
    return n_columns - stats_result.matches
Ejemplo n.º 11
0
def hm_matches(s1, s2, matrix=parasail.blosum62, open=3, extend=3):
    """Return the number of matches in the global alignment of s1 and s2.

    Parameters
    ----------
    s1: string
        first sequence to align
    s2: string
        second sequence to align
    matrix : parasail substitution matrix object
        substitution matrix passed to ``parasail.nw_stats``
        (defaults to ``parasail.blosum62``)
    open : int
        gap open penalty passed to parasail
    extend : int
        gap extend penalty passed to parasail

    Returns
    -------
    The ``matches`` attribute of the ``parasail.nw_stats`` result.
    """
    # Only the statistics result is needed; no traceback alignment is
    # computed because nothing from it contributes to the return value.
    stats = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
    return stats.matches
Ejemplo n.º 12
0
def aligned_mm_metric(s1, s2, open=3, extend=3, matrix=parasail.blosum62):
    """Return alignment length minus match count for s1 aligned to s2.

    The sequences are globally aligned with ``parasail.nw_stats`` using
    the given gap open/extend penalties and substitution matrix; the
    result is ``length - matches`` from the returned statistics.
    """
    stats = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
    aligned_length = stats.length
    return aligned_length - stats.matches