Beispiel #1
0
def lowess(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. Values of f that reach past the last data point (f >= 1)
    use the farthest point to set the local bandwidth. The number of
    robustifying iterations is given by iter. The function will run faster
    with a smaller number of iterations. If the median absolute residual
    becomes exactly zero, the remaining robustifying iterations are skipped,
    because the robustness weights would otherwise require a division by zero.

    x and y may be any equal-length sequences of numbers; they are converted
    to numpy float arrays.  The return value is a numpy float array of that
    length.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print("[%0.2f, ..., %0.2f]" % (result[0], result[-1]))
    [4.85, ..., 84.98]
    """
    x = numpy.asarray(x, dtype=float)
    y = numpy.asarray(y, dtype=float)
    n = len(x)
    # Rank of the neighbour whose distance sets the local bandwidth. Clamp it
    # to the last valid index so that f >= 1 does not index past the end.
    r = min(int(numpy.ceil(f * n)), n - 1)
    h = [numpy.sort(abs(x - x[i]))[r] for i in range(n)]
    # Tricube kernel: w[j, i] is the weight of point j in the fit around x[i].
    w = numpy.clip(abs(([x] - numpy.transpose([x])) / h), 0.0, 1.0)
    w = 1 - w * w * w
    w = w * w * w
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    for iteration in range(iter):
        for i in range(n):
            # Weighted least-squares line through the neighbourhood of x[i],
            # solved in closed form (2x2 normal equations, Cramer's rule).
            weights = delta * w[:, i]
            weights_mul_x = weights * x
            b1 = numpy.dot(weights, y)
            b2 = numpy.dot(weights_mul_x, y)
            A11 = weights.sum()
            A12 = weights_mul_x.sum()
            A21 = A12
            A22 = numpy.dot(weights_mul_x, x)
            determinant = A11 * A22 - A12 * A21
            beta1 = (A22 * b1 - A12 * b2) / determinant
            beta2 = (A11 * b2 - A21 * b1) / determinant
            yest[i] = beta1 + beta2 * x[i]
        residuals = y - yest
        s = numpy.median(numpy.abs(residuals))
        if s == 0:
            # At least half of the points are fitted exactly; computing the
            # bisquare weights would divide by zero and poison yest with NaN.
            break
        # Bisquare robustness weights down-weight points with large residuals.
        delta = numpy.clip(residuals / (6 * s), -1, 1)
        delta = 1 - delta * delta
        delta = delta * delta
    return yest
Beispiel #2
0
    def make_instances_from_counts(self):
        """Build artificial motif instances that reproduce the count matrix.

        The number of instances equals the sum of the counts in the first
        column. A column whose counts add up to fewer letters than that is
        padded with a repeating run of the alphabet letters, and a warning
        is printed.

        Returns the list of instances stored on the motif.
        """
        letters = self.alphabet.letters
        filler = "".join(letters)
        self.has_instances = True
        self.instances = []
        n_instances = sum(self.counts[nuc][0] for nuc in letters)

        # columns[pos] holds every letter seen at aligned position pos
        columns = []
        for pos in range(self.length):
            column = "".join(n * (self.counts[n][pos]) for n in letters)
            if len(column) < n_instances:
                print("WARNING, column too short %i %i"
                      % (len(column), n_instances))
                column += (filler * n_instances)[:(n_instances - len(column))]
            columns.append(column)

        # read the columns row by row to obtain one sequence per instance
        for row in range(n_instances):
            letters_in_row = "".join(columns[pos][row]
                                     for pos in range(self.length))
            self.add_instance(Seq(letters_in_row, self.alphabet))
        return self.instances
Beispiel #3
0
def lowess(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. Values of f that reach past the last data point (f >= 1)
    use the farthest point to set the local bandwidth. The number of
    robustifying iterations is given by iter. The function will run faster
    with a smaller number of iterations. If the median absolute residual
    becomes exactly zero, the remaining robustifying iterations are skipped,
    because the robustness weights would otherwise require a division by zero.

    x and y may be any equal-length sequences of numbers; they are converted
    to numpy float arrays.  The return value is a numpy float array of that
    length.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print("[%0.2f, ..., %0.2f]" % (result[0], result[-1]))
    [4.85, ..., 84.98]
    """
    x = numpy.asarray(x, dtype=float)
    y = numpy.asarray(y, dtype=float)
    n = len(x)
    # Rank of the neighbour whose distance sets the local bandwidth. Clamp it
    # to the last valid index so that f >= 1 does not index past the end.
    r = min(int(numpy.ceil(f * n)), n - 1)
    h = [numpy.sort(abs(x - x[i]))[r] for i in range(n)]
    # Tricube kernel: w[j, i] is the weight of point j in the fit around x[i].
    w = numpy.clip(abs(([x] - numpy.transpose([x])) / h), 0.0, 1.0)
    w = 1 - w * w * w
    w = w * w * w
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    for iteration in range(iter):
        for i in range(n):
            # Weighted least-squares line through the neighbourhood of x[i],
            # solved in closed form (2x2 normal equations, Cramer's rule).
            weights = delta * w[:, i]
            weights_mul_x = weights * x
            b1 = numpy.dot(weights, y)
            b2 = numpy.dot(weights_mul_x, y)
            A11 = weights.sum()
            A12 = weights_mul_x.sum()
            A21 = A12
            A22 = numpy.dot(weights_mul_x, x)
            determinant = A11 * A22 - A12 * A21
            beta1 = (A22 * b1 - A12 * b2) / determinant
            beta2 = (A11 * b2 - A21 * b1) / determinant
            yest[i] = beta1 + beta2 * x[i]
        residuals = y - yest
        s = numpy.median(numpy.abs(residuals))
        if s == 0:
            # At least half of the points are fitted exactly; computing the
            # bisquare weights would divide by zero and poison yest with NaN.
            break
        # Bisquare robustness weights down-weight points with large residuals.
        delta = numpy.clip(residuals / (6 * s), -1, 1)
        delta = 1 - delta * delta
        delta = delta * delta
    return yest
Beispiel #4
0
    def forward_algorithm(self):
        """Run the forward algorithm over the training sequence.

        Follows the forward algorithm described on p57-58 of Durbin et al.
        Positions are one less than in Durbin et al. because the sequence is
        indexed from 0 to (Length - 1) here; the initial values therefore
        live at position -1.

        Returns:

        o A dictionary of forward variables, keyed by
        (state letter, position in the training sequence).

        o The calculated probability of the sequence.
        """
        states = self._seq.states.alphabet.letters

        # -- initialization: the first state starts at 1, all others at 0
        forward_var = {(states[0], -1): 1}
        for later_state in states[1:]:
            forward_var[(later_state, -1)] = 0

        # -- recursion over every position of the training sequence
        seq_length = len(self._seq.emissions)
        for position in range(seq_length):
            for state in states:
                # the recursion helper may decline to give a value (None),
                # in which case nothing is stored for this state/position
                value = self._forward_recursion(state, position, forward_var)
                if value is None:
                    continue
                forward_var[(state, position)] = value

        # -- termination: sum f_{k}(L) * a_{k0} over all states k
        start_state = states[0]
        last_position = seq_length - 1
        transitions = self._mm.transition_prob
        seq_prob = 0
        for state in states:
            seq_prob += (forward_var[(state, last_position)] *
                         transitions[(state, start_state)])

        return forward_var, seq_prob
Beispiel #5
0
    def forward_algorithm(self):
        """Compute the forward variables and the sequence probability.

        This is the forward algorithm as described on p57-58 of Durbin
        et al. Index numbers are one less than in Durbin et al., since the
        sequence is indexed from 0 to (Length - 1); the starting values are
        stored under position -1.

        Returns:

        o A dictionary containing the forward variables, with keys of the
        form (state letter, position in the training sequence).

        o The calculated probability of the sequence.
        """
        letters = self._seq.states.alphabet.letters
        initial_state = letters[0]

        # -- initialization: f_{0}(0) = 1 and f_{k}(0) = 0 for k > 0
        forward_var = {(initial_state, -1): 1}
        forward_var.update(((letter, -1), 0) for letter in letters[1:])

        # -- recursion step (i = 1 .. L in Durbin's numbering)
        n_emissions = len(self._seq.emissions)
        for pos in range(n_emissions):
            for letter in letters:
                result = self._forward_recursion(letter, pos, forward_var)
                # a None result means no value is recorded for this cell
                if result is not None:
                    forward_var[(letter, pos)] = result

        # -- termination step: combine f_{k}(L) with a_{k0} for each state
        final_pos = n_emissions - 1
        seq_prob = 0
        for letter in letters:
            f_k = forward_var[(letter, final_pos)]
            a_k0 = self._mm.transition_prob[(letter, initial_state)]
            seq_prob += f_k * a_k0

        return forward_var, seq_prob
Beispiel #6
0
    def calculate(self, sequence):
        """
        Score every window of a DNA sequence against this matrix.

        - both the matrix and the sequence must use the unambiguous DNA
          alphabet, otherwise a ValueError is raised
        - only the given strand is scanned
        - when exactly one score is produced (sequence and motif have the
          same length), that single number is returned
        - otherwise the full collection of scores is returned: a list from
          the Python fallback, or whatever the compiled _pwm module returns
        """
        #TODO - Code itself tolerates ambiguous bases (as NaN).
        motif_alphabet = self.alphabet
        if not isinstance(motif_alphabet, IUPAC.IUPACUnambiguousDNA):
            raise ValueError(
                "PSSM has wrong alphabet: %s - Use only with DNA motifs"
                % motif_alphabet)
        if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA):
            raise ValueError(
                "Sequence has wrong alphabet: %r - Use only with DNA sequences"
                % sequence.alphabet)

        #TODO - Force uppercase here and optimise switch statement in C
        #by assuming upper case?
        text = str(sequence)
        width = self.length
        text_length = len(text)

        # prefer the compiled helper; fall back to pure Python without it
        try:
            import _pwm
        except ImportError:
            _pwm = None

        if _pwm is not None:
            # one row per motif position, log-odds ordered as A, C, G, T
            logodds = [[self[letter][column] for letter in "ACGT"]
                       for column in range(width)]
            scores = _pwm.calculate(text, logodds)
        else:
            # the C code handles mixed case, so the Python version must too
            text = text.upper()
            scores = []
            for start in range(text_length - width + 1):
                window_score = 0.0
                for offset in range(width):
                    base = text[start + offset]
                    try:
                        window_score += self[base][offset]
                    except KeyError:
                        # a letter missing from the matrix makes the window NaN
                        window_score = _nan
                        break
                scores.append(window_score)

        return scores[0] if len(scores) == 1 else scores
Beispiel #7
0
    def calculate(self, sequence):
        """
        Return the PWM score of a sequence at every possible position.

        - a ValueError is raised unless both the matrix and the sequence
          use the unambiguous DNA alphabet
        - the search is performed on one strand only
        - a single number is returned when exactly one score is produced
          (sequence and motif of equal length)
        - otherwise all scores are returned: a list from the Python
          fallback, or whatever the compiled _pwm module returns
        """
        #TODO - Code itself tolerates ambiguous bases (as NaN).
        dna = IUPAC.IUPACUnambiguousDNA
        if not isinstance(self.alphabet, dna):
            message = "PSSM has wrong alphabet: %s - Use only with DNA motifs"
            raise ValueError(message % self.alphabet)
        if not isinstance(sequence.alphabet, dna):
            message = "Sequence has wrong alphabet: %r - Use only with DNA sequences"
            raise ValueError(message % sequence.alphabet)

        #TODO - Force uppercase here and optimise switch statement in C
        #by assuming upper case?
        residues = str(sequence)
        motif_length = self.length
        n_residues = len(residues)

        # use the fast C code when it can be imported
        try:
            import _pwm
        except ImportError:
            _pwm = None

        if _pwm is None:
            # slower Python code; upper-cased to match the C code, which
            # accepts mixed case
            residues = residues.upper()
            scores = []
            for first in range(n_residues - motif_length + 1):
                total = 0.0
                for column in range(motif_length):
                    try:
                        total += self[residues[first + column]][column]
                    except KeyError:
                        # unknown letter: the whole window scores as NaN
                        total = _nan
                        break
                scores.append(total)
        else:
            # each row holds the log-odds of one position in A, C, G, T order
            rows = [[self[base][column] for base in "ACGT"]
                    for column in range(motif_length)]
            scores = _pwm.calculate(residues, rows)

        if len(scores) == 1:
            return scores[0]
        return scores
Beispiel #8
0
def matches_schema(pattern, schema, ambiguity_character='*'):
    """Report whether a pattern is compatible with a schema.

    Arguments:

    o pattern - The string to check. It may itself contain ambiguity
    characters (assumed to be the same as those in the schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character that stands for "anything".

    Returns 1 when both strings have the same length and every position
    either agrees or is ambiguous in at least one of them, otherwise 0.
    """
    if len(pattern) != len(schema):
        return 0

    for pattern_char, schema_char in zip(pattern, schema):
        # an ambiguity character on either side matches anything
        if ambiguity_character in (pattern_char, schema_char):
            continue
        if pattern_char != schema_char:
            return 0

    return 1
Beispiel #9
0
    def pwm(self, laplace=True):
        """
        Return the PWM for this motif, computing it if necessary.

        A previously computed PWM is returned as-is while it is marked
        current. Otherwise one frequency table per position is built from
        the raw counts (when available) or from the instances.

        if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
        """
        if self._pwm_is_current:
            return self._pwm

        # a fresh PWM has to be computed
        letters = self.alphabet.letters
        self._pwm = []
        for position in range(self.length):
            # start every letter at its pseudocount, or at zero
            if laplace:
                column = dict((letter, self.beta * self.background[letter])
                              for letter in letters)
            else:
                column = dict((letter, 0.0) for letter in letters)

            if self.has_counts:
                # add the raw counts
                for letter in letters:
                    column[letter] += self.counts[letter][position]
            elif self.has_instances:
                # tally the letters found in the instances
                for instance in self.instances:
                    try:
                        column[instance[position]] += 1
                    except KeyError:
                        # letters outside the alphabet are ignored
                        pass

            table = FreqTable.FreqTable(column, FreqTable.COUNT, self.alphabet)
            self._pwm.append(table)
        self._pwm_is_current = 1
        return self._pwm
Beispiel #10
0
def calculate_pseudocounts(motif):
    """Return a dictionary of per-letter pseudocounts for a motif.

    Each pseudocount is the square root of the average column total of
    motif.counts, multiplied by the letter's normalized background
    frequency. A uniform background is used when motif.background is
    empty or None.
    """
    letters = motif.alphabet.letters
    background = motif.background

    # Column sums may be unequal, so work with the average number of
    # instances per column.
    grand_total = 0
    for column in range(motif.length):
        grand_total += sum(float(motif.counts[letter][column])
                           for letter in letters)
    root_instances = math.sqrt(grand_total / motif.length)

    # Work on a copy so the motif's own background is left untouched.
    if background:
        frequencies = dict(background)
    else:
        frequencies = dict.fromkeys(sorted(letters), 1.0)
    norm = sum(frequencies.values())

    pseudocounts = {}
    for letter in letters:
        frequencies[letter] /= norm
        pseudocounts[letter] = root_instances * frequencies[letter]
    return pseudocounts
Beispiel #11
0
    def normalize(self, pseudocounts=None):
        """
        Build a position-weight matrix from this counts matrix.

        pseudocounts selects what is added to every count beforehand:
        None (default) adds nothing, a number is added to every entry, and
        a dictionary supplies one value per letter of the motif alphabet.
        """
        letters = self.alphabet.letters

        # Pseudocount applied to each letter, resolved once per letter.
        if pseudocounts is None:
            offsets = dict((letter, 0.0) for letter in letters)
        elif isinstance(pseudocounts, dict):
            offsets = dict((letter, float(pseudocounts[letter]))
                           for letter in letters)
        else:
            offsets = dict((letter, float(pseudocounts))
                           for letter in letters)

        counts = {}
        for letter in letters:
            counts[letter] = [offsets[letter] + self[letter][position]
                              for position in range(self.length)]

        # Actual normalization is done in the PositionWeightMatrix initializer
        return PositionWeightMatrix(self.alphabet, counts)
Beispiel #12
0
 def std(self, background=None):
     """Return the standard deviation of the score of a motif.

     background maps each letter to a (not necessarily normalized)
     frequency; a uniform distribution over the motif letters is used when
     it is None. Log-odds entries that are NaN or negative infinity do not
     contribute.
     """
     if background is None:
         frequencies = dict.fromkeys(self._letters, 1.0)
     else:
         frequencies = dict(background)
     norm = sum(frequencies.values())
     for letter in self._letters:
         frequencies[letter] /= norm

     total_variance = 0.0
     for position in range(self.length):
         mean = 0.0
         mean_square = 0.0
         for letter in self._letters:
             score = self[letter, position]
             # skip undefined scores and letters that can never occur
             if _isnan(score) or (_isinf(score) and score < 0):
                 continue
             probability = frequencies[letter] * math.pow(2, score)
             mean += probability * score
             mean_square += probability * score * score
         # positions are summed as independent contributions
         total_variance += mean_square - mean * mean
     total_variance = max(total_variance, 0)  # to avoid roundoff problems
     return math.sqrt(total_variance)
Beispiel #13
0
    def representation(self, sequence):
        """Describe the input sequence by how often each schema matches it.

        Arguments:

        o sequence - A Bio.Seq object we are going to represent as schemas.

        Every schema is counted in the sequence, and the counts are divided
        by the largest one so that they lie between zero and one. When
        nothing matches at all, the raw (all zero) counts are returned. The
        values are in the same order as the stored schemas.
        """
        schema_counts = [self._converter.num_matches(schema, str(sequence))
                         for schema in self._schemas]

        min_count = 0
        max_count = max(schema_counts)

        # nothing found: leave the counts untouched
        if not max_count > 0:
            return schema_counts

        # scale the counts to go between zero and one
        return [(float(count) - float(min_count)) / float(max_count)
                for count in schema_counts]
Beispiel #14
0
def matches_schema(pattern, schema, ambiguity_character='*'):
    """Check a pattern against a schema, position by position.

    Arguments:

    o pattern - A string representing the pattern to check. It can
    contain ambiguity characters (assumed to be the same as those in the
    schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character used for ambiguity in the schema.

    Returns 0 if the lengths differ or if some position holds two
    different non-ambiguous characters, and 1 otherwise.
    """
    if len(pattern) != len(schema):
        return 0

    wildcard = ambiguity_character
    # a conflict is a position where neither side is ambiguous and the
    # two characters disagree
    has_conflict = any(
        in_schema != wildcard and in_pattern != wildcard and
        in_pattern != in_schema
        for in_pattern, in_schema in zip(pattern, schema))

    return 0 if has_conflict else 1
Beispiel #15
0
    def _crossover( self, x, no, locs ):
        """Generalized Crossover Function:

           arguments:
               x (int)        - genome number [0|1]
               no (organism,organism)
                              - new organisms
               locs (int list, int list)
                              - lists of locations,
                                [0, +n points+, bound]
                                for each genome (sync'd with x)

            return type: sequence (to replace no[x])
        """
        s = no[ x ].genome[ :locs[ x ][1] ]
        for n in range(1, self._npoints):
            # flipflop between genome_0 and genome_1
            mode = (x+n)%2
            # _generate_locs gives us [0, +n points+, bound]
            #  so we can iterate: { 0:loc(1) ... loc(n):bound }
            t = no[ mode ].genome[ locs[mode][n]:locs[mode][n+1] ]
            if (s):
                s = s + t
            else:
                s = t
        return s
Beispiel #16
0
    def representation(self, sequence):
        """Turn the input sequence into a list of relative schema counts.

        Arguments:

        o sequence - A Bio.Seq object we are going to represent as schemas.

        The number of matches of each stored schema is counted in the
        sequence. If at least one schema matches, all counts are divided by
        the highest count; otherwise the unscaled zero counts are returned.
        The result keeps the order of the stored schemas.
        """
        schema_counts = []
        for schema in self._schemas:
            matches = self._converter.num_matches(schema, str(sequence))
            schema_counts.append(matches)

        # bounds used to bring the counts into the range zero to one
        lowest = 0
        highest = max(schema_counts)

        # only rescale when something was actually found
        if highest > 0:
            schema_counts = [
                (float(matches) - float(lowest)) / float(highest)
                for matches in schema_counts]

        return schema_counts
Beispiel #17
0
 def std(self, background=None):
     """Compute the standard deviation of the motif score.

     background gives a frequency per letter and is normalized here; when
     it is None every motif letter gets the same frequency. Log-odds
     values that are NaN or negative infinity are left out of the sums.
     """
     if background is None:
         weights = dict.fromkeys(self._letters, 1.0)
     else:
         weights = dict(background)
     weight_sum = sum(weights.values())
     for letter in self._letters:
         weights[letter] /= weight_sum

     variance = 0.0
     for column in range(self.length):
         first_moment = 0.0
         second_moment = 0.0
         for letter in self._letters:
             value = self[letter, column]
             usable = not (_isnan(value) or (_isinf(value) and value < 0))
             if usable:
                 p = weights[letter] * math.pow(2, value)
                 first_moment += p * value
                 second_moment += p * value * value
         # variance of this column: E[x^2] - E[x]^2
         variance += second_moment - first_moment * first_moment
     variance = max(variance, 0)  # to avoid roundoff problems
     return math.sqrt(variance)
Beispiel #18
0
def intermediate_points(start, end, graph_data):
    """Describe graph data as 'bins' between position midpoints.

    Arguments:

    o start - The lower bound of the first bin.

    o end - The upper bound of the last bin.

    o graph_data - A sequence of (position, value) pairs, in the order
    in which the bins should be produced.

    Returns a list of (bin_start, bin_end, value) tuples, one per data
    point. Neighbouring bins meet at the midpoint between the positions of
    the two points; the first bin begins at start and the last one stops
    at end. A single data point gives one bin spanning start to end, and
    empty data gives an empty list.
    """
    n_points = len(graph_data)
    if n_points == 0:
        return []

    positions = [point[0] for point in graph_data]
    # Boundaries between neighbouring points. Derived from the data itself
    # rather than from loop leftovers, so two-point input works as well.
    midpoints = [left + (right - left) / 2.
                 for left, right in zip(positions, positions[1:])]
    edges = [start] + midpoints + [end]

    # data in form (X0, X1, val)
    return [(edges[index], edges[index + 1], graph_data[index][1])
            for index in range(n_points)]
Beispiel #19
0
    def normalize(self, pseudocounts=None):
        """
        Create and return a position-weight matrix from the counts matrix.

        If pseudocounts is None (default), the counts are used unchanged.
        If pseudocounts is a number, that number is added to every count.
        If pseudocounts is a dictionary, it must have a key for each letter
        in the alphabet associated with the motif, and the value for a
        letter is added to all counts of that letter.
        """
        letters = self.alphabet.letters
        per_letter = isinstance(pseudocounts, dict)

        # Resolve the pseudocount of every letter before reading any counts.
        added = {}
        for letter in letters:
            if pseudocounts is None:
                added[letter] = 0.0
            elif per_letter:
                added[letter] = float(pseudocounts[letter])
            else:
                added[letter] = float(pseudocounts)

        counts = dict(
            (letter, [added[letter] + self[letter][i]
                      for i in range(self.length)])
            for letter in letters)

        # Actual normalization is done in the PositionWeightMatrix initializer
        return PositionWeightMatrix(self.alphabet, counts)
Beispiel #20
0
def calculate_pseudocounts(motif):
    """Compute pseudocounts for each letter of a motif's alphabet.

    The pseudocount of a letter is its background frequency (normalized
    so that all frequencies sum to one) times the square root of the
    average column total in motif.counts. If motif.background is empty or
    None, all letters get the same background frequency.

    Returns a dictionary mapping each letter to its pseudocount.
    """
    alphabet = motif.alphabet
    background = motif.background
    n_columns = motif.length

    # Column sums can differ, so the average number of instances is used.
    column_totals = (
        sum(float(motif.counts[letter][i]) for letter in alphabet.letters)
        for i in range(n_columns))
    total = sum(column_totals)
    sq_nb_instances = math.sqrt(total / n_columns)

    # Copy the background so that the motif's own mapping is not modified.
    if not background:
        background = dict.fromkeys(sorted(alphabet.letters), 1.0)
    else:
        background = dict(background)
    background_sum = sum(background.values())

    pseudocounts = {}
    for letter in alphabet.letters:
        background[letter] /= background_sum
        pseudocounts[letter] = sq_nb_instances * background[letter]

    return pseudocounts
Beispiel #21
0
 def __init__(self, alphabet, counts):
     """Initialize the matrix and normalize every column.

     After the generic initializer has run, each entry is divided by the
     total of its column, and the row of every letter is then stored as
     a tuple.
     """
     GenericPositionMatrix.__init__(self, alphabet, counts)
     letters = alphabet.letters
     for column in range(self.length):
         column_total = sum(float(self[base][column]) for base in letters)
         for base in letters:
             self[base][column] /= column_total
     # replace the rows by tuples once normalization is complete
     for base in letters:
         self[base] = tuple(self[base])
Beispiel #22
0
def _gen_random_array(n):
    """Return a list of n random numbers whose elements sum to 1.0.

    Each element is drawn with random.random() and then divided by the
    sum of all draws.
    """
    draws = []
    for _ in range(n):
        draws.append(random.random())
    scale = sum(draws)
    return [value / scale for value in draws]
Beispiel #23
0
 def dist_product_at(self, other, offset):
     """Average background-weighted product of two motifs at an offset.

     For every position ``i`` in
     ``range(max(self.length, offset + other.length))`` the column
     ``self[i]`` is paired with ``other[i - offset]``; the products of
     their per-letter values, weighted by ``self.background``, are
     summed and the sum is divided by the number of positions visited.

     Returns 0.0 when there is no position to compare.
     """
     span = max(self.length, offset + other.length)
     if span <= 0:
         return 0.0
     total = 0.0
     for i in range(span):
         f1 = self[i]
         f2 = other[i - offset]
         for n, b in self.background.items():
             total += b * f1[n] * f2[n]
     # Divide by the number of positions, not by the last loop index:
     # the index is one too small and is zero for a single position.
     return total / span
Beispiel #24
0
 def __init__(self, alphabet, counts):
     """Initialize from counts, then normalize every column.

     The base-class initializer stores the counts; each column is then
     divided by its total, and finally every letter's row is converted
     to a tuple.
     """
     GenericPositionMatrix.__init__(self, alphabet, counts)
     alphabet_letters = alphabet.letters
     for pos in range(self.length):
         denominator = sum(float(self[ch][pos]) for ch in alphabet_letters)
         for ch in alphabet_letters:
             self[ch][pos] /= denominator
     # Rows become tuples only after all columns have been rescaled.
     for ch in alphabet_letters:
         self[ch] = tuple(self[ch])
Beispiel #25
0
 def _pwm_calculate(self, sequence):
     """Score every window of the sequence against the log-odds matrix.

     Returns a list with one entry per start position: the sum of the
     log-odds values of the window's letters, or None if any letter of
     the window has no entry in the corresponding log-odds column.
     """
     columns = self.log_odds()
     width = len(columns)
     window_count = len(sequence) - width + 1
     scores = []
     for start in range(window_count):
         total = 0.0
         for j in range(width):
             contribution = columns[j].get(sequence[start + j])
             if contribution is None:
                 # Unknown letter: this window cannot be scored.
                 total = None
                 break
             total += contribution
         scores.append(total)
     return scores
Beispiel #26
0
    def _schema_from_motif(self, motif, motif_list, num_ambiguous):
        """Create a schema from a given starting motif.

        Arguments:

        o motif - A motif with the pattern we will start from.

        o motif_list - The total motifs we have to match to.

        o num_ambiguous - The number of positions of the motif that should
        be turned into ambiguous characters.

        Returns:

        o A string representing the newly generated schema.

        o A list of all of the motifs in motif_list that match the schema.

        Raises:

        o ValueError - if num_ambiguous is larger than the number of
        positions in motif that are not already the ambiguity symbol.
        """
        assert motif in motif_list, \
               "Expected starting motif present in remaining motifs."

        # convert the motif into a list of characters so we can manipulate it
        new_schema_list = list(motif)

        # positions that can still be made ambiguous
        open_positions = [pos for pos, char in enumerate(new_schema_list)
                          if char != self._ambiguity_symbol]
        if num_ambiguous > len(open_positions):
            # retrying random positions here could never succeed
            raise ValueError("Cannot add %s ambiguous positions; only %s "
                             "non-ambiguous positions are available."
                             % (num_ambiguous, len(open_positions)))

        # convert random positions in the motif to ambiguous characters,
        # drawing without replacement so each pick is a new position
        for _ in range(num_ambiguous):
            pick = random.randrange(len(open_positions))
            ambig_pos = open_positions.pop(pick)
            new_schema_list[ambig_pos] = self._ambiguity_symbol

        # convert the schema back to a string
        new_schema = ''.join(new_schema_list)

        # get the motifs that the schema matches
        matched_motifs = [candidate for candidate in motif_list
                          if matches_schema(candidate, new_schema,
                                            self._ambiguity_symbol)]

        return new_schema, matched_motifs
Beispiel #27
0
    def _schema_from_motif(self, motif, motif_list, num_ambiguous):
        """Create a schema from a given starting motif.

        Arguments:

        o motif - A motif with the pattern we will start from.

        o motif_list - The total motifs we have to match to.

        o num_ambiguous - The number of positions of the motif that should
        be turned into ambiguous characters.

        Returns:

        o A string representing the newly generated schema.

        o A list of all of the motifs in motif_list that match the schema.

        Raises:

        o ValueError - if num_ambiguous is larger than the number of
        positions in motif that are not already the ambiguity symbol.
        """
        assert motif in motif_list, \
               "Expected starting motif present in remaining motifs."

        # convert the motif into a list of characters so we can manipulate it
        new_schema_list = list(motif)

        # positions that can still be made ambiguous
        open_positions = [pos for pos, char in enumerate(new_schema_list)
                          if char != self._ambiguity_symbol]
        if num_ambiguous > len(open_positions):
            # retrying random positions here could never succeed
            raise ValueError("Cannot add %s ambiguous positions; only %s "
                             "non-ambiguous positions are available."
                             % (num_ambiguous, len(open_positions)))

        # convert random positions in the motif to ambiguous characters,
        # drawing without replacement so each pick is a new position
        for _ in range(num_ambiguous):
            pick = random.randrange(len(open_positions))
            ambig_pos = open_positions.pop(pick)
            new_schema_list[ambig_pos] = self._ambiguity_symbol

        # convert the schema back to a string
        new_schema = ''.join(new_schema_list)

        # get the motifs that the schema matches
        matched_motifs = [candidate for candidate in motif_list
                          if matches_schema(candidate, new_schema,
                                            self._ambiguity_symbol)]

        return new_schema, matched_motifs
Beispiel #28
0
 def search(self, sequence):
     """Generate (position, instance) pairs for instances found in sequence.

     Every window of self.length letters is compared, as a string, with
     the string form of each instance obtained by iterating over self.
     At most one hit is reported per position.
     """
     width = self.length
     for start in range(0, len(sequence) - width + 1):
         window = str(sequence[start:start + width])
         for candidate in self:
             if str(candidate) == window:
                 yield (start, candidate)
                 # one hit per position is enough; skip remaining instances
                 break
Beispiel #29
0
    def __getitem__(self, index):
        """Return the symbol distribution at a position, padded with background.

        For an index within range(self.length) the matching column of
        self.pwm() is returned; for any other index the result is
        self.background.
        """
        if index not in range(self.length):
            return self.background
        return self.pwm()[index]
Beispiel #30
0
 def search(self, sequence):
     """Yield (position, instance) for each position where an instance matches.

     Windows of self.length letters are taken from the sequence and
     compared as strings against the instances produced by iterating
     over self; the first matching instance is reported per position.
     """
     motif_length = self.length
     last_start = len(sequence) - motif_length
     for pos in range(0, last_start + 1):
         segment = str(sequence[pos:pos + motif_length])
         for hit in self:
             if str(hit) == segment:
                 yield (pos, hit)
                 # avoid reporting several instances for the same position
                 break
Beispiel #31
0
    def min(self):
        """Minimal possible score for this motif.

        returns the score computed for the anticonsensus sequence.
        """
        score = 0.0
        letters = self._letters
        for position in range(0, self.length):
            score += min(self[letter][position] for letter in letters)
        return score
Beispiel #32
0
 def __str__(self):
     """Return the matrix as text: a header of positions, one row per letter.

     The header lists the column indexes in 6-wide fields; every letter
     in self._letters follows on its own line with its values printed
     to two decimals.
     """
     header = "   " + " ".join("%6d" % column for column in range(self.length))
     rows = [header]
     for symbol in self._letters:
         cells = " ".join("%6.2f" % entry for entry in self[symbol])
         rows.append("%c: " % symbol + cells)
     return "\n".join(rows) + "\n"
Beispiel #33
0
    def min(self):
        """Minimal possible score for this motif.

        returns the score computed for the anticonsensus sequence.
        """
        score = 0.0
        letters = self._letters
        for position in range(0, self.length):
            score += min(self[letter][position] for letter in letters)
        return score
Beispiel #34
0
 def __str__(self):
     """Format the matrix as a table with positions as columns.

     The first line holds the column indexes; each following line
     starts with a letter from self._letters and lists that letter's
     values with two decimals.
     """
     index_fields = []
     for i in range(self.length):
         index_fields.append("%6d" % i)
     output = ["   " + " ".join(index_fields)]
     for ch in self._letters:
         value_fields = []
         for value in self[ch]:
             value_fields.append("%6.2f" % value)
         output.append("%c: " % ch + " ".join(value_fields))
     return "\n".join(output) + "\n"
Beispiel #35
0
 def search_instances(self, sequence):
     """Generate (position, instance) pairs for motif instances in a sequence.

     Raises ValueError, once iteration starts, if self.has_instances is
     false.  Otherwise each window of self.length letters is compared as
     a string with every entry of self.instances, and the first match at
     a position is yielded.
     """
     if not self.has_instances:
         raise ValueError ("This motif has no instances")
     width = self.length
     for start in range(0, len(sequence) - width + 1):
         window = str(sequence[start:start + width])
         for candidate in self.instances:
             if str(candidate) == window:
                 yield (start, candidate)
                 # only one hit per position is reported
                 break
Beispiel #36
0
 def ic(self):
     """Return the information content of the motif.

     For every position 2 is added (presumably log2 of a four-letter
     alphabet -- confirm), followed by p * log2(p) for each letter of
     self.alphabet.letters whose value p in self.pwm() is non-zero.
     """
     matrix = self.pwm()
     bits = 0
     for column in range(self.length):
         bits += 2
         for symbol in self.alphabet.letters:
             p = matrix[column][symbol]
             if p != 0:
                 bits += p * math.log(p, 2)
     return bits
Beispiel #37
0
 def _to_horizontal_matrix(self,letters=None,normalized=True):
     """Return the motif as text with one tab-separated row per letter.

     letters defaults to self.alphabet.letters.  With normalized true
     the values come from self.pwm(laplace=False); otherwise they come
     from self.counts, which is first built from the instances when
     self.has_counts is false.
     """
     if letters is None:
         letters = self.alphabet.letters
     positions = range(self.length)
     if normalized:
         # presumably forces pwm() to recompute its result -- confirm
         self._pwm_is_current = False
         matrix = self.pwm(laplace=False)
         rows = ["\t".join(str(matrix[i][a]) for i in positions)
                 for a in letters]
     else:
         if not self.has_counts:
             self.make_counts_from_instances()
         matrix = self.counts
         rows = ["\t".join(str(matrix[a][i]) for i in positions)
                 for a in letters]
     return "".join(row + "\n" for row in rows)
Beispiel #38
0
 def anticonsensus(self):
     """Return the least probable pattern to be generated from this motif.

     For each position the letter with the smallest value in self[i]
     is chosen, visiting letters in sorted order so ties go to the first
     one; "X" is used when no value is below 10.0.  The letters are
     returned as a Seq over self.alphabet.
     """
     chosen = []
     for position in range(self.length):
         column = self[position]
         lowest, pick = 10.0, "X"
         for symbol in sorted(column):
             if column[symbol] < lowest:
                 lowest, pick = column[symbol], symbol
         chosen.append(pick)
     return Seq("".join(chosen), self.alphabet)
Beispiel #39
0
    def __str__(self,masked=False):
        """Return the motif's instances as text, one instance per line.

        If masked is true, one more line is appended that holds "*" for
        every position i where self.mask[i] is true and " " elsewhere.
        """
        # Do not call the accumulator "str": that would hide the builtin
        # used to convert the instances and make the conversion fail.
        lines = [str(inst) + "\n" for inst in self.instances]

        if masked:
            mask_line = "".join("*" if self.mask[i] else " "
                                for i in range(self.length))
            lines.append(mask_line + "\n")
        return "".join(lines)
Beispiel #40
0
    def gc_content(self):
        """Compute the fraction GC content.

        Sums the values of the letters C and G over all positions and
        divides by the sum of the values of every letter in
        self.alphabet.letters.
        """
        letters = self.alphabet.letters
        gc_sum = 0.0
        overall = 0.0
        for column in range(self.length):
            for symbol in letters:
                value = self[symbol][column]
                if symbol in 'CG':
                    gc_sum += value
                overall += value
        return gc_sum / overall
Beispiel #41
0
    def gc_content(self):
        """Compute the fraction GC content.

        The values stored for C and G are accumulated across all
        positions and divided by the accumulated values of all letters
        in self.alphabet.letters.
        """
        gc_part = 0.0
        everything = 0.0
        for pos in range(self.length):
            for ch in self.alphabet.letters:
                entry = self[ch][pos]
                if ch in 'CG':
                    gc_part += entry
                everything += entry
        return gc_part / everything
Beispiel #42
0
 def consensus(self):
     """Return the consensus sequence of a motif.

     For each position the letter with the largest value in self[i] is
     chosen, visiting letters in sorted order so ties go to the first
     one; "X" is used when no value is above 0.  The letters are
     returned as a Seq over self.alphabet.
     """
     chosen = []
     for position in range(self.length):
         column = self[position]
         highest, pick = 0, "X"
         for symbol in sorted(column):
             if column[symbol] > highest:
                 highest, pick = column[symbol], symbol
         chosen.append(pick)
     return Seq("".join(chosen), self.alphabet)
Beispiel #43
0
 def _to_vertical_matrix(self,letters=None):
     """Return the motif as text with one tab-separated row per position.

     letters defaults to self.alphabet.letters; the values are taken
     from self.pwm(laplace=False).
     """
     if letters is None:
         letters = self.alphabet.letters
     # presumably forces pwm() to recompute its result -- confirm
     self._pwm_is_current = False
     matrix = self.pwm(laplace=False)
     rows = []
     for position in range(self.length):
         cells = [str(matrix[position][a]) for a in letters]
         rows.append("\t".join(cells) + "\n")
     return "".join(rows)
Beispiel #44
0
 def dist_product(self, other):
     """Distance based on the product probability of overlapping instances.

     A similarity measure taking into account a product probability of
     generating overlapping instances of two motifs.  Every offset in
     range(-self.length + 1, other.length) is scored with
     dist_product_at and the largest score is kept.

     Returns a tuple (distance, offset): distance is one minus the best
     score divided by self.dist_product_at(self, 0), and offset is the
     negated offset of the best score.  The offset is 0 when no offset
     gives a positive score.
     """
     max_p = 0.0
     # Defined up front so the return below works even if no offset
     # produces a score above zero.
     max_o = 0
     for offset in range(-self.length + 1, other.length):
         if offset < 0:
             p = self.dist_product_at(other, -offset)
         else:  # offset >= 0
             p = other.dist_product_at(self, offset)
         if max_p < p:
             max_p = p
             max_o = -offset
     return 1 - max_p / self.dist_product_at(self, 0), max_o
Beispiel #45
0
    def backward_algorithm(self):
        """Calculate sequence probability using the backward algorithm.

        This implements the backward algorithm, as described on p58-59 of
        Durbin et al.

        Returns:

        o A dictionary containing the backwards variables. This has keys
        of the form (state letter, position in the training sequence),
        and values containing the calculated backward variable.
        """
        # every letter the state path can take
        state_letters = self._seq.states.alphabet.letters

        # Indexes run from 0 to (Length - 1), i.e. one less than the
        # 1 to Length numbering used in Durbin et al.
        last_index = len(self._seq.emissions) - 1
        end_state = state_letters[0]

        # -- initialization: b_{k}(L) = a_{k0} for all k
        backward_var = {}
        for state in state_letters:
            transition = (state, end_state)
            backward_var[(state, last_index)] = \
              self._mm.transition_prob[transition]

        # -- recursion: walk the training sequence backwards
        # (i = L - 1 ... 1 in the numbering of Durbin et al.)
        for i in range(last_index - 1, -1, -1):
            for main_state in state_letters:
                # the recursion helper is responsible for computing the
                # value in a way that prevents underflow errors
                value = self._backward_recursion(main_state, i,
                                                 backward_var)
                if value is None:
                    continue
                backward_var[(main_state, i)] = value

        # the termination step is skipped to avoid recalculations --
        # get sequence probabilities from the forward algorithm instead
        return backward_var