Esempio n. 1
0
    def toConsensus(self, cutoff=None, fully_degenerate=False,\
        include_all=False):
        """Returns the consensus sequence from a profile, as a string.

        cutoff: cutoff value, determines how much should be covered in a
        position (row) of the profile. Example: pos 0 [.2,.1,.3,.4]
        (CharOrder: TCAG). To cover .65 (=cutoff) we need two characters:
        A and G, which results in the degenerate character R.
        
        fully_degenerate: determines whether the fully degenerate character
        is returned at a position. For the example above an 'N' would
        be returned.
       
        include_all: all possibilities are included in the degenerate 
        character. Example: row = UCAG = [.1,.3,.3,.3] cutoff = .4, 
        consensus = 'V' (even though only 2 chars would be enough to 
        reach the cutoff value). Only consulted when a cutoff is given.

        The Alphabet of the Profile should implement degenerateFromSequence.
        
        Note that cutoff has priority over fully_degenerate. In other words,
        if you specify a cutoff value and set fully_degenerate to true, 
        the calculation will be done with the cutoff value. If nothing 
        gets passed in, the maximum argument is chosen. In the first example
        above G will be returned.

        Note also that the cutoff is tested for truth, so a cutoff of 0 is
        handled exactly like cutoff=None.
        """
        #set up some local variables
        co = array(self.CharOrder)
        alpha = self.Alphabet
        data = self.Data

        #determine the action. Cutoff takes priority over fully_degenerate
        if cutoff:
            result = []
            #presumably one entry per row: the number of characters needed
            #to reach the cutoff -- TODO confirm against rowDegeneracy
            degen = self.rowDegeneracy(cutoff)
            #per-row column indices ordered by value; the code below takes
            #the tail of each row, i.e. it assumes ascending order.
            #NOTE: this local name shadows the builtin 'sorted'.
            sorted = argsort(data)
            if include_all:
                #if include_all include all possibilities in the degen char:
                #the first list holds the last num_to_keep indices of the
                #sorted row (skipping zero entries), the second list adds
                #every nonzero position whose value equals the value at
                #row[-num_to_keep], so ties with that value are kept too.
                #The two lists can overlap; duplicates are passed on as is.
                for row_idx, (num_to_keep, row) in enumerate(zip(degen,sorted)):
                    to_take = [item for item in row[-num_to_keep:]\
                    if item in nonzero(data[row_idx])] +\
                    [item for item in nonzero(data[row_idx] ==\
                        data[row_idx,row[-num_to_keep]]) if item in\
                        nonzero(data[row_idx])]
                    result.append(alpha.degenerateFromSequence(\
                    map(str,take(co, to_take))))
            else:
                #only the last num_to_keep indices of the sorted row are
                #used, again skipping positions whose value is zero
                for row_idx, (num_to_keep, row) in enumerate(zip(degen,sorted)):
                    result.append(alpha.degenerateFromSequence(\
                        map(str,take(co, [item for item in row[-num_to_keep:]\
                        if item in nonzero(data[row_idx])]))))

        elif not fully_degenerate: 
            #no cutoff, not fully degenerate: take the character at the
            #argmax of the data (per row, per the docstring example)
            result = take(co, argmax(self.Data))
        else:
            #fully degenerate: every character with a nonzero entry in the
            #row contributes to the degenerate symbol
            result = []
            for row in self.Data:
                result.append(alpha.degenerateFromSequence(\
                map(str,take(co, nonzero(row)))))
        return ''.join(map(str,result))
Esempio n. 2
0
    def epsilon_greedy(self,sensation,applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action.

        The action(s) with the highest Q share a total mass of
        1 - self.epsilon() equally, and the remaining actions share
        self.epsilon() equally.  If every action ties for the highest Q
        the distribution is uniform.  The returned array always sums to 1.
        """
        Q = array([self.Q(sensation,action) for action in applicable_actions])

        # simple epsilon-greedy policy
        # get a vector with a 1 where each max element is, zero elsewhere
        mask = (Q == mmax(Q))

        # Count the set entries by iterating over the mask; this does not
        # depend on the shape of whatever nonzero() returns.
        num_actions = len(mask)
        num_maxes = len([flag for flag in mask if flag])
        num_others = num_actions - num_maxes

        result = zeros(num_actions)+0.0

        if num_others == 0:
            # every action is greedy: spread the mass uniformly instead of
            # handing back the raw mask (which sums to num_actions)
            return result + 1.0/num_actions

        epsilon = float(self.epsilon())

        # the greedy actions split 1-epsilon between them, so ties no
        # longer inflate the total mass above 1
        putmask(result,mask,(1.0-epsilon)/num_maxes)
        putmask(result,mask==0,epsilon/num_others)
        return result
    def epsilon_greedy(self, sensation, applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action.

        The action(s) with the highest Q share a total mass of
        1 - self.epsilon() equally, and the remaining actions share
        self.epsilon() equally.  If every action ties for the highest Q
        the distribution is uniform.  The returned array always sums to 1.
        """
        Q = array([self.Q(sensation, action) for action in applicable_actions])

        # simple epsilon-greedy policy
        # get a vector with a 1 where each max element is, zero elsewhere
        mask = (Q == mmax(Q))

        # Count the set entries by iterating over the mask; this does not
        # depend on the shape of whatever nonzero() returns.
        num_actions = len(mask)
        num_maxes = len([flag for flag in mask if flag])
        num_others = num_actions - num_maxes

        result = zeros(num_actions) + 0.0

        if num_others == 0:
            # every action is greedy: spread the mass uniformly instead of
            # handing back the raw mask (which sums to num_actions)
            return result + 1.0 / num_actions

        epsilon = float(self.epsilon())

        # the greedy actions split 1-epsilon between them, so ties no
        # longer inflate the total mass above 1
        putmask(result, mask, (1.0 - epsilon) / num_maxes)
        putmask(result, mask == 0, epsilon / num_others)
        return result
Esempio n. 4
0
def safe_sum_p_log_p(a, base=None):
    """Returns the sum of p * log(p) over the nonzero entries of a.

    a: array (any shape); it is flattened before use and entries equal
        to zero are dropped so that log is never applied to them.
    base: optional logarithm base. When it is None (or otherwise false)
        the result of log() is used as is; otherwise the logs are divided
        by log(base).
    """
    flattened = ravel(a)
    # keep only the entries log() can be applied to
    kept = take(flattened, nonzero(flattened))
    log_terms = log(kept)
    if base:
        # change of base, done in place on the freshly computed logs
        log_terms /= log(base)
    return sum(kept * log_terms)
    def find_in_radius(self,key,radius):
        """
        Return the entries of self.db whose first element lies within
        radius of key, together with their distances.

        The result is a pair (entries, distances) of parallel lists; both
        are empty when self.db is empty.  Distances come from
        matrixnorm(key - X), where X stacks the first element of every
        db entry, and an entry matches when its distance is <= radius.
        """
        entries = self.db
        if not entries:
            return [],[]

        points = array([point for point,_ in entries])
        distances = matrixnorm(key-points)

        # indices of the entries that are close enough
        hits = nonzero(distances <= radius)
        matched = [entries[idx] for idx in hits]
        matched_dists = [distances[idx] for idx in hits]
        return (matched, matched_dists)
Esempio n. 6
0
    def setIds(self, id_fun=lambda x: x.Data.split("_")[-1]):
        """
        Sets "LeafLabel", "LeafCts", and "ContainsAll" attributes

        id_fun: function that takes a node and returns its label. The
            default returns the part of the node's Data attribute to the
            right of the last underscore, e.g. "HSA" for a node whose
            Data is "1234_HSA". Several leaves may share one label (for
            instance when the tree holds multiple human sequences).

        LeafLabel (leaves only) is the result of id_fun for that leaf.

        LeafCts (every node) is an array with one slot per distinct
            label, holding how many leaves below the node (or the leaf
            itself) carry that label.

        ContainsAll (every node) is True when, for every label present
            under the node, the node holds all leaves in the tree with
            that label, i.e. those labels occur nowhere else in the tree.

        This is used by the uniqueIds function to remove duplicate species
        from the tree but can be used for any label you choose.
        """
        tip_labels = [id_fun(tip) for tip in self.TerminalDescendants]
        distinct = list(set(tip_labels))
        n_distinct = len(distinct)
        # label -> slot in the count arrays
        slot_of = dict((lbl, idx) for idx, lbl in enumerate(distinct))

        # tree-wide number of leaves per label
        tree_cts = zeros(len(distinct))
        for lbl in tip_labels:
            tree_cts[slot_of[lbl]] += 1

        # children are visited before their parent, so every child's
        # LeafCts exists by the time the parent adds them up
        for node in self.traverse(self_before=False, self_after=True):
            if node.Children:
                node.LeafCts = zeros(n_distinct)
                for child in node.Children:
                    node.LeafCts += child.LeafCts
            else:
                node.LeafLabel = id_fun(node)
                node.LeafCts = zeros(n_distinct)
                node.LeafCts[slot_of[node.LeafLabel]] = 1
            # compare only the labels that actually occur under this node
            present = nonzero(node.LeafCts)
            missing = sum(take(tree_cts, present) - take(node.LeafCts, present))
            node.ContainsAll = (missing == 0)
Esempio n. 7
0
def safe_log(a):
    """Returns a float copy of a with each nonzero item replaced by its log2.

    a: Numeric array

    Items equal to zero are left at zero; the input itself is not
    modified.

    WARNING: log2 is only defined on positive numbers, so make sure
    there are no negative numbers in the array.

    The result always holds floats (Float64), which avoids unexpected
    results when the input contains only integers.
    """
    result = array(a.copy(), Float64)
    # flat handle onto result; the put() below writes through it
    values = ravel(result)
    positions = nonzero(values)
    put(values, positions, log2(take(values, positions)))
    return result
Esempio n. 8
0
    def normalizeSequences(self):
        """Normalizes the data by sequences (the columns) to one
        
        It does not make sense to normalize anything with negative
        numbers in there. However, the method does NOT check for that, 
        because it would slow down the calculations too much. It will work, 
        but you might get very unexpected results.

        The method will raise a ProfileError when one or more columns add
        up to zero. It checks explicitly for that to avoid OverflowErrors, 
        ZeroDivisionErrors, and infinities in the results. In that case
        self.Data is left untouched.

        WARNING: this method works in place with respect to the Profile
        object, not with respect to the Data attribute. Normalization
        rebinds self.Data to a new array.
        """
        col_sums = sum(self.Data)
        # Look for zero columns one by one instead of truth-testing the
        # whole comparison array, whose truth value is ambiguous as soon
        # as there is more than one column.
        zero_indices = [idx for idx, total in enumerate(col_sums)
                        if total == 0]
        if zero_indices:
            raise ProfileError(
                "Can't normalize profile, columns at indices %s add up to zero"
                % (zero_indices,))
        self.Data = self.Data/col_sums