def toConsensus(self, cutoff=None, fully_degenerate=False,\
    include_all=False):
    """Returns the consensus sequence from a profile.

    cutoff: cutoff value, determines how much should be covered in a
    position (row) of the profile. Example: pos 0 [.2,.1,.3,.4]
    (CharOrder: TCAG). To cover .65 (=cutoff) we need two characters:
    A and G, which results in the degenerate character R.

    fully_degenerate: determines whether the fully degenerate character
    is returned at a position. For the example above an 'N' would
    be returned.

    include_all: all possibilities are included in the degenerate
    character.
    Example: row = UCAG = [.1,.3,.3,.3] cutoff = .4, consensus = 'V'
    (even though only 2 chars would be enough to reach the cutoff value).

    The Alphabet of the Profile should implement degenerateFromSequence.

    Note that cutoff has priority over fully_degenerate. In other words,
    if you specify a cutoff value and set fully_degenerate to true,
    the calculation will be done with the cutoff value.
    If nothing gets passed in, the maximum argument is chosen. In the
    first example above G will be returned.

    Returns a string: the per-row results joined together.
    """
    #set up some local variables
    co = array(self.CharOrder)
    alpha = self.Alphabet
    data = self.Data

    #determine the action. Cutoff takes priority over fully_degenerate.
    #NOTE: a cutoff of 0 is falsy and is therefore treated like "no cutoff".
    if cutoff:
        result = []
        #one entry per row, used below as the number of characters to keep
        degen = self.rowDegeneracy(cutoff)
        #per row: column indices ordered by ascending value, so that
        #row[-num_to_keep:] are the indices of the largest entries.
        #NOTE: this local name shadows the builtin sorted() in this method.
        sorted = argsort(data)
        if include_all:
            #if include_all include all possibilities in the degen char:
            #first the top num_to_keep indices whose value is nonzero, then
            #every nonzero index whose value ties with the smallest kept one.
            #NOTE(review): 'item in nonzero(...)' and iterating over
            #nonzero(...) assume nonzero returns a flat array of indices
            #(Numeric-style) -- confirm against the array library in use.
            for row_idx, (num_to_keep, row) in enumerate(zip(degen,sorted)):
                to_take = [item for item in row[-num_to_keep:]\
                    if item in nonzero(data[row_idx])] +\
                    [item for item in nonzero(data[row_idx] ==\
                    data[row_idx,row[-num_to_keep]]) if item in\
                    nonzero(data[row_idx])]
                result.append(alpha.degenerateFromSequence(\
                    map(str,take(co, to_take))))
        else:
            #keep only the top num_to_keep indices that have a nonzero value
            for row_idx, (num_to_keep, row) in enumerate(zip(degen,sorted)):
                result.append(alpha.degenerateFromSequence(\
                    map(str,take(co, [item for item in row[-num_to_keep:]\
                    if item in nonzero(data[row_idx])]))))
    elif not fully_degenerate:
        #no cutoff, not fully degenerate: pick the character at the argmax.
        #NOTE(review): presumably argmax works per row here (last axis by
        #default) -- confirm for the array library in use.
        result = take(co, argmax(self.Data))
    else:
        #fully degenerate: every character with a nonzero value in the row
        result = []
        for row in self.Data:
            result.append(alpha.degenerateFromSequence(\
                map(str,take(co, nonzero(row)))))
    return ''.join(map(str,result))
def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy distribution over applicable_actions.

    The Q-value of every action is obtained from
    self.Q(sensation, action) and the exploration rate from
    self.epsilon().  The result is a float array holding one
    probability per action, in the order of applicable_actions:

      * the actions whose Q-value equals the maximum share a total
        mass of 1 - epsilon equally;
      * all other actions share a total mass of epsilon equally;
      * if every action attains the maximum, the result is uniform.

    The probabilities always sum to one, including when several
    actions tie for the highest Q-value.
    """
    Q = array([self.Q(sensation, action) for action in applicable_actions])

    # Vector that is true where Q attains its maximum, false elsewhere.
    mask = (Q == mmax(Q))

    # Count the maxima by walking the mask itself; the length of
    # nonzero(mask) depends on what the array library returns from it.
    num_actions = len(mask)
    num_maxes = len([flag for flag in mask if flag])
    num_others = num_actions - num_maxes

    # zeros() may yield an integer array; adding 0.0 forces floats.
    result = zeros(num_actions) + 0.0

    if num_others == 0:
        # Every action is greedy: spread the whole mass uniformly.
        putmask(result, mask, 1.0 / num_maxes)
        return result

    # Read epsilon once so both shares are based on the same value;
    # float() avoids integer division for integral epsilons.
    epsilon = float(self.epsilon())
    putmask(result, mask, (1.0 - epsilon) / num_maxes)
    putmask(result, mask == 0, epsilon / num_others)
    return result
def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy distribution over applicable_actions.

    The Q-value of every action is obtained from
    self.Q(sensation, action) and the exploration rate from
    self.epsilon().  The result is a float array holding one
    probability per action, in the order of applicable_actions:

      * the actions whose Q-value equals the maximum share a total
        mass of 1 - epsilon equally;
      * all other actions share a total mass of epsilon equally;
      * if every action attains the maximum, the result is uniform.

    The probabilities always sum to one, including when several
    actions tie for the highest Q-value.
    """
    Q = array([self.Q(sensation, action) for action in applicable_actions])

    # Vector that is true where Q attains its maximum, false elsewhere.
    mask = (Q == mmax(Q))

    # Count the maxima by walking the mask itself; the length of
    # nonzero(mask) depends on what the array library returns from it.
    num_actions = len(mask)
    num_maxes = len([flag for flag in mask if flag])
    num_others = num_actions - num_maxes

    # zeros() may yield an integer array; adding 0.0 forces floats.
    result = zeros(num_actions) + 0.0

    if num_others == 0:
        # Every action is greedy: spread the whole mass uniformly.
        putmask(result, mask, 1.0 / num_maxes)
        return result

    # Read epsilon once so both shares are based on the same value;
    # float() avoids integer division for integral epsilons.
    epsilon = float(self.epsilon())
    putmask(result, mask, (1.0 - epsilon) / num_maxes)
    putmask(result, mask == 0, epsilon / num_others)
    return result
def safe_sum_p_log_p(a, base=None):
    """Returns the sum of p * log(p) over the nonzero items of a.

    a: array (any shape); it is flattened first and zero entries are
    dropped before the logarithm is taken, so they contribute nothing.

    base: if given (and truthy), the natural logarithms are rescaled
    to this base by dividing by log(base).
    """
    values = ravel(a)
    # only nonzero entries take part in the calculation
    nonzero_values = take(values, nonzero(values))
    log_values = log(nonzero_values)
    if base:
        # change of base: log_b(x) = ln(x) / ln(b)
        log_values /= log(base)
    weighted = nonzero_values * log_values
    return sum(weighted)
def find_in_radius(self, key, radius):
    """
    Return the entries of self.db lying within radius of key.

    self.db is iterated as a sequence of (x, v) pairs; the x parts are
    stacked into an array and compared against key via matrixnorm.
    NOTE(review): matrixnorm is defined elsewhere; presumably it yields
    one distance per stored x -- confirm.

    Returns a pair (entries, distances): the matching self.db items and
    their distances, in matching order.  An empty (falsy) db gives
    two empty lists.
    """
    if self.db:
        points = array([point for point, _value in self.db])
        dists = matrixnorm(key - points)
        # positions whose distance does not exceed the radius
        hits = nonzero(dists <= radius)
        entries = [self.db[i] for i in hits]
        hit_dists = [dists[i] for i in hits]
        return (entries, hit_dists)
    return [], []
def setIds(self, id_fun=lambda x: x.Data.split("_")[-1]):
    """
    Sets "LeafLabel", "LeafCts", and "ContainsAll" attributes.

    id_fun: function that takes a node and returns its label.  The
    default returns the text to the right of the last underscore in
    the node's Data attribute, e.g. "HSA" for a Data of "1234_HSA".
    Several leaves may share one label (say, multiple human
    sequences in the tree).

    LeafLabel (leaves only) is the result of id_fun for that leaf.

    LeafCts is an array with one slot per distinct label, counting
    the leaves below (or at) the node that carry that label.

    ContainsAll is true when, for every label present under the node,
    the node holds all leaves of the tree with that label, i.e. those
    labels occur nowhere else in the tree.

    This is used by the uniqueIds function to remove duplicate species
    from the tree but can be used for any label you choose.
    """
    leaf_labels = [id_fun(leaf) for leaf in self.TerminalDescendants]
    unique_labels = list(set(leaf_labels))
    n_labels = len(unique_labels)
    # label -> slot in the count arrays
    slot_of = dict((label, pos) for pos, label in enumerate(unique_labels))

    # tree-wide number of leaves per label
    tree_counts = zeros(n_labels)
    for leaf_label in leaf_labels:
        tree_counts[slot_of[leaf_label]] += 1

    # children are visited before their parent, so each child's LeafCts
    # is ready by the time the parent adds them up
    for node in self.traverse(self_before=False, self_after=True):
        if node.Children:
            node.LeafCts = zeros(n_labels)
            for child in node.Children:
                node.LeafCts += child.LeafCts
        else:
            node.LeafLabel = id_fun(node)
            node.LeafCts = zeros(n_labels)
            node.LeafCts[slot_of[node.LeafLabel]] = 1

        # over the labels present under this node, compare the node's
        # counts with the tree-wide counts; no difference means the
        # node holds every leaf carrying those labels
        present = nonzero(node.LeafCts)
        missing = sum(take(tree_counts, present) - take(node.LeafCts, present))
        node.ContainsAll = (missing == 0)
def safe_log(a):
    """Returns the log (base 2) of each nonzero item in a.

    a: Numeric array

    Zero items are left as zero in the result; a itself is not
    modified, the work is done on a Float64 copy.

    WARNING: log2 is only defined on positive numbers, so make sure
    there are no negative numbers in the array.

    Always returns an array of floats, to avoid unexpected results
    when it is applied to an array holding only integers.
    """
    result = array(a.copy(), Float64)
    # NOTE(review): the put() below only reaches result if ravel()
    # hands back a view on it rather than a copy -- the code relies on it.
    flat_view = ravel(result)
    nonzero_idx = nonzero(flat_view)
    nonzero_logs = log2(take(flat_view, nonzero_idx))
    # overwrite the nonzero slots with their logs, zeros stay as they are
    put(flat_view, nonzero_idx, nonzero_logs)
    return result
def normalizeSequences(self):
    """Normalizes the data by sequences (the columns) to one.

    It does not make sense to normalize anything with negative
    numbers in there. However, the method does NOT check for that,
    because it would slow down the calculations too much. It will work,
    but you might get very unexpected results.

    Raises a ProfileError when one or more columns add up to zero.
    It checks explicitly for that to avoid OverflowErrors,
    ZeroDivisionErrors, and infinities in the results. In that case
    self.Data is left untouched.

    WARNING: this method works in place with respect to the Profile
    object, not with respect to the Data attribute. Normalization
    rebinds self.Data to a new array.
    """
    col_sums = sum(self.Data)

    # Collect the offending columns explicitly instead of relying on the
    # truth value of an elementwise comparison or on the return type of
    # nonzero(), both of which differ between array libraries.
    zero_indices = [idx for idx, total in enumerate(col_sums) if total == 0]
    if zero_indices:
        # call form of raise: valid in both Python 2 and Python 3
        raise ProfileError(
            "Can't normalize profile, columns at indices %s add up to zero"
            % (zero_indices))

    self.Data = self.Data/col_sums