def SliceArray(array, slicelist):
    """Method for slicing an array

    This method can be used for slicing an array by specifying the array
    and a slicelist. The slicelist should have the same length as the
    number of axes in the array.

    **An example:** To slice an array having three axes with
    (12:17,:,[13,26]): 'SliceArray(array,[range(12,17),None,(13,26)])'
    Note that 'None' implies that all the values along the given axis are
    retained. The output array will have the same number of axes as the
    input array.
    """
    from Numeric import take
    for i in range(len(slicelist)):
        if slicelist[i] is None:
            pass
        elif type(slicelist[i]) == type(1):
            array = take(array, [slicelist[i]], i)
        else:
            array = take(array, slicelist[i], i)
    return array
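# Usage sketch (not from the original source): slice a 3-axis array with the
# equivalent of a[1:3, :, [0, 2]]. Assumes the legacy Numeric package is
# available; modern numpy.take behaves the same way for these inputs.
from Numeric import arange, reshape

a = reshape(arange(4 * 3 * 4), (4, 3, 4))
sub = SliceArray(a, [range(1, 3), None, (0, 2)])
assert sub.shape == (2, 3, 2)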
def toConsensus(self, cutoff=None, fully_degenerate=False,\
    include_all=False):
    """Returns the consensus sequence from a profile.

    cutoff: cutoff value, determines how much should be covered in a
    position (row) of the profile. Example: pos 0 [.2,.1,.3,.4]
    (CharOrder: TCAG). To cover .65 (=cutoff) we need two characters:
    A and G, which results in the degenerate character R.

    fully_degenerate: determines whether the fully degenerate character
    is returned at a position. For the example above an 'N' would
    be returned.

    include_all: all possibilities are included in the degenerate
    character. Example: row = UCAG = [.1,.3,.3,.3] cutoff = .4,
    consensus = 'V' (even though only 2 chars would be enough to
    reach the cutoff value).

    The Alphabet of the Profile should implement degenerateFromSequence.

    Note that cutoff has priority over fully_degenerate. In other words,
    if you specify a cutoff value and set fully_degenerate to true,
    the calculation will be done with the cutoff value. If nothing
    gets passed in, the maximum argument is chosen. In the first example
    above G will be returned.
    """
    #set up some local variables
    co = array(self.CharOrder)
    alpha = self.Alphabet
    data = self.Data

    #determine the action. Cutoff takes priority over fully_degenerate
    if cutoff:
        result = []
        degen = self.rowDegeneracy(cutoff)
        sorted = argsort(data)
        if include_all:
            #if include_all, include all possibilities in the degen char
            for row_idx, (num_to_keep, row) in enumerate(zip(degen, sorted)):
                to_take = [item for item in row[-num_to_keep:]\
                    if item in nonzero(data[row_idx])] +\
                    [item for item in nonzero(data[row_idx] ==\
                    data[row_idx, row[-num_to_keep]]) if item in\
                    nonzero(data[row_idx])]
                result.append(alpha.degenerateFromSequence(\
                    map(str, take(co, to_take))))
        else:
            for row_idx, (num_to_keep, row) in enumerate(zip(degen, sorted)):
                result.append(alpha.degenerateFromSequence(\
                    map(str, take(co, [item for item in row[-num_to_keep:]\
                    if item in nonzero(data[row_idx])]))))
    elif not fully_degenerate:
        result = take(co, argmax(self.Data))
    else:
        result = []
        for row in self.Data:
            result.append(alpha.degenerateFromSequence(\
                map(str, take(co, nonzero(row)))))
    return ''.join(map(str, result))
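# Usage sketch, restating the docstring's cases for a single profile row
# [.2, .1, .3, .4] with CharOrder TCAG: toConsensus() returns 'G' (the
# argmax), toConsensus(cutoff=.65) returns 'R' (A + G cover .7 >= .65), and
# toConsensus(fully_degenerate=True) returns 'N'.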
def test_center_of_mass_two_array(self):
    """center_of_mass_two_array should behave correctly"""
    com2 = center_of_mass_two_array
    coor = take(self.square_odd, (0, 1), 1)
    weights = take(self.square_odd, (2,), 1)
    self.assertEqual(com2(coor, weights), array([2, 2]))
    weights = weights.flat
    self.assertEqual(com2(coor, weights), array([2, 2]))
def center_of_mass_one_array(data, weight_idx=-1):
    """Calculates the center of mass for a dataset.

    data should be an array of x1,...,xn,r coordinates, where r is the
    weight of the point.
    """
    data = array(data)
    coord_idx = range(data.shape[1])
    del coord_idx[weight_idx]
    coordinates = take(data, (coord_idx), 1)
    weights = take(data, (weight_idx,), 1)
    return sum(coordinates * weights) / sum(weights)
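# Usage sketch for center_of_mass_one_array, which expects Numeric's
# array/take/sum in its module namespace. One coordinate column plus a
# weight column: (0*1 + 2*3) / (1 + 3) = 1.5.
com = center_of_mass_one_array([[0.0, 1.0], [2.0, 3.0]])
assert abs(com[0] - 1.5) < 1e-9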
def Translate(array, translation):
    """Method for translating an array used by Simulations.Dacapo.Grid"""
    from Numeric import concatenate, take
    import copy
    newarray = array
    size = array.shape
    for dim in range(len(translation)):
        axis = dim - len(translation)
        newarray = concatenate(
            (take(newarray, range(translation[dim], size[axis]), axis),
             take(newarray, range(translation[dim]), axis)), axis)
    # the array is copied to make it contiguous
    return copy.copy(newarray)
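# Usage sketch: Translate cyclically shifts each axis, moving the first
# translation[dim] entries to the back. Assumes the legacy Numeric package.
from Numeric import array

a = array([0, 1, 2, 3])
assert list(Translate(a, [1])) == [1, 2, 3, 0]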
def test_center_of_mass(self):
    """center_of_mass should make the right choice between functional methods
    """
    com = center_of_mass
    com1 = center_of_mass_one_array
    com2 = center_of_mass_two_array
    self.assertEqual(com(self.simple), com1(self.simple))
    self.assertFloatEqual(com(self.more_weight), com1(self.more_weight))
    self.assertEqual(com(self.sec_weight, 1), com1(self.sec_weight, 1))
    coor = take(self.square_odd, (0, 1), 1)
    weights = take(self.square_odd, (2,), 1)
    self.assertEqual(com(coor, weights), com2(coor, weights))
    weights = weights.flat
    self.assertEqual(com(coor, weights), com2(coor, weights))
def output_in_copath_format(self, outfname, node_rank):
    """
    04-20-05
        output go_no2cluster_group
    04-25-05
        cluster_id redefined
    """
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    outf = open(outfname, 'a')
    writer = csv.writer(outf, delimiter='\t')
    for go_no, cluster_group in self.go_no2cluster_group.iteritems():
        counter = 0
        for bicluster in cluster_group.bicluster_list:
            seed_edge_id_list = list(take(cluster_group.edge_id_array, bicluster.row_index_list))
            edge_id_list = seed_edge_id_list + bicluster.added_edge_id_list
            vertex_list, edge_list = get_vertex_edge_list_by_edge_id(curs, edge_id_list)
            no_of_nodes = len(vertex_list)
            connectivity = len(edge_list) * 2.0 / (no_of_nodes * (no_of_nodes - 1))
            vertex_string = '{' + ';'.join(vertex_list) + ';}'
            edge_string = self.edge_string_from_edge_list(edge_list)
            cluster_id = "%s.%s" % (go_no, counter)
            writer.writerow([cluster_id, connectivity, vertex_string, edge_string])
            counter += 1
    del writer
    outf.close()
def seed_grow(self, node_rank, cor_cut_off, euc_dist_cut_off):
    """
    04-20-05
        add candidate edge based on its correlation with consensus_list, (>=0.8)
    """
    sys.stderr.write("Node %s, seed_growing...\n" % node_rank)
    for i in range(self.candidate_edge_array.shape[0]):
        candidate_edge_vector = self.candidate_edge_array[i, :]
        edge_id = int(candidate_edge_vector[0])    #first grab the edge_id
        candidate_edge_vector = candidate_edge_vector[1:]    #then grab its correlation vector
        for go_no, cluster_group in self.go_no2cluster_group.iteritems():
            if edge_id in cluster_group.edge_id_set:
                continue    #this edge is already in the function group
            for j in range(len(cluster_group.bicluster_list)):
                bicluster = cluster_group.bicluster_list[j]
                selected_candidate_edge_vector = list(take(candidate_edge_vector, bicluster.column_index_list))
                edge_data = graph_modeling.ind_cor(selected_candidate_edge_vector, \
                    bicluster.consensus_list, -1)    #leave_one_out = -1 means no leave_one_out
                euc_edge_data = graph_modeling.euc_dist(selected_candidate_edge_vector, \
                    bicluster.consensus_list)
                if edge_data.value >= cor_cut_off and euc_edge_data.value / (euc_edge_data.degree + 2) <= euc_dist_cut_off:
                    #average euclidean distance
                    bicluster.added_edge_id_list.append(edge_id)
                    bicluster.added_edge_matrix.append(selected_candidate_edge_vector)
                    cluster_group.bicluster_list[j] = bicluster    #write it back to update the list entry
    sys.stderr.write("Node %s, Done.\n" % (node_rank))
def __getitem__(self, key):
    """Override built-in. Called when the instance is indexed to retrieve
    a position, e.g.: li = matrix['A']

    Returns a list (a single column vector if only one position is
    specified), or a list of lists (a set of column vectors if several
    positions are specified) of tuples for that position."""
    if type(key) == types.TupleType:
        row, colName = key
        if colName in self.colList:
            col = self.extraCount + self.colList.index(colName)
        else:
            raise KeyError("can't find %s column" % colName)
        return self.array[(row, col)]
    elif type(key) == types.StringType:
        colNames = string.split(key, ":")
        li = []
        for col in colNames:
            # check first in list of alleles
            if col in self.colList:
                # get relative location in list
                relativeLoc = self.colList.index(col)
                # calculate real locations in array
                col1 = relativeLoc * 2 + self.extraCount
                col2 = col1 + 1
                li.append(col1)
                li.append(col2)
            # now check in non-allele metadata
            elif col in self.extraList:
                li.append(self.extraList.index(col))
            else:
                raise KeyError("can't find %s column" % col)
        if len(colNames) == 1:
            # return simply the pair of columns at that location as a list
            return take(self.array, tuple(li[0:2]), 1).tolist()
        else:
            # return the matrix consisting of the column vectors
            # of the designated keys
            return take(self.array, tuple(li), 1).tolist()
    else:
        raise KeyError("keys must be a string or tuple")
def setIds(self, id_fun=lambda x: x.Data.split("_")[-1]):
    """Sets "LeafLabel", "LeafCts", and "ContainsAll" attributes.

    id_fun: function that takes a node and generates a unique id (label)
    for it. By default it creates a label from the string to the right
    of the last underscore in the data attribute. E.g. if a node has a
    data label of 1234_HSA, the function will return the label "HSA";
    the idea being that if your tree has multiple human (HSA) sequences,
    the function will produce multiple nodes with the same label.

    The LeafLabel attribute is the result of the id_fun function.

    The LeafCts attribute is an array with counts of the leaves with the
    same label.

    The ContainsAll attribute is True when the node contains every
    instance of the LeafLabels of its terminal descendants, i.e. the set
    of LeafLabels of its terminal descendants occurs nowhere else in the
    tree. This is used by the uniqueIds function to remove duplicate
    species from the tree, but can be used for any label you choose.
    """
    labels = [id_fun(x) for x in self.TerminalDescendants]
    u_labels = list(set(labels))
    len_u_labels = len(u_labels)
    labels_dict = dict(zip(u_labels, range(len_u_labels)))
    all_cts = zeros(len(u_labels))
    for label in labels:
        all_cts[labels_dict[label]] += 1
    for n in self.traverse(self_before=False, self_after=True):
        if not n.Children:
            setattr(n, "LeafLabel", id_fun(n))
            setattr(n, "LeafCts", zeros(len_u_labels))
            n.LeafCts[labels_dict[n.LeafLabel]] = 1
        else:
            n.LeafCts = zeros(len_u_labels)
            for c in n.Children:
                n.LeafCts += c.LeafCts
        nzero = nonzero(n.LeafCts)
        total = sum(take(all_cts, nzero) - take(n.LeafCts, nzero))
        setattr(n, "ContainsAll", (total == 0))
def safe_sum_p_log_p(a, base=None):
    """Calculates p * log(p) safely for an array that may contain zeros."""
    flat = ravel(a)
    nz = take(flat, nonzero(flat))
    logs = log(nz)
    if base:
        logs /= log(base)
    return sum(nz * logs)
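# Usage sketch: the Shannon entropy of [.5, .5, 0] is 1 bit; the zero entry
# is skipped rather than producing log(0). Assumes Numeric-style
# ravel/take/nonzero/log/sum in scope, as used by the function above.
from Numeric import array

probs = array([0.5, 0.5, 0.0])
assert abs(safe_sum_p_log_p(probs, 2) - (-1.0)) < 1e-9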
def gaussian_activation(self):
    # Gaussian neighborhood activation: each unit's radius is half the
    # average distance to its connected neighbors.
    x = self.dists
    radii = zeros(self.dists.shape) * 0.0
    for u, conn_dict in enumerate(self.connections):
        neighbors = take(self.weights, conn_dict.keys())
        radii[u] = average(matrixnorm(neighbors - self.weights[u]))
    self.__activation = gaussian(x, radii / 2)
def pca(M):
    "Perform PCA on M, return eigenvectors and eigenvalues, sorted."
    T, N = shape(M)
    # if there are fewer rows T than columns N, use snapshot method
    if T < N:
        C = dot(M, t(M))
        evals, evecsC = eigenvectors(C)
        # HACK: make sure evals are all positive
        evals = where(evals < 0, 0, evals)
        evecs = 1. / sqrt(evals) * dot(t(M), t(evecsC))
    else:
        # calculate covariance matrix
        K = 1. / T * dot(t(M), M)
        evals, evecs = eigenvectors(K)
    # sort the eigenvalues and eigenvectors, descending order
    order = (argsort(evals)[::-1])
    evecs = take(evecs, order, 1)
    evals = take(evals, order)
    return evals, t(evecs)
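# Usage sketch: two perfectly correlated columns collapse onto a single
# principal component, so the second eigenvalue is (numerically) zero.
# Assumes the Numeric/LinearAlgebra names used by pca are in scope, as in
# the variant of pca below that imports them explicitly.
from Numeric import array

M = array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
evals, evecs = pca(M)
assert abs(evals[1]) < 1e-6 * abs(evals[0])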
def randomSequence(self, force_accumulate=False, random_f=random):
    """Returns a random sequence matching the current probability matrix.

    Stores a cumulative sum (sort of) of the probability matrix in
    self._accumulated; use force_accumulate to reset it if you change
    the matrix in place (which you shouldn't do anyway).
    """
    co = self.CharOrder
    random_indices = self.randomIndices(force_accumulate, random_f)
    return ''.join(map(str, take(co, random_indices)))
def uniform_gaussian_activation(self):
    # Gaussian activation with a single radius shared by all units: half
    # the average distance between connected units across the whole map.
    x = self.dists
    total = 0.0
    count = 0
    for u, conn_dict in enumerate(self.connections):
        neighbors = take(self.weights, conn_dict.keys())
        total += sum(matrixnorm(neighbors - self.weights[u]))
        count += len(conn_dict)
    self.__activation = gaussian(x, (total / count) / 2)
def pca(M):
    "Perform PCA on M, return eigenvectors and eigenvalues, sorted."
    from Numeric import take, dot, shape, argsort, where, sqrt, transpose as t
    from LinearAlgebra import eigenvectors
    T, N = shape(M)
    # if there are fewer rows T than columns N, use snapshot method
    if T < N:
        C = dot(M, t(M))
        evals, evecsC = eigenvectors(C)
        # HACK: make sure evals are all positive
        evals = where(evals < 0, 0, evals)
        evecs = 1. / sqrt(evals) * dot(t(M), t(evecsC))
    else:
        # calculate covariance matrix
        K = 1. / T * dot(t(M), M)
        evals, evecs = eigenvectors(K)
    # sort the eigenvalues and eigenvectors, descending order
    order = (argsort(evals)[::-1])
    evecs = take(evecs, order, 1)
    evals = take(evals, order)
    return evals, t(evecs)
def output(self, outfname, node_rank):
    """
    04-26-05
        output the information about the bicluster, easy to check
    04-25-05
        cluster_id redefined
    """
    outf = open(outfname, 'a')
    writer = csv.writer(outf, delimiter='\t')
    for go_no, cluster_group in self.go_no2cluster_group.iteritems():
        counter = 0
        for bicluster in cluster_group.bicluster_list:
            cluster_id = "%s.%s" % (go_no, counter)
            seed_edge_id_list = list(take(cluster_group.edge_id_array, bicluster.row_index_list))
            edge_id_list = seed_edge_id_list + bicluster.added_edge_id_list
            writer.writerow([cluster_id, bicluster.score, repr(edge_id_list), repr(bicluster.column_index_list)])
            counter += 1
    outf.close()    #close the appended-to file so rows are flushed
def safe_log(a):
    """Returns the log (base 2) of each nonzero item in a.

    a: Numeric array

    WARNING: log2 is only defined on positive numbers, so make sure
    there are no negative numbers in the array.

    Always returns an array of floats to avoid unexpected results when
    applying it to an array of integers.
    """
    c = array(a.copy(), Float64)
    flat = ravel(c)
    nz_i = nonzero(flat)
    nz_e = take(flat, nz_i)
    log_nz = log2(nz_e)
    put(flat, nz_i, log_nz)
    return c
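# Usage sketch: zeros pass through unchanged while positive entries are
# replaced by their base-2 logs. Assumes Numeric-style array/Float64/ravel/
# nonzero/take/put plus a log2 helper in scope, as used by safe_log above.
from Numeric import array

out = safe_log(array([4.0, 0.0, 1.0]))
assert abs(out[0] - 2.0) < 1e-9 and out[1] == 0.0 and out[2] == 0.0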
def pairs_to_array(pairs, num_items=None, transform=None):
    """Returns an array with the same data as pairs (list of tuples).

    pairs can contain (first, second, weight) or (first, second) tuples.
    If a tuple has 2 items, the weight is assumed to be 1.

    num_items should contain the number of items that the pairs are
    chosen from. If None, it is calculated from the largest item in the
    actual list.

    transform contains an array that maps indices in the pairs
    coordinates to other indices, i.e. transform[old_index] = new_index.
    It is anticipated that transform will be the result of calling
    ungapped_to_gapped on the original, gapped sequence before the
    sequence is passed into something that strips out the gaps (e.g. for
    motif finding or RNA folding).

    WARNING: all tuples must be the same length! (I.e. if weight is
    supplied for any, it must be supplied for all.)

    WARNING: if num_items is actually smaller than the biggest index in
    the list (+ 1, because the indices start with 0), you'll get an
    exception when trying to place the object. Don't do it.
    """
    #handle easy case
    if not pairs:
        return array([])
    data = array(pairs)
    #figure out if we're mapping the indices to gapped coordinates
    if transform:
        #pairs of indices
        idx_pairs = take(transform, data[:, 0:2].astype(Int32))
    else:
        idx_pairs = data[:, 0:2].astype(Int32)
    #figure out biggest item if not supplied
    if num_items is None:
        num_items = int(max(ravel(idx_pairs))) + 1
    #make result array
    result = zeros((num_items, num_items), Float64)
    if len(data[0]) == 2:
        values = 1
    else:
        values = data[:, 2]
    put(ravel(result), idx_pairs[:, 0] * num_items + idx_pairs[:, 1], values)
    return result
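# Usage sketch: two weighted pairs scattered into a 3x3 matrix; num_items is
# inferred from the largest index. Assumes Numeric-style array/zeros/put/
# ravel/take and the Int32/Float64 typecodes in scope, as used above.
pairs = [(0, 1, 0.5), (2, 0, 2.0)]
m = pairs_to_array(pairs)
assert m[0][1] == 0.5 and m[2][0] == 2.0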
def condense_matrix(matrix, smallest_index, large_value):
    """Condenses the rows and columns indicated by smallest_index.

    smallest_index is returned from find_smallest_index. For both the
    rows and the columns, the values for the two indices are averaged.
    The resulting vector replaces the first index in the array, and the
    second index is replaced by an array with large numbers so that it
    is never chosen again by find_smallest_index.
    """
    first_index, second_index = smallest_index
    #get the rows and make a new vector that has their average
    rows = take(matrix, smallest_index)
    new_vector = average(rows)
    #replace the info in the row and column for the first index with new_vector
    matrix[first_index] = new_vector
    matrix[:, first_index] = new_vector
    #replace the info in the row and column for the second index with
    #high numbers so that it is ignored
    matrix[second_index] = large_value
    matrix[:, second_index] = large_value
    return matrix
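# Usage sketch: merging indices 0 and 1 averages their rows/columns into
# index 0 and masks index 1 with the large value. Assumes Numeric-style
# take and an axis-0 average in scope, as used by condense_matrix above.
from Numeric import array

m = array([[0.0, 1.0, 4.0],
           [1.0, 0.0, 6.0],
           [4.0, 6.0, 0.0]])
out = condense_matrix(m, (0, 1), 1e6)
assert out[0][2] == 5.0 and out[1][0] == 1e6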
def splint(xa, ya, y2a, x, derivs=False):
    """Returns the interpolated value from the spline.

    x can either be a scalar or a listable item, in which case a Numeric
    Float array will be returned and the multiple interpolations will be
    done somewhat more efficiently. If derivs is not False, return
    y, y', y'' instead of just y."""
    if type(x) is types.IntType or type(x) is types.FloatType:
        if (x < xa[0] or x > xa[-1]):
            # RangeError is assumed to be defined elsewhere in this module
            raise RangeError, "%f not in range (%f, %f) in splint()" % (x, xa[0], xa[-1])
        khi = max(searchsorted(xa, x), 1)
        klo = khi - 1
        h = float(xa[khi] - xa[klo])
        a = (xa[khi] - x) / h
        b = 1.0 - a
        ylo = ya[klo]
        yhi = ya[khi]
        y2lo = y2a[klo]
        y2hi = y2a[khi]
    else:
        #if we got here, we are processing a list, and should do so more efficiently
        if (min(x) < xa[0] or max(x) > xa[-1]):
            raise RangeError, "(%f, %f) not in range (%f, %f) in splint()" % (min(x), max(x), xa[0], xa[-1])
        npoints = len(x)
        khi = clip(searchsorted(xa, x), 1, len(xa))
        klo = khi - 1
        xhi = take(xa, khi)
        xlo = take(xa, klo)
        yhi = take(ya, khi)
        ylo = take(ya, klo)
        y2hi = take(y2a, khi)
        y2lo = take(y2a, klo)
        h = (xhi - xlo).astype(Float)
        a = (xhi - x) / h
        b = 1.0 - a
    y = a * ylo + b * yhi + ((a * a * a - a) * y2lo + (b * b * b - b) * y2hi) * (h * h) / 6.0
    if derivs:
        return y, (yhi - ylo) / h + ((3 * b * b - 1) * y2hi - (3 * a * a - 1) * y2lo) * h / 6.0, b * y2hi + a * y2lo
    else:
        return y
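# Usage sketch: with all second derivatives zero, splint reduces to plain
# linear interpolation between the knots. Assumes Numeric-style searchsorted/
# clip/take and the Float typecode in scope, as used by splint above.
from Numeric import array, zeros, Float

xa = array([0.0, 1.0, 2.0])
ya = array([0.0, 2.0, 4.0])
y2a = zeros(3, Float)
assert abs(splint(xa, ya, y2a, 0.5) - 1.0) < 1e-9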
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,\
    weights=None):
    """Generates a Profile object from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars, but if
    you want to split degenerate symbols, the alphabet must have a
    Degenerates property). Default is the alphabet of the first seq in
    the alignment.
    char_order (optional): order of the characters in the profile.
    Default is list(alphabet).
    split_degenerates (optional): whether you want the counts for the
    degenerate symbols to be divided over the non-degenerate symbols
    they code for.
    weights (optional): dictionary of seq_id: weight. If not entered,
    all seqs are weighted equally.

    A Profile is a position x character matrix describing which
    characters occur at each position of an alignment. The Profile is
    always normalized, so it gives the probabilities of each character
    at each position.

    Ignoring chars: you can ignore characters in the alignment by not
    putting the char in the CharOrder. If you ignore all characters at a
    particular position, an error will be raised, because the profile
    can't be normalized.

    Splitting degenerates: you can split degenerate characters over the
    non-degenerate characters they code for. For example: R = A or G.
    So, an R at a position counts for 0.5 A and 0.5 G.

    Example:
    seq1    TCAG    weight: 0.5
    seq2    TAR-    weight: 0.25
    seq3    YAG-    weight: 0.25

    AlnToProfile(aln, alphabet=DnaAlphabet, char_order="TACG", weights=w,
        split_degenerates=True)
    Profile:
       T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """
    if alphabet is None:
        alphabet = aln.values()[0].Alphabet
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        # float division, so equal weights don't truncate to zero in Python 2
        weights = dict.fromkeys(aln.keys(), 1.0 / len(aln))

    char_meaning = CharMeaningProfile(alphabet, char_order,\
        split_degenerates)
    profiles = []
    for k, v in aln.items():
        profiles.append(take(char_meaning.Data, asarray(v.upper(), UInt8))\
            * weights[k])
    s = reduce(add, profiles)
    result = Profile(s, alphabet, char_order)
    try:
        result.normalizePositions()
    except:
        raise ValueError,\
            "Probably one of the rows in your profile adds up to zero,\n " +\
            "because you are ignoring all of the characters in the " +\
            "corresponding\n column in the alignment"
    return result
def SeqToProfile(seq, alphabet=None, char_order=None,\
    split_degenerates=False):
    """Generates a Profile object from a Sequence object.

    seq: Sequence object
    alphabet (optional): Alphabet object (if you want to split
    degenerate symbols, the alphabet object should have a Degenerates
    property). Default is the Alphabet associated with the Sequence
    object.
    char_order (optional): the order in which the characters occur in
    the Profile. Default is list(alphabet).
    split_degenerates (optional): whether you want the counts for the
    degenerate symbols to be divided over the non-degenerate symbols
    they code for.

    A Profile is a position x character matrix describing which
    characters occur at each position. In a sequence (as opposed to an
    alignment) only one character occurs at each position. In general, a
    sequence profile will only contain ones and zeros. However, you have
    the possibility of splitting degenerate characters. For example, if
    a position is R, it means that you have a 50/50% chance of A and G.
    It's also possible to ignore characters, which in a sequence profile
    will lead to positions (rows) containing only zeros.

    Example:
    Sequence = ACGU
    SeqToProfile(seq, char_order="UCAG"):
    U C A G
    0 0 1 0   first pos
    0 1 0 0   second pos
    0 0 0 1   third pos
    1 0 0 0   fourth pos

    Sequence = GURY
    SeqToProfile(seq, char_order="UCAG", split_degenerates=True)
    U  C  A  G
    0  0  0  1    first pos
    1  0  0  0    second pos
    0  0  .5 .5   third pos
    .5 .5 0  0    fourth pos

    Characters can also be ignored
    Sequence = ACN-
    SeqToProfile(seq, char_order="UCAG", split_degenerates=True)
    U   C   A   G
    0   0   1   0    first pos
    0   1   0   0    second pos
    .25 .25 .25 .25  third pos
    0   0   0   0    fourth pos <-- contains only zeros
    """
    if alphabet is None:
        alphabet = seq.Alphabet
    if char_order is None:
        char_order = list(alphabet)
    #Determine the meaning of each character based on the alphabet, the
    #character order, and the option to split degenerates
    char_meaning = CharMeaningProfile(alphabet, char_order,\
        split_degenerates)
    #construct profile data
    result_data = take(char_meaning.Data, asarray(seq.upper(), UInt8))
    return Profile(result_data, alphabet, char_order)
def redraw(screen, buf, palette, frames):
    # bitwise-AND interference pattern: advance it one frame and map each
    # cell through the palette before blitting to the screen
    x, y = indices(screensize)
    # this 256 is not ncolors; it's a timing/pacing thing
    buf += ((x + frames) & (y + frames)) >> (frames % 256) >> 3
    buf %= ncolors
    pygame.surfarray.blit_array(screen, take(palette, buf))
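# Driver sketch (hypothetical names): the event loop this redraw belongs to.
# screensize, ncolors, palette and buf are module globals in the original
# demo; only the call pattern is shown here.
#
#     while running:
#         redraw(screen, buf, palette, frames)
#         pygame.display.flip()
#         frames += 1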