def test_randomSequence(self):
    """randomSequence: 99% of new frequencies should be within 3*SD

    Builds a random profile, normalizes its positions, draws n random
    sequences from it and checks that at most 1% of the observed values
    deviate from n*p by more than three standard deviations, where the
    standard deviation is taken as sqrt(n*p*(1-p)).
    """
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    r = random([r_num, c_num])
    p = Profile(r, alpha[:c_num])
    p.normalizePositions()
    d = p.Data
    n = 1000

    # Test only works on normalized profile, b/c of 1-d below
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3

    a = Alignment([p.randomSequence() for x in range(n)])

    def absoluteProfile(alignment, char_order):
        """Lay out alignment.columnFrequencies() as a rows x chars array."""
        # use the argument rather than the enclosing variable `a`
        f = alignment.columnFrequencies()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                col = char_order.index(i)
                res[row, col] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    failure = abs(ap - means) > three_stds
    # float() keeps the failure ratio from being truncated to 0 by integer
    # division, which would make the assertion pass unconditionally
    assert sum(sum(failure)) / float(num_elements) <= 0.01
def calc_contingency_expected(matrix):
    """Calculates expected frequencies from a table of observed frequencies

    The input matrix is a dict2D object and represents a frequency table
    with different variables in the rows and columns. (observed
    frequencies as values)

    The expected value is calculated with the following equation:
        Expected = row_total x column_total / overall_total

    The returned matrix (dict2D) has lists of the observed and the
    expected frequency as values, i.e. result[row][col] is
    [observed, expected]. The input matrix itself is only read; the
    result is built on matrix.copy().
    """
    # transpose a copy of the matrix for calculating column totals
    t_matrix = matrix.copy()
    t_matrix.transpose()

    # float() so the expected value is not truncated for integer counts
    overall_total = float(sum(list(matrix.Items)))

    # make new matrix for storing results
    result = matrix.copy()

    # each column total is computed once and reused for every row
    column_sums = {}

    # populate result with expected values
    for row in matrix:
        row_sum = sum(matrix[row].values())
        for item in matrix[row]:
            if item not in column_sums:
                column_sums[item] = sum(t_matrix[item].values())
            expected = (row_sum * column_sums[item]) / overall_total
            result[row][item] = [result[row][item], expected]
    return result
def regress_origin(x,y):
    """Returns coefficients of the regression "y=ax+b" forced through origin.

    Requires vectors x and y of same length that support elementwise
    multiplication.
    See p. 351 of Zar (1999) Biostatistical Analysis.

    Returns (slope, intercept) as a tuple; the intercept is always 0.
    """
    cross_products = sum(x * y)
    squares = sum(x * x)
    slope = cross_products / squares
    return slope, 0
def go(self, xn):
    """Feed one input sample through the filter and return the output.

    self.b and self.a are the coefficient vectors applied to the input
    history (self.x) and the output history (self.y) respectively:

        yn = dot(b, x) - dot(a, y)

    On the first call (self.x is None) both histories are initialised as
    if the input had always been xn: x is filled with xn and y with
    xn * sum(b) / (1 + sum(a)).

    xn: the new input sample.
    Returns the new output sample yn; self.x and self.y are updated.
    """
    # identity test: once self.x holds an array, `== None` would compare
    # elementwise instead of answering "not initialised yet"
    if self.x is None:
        self.x = ones(len(self.b)) * xn * 1.0
        self.y = ones(len(self.a)) * xn * 1.0 * sum(self.b) / (1 + sum(self.a))

    # shift the newest input in at the front, dropping the oldest
    self.x = concatenate([[xn], self.x[:-1]])
    yn = dot(self.b, self.x) - dot(self.a, self.y)
    # same shift for the output history
    self.y = concatenate([[yn], self.y[:-1]])
    return yn
def msum(ar):
    """Total all elements of an array of arbitrary dimension.

    Works for slices and other arrays where ar.flat is undefined.
    """
    from Numeric import sum
    if len(ar.shape) != 1:
        # total every sub-array recursively, then add the totals up
        return sum([msum(sub) for sub in ar])
    return sum(ar)
def _alpha_scaled(self,obsIndices):
    """Computes the scaled forward values for an observation sequence.

    obsIndices: sequence of observation indices used to index self.B.

    Returns (alpha_scaled, scaling_factors): two lists with one entry per
    observation. Each scaling factor is the sum of the unscaled forward
    vector of that step, and each alpha_scaled entry is that vector
    divided by its scaling factor.
    """
    emission, transition = self.B, self.A
    scaling_factors = []
    alpha_scaled = []

    def store(alpha_t):
        # record the normalizer and the normalized vector        (92b)
        total = sum(alpha_t)
        scaling_factors.append(total)
        alpha_scaled.append(alpha_t / total)

    # initialization                                             (19)
    store(emission[obsIndices[0]] * self.pi)
    # induction on the previously scaled vector                  (92a)
    for obs in obsIndices[1:]:
        store(matrixmultiply(alpha_scaled[-1], transition) * emission[obs])
    return alpha_scaled, scaling_factors
def center_of_mass_one_array(data,weight_idx=-1):
    """Calculates the center of mass for a dataset

    data should be an array of x1,...,xn,r coordinates, where r is the
        weight of the point
    weight_idx: column of data that holds the weights (default: last)

    Returns the weighted coordinate sums divided by the summed weights.
    """
    data = array(data)
    # list() so the weight column can be deleted from the index list on
    # Python 3 as well, where range() is not a mutable list
    coord_idx = list(range(data.shape[1]))
    del coord_idx[weight_idx]
    coordinates = take(data, coord_idx, 1)
    # one-element index tuple keeps the weights as a column
    weights = take(data, (weight_idx,), 1)
    return sum(coordinates * weights) / sum(weights)
def center_of_mass_two_array(coordinates,weights):
    """Calculates the center of mass for a set of weighted coordinates

    coordinates should be an array of coordinates
    weights should be an array of weights. Should have same number of
        items as the coordinates. Can be either row or column.

    Returns the weighted coordinate sums divided by the summed weights.
    """
    coordinates = array(coordinates)
    weights = array(weights)
    # A flat weight vector has to be applied per point (per row). Relying
    # on a broadcasting error to detect this fails silently when the
    # number of points equals the number of dimensions, so a 1-D weight
    # vector is always turned into a column for multi-dimensional input.
    if len(coordinates.shape) > 1 and len(weights.shape) == 1:
        weights = weights[:, NewAxis]
    return sum(coordinates * weights) / sum(weights)
def entropy(X):
    """Computes the entropy (log base 2) of a histogram held in sequence X.

    X is divided by its sum before the entropy is taken, so it does not
    have to be normalized by the caller.
    """
    from Numeric import log, sum

    def p_log2_p(p):
        # zero entries contribute nothing
        if p != 0:
            return p * (log(p) / log(2))
        return 0

    total = float(sum(X))
    P = X / total
    return -sum(map(p_log2_p, P))
def chi_square_from_Dict2D(data):
    """Chi Square test on a Dict2D

    data is a Dict2D. The values are a list of the observed (O)
    and expected (E) frequencies, (can be created with
    calc_contingency_expected)

    The chi-square value (test) is the sum of (O-E)^2/E over the items
    in data

    degrees of freedom are calculated from data as:
    (r-1)*(c-1) if cols and rows are both > 1
    otherwise it is the number of rows or columns (whichever is greater
    than 1) minus 1

    Returns (test, chi_high(test, df)).
    Raises ValueError if data has no rows or no columns.
    """
    num_rows = len(data)
    num_cols = len([col for col in data.Cols])
    # check for an empty table first; otherwise a table with one row and
    # no columns (or vice versa) would slip through with df == -1
    if num_rows == 0 or num_cols == 0:
        raise ValueError("data matrix must have data")

    test = sum([((item[0] - item[1]) * (item[0] - item[1])) / item[1]
                for item in data.Items])

    if num_rows == 1:
        df = num_cols - 1
    elif num_cols == 1:
        df = num_rows - 1
    else:
        df = (num_rows - 1) * (num_cols - 1)
    return test, chi_high(test, df)
def analyse_transitions(T):
    """Print a table of the most likely result per (state, action) pair.

    T: 3-dimensional array indexed as T[state, action, result]; each
       T[s, a] is passed through normalize_sum before being examined.

    For every state one line is printed holding, per action, the largest
    normalized value and the index at which it occurs. Nothing is
    returned.
    """
    from Numeric import zeros
    states, actions, results = T.shape
    # presumably the "* 0.0" is there to obtain floating-point arrays
    # from zeros() -- TODO confirm
    entropies = zeros((states, actions)) * 0.0
    counts = zeros((states, actions)) * 0.0
    max_prob = zeros((states, actions)) * 0.0
    max_act = zeros((states, actions)) * 0.0
    for s in range(states):
        for a in range(actions):
            # NOTE(review): entropies and counts are filled in but never
            # printed or returned by this function
            entropies[s, a] = entropy(normalize_sum(T[s, a]))
            counts[s, a] = sum(T[s, a])
            max_prob[s, a] = max(normalize_sum(T[s, a]))
            max_act[s, a] = argmax(normalize_sum(T[s, a]))
    # header row: one fixed-width column label per character of 'FBLR'
    print ' : ',
    for c in 'FBLR':
        print '%10s' % c,
    print
    print '-------------------------------------------------------------'
    # one row per state: "max probability (index of that maximum)"
    for r in range(states):
        print '%6d' % r, ': ',
        for c in range(actions):
            print '%6.2f (%2d)' % (max_prob[r, c], max_act[r, c]),
        print
def positionalGC(self, purge_unwanted=True):
    """Returns GC, P1, P2 P3. Use purge_unwanted=False to get raw counts.

    purge_unwanted is handed to self.positionalBases(); the result is
    normalized, and the G + C value of each of its items is collected.
    The first element returned is the sum of those values divided by 3.
    """
    bases = self.positionalBases(purge_unwanted)
    bases.normalize()
    gc_by_position = []
    for position in bases:
        gc_by_position.append(position['G'] + position['C'])
    overall = sum(gc_by_position) / 3
    return [overall] + gc_by_position
def eigenvalue_vec(v):
    """
    @param v: Vector of number data.
    @type v: Numeric array
    @return: (transpose(v) * v), i.e. the sum of the squared elements of
             v, which is the eigen-value of "matrix" v, as float or int
    """
    squared = v * v
    return sum(squared)
def quad(func, a, b, n=5):
    """val = quad(func,a,b,n=5)

    Integrate func(x) from a to b using Gaussian Quadrature of order n.

    func is called once with the whole array of sample points, so it has
    to accept an array argument.
    """
    nodes, weights = P_roots(n)
    span = b - a
    # map the nodes returned by P_roots from [-1, 1] onto [a, b]
    points = span * (nodes + 1) / 2.0 + a
    return span / 2.0 * sum(weights * func(points))
def eigenvalue_vec(v):
    """
    @param v: Vector of number data.
    @type v: Numeric array
    @return: (transpose(v) * v), the sum of the elementwise squares of v;
             the eigen-value of "matrix" v as float or int
    """
    elementwise_square = v * v
    total = sum(elementwise_square)
    return total
def vec_inner(v):
    """
    @param v: Vector of number data.
    @type v: Numeric array
    @return: transpose(v) * v, the inner product of v with itself
             (float or int)
    """
    products = v * v
    return sum(products)
def min_dist(coord, surface):
    """
    Return minimum distance between coord and surface.

    The squared differences between coord and each row of surface are
    summed along axis 1; the square root of the smallest sum is returned.
    """
    offsets = surface - coord
    squared_distances = sum(offsets * offsets, 1)
    nearest = min(squared_distances)
    return sqrt(nearest)
def safe_sum_p_log_p(a, base=None):
    """Calculates the sum of p * log(p) safely for an array that may
    contain zeros.

    a: array of values; it is flattened and only nonzero entries are used
    base: base of the logarithm; natural log is used when base is None
        (or otherwise false)
    """
    values = ravel(a)
    nonzero_values = take(values, nonzero(values))
    log_values = log(nonzero_values)
    if base:
        # change of base: log_b(p) = ln(p) / ln(b)
        log_values /= log(base)
    weighted = nonzero_values * log_values
    return sum(weighted)
def weightedMean(data, sigma):
    """Weighted mean of a sequence of numbers with given standard deviations.

    |data| is a list of measurements, |sigma| a list with corresponding
    standard deviations. Each measurement is weighted by 1/sigma**2.

    Returns weighted mean and corresponding standard deviation.
    Raises ValueError if |data| and |sigma| differ in length.
    """
    # only names are imported from Numeric, so the module name itself is
    # not bound here; call array() directly instead of Numeric.array()
    from Numeric import array, sqrt, sum
    if len(data) != len(sigma):
        raise ValueError("data and sigma must have the same length")
    # multiplying by 1. forces floating-point arrays
    data = 1. * array(data)
    sigma = 1. * array(sigma)
    nom = sum(data / sigma**2)
    denom = sum(1. / sigma**2)
    mean = nom / denom
    sig = sqrt(1. / denom)
    return mean, sig
def _score_profile(self, profile, offset=0):
    """Returns score of the profile against the input_profile.

    profile: Profile of a sequence or alignment that has to be scored
    offset: where to start the matching procedure

    For every window of profile.Data that is as long as self.Data, the
    score is the sum of the elementwise product of the two.

    This function doesn't do any input validation. That is done in 'score'
    See method 'score' for more information.
    """
    width = len(self.Data)                      # profile length
    last_start = len(profile.Data) - width      # last full window
    scores = [
        sum(sum(self.Data * profile.Data[start:start + width, :]))
        for start in range(offset, last_start + 1)
    ]
    return array(scores)
def __getitem__(self, key):
    """Normalizes key and treats T=U.

    The key is first passed through self.Mask. A two-character key is
    treated as a pair of bases (e.g. GC for GC content): the values of
    both bases in a normalized BaseUsage built from self are added up.
    Any other key is looked up through the parent class.
    """
    key = self.Mask(key)
    if len(key) != 2:
        return super(CodonUsage, self).__getitem__(key)
    # pair of bases, e.g. GC for GC content
    base_freqs = BaseUsage(self)
    base_freqs.normalize()
    return sum([base_freqs.get(base, 0) for base in key])
def fisher(probs):
    """Uses Fisher's method to combine multiple tests of a hypothesis.

    -2 * SUM(ln(P)) gives chi-squared distribution with 2n degrees of
    freedom.

    probs: sequence of probabilities to combine.
    Returns chi_high of that statistic, or 0.0 if an OverflowError is
    raised while computing it.
    """
    try:
        # a list (not a lazy map object) so sum() gets a real sequence
        log_probs = [log(p) for p in probs]
        return chi_high(-2 * sum(log_probs), 2 * len(probs))
    except OverflowError:
        return 0.0
def norm(a):
    """Returns the norm of a matrix or vector

    Calculates the Euclidean norm of a vector.
    Applies the Frobenius norm function to a matrix
    (a.k.a. Euclidian matrix norm)

    a = Numeric array
    """
    squared_elements = (a * a).flat
    return sqrt(sum(squared_elements))
def fingerprint(self, which_blocks='quartets', include_mean=True,
                normalize=True):
    """Returns fingerprint data for fingerprint plots.

    which_blocks: whether to include only the usual 4-codon quartets (the
                  default), the split blocks only, or all blocks.
    include_mean: whether to include the mean (True).
    normalize: whether to normalize so that the quartets sum to 1 (True)

    Returns an array with one row [G/(G+C), A/(A+U), U+C+A+G] per block,
    followed by a row for the mean when include_mean is set.
    Raises ValueError for an unknown which_blocks option.
    """
    def ratios(U, C, A, G):
        """Return [G/(G+C), A/(A+U)]; 0.5 where the denominator is 0."""
        if G + C:
            g_ratio = G / (G + C)
        else:
            g_ratio = 0.5
        if A + U:
            a_ratio = A / (A + U)
        else:
            a_ratio = 0.5
        return [g_ratio, a_ratio]

    if which_blocks == 'split':
        blocks = self.SplitBlocks
    elif which_blocks == 'quartets':
        blocks = self.SingleAABlocks
    elif which_blocks == 'all':
        blocks = self.Blocks
    else:
        # a real exception class: string exceptions cannot be raised
        raise ValueError(
            "Got invalid option %s for which_blocks:\n" % which_blocks +
            "  (valid options: 'split', 'quartets', 'all').")

    result = []
    for b in blocks:  # iterates over doublet string
        U, C, A, G = [self[b + i] for i in 'UCAG']
        result.append(ratios(U, C, A, G) + [U + C + A + G])
    result = array(result)

    if normalize:  # make the shown bubbles sum to 1
        sum_ = sum(result[:, -1])
        if sum_:
            result[:, -1] /= sum_

    if include_mean:  # calculate mean from all codons
        third = self.positionalBases().Third
        U, C, A, G = [third[i] for i in 'UCAG']
        result = concatenate((result, array([ratios(U, C, A, G) + [1]])))
    return result
def run(self):
    """Superimpose the coordinate sets.

    Reads self.coords, self.reference_coords and self.n, and stores the
    result in self.rot and self.tran such that
    matrixmultiply(coords, rot) + tran superimposes coords onto
    reference_coords.

    Raises Exception if either coordinate set has not been set.
    """
    if self.coords is None or self.reference_coords is None:
        raise Exception("No coordinates set.")
    coords = self.coords
    reference_coords = self.reference_coords
    # center on centroid
    av1 = sum(coords) / self.n
    av2 = sum(reference_coords) / self.n
    coords = coords - av1
    reference_coords = reference_coords - av2
    # correlation matrix
    a = matrixmultiply(transpose(coords), reference_coords)
    u, d, vt = singular_value_decomposition(a)
    self.rot = transpose(matrixmultiply(transpose(vt), transpose(u)))
    # check if we have found a reflection
    if determinant(self.rot) < 0:
        # flip the last row of vt and rebuild, so rot is a proper rotation
        vt[2] = -vt[2]
        self.rot = transpose(matrixmultiply(transpose(vt), transpose(u)))
    self.tran = av2 - matrixmultiply(av1, self.rot)
def __sub__(self, other):
    """
    Calculate distance between two atoms.

    Example:
        >>> distance=atom1-atom2

    @param other: the other atom
    @type other: L{Atom}
    @return: Euclidean distance between self.coord and other.coord
    """
    delta = self.coord - other.coord
    squared_length = sum(delta * delta)
    return sqrt(squared_length)
def test_conservation_of_area(self):
    """Test that coefficients in lp satisfy the dilation equation

    For every available filter the lp coefficients returned by daubfilt
    have to sum to sqrt(2).
    """
    from daubfilt import daubfilt, number_of_filters

    for index in range(number_of_filters):
        order = 2 * (index + 1)
        lowpass, highpass = daubfilt(order)
        deviation = abs(sum(lowpass) - sqrt(2))
        assert allclose(deviation, 0), 'Error == %e' % deviation
def hamming_distance(x,y):
    """Returns the Hamming distance between two arrays.

    The Hamming distance is the number of characters which differ between
    two sequences (arrays).

    WARNING: This function truncates the longest array to the length of
    the shortest one.

    Example:
    ABC, ABB -> 1
    ABCDEFG, ABCEFGH -> 4
    """
    common_length = min(len(x), len(y))
    mismatches = x[:common_length] != y[:common_length]
    return sum(mismatches)
def _ksi(self,obsIndices,alpha,beta):
    """Computes one normalized N x N matrix per pair of adjacent steps.

    obsIndices: sequence of observation indices used to index self.B
    alpha, beta: per-step vectors, indexed as alpha[t][i] and beta[t][j]

    Element [i,j] of the matrix for step t is
    alpha[t][i] * A[i,j] * B[obs_{t+1}][j] * beta[t+1][j], after which the
    whole matrix is divided by the sum of its elements.

    Returns a list with len(obsIndices)-1 such matrices.
    """
    N, A, B = self.N, self.A, self.B
    ksi = []
    for t, next_obs in enumerate(obsIndices[1:]):
        joint = zeros((N, N), Float)
        for i in range(N):
            for j in range(N):
                # numerator of (37)
                joint[i, j] = alpha[t][i] * A[i, j] * B[next_obs][j] * beta[t + 1][j]
        # normalization of (37)
        total = sum(reshape(joint, (N * N,)))
        joint /= total
        ksi.append(joint)
    return ksi
def mat_prod(A, x):
    """
    @param A: 2-dimensional matrix of number data.
    @type A: Numeric array
    @param x: Vector of number data.
    @type x: Numeric array
    @return: b of (Ax = b). Product of: matrix A (m,n) * vector x (n) = vector b (m)
    """
    # calc: Ax = b, one row of A at a time
    products = []
    for row in A:
        products.append(sum(row[:] * x))
    return array(products)
def column_uncertainty(a):
    """Returns uncertainty (Shannon's entropy) for each column in a in BITS

    a: Numeric array (has to be 2-dimensional)

    The uncertainty is calculated in BITS not NATS!!!

    Will return 0 for every empty row, but an empty array for every empty
    column, thanks to this sum behavior:
    >>> sum(array([[]]),1)
    array([0])
    >>> sum(array([[]]))
    zeros((0,), 'l')

    Raises ValueError if a has fewer than two dimensions.
    """
    if len(a.shape) < 2:
        raise ValueError("Array has to be two-dimensional")
    return sum(safe_p_log_p(a))
def mat_prod(A, x):
    """
    @param A: 2-dimensional matrix of number data.
    @type A: Numeric array
    @param x: Vector of number data.
    @type x: Numeric array
    @return: b of (Ax = b). Product of: matrix A (m,n) * vector x (n) = vector b (m)
    """
    # calc: Ax = b. A list comprehension hands array() a real sequence;
    # a lazy map object would be wrapped as a single object instead.
    return array([sum(row[:] * x) for row in A])
def rscu(self):
    """Normalizes self in-place to RSCU, relative synonymous codon usage.

    RSCU divides the frequency of each codon by the sum of the freqs
    for that codon's amino acid.

    Codons for which the lookup fails (KeyError) or whose amino acid
    total is zero (ZeroDivisionError) are left unchanged.
    Returns self.
    """
    code = self.GeneticCode
    # total frequency of each amino acid over all its synonymous codons
    totals = dict([(aa, sum([self[c] for c in codons]))
                   for aa, codons in code.Synonyms.items()])
    for codon in self:
        try:
            self[codon] = self[codon] / totals[code[codon]]
        except (KeyError, ZeroDivisionError):
            pass
    return self
def G_ind(m, williams=False):
    """Returns G test for independence in an r x c table.

    Requires input data as a Numeric array. From Sokal and Rohlf p 738.

    m: the r x c table
    williams: if True, G is divided by the Williams correction factor q

    Returns (G, chi_high(max(G,0), df)) with df = (r-1)*(c-1).
    """
    default_axis_totals = sum(m)
    axis1_totals = sum(m, 1)
    tot = sum(ravel(m))
    df = (len(m) - 1) * (len(m[0]) - 1)

    G = 2 * (safe_sum_p_log_p(m)
             - safe_sum_p_log_p(default_axis_totals)
             - safe_sum_p_log_p(axis1_totals)
             + tot * log(tot))
    if williams:
        q = 1 + ((tot * sum(1.0 / axis1_totals) - 1) *
                 (tot * sum(1.0 / default_axis_totals) - 1) /
                 (6 * tot * df))
        G = G / q
    # G is clamped at 0 before it is handed to chi_high
    return G, chi_high(max(G, 0), df)
def _score_indices(self, seq_indices, offset=0):
    """Returns score of the profile for each slice of the seq_indices

    seq_indices: translation of sequence into indices that match the
    characters in the CharOrder of the profile
    offset: where to start the matching procedure

    For every window of seq_indices that is as long as the profile, the
    score is the sum of self.Data[position, index] over the window.

    This function doesn't do any input validation. That is done in 'score'
    See method 'score' for more information.
    """
    data = self.Data
    pl = len(data)                      # profile length (number of positions)
    positions = range(pl)               # fixed range
    last_start = len(seq_indices) - pl
    scores = []
    for start in range(offset, last_start + 1):
        window = seq_indices[start:start + pl]
        picked = [data[row, col] for row, col in zip(positions, window)]
        scores.append(sum(array(picked)))
    return array(scores)
def test_normalizeSequences(self):
    """normalizeSequences: should normalize or raise appropriate error

    Uses the fixtures self.full, self.empty_row and self.empty_col, which
    are defined outside this method.
    """
    # NOTE(review): the expected values below (2/9, 4/17, ...) are only
    # fractions under true division -- confirm the file enables it
    p = self.full.copy()
    p.normalizeSequences()
    self.assertEqual(p.Data,array([[2/9,4/17],[3/9,5/17],[4/9,8/17]]))
    # after normalization the default-axis sums are expected to be 1
    self.assertEqual(sum(p.Data),[1,1])
    # an empty row is expected to normalize without error
    p = self.empty_row.copy()
    p.normalizeSequences()
    self.assertEqual(p.Data,array([[1,1],[0,0]]))
    # an empty column is expected to raise
    p = self.empty_col.copy()
    self.assertRaises(ProfileError,p.normalizeSequences)
    # all-zero data is expected to raise
    p = Profile(array([[0.0],[0.0]]),"AB")
    self.assertRaises(ProfileError,p.normalizeSequences)
    #negative numbers!!!!!!
    # columns totalling 1 and 1: data is expected to come back unchanged
    p1 = Profile(array([[3,4],[-2,-3]]),"AB")
    p1.normalizeSequences()
    self.assertEqual(p1.Data,array([[3,4],[-2,-3]]))
    # first column totals 0: expected to raise
    p2 = Profile(array([[3,4],[-3,-3]]),"AB")
    self.assertRaises(ProfileError,p2.normalizeSequences)
def test_normalizePositions(self):
    """normalizePositions: should normalize or raise appropriate error

    Uses the fixtures self.full, self.empty_row and self.empty_col, which
    are defined outside this method.
    """
    # NOTE(review): the expected values below (2/6, 3/8, ...) are only
    # fractions under true division -- confirm the file enables it
    p = self.full.copy()
    p.normalizePositions()
    self.assertEqual(p.Data,array([[2/6,4/6],[3/8,5/8],[4/12,8/12]]))
    # after normalization the sums along axis 1 are expected to be 1
    self.assertEqual(sum(p.Data,1),[1,1,1])
    # an empty column is expected to normalize without error
    p = self.empty_col.copy()
    p.normalizePositions()
    self.assertEqual(p.Data,array([[0,1],[0,1]]))
    # an empty row is expected to raise
    p = self.empty_row.copy()
    self.assertRaises(ProfileError,p.normalizePositions)
    # all-zero data is expected to raise
    p = Profile(array([[0.0,0.0]]),"AB")
    self.assertRaises(ProfileError,p.normalizePositions)
    #negative numbers!!!!!!
    # rows totalling 1 and 1: data is expected to come back unchanged
    p1 = Profile(array([[3,-2],[4,-3]]),"AB")
    p1.normalizePositions()
    self.assertEqual(p1.Data,array([[3,-2],[4,-3]]))
    # first row totals 0: expected to raise
    p2 = Profile(array([[3,-3],[4,-3]]),"AB")
    self.assertRaises(ProfileError,p2.normalizePositions)
def pastel(colour, weight=2.4):
    """ Convert colour into a nice pastel shade

    colour: anything accepted by colorConverter.to_rgb
    weight: target value for the sum of the three returned channels

    The colour is first scaled so its largest channel is 1.0, then every
    channel is moved towards 1.0 by the same fraction of its remaining
    headroom so that the channels sum to weight.

    Returns a list of three channel values. A colour that is already pure
    white has no headroom left and is returned unchanged.
    """
    rgb = asarray(colorConverter.to_rgb(colour))
    # scale colour
    maxc = max(rgb)
    if maxc < 1.0 and maxc > 0:
        # scale colour
        scale = 1.0 / maxc
        rgb = rgb * scale
    # now decrease saturation
    total = sum(rgb)
    slack = 0
    for x in rgb:
        slack += 1.0 - x

    # all channels at 1.0: nothing to redistribute, and dividing by the
    # zero slack below would produce invalid channel values
    if slack == 0:
        return [c for c in rgb]

    # want to increase weight from total to weight
    # pick x s.t.  slack * x == weight - total
    # x = (weight - total) / slack
    x = (weight - total) / slack

    rgb = [c + (x * (1.0 - c)) for c in rgb]

    return rgb
def L2norm(ar):
    """Return the square root of the sum of the squared elements of ar."""
    # only sqrt is needed from Numeric here
    from Numeric import sqrt
    return sqrt(sum(ar**2))
def nipals_arr(X, PCs, threshold, E_matrices):
    """
    Extract principal components from X one at a time by iterating
    between a score vector t and a loading vector p until the change in
    the eigenvalue estimate drops below threshold, then removing that
    component from the residual matrix before extracting the next one.

    @param X: 2-dimensional matrix of number data.
    @type X: Numeric array

    @param PCs: Number of Principal Components. Reduced to
                min(rows, cols) of X when larger than that.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence
                      to zero (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not.
                       E-matrices (for each PC) or explained_var
                       (explained variance for each PC).
    @type E_matrices: bool

    @return: (Scores, Loadings, E) where E is the array of error matrices
             when E_matrices is true, and the explained variance per PC
             otherwise
    """
    (rows, cols) = shape(X)

    maxPCs = min(rows, cols) # max number of PCs is min(objects, variables)
    if maxPCs < PCs: PCs = maxPCs # change to maxPCs if PCs > maxPCs

    Scores = zeros((rows, PCs), Float) # all Scores (T)
    Loadings = zeros((PCs, cols), Float) # all Loadings (P)

    # work on a copy so X itself is not modified
    E = X.copy() #E[0]  (should already be mean centered)

    if E_matrices:
        Error_matrices = zeros((PCs, rows, cols), Float) # all Error matrices (E)
    else:
        explained_var = zeros((PCs), Float)
        tot_explained_var = 0

        # total object residual variance for PC[0] (calculating from E[0])
        e_tot0 = 0 # for E[0] the total object residual variance is 100%
        for k in range(rows):
            e_k = E[k, :]**2
            e_tot0 += sum(e_k)

    # NOTE(review): the start vector is taken once, before the loop over
    # components; later components start from the previous t
    t = get_column(E) # extract a column
    p = zeros((cols), Float)

    # do iterations (0, PCs)
    for i in range(PCs):
        convergence = False
        ready_for_compare = False
        E_t = transpose(E)

        while not convergence:
            # step 1: project E onto t to get the loadings
            _temp = eigenvalue_vec(t)
            p = mat_prod( E_t, t) / _temp

            # step 2: scale p to unit length
            _temp = eigenvalue_vec(p)**(-0.5)
            p = p * _temp

            # step 3: project E onto p to get the new scores
            _temp = eigenvalue_vec(p)
            t = mat_prod( E, p) / _temp

            eigenval_new = eigenvalue_vec(t)
            if not ready_for_compare:
                # first pass: there is no previous eigenvalue to compare to
                ready_for_compare = True
            else: # ready for convergence check
                # step 4
                # NOTE(review): the difference is compared without abs(),
                # so a decreasing eigenvalue also counts as converged
                if (eigenval_new - eigenval_old) < threshold * eigenval_new:
                    convergence = True
            eigenval_old = eigenval_new

        # step 5: presumably removes the t*p product from E in place
        # (helper defined elsewhere) -- TODO confirm
        remove_tp_prod( E, t, p)

        # add Scores and Loadings for PC[i] to the collection of all PCs
        Scores[:, i] = t
        Loadings[i, :] = p

        if E_matrices:
            # complete error matrix
            # can calculate object residual variance (row-wise) or
            # variable residual variance (column-wise)
            # total residual variance can also be calculated
            Error_matrices[i] = E.copy()
        else:
            # total object residual variance for E[i]
            e_tot = 0
            for k in range(rows):
                e_k = E[k, :]**2
                e_tot += sum(e_k)
            tot_obj_residual_var = (e_tot / e_tot0)
            # variance explained by this PC alone: what is no longer
            # residual, minus what earlier PCs already explained
            explained_var[i] = 1 - tot_obj_residual_var - tot_explained_var
            tot_explained_var += explained_var[i]

    if E_matrices:
        return Scores, Loadings, Error_matrices
    else:
        return Scores, Loadings, explained_var
def normalize_sum(ar):
    """Return ar divided by the sum of its elements.

    If the sum is 0, ar * 0.0 is returned instead of dividing.
    """
    total = float(sum(ar))
    if total != 0:
        return ar / total
    return ar * 0.0
def _dist(p, q):
    """Return the Euclidean distance between the vectors p and q."""
    delta = p - q
    squared_length = sum(delta * delta)
    return sqrt(squared_length)
def norm(ar, L=2):
    """Return the L-norm of ar: (sum of abs(ar)**L) ** (1/L).

    L defaults to 2.
    """
    magnitudes = abs(ar)
    powered_total = sum(magnitudes ** L)
    return powered_total ** (1.0 / L)