def parse_recurrence(self, combined_vector, no_of_edges=0, cor_cut_off=0):
    """
    Return one recurrence value per column of combined_vector.

    combined_vector is turned into a 2d array. For every column the
    values are summed and divided by the number of rows. When cor_cut_off
    is positive the column is first converted into a 0/1 vector (1 where
    the value is >= cor_cut_off), so the result is the fraction of rows
    that reach the cutoff; otherwise the raw values are summed.

    no_of_edges is defunct (the row count of the array is used instead)
    and is only kept so existing callers keep working.

    03-03-05
        replace the combined_vector with combined_sig_vector
        get a fair recurrence_array(see 'redefine recurrence and connectivity' in log_05)
    07-03-05
        combined_vector is already a 2d list.
    08-09-05
        replace no_of_edges with x_dimension, no_of_edges defunct
    """
    cor_array = numarray.array(combined_vector)
    no_of_rows, no_of_columns = cor_array.shape
    denominator = float(no_of_rows)
    binarize = cor_cut_off > 0

    recurrence_array = []
    for column_index in range(no_of_columns):
        column = cor_array[:, column_index]
        if binarize:
            #values >= cor_cut_off count as 1, everything else as 0
            column = numarray.greater_equal(column, cor_cut_off)
        recurrence_array.append(sum(column)/denominator)
        if self.debug:
            print(column)
            print(recurrence_array)
    #thresholding against subgraph_cut_off is handled in gene_stat_plot.py
    return recurrence_array
def get_cluster_accuracy(self, curs, p_gene_table, mcl_id_list, p_value_cut_off=0.01):
    """
    Collect prediction accuracy figures for each cluster in mcl_id_list.

    For every mcl_id the is_correct_lca column of p_gene_table is fetched
    (restricted to rows whose p_value_cut_off is <= the given cutoff).
    A value > 0 counts as a correct prediction, a value >= 0 as a
    prediction with a known answer.

    curs: database cursor offering execute() and fetchall().
    p_gene_table: name of the table to query.
    mcl_id_list: cluster ids to look up.
    p_value_cut_off: upper bound used in the query (default 0.01).

    Returns a list of
        [accuracy, no of correct predictions, no of known predictions,
        total no of predictions, mcl_id]
    in the order of mcl_id_list. Clusters without any row are left out.
    Clusters that have rows but no known prediction are left out as well,
    because their accuracy is undefined (this used to raise
    ZeroDivisionError).

    04-07-05
        get the accuracy, no_of_corrected predictions, known predictions, total predictions for each cluster
    """
    accuracy2cluster = []
    for mcl_id in mcl_id_list:
        #NOTE: the values are interpolated straight into the statement,
        #so all three of them must come from a trusted source.
        curs.execute("select is_correct_lca from %s where mcl_id=%s and p_value_cut_off<=%s"%(p_gene_table,\
            mcl_id, p_value_cut_off))
        rows = curs.fetchall()
        if not rows:
            continue
        flags = [row[0] for row in rows]
        no_of_correct = len([flag for flag in flags if flag > 0])
        no_of_known = len([flag for flag in flags if flag >= 0])
        if no_of_known == 0:
            #nothing to measure the predictions against
            continue
        accuracy = float(no_of_correct)/float(no_of_known)
        accuracy2cluster.append([accuracy, no_of_correct, no_of_known, len(flags), mcl_id])
    return accuracy2cluster
def _gene_stat_leave_one_out(self, row, node_distance_class=None, curs=None):
    """
    Turn one database row into prediction records and file them under
    the row's (recurrence, connectivity) key in self.prediction_tuple2list.

    row is indexed as follows:
        row[0]  mcl_id
        row[1]  gene_no
        row[2]  text list of [p_value, go_no] pairs, expected to look
                like '[[0.001, 5], [0.002, 7]]'
        row[3]  connectivity
        row[4]  unknown_gene_ratio
        row[5]  text list of recurrence values, first and last character
                are delimiters
        row[6]  text list of vertex numbers, same layout as row[5]

    node_distance_class and curs are only handed on to self.L1_match()
    and self.common_ancestor_deep_enough().

    Every call increases self.no_of_records by one. For each
    (p_value, go_no) pair whose depth in self.go_no2depth reaches
    self.depth_cut_off one record
        [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1,
        is_correct_lca, size of vertex_set, unknown_gene_ratio, lca_list]
    is appended. Genes missing from self.known_genes_dict get -1 for the
    three correctness fields and an empty lca_list. An empty function
    list in row[2] simply yields no record. Nothing is returned.

    03-08-05
        set a default(1.0) for min_p_value
        fix a bug, alter >self.depth_cut_off to >= self.depth_cut_off
    03-08-05
        don't take floor of recurrence and connectivity anymore, p_gene_analysis.py and p_gene_lm.py will take care of this.
    03-08-05
        fix another important bug when looking for other functions that have same min_p_value, the depth_cut_off requirement is forgotten.
    03-14-05
        getting go_no2distance via node_distance_class
    03-15-05
        get lca's for common_ancestor_deep_enough approach
    08-13-05
        change following cluster_stat's change
    """
    mcl_id = row[0]
    gene_no = row[1]
    #strip the outer '[[' and ']]', then cut between the pairs
    p_value_vector = row[2][2:-2].split('], [')
    connectivity = float(row[3])
    unknown_gene_ratio = float(row[4])
    recurrence_array = [float(item) for item in row[5][1:-1].split(',')]
    vertex_set = [int(item) for item in row[6][1:-1].split(',')]

    #subgraph_cut_off is the cutoff for a cluster to be counted as occurred
    if self.subgraph_cut_off!=0:
        #0 means no cutoff
        recurrence_array = greater_equal(recurrence_array, self.subgraph_cut_off)
    #no floor is taken here, see the 03-08-05 entry above
    recurrence = sum(recurrence_array)

    #setup in prediction_tuple2list
    prediction_tuple = (recurrence, connectivity)
    if prediction_tuple not in self.prediction_tuple2list:
        self.prediction_tuple2list[prediction_tuple] = []
    self.no_of_records += 1

    for entry in p_value_vector:
        if not entry.strip():
            #an empty function list leaves one blank entry behind,
            #which can't be unpacked into (p_value, go_no)
            continue
        p_value, go_no = entry.split(',')
        p_value = float(p_value)
        go_no = int(go_no)
        if self.go_no2depth[go_no] < self.depth_cut_off:
            #again we need the go_no to be deep enough
            continue
        if gene_no in self.known_genes_dict:
            k_functions_set = self.known_genes_dict[gene_no]
            is_correct = self.direct_match(go_no, k_functions_set)
            is_correct_L1 = self.L1_match(go_no, k_functions_set, node_distance_class, curs)
            is_correct_lca = self.common_ancestor_deep_enough(go_no, k_functions_set, node_distance_class, curs)
        else:
            #unknown gene
            is_correct = -1
            is_correct_L1 = -1
            is_correct_lca = -1
            #clear lca_list
            self.lca_list = []
        prediction_list = [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1, \
            is_correct_lca, len(vertex_set), unknown_gene_ratio, self.lca_list]
        self.prediction_tuple2list[prediction_tuple].append(prediction_list)
        if self.debug:
            print("recurrence and connectivity %s" % (prediction_tuple,))
            print("prediction_list %s" % (prediction_list,))
            raw_input("pause:")
def runCorrelations(p, strainCount, traits, db):
    """
    Correlate every database trait with every user trait and group the
    database traits by the set of user traits they match.

    p: parameter dictionary; p["threshold"] is the minimum absolute
       correlation that counts as a match and p["correlation"] selects
       the method ("pearson"; any other value selects Spearman).
    strainCount, traits, db: handed to the correlation matrix routines;
       db and traits are also indexed by row and column of the matrix.

    Each database trait i gets an index

        cIndex = b_0*2^0 + ... + b_(n-1)*2^(n-1)

    where n is the number of user traits and b_j is 1 if
    abs(corr(db[i], traits[j])) >= threshold and 0 otherwise. Database
    traits with cIndex 0 match nothing; since that is the bulk of the
    database they are not returned.

    Returns a list of 4-tuples

        (cIndex, matches, len(binaryDecompose(cIndex)), len(matches))

    where matches is a list of 2-tuples (database trait, kvalues) and
    kvalues holds the correlation values of the matched user traits in
    ascending order of j. Each matches list is sorted in descending
    order of the average magnitude of its kvalues. The outer list is
    sorted in descending order of len(binaryDecompose(cIndex)) (the
    number of 1's in the binary representation of the index, going by
    the original notes) and then of the number of matches.
    """
    kMin = p["threshold"]
    traitArrays = {}

    if p["correlation"] == "pearson":
        correlations = correlation.calcPearsonMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name
    else:
        correlations = correlation.calcSpearmanMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name

    # now we test all of the correlations in bulk
    test = numarray.absolute(correlations)
    test = numarray.greater_equal(test, kMin)
    test = test.astype(numarray.Int8)

    for i in range(len(db)):
        cIndex = 0
        prods = []
        for j in range(len(traits)):
            if test[i,j] == 1:
                cIndex += pow(2, j)
                prods.append(correlations[i,j])
        if cIndex != 0:
            traitArrays.setdefault(cIndex, []).append((db[i], prods))

    # sort each inner list of traitArrays so the matched traits appear
    # in descending order by the average magnitude of the correlation;
    # a key function computes every average once instead of once per
    # comparison, and reverse=True keeps ties in their original order
    def averageMagnitude(traitPair):
        return numarray.average([abs(value) for value in traitPair[1]])

    for traitArray in traitArrays.values():
        traitArray.sort(key=averageMagnitude, reverse=True)

    # build the outer list; every list stored in traitArrays has at
    # least one element, so no emptiness check is needed
    traitArrays2 = []
    for key in traitArrays.keys():
        a = traitArrays[key]
        traitArrays2.append((key, a, len(binaryDecompose(key)), len(a)))

    # we sort by the number of 1's in the binary output
    # and then by the size of the list, both in descending order
    traitArrays2.sort(key=lambda entry: (entry[2], entry[3]), reverse=True)

    return traitArrays2
def _gene_stat_leave_one_out(self, row, node_distance_class=None, curs=None):
    """
    File the best-scoring function predictions of one database row under
    the row's (recurrence, connectivity) key in self.prediction_tuple2list.

    row is indexed as follows:
        row[0]  mcl_id
        row[1]  gene_no
        row[2]  text list of p-values, one per function index
        row[3]  connectivity
        row[4]  text list of recurrence values
        row[5]  text list of vertex numbers
    (the first and last character of each text list are delimiters).

    The p-values are ordered through self.index_tuple(); the first one
    whose function reaches self.depth_cut_off becomes min_p_value. If
    none is below 1.0 the row is dropped. Otherwise every function that
    shares min_p_value, is not index 0 and is deep enough gets a record
        [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1,
        is_correct_lca, size of vertex_set, unknown_gene_ratio, lca_list]
    Nothing is returned.

    03-08-05
        set a default(1.0) for min_p_value
        fix a bug, alter >self.depth_cut_off to >= self.depth_cut_off
    03-08-05
        don't take floor of recurrence and connectivity anymore, p_gene_analysis.py and p_gene_lm.py will take care of this.
    03-08-05
        fix another important bug when looking for other functions that have same min_p_value, the depth_cut_off requirement is forgotten.
    03-14-05
        getting go_no2distance via node_distance_class
    03-15-05
        get lca's for common_ancestor_deep_enough approach
    """
    mcl_id, gene_no = row[0], row[1]
    p_value_vector = row[2][1:-1].split(',')
    connectivity = float(row[3])
    recurrence_array = [float(text) for text in row[4][1:-1].split(',')]
    vertex_set = [int(text) for text in row[5][1:-1].split(',')]

    #a subgraph_cut_off of 0 means no cutoff; otherwise a cluster only
    #counts as occurred where the recurrence reaches the cutoff
    if self.subgraph_cut_off!=0:
        recurrence_array = greater_equal(recurrence_array, self.subgraph_cut_off)

    #no floor is taken here, see the 03-08-05 entry above
    prediction_tuple = (sum(recurrence_array), connectivity)
    if prediction_tuple not in self.prediction_tuple2list:
        self.prediction_tuple2list[prediction_tuple] = []

    #(p_value, index) pairs with the p-values transformed into float type
    p_value_index_tuple_list = self.index_tuple(p_value_vector)

    #1.0 stays in place when no function is deep enough
    min_p_value = 1.0
    for (p_value, index) in p_value_index_tuple_list:
        #index 0 corresponds to go_no 0 for wu, to go_no 1 otherwise
        if self.wu:
            go_no = index
        else:
            go_no = index+1
        if self.go_no2depth[go_no] >= self.depth_cut_off:
            min_p_value = p_value
            break

    #p-value 1.0 means the corresponding function has no associated genes
    #in the cluster, i.e. all associated functions are above the depth_cut_off
    if min_p_value >= 1.0:
        return

    if self.wu:
        unknown_gene_ratio = float(p_value_vector[0])
    else:
        unknown_gene_ratio = -1

    #looking for go_nos that have the same min_p_value
    for (p_value, index) in p_value_index_tuple_list:
        if p_value > min_p_value:
            break
        if index == 0 or p_value==1.0:
            #0 is the unknown function, 1.0 is for a function without associated genes
            continue
        if p_value != min_p_value:
            continue
        if self.wu:
            go_no = index
        else:
            go_no = index+1
        if self.go_no2depth[go_no] < self.depth_cut_off:
            #again the go_no has to be deep enough
            continue
        if gene_no not in self.known_genes_dict:
            #unknown gene, nothing to compare against
            is_correct = is_correct_L1 = is_correct_lca = -1
            self.lca_list = []
        else:
            k_functions_set = self.known_genes_dict[gene_no]
            is_correct = self.direct_match(go_no, k_functions_set)
            is_correct_L1 = self.L1_match(go_no, k_functions_set, node_distance_class, curs)
            is_correct_lca = self.common_ancestor_deep_enough(go_no, k_functions_set, node_distance_class, curs)
        self.prediction_tuple2list[prediction_tuple].append(
            [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1,
            is_correct_lca, len(vertex_set), unknown_gene_ratio, self.lca_list])
def threshhold(arrays, low=None, high=None, outputs=None):
    """threshhold() fills a boolean array with one mask per element of
    'arrays'. A mask value is true where the corresponding array value
    is < low or >= high; a threshhold left at None is not applied.

    If 'arrays' is not already made of NumArrays it is converted first.
    When 'outputs' is None a new Bool array is allocated and returned.
    Otherwise the masks are written into 'outputs' and nothing is
    returned.

    >>> a=num.arange(100, shape=(10,10))
    >>> threshhold(a, 1, 50)
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold([ range(10)]*10, 3, 7)
    array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]], type=Bool)
    >>> threshhold(a, high=50)
    array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold(a, low=50)
    array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], type=Bool)
    """
    if not isinstance(arrays[0], num.NumArray):
        # convert once, then run the same routine on the converted input
        return threshhold(num.inputarray(arrays), low, high, outputs)

    allocate_here = outputs is None
    if allocate_here:
        masks = num.zeros(shape=(len(arrays),) + arrays[0].shape,
                          type=num.Bool)
    else:
        masks = outputs

    for index in range(len(arrays)):
        plane = arrays[index]
        mask = masks[index]
        mask[:] = 0
        if high is None:
            if low is not None:
                num.less(plane, low, mask)
            continue
        num.greater_equal(plane, high, mask)
        if low is not None:
            # add the values below the low threshhold to the mask
            num.logical_or(mask, plane < low, mask)

    if allocate_here:
        return masks
def threshhold(arrays, low=None, high=None, outputs=None):
    """threshhold() fills a boolean array with one mask per element of
    'arrays'. A mask value is true where the corresponding array value
    is < low or >= high; a threshhold left at None is not applied.

    If 'arrays' is not already made of NumArrays it is converted first.
    When 'outputs' is None a new Bool array is allocated and returned.
    Otherwise the masks are written into 'outputs' and nothing is
    returned.

    >>> a=num.arange(100, shape=(10,10))
    >>> threshhold(a, 1, 50)
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold([ range(10)]*10, 3, 7)
    array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]], type=Bool)
    >>> threshhold(a, high=50)
    array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold(a, low=50)
    array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], type=Bool)
    """
    if not isinstance(arrays[0], num.NumArray):
        # convert once, then run the same routine on the converted input
        return threshhold(num.inputarray(arrays), low, high, outputs)

    allocate_here = outputs is None
    if allocate_here:
        masks = num.zeros(shape=(len(arrays),) + arrays[0].shape,
                          type=num.Bool)
    else:
        masks = outputs

    for index in range(len(arrays)):
        plane = arrays[index]
        mask = masks[index]
        mask[:] = 0
        if high is None:
            if low is not None:
                num.less(plane, low, mask)
            continue
        num.greater_equal(plane, high, mask)
        if low is not None:
            # add the values below the low threshhold to the mask
            num.logical_or(mask, plane < low, mask)

    if allocate_here:
        return masks