def rhyper(m, n, M, N, report=0):
    '''
    Calculate a cumulative probability from the hypergeometric distribution
    (uses R's phyper through RPy).

    Urn model:
        N = total balls in urn
        M = total white balls in urn
        n = drawn balls from urn
        m = drawn white balls from urn

    report selects what is returned:
        0 -- p-value for over-representation  (upper tail, starting at m)
        1 -- p-value for under-representation (lower tail, up to and including m)
        2 -- tuple (over, under)

    Raises ValueError for any other value of report.  The arguments are
    checked with an assert: all four must be ints with m <= n, m <= M, n <= N.
    '''
    assert ((type(m) == type(n) == type(M) == type(N) == int) and
            m <= n and m <= M and n <= N)
    # Reject a bad option before touching R.  The original raised a plain
    # string here, which is not a valid exception object.
    if report not in (0, 1, 2):
        raise ValueError("unknown option: report=%r" % (report,))

    from rpy import r

    if report == 0:
        # p-val for over-repr.; m - 1 so that the upper tail includes m itself
        return r.phyper(m - 1, M, N - M, n, lower_tail=False)
    if report == 1:
        # p-val for under-repr.
        return r.phyper(m, M, N - M, n)
    # report == 2: tuple (over, under)
    return (r.phyper(m - 1, M, N - M, n, lower_tail=False),
            r.phyper(m, M, N - M, n))
def rhyper(m, n, M, N, report=0):
    '''
    Calculate a cumulative probability from the hypergeometric distribution
    (uses R's phyper through RPy).

    Urn model:
        N = total balls in urn
        M = total white balls in urn
        n = drawn balls from urn
        m = drawn white balls from urn

    report selects what is returned:
        0 -- p-value for over-representation  (upper tail, starting at m)
        1 -- p-value for under-representation (lower tail, up to and including m)
        2 -- tuple (over, under)

    Raises ValueError for any other value of report.  The arguments are
    checked with an assert: all four must be ints with m <= n, m <= M, n <= N.
    '''
    assert ((type(m) == type(n) == type(M) == type(N) == int) and
            m <= n and m <= M and n <= N)
    # A string is not a valid exception object, so the unknown-option case is
    # reported with ValueError, and before R is imported.
    if report not in (0, 1, 2):
        raise ValueError("unknown option: report=%r" % (report,))

    from rpy import r

    over_args = (m - 1, M, N - M, n)   # m - 1: upper tail must include m
    under_args = (m, M, N - M, n)
    if report == 0:
        # p-val for over-repr.
        return r.phyper(*over_args, lower_tail=False)
    if report == 1:
        # p-val for under-repr.
        return r.phyper(*under_args)
    # report == 2: tuple (over, under)
    return r.phyper(*over_args, lower_tail=False), r.phyper(*under_args)
def _cluster_stat(self, mcl_id, vertex_set): if vertex_set in self.cluster_memory: entry = self.cluster_memory[vertex_set] p_value_min = entry[0] go_no_vector = entry[1] unknown_gene_ratio = entry[2] self.to_db.append([p_value_min, go_no_vector, unknown_gene_ratio, mcl_id]) self.no_of_records += 1 return else: _cluster_memroy = {} vertex_list = vertex_set[1:-1].split(',') vertex_list = map(int, vertex_list) cluster_size = len(vertex_list) self.local_go_no_dict_construct(vertex_list) if 0 in self.local_go_no_dict: unknown_gene_ratio = self.local_go_no_dict[0]/float(cluster_size) else: unknown_gene_ratio = 0 if self.local_go_no_dict == {}: self.logfile.write('%d %s: local_go_no_dict empty\n'%(mcl_id, repr(vertex_set))) return for go_no in self.local_go_no_dict: if self.wu: # code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included) x = self.local_go_no_dict[go_no] m = self.global_go_no_to_size_dict[go_no] n = self.no_of_genes - m k = cluster_size else: pass if self.bonferroni: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)*len(self.local_go_no_dict) else: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE) self.logfile.write('%d %d %d %d %d %d %f %f\n'%(mcl_id,go_no,x,m,n,k,p_value, unknown_gene_ratio)) if p_value in _cluster_memroy: _cluster_memroy[p_value].append(go_no) else: _cluster_memroy[p_value] = [go_no] p_value_vector = _cluster_memroy.keys() if p_value_vector == []: self.logfile.write('%d %s: all vertices belong to population singleton classes\n'%(mcl_id, repr(vertex_set))) return p_value_min = min(p_value_vector) go_no_vector = _cluster_memroy[p_value_min] self.no_of_records += 1 self.cluster_memory[vertex_set] =[p_value_min, go_no_vector, unknown_gene_ratio] self.to_db.append([p_value_min, go_no_vector, unknown_gene_ratio, mcl_id])
def _cluster_stat(self, mcl_id, vertex_set, connectivity): vertex_list_all = vertex_set[1:-1].split(',') vertex_list = [] for i in range(len(vertex_list_all)): vertex_list_all[i] = int(vertex_list_all[i]) if vertex_list_all[i] in self.global_gene_to_go_dict: #this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included) vertex_list.append(vertex_list_all[i]) cluster_size = len(vertex_list) p_value_vector = [1] * self.no_of_functions self.local_go_no_dict_construct(vertex_list) for gene_no in vertex_list_all: self.go_no_dict_adjust(gene_no) for go_no in self._local_go_no_dict: if self.wu or (gene_no not in self.global_gene_to_go_dict): # code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included) x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes - m k = cluster_size else: x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes -1 - m k = cluster_size-1 if self.bonferroni: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)*len(self._local_go_no_dict) else: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE) if self.log: self.logfile.write('%d %d %d %d %d %d %d %f\n'%\ (mcl_id,gene_no,go_no,x,m,n,k,p_value)) p_value_vector[go_no] = p_value #for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py if self.wu: p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size) else: #not wu's strategy, throw away the gene, the cluster_size is down by 1. 
if self._local_go_no_dict.has_key(0): #after leave_one_out, still unknown genes present p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size-1) else: #no unknown genes p_value_vector[0] = 1 if self.output: self.outf.write('%d\t%d\t%s\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity)) elif self.needcommit: self.curs.execute("insert into %s(mcl_id, leave_one_out, p_value_vector, connectivity)\ values(%d, %d, ARRAY%s, %8.6f)"%(self.target_table, mcl_id, gene_no, repr(p_value_vector), connectivity)) self.no_of_records += 1
def get_information_of_go_functions(self, curs, go_no2association_genes, cluster_size,
        no_of_total_genes, p_value_cut_off=0, go_table='go'):
    """
    04-06-05
    Look up each go_no of a cluster in go_table and attach its p-value.

    curs                    -- database cursor (execute / fetchall are used).
    go_no2association_genes -- dict: go_no -> collection of the cluster's genes
                               associated with it (only its length is used).
    cluster_size            -- number of genes drawn, passed to r.phyper.
    no_of_total_genes       -- population size used for the hypergeometric test.
    p_value_cut_off         -- when non-zero, go_nos whose p-value is larger
                               than this are left out.
    go_table                -- name of the table queried; it is pasted into the
                               SQL text, so it must come from trusted code.

    Returns go_no2information, a dict: go_no ->
        [go_no, go_id, name, depth, population size, local size, p_value]
    where population size is array_upper(gene_array,1) from go_table.
    """
    sys.stderr.write("Getting information about a list of go_nos...")
    go_no2information = {}
    # .items() works on every dict; iteritems() exists only on Python 2 dicts.
    for go_no, association_genes in go_no2association_genes.items():
        no_of_associated_genes = len(association_genes)
        curs.execute("select go_no, go_id, name, depth, array_upper(gene_array,1) from %s "
            "where go_no=%s"%(go_table, go_no))
        for row in curs.fetchall():
            population_size = row[-1]
            p_value = r.phyper(no_of_associated_genes-1, population_size,
                no_of_total_genes-population_size, cluster_size, lower_tail=r.FALSE)
            # non zero cut-off: drop the p-values above it
            if p_value_cut_off and p_value > p_value_cut_off:
                continue
            # go_no, go_id, name, depth, population size, local size, p_value
            go_no2information[go_no] = list(row) + [no_of_associated_genes, p_value]
    sys.stderr.write("Done.\n")
    return go_no2information
def _cluster_stat(self, mcl_id, vertex_set): vertex_list = vertex_set[1:-1].split(',') vertex_list = map(int, vertex_list) vertex_list_gene_symbol = [] for vertex in vertex_list: vertex_list_gene_symbol.append(self.gene_no2gene_id[vertex]) cluster_size = len(vertex_list) local_go_no_dict = self.local_go_no_dict_construct(vertex_list) if local_go_no_dict == {}: self.logfile.write('%d %s: local_go_no_dict empty\n'%(mcl_id, repr(vertex_set))) return for go_no in local_go_no_dict: if self.wu: # code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included) x = len(local_go_no_dict[go_no]) m = self.global_go_no_to_size_dict[go_no] n = self.no_of_genes - m k = cluster_size else: pass if self.bonferroni: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)*len(local_go_no_dict) else: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE) if x >1: #this function must have more than one gene associated. transfac_dict = self.transfac_dict_construct(local_go_no_dict[go_no]) gene_id_list = [] for gene_no in local_go_no_dict[go_no]: gene_id_list.append(self.gene_no2gene_id[gene_no]) self.logfile.write('%d\t%s\t%d\t%d\t%d\t%d\t%f\t%s\t%s\t%s\n'%(mcl_id,\ self.global_go_no2go_name[go_no], x, k, m, self.no_of_genes,\ p_value, '|'.join(gene_id_list), repr(vertex_list_gene_symbol), repr(transfac_dict))) self.no_of_records += 1
def _cluster_stat(self, mcl_id, vertex_set, connectivity): """ 04-18-05 add two important criteria to avoid the situation that hypergeometric test is powerless (the population size of the go-no is too small). 1. percentage of associated-genes over total known genes >= uniformity (0.5 default) 2. apart from the percentage, the absolute number is also needed in case the cluster is too small. 04-19-05 fix a bug. self._no_of_known_genes_of_the_cluster could be 0. """ vertex_list_all = vertex_set[1:-1].split(',') vertex_list = [] for i in range(len(vertex_list_all)): vertex_list_all[i] = int(vertex_list_all[i]) if vertex_list_all[i] in self.global_gene_to_go_dict: #this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included) vertex_list.append(vertex_list_all[i]) cluster_size = len(vertex_list) p_value_vector = [1] * self.no_of_functions self.local_go_no_dict_construct(vertex_list) for gene_no in vertex_list_all: self.go_no_dict_adjust(gene_no) for go_no in self._local_go_no_dict: if self.global_go_no_to_size_dict[go_no]<self.min_node_size: #06-11-05 continue if self.go_no2depth[go_no]>self.max_node_depth: #06-11-05 continue if self.wu or (gene_no not in self.global_gene_to_go_dict): # code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included) x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes - m k = cluster_size else: x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes -1 - m k = cluster_size-1 if self._no_of_known_genes_of_the_cluster == 0: go_no_ratio = 0 else: go_no_ratio = float(x)/self._no_of_known_genes_of_the_cluster #NOTE: it's different from no_of_known_genes_of_the_cluster if go_no_ratio < self.uniformity and go_no!=0: #It doesn't apply to the 0(unknown) category. 
#ignore the function category if its percentage is < uniformity continue if x < 3 and go_no!=0: #apart from the percentage, the absolute number is also needed in case the cluster is too small. continue if self.bonferroni: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)*len(self._local_go_no_dict) else: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE) if self.log: self.logfile.write('%d %d %d %d %d %d %d %f\n'%\ (mcl_id,gene_no,go_no,x,m,n,k,p_value)) p_value_vector[go_no] = p_value #for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py if self.wu: p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size) else: #not wu's strategy, throw away the gene, the cluster_size is down by 1. if self._local_go_no_dict.has_key(0): #after leave_one_out, still unknown genes present p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size-1) else: #no unknown genes p_value_vector[0] = 1 #03-18-05increment before inserted into table, cluster_stat_id starting from 1 self.no_of_records += 1 if self.output: self.outf.write('%d\t%d\t%s\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity)) elif self.needcommit: self.curs.execute("insert into %s(cluster_stat_id, mcl_id, leave_one_out, p_value_vector, connectivity)\ values(%d, %d, %d, ARRAY%s, %8.6f)"%(self.target_table, self.no_of_records, mcl_id, gene_no, repr(p_value_vector), connectivity))
def _cluster_stat(self, mcl_id, vertex_set, connectivity): """ 04-18-05 add two important criteria to avoid the situation that hypergeometric test is powerless (the population size of the go-no is too small). 1. percentage of associated-genes over total known genes >= uniformity (0.5 default) 2. apart from the percentage, the absolute number is also needed in case the cluster is too small. 04-19-05 fix a bug. self._no_of_known_genes_of_the_cluster could be 0. 08-13-05 remove the redundant information in the p_value_vector to save space p_value_vector only keeps (p_value,go_no) pairs with lowest p_value and go_no>=min_node_depth unknown_gene_ratio is split from p_value_vector a bug found, p_value_vector is not grounded after each gene cycle table submit() is not supported anymore 08-15-05 if go_no == 0, skip p-value calculation self.local_go_no_dict[go_no]<3, skip the go_no. previously it's using self._local_go_no_dict[go_no] to do filter, known genes and unknown genes get different treatment. """ vertex_list_all = vertex_set[1:-1].split(',') vertex_list = [] for i in range(len(vertex_list_all)): vertex_list_all[i] = int(vertex_list_all[i]) if vertex_list_all[i] in self.global_gene_to_go_dict: #this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included) vertex_list.append(vertex_list_all[i]) cluster_size = len(vertex_list) self.local_go_no_dict_construct(vertex_list) for gene_no in vertex_list_all: p_value_vector = [] self.go_no_dict_adjust(gene_no) #for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py if self.wu: unknown_gene_ratio = self._local_go_no_dict[0]/float(cluster_size) #08-13-05 unknown_gene_ratio is split from p_value_vector else: #not wu's strategy, throw away the gene, the cluster_size is down by 1. 
if self._local_go_no_dict.has_key(0): #after leave_one_out, still unknown genes present unknown_gene_ratio = self._local_go_no_dict[0]/float(cluster_size-1) else: #no unknown genes unknown_gene_ratio = 0 for go_no in self._local_go_no_dict: if go_no == 0: #unknown function category doesn't need to do calculation continue if self.local_go_no_dict[go_no]<3: #apart from the percentage, the absolute number is also needed in case the cluster is too small. #08-15-05 fix a bug here, previously it's using self._local_go_no_dict[go_no] to do filter. known genes and unknown genes get different treatment. continue if self.global_go_no_to_size_dict[go_no]<self.min_node_size: #06-11-05 continue if self.go_no2depth[go_no]<self.min_node_depth: #06-11-05, 08-13-05 max_node_depth changed to min_node_depth continue if self.wu or (gene_no not in self.global_gene_to_go_dict): # code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included) x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes - m k = cluster_size else: x = self._local_go_no_dict[go_no] m = self._global_go_no_dict[go_no] n = self.no_of_genes -1 - m k = cluster_size-1 if self._no_of_known_genes_of_the_cluster == 0: go_no_ratio = 0 else: go_no_ratio = float(x)/self._no_of_known_genes_of_the_cluster #NOTE: it's different from no_of_known_genes_of_the_cluster if go_no_ratio < self.uniformity and go_no!=0: #It doesn't apply to the 0(unknown) category. 
#ignore the function category if its percentage is < uniformity continue if self.bonferroni: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)*len(self._local_go_no_dict) else: p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE) if self.log: self.logfile.write('%d %d %d %d %d %d %d %f\n'%\ (mcl_id,gene_no,go_no,x,m,n,k,p_value)) p_value_vector.append([p_value, go_no]) #08-13-05 if len(p_value_vector) == 0: #08-13-05 no qualified go_no for this gene_no continue p_value_vector = self.retain_min_p_value_pairs(p_value_vector) #08-13-05 get the pairs with minimum p_value #03-18-05increment before inserted into table, cluster_stat_id starting from 1 self.no_of_records += 1 if self.output: self.outf.write('%d\t%d\t%s\t%f\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity, unknown_gene_ratio)) #08-13-05 elif self.needcommit: self.curs.execute("insert into %s(cluster_stat_id, mcl_id, leave_one_out, p_value_vector, connectivity)\ values(%d, %d, %d, ARRAY%s, %8.6f)"%(self.target_table, self.no_of_records, mcl_id, gene_no, repr(p_value_vector), connectivity))