Example #1
0
	def parse_recurrence(self, combined_vector, no_of_edges=0, cor_cut_off=0):
		"""
		Return, for every column of combined_vector, the column sum divided
		by the number of rows.

		combined_vector: 2d list (see 07-03-05 below); it is converted with
			numarray.array and must have exactly two dimensions.
		no_of_edges: defunct (see 08-09-05 below); accepted only so that
			existing callers keep working, never read.
		cor_cut_off: when > 0, every column is first turned into 0/1 flags
			(value >= cor_cut_off) before it is summed; otherwise the raw
			values are summed.

		Returns a list with one entry per column. An empty combined_vector
		gives an empty list.

		03-03-05
			replace the combined_vector with combined_sig_vector
			get a fair recurrence_array(see 'redefine recurrence and connectivity' in log_05)
		07-03-05
			combined_vector is already a 2d list.
		08-09-05
			replace no_of_edges with x_dimension, no_of_edges defunct
		"""
		#an empty input converts to a 1-d array whose shape cannot be unpacked
		#into two dimensions; len() is used because truth-testing an array is ambiguous
		if len(combined_vector) == 0:
			return []
		cor_array = numarray.array(combined_vector)
		x_dimension, y_dimension = cor_array.shape	#07-03-05	cor_array is 2d array
		#the divisor is the same for every column, convert it once
		no_of_rows = float(x_dimension)
		recurrence_array = []
		for i in range(y_dimension):
			column = cor_array[:,i]
			if cor_cut_off>0:
				#regard the correlations >= cor_cut_off to be 1, others 0
				column = numarray.greater_equal(column, cor_cut_off)
			recurrence_array.append(sum(column)/no_of_rows)
			if self.debug:
				#parenthesised single-argument form prints the same text under the print statement
				print(column)
				print(recurrence_array)
		#thresholding of recurrence_array is not done here; per the original note it is handled in gene_stat_plot.py
		return recurrence_array
Example #2
0
	def get_cluster_accuracy(self, curs, p_gene_table, mcl_id_list, p_value_cut_off=0.01):
		"""
		04-07-05
			get the accuracy, no_of_corrected predictions, known predictions, total predictions for each cluster

		curs: database cursor; only execute() and fetchall() are used.
		p_gene_table: name of the table to query.
		mcl_id_list: cluster ids to look up, one query per id.
		p_value_cut_off: upper bound applied to the table's p_value_cut_off column.

		Returns a list with one entry per cluster that has at least one row:
			[accuracy, no of correct predictions, no of known predictions,
			no of fetched predictions, mcl_id]
		A prediction counts as correct when is_correct_lca > 0 and as known
		when is_correct_lca >= 0. accuracy is correct/known, or 0.0 when the
		cluster has no known prediction at all (the ratio is undefined there
		and used to raise ZeroDivisionError).
		"""
		accuracy2cluster = []
		for mcl_id in mcl_id_list:
			#NOTE(review): the statement is assembled by string interpolation, so the
			#table name and values are assumed to come from trusted code - confirm
			curs.execute("select is_correct_lca from %s where mcl_id=%s and p_value_cut_off<=%s"%(p_gene_table,\
				mcl_id, p_value_cut_off))
			rows = curs.fetchall()
			if not rows:
				#clusters without any row are left out of the result
				continue
			is_correct_lca_array = array(rows)
			correct_array = greater(is_correct_lca_array[:,0],0)
			known_array = greater_equal(is_correct_lca_array[:,0],0)
			#each count is needed twice, so sum only once
			no_of_correct = sum(correct_array)
			no_of_known = sum(known_array)
			if no_of_known > 0:
				accuracy = float(no_of_correct)/float(no_of_known)
			else:
				#every prediction in this cluster is for an unknown gene
				accuracy = 0.0
			accuracy2cluster.append([accuracy, no_of_correct, no_of_known, len(correct_array), mcl_id])
		return accuracy2cluster
Example #3
0
	def _gene_stat_leave_one_out(self, row, node_distance_class=None, curs=None):
		"""
		03-08-05
			set a default(1.0) for min_p_value
			fix a bug, alter >self.depth_cut_off to >= self.depth_cut_off
			
		03-08-05
			don't take floor of recurrence and connectivity anymore, p_gene_analysis.py and p_gene_lm.py will take care of this.
		
		03-08-05
			fix another important bug
			when looking for other functions that have same min_p_value, the depth_cut_off requirement is forgotten.
		
		03-14-05
			getting go_no2distance via node_distance_class
		
		03-15-05
			get lca's for common_ancestor_deep_enough approach
		08-13-05
			change following cluster_stat's change
		"""		
		mcl_id = row[0]
		gene_no = row[1]
		p_value_vector = row[2][2:-2].split('], [')
		connectivity = float(row[3])
		unknown_gene_ratio = float(row[4])
		recurrence_array = row[5][1:-1].split(',')
		recurrence_array = map(float, recurrence_array)
		vertex_set = row[6][1:-1].split(',')
		vertex_set = map(int, vertex_set)
		
		#subgraph_cut_off is the cutoff for a cluster to be counted as occurred
		if self.subgraph_cut_off!=0:
			#0 means no cutoff
			recurrence_array = greater_equal(recurrence_array, self.subgraph_cut_off)
		"""
		#take the floor of the recurrence
		recurrence = int(math.floor(sum(recurrence_array)/self.recurrence_gap_size)*self.recurrence_gap_size)
		#take the floor of the connectivity *10
		connectivity = int(math.floor(connectivity*10/self.connectivity_gap_size)*self.connectivity_gap_size)
		"""
		recurrence = sum(recurrence_array)

		#setup in prediction_tuple2list
		prediction_tuple = (recurrence, connectivity)
		if prediction_tuple not in self.prediction_tuple2list:
			self.prediction_tuple2list[prediction_tuple] = []
			
		self.no_of_records += 1
		
		for entry in p_value_vector:
			p_value, go_no = entry.split(',')
			p_value = float(p_value)
			go_no = int(go_no)
			if self.go_no2depth[go_no] < self.depth_cut_off:
				#again we need the go_no to be deep enough
				continue
			
			if gene_no in self.known_genes_dict:
				k_functions_set = self.known_genes_dict[gene_no]
				is_correct = self.direct_match(go_no, k_functions_set)
				is_correct_L1 = self.L1_match(go_no, k_functions_set, node_distance_class, curs)
				is_correct_lca = self.common_ancestor_deep_enough(go_no, k_functions_set, node_distance_class, curs)
			else:
				#unknown gene
				is_correct = -1
				is_correct_L1 = -1
				is_correct_lca = -1
				#clear lca_list
				self.lca_list = []
				
			prediction_list = [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1, \
				is_correct_lca, len(vertex_set), unknown_gene_ratio, self.lca_list]
			self.prediction_tuple2list[prediction_tuple].append(prediction_list)
			if self.debug:
				print "recurrence and connectivity",prediction_tuple
				print "prediction_list", prediction_list
				raw_input("pause:")
def runCorrelations(p, strainCount, traits, db):
    """
    Correlate every database entry with every user trait and group the
    database entries by the set of user traits they match.

    Parameters:
    p -- mapping read for p["threshold"] (minimum correlation magnitude)
         and p["correlation"] ("pearson" selects calcPearsonMatrix, any
         other value selects calcSpearmanMatrix)
    strainCount -- passed through to the correlation routine
    traits -- the user traits (columns of the correlation matrix)
    db -- the database traits (rows of the correlation matrix)

    A database entry a is assigned the group index
        i = i_0*2^0 + ... + i_(n-1)*2^(n-1)
    where n is len(traits) and i_j is 1 if |corr(a, trait j)| >= threshold
    and 0 otherwise. Entries with i == 0 (no trait matched) are dropped, so
    no group with index 0 is ever returned.

    Returns a list of 4-tuples
        (index, matches, len(binaryDecompose(index)), len(matches))
    where matches is a list of 2-tuples (database entry, kvalues) and
    kvalues holds the correlation values of the matched traits in trait
    order. The third element is what the original comments describe as the
    number of 1s in the binary representation of index.

    Ordering: each matches list is sorted by the average magnitude of its
    kvalues, largest first. The outer list is sorted by the third element
    and then by the number of matches, both largest first. Both sorts are
    stable, so ties keep their previous relative order.
    """
    kMin = p["threshold"]

    if p["correlation"] == "pearson":
        correlations = correlation.calcPearsonMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name
    else:
        correlations = correlation.calcSpearmanMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name

    # test all of the correlations in bulk: 1 where |corr| >= kMin, else 0
    test = numarray.absolute(correlations)
    test = numarray.greater_equal(test, kMin)
    test = test.astype(numarray.Int8)

    traitArrays = {}
    for i in range(len(db)):
        cIndex = 0
        prods = []
        for j in range(len(traits)):
            if test[i,j] == 1:
                cIndex += pow(2, j)
                prods.append(correlations[i,j])
        if cIndex != 0:
            traitArrays.setdefault(cIndex, []).append((db[i], prods))

    # sort each inner list so the matched traits appear in descending
    # order by the average magnitude of the correlation; a key function is
    # evaluated once per element instead of once per comparison
    def averageMagnitude(traitPair):
        return numarray.average([abs(k) for k in traitPair[1]])

    for traitArray in traitArrays.values():
        traitArray.sort(key=averageMagnitude, reverse=True)

    # every list in traitArrays holds at least one match (a list is only
    # created when something is appended), so no emptiness filter is needed
    traitArrays2 = [(key, a, len(binaryDecompose(key)), len(a))
                    for key, a in traitArrays.items()]

    # sort by the decomposition length and then by the size of the list,
    # both in descending order
    traitArrays2.sort(key=lambda entry: (entry[2], entry[3]), reverse=True)

    return traitArrays2
Example #5
0
	def _gene_stat_leave_one_out(self, row, node_distance_class=None, curs=None):
		"""
		Turn one row into prediction records for the function(s) sharing the
		smallest eligible p-value and file them under the row's
		(recurrence, connectivity) key in self.prediction_tuple2list.

		row layout as read here:
			row[0] mcl_id, row[1] gene_no,
			row[2] bracketed comma separated p-values,
			row[3] connectivity,
			row[4] bracketed comma separated recurrence values,
			row[5] bracketed comma separated vertex numbers
		node_distance_class, curs: only handed on to self.L1_match and
			self.common_ancestor_deep_enough.

		Returns nothing. When no function reaches self.depth_cut_off with a
		p-value below 1.0 the row is dropped (the key is still created in
		self.prediction_tuple2list); otherwise self.no_of_records grows by one.

		03-08-05
			set a default(1.0) for min_p_value
			fix a bug, alter >self.depth_cut_off to >= self.depth_cut_off

		03-08-05
			don't take floor of recurrence and connectivity anymore, p_gene_analysis.py and p_gene_lm.py will take care of this.

		03-08-05
			fix another important bug
			when looking for other functions that have same min_p_value, the depth_cut_off requirement is forgotten.

		03-14-05
			getting go_no2distance via node_distance_class

		03-15-05
			get lca's for common_ancestor_deep_enough approach
		"""
		mcl_id, gene_no = row[0], row[1]
		p_value_vector = row[2][1:-1].split(',')
		connectivity = float(row[3])
		recurrence_array = [float(value) for value in row[4][1:-1].split(',')]
		vertex_set = [int(vertex) for vertex in row[5][1:-1].split(',')]

		#subgraph_cut_off decides whether a cluster counts as occurred; 0 switches the cutoff off
		if self.subgraph_cut_off!=0:
			recurrence_array = greater_equal(recurrence_array, self.subgraph_cut_off)
		#no flooring here anymore, see the 03-08-05 note above
		recurrence = sum(recurrence_array)

		#make sure the bucket for this (recurrence, connectivity) pair exists
		prediction_tuple = (recurrence, connectivity)
		bucket = self.prediction_tuple2list.setdefault(prediction_tuple, [])

		#presumably (p_value as float, position) pairs in ascending p_value order - TODO confirm against index_tuple
		p_value_index_tuple_list = self.index_tuple(p_value_vector)

		#with self.wu position 0 is go_no 0, otherwise position 0 is go_no 1
		if self.wu:
			go_no_shift = 0
		else:
			go_no_shift = 1

		#the first function that is deep enough supplies min_p_value; 1.0 is the fallback
		min_p_value = 1.0
		for (p_value, index) in p_value_index_tuple_list:
			if self.go_no2depth[index+go_no_shift] >= self.depth_cut_off:
				min_p_value = p_value
				break

		#p-value 1.0 means the corresponding function has no associated genes in the cluster,
		#i.e. everything associated with the cluster sits above the depth_cut_off
		if min_p_value >= 1.0:
			return

		if self.wu:
			unknown_gene_ratio = float(p_value_vector[0])
		else:
			unknown_gene_ratio = -1

		#the cluster passed all the cut_offs
		self.no_of_records += 1

		#collect every go_no that shares min_p_value
		for (p_value, index) in p_value_index_tuple_list:
			if p_value > min_p_value:
				break
			if index == 0 or p_value==1.0:
				#0 is the unknown function, 1.0 is for a function without associated genes
				continue
			if p_value != min_p_value:
				continue
			go_no = index+go_no_shift
			if self.go_no2depth[go_no] < self.depth_cut_off:
				#the go_no has to be deep enough here as well
				continue

			if gene_no not in self.known_genes_dict:
				#unknown gene, nothing to compare against
				is_correct = -1
				is_correct_L1 = -1
				is_correct_lca = -1
				#clear lca_list
				self.lca_list = []
			else:
				k_functions_set = self.known_genes_dict[gene_no]
				is_correct = self.direct_match(go_no, k_functions_set)
				is_correct_L1 = self.L1_match(go_no, k_functions_set, node_distance_class, curs)
				#presumably this call also refreshes self.lca_list - TODO confirm
				is_correct_lca = self.common_ancestor_deep_enough(go_no, k_functions_set, node_distance_class, curs)

			prediction_list = [p_value, mcl_id, gene_no, go_no, is_correct, is_correct_L1, \
				is_correct_lca, len(vertex_set), unknown_gene_ratio, self.lca_list]
			bucket.append(prediction_list)
Example #6
0
def threshhold(arrays, low=None, high=None, outputs=None):
    """threshhold() fills one boolean array per element of 'arrays'.
    An output value is true where the matching input value is
    < low or >= high; a bound left as None is not applied.

    If arrays[0] is not a NumArray the input is first converted
    with num.inputarray().  When 'outputs' is supplied the results
    are written into it and nothing is returned; otherwise a new
    Bool array is allocated and returned.

    >>> a=num.arange(100, shape=(10,10))
    >>> threshhold(a, 1, 50)
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold([ range(10)]*10, 3, 7)
    array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]], type=Bool)
    >>> threshhold(a, high=50)
    array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold(a, low=50)
    array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], type=Bool)
    """

    # plain sequences are converted once, then handled like arrays
    if not isinstance(arrays[0], num.NumArray):
        return threshhold(num.inputarray(arrays), low, high, outputs)

    allocate_here = outputs is None
    if allocate_here:
        result_shape = (len(arrays), ) + arrays[0].shape
        outs = num.zeros(shape=result_shape, type=num.Bool)
    else:
        outs = outputs

    for position in range(len(arrays)):
        source = arrays[position]
        target = outs[position]
        # start from all-false so an unused bound leaves no stale values
        target[:] = 0

        if high is None:
            if low is not None:
                num.less(source, low, target)
            continue

        num.greater_equal(source, high, target)
        if low is not None:
            # merge the "< low" hits into the ">= high" hits
            num.logical_or(target, source < low, target)

    # caller-supplied outputs are filled in place; nothing is returned then
    if allocate_here:
        return outs
Example #7
0
def threshhold(arrays, low=None, high=None, outputs=None):
    """threshhold() marks, for each array in 'arrays', the values
    lying outside the half-open band [low, high): a result value
    is true where the input value is < low or >= high.  Passing
    None for a bound disables that side of the test.

    Input whose first element is not a NumArray is converted with
    num.inputarray() before processing.  Results go into 'outputs'
    when it is given (the function then returns None); otherwise a
    freshly allocated Bool array is returned.

    >>> a=num.arange(100, shape=(10,10))
    >>> threshhold(a, 1, 50)
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold([ range(10)]*10, 3, 7)
    array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
           [1, 1, 1, 0, 0, 0, 0, 1, 1, 1]], type=Bool)
    >>> threshhold(a, high=50)
    array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], type=Bool)
    >>> threshhold(a, low=50)
    array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], type=Bool)
    """

    first = arrays[0]
    if not isinstance(first, num.NumArray):
        # convert list-like input and start over with real arrays
        converted = num.inputarray(arrays)
        return threshhold(converted, low, high, outputs)

    if outputs is not None:
        outs = outputs
    else:
        outs = num.zeros(shape=(len(arrays),)+first.shape,
                         type=num.Bool)

    use_low = low is not None
    use_high = high is not None

    count = len(arrays)
    for k in range(count):
        values, flags = arrays[k], outs[k]
        # reset first: with neither bound given the result stays all-false
        flags[:] = 0

        if use_high:
            num.greater_equal(values, high, flags)
        elif use_low:
            num.less(values, low, flags)

        if use_high and use_low:
            # add the "< low" hits on top of the ">= high" hits
            num.logical_or(flags, values < low, flags)

    # results written into a caller-supplied 'outputs' are not returned
    if outputs is None:
        return outs