Beispiel #1
0
	def fill_edge2encodedOccurrence(self, sig_vector_fname, min_sup, max_sup, total_vertex_set=None):
		"""
		04-04-06
			Build the edge -> encoded occurrence mapping from a tab-delimited file.

			Every non-empty row of sig_vector_fname is read as two integer vertex
			ids followed by the integer entries of the edge's signature vector.
			An edge is kept when the sum of its signature vector lies within
			[min_sup, max_sup] (both bounds inclusive).
		04-06-06
			If total_vertex_set is given and non-empty, edges with a vertex
			outside of it are skipped. Edge keys are tuples in ascending order.

		Arguments:
			sig_vector_fname: path of the tab-delimited input file
			min_sup, max_sup: inclusive bounds on the signature vector sum
			total_vertex_set: optional container of allowed vertex ids

		Returns (edge2encodedOccurrence, no_of_datasets).
			no_of_datasets is the signature vector length of the first row that
			passed the vertex filter, 0 if there was none.
		"""
		sys.stderr.write("Getting edge2encodedOccurrence...\n")
		from MpiFromDatasetSignatureToPattern import encodeOccurrenceBv
		edge2encodedOccurrence = {}
		no_of_datasets = 0
		#'with' closes the input file even if a row fails to parse
		with open(sig_vector_fname) as inf:
			for row in csv.reader(inf, delimiter='\t'):
				if not row:
					#blank line (e.g. trailing newline), there is no edge to read
					continue
				#real lists rather than map() results, so that indexing, sort() and len()
				#behave the same on Python 2 and Python 3
				edge = [int(vertex) for vertex in row[:2]]
				#04-06-06 any vertex of the edge doesn't appear in total_vertex_set, skip the edge
				if total_vertex_set and (edge[0] not in total_vertex_set or edge[1] not in total_vertex_set):
					continue
				edge.sort()	#04-06-06 in ascending order
				sig_vector = [int(flag) for flag in row[2:]]
				if no_of_datasets == 0:
					no_of_datasets = len(sig_vector)
				support = sum(sig_vector)	#computed once, used for both bounds
				if min_sup <= support <= max_sup:
					edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector)
		sys.stderr.write("Done.\n")
		return edge2encodedOccurrence, no_of_datasets
Beispiel #2
0
    def fill_edge2encodedOccurrence(
        self, hostname, dbname, schema, edge2encodedOccurrence, min_sup, max_sup, edge_table="edge_cor_vector"
    ):
        """
		09-05-05
			get the edge2encodedOccurrence from the database
		"""
        sys.stderr.write("Getting edges...\n")
        (conn, curs) = db_connect(hostname, dbname, schema)
        curs.execute(
            "DECLARE crs CURSOR FOR select edge_name,sig_vector \
			from %s"
            % (edge_table)
        )
        curs.execute("fetch 5000 from crs")
        rows = curs.fetchall()
        no_of_datasets = 0
        counter = 0
        while rows:
            for row in rows:
                edge = row[0][1:-1].split(",")
                edge = map(int, edge)
                sig_vector = row[1][1:-1].split(",")
                sig_vector = map(int, sig_vector)
                if no_of_datasets == 0:
                    no_of_datasets = len(sig_vector)
                if sum(sig_vector) >= min_sup and sum(sig_vector) <= max_sup:
                    edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector)
            curs.execute("fetch 5000 from crs")
            rows = curs.fetchall()
        sys.stderr.write("Done.\n")
        return no_of_datasets
Beispiel #3
0
	def get_recurrence_go_no_rec_array_cluster_id_ls(self, curs, pattern_table, mcl_id2go_no_set, \
		occurrence_cutoff=0.8):
		"""
		2006-09-26
			from pattern_table and use mcl_id2go_no_set
			go_no_list is the go_id Set
			mcl_id2enc_recurrence is for get_recurrence_rec_array_bs_no_list()

		Reads (id, recurrence_array) rows from pattern_table through a
		server-side cursor, 5000 rows per fetch. Only patterns whose id is a
		key of mcl_id2go_no_set are used. Their recurrence_array text (first
		and last character dropped, then split on ',') is parsed as floats
		and turned into a 0/1 vector: 1 where the value >= occurrence_cutoff.
		For every go_no of a pattern the encoded vectors are OR-ed together
		and the pattern ids are collected.

		Arguments:
			curs: database cursor offering execute() and fetchall()
			pattern_table: name of the table to read from
			mcl_id2go_no_set: pattern id -> iterable of go_no
			occurrence_cutoff: threshold for the 0/1 conversion
				(the default 0.8 is arbitrary)

		Returns (recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence)
			recurrence_go_no_rec_array_cluster_id_ls: sorted list of
				[recurrence, go_no, recurrence_array, mcl_id_set], where
				recurrence_array = decodeOccurrence(OR-ed encoded vector) and
				recurrence = len(recurrence_array)
			no_of_datasets: length of the first non-empty recurrence vector used
				(0 if no pattern matched)
			mcl_id2enc_recurrence: pattern id -> its own encoded recurrence
		"""
		sys.stderr.write("Getting recurrence_go_no_rec_array_cluster_id_ls...\n")
		no_of_datasets = 0
		go_no2recurrence_cluster_id = {}
		mcl_id2enc_recurrence = {}
		#NOTE(review): pattern_table is interpolated into the SQL text (an identifier can't be
		#bound as a query parameter), it must come from trusted code
		curs.execute("DECLARE crs CURSOR FOR SELECT id, recurrence_array from %s"\
			%pattern_table)
		curs.execute("fetch 5000 from crs")
		rows = curs.fetchall()
		counter = 0	#all rows seen
		real_counter = 0	#rows whose pattern has functions predicted
		while rows:
			for mcl_id, recurrence_array in rows:
				counter += 1
				if mcl_id not in mcl_id2go_no_set:
					continue
				#this pattern has functions predicted
				#the stored array is not binary 0/1. A real list (not a map() result) is built
				#so that len() below also works on Python 3
				recurrence_array = [int(float(value)>=occurrence_cutoff) \
					for value in recurrence_array[1:-1].split(',')]
				if no_of_datasets == 0:
					no_of_datasets = len(recurrence_array)
				encoded_recurrence = encodeOccurrenceBv(recurrence_array)
				mcl_id2enc_recurrence[mcl_id] = encoded_recurrence	#2006-09-26
				for go_no in mcl_id2go_no_set[mcl_id]:
					if go_no not in go_no2recurrence_cluster_id:
						#use Set() because mcl_id has duplicates due to different p-values
						go_no2recurrence_cluster_id[go_no] = [encoded_recurrence, Set([mcl_id])]
					else:
						recurrence_cluster_id = go_no2recurrence_cluster_id[go_no]
						recurrence_cluster_id[0] = recurrence_cluster_id[0] | encoded_recurrence
						recurrence_cluster_id[1].add(mcl_id)
				real_counter += 1
			if self.report:
				sys.stderr.write("%s%s\t%s"%('\x08'*20, counter, real_counter))
			curs.execute("fetch 5000 from crs")
			rows = curs.fetchall()
		curs.execute("close crs")
		
		recurrence_go_no_rec_array_cluster_id_ls = []
		for go_no in go_no2recurrence_cluster_id:
			encoded_recurrence, mcl_id_set = go_no2recurrence_cluster_id[go_no]
			recurrence_array = decodeOccurrence(encoded_recurrence)	#not binary vector
			recurrence = len(recurrence_array)
			recurrence_go_no_rec_array_cluster_id_ls.append([recurrence, go_no, recurrence_array, mcl_id_set])
		
		#go_no is unique per entry, so the order is decided by (recurrence, go_no)
		recurrence_go_no_rec_array_cluster_id_ls.sort()
		sys.stderr.write("End getting recurrence_go_no_rec_array_cluster_id_ls.\n")
		return recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence
	def get_gene2enc_array(self, gim_inputfname, gene_id2no):
		"""
		Build the gene_no -> encoded occurrence array mapping from a tab-delimited file.

		Every non-empty row of gim_inputfname is read as
			no_of_occurrences, occurrence entries ..., gene_id
		The first column is not used here. Rows whose gene_id (last column)
		is not a key of gene_id2no are ignored. For the others the columns in
		between are parsed as integers and passed to encodeOccurrenceBv().

		Arguments:
			gim_inputfname: path of the tab-delimited input file
			gene_id2no: mapping from gene_id (string, as in the file) to gene_no

		Returns gene2enc_array, key = gene_no, value = encodeOccurrenceBv(occ_array)
		"""
		sys.stderr.write("Getting gene2enc_array...\n")
		gene2enc_array = {}
		#'with' closes the input file even if a row fails to parse
		with open(gim_inputfname) as inf:
			for row in csv.reader(inf, delimiter='\t'):
				if not row:
					#blank line (e.g. trailing newline), row[-1] would raise IndexError
					continue
				gene_id = row[-1]
				if gene_id in gene_id2no:
					#a real list rather than a map() result, same on Python 2 and Python 3
					occ_array = [int(flag) for flag in row[1:-1]]
					gene2enc_array[gene_id2no[gene_id]] = encodeOccurrenceBv(occ_array)
		sys.stderr.write("End getting gene2enc_array.\n")
		return gene2enc_array