Example #1
0
	def run(self):
		"""
		09-19-05
			rewrite
			
			--db_connect()
			--get_global_gene_id2gene_no()
			--org2tax_id()
			--get_gene_id2mt_no_list()
			--return_gene_id_set()
			--submit()
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		gene_id2gene_no = get_global_gene_id2gene_no(curs, self.organism)
		tax_id = org2tax_id(self.organism)
		"""
		#01-14-06 comment it out for future 
		gene_no2tf_set = get_gene_no2tf_set(curs)	#12-15-05 just yeast.
		#12-15-05 convert gene_no(integer) into gene_id(string)
		gene_id2mt_no_list = {}
		for gene_no, tf_set in gene_no2tf_set.iteritems():
			gene_id2mt_no_list[repr(gene_no)] = list(tf_set)
		"""
		gene_id2mt_no_list = get_gene_id2mt_no_list(tax_id)
		gene_id_set = self.return_gene_id_set(self.dir, gene_id2gene_no, self.min_frequency)
		self.submit(curs, self.output_table, gene_id_set, gene_id2gene_no, gene_id2mt_no_list)
		if self.needcommit:
			conn.commit()
Example #2
0
    def run(self):
        """
		09-08-05
			
			--db_connect()
			--org_short2long()
			--org2tax_id()
			--setup_acc2gene_id()
			if self.new_table
				--create_output_table()
			--parse_input_filename()
		"""
        (conn, curs) = db_connect(self.hostname, self.dbname)
        long_organism = org_short2long(self.organism)
        tax_id_set = Set([org2tax_id(long_organism)])

        MdbId2GeneId_instance = MdbId2GeneId()
        acc2gene_id = MdbId2GeneId_instance.setup_acc2gene_id(
            self.acc_file, tax_id_set)
        if self.new_table:
            self.create_output_table(curs, self.output_table)
        self.parse_input_filename(curs, self.input_filename, self.output_table,  acc2gene_id,\
         org2tax_id(long_organism), self.up_length, self.comment, long_organism, self.type)
        if self.commit:
            curs.execute("end")
Example #3
0
	def run(self):
		"""
		06-03-05
		
		--db_connect()
		--prepare_gene_no2go_no()
		--get_function_edge_matrix_data()
			--_get_function_edge_matrix_data()
				--return_common_go_no_of_edge()
				--return_edge_vector()
		--edge_data_output()
		"""		
		conn,curs = db_connect(self.hostname, self.dbname,self.schema)
		
		self.gene_no2go_no = self.prepare_gene_no2go_no(curs)
		self.get_function_edge_matrix_data(curs, self.no_of_nas, self.table)
		
		#make a directory first
		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		
		for go_no, edge_data in self.go_no2edge_matrix_data.iteritems():
			if len(edge_data)>=self.min_no_of_edges:
				self.edge_data_output(self.output_dir, go_no, edge_data)
				self.go_no_qualified.append(go_no)
Example #4
0
	def run(self):
		"""
		01-18-06
		
			--db_connect()
			--get_mt_id_gc_perc2no_of_random_hits()
			--parse_file()
				--write_down_mt_id2no_of_hits()
					--get_seq_id_gc_percentage_length()
					--get_hit_pvalue()
			
			--draw_pvalue_histogram()
			
			--calculate_pi0()
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		data_fname = '%s.data'%self.output_prefix
		if os.path.isfile(data_fname):
			sys.stderr.write("Getting p_value from %s..."%os.path.basename(data_fname))
			reader = csv.reader(open(data_fname), delimiter='\t')
			for row in reader:
				self.p_value_list.append(float(row[5]))
			del reader
			sys.stderr.write("Done.\n")
		else:
			pickle_fname = os.path.expanduser('~/pickle/mt_id_gc_perc2no_of_random_hits.pickle')
			if os.path.isfile(pickle_fname):
				mt_id_gc_perc2no_of_random_hits = cPickle.load(open(pickle_fname))
			else:
				mt_id_gc_perc2no_of_random_hits = self.get_mt_id_gc_perc2no_of_random_hits(curs,\
					self.matrix2no_of_random_hits_table)
				of = open(pickle_fname, 'w')
				cPickle.dump(mt_id_gc_perc2no_of_random_hits, of)
				del of
			writer = csv.writer(open(data_fname, 'w') , delimiter='\t')
			
			self.log_f = open('%s.log'%self.output_prefix,'w')
			
			files = os.listdir(self.input_dir)
			files.sort()
			sys.stderr.write("\tTotally, %d files to be processed.\n"%len(files))
			for input_fname in files:
				input_fname = os.path.join(self.input_dir, input_fname)
				self.parse_file(curs, input_fname, writer, mt_id_gc_perc2no_of_random_hits)
			del writer
			self.log_f.close()
		
		self.p_value_list.sort()
		top_p_value_cutoff = 0.95	#important: not 1; the p-value histogram shows an abnormal peak from 0.95 to 1
		top_p_value_list = self.remove_top_p_values(self.p_value_list, top_p_value_cutoff)
		
		figure_fname = '%s_p_value_hist.png'%self.output_prefix
		self.draw_pvalue_histogram(self.p_value_list, figure_fname)
		
		figure_fname = '%s_pi0Tolambda.png'%self.output_prefix
		lambda_list, pi0_list = self.calculate_pi0_list(self.p_value_list, figure_fname, top_p_value_cutoff)
		
		estimated_pi0 = self.estimate_pi0(lambda_list, pi0_list)
		
		self.cal_q_value_list(self.p_value_list, estimated_pi0, top_p_value_cutoff, self.output_prefix)
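calculate_pi0_list() is not shown here; below is a minimal sketch of the Storey-style estimate it presumably computes, pi0(lambda) = #{p > lambda}/(m*(1-lambda)) over a lambda grid. The grid step and names are assumptions.

def calculate_pi0_list(p_value_list, top_p_value_cutoff=0.95, step=0.05):
	#pi0(lam) = fraction of p-values above lam, corrected for the interval width (1-lam)
	m = float(len(p_value_list))
	lambda_list, pi0_list = [], []
	for i in range(int(top_p_value_cutoff/step)):
		lam = i*step
		no_of_p_above = sum(1 for p in p_value_list if p > lam)
		lambda_list.append(lam)
		pi0_list.append(no_of_p_above/(m*(1.0-lam)))
	return lambda_list, pi0_list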
Example #5
0
	def run(self):
		"""
		10-22-05
			
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)
		print "this is node",node_rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			edge2occurrrence, no_of_datasets = get_edge2occurrence(curs, self.min_sup, self.max_sup)
			edge2occurrrence_pickle = cPickle.dumps((edge2occurrrence, no_of_datasets), -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(edge2occurrrence_pickle, node, 0)
			del conn, curs
		elif node_rank in free_computing_nodes:	#exclude the last node
			data, source, tag = communicator.receiveString(0, 0)
			edge2occurrrence, no_of_datasets = cPickle.loads(data)
		
		mpi_synchronize(communicator)
		if node_rank == 0:
			inf = csv.reader(open(self.inputfile,'r'), delimiter='\t')
			parameter_list = [inf]
			input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, input_handler=self.input_handler)
			del inf
		elif node_rank in free_computing_nodes:
			parameter_list = [self.min_size, self.alpha, edge2occurrrence, no_of_datasets]
			computing_node(communicator, parameter_list, self.node_fire, report=self.report)
		elif node_rank == communicator.size-1:
			writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t')
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			del writer
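input_node(), computing_node() and output_node() are imported helpers not shown here. As a rough, self-contained illustration of the same three-role layout (node 0 feeds input, the middle nodes compute, the last node collects output), here is a sketch using mpi4py instead of the Scientific.Python-style MPI API above; all names in it are illustrative.

from mpi4py import MPI	#assumption: mpi4py stands in for the MPI wrapper used above

def toy_three_role_run():
	#needs at least 3 MPI processes
	comm = MPI.COMM_WORLD
	node_rank, size = comm.rank, comm.size
	free_computing_nodes = range(1, size-1)	#node 0 feeds, last node collects
	if node_rank == 0:	#input node: deal work units round-robin, then stop signals
		for i in range(100):
			comm.send(i, dest=free_computing_nodes[i%len(free_computing_nodes)], tag=0)
		for node in free_computing_nodes:
			comm.send(None, dest=node, tag=0)
	elif node_rank in free_computing_nodes:	#computing node: square each unit
		while True:
			work_unit = comm.recv(source=0, tag=0)
			if work_unit is None:
				comm.send(None, dest=size-1, tag=1)	#pass the stop signal on
				break
			comm.send(work_unit*work_unit, dest=size-1, tag=1)
	else:	#output node: collect until every computing node has stopped
		no_of_stops = 0
		while no_of_stops < len(free_computing_nodes):
			result = comm.recv(source=MPI.ANY_SOURCE, tag=1)
			if result is None:
				no_of_stops += 1
			else:
				print(result)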
Example #6
0
	def output_in_copath_format(self, outfname, node_rank):
		"""
		04-20-05
			output go_no2cluster_group
		04-25-05
			cluster_id redefined
		"""
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		
		outf = open(outfname, 'a')
		writer = csv.writer(outf, delimiter='\t')
		for go_no, cluster_group in self.go_no2cluster_group.iteritems():
			counter = 0
			for bicluster in cluster_group.bicluster_list:
				seed_edge_id_list = list(take(cluster_group.edge_id_array, bicluster.row_index_list))
				edge_id_list = seed_edge_id_list + bicluster.added_edge_id_list
				vertex_list , edge_list = get_vertex_edge_list_by_edge_id(curs, edge_id_list)
				no_of_nodes = len(vertex_list)
				connectivity = len(edge_list)*2.0/(no_of_nodes*(no_of_nodes-1))
				vertex_string = '{' + ';'.join(vertex_list) + ';}'
				edge_string  = self.edge_string_from_edge_list(edge_list)
				cluster_id = "%s.%s"%(go_no, counter)
				writer.writerow([cluster_id, connectivity, vertex_string, edge_string])
				counter += 1
		del writer
		outf.close()
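The connectivity computed above is plain graph density: with n nodes and e edges in an undirected simple graph, connectivity = 2e/(n*(n-1)). A tiny self-contained check:

def connectivity(no_of_nodes, no_of_edges):
	#fraction of possible edges that are present; 1.0 for a complete graph
	if no_of_nodes < 2:
		return 0.0
	return no_of_edges*2.0/(no_of_nodes*(no_of_nodes-1))

assert connectivity(4, 6) == 1.0	#K4 contains all 6 possible edges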
Example #7
0
	def run(self):
		"""
		2007-03-20
		2007-04-03
		"""		
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
		if self.draw_only:
			header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(self.output_fname)
			data_matrix = Numeric.array(data_matrix)
		else:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(self.input_fname)
			
			snp_acc_ls = header[2:]
			strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list)
			snp_id2index = self.get_id2index(curs, self.snp_locus_table, snp_acc_ls)
			
			from dbSNP2data import dbSNP2data
			dbSNP2data_instance = dbSNP2data(report=self.report)
			data_matrix = dbSNP2data_instance.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number, self.data_table, need_heterozygous_call=1)
			
			FilterStrainSNPMatrix_instance.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list)
		
		heterozygous_data_matrix, coarse_data_matrix = self.get_heterozygous_and_coarse_data_matrix(data_matrix)
		self.displayDataMatrix(heterozygous_data_matrix, title='heterozygous_data_matrix, 5-10=hetero, else=0')
		self.displayDataMatrix(coarse_data_matrix, title='coarse_data_matrix, 0=NA, 1=homo, 2=hetero')
		raw_input("enter")
Example #8
0
	def run(self):
		"""
		03-30-05
		
		06-30-05
			more complex data grouping via which_column_list and group_size_list
			if both lists are of length 2, 2-level grouping.
			
		--db_connect()
		--get_go_no2depth()
		--data_fetch()
		--group_data()
		if self.stat_table_fname:
			--prediction_space_output()
		"""
		self.init()
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		from codense.common import  get_go_no2depth
		self.go_no2depth = get_go_no2depth(curs)
		
		self.data_fetch(curs, self.table, self.mcl_table, self.gene_table)
		local_prediction_space2attr = self.group_data(self.prediction_data,key_column=self.which_column_list[0], group_size=self.group_size_list[0])
		for key, unit in local_prediction_space2attr.iteritems():
			if len(self.which_column_list)>1 and len(self.group_size_list)>1:
				local_prediction_space2attr_2 = self.group_data(unit, key_column=self.which_column_list[1], group_size=self.group_size_list[1])
				for key2, unit2 in local_prediction_space2attr_2.iteritems():
					self.prediction_space2attr[(key,key2)] = unit2
			else:
				self.prediction_space2attr[(key,)] = unit
		if self.stat_table_fname:
			stat_table_f = open(self.stat_table_fname, 'w')
			self.prediction_space_output(stat_table_f, self.prediction_space2attr)
Example #9
0
    def run(self):
        """
		03-01-05
			initial
		
		--db_connect()
		--get_go_no2term_id()		#for get_distance(), needs self.go_no2term_id
		--data_fetch()
			--gene_no2p_gene_setup()
		--p_gene_id_map()
			--_p_gene_map()  or --_p_gene_map_network_topology()
				--get_distance()		#touches self.go_no2distance
		--submit()		
		"""
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        curs.execute("begin")  # because of cursor usage
        self.go_no2term_id = get_go_no2term_id(curs, self.schema, self.term_table)
        self.data_fetch(curs, self.p_gene_table, self.gene_p_table)
        if self.type == 2 and self.pattern_table == None:
            sys.stderr.write("\n type=2 needs pattern_table.\n")
            sys.exit(3)
        self.p_gene_map(
            self.gene_no2p_gene,
            self.p_gene_id_map,
            curs,
            self.distance_table,
            self.go_no2distance,
            self.go_no2term_id,
            self.type,
        )
        if self.needcommit:
            self.submit(curs, self.gene_p_table, self.p_gene_id_map)
            curs.execute("end")
Example #10
0
    def fill_edge2encodedOccurrence(
        self, hostname, dbname, schema, edge2encodedOccurrence, min_sup, max_sup, edge_table="edge_cor_vector"
    ):
        """
		09-05-05
			get the edge2encodedOccurrence from the database
		"""
        sys.stderr.write("Getting edges...\n")
        (conn, curs) = db_connect(hostname, dbname, schema)
        curs.execute(
            "DECLARE crs CURSOR FOR select edge_name,sig_vector \
			from %s"
            % (edge_table)
        )
        curs.execute("fetch 5000 from crs")
        rows = curs.fetchall()
        no_of_datasets = 0
        counter = 0
        while rows:
            for row in rows:
                edge = row[0][1:-1].split(",")
                edge = map(int, edge)
                sig_vector = row[1][1:-1].split(",")
                sig_vector = map(int, sig_vector)
                if no_of_datasets == 0:
                    no_of_datasets = len(sig_vector)
                if sum(sig_vector) >= min_sup and sum(sig_vector) <= max_sup:
                    edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector)
            curs.execute("fetch 5000 from crs")
            rows = curs.fetchall()
        sys.stderr.write("Done.\n")
        return no_of_datasets
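The DECLARE/fetch loop above is the standard server-side cursor idiom for streaming a large table without holding the whole result set in memory. A sketch of the same pattern with a psycopg2 named cursor (the driver and connection parameters here are assumptions):

import psycopg2

conn = psycopg2.connect(host='localhost', dbname='graphdb')  #hypothetical DSN
curs = conn.cursor(name='crs')  #a named cursor lives server-side
curs.itersize = 5000  #rows per network round-trip, like "fetch 5000 from crs"
curs.execute("SELECT edge_name, sig_vector FROM edge_cor_vector")
for edge_name, sig_vector in curs:
    pass  #parse and filter each row, as fill_edge2encodedOccurrence() does
curs.close()
conn.close()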
Example #11
0
	def run(self):
		"""
		04-18-05
			Serves jasmine's darwin input.
		04-19-05
			changed to put 2nd-order clusters and their connected components into one file.
		08-31-05
			much simpler, just output clusters from mcl_table
			
			--db_connect()
			--data_fetch()
				(loop)
					--get_gene_no2gene_id()
					--get_no_of_total_genes()
					--get_go_functions_of_this_gene_set()
					--get_information_of_go_functions()
					--get_cor_sig_2d_list()
					--return_string_form_of_cluster_dstructure()
		"""
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		
		
		#e_splat_table = self.table+'e'
		#e_mcl_table = self.mcl_table+'e'
		#CoexprFromCooccu_instance = CoexprFromCooccu()
		#pre_2nd_cc_hierarchy = CoexprFromCooccu_instance.data_fetch(curs, self.mcl_table, e_mcl_table)
		#mcl_id2cluster_dstructure = self.data_fetch(curs, self.table,  self.mcl_table, crs_no=1)
		#mcl_id_2nd_order2cluster_dstructure = self.data_fetch(curs, e_splat_table, e_mcl_table, crs_no=2)
		#self.cluster_dstructure_output_with_both_hierarchy(curs, self.output_fname, pre_2nd_cc_hierarchy,\
		#	mcl_id2cluster_dstructure, mcl_id_2nd_order2cluster_dstructure)
		#self.cluster_dstructure_output(curs, self.output_fname, self.order_1st_id2all_clusters)
		self.data_fetch(curs, self.table, self.mcl_table, crs_no=1, output_fname=self.output_fname)
Example #12
0
	def run(self):
		"""
		2006-09-04
			-db_connect()
			-get_gene_id_list()
			-get_masked_seq()
			-run_transfac()
			-get_top_mt_id_list()
			-output_transfac_pwm_cismodscan_format()
			-run_cismodscan()
		"""
		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		seq_fname = os.path.join(self.output_dir, 'pattern_%s.seq'%(self.pattern_id))
		transfac_output_fname = os.path.join(self.output_dir, 'pattern_%s.match'%(self.pattern_id))
		pwm_fname = os.path.join(self.output_dir, 'pattern_%s.pwm'%(self.pattern_id))
		pwm_id_mapping_fname = os.path.join(self.output_dir, 'pattern_%s.pwm_id_mapping'%(self.pattern_id))
		cismodscan_output_fname = os.path.join(self.output_dir, 'pattern_%s.cismodscan'%(self.pattern_id))
		
		(conn, curs) =  db_connect(self.hostname, self.dbname)
		gene_id_list = self.get_gene_id_list(curs, self.pattern_table, self.pattern_id)
		self.get_masked_seq(curs, gene_id_list, self.prom_seq_table, seq_fname)
		mt_id2no_of_seqs = self.run_transfac(seq_fname, transfac_output_fname, self.match_bin_path, self.matrix_data_path, self.profile_filename)
		mt_id_list = self.get_top_mt_id_list(mt_id2no_of_seqs, self.no_of_tfs)
		matrix_table = 'transfac.matrix'
		self.output_transfac_pwm_cismodscan_format(curs, mt_id_list, matrix_table, pwm_fname, pwm_id_mapping_fname)
		self.run_cismodscan(self.cismodscan_binary_path, seq_fname, pwm_fname, cismodscan_output_fname, self.no_of_tfs, self.mod_length, self.expt_ratio)
Example #13
0
	def run(self):
		"""
		01-03-06
		"""
		padding_width=5
		padding_height=10
		max_seq_name_length = 20
		bs_width=1
		im_10kb_length=800
		
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		tf_info_list = self.get_gene_binding_sites(curs, self.input_gene_id)
		tf_name2binding_sites = self.get_tf_name2binding_sites(tf_info_list)
		seq_name, seq_length, seq_strand = self.get_gene_prom_seq_info(curs, self.input_gene_id)
		
		tf_name_list = tf_name2binding_sites.keys()
		tf_legend_im, tf_name2color = self.draw_tf_legend(tf_name_list, padding_width, max_seq_name_length)
		composite_tf_im = self.draw_tf_line(seq_name, seq_length, seq_strand, tf_info_list, tf_name2color, \
			padding_width, padding_height, max_seq_name_length, bs_width, im_10kb_length)
		
		im = self.get_composite_and_individual_tf_line(composite_tf_im, tf_name2binding_sites, \
			seq_name, seq_length, seq_strand, tf_name2color, padding_width,\
			padding_height, max_seq_name_length, bs_width, im_10kb_length)
		
		tf_legend_output_fname = '%s_tf_legend.png'%self.output_prefix
		tf_legend_im.save(tf_legend_output_fname)
		tf_line_output_fname = '%s_tf_line.png'%self.output_prefix
		im.save(tf_line_output_fname)
Example #14
0
	def run(self):
		"""
		10-27-05
			
			--db_connect()
			--get_prediction_step()
			--get_prediction_heap()
			--get_sorted_param_acc_list()
			--get_cutoff()
			--lm_table_create()
			--submit()
		"""
		p_gene_lm_instance = p_gene_lm()
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		step = self.get_prediction_step(curs, self.p_gene_table, p_gene_lm_instance.is_correct_dict, \
			self.judger_type)
		prediction_heap = self.get_prediction_heap(curs, self.p_gene_table, p_gene_lm_instance.is_correct_dict, \
			self.judger_type, self.which_dict, self.which, step)
		sorted_param_acc_list = self.get_sorted_param_acc_list(prediction_heap)
		del prediction_heap	#10-27-05 release memory
		cutoff_row = self.get_cutoff(sorted_param_acc_list, self.accuracy_cut_off)
		del sorted_param_acc_list	#10-27-05 release memory
		print "cutoff_row",cutoff_row
		if self.commit and cutoff_row and self.lm_table:	#cutoff_row is not None
			p_gene_lm_instance.lm_table_create(curs, self.lm_table)
			go_no2lm_results = {}
			go_no2lm_results[-1] = [[0]*7, [1]*7, cutoff_row[0]]	#11-09-05 extend the list
			go_no2lm_results[-1][0][self.which+1] = 1	#the coefficient for the "which" param is 1, others are 0
			p_gene_lm_instance.submit(curs, self.lm_table, go_no2lm_results)
			curs.execute("end")
Example #15
0
	def run(self):
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		if self.mapping_file==None:
			self.mapping_file = os.path.expanduser('~/mapping/%s_datasets_mapping'%self.schema)
		dataset_no2id = self.get_dataset_no2id(self.mapping_file)
		self.submit_to_table(curs, self.table, dataset_no2id)
		if self.commit:
			curs.execute("end")
Example #16
0
	def run(self):
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		MdbId2GeneId_instance = MdbId2GeneId()
		acc_tax_id2gene_id_list = MdbId2GeneId_instance.setup_acc2gene_id(self.acc_file, Set(self.tax_id_list))
		for input_fname in self.input_fname_list:
			self.parse_intact_xml_file(curs, input_fname, self.expt_table, self.interaction_table, acc_tax_id2gene_id_list)
		if self.commit:
			curs.execute("end")
Example #17
0
	def run(self):
		"""
		12-28-05
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		self.parse(curs, self.input_fname, self.table)
		if self.commit:
			curs.execute("end")
Example #18
0
	def run(self):
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		tax_tree = self.construct_tax_tree(curs)
		tax_id_set = self.get_tax_id_set(curs)
		tax_id2index = self.get_tax_id2index_given_tax_id_set(tax_tree, tax_id_set)
		self.submit2tax_id_index(curs, tax_id2index)
		self.submit_common_ancestor(self.src_tax_id, tax_id2index, curs)
		if self.commit:
			curs.execute("end")
Example #19
0
	def run(self):
		"""
		2006-09-25
			use self.cluster_bs_table and self.pattern_table
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		mcl_id2tf_set = get_mcl_id2tf_set(curs, self.cluster_bs_table, self.mt_no2tf_name)
		self._tf_darwin_format(curs, self.pattern_table, self.output_fname, self.gene_no2id, mcl_id2tf_set)
		del conn, curs
Example #20
0
	def run(self):
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		acc_tax_id2tf_acc = self.get_factor_info(curs, self.factor_table)
		tf_acc2gene_id_bridge_acc_ls = self.setup_acc2gene_id(self.gene2acc_file, acc_tax_id2tf_acc)
		tf_acc2entrezgene_id_set = self.submit_raw_result(curs, tf_acc2gene_id_bridge_acc_ls, self.raw_output_table)
		self.submit_result(curs, tf_acc2entrezgene_id_set, self.output_table)
		
		if self.commit:
			curs.execute("end")
Example #21
0
	def run(self):
		"""
		03-09-05
		
		04-01-05
		"""
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		self.dstruc_loadin(curs)
		if self.r_fname:
			self.r_f = open(self.r_fname, 'w')
		if self.type == 1:
			subgraph = self.get_subgraph(curs, self.table, self.mcl_table, self.mcl_id)
			#unweighted
			weighted=0
			self.subgraph_output(self.r_f, subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \
				self.centralnode, self.function, self.functioncolor, self.plot_type, weighted)
			self.r_f.close()
			r.source(self.r_fname)
			raw_input("Pause:\t")
		elif self.type == 2:
			if self.gene_table==None or self.gene_p_table==None:
				sys.stderr.write("Error: Please specify both the gene_p_table and gene_table.\n")
				sys.exit(2)
			subgraph = self.context_subgraph(curs, self.table, self.mcl_table, self.gene_p_table, self.gene_table, \
				self.centralnode, self.function)
			self.subgraph_output(self.r_f, subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \
				self.centralnode, self.function, self.functioncolor, self.plot_type)
			self.r_f.close()
			r.source(self.r_fname)
			raw_input("Pause:\t")
		elif self.type == 3:
			if self.edge_table==None:
				sys.stderr.write("Error: Please specify the edge_table.\n")
				sys.exit(2)
			for i in range(self.no_of_datasets):
				sys.stdout.write("Dataset %s\n"%(i+1))
				sub_subgraph = self.subgraph_in_one_dataset(curs, self.table, self.mcl_table, self.edge_table, self.mcl_id, i)
				self.subgraph_output(self.r_f, sub_subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \
					self.centralnode, self.function, self.functioncolor, self.plot_type)
				
				self.r_f.close()
				r.source(self.r_fname)
				#asking  to continue or not
				no_stop = raw_input("Continue? Y/n:\t")
				if no_stop == 'n' or no_stop == 'N':
					sys.exit(3)
				#open it again for the next dataset
				self.r_f = open(self.r_fname, 'w')
		elif self.type==4:
			subgraph = self.get_subgraph(curs, self.table, self.mcl_table, self.mcl_id)
			original_subgraph = self.get_original_graph(curs, subgraph)
			self.subgraph_output(self.r_f, original_subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \
				self.centralnode, self.function, self.functioncolor, self.plot_type)
			self.r_f.close()
			r.source(self.r_fname)
			raw_input("Pause:\t")
Example #22
0
	def run(self):
		"""
		02-28-05
		
		03-07-05
			implemented two posterior maneuvers on go_no2prediction_space: grouping and accumulating.
			See the 2005 log, section 'linear model overfitting', for details.
		
		--init()
		--db_connect()
		--IF self.p_value_cut_off==0
			--get_go_no2lm_results
			--get_general_lm_results
		
		--data_fetch()
			--_p_gene_analysis()
				--prediction_accepted()
		--IF self.stat_table_fname
			--overview_stats()
				--return_known_unknown_gene_sets()
			--go_no_accuracy()
			--table_output()
				--return_known_unknown_gene_sets()
		--IF self.gene_p_table
			--gene_p_table_submit()
		"""
		self.init()
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		self.known_genes_dict = self.get_known_genes_dict(curs)
		if self.p_value_cut_off == 0:
			if self.lm_table:
				self.go_no2lm_results, lm_results_2d_list = self.get_go_no2lm_results(curs, self.lm_table)
				self.general_lm_results = self.get_general_lm_results(lm_results_2d_list)
				if self.debug:
					print "go_no2lm_results: ",self.go_no2lm_results
					print "general_lm_results: ",self.general_lm_results
			else:
				sys.stderr.write("p_value_cut_off==0, need the lm_table to get the linear model\n")
				sys.exit(127)
		
		self.data_fetch(curs, self.gene_table, self.table)
		if self.stat_table_fname:
			self.overview_stats(self.stat_table_f)
			self.go_no_accuracy(self.prediction_pair2attr, self.stat_table_f, curs)
			self.table_output(self.stat_table_f, self.prediction_space2attr)
			"""
			#first grouping the data of parent-child go functions
			distance_table = 'go.node_dist'
			go_no_group2prediction_space = self.return_go_no_group2prediction_space(self.go_no2prediction_space, curs, distance_table)
			#output the prediction_space go_no by go_no
			self.prediction_space_split_output(self.stat_table_f, go_no_group2prediction_space, self.recurrence_gap_size, self.connectivity_gap_size)
			"""
		if self.gene_p_table:
			self.gene_p_table_submit(curs, self.gene_p_table, self.gene_p_list)
		if self.needcommit:
			curs.execute("end")
Example #23
0
	def run(self):
		"""
		2007-02-08
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		mcl_id_set = self.get_mcl_id_set_from_good_cluster_table(curs, self.cluster_bs_table)
		mcl_id2vertex_edge_recurrence = self.get_mcl_id2vertex_edge_recurrence(curs, self.pattern_table, self.gene_no2id, self.go_no2id, mcl_id_set)
		self._prediction_csv_format(curs, self.input_fname, self.lm_bit, \
			self.gene_no2id, self.go_no2id, self.output_fname, mcl_id2vertex_edge_recurrence)
		del conn, curs
Example #24
0
	def run(self):
		"""
		2006-09-25
			use self.pattern_table
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		#mcl_id_set = self.get_mcl_id_set_from_good_cluster_table(curs, schema_instance.good_cluster_table)
		mcl_id_set = None	#01-14-06
		self._pattern_darwin_format(curs, self.pattern_table, self.gene_no2id, self.go_no2id, self.output_fname, mcl_id_set)
		del conn, curs
Example #25
0
	def run(self):
		"""
		2006-09-25
			use self.input_fname
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		self._prediction_darwin_format(curs, self.input_fname, self.lm_bit, \
			self.gene_no2id, self.go_no2id, self.output_fname)
		#self._prediction_darwin_format_from_file(self.input_fname, self.gene_no2id, self.go_no2id, self.output_fname)
		del conn, curs
Example #26
0
	def run(self):
		"""
		04-11-05
		
		04-19-05
			generate pre_2nd_cc_hierarchy.
		"""
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		self.data_fetch(curs, self.mcl_table, self.e_mcl_table)
		self.output(self.pre_2nd_cc_hierarchy)
Example #27
0
	def run(self):
		if self.ofname and self.acc_cut_off and self.lm_bit:
			schema_instance = form_schema_tables(self.ofname, self.acc_cut_off, self.lm_bit)
			
		else:
			sys.stderr.write("ofname: %s and acc_cut_off: %s and lm_bit %s, NOT VALID\n"%(self.ofname, self.acc_cut_off, self.lm_bit))
			sys.exit(2)
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		self._cluster_darwin_format(curs, schema_instance.good_cluster_table, self.gene_no2id, self.go_no2id, self.output_fname)
		del conn, curs
Example #28
0
	def run(self):
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		if self.input_type==1:
			failed_ls = self.ProcessDianeGPSInfo(curs, self.input_fname, self.strain_info_table, self.report)
		elif self.input_type==2:
			failed_ls = self.Process850NaturalAccessions(curs, self.input_fname, self.strain_info_table, self.report)
		print "%s failures"%len(failed_ls)
		print failed_ls
		if self.commit:
			curs.execute("end")
Example #29
0
    def run(self):
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        acc_tax_id2tf_acc = self.get_factor_info(curs, self.factor_table)
        tf_acc2gene_id_bridge_acc_ls = self.setup_acc2gene_id(
            self.gene2acc_file, acc_tax_id2tf_acc)
        tf_acc2entrezgene_id_set = self.submit_raw_result(
            curs, tf_acc2gene_id_bridge_acc_ls, self.raw_output_table)
        self.submit_result(curs, tf_acc2entrezgene_id_set, self.output_table)

        if self.commit:
            curs.execute("end")
Example #30
0
    def run(self):
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        id2chr_start_stop = self.get_id2chr_start_stop(curs, self.tax_id)
        gene_id2gene_symbol = get_gene_id2gene_symbol(curs, self.tax_id)
        mt_id2gene_symbol = self.get_mt_id2gene_symbol(curs, gene_id2gene_symbol, self.tax_id)
        mt_id_gene_symbol2color_code = self.get_mt_id2color_code(
            self.input_dir, self.color_code_list, mt_id2gene_symbol)
        self.parse_files(
            self.input_dir, self.output_fname, id2chr_start_stop,
            mt_id_gene_symbol2color_code, mt_id2gene_symbol)
Example #31
0
	def run(self):
		files = os.listdir(self.input_dir)
		files.sort()
		sys.stderr.write("\tIn total, %d files to be processed.\n"%len(files))
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		for input_fname in files:
			input_fname = os.path.join(self.input_dir, input_fname)
			self.parse_file(curs, input_fname, self.output_table, self.GC_percentage)
		
		if self.commit:
			curs.execute("end")
Example #32
0
	def run(self):
		"""
		03-02-05
			initial
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		curs.execute("begin")	#because of cursor usage
		
		self.data_fetch(curs, self.p_gene_table, self.gene_p_table, self.mcl_table)
		self.output_dict[self.output_type](curs, sys.stdout, self.known_gene_no2p_gene_id_src, self.unknown_gene_no2p_gene_id_src, self.p_gene_id_src_map)
		self.stat_output(sys.stdout, self.known_gene_no2p_gene_id_src, self.unknown_gene_no2p_gene_id_src)
Example #33
0
    def run(self):
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        id2chr_start_stop = self.get_id2chr_start_stop(curs, self.tax_id)
        gene_id2gene_symbol = get_gene_id2gene_symbol(curs, self.tax_id)
        mt_id2gene_symbol = self.get_mt_id2gene_symbol(curs,
                                                       gene_id2gene_symbol,
                                                       self.tax_id)
        mt_id_gene_symbol2color_code = self.get_mt_id2color_code(
            self.input_dir, self.color_code_list, mt_id2gene_symbol)
        self.parse_files(self.input_dir, self.output_fname, id2chr_start_stop,
                         mt_id_gene_symbol2color_code, mt_id2gene_symbol)
Example #34
0
    def run(self):
        files = os.listdir(self.input_dir)
        files.sort()
        sys.stderr.write("\tTotally, %d files to be processed.\n" % len(files))
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        for input_fname in files:
            input_fname = os.path.join(self.input_dir, input_fname)
            self.parse_file(curs, input_fname, self.output_table,
                            self.GC_percentage)

        if self.commit:
            curs.execute("end")
Example #35
0
	def run(self):
		"""
		11-14-05
			--db_connect()
			--get_prom_seq_from_entrezgene_mapping_table()
				--return_closest_anchor()
				--tax_id2org()
				--get_sequence_segment()
				--submit_to_prom_seq()
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		self.get_prom_seq_from_entrezgene_mapping_table(curs, self.prom_seq_table)
		if self.commit:
			curs.execute("end")
Example #36
0
    def run(self):
        """
		02-01-06
		"""
        (conn, curs) = db_connect(self.hostname, self.dbname)
        tax_id = get_tax_id_from_org(curs, self.organism)
        mt_id2no = get_mt_id2no(curs, self.matrix_table)
        prom_id2gene_id = self.get_prom_id2gene_id(curs, self.prom_seq_table,
                                                   self.organism)

        self.parse_input_fname(curs, self.input_fname, self.p_value_cut_off,
                               prom_id2gene_id, mt_id2no, tax_id,
                               self.output_table)
        if self.commit:
            curs.execute("end")
Example #37
0
    def run(self):
        """
		11-15-05
			correct a bug related to self.size
		2006-08-27
			if sequence is empty, ignore it.
		2006-11-27
			add running_type
		"""
        if not os.path.isdir(self.folder):
            os.makedirs(self.folder)
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        if self.running_type == 1:
            curs.execute(
                "DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \
				where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'"
                % self.organism)
            #09-14-05	not null sequence and no 'random' in chromosome
        elif self.running_type == 2:
            curs.execute(
                "DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\
				where r.acc_ver=a.acc_ver and a.tax_id=%s" % org2tax_id(self.organism))
            #2006-11-27 restrict to a specific tax_id; the latter condition guarantees that the sequence is not empty
        else:
            sys.stderr.write("Unsupported running_type: %s\n" %
                             self.running_type)
            sys.exit(3)
        curs.execute("fetch %s from crs" % self.size)
        rows = curs.fetchall()
        counter = 0
        sys.stderr.write("Starting to output...\n")
        while rows:
            output_file = os.path.join(self.folder,
                                       '%s%s' % (self.prefix, counter))
            of = open(output_file, 'w')
            for row in rows:
                id, sequence = row
                if sequence:
                    of.write('>%s\n%s\n' % (id, sequence))
            del of
            counter += 1
            if self.report:
                sys.stderr.write('%s%s' % ('\x08' * 20, counter))
            curs.execute("fetch %s from crs" % self.size)
            rows = curs.fetchall()
        del conn, curs
        sys.stderr.write("Done.\n")
Example #38
0
    def run(self):
        """
		2007-03-20
		2007-04-03
		"""
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
        if self.draw_only:
            header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
                self.output_fname)
            data_matrix = Numeric.array(data_matrix)
        else:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
                self.input_fname)

            snp_acc_ls = header[2:]
            strain_id2index = self.get_id2index(curs, self.strain_info_table,
                                                strain_acc_list)
            snp_id2index = self.get_id2index(curs, self.snp_locus_table,
                                             snp_acc_ls)

            from dbSNP2data import dbSNP2data
            dbSNP2data_instance = dbSNP2data(report=self.report)
            data_matrix = dbSNP2data_instance.get_data_matrix(
                curs,
                strain_id2index,
                snp_id2index,
                nt2number,
                self.data_table,
                need_heterozygous_call=1)

            FilterStrainSNPMatrix_instance.write_data_matrix(
                data_matrix, self.output_fname, header, strain_acc_list,
                category_list)

        heterozygous_data_matrix, coarse_data_matrix = self.get_heterozygous_and_coarse_data_matrix(
            data_matrix)
        self.displayDataMatrix(
            heterozygous_data_matrix,
            title='heterozygous_data_matrix, 5-10=hetero, else=0')
        self.displayDataMatrix(
            coarse_data_matrix,
            title='coarse_data_matrix, 0=NA, 1=homo, 2=hetero')
        raw_input("enter")
Example #39
0
	def run(self):
		"""
		09-18-05
		09-30-05
			way of calling get_mt_id2no() changed
			getting gene_id_dict now runs in a thread, to speed things up
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname)
		tax_id = org2tax_id(self.organism)
		tax_id_set = Set([tax_id])
		get_gene_id_dict_instance = get_gene_id_dict(self.acc_file, tax_id_set)
		get_gene_id_dict_instance.start()
		mt_id2no = get_mt_id2no(curs, self.matrix_table)
		mt_no2matches = self.get_mt_no2matches(curs, self.input_table, self.prom_seq_table, self.top_number, self.organism, mt_id2no)
		get_gene_id_dict_instance.join()	#must wait it to finish before going on, need gene_id_dict
		self.dump2output_table(curs, self.output_table, mt_no2matches, get_gene_id_dict_instance.gene_id_dict, tax_id)
		if self.commit:
			curs.execute("end")
Example #40
0
    def run(self):
        """
		2006-09-04
			-db_connect()
			-get_gene_id_list()
			-get_masked_seq()
			-run_transfac()
			-get_top_mt_id_list()
			-output_transfac_pwm_cismodscan_format()
			-run_cismodscan()
		"""
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)
        seq_fname = os.path.join(self.output_dir,
                                 'pattern_%s.seq' % (self.pattern_id))
        transfac_output_fname = os.path.join(
            self.output_dir, 'pattern_%s.match' % (self.pattern_id))
        pwm_fname = os.path.join(self.output_dir,
                                 'pattern_%s.pwm' % (self.pattern_id))
        pwm_id_mapping_fname = os.path.join(
            self.output_dir, 'pattern_%s.pwm_id_mapping' % (self.pattern_id))
        cismodscan_output_fname = os.path.join(
            self.output_dir, 'pattern_%s.cismodscan' % (self.pattern_id))

        (conn, curs) = db_connect(self.hostname, self.dbname)
        gene_id_list = self.get_gene_id_list(curs, self.pattern_table,
                                             self.pattern_id)
        self.get_masked_seq(curs, gene_id_list, self.prom_seq_table, seq_fname)
        mt_id2no_of_seqs = self.run_transfac(seq_fname, transfac_output_fname,
                                             self.match_bin_path,
                                             self.matrix_data_path,
                                             self.profile_filename)
        mt_id_list = self.get_top_mt_id_list(mt_id2no_of_seqs, self.no_of_tfs)
        matrix_table = 'transfac.matrix'
        self.output_transfac_pwm_cismodscan_format(curs, mt_id_list,
                                                   matrix_table, pwm_fname,
                                                   pwm_id_mapping_fname)
        self.run_cismodscan(self.cismodscan_binary_path, seq_fname, pwm_fname,
                            cismodscan_output_fname, self.no_of_tfs,
                            self.mod_length, self.expt_ratio)
Example #41
0
    def run(self):
        """
		2007-03-29
		2007-04-03
		2007-05-01
			--db_connect()
			--FilterStrainSNPMatrix_instance.read_data()
			if self.comparison_only:
				--FilterStrainSNPMatrix_instance.read_data()
			else:
				--get_SNPpos2index()
				--create_SNP_matrix_2010()
					--get_align_length_from_fname()
						--get_positions_to_be_checked_ls()
					--get_align_matrix_from_fname()
						--get_positions_to_be_checked_ls()
				--get_mapping_info_regarding_strain_acc()
				--shuffle_data_matrix_according_to_strain_acc_ls()
				--FilterStrainSNPMatrix_instance.write_data_matrix()
			
			--extract_sub_data_matrix()
			if self.sub_justin_output_fname:
				--FilterStrainSNPMatrix_instance.write_data_matrix()
			--compare_two_SNP_matrix()
			--outputDiffType()
			
		"""
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
        header, src_strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
            self.input_fname)
        if self.comparison_only:
            header, strain_acc_ls, abbr_name_ls_sorted, SNP_matrix_2010_sorted = FilterStrainSNPMatrix_instance.read_data(
                self.output_fname)
            SNP_matrix_2010_sorted = Numeric.array(SNP_matrix_2010_sorted)
        else:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            #extract data from alignment
            snp_acc_ls = header[2:]
            SNPpos2index = self.get_SNPpos2index(curs, snp_acc_ls,
                                                 self.snp_locus_table)
            abbr_name_ls, SNP_matrix_2010 = self.create_SNP_matrix_2010(
                SNPpos2index, self.data_dir_2010)
            strain_acc_ls, strain_acc2abbr_name, strain_acc2index = self.get_mapping_info_regarding_strain_acc(
                curs, self.strain_info_table, self.strain_info_2010_table,
                abbr_name_ls)
            SNP_matrix_2010_sorted = self.shuffle_data_matrix_according_to_strain_acc_ls(
                SNP_matrix_2010, strain_acc_ls, strain_acc2index)
            abbr_name_ls_sorted = []
            for strain_acc in strain_acc_ls:
                abbr_name_ls_sorted.append(strain_acc2abbr_name[strain_acc])
            FilterStrainSNPMatrix_instance.write_data_matrix(
                SNP_matrix_2010_sorted, self.output_fname, header,
                strain_acc_ls, abbr_name_ls_sorted)

        #comparison
        data_matrix = Numeric.array(data_matrix)
        sub_data_matrix = self.extract_sub_data_matrix(src_strain_acc_list,
                                                       data_matrix,
                                                       strain_acc_ls)
        if self.sub_justin_output_fname:
            FilterStrainSNPMatrix_instance.write_data_matrix(
                sub_data_matrix, self.sub_justin_output_fname, header,
                strain_acc_ls, abbr_name_ls_sorted)
        diff_matrix, diff_tag_dict, diff_tag2counter = self.compare_two_SNP_matrix(
            SNP_matrix_2010_sorted, sub_data_matrix)
        if self.diff_output_fname:
            self.outputDiffType(diff_matrix, SNP_matrix_2010_sorted,
                                sub_data_matrix, diff_tag_dict,
                                self.diff_type_to_be_outputted,
                                abbr_name_ls_sorted, header[2:],
                                self.diff_output_fname)

        summary_result_ls = []
        for tag, counter in diff_tag2counter.iteritems():
            summary_result_ls.append('%s(%s):%s' %
                                     (tag, diff_tag_dict[tag], counter))
            print '\t%s(%s)\t%s' % (tag, diff_tag_dict[tag], counter)
        import pylab
        pylab.clf()
        diff_matrix_reverse = list(diff_matrix)
        diff_matrix_reverse.reverse()
        diff_matrix_reverse = Numeric.array(diff_matrix_reverse)
        pylab.imshow(diff_matrix_reverse, interpolation='nearest')
        pylab.title(' '.join(summary_result_ls))
        pylab.colorbar()
        pylab.show()

        #2007-11-01 do the same comparison as CmpAccession2Ecotype.py
        from CmpAccession2Ecotype import CmpAccession2Ecotype
        CmpAccession2Ecotype_ins = CmpAccession2Ecotype()
        nt_number2diff_matrix_index = CmpAccession2Ecotype_ins.get_nt_number2diff_matrix_index(
            nt2number)
        dc_placeholder = dict(
            zip(range(sub_data_matrix.shape[0]),
                range(sub_data_matrix.shape[1])))
        diff_matrix_ls = CmpAccession2Ecotype_ins.cmp_two_matricies(
            SNP_matrix_2010_sorted, sub_data_matrix,
            nt_number2diff_matrix_index, dc_placeholder, dc_placeholder,
            dc_placeholder)
        print diff_matrix_ls
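compare_two_SNP_matrix() is not shown here; below is a minimal sketch of a cell-wise comparison of two equally-shaped call matrices, assuming 0 encodes NA as in the matrix titles above, and using numpy in place of the Numeric module:

import numpy

def compare_two_call_matrices(m1, m2):
    #count agreements, disagreements, and cells where either call is NA (0)
    m1, m2 = numpy.asarray(m1), numpy.asarray(m2)
    not_na = (m1 != 0) & (m2 != 0)
    no_of_agreements = int(((m1 == m2) & not_na).sum())
    no_of_disagreements = int(((m1 != m2) & not_na).sum())
    no_of_nas = int(not_na.size - not_na.sum())
    return no_of_agreements, no_of_disagreements, no_of_nas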
Example #42
0
    def run(self):
        """
		11-16-05
		11-19-05
			use no_of_validations to multiply the settings (spread one setting's validations
				across different nodes)
			the extra setting copy is for a non-validation, real model fit
			
			--computing_handler()
				--is_site_confirmed()
					--get_no_of_mismatches_allowed()
					--get_no_of_mismatches_for_consensus()
						--is_good_consensus()
					--get_no_of_mismatches_for_site()
		"""
        communicator = MPI.world.duplicate()
        node_rank = communicator.rank
        free_computing_nodes = range(1, communicator.size -
                                     1)  #exclude the last node
        if node_rank == 0:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            unknown_data, known_data = self.get_data(curs, self.fname,
                                                     self.filter_type,
                                                     self.is_correct_type,
                                                     self.need_cal_hg_p_value)
            known_data_pickle = cPickle.dumps(known_data, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                communicator.send(known_data_pickle, node, 0)
            unknown_data_pickle = cPickle.dumps(unknown_data, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                communicator.send(unknown_data_pickle, node, 0)
        elif node_rank in free_computing_nodes:
            data, source, tag = communicator.receiveString(0, 0)
            known_data = cPickle.loads(data)  #take the data
            """
			#11-19-05 shuffle data to check
			index_ls = range(len(known_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = known_data[i]
			known_data = index_ls
			"""
            data, source, tag = communicator.receiveString(0, 0)
            unknown_data = cPickle.loads(data)  #take the data
            """
			#11-19-05 shuffle data to check
			index_ls = range(len(unknown_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = unknown_data[i]
			unknown_data = index_ls
			"""
        elif node_rank == communicator.size - 1:
            writer = csv.writer(open(self.output_file, 'w'), delimiter='\t')
            #write down the header
            writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\
             'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std'])

        mpi_synchronize(communicator)
        if node_rank == 0:
            if self.type == 1:
                setting_ls = self.form_setting_ls(self.rpart_cp_ls,
                                                  self.loss_matrix_ls,
                                                  self.prior_prob_ls,
                                                  self.no_of_validations)
            elif self.type == 2:
                #randomForest replaces rpart_cp_ls with mty_ls, others are ignored later
                setting_ls = self.form_setting_ls(self.mty_ls,
                                                  self.loss_matrix_ls,
                                                  self.prior_prob_ls,
                                                  self.no_of_validations)
            else:
                sys.stderr.write("type %s not supported.\n" % self.type)
                sys.exit(3)
            self.input_node(communicator, setting_ls, free_computing_nodes,
                            self.report)
        elif node_rank in free_computing_nodes:
            parameter_list = [
                unknown_data, known_data, self.training_perc,
                self.no_of_validations, self.type, self.bit_string
            ]  #03-17-06 add type, bit_string
            computing_node(communicator,
                           parameter_list,
                           self.computing_handler,
                           report=self.report)
        elif node_rank == communicator.size - 1:
            setting2validation_stat = {}
            setting2unknown_known_acc_ls = {}
            parameter_list = [
                writer, setting2validation_stat, setting2unknown_known_acc_ls,
                self.no_of_validations
            ]
            output_node(communicator, free_computing_nodes, parameter_list,
                        self.output_handler, self.report)
            #cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w'))	#11-23-05
            del writer
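form_setting_ls() is not shown; from the docstring it crosses the parameter lists and replicates each combination no_of_validations times, plus one extra copy for the non-validation, real model fit. A sketch under that assumption:

from itertools import product

def form_setting_ls(param_ls, loss_matrix_ls, prior_prob_ls, no_of_validations):
    #cross all parameter lists, then replicate each combination
    #no_of_validations+1 times; the last copy is the real (non-validation) fit
    setting_ls = []
    for setting in product(param_ls, loss_matrix_ls, prior_prob_ls):
        for validation_index in range(no_of_validations + 1):
            setting_ls.append(setting + (validation_index,))
    return setting_ls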
Example #43
0
    def run(self):
        """
		2007-04-16
			(rank==0)
				--get_chr_start_ls()
			elif free_computing_nodes:
				-- (receive data)
			
			--mpi_synchronize()
			
			(rank==0)
				--input_node()
					--input_handler()
			elif free_computing_nodes:
				--computing_node()
					--computing_node_handler()
						--identify_ancestry_with_min_jumps()
							--initialize_score_trace_matrix()
								--is_child_heterozygous_SNP_compatible_with_parents()
							(for loop)
								--identify_ancestry_of_one_chr_with_DP()
									--is_child_heterozygous_SNP_compatible_with_parents()
							--trace()
								--recursive_trace()
			else:
				--output_node()
					--output_node_handler()
		"""
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        if node_rank == 0:
            FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
            header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
                self.input_fname)
            snp_acc_list = header[2:]
            data_matrix = Numeric.array(data_matrix)
            no_of_strains = data_matrix.shape[0]
            (conn, curs) = db_connect(self.hostname,
                                      self.dbname,
                                      self.schema,
                                      password='******',
                                      user='******')

            #2007-09-17 send strain_acc_list to the output_node
            strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1)
            self.communicator.send(strain_acc_list_pickle,
                                   self.communicator.size - 1, 0)

            chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list,
                                                 self.snp_locus_table)

            chr_start_ls_pickle = cPickle.dumps(
                chr_start_ls, -1)  #-1 means use the highest protocol
            data_matrix_pickle = cPickle.dumps(data_matrix, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                self.communicator.send(chr_start_ls_pickle, node, 0)
                self.communicator.send(data_matrix_pickle, node, 0)
        elif node_rank in free_computing_nodes:
            data, source, tag = self.communicator.receiveString(0, 0)
            chr_start_ls = cPickle.loads(data)  #take the data
            data, source, tag = self.communicator.receiveString(0, 0)
            data_matrix = cPickle.loads(data)
        else:
            data, source, tag = self.communicator.receiveString(0, 0)
            strain_acc_list = cPickle.loads(data)

        mpi_synchronize(self.communicator)

        if node_rank == 0:
            parameter_list = [no_of_strains]
            self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, \
             self.report)
        elif node_rank in free_computing_nodes:
            #three different ways to pick the parent-set and the child
            trio_arrangement_ls = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
            parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls]
            computing_node(self.communicator,
                           parameter_list,
                           self.computing_node_handler,
                           report=self.report)
        else:
            writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
            parameter_list = [writer, strain_acc_list]
            output_node(self.communicator, free_computing_nodes,
                        parameter_list, self.output_node_handler, self.report)
            del writer
Example #44
0
    def run(self):
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_acc_ls = self.readSNPMarkers(self.input_fname)
        self.markSelected(curs, self.output_table, snp_acc_ls)
        if self.commit:
            curs.execute("end")
Example #45
0
    def run(self):
        """
		01-18-06
		
			--db_connect()
			--get_mt_id_gc_perc2no_of_random_hits()
			--parse_file()
				--write_down_mt_id2no_of_hits()
					--get_seq_id_gc_percentage_length()
					--get_hit_pvalue()
			
			--draw_pvalue_histogram()
			
			--calculate_pi0()
		"""
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        data_fname = '%s.data' % self.output_prefix
        if os.path.isfile(data_fname):
            sys.stderr.write("Getting p_value from %s..." %
                             os.path.basename(data_fname))
            reader = csv.reader(open(data_fname), delimiter='\t')
            for row in reader:
                self.p_value_list.append(float(row[5]))
            del reader
            sys.stderr.write("Done.\n")
        else:
            pickle_fname = os.path.expanduser(
                '~/pickle/mt_id_gc_perc2no_of_random_hits.pickle')
            if os.path.isfile(pickle_fname):
                mt_id_gc_perc2no_of_random_hits = cPickle.load(
                    open(pickle_fname))
            else:
                mt_id_gc_perc2no_of_random_hits = self.get_mt_id_gc_perc2no_of_random_hits(curs,\
                 self.matrix2no_of_random_hits_table)
                of = open(pickle_fname, 'w')
                cPickle.dump(mt_id_gc_perc2no_of_random_hits, of)
                del of
            writer = csv.writer(open(data_fname, 'w'), delimiter='\t')

            self.log_f = open('%s.log' % self.output_prefix, 'w')

            files = os.listdir(self.input_dir)
            files.sort()
            sys.stderr.write("\tTotally, %d files to be processed.\n" %
                             len(files))
            for input_fname in files:
                input_fname = os.path.join(self.input_dir, input_fname)
                self.parse_file(curs, input_fname, writer,
                                mt_id_gc_perc2no_of_random_hits)
            del writer
            self.log_f.close()

        self.p_value_list.sort()
        top_p_value_cutoff = 0.95  #important: not 1; the p-value histogram shows an abnormal peak from 0.95 to 1
        top_p_value_list = self.remove_top_p_values(self.p_value_list,
                                                    top_p_value_cutoff)

        figure_fname = '%s_p_value_hist.png' % self.output_prefix
        self.draw_pvalue_histogram(self.p_value_list, figure_fname)

        figure_fname = '%s_pi0Tolambda.png' % self.output_prefix
        lambda_list, pi0_list = self.calculate_pi0_list(
            self.p_value_list, figure_fname, top_p_value_cutoff)

        estimated_pi0 = self.estimate_pi0(lambda_list, pi0_list)

        self.cal_q_value_list(self.p_value_list, estimated_pi0,
                              top_p_value_cutoff, self.output_prefix)
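cal_q_value_list() is assumed to be the usual step-up q-value computation: with sorted p-values p(1) <= ... <= p(m), q(i) = min over j >= i of pi0*m*p(j)/j, capped at 1. A sketch under that assumption:

def cal_q_value_list(sorted_p_value_list, estimated_pi0):
    m = len(sorted_p_value_list)
    q_value_list = [0.0]*m
    min_q = 1.0  #cap q-values at 1
    for i in range(m-1, -1, -1):  #walk from the largest p-value down
        q = estimated_pi0*m*sorted_p_value_list[i]/(i+1)
        min_q = min(min_q, q)  #enforce monotonicity
        q_value_list[i] = min_q
    return q_value_list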