def run(self): """ 09-19-05 rewrite --db_connect() --get_global_gene_id2gene_no() --org2tax_id() --get_gene_id2mt_no_list() --return_gene_id_set() --submit() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) gene_id2gene_no = get_global_gene_id2gene_no(curs, self.organism) tax_id = org2tax_id(self.organism) """ #01-14-06 comment it out for future gene_no2tf_set = get_gene_no2tf_set(curs) #12-15-05 just yeast. #12-15-05 convert gene_no(integer) into gene_id(string) gene_id2mt_no_list = {} for gene_no, tf_set in gene_no2tf_set.iteritems(): gene_id2mt_no_list[repr(gene_no)] = list(tf_set) """ gene_id2mt_no_list = get_gene_id2mt_no_list(tax_id) gene_id_set = self.return_gene_id_set(self.dir, gene_id2gene_no, self.min_frequency) self.submit(curs, output_table, gene_id_set, gene_id2gene_no, gene_id2mt_no_list) if self.needcommit: conn.commit()
def run(self): """ 09-08-05 --db_connect() --org_short2long() --org2tax_id() --setup_acc2gene_id() if self.new_table --create_output_table() --parse_input_filename() """ (conn, curs) = db_connect(self.hostname, self.dbname) long_organism = org_short2long(self.organism) tax_id_set = Set([org2tax_id(long_organism)]) MdbId2GeneId_instance = MdbId2GeneId() acc2gene_id = MdbId2GeneId_instance.setup_acc2gene_id( self.acc_file, tax_id_set) if self.new_table: self.create_output_table(curs, self.output_table) self.parse_input_filename(curs, self.input_filename, self.output_table, acc2gene_id,\ org2tax_id(long_organism), self.up_length, self.comment, long_organism, self.type) if self.commit: curs.execute("end")
def run(self): """ 06-03-05 --db_connect() --prepare_gene_no2go_no() --get_function_edge_matrix_data() --_get_function_edge_matrix_data() --return_common_go_no_of_edge() --return_edge_vector() --edge_data_output() """ conn,curs = db_connect(self.hostname, self.dbname,self.schema) self.gene_no2go_no = self.prepare_gene_no2go_no(curs) self.get_function_edge_matrix_data(curs, self.no_of_nas, self.table) #make a directory first if not os.path.isdir(self.output_dir): os.makedirs(self.output_dir) for go_no, edge_data in self.go_no2edge_matrix_data.iteritems(): if len(edge_data)>=self.min_no_of_edges: self.edge_data_output(self.output_dir, go_no, edge_data) self.go_no_qualified.append(go_no)
def run(self): """ 01-18-06 --db_connect() --get_mt_id_gc_perc2no_of_random_hits() --parse_file() --write_down_mt_id2no_of_hits() --get_seq_id_gc_percentage_length() --get_hit_pvalue() --draw_pvalue_histogram() --calculate_pi0() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) data_fname = '%s.data'%self.output_prefix if os.path.isfile(data_fname): sys.stderr.write("Getting p_value from %s..."%os.path.basename(data_fname)) reader = csv.reader(open(data_fname), delimiter='\t') for row in reader: self.p_value_list.append(float(row[5])) del reader sys.stderr.write("Done.\n") else: pickle_fname = os.path.expanduser('~/pickle/mt_id_gc_perc2no_of_random_hits.pickle') if os.path.isfile(pickle_fname): mt_id_gc_perc2no_of_random_hits = cPickle.load(open(pickle_fname)) else: mt_id_gc_perc2no_of_random_hits = self.get_mt_id_gc_perc2no_of_random_hits(curs,\ self.matrix2no_of_random_hits_table) of = open(pickle_fname, 'w') cPickle.dump(mt_id_gc_perc2no_of_random_hits, of) del of writer = csv.writer(open(data_fname, 'w') , delimiter='\t') self.log_f = open('%s.log'%self.output_prefix,'w') files = os.listdir(self.input_dir) files.sort() sys.stderr.write("\tTotally, %d files to be processed.\n"%len(files)) for input_fname in files: input_fname = os.path.join(self.input_dir, input_fname) self.parse_file(curs, input_fname, writer, mt_id_gc_perc2no_of_random_hits) del writer self.log_f.close() self.p_value_list.sort() top_p_value_cutoff = 0.95 #important not 1, p_value histogram shows an abnormal peak from 0.95 to 1 top_p_value_list = self.remove_top_p_values(self.p_value_list, top_p_value_cutoff) figure_fname = '%s_p_value_hist.png'%self.output_prefix self.draw_pvalue_histogram(self.p_value_list, figure_fname) figure_fname = '%s_pi0Tolambda.png'%self.output_prefix lambda_list, pi0_list = self.calculate_pi0_list(self.p_value_list, figure_fname, top_p_value_cutoff) estimated_pi0 = self.estimate_pi0(lambda_list, pi0_list) self.cal_q_value_list(self.p_value_list, estimated_pi0, top_p_value_cutoff, self.output_prefix)
def run(self): """ 10-22-05 """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) print "this is node",node_rank if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) edge2occurrrence, no_of_datasets = get_edge2occurrence(curs, self.min_sup, self.max_sup) edge2occurrrence_pickle = cPickle.dumps((edge2occurrrence, no_of_datasets), -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(edge2occurrrence_pickle, node, 0) del conn, curs elif node_rank in free_computing_nodes: #exclude the last node data, source, tag = communicator.receiveString(0, 0) edge2occurrrence, no_of_datasets = cPickle.loads(data) mpi_synchronize(communicator) if node_rank == 0: inf = csv.reader(open(self.inputfile,'r'), delimiter='\t') parameter_list = [inf] input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, input_handler=self.input_handler) del inf elif node_rank in free_computing_nodes: parameter_list = [self.min_size, self.alpha, edge2occurrrence, no_of_datasets] computing_node(communicator, parameter_list, self.node_fire, report=self.report) elif node_rank == communicator.size-1: writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t') parameter_list = [writer] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) del writer
def output_in_copath_format(self, outfname, node_rank): """ 04-20-05 output go_no2cluster_group 04-25-05 cluster_id redefined """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) outf = open(outfname, 'a') writer = csv.writer(outf, delimiter='\t') for go_no, cluster_group in self.go_no2cluster_group.iteritems(): counter = 0 for bicluster in cluster_group.bicluster_list: seed_edge_id_list = list(take(cluster_group.edge_id_array, bicluster.row_index_list)) edge_id_list = seed_edge_id_list + bicluster.added_edge_id_list vertex_list , edge_list = get_vertex_edge_list_by_edge_id(curs, edge_id_list) no_of_nodes = len(vertex_list) connectivity = len(edge_list)*2.0/(no_of_nodes*(no_of_nodes-1)) vertex_string = '{' + ';'.join(vertex_list) + ';}' edge_string = self.edge_string_from_edge_list(edge_list) cluster_id = "%s.%s"%(go_no, counter) writer.writerow([cluster_id, connectivity, vertex_string, edge_string]) counter += 1 del writer outf.close()
def run(self): """ 2007-03-20 2007-04-03 """ from FilterStrainSNPMatrix import FilterStrainSNPMatrix FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() if self.draw_only: header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(self.output_fname) data_matrix = Numeric.array(data_matrix) else: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(self.input_fname) snp_acc_ls = header[2:] strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list) snp_id2index = self.get_id2index(curs, self.snp_locus_table, snp_acc_ls) from dbSNP2data import dbSNP2data dbSNP2data_instance = dbSNP2data(report=self.report) data_matrix = dbSNP2data_instance.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number, self.data_table, need_heterozygous_call=1) FilterStrainSNPMatrix_instance.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list) heterozygous_data_matrix, coarse_data_matrix = self.get_heterozygous_and_coarse_data_matrix(data_matrix) self.displayDataMatrix(heterozygous_data_matrix, title='heterozygous_data_matrix, 5-10=hetero, else=0') self.displayDataMatrix(coarse_data_matrix, title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero') raw_input("enter")
def run(self): """ 03-30-05 06-30-05 more complex data grouping via which_column_list and group_size_list if both lists are of length 2, 2-level grouping. --db_connect() --get_go_no2depth() --data_fetch() --group_data() if self.stat_table_fname: --prediction_space_output() """ self.init() (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) from codense.common import get_go_no2depth self.go_no2depth = get_go_no2depth(curs) self.data_fetch(curs, self.table, self.mcl_table, self.gene_table) local_prediction_space2attr = self.group_data(self.prediction_data,key_column=self.which_column_list[0], group_size=self.group_size_list[0]) for key, unit in local_prediction_space2attr.iteritems(): if len(self.which_column_list)>1 and len(self.group_size_list)>1: local_prediction_space2attr_2 = self.group_data(unit, key_column=self.which_column_list[1], group_size=self.group_size_list[1]) for key2, unit2 in local_prediction_space2attr_2.iteritems(): self.prediction_space2attr[(key,key2)] = unit2 else: self.prediction_space2attr[(key,)] = unit stat_table_f = open(self.stat_table_fname, 'w') self.prediction_space_output(stat_table_f, self.prediction_space2attr)
def run(self): """ 03-01-05 initial --db_connect() --get_go_no2term_id() #for get_distance(), needs self.go_no2term_id --data_fetch() --gene_no2p_gene_setup() --p_gene_id_map() --_p_gene_map() or --_p_gene_map_network_topology() --get_distance() #touches self.go_no2distance --submit() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) curs.execute("begin") # because of cursor usage self.go_no2term_id = get_go_no2term_id(curs, self.schema, self.term_table) self.data_fetch(curs, self.p_gene_table, self.gene_p_table) if self.type == 2 and self.pattern_table == None: sys.stderr.write("\n type=2 needs pattern_table.\n") sys.exit(3) self.p_gene_map( self.gene_no2p_gene, self.p_gene_id_map, curs, self.distance_table, self.go_no2distance, self.go_no2term_id, self.type, ) if self.needcommit: self.submit(curs, self.gene_p_table, self.p_gene_id_map) curs.execute("end")
def fill_edge2encodedOccurrence( self, hostname, dbname, schema, edge2encodedOccurrence, min_sup, max_sup, edge_table="edge_cor_vector" ): """ 09-05-05 get the edge2encodedOccurrence from the database """ sys.stderr.write("Getting edges...\n") (conn, curs) = db_connect(hostname, dbname, schema) curs.execute( "DECLARE crs CURSOR FOR select edge_name,sig_vector \ from %s" % (edge_table) ) curs.execute("fetch 5000 from crs") rows = curs.fetchall() no_of_datasets = 0 counter = 0 while rows: for row in rows: edge = row[0][1:-1].split(",") edge = map(int, edge) sig_vector = row[1][1:-1].split(",") sig_vector = map(int, sig_vector) if no_of_datasets == 0: no_of_datasets = len(sig_vector) if sum(sig_vector) >= min_sup and sum(sig_vector) <= max_sup: edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector) curs.execute("fetch 5000 from crs") rows = curs.fetchall() sys.stderr.write("Done.\n") return no_of_datasets
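#encodeOccurrenceBv() is imported in the original module and not shown here.
#A plausible sketch of what it does -- packing the 0/1 significance vector
#into one integer with bit i set iff dataset i is significant -- the real
#encoding may differ.
def encodeOccurrenceBv(sig_vector):
    encoded = 0
    for i in range(len(sig_vector)):
        if sig_vector[i]:
            encoded |= 1 << i
    return encoded

assert encodeOccurrenceBv([1, 0, 1]) == 5	#bits 0 and 2 set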
def run(self): """ 04-18-05 Serve for jasmine's darwin input. 04-19-05 changed to put 2nd-order clusters and its connected components into one file. 08-31-05 much simpler, just output clusters from mcl_table --db_connect() --data_fetch() (loop) --get_gene_no2gene_id() --get_no_of_total_genes() --get_go_functions_of_this_gene_set() --get_information_of_go_functions() --get_cor_sig_2d_list() --return_string_form_of_cluster_dstructure() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) #e_splat_table = self.table+'e' #e_mcl_table = self.mcl_table+'e' #CoexprFromCooccu_instance = CoexprFromCooccu() #pre_2nd_cc_hierarchy = CoexprFromCooccu_instance.data_fetch(curs, self.mcl_table, e_mcl_table) #mcl_id2cluster_dstructure = self.data_fetch(curs, self.table, self.mcl_table, crs_no=1) #mcl_id_2nd_order2cluster_dstructure = self.data_fetch(curs, e_splat_table, e_mcl_table, crs_no=2) #self.cluster_dstructure_output_with_both_hierarchy(curs, self.output_fname, pre_2nd_cc_hierarchy,\ # mcl_id2cluster_dstructure, mcl_id_2nd_order2cluster_dstructure) #self.cluster_dstructure_output(curs, self.output_fname, self.order_1st_id2all_clusters) self.data_fetch(curs, self.table, self.mcl_table, crs_no=1, output_fname=self.output_fname)
def run(self): """ 2006-09-04 -db_connect() -get_gene_id_list() -get_masked_seq() -run_transfac() -get_top_mt_id_list() -output_transfac_pwm_cismodscan_format() -run_cismodscan() """ if not os.path.isdir(self.output_dir): os.makedirs(self.output_dir) seq_fname = os.path.join(self.output_dir, 'pattern_%s.seq'%(self.pattern_id)) transfac_output_fname = os.path.join(self.output_dir, 'pattern_%s.match'%(self.pattern_id)) pwm_fname = os.path.join(self.output_dir, 'pattern_%s.pwm'%(self.pattern_id)) pwm_id_mapping_fname = os.path.join(self.output_dir, 'pattern_%s.pwm_id_mapping'%(self.pattern_id)) cismodscan_output_fname = os.path.join(self.output_dir, 'pattern_%s.cismodscan'%(self.pattern_id)) (conn, curs) = db_connect(self.hostname, self.dbname) gene_id_list = self.get_gene_id_list(curs, self.pattern_table, self.pattern_id) self.get_masked_seq(curs, gene_id_list, self.prom_seq_table, seq_fname) mt_id2no_of_seqs = self.run_transfac(seq_fname, transfac_output_fname, self.match_bin_path, self.matrix_data_path, self.profile_filename) mt_id_list = self.get_top_mt_id_list(mt_id2no_of_seqs, self.no_of_tfs) matrix_table = 'transfac.matrix' self.output_transfac_pwm_cismodscan_format(curs, mt_id_list, matrix_table, pwm_fname, pwm_id_mapping_fname) self.run_cismodscan(self.cismodscan_binary_path, seq_fname, pwm_fname, cismodscan_output_fname, self.no_of_tfs, self.mod_length, self.expt_ratio)
def run(self): """ 01-03-06 """ padding_width=5 padding_height=10 max_seq_name_length = 20 bs_width=1 im_10kb_length=800 (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) tf_info_list = self.get_gene_binding_sites(curs, self.input_gene_id) tf_name2binding_sites = self.get_tf_name2binding_sites(tf_info_list) seq_name, seq_length, seq_strand = self.get_gene_prom_seq_info(curs, self.input_gene_id) tf_name_list = tf_name2binding_sites.keys() tf_legend_im, tf_name2color = self.draw_tf_legend(tf_name_list, padding_width, max_seq_name_length) composite_tf_im = self.draw_tf_line(seq_name, seq_length, seq_strand, tf_info_list, tf_name2color, \ padding_width, padding_height, max_seq_name_length, bs_width, im_10kb_length) im = self.get_composite_and_individual_tf_line(composite_tf_im, tf_name2binding_sites, \ seq_name, seq_length, seq_strand, tf_name2color, padding_width,\ padding_height, max_seq_name_length, bs_width, im_10kb_length) tf_legend_output_fname = '%s_tf_legend.png'%self.output_prefix tf_legend_im.save(tf_legend_output_fname) tf_line_output_fname = '%s_tf_line.png'%self.output_prefix im.save(tf_line_output_fname)
def run(self): """ 10-27-05 --db_connect() --get_prediction_step() --get_prediction_heap() --get_sorted_param_acc_list() --get_cutoff() --lm_table_create() --submit() """ p_gene_lm_instance = p_gene_lm() (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) step = self.get_prediction_step(curs, self.p_gene_table, p_gene_lm_instance.is_correct_dict, \ self.judger_type) prediction_heap = self.get_prediction_heap(curs, self.p_gene_table, p_gene_lm_instance.is_correct_dict, \ self.judger_type, self.which_dict, self.which, step) sorted_param_acc_list = self.get_sorted_param_acc_list(prediction_heap) del prediction_heap #10-27-05 release memory cutoff_row = self.get_cutoff(sorted_param_acc_list, self.accuracy_cut_off) del sorted_param_acc_list #10-27-05 release memory print "cutoff_row",cutoff_row if self.commit and cutoff_row and self.lm_table: #cutoff_row is not None p_gene_lm_instance.lm_table_create(curs, self.lm_table) go_no2lm_results = {} go_no2lm_results[-1] = [[0]*7, [1]*7, cutoff_row[0]] #11-09-05 extend the list go_no2lm_results[-1][0][which+1] = 1 #the coeffcient for "which" param is 1, others are 0 p_gene_lm_instance.submit(curs, self.lm_table, go_no2lm_results) curs.execute("end")
def run(self):
    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    if self.mapping_file == None:
        self.mapping_file = os.path.expanduser('~/mapping/%s_datasets_mapping'%self.schema)
    dataset_no2id = self.get_dataset_no2id(self.mapping_file)
    self.submit_to_table(curs, self.table, dataset_no2id)
    if self.commit:
        curs.execute("end")
def run(self):
    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    MdbId2GeneId_instance = MdbId2GeneId()
    acc_tax_id2gene_id_list = MdbId2GeneId_instance.setup_acc2gene_id(self.acc_file, Set(self.tax_id_list))
    for input_fname in self.input_fname_list:
        self.parse_intact_xml_file(curs, input_fname, self.expt_table, self.interaction_table, acc_tax_id2gene_id_list)
    if self.commit:
        curs.execute("end")
def run(self): """ 12-28-05 """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) self.parse(curs, self.input_fname, self.table) if self.commit: curs.execute("end")
def run(self):
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    tax_tree = self.construct_tax_tree(curs)
    tax_id_set = self.get_tax_id_set(curs)
    tax_id2index = self.get_tax_id2index_given_tax_id_set(tax_tree, tax_id_set)
    self.submit2tax_id_index(curs, tax_id2index)
    self.submit_common_ancestor(self.src_tax_id, tax_id2index, curs)
    if self.commit:
        curs.execute("end")
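#construct_tax_tree()/submit_common_ancestor() are not shown.  A minimal
#sketch of finding the common ancestor of two taxa, assuming the tax tree is
#available as a child->parent dict keyed by tax_id (the usual shape of the
#NCBI taxonomy nodes table); the real data structures may differ.
def get_common_ancestor(tax_id2parent, tax_id1, tax_id2):
    ancestor_set = set()
    node = tax_id1
    while node in tax_id2parent:	#walk tax_id1's lineage up to the root
        ancestor_set.add(node)
        node = tax_id2parent[node]
    ancestor_set.add(node)
    node = tax_id2
    while node not in ancestor_set:	#first hit is the lowest common ancestor
        node = tax_id2parent[node]
    return node

tax_id2parent = {9606: 9605, 9605: 9604, 9598: 9596, 9596: 9604, 9604: 1}
assert get_common_ancestor(tax_id2parent, 9606, 9598) == 9604	#human vs chimp -> Hominidae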
def run(self): """ 2006-09-25 use self.cluster_bs_table and self.pattern_table """ conn, curs = db_connect(self.hostname, self.dbname, self.schema) mcl_id2tf_set = get_mcl_id2tf_set(curs, self.cluster_bs_table, self.mt_no2tf_name) self._tf_darwin_format(curs, self.pattern_table, self.output_fname, self.gene_no2id, mcl_id2tf_set) del conn, curs
def run(self): """ 03-09-05 04-01-05 """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) self.dstruc_loadin(curs) if self.r_fname: self.r_f = open(self.r_fname, 'w') if self.type == 1: subgraph = self.get_subgraph(curs, self.table, self.mcl_table, self.mcl_id) #unweighted weighted=0 self.subgraph_output(self.r_f, subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \ self.centralnode, self.function, self.functioncolor, self.plot_type, weighted) self.r_f.close() r.source(self.r_fname) raw_input("Pause:\t") elif self.type == 2: if self.gene_table==None or self.gene_p_table==None: sys.stderr.write("Error: Please specify both the gene_p_table and gene_table.\n") sys.exit(2) subgraph = self.context_subgraph(curs, self.table, self.mcl_table, self.gene_p_table, self.gene_table, \ self.centralnode, self.function) self.subgraph_output(self.r_f, subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \ self.centralnode, self.function, self.functioncolor, self.plot_type) self.r_f.close() r.source(self.r_fname) raw_input("Pause:\t") elif self.type == 3: for i in range(self.no_of_datasets): sys.stdout.write("Dataset %s\n"%(i+1)) if self.edge_table==None: sys.stderr.write("Error: Please specify both the edge_table.\n") sys.exit(2) sub_subgraph = self.subgraph_in_one_dataset(curs, self.table, self.mcl_table, self.edge_table, self.mcl_id, i) self.subgraph_output(self.r_f, sub_subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \ self.centralnode, self.function, self.functioncolor, self.plot_type) self.r_f.close() r.source(self.r_fname) #asking to continue or not no_stop = raw_input("Continue? Y/n:\t") if no_stop == 'n' or no_stop == 'N': sys.exit(3) #open it again for the next dataset self.r_f = open(self.r_fname, 'w') elif self.type==4: subgraph = self.get_subgraph(curs, self.table, self.mcl_table, self.mcl_id) original_subgraph = self.get_original_graph(curs, subgraph) self.subgraph_output(self.r_f, original_subgraph, self.label_dict[self.label], self.global_gene_to_go_dict, \ self.centralnode, self.function, self.functioncolor, self.plot_type) self.r_f.close() r.source(self.r_fname) raw_input("Pause:\t")
def run(self): """ 02-28-05 03-07-05 implementing two posterior maneuvering of go_no2prediction_space, grouping and accumulatiing. See log of 2005, section 'linear model overfitting' for detail. --init() --db_connect() --IF self.p_value_cut_off==0 --get_go_no2lm_results --get_general_lm_results --data_fetch() --_p_gene_analysis() --prediction_accepted() --IF self.stat_table_fname --overview_stats() --return_known_unknown_gene_sets() --go_no_accuracy() --table_output() --return_known_unknown_gene_sets() --IF self.gene_p_table --gene_p_table_submit() """ self.init() (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) self.known_genes_dict = self.get_known_genes_dict(curs) if self.p_value_cut_off == 0: if self.lm_table: self.go_no2lm_results, lm_results_2d_list = self.get_go_no2lm_results(curs, self.lm_table) self.general_lm_results = self.get_general_lm_results(lm_results_2d_list) if self.debug: print "go_no2lm_results: ",self.go_no2lm_results print "general_lm_results: ",self.general_lm_results else: sys.stderr.write("p_value_cut_off==0, need the lm_table to get the linear model\n") sys.exit(127) self.data_fetch(curs, self.gene_table, self.table) if self.stat_table_fname: self.overview_stats(self.stat_table_f) self.go_no_accuracy(self.prediction_pair2attr, self.stat_table_f, curs) self.table_output(self.stat_table_f, self.prediction_space2attr) """ #first grouping the data of parent-child go functions distance_table = 'go.node_dist' go_no_group2prediction_space = self.return_go_no_group2prediction_space(self.go_no2prediction_space, curs, distance_table) #output the prediction_space go_no by go_no self.prediction_space_split_output(self.stat_table_f, go_no_group2prediction_space, self.recurrence_gap_size, self.connectivity_gap_size) """ if self.gene_p_table: self.gene_p_table_submit(curs, self.gene_p_table, self.gene_p_list) if self.needcommit: curs.execute("end")
def run(self): """ 2007-02-08 """ conn, curs = db_connect(self.hostname, self.dbname, self.schema) mcl_id_set = self.get_mcl_id_set_from_good_cluster_table(curs, self.cluster_bs_table) mcl_id2vertex_edge_recurrence = self.get_mcl_id2vertex_edge_recurrence(curs, self.pattern_table, self.gene_no2id, self.go_no2id, mcl_id_set) self._prediction_csv_format(curs, self.input_fname, self.lm_bit, \ self.gene_no2id, self.go_no2id, self.output_fname, mcl_id2vertex_edge_recurrence) del conn, curs
def run(self): """ 2006-09-25 use self.pattern_table """ conn, curs = db_connect(self.hostname, self.dbname, self.schema) #mcl_id_set = self.get_mcl_id_set_from_good_cluster_table(curs, schema_instance.good_cluster_table) mcl_id_set = None #01-14-06 self._pattern_darwin_format(curs, self.pattern_table, self.gene_no2id, self.go_no2id, self.output_fname, mcl_id_set) del conn, curs
def run(self): """ 2006-09-25 use self.input_fname """ conn, curs = db_connect(self.hostname, self.dbname, self.schema) self._prediction_darwin_format(curs, self.input_fname, self.lm_bit, \ self.gene_no2id, self.go_no2id, self.output_fname) #self._prediction_darwin_format_from_file(self.input_fname, self.gene_no2id, self.go_no2id, self.output_fname) del conn, curs
def run(self): """ 04-11-05 04-19-05 generate pre_2nd_cc_hierarchy. """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) self.data_fetch(curs, self.mcl_table, self.e_mcl_table) self.output(self.pre_2nd_cc_hierarchy)
def run(self):
    if self.ofname and self.acc_cut_off and self.lm_bit:
        schema_instance = form_schema_tables(self.ofname, self.acc_cut_off, self.lm_bit)
    else:
        sys.stderr.write("ofname: %s and acc_cut_off: %s and lm_bit %s, NOT VALID\n"%\
            (self.ofname, self.acc_cut_off, self.lm_bit))
        sys.exit(2)
    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    self._cluster_darwin_format(curs, schema_instance.good_cluster_table, self.gene_no2id, self.go_no2id, self.output_fname)
    del conn, curs
def run(self):
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    if self.input_type == 1:
        failed_ls = self.ProcessDianeGPSInfo(curs, self.input_fname, self.strain_info_table, self.report)
    elif self.input_type == 2:
        failed_ls = self.Process850NaturalAccessions(curs, self.input_fname, self.strain_info_table, self.report)
    print "%s failures"%len(failed_ls)
    print failed_ls
    if self.commit:
        curs.execute("end")
def run(self): (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) acc_tax_id2tf_acc = self.get_factor_info(curs, self.factor_table) tf_acc2gene_id_bridge_acc_ls = self.setup_acc2gene_id( self.gene2acc_file, acc_tax_id2tf_acc) tf_acc2entrezgene_id_set = self.submit_raw_result( curs, tf_acc2gene_id_bridge_acc_ls, self.raw_output_table) self.submit_result(curs, tf_acc2entrezgene_id_set, self.output_table) if self.commit: curs.execute("end")
def run(self): (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) id2chr_start_stop = self.get_id2chr_start_stop(curs, self.tax_id) gene_id2gene_symbol = get_gene_id2gene_symbol(curs, self.tax_id) mt_id2gene_symbol = self.get_mt_id2gene_symbol(curs, gene_id2gene_symbol, self.tax_id) mt_id_gene_symbol2color_code = self.get_mt_id2color_code( self.input_dir, self.color_code_list, mt_id2gene_symbol ) self.parse_files( self.input_dir, self.output_fname, id2chr_start_stop, mt_id_gene_symbol2color_code, mt_id2gene_symbol )
def run(self): """ 03-02-05 initial """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) curs.execute("begin") #because of cursor usage self.data_fetch(curs, self.p_gene_table, self.gene_p_table, self.mcl_table) self.output_dict[self.output_type](curs, sys.stdout, self.known_gene_no2p_gene_id_src, self.unknown_gene_no2p_gene_id_src, self.p_gene_id_src_map) self.stat_output(sys.stdout, self.known_gene_no2p_gene_id_src, self.unknown_gene_no2p_gene_id_src)
def run(self): files = os.listdir(self.input_dir) files.sort() sys.stderr.write("\tTotally, %d files to be processed.\n" % len(files)) (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) for input_fname in files: input_fname = os.path.join(self.input_dir, input_fname) self.parse_file(curs, input_fname, self.output_table, self.GC_percentage) if self.commit: curs.execute("end")
def run(self): """ 11-14-05 --db_connect() --get_prom_seq_from_entrezgene_mapping_table() --return_closest_anchor() --tax_id2org() --get_sequence_segment() --submit_to_prom_seq() """ conn, curs = db_connect(self.hostname, self.dbname, self.schema) self.get_prom_seq_from_entrezgene_mapping_table(curs, self.prom_seq_table) if self.commit: curs.execute("end")
def run(self): """ 02-01-06 """ (conn, curs) = db_connect(self.hostname, self.dbname) tax_id = get_tax_id_from_org(curs, self.organism) mt_id2no = get_mt_id2no(curs, self.matrix_table) prom_id2gene_id = self.get_prom_id2gene_id(curs, self.prom_seq_table, self.organism) self.parse_input_fname(curs, self.input_fname, self.p_value_cut_off, prom_id2gene_id, mt_id2no, tax_id, self.output_table) if self.commit: curs.execute("end")
def run(self): """ 11-15-05 correct a bug related to self.size 2006-08-27 if sequence is empty, ignore it. 2006-11-27 add running_type """ if not os.path.isdir(self.folder): os.makedirs(self.folder) (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) if self.running_type == 1: curs.execute( "DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \ where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'" % self.organism) #09-14-05 not null sequence and no 'random' in chromosome elif self.running_type == 2: curs.execute( "DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\ where r.acc_ver=a.acc_ver and a.tax_id=%s" % org2tax_id(self.organism)) #2006-11-27 from a specific tax_id, the later condition will guarantee that sequence is not empty else: sys.stderr.write("Unsupported running_type: %s\n" % self.running_type) sys.exit(3) curs.execute("fetch %s from crs" % self.size) rows = curs.fetchall() counter = 0 sys.stderr.write("Starting to output...\n") while rows: output_file = os.path.join(self.folder, '%s%s' % (self.prefix, counter)) of = open(output_file, 'w') for row in rows: id, sequence = row if sequence: of.write('>%s\n%s\n' % (id, sequence)) del of counter += 1 if self.report: sys.stderr.write('%s%s' % ('\x08' * 20, counter)) curs.execute("fetch %s from crs" % self.size) rows = curs.fetchall() del conn, curs sys.stderr.write("Done.\n")
def run(self): """ 2007-03-20 2007-04-03 """ from FilterStrainSNPMatrix import FilterStrainSNPMatrix FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() if self.draw_only: header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.output_fname) data_matrix = Numeric.array(data_matrix) else: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.input_fname) snp_acc_ls = header[2:] strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list) snp_id2index = self.get_id2index(curs, self.snp_locus_table, snp_acc_ls) from dbSNP2data import dbSNP2data dbSNP2data_instance = dbSNP2data(report=self.report) data_matrix = dbSNP2data_instance.get_data_matrix( curs, strain_id2index, snp_id2index, nt2number, self.data_table, need_heterozygous_call=1) FilterStrainSNPMatrix_instance.write_data_matrix( data_matrix, self.output_fname, header, strain_acc_list, category_list) heterozygous_data_matrix, coarse_data_matrix = self.get_heterozygous_and_coarse_data_matrix( data_matrix) self.displayDataMatrix( heterozygous_data_matrix, title='heterozygous_data_matrix, 5-10=hetero, else=0') self.displayDataMatrix( coarse_data_matrix, title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero') raw_input("enter")
def run(self): """ 09-18-05 09-30-05 way of calling get_mt_id2no() changed get gene_id_dict is a thread now, to speed up """ (conn, curs) = db_connect(self.hostname, self.dbname) tax_id = org2tax_id(self.organism) tax_id_set = Set([tax_id]) get_gene_id_dict_instance = get_gene_id_dict(self.acc_file, tax_id_set) get_gene_id_dict_instance.start() mt_id2no = get_mt_id2no(curs, self.matrix_table) mt_no2matches = self.get_mt_no2matches(curs, self.input_table, self.prom_seq_table, self.top_number, self.organism, mt_id2no) get_gene_id_dict_instance.join() #must wait it to finish before going on, need gene_id_dict self.dump2output_table(curs, self.output_table, mt_no2matches, get_gene_id_dict_instance.gene_id_dict, tax_id) if self.commit: curs.execute("end")
def run(self): """ 2006-09-04 -db_connect() -get_gene_id_list() -get_masked_seq() -run_transfac() -get_top_mt_id_list() -output_transfac_pwm_cismodscan_format() -run_cismodscan() """ if not os.path.isdir(self.output_dir): os.makedirs(self.output_dir) seq_fname = os.path.join(self.output_dir, 'pattern_%s.seq' % (self.pattern_id)) transfac_output_fname = os.path.join( self.output_dir, 'pattern_%s.match' % (self.pattern_id)) pwm_fname = os.path.join(self.output_dir, 'pattern_%s.pwm' % (self.pattern_id)) pwm_id_mapping_fname = os.path.join( self.output_dir, 'pattern_%s.pwm_id_mapping' % (self.pattern_id)) cismodscan_output_fname = os.path.join( self.output_dir, 'pattern_%s.cismodscan' % (self.pattern_id)) (conn, curs) = db_connect(self.hostname, self.dbname) gene_id_list = self.get_gene_id_list(curs, self.pattern_table, self.pattern_id) self.get_masked_seq(curs, gene_id_list, self.prom_seq_table, seq_fname) mt_id2no_of_seqs = self.run_transfac(seq_fname, transfac_output_fname, self.match_bin_path, self.matrix_data_path, self.profile_filename) mt_id_list = self.get_top_mt_id_list(mt_id2no_of_seqs, self.no_of_tfs) matrix_table = 'transfac.matrix' self.output_transfac_pwm_cismodscan_format(curs, mt_id_list, matrix_table, pwm_fname, pwm_id_mapping_fname) self.run_cismodscan(self.cismodscan_binary_path, seq_fname, pwm_fname, cismodscan_output_fname, self.no_of_tfs, self.mod_length, self.expt_ratio)
def run(self): """ 2007-03-29 2007-04-03 2007-05-01 --db_connect() --FilterStrainSNPMatrix_instance.read_data() if self.comparison_only: --FilterStrainSNPMatrix_instance.read_data() else: --get_SNPpos2index() --create_SNP_matrix_2010() --get_align_length_from_fname() --get_positions_to_be_checked_ls() --get_align_matrix_from_fname() --get_positions_to_be_checked_ls() --get_mapping_info_regarding_strain_acc() --shuffle_data_matrix_according_to_strain_acc_ls() --FilterStrainSNPMatrix_instance.write_data_matrix() --extract_sub_data_matrix() if self.sub_justin_output_fname: --FilterStrainSNPMatrix_instance.write_data_matrix() --compare_two_SNP_matrix() --outputDiffType() """ from FilterStrainSNPMatrix import FilterStrainSNPMatrix FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() header, src_strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.input_fname) if self.comparison_only: header, strain_acc_ls, abbr_name_ls_sorted, SNP_matrix_2010_sorted = FilterStrainSNPMatrix_instance.read_data( self.output_fname) SNP_matrix_2010_sorted = Numeric.array(SNP_matrix_2010_sorted) else: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) #extract data from alignment snp_acc_ls = header[2:] SNPpos2index = self.get_SNPpos2index(curs, snp_acc_ls, self.snp_locus_table) abbr_name_ls, SNP_matrix_2010 = self.create_SNP_matrix_2010( SNPpos2index, self.data_dir_2010) strain_acc_ls, strain_acc2abbr_name, strain_acc2index = self.get_mapping_info_regarding_strain_acc( curs, self.strain_info_table, self.strain_info_2010_table, abbr_name_ls) SNP_matrix_2010_sorted = self.shuffle_data_matrix_according_to_strain_acc_ls( SNP_matrix_2010, strain_acc_ls, strain_acc2index) abbr_name_ls_sorted = [] for strain_acc in strain_acc_ls: abbr_name_ls_sorted.append(strain_acc2abbr_name[strain_acc]) FilterStrainSNPMatrix_instance.write_data_matrix( SNP_matrix_2010_sorted, self.output_fname, header, strain_acc_ls, abbr_name_ls_sorted) #comparison data_matrix = Numeric.array(data_matrix) sub_data_matrix = self.extract_sub_data_matrix(src_strain_acc_list, data_matrix, strain_acc_ls) if self.sub_justin_output_fname: FilterStrainSNPMatrix_instance.write_data_matrix( sub_data_matrix, self.sub_justin_output_fname, header, strain_acc_ls, abbr_name_ls_sorted) diff_matrix, diff_tag_dict, diff_tag2counter = self.compare_two_SNP_matrix( SNP_matrix_2010_sorted, sub_data_matrix) if self.diff_output_fname: self.outputDiffType(diff_matrix, SNP_matrix_2010_sorted, sub_data_matrix, diff_tag_dict, self.diff_type_to_be_outputted, abbr_name_ls_sorted, header[2:], self.diff_output_fname) summary_result_ls = [] for tag, counter in diff_tag2counter.iteritems(): summary_result_ls.append('%s(%s):%s' % (tag, diff_tag_dict[tag], counter)) print '\t%s(%s)\t%s' % (tag, diff_tag_dict[tag], counter) import pylab pylab.clf() diff_matrix_reverse = list(diff_matrix) diff_matrix_reverse.reverse() diff_matrix_reverse = Numeric.array(diff_matrix_reverse) pylab.imshow(diff_matrix_reverse, interpolation='nearest') pylab.title(' '.join(summary_result_ls)) pylab.colorbar() pylab.show() #2007-11-01 do something as CmpAccession2Ecotype.py from CmpAccession2Ecotype import CmpAccession2Ecotype CmpAccession2Ecotype_ins = CmpAccession2Ecotype() nt_number2diff_matrix_index = CmpAccession2Ecotype_ins.get_nt_number2diff_matrix_index( nt2number) dc_placeholder = dict( zip(range(sub_data_matrix.shape[0]), range(sub_data_matrix.shape[1]))) diff_matrix_ls = CmpAccession2Ecotype_ins.cmp_two_matricies( SNP_matrix_2010_sorted, 
        sub_data_matrix, nt_number2diff_matrix_index, dc_placeholder, dc_placeholder, dc_placeholder)
    print diff_matrix_ls
def run(self): """ 11-16-05 11-19-05 use no_of_validations to multiply the setting(separate the one setting's validations to different nodes) the extra setting copy is for a non-validation real model fitting --computing_handler() --is_site_confirmed() --get_no_of_mismatches_allowed() --get_no_of_mismatches_for_consensus() --is_good_consensus() --get_no_of_mismatches_for_site() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1, communicator.size - 1) #exclude the last node if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) unknown_data, known_data = self.get_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value) known_data_pickle = cPickle.dumps(known_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(known_data_pickle, node, 0) unknown_data_pickle = cPickle.dumps(unknown_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(unknown_data_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) known_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(known_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = known_data[i] known_data = index_ls """ data, source, tag = communicator.receiveString(0, 0) unknown_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(unknown_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = unknown_data[i] unknown_data = index_ls """ elif node_rank == communicator.size - 1: writer = csv.writer(open(self.output_file, 'w'), delimiter='\t') #write down the header writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\ 'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std']) mpi_synchronize(communicator) if node_rank == 0: if self.type == 1: setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) elif self.type == 2: #randomForest replaces rpart_cp_ls with mty_ls, others are ignored later setting_ls = self.form_setting_ls(self.mty_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) else: sys.stderr.write("type %s not supported.\n" % self.type) sys.exit(3) self.input_node(communicator, setting_ls, free_computing_nodes, self.report) elif node_rank in free_computing_nodes: parameter_list = [ unknown_data, known_data, self.training_perc, self.no_of_validations, self.type, self.bit_string ] #03-17-06 add type, bit_string computing_node(communicator, parameter_list, self.computing_handler, report=self.report) elif node_rank == communicator.size - 1: setting2validation_stat = {} setting2unknown_known_acc_ls = {} parameter_list = [ writer, setting2validation_stat, setting2unknown_known_acc_ls, self.no_of_validations ] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) #cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w')) #11-23-05 del writer
def run(self): """ 2007-04-16 (rank==0) --get_chr_start_ls() elif free_computing_nodes: -- (receive data) --mpi_synchronize() (rank==0) --input_node() --input_handler() elif free_computing_nodes: --computing_node() --computing_node_handler() --identify_ancestry_with_min_jumps() --initialize_score_trace_matrix() --is_child_heterozygous_SNP_compatible_with_parents() (for loop) --identify_ancestry_of_one_chr_with_DP() --is_child_heterozygous_SNP_compatible_with_parents() --trace() --recursive_trace() else: --output_node() --output_node_handler() """ node_rank = self.communicator.rank free_computing_nodes = range(1, self.communicator.size - 1) #exclude the 1st and last node if node_rank == 0: FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.input_fname) snp_acc_list = header[2:] data_matrix = Numeric.array(data_matrix) no_of_strains = data_matrix.shape[0] (conn, curs) = db_connect(self.hostname, self.dbname, self.schema, password='******', user='******') #2007-09-17 send strain_acc_list to the output_node strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1) self.communicator.send(strain_acc_list_pickle, self.communicator.size - 1, 0) chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list, self.snp_locus_table) chr_start_ls_pickle = cPickle.dumps( chr_start_ls, -1) #-1 means use the highest protocol data_matrix_pickle = cPickle.dumps(data_matrix, -1) for node in free_computing_nodes: #send it to the computing_node self.communicator.send(chr_start_ls_pickle, node, 0) self.communicator.send(data_matrix_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = self.communicator.receiveString(0, 0) chr_start_ls = cPickle.loads(data) #take the data data, source, tag = self.communicator.receiveString(0, 0) data_matrix = cPickle.loads(data) else: data, source, tag = self.communicator.receiveString(0, 0) strain_acc_list = cPickle.loads(data) mpi_synchronize(self.communicator) if node_rank == 0: parameter_list = [no_of_strains] self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, \ self.report) elif node_rank in free_computing_nodes: trio_arrangement_ls = [[0, 1, 2], [1, 2, 0], [ 2, 0, 1 ]] #three different ways to pick the parent-set and the child parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls] computing_node(self.communicator, parameter_list, self.computing_node_handler, report=self.report) else: writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t') parameter_list = [writer, strain_acc_list] output_node(self.communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report) del writer
def run(self):
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    snp_acc_ls = self.readSNPMarkers(self.input_fname)
    self.markSelected(curs, self.output_table, snp_acc_ls)
    if self.commit:
        curs.execute("end")
def run(self): """ 01-18-06 --db_connect() --get_mt_id_gc_perc2no_of_random_hits() --parse_file() --write_down_mt_id2no_of_hits() --get_seq_id_gc_percentage_length() --get_hit_pvalue() --draw_pvalue_histogram() --calculate_pi0() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) data_fname = '%s.data' % self.output_prefix if os.path.isfile(data_fname): sys.stderr.write("Getting p_value from %s..." % os.path.basename(data_fname)) reader = csv.reader(open(data_fname), delimiter='\t') for row in reader: self.p_value_list.append(float(row[5])) del reader sys.stderr.write("Done.\n") else: pickle_fname = os.path.expanduser( '~/pickle/mt_id_gc_perc2no_of_random_hits.pickle') if os.path.isfile(pickle_fname): mt_id_gc_perc2no_of_random_hits = cPickle.load( open(pickle_fname)) else: mt_id_gc_perc2no_of_random_hits = self.get_mt_id_gc_perc2no_of_random_hits(curs,\ self.matrix2no_of_random_hits_table) of = open(pickle_fname, 'w') cPickle.dump(mt_id_gc_perc2no_of_random_hits, of) del of writer = csv.writer(open(data_fname, 'w'), delimiter='\t') self.log_f = open('%s.log' % self.output_prefix, 'w') files = os.listdir(self.input_dir) files.sort() sys.stderr.write("\tTotally, %d files to be processed.\n" % len(files)) for input_fname in files: input_fname = os.path.join(self.input_dir, input_fname) self.parse_file(curs, input_fname, writer, mt_id_gc_perc2no_of_random_hits) del writer self.log_f.close() self.p_value_list.sort() top_p_value_cutoff = 0.95 #important not 1, p_value histogram shows an abnormal peak from 0.95 to 1 top_p_value_list = self.remove_top_p_values(self.p_value_list, top_p_value_cutoff) figure_fname = '%s_p_value_hist.png' % self.output_prefix self.draw_pvalue_histogram(self.p_value_list, figure_fname) figure_fname = '%s_pi0Tolambda.png' % self.output_prefix lambda_list, pi0_list = self.calculate_pi0_list( self.p_value_list, figure_fname, top_p_value_cutoff) estimated_pi0 = self.estimate_pi0(lambda_list, pi0_list) self.cal_q_value_list(self.p_value_list, estimated_pi0, top_p_value_cutoff, self.output_prefix)