def __init__(self, file_list, outputdir, delimiter, debug=0):
	"""
	08-28-05
		Initialize the correlation-vector computing object.

	file_list: input data files to process; a sorted copy is kept
	outputdir: directory where output files will be written
	delimiter: field delimiter used for output
	debug: verbosity flag, coerced to int
	"""
	# Fix: keep a private sorted copy instead of sorting the caller's
	# list in place (the original mutated the file_list argument).
	self.files = sorted(file_list)
	self.outputdir = outputdir
	self.delimiter = delimiter
	self.debug = int(debug)

	#02-24-06 a must before using graph_modeling: build its internal
	# correlation cut-off vector (p-value cut-off 0 => use cor cut-off 0.6)
	p_value_cut_off = 0
	cor_cut_off = 0.6
	graph_modeling.cor_cut_off_vector_construct(p_value_cut_off, cor_cut_off)

	#02-24-06 a temporary data structure to show how correlations between probes
	#pointing to the same gene are distributed
	self.cor_list = []
def cor_vector_from_files(self, communicator, dir, gph_dir, cor_fname, sig_fname, p_value_cut_off, cor_cut_off):
	"""
	Master/worker MPI driver: node 0 hands out dataset-file indices to the
	worker nodes, which compute edge-correlation vectors; afterwards the
	per-node outputs are merged.

	communicator: MPI communicator (provides .rank, .size, .send, .receiveString)
	dir: directory holding the input dataset files
	gph_dir: directory of top-1% graph files; required when both cut-offs are 0
	cor_fname / sig_fname: basenames for correlation / significance output files
	p_value_cut_off, cor_cut_off: thresholds for graph_modeling; both 0 means
		derive the cut-offs from gph_dir instead

	05-14-05
		modify to be mpi form, feed mode(in MpiBiclustering.py and MpiGraphModeling.py)
	05-16-05
		output the edge tuple into 0th file.
	06-30-05
		if both 0, get the corCut from the top 1% graph files

	--files_sort
	if node_rank==0:
		--edge_tuple_list_output()
		--edge_tuple_list_output()
	else:
		--graph_modeling.cor_cut_off_vector_construct()
		--get_corCut_list()
	if node_rank==0:
		(send signal to other nodes)
	else:
		--node_fire()
			--gene_index2expr_array_setup()
			--cor_calculate()
	if node_rank==0:
		--collect_and_merge_output()
	"""
	files = os.listdir(dir)
	#sort all the files based on the dataset number, to order the columns of the outputed edge correlation vector
	files = self.files_sort(files)
	file_index_list = range(len(files))	#job queue: one index per dataset file
	node_rank = communicator.rank
	if p_value_cut_off ==0 and cor_cut_off == 0 and gph_dir==None:
		# both cut-offs 0 means "derive from graph files", so gph_dir is mandatory
		sys.stderr.write("p_value_cut_off and cor_cut_off both are 0, but no gph_dir. Aborted.\n")
		sys.exit(3)
	if node_rank == 0:
		#output the name first
		self.edge_tuple_list_output("%s_0"%cor_fname)	#05-16-05 output the edge tuple into 0th file.
		self.edge_tuple_list_output("%s_0"%sig_fname)
	else:
		#set the cor_cut_off_vector, internal structure of graph_modeling
		graph_modeling.cor_cut_off_vector_construct(p_value_cut_off, cor_cut_off)
		if p_value_cut_off ==0 and cor_cut_off == 0:	#06-30-05 if both 0, get the corCut from the top 1% graph files
			corCut_list = self.get_corCut_list(gph_dir)
		else:
			corCut_list = []
	self.mpi_synchronize(communicator)	#barrier: make sure setup is done on every node
	if node_rank == 0:
		# --- master: initial round-robin dispatch, then listen-and-refill loop ---
		sys.stderr.write("\tTotally, %d files to be processed.\n"%len(files))
		seed_utilized = Set()	#worker ranks that actually got a job
		for node in range(1, communicator.size):
			if len(file_index_list)==0:	#if #nodes > #jobs, tell those nodes to break their listening loop.
				stop_signal = "-1"
				communicator.send(stop_signal, node, 0)	#no more jobs, stop that node,
				if self.debug:
					sys.stderr.write("node %s stopped.\n"%node)
			else:
				input_file_index = file_index_list.pop(0)	#the first item poped first.
				communicator.send(repr(input_file_index), node, 0)	#string format
				if self.debug:
					sys.stderr.write("Node %s schedule a job to %s\n"%(node_rank, node))
				seed_utilized.add(node)
		received_value, source, tag = communicator.receiveString(None, None)	#listen
		while received_value:	#??check what the received_value is
			if len(file_index_list) == 0:	#first check if there're still files left, otherwise pop(0) raises error.
				stop_signal = "-1"
				communicator.send(stop_signal, source, 0)	#no more jobs, stop that node,
				if self.debug:
					sys.stderr.write("node %s stopped.\n"%source)
				seed_utilized.remove(source)
				if len(seed_utilized) == 0:	#all seed used have finished their jobs
					break
			else:
				input_file_index = file_index_list.pop(0)
				# NOTE(review): truthiness test would silently drop index 0 and
				# leave the worker blocked; looks harmless only because index 0
				# is always handed out in the initial dispatch loop — confirm.
				if input_file_index:
					communicator.send(repr(input_file_index), source, 0)	#string format,
					if self.debug:
						sys.stderr.write("Node %s get one more job\n"%source)
			received_value, source, tag = communicator.receiveString(None, None)	#listen
	else:
		# --- worker: block on messages from the master until "-1" arrives ---
		received_data, source, tag = communicator.receiveString(0, None)	#get data from node 0,
			#04-24-05 the array is one-dimension no matter what dimension the original array is
		while received_data:
			if received_data=="-1":	#stop signal
				if self.debug:
					sys.stderr.write("node %s breaked.\n"%node_rank)
				break
			else:
				input_file_index = int(received_data)	#convert it to integer
				sys.stderr.write("node %s working on %s...\n"%(node_rank, received_data))
				self.node_fire(dir, files, input_file_index, cor_fname, sig_fname, corCut_list)
				sys.stderr.write("node %s work on %s finished.\n"%(node_rank, received_data))
				communicator.send("finished", 0, node_rank)	#tell the master this job is done
			received_data, source, tag = communicator.receiveString(0, None)	#get data from node 0
	self.mpi_synchronize(communicator)	#barrier: wait for every node before merging
	# merge per-file outputs; node 0 and node 1 each merge one output stream in parallel
	if node_rank==0:
		self.collect_and_merge_output(cor_fname, len(files))
	elif node_rank==1:
		self.collect_and_merge_output(sig_fname, len(files))
def run(self):
	"""
	Main entry point: parse clusters out of self.infname with the parser
	selected by self.parser_type, compute each cluster's unknown-gene
	ratio and submit qualifying clusters to the database.

	03-18-05 mapping_dict all changed to haiyan_no2gene_no
	04-12-05 use min_cluster_size to cut off some small clusters
	07-03-05 construct graph_modeling's cor_cut_off vector first
	10-14-05 add calculate_unknown_gene_ratio()
	12-06-05 add gene_no2incidence_array to parser_type ==4
	05-31-06 add type 5 (haifeng's output)

	--db_connect()
	--get_haiyan_no2gene_no()
	--get_known_genes_dict()
	--get_gene_id2gene_no()
	--create_tables()
	--graph_modeling.cor_cut_off_vector_construct()
	(loop over inf)
		--parser_dict[parser_type]()
			(codense_parser(), copath_parser() )
				--get_combined_cor_vector
				--parse_recurrence
				--parse_connectivity
			--get_vertex_set_gim_array() (parser_type=4 only)
		--calculate_unknown_gene_ratio()
		--db_submit()
	"""
	# Fix: keep the file object so it can be closed after the loop
	# (the original leaked the open file handle).
	infile = open(self.infname, 'r')
	inf = csv.reader(infile, delimiter=self.delimiter)
	(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
	#setup the haiyan_no2gene_no
	if self.mapping_file != None:
		haiyan_no2gene_no = get_haiyan_no2gene_no(self.mapping_file)
	else:
		haiyan_no2gene_no = {}	#a blank dictionary,
	known_gene_no2go_no_set = get_known_genes_dict(curs)	#10-14-05 used to get unknown_gene_ratio
	if self.parser_type == 4 or self.parser_type==5:	#12-06-05 both type 4 and 5 need the gene incidence matrix
		if self.gim_inputfname == None:
			# Fix: message said only "4"; this branch also covers parser_type 5.
			sys.stderr.write("\n parser_type = 4 or 5 needs gim_inputfname.\n")
			sys.exit(3)
		gene_id2gene_no = get_gene_id2gene_no(curs)
		gene_no2incidence_array = get_gene_no2incidence_array(self.gim_inputfname, gene_id2gene_no)
	else:
		gene_no2incidence_array = None
	# per-parser-type auxiliary mapping passed as the parser's 2nd argument
	mapping_dict = {1:haiyan_no2gene_no, 2:haiyan_no2gene_no, 3:None,
		4:gene_no2incidence_array, 5:gene_no2incidence_array}
	self.create_tables(curs, self.table, self.mcl_table, self.pattern_table)
	no = 0	#number of clusters submitted
	#07-03-05 compute the cor cutoff vector for graph_modeling, use 0.8 as cutoff
	#graph_modeling.ind_min_cor() requires the cor_cut_off vector to be constructed ahead.
	graph_modeling.cor_cut_off_vector_construct(0, 0.8)
	graph_modeling.set_jk_cut_off(6)	#07-03-05 haiyan's cutoff is 6, different from my default value, 7.
	for row in inf:
		cluster_list = self.parser_dict[self.parser_type](row, mapping_dict[self.parser_type], curs)
		for cluster in cluster_list:
			#too small, ignore; 2006-08-29 if it's haifeng_output_parser, no
			#restriction for cluster size, haifeng imposes 4
			if self.parser_type!=5 and len(cluster.vertex_set)<self.min_cluster_size:
				continue
			#10-14-05 unknown_gene_ratio to submit to pattern_table
			cluster.unknown_gene_ratio = self.calculate_unknown_gene_ratio(cluster.vertex_set, known_gene_no2go_no_set)
			self.db_submit(curs, cluster, self.pattern_table)
			no+=1
			if self.report and no%1000==0:
				sys.stderr.write('%s%d'%('\x08'*20, no))
	infile.close()	#Fix: release the input file handle
	if self.report:
		sys.stderr.write('%s%d'%('\x08'*20, no))
	if self.needcommit:
		conn.commit()
	sys.stderr.write('\n\tTotal patterns: %d\n'%no)