def core_from_files(self, curs):
	"""
	05-19-05
		It's outdated compared with core(). So update it.
	08-13-05
		row[4] is unknown_gene_ratio
	
	Read prediction rows from the tab-delimited file self.dir_files, attach each
	row's recurrence_array and vertex_set (fetched from self.mcl_table by mcl_id),
	and run the leave-one-out gene-stat on every row.
	
	Fix (review): the input file used to be left open for the lifetime of the
	object; it is now closed via try/finally once iteration ends.
	
	@param curs: live database cursor used both for the per-row lookup and by
		_gene_stat_leave_one_out
	"""
	sys.stderr.write("Starting gene-stat...\n")
	from gene_p_map_redundancy import gene_p_map_redundancy
	node_distance_class = gene_p_map_redundancy()
	"""
	05-19-05
	#read from a single file
	#following codes are attaching directory path to each file in the list
	file_list = os.listdir(self.dir_files)
	file_path_list = []
	for filename in file_list:
		file_path_list.append(os.path.join(self.dir_files, filename))
	#multiple files constitute the source of data
	self.files = fileinput.input(file_path_list)
	"""
	self.files = open(self.dir_files, 'r')
	#wrap it with a reader
	self.reader = csv.reader(self.files, delimiter='\t')
	try:
		for row in self.reader:
			#columns: mcl_id, gene_no, ?, p-value, unknown_gene_ratio
			#NOTE(review): row[2]'s meaning isn't shown here — confirm against the file producer
			row[0] = int(row[0])
			row[1] = int(row[1])
			row[3] = float(row[3])
			row[4] = float(row[4])
			#fetch the cluster-level data the leave-one-out routine needs
			curs.execute("select recurrence_array, vertex_set from %s where mcl_id=%d"%(self.mcl_table, int(row[0])) )
			rows = curs.fetchall()
			#NOTE(review): assumes mcl_id always exists in the table; an empty result
			#would raise IndexError here — confirm upstream guarantees
			#first append the recurrence_array
			row.append(rows[0][0])
			#second append the vertex_set
			row.append(rows[0][1])
			#only leave_one_out
			self._gene_stat_leave_one_out(row, node_distance_class, curs)
			#progress indicator every 2000 records ('\x08' backspaces overwrite in place)
			if self.report and self.no_of_records%2000==0:
				sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
	finally:
		#close the input file even if a row raises (was leaked before)
		self.files.close()
	if self.report:
		sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
	sys.stderr.write("Done.\n")
def core(self, curs):
	"""
	03-14-05
		load go_no2distance on demand
	
	Central driver of the class: stream prediction rows from the database
	through a server-side cursor named 'crs', in batches of 5000, and feed
	each row to the per-row gene-stat routine.
	
	Two modes, selected by self.leave_one_out:
	- leave_one_out: rows are a join of the cluster_stat-like table
	  (self.table) and the mcl_result-like table (self.mcl_table).
	- otherwise: rows come from self.mcl_table alone, pre-filtered by
	  connectivity, recurrence and cluster-size cutoffs.
	
	@param curs: live database cursor; must be inside a transaction for
		DECLARE CURSOR to be valid
	"""
	sys.stderr.write("Starting gene-stat...\n")
	from gene_p_map_redundancy import gene_p_map_redundancy
	node_distance_class = gene_p_map_redundancy()
	#the central function of the class
	if self.leave_one_out:
		#leave_one_out method gets data from both cluster_stat-like and mcl_result-like table
		curs.execute("DECLARE crs CURSOR FOR select c.mcl_id, c.leave_one_out, c.p_value_vector, \
			c.connectivity, m.recurrence_array, m.vertex_set from %s c, %s m where c.mcl_id=m.mcl_id"\
			%(self.table, self.mcl_table))
	else:
		#no leave_one_out method gets data only from mcl_result-like table
		curs.execute("DECLARE crs CURSOR FOR select mcl_id, vertex_set, p_value_min, go_no_vector, unknown_gene_ratio, \
			recurrence_array from %s where connectivity>=%f and p_value_min notnull and array_upper(recurrence_array, 1)>=%d\
			and array_upper(vertex_set, 1)<=%d"%(self.mcl_table, self.connectivity_cut_off, self.recurrence_cut_off, self.cluster_size_cut_off))
	#batch-fetch to bound memory; loop ends when the cursor is exhausted
	curs.execute("fetch 5000 from crs")
	rows = curs.fetchall()
	while rows:
		for row in rows:
			if self.leave_one_out:
				#in leave_one_out, only one gene's function is predicted based on one row
				self._gene_stat_leave_one_out(row, node_distance_class, curs)
			else:
				#in no leave_one_out, function of all vertices in that cluster is predicted based on one row
				self._gene_stat_no_leave_one_out(row)
			#progress indicator: '\x08' backspaces overwrite the running count in place
			if self.report:
				sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
		curs.execute("fetch 5000 from crs")
		rows = curs.fetchall()
	#NOTE(review): cursor 'crs' is never CLOSEd explicitly; presumably released at
	#transaction end — confirm against caller's commit/rollback
	sys.stderr.write("Done.\n")
def return_go_no_map(self, go_no_list, curs, distance_table):
	"""
	03-06-05
	input: a list of go_nos, curs
	output: a map showing which go_no corresponds to which
	
	curs is used to get the go_no2term_id and nodes pairwise distance.
	
	A go_no already absorbed by an earlier representative is skipped.
	Otherwise it becomes its own representative and absorbs every later
	go_no in the list whose jasmine distance to it is 0 (parent-child
	relationship in GO).
	"""
	sys.stderr.write("Mapping go_nos...")
	from gene_p_map_redundancy import gene_p_map_redundancy
	from codense.common import get_go_no2term_id
	redundancy_helper = gene_p_map_redundancy()
	representative_map = {}
	go_no2term_id = get_go_no2term_id(curs)
	distance_cache = {}	#filled lazily by get_distance(); keyed on sorted (go_no, go_no) pairs
	for index in range(len(go_no_list)):
		go_no = go_no_list[index]
		if go_no in representative_map:
			#already flagged (absorbed by an earlier go_no)
			continue
		representative_map[go_no] = go_no
		for later_go_no in go_no_list[index+1:]:
			#normalize the pair so both orders hit the same cache entry
			if go_no < later_go_no:
				cache_key = (go_no, later_go_no)
			else:
				cache_key = (later_go_no, go_no)
			if cache_key in distance_cache:
				jasmine_distance = distance_cache[cache_key][2]
			else:
				jasmine_distance = redundancy_helper.get_distance(curs, go_no, later_go_no, \
					distance_table, distance_cache, go_no2term_id)
			if jasmine_distance == 0:
				#jasmine_distance=0 means they are parent-child
				representative_map[later_go_no] = go_no
	sys.stderr.write("done.\n")
	return representative_map
def run(self):
	"""
	09-05-05
	10-23-05
		create views from old schema
		result goes to the new schema's p_gene_table
		
		(input_node)
			--db_connect()
			--form_schema_tables()
			--form_schema_tables()
			--get_gene_no2go_no_set()
			--get_go_no2depth()
			(pass data to computing_node)
		(computing_node)
			(take data from other nodes, 0 and size-1)
		(judge_node)
			--gene_stat()
			--db_connect()
			--gene_p_map_redundancy()
		(output_node)
			--db_connect()
			--form_schema_tables()
			--form_schema_tables()
			--MpiPredictionFilter()
			--MpiPredictionFilter_instance.createGeneTable()
			--get_go_no2edge_counter_list()(if necessary)
			(pass go_no2edge_counter_list to computing_node)
		(input_node)
			--fetch_cluster_block()
		(computing_node)
			--get_no_of_unknown_genes()
			--node_fire_handler()
			--cleanup_handler()
		--judge_node()
			--gene_stat_instance.(match functions)
		--output_node()
			--output_node_handler()
			--MpiPredictionFilter_instance.submit_to_p_gene_table()
	
	Rank layout: 0 = input node, 1..size-3 = computing nodes,
	size-2 = judge node, size-1 = output node. The two phases below
	(setup, then work dispatch) are separated by mpi_synchronize().
	"""
	communicator = MPI.world.duplicate()
	node_rank = communicator.rank
	#---- phase 1: per-rank setup and broadcast of shared data ----
	if node_rank == 0:
		#input node: load lookup tables from the DB and ship them to all computing nodes
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		"""
		#01-02-06
		old_schema_instance = form_schema_tables(self.input_fname)
		new_schema_instance = form_schema_tables(self.jnput_fname)
		"""
		gene_no2go_no = get_gene_no2go_no_set(curs)
		gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)	#-1 means use the highest protocol
		go_no2depth = get_go_no2depth(curs)
		go_no2depth_pickle = cPickle.dumps(go_no2depth, -1)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)
		go_no2gene_no_set_pickle = cPickle.dumps(go_no2gene_no_set, -1)
		for node in range(1, communicator.size-2):	#send it to the computing_node
			communicator.send(gene_no2go_no_pickle, node, 0)
			communicator.send(go_no2depth_pickle, node, 0)
			communicator.send(go_no2gene_no_set_pickle, node, 0)
	elif node_rank<=communicator.size-3:	#WATCH: last 2 nodes are not here.
		#computing node: receive the three pickles from rank 0, in the same order they were sent
		data, source, tag = communicator.receiveString(0, 0)
		gene_no2go_no = cPickle.loads(data)	#take the data
		data, source, tag = communicator.receiveString(0, 0)
		go_no2depth = cPickle.loads(data)
		data, source, tag = communicator.receiveString(0, 0)
		go_no2gene_no_set = cPickle.loads(data)
		data, source, tag = communicator.receiveString(communicator.size-1, 0)	#from the last node
		go_no2edge_counter_list = cPickle.loads(data)
		#choose a functor for recurrence_array
		functor_dict = {0: None,
			1: lambda x: int(x>=self.recurrence_x),
			2: lambda x: math.pow(x, self.recurrence_x)}
		functor = functor_dict[self.recurrence_x_type]
	elif node_rank == communicator.size-2:	#judge node
		gene_stat_instance = gene_stat(depth_cut_off=self.depth)
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		gene_stat_instance.dstruc_loadin(curs)
		from gene_p_map_redundancy import gene_p_map_redundancy
		node_distance_class = gene_p_map_redundancy()
	elif node_rank==communicator.size-1:
		#output node
		#establish connection before pursuing
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		"""
		#01-02-06, input and output are all directed to files
		old_schema_instance = form_schema_tables(self.input_fname)
		new_schema_instance = form_schema_tables(self.jnput_fname)
		MpiPredictionFilter_instance = MpiPredictionFilter()
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
		if self.new_table:
			MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
		"""
		#obtain go_no2edge_counter_list: from a pickle file if given, else computed (or None)
		if self.go_no2edge_counter_list_fname:
			go_no2edge_counter_list = cPickle.load(open(self.go_no2edge_counter_list_fname,'r'))
		else:
			if self.eg_d_type==2:
				go_no2edge_counter_list = None
			else:
				gene_no2go_no = get_gene_no2go_no_set(curs)
				go_no2edge_counter_list = get_go_no2edge_counter_list(curs, gene_no2go_no, self.edge_type2index)
		go_no2edge_counter_list_pickle = cPickle.dumps(go_no2edge_counter_list, -1)
		for node in range(1, communicator.size-2):	#send it to the computing_node
			communicator.send(go_no2edge_counter_list_pickle, node, 0)
	mpi_synchronize(communicator)
	#---- phase 2: dispatch the actual work per rank ----
	free_computing_nodes = range(1,communicator.size-2)	#exclude the last node
	if node_rank == 0:
		"""
		curs.execute("DECLARE crs CURSOR FOR SELECT id, vertex_set, edge_set, no_of_edges,\
			connectivity, unknown_gene_ratio, recurrence_array, d_matrix from %s"%(old_schema_instance.pattern_table))
		"""
		self.counter = 0	#01-02-06 counter is used as id
		reader = csv.reader(open(self.input_fname, 'r'), delimiter='\t')
		parameter_list = [reader]
		input_node(communicator, parameter_list, free_computing_nodes, self.message_size, \
			self.report, input_handler=self.input_handler)
		del reader
	elif node_rank in free_computing_nodes:
		no_of_unknown_genes = get_no_of_unknown_genes(gene_no2go_no)
		GradientScorePrediction_instance = GradientScorePrediction(gene_no2go_no, go_no2gene_no_set, go_no2depth, \
			go_no2edge_counter_list, no_of_unknown_genes, self.depth, self.min_layer1_associated_genes, \
			self.min_layer1_ratio, self.min_layer2_associated_genes, self.min_layer2_ratio, self.exponent, \
			self.score_list, self.max_layer, self.norm_exp, self.eg_d_type, self.debug)
		parameter_list = [GradientScorePrediction_instance, functor]
		computing_node(communicator, parameter_list, self.node_fire_handler, self.cleanup_handler, self.report)
	elif node_rank == communicator.size-2:
		self.judge_node(communicator, curs, gene_stat_instance, node_distance_class)
	elif node_rank==communicator.size-1:
		#01-02-06 output goes to plain file, not database
		#NOTE(review): 'jnput_fname' looks like a typo for an output filename but is
		#used consistently as the output path elsewhere — confirm against option parsing before renaming
		writer = csv.writer(open(self.jnput_fname, 'w'), delimiter='\t')
		parameter_list = [writer]
		output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
		del writer
def output(self, curs, gene_no2go_id_set_list, go_id_set_list, support, prefix, gene_no2id, go_id2name, schema_list): """ 07-06-05 """ sys.stderr.write("Outputing...") #get the total set total_gene_no_set = Set() total_go_id_set = Set() for i in range(len(gene_no2go_id_set_list)): total_gene_no_set |= Set(gene_no2go_id_set_list[i].keys()) total_go_id_set |= go_id_set_list[i] print "the total number of genes is ",len(total_gene_no_set) gene_ofname = '%s.gene'%prefix function_ofname = '%s.function'%prefix gene_writer = csv.writer(open(gene_ofname,'w'), delimiter='\t') function_writer = csv.writer(open(function_ofname, 'w'), delimiter='\t') gene_writer.writerow(['']+schema_list) function_writer.writerow([''] + schema_list) from gene_p_map_redundancy import gene_p_map_redundancy node_distance_class = gene_p_map_redundancy() go_id2term_id = get_go_id2term_id(curs) go_term_id2depth = get_go_term_id2depth(curs) #output the gene-oriented information for gene_no in total_gene_no_set: freq = 0 p_go_id_set_list = [] for i in range(len(gene_no2go_id_set_list)): if gene_no in gene_no2go_id_set_list[i]: p_go_id_set_list.append(gene_no2go_id_set_list[i][gene_no]) freq += 1 if freq == support: if self.p_go_id_set_list_distinct(curs, p_go_id_set_list, node_distance_class, go_term_id2depth, go_id2term_id): row = [gene_no2id[gene_no]] for i in range(len(gene_no2go_id_set_list)): if gene_no in gene_no2go_id_set_list[i]: go_id_set = gene_no2go_id_set_list[i][gene_no] go_name_list = dict_map(go_id2name, go_id_set) row.append(';'.join(go_name_list)) else: row.append('') gene_writer.writerow(row) #output the function_oriented information for go_id in total_go_id_set: freq = 0 for i in range(len(go_id_set_list)): if go_id in go_id_set_list[i]: freq += 1 if freq == support: row = ['%s(%s)'%(go_id2name[go_id],go_id)] for i in range(len(go_id_set_list)): if go_id in go_id_set_list[i]: row.append('1') else: row.append('0') function_writer.writerow(row) sys.stderr.write("Done.\n")