def dstruc_loadin(self, curs):
    """
    Load the GO/gene lookup tables through curs and build label_dict.

    Attributes set on self:
        go_no2go_id, go_no2go_name, gene_no2gene_id, gene_id2gene_no,
        global_gene_to_go_dict: results of the matching codense.common getters.
        label_dict: {1: gene_no2gene_id, 2: identity mapping gene_no -> gene_no}.
        no_of_datasets: only when self.type is 3; integer read from the
            first row of "select array_upper(recurrence_array,1)" on self.table.

    Progress messages go to stderr.
    """
    sys.stderr.write("Loading Data STructure...\n")
    from codense.common import get_go_no2go_id
    from codense.common import get_gene_no2gene_id
    from codense.common import get_go_no2name
    from codense.common import get_gene_id2gene_no
    from codense.common import get_gene_no2go_no

    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)
    self.gene_id2gene_no = get_gene_id2gene_no(curs)
    self.global_gene_to_go_dict = get_gene_no2go_no(curs)

    # 04-01-05 the second kind in label_dict: every gene_no labelled by itself
    gene_nos = list(self.gene_no2gene_id)
    identity_labels = dict(zip(gene_nos, gene_nos))
    self.label_dict = {1: self.gene_no2gene_id, 2: identity_labels}

    # NOTE(review): the result of this query is never fetched in this method;
    # it is issued exactly as before.
    curs.execute("select gene_no,go_functions from gene")
    if self.type == 3:
        curs.execute("select array_upper(recurrence_array,1) from %s limit 1"%self.table)
        first_row = curs.fetchall()[0]
        self.no_of_datasets = int(first_row[0])
    sys.stderr.write("Done\n")
def dstruc_loadin(self, curs):
    """
    03-09-05
        get the context from mcl_table via linking through mcl_id of p_gene_table
        context_dict is set

    Load the lookup tables and fill self.gene_prediction_dict.

    Attributes set on self: known_genes_dict, go_no2go_id, go_no2go_name,
    gene_no2gene_id (results of the matching codense.common getters).

    self.gene_prediction_dict must already exist. For every row returned by
    the join of self.gene_table (p), self.table (g) and self.mcl_table (m),
    a function_struc is stored under
    gene_prediction_dict[gene_no].p_functions_struc_dict[go_no].

    self.type filters the rows:
        2: keep only genes present in known_genes_dict
        3: keep only genes absent from known_genes_dict
        any other value: keep every row

    An empty vertex_set literal (e.g. "{}") yields an empty context set
    instead of raising ValueError.
    """
    from codense.common import get_known_genes_dict, get_go_no2go_id, get_go_no2name, get_gene_no2gene_id
    self.known_genes_dict = get_known_genes_dict(curs)
    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)

    sys.stderr.write("Setting up gene_prediction_dict...")
    # setup self.gene_prediction_dict
    # Table names cannot be bound as query parameters, hence the % formatting.
    curs.execute(
        "select p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, p.is_correct_lca, m.vertex_set "
        "from %s p, %s g, %s m where g.p_gene_id=p.p_gene_id and m.mcl_id=p.mcl_id"
        % (self.gene_table, self.table, self.mcl_table)
    )
    for row in curs.fetchall():
        gene_no = row[0]
        if self.type == 2 and gene_no not in self.known_genes_dict:
            # I only want the known genes, but this gene is unknown
            continue
        if self.type == 3 and gene_no in self.known_genes_dict:
            # i only want the unknown genes, but this gene is known
            continue
        go_no = row[1]

        # vertex_set arrives as text wrapped in one delimiter character on
        # each side (stripped by [1:-1]); blank tokens are skipped so that an
        # empty array does not end up as int('').
        vertex_tokens = row[5][1:-1].split(",")
        vertex_set = [int(token) for token in vertex_tokens if token.strip()]

        item = function_struc()
        item.is_correct = row[2]
        item.is_correct_l1 = row[3]
        item.is_correct_lca = row[4]
        # context_dict is a set
        item.context_dict = Set(vertex_set)

        if gene_no not in self.gene_prediction_dict:
            self.gene_prediction_dict[gene_no] = gene_prediction()
        self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = item
    sys.stderr.write("Done\n")
def dstruc_loadin(self, curs):
    """
    03-14-05
        remove the distance loading part

    Populate the known-gene and GO lookup attributes on self by calling the
    matching codense.common getter with curs, in the order listed below.
    Progress messages go to stderr.
    """
    sys.stderr.write("Loading Data STructure...\n")
    from codense.common import get_known_genes_dict
    from codense.common import get_go_no2go_id
    from codense.common import get_go_no2term_id
    from codense.common import get_go_no2depth
    from codense.common import get_go_term_id2go_no
    from codense.common import get_go_term_id2depth

    # (attribute name on self, getter producing its value)
    attribute_loaders = [
        ('known_genes_dict', get_known_genes_dict),
        ('go_no2go_id', get_go_no2go_id),
        ('go_no2term_id', get_go_no2term_id),
        ('go_no2depth', get_go_no2depth),
        ('go_term_id2go_no', get_go_term_id2go_no),
        ('go_term_id2depth', get_go_term_id2depth),
    ]
    for attribute_name, loader in attribute_loaders:
        setattr(self, attribute_name, loader(curs))
    sys.stderr.write("Done\n")
def output(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    03-03-05
        loop over gene_no2p_gene_id_src and p_gene_id_src_map
    03-13-05
        add a column, #clusters in the output file

    Write the prediction table to outf as tab-delimited rows: all known
    genes first, then all unknown genes. Each gene gets its own section
    (output_one_gene), a column-header row, one output_function_group() call
    per entry of its p_gene_id_src list, and a closing empty row.

    known_gene_no2p_gene_id_src / unknown_gene_no2p_gene_id_src:
        mappings of gene_no to a list of p_gene_id_src.
    p_gene_id_src_map:
        indexed by p_gene_id_src; the value is handed to output_function_group().

        --output_one_gene()
        --output_function_group()
    """
    # lookup tables needed by the per-gene and per-function writers
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')
    header = [
        'go_no', 'go_id', 'go_name', 'is_correct', 'is_correct_L1', 'is_correct_lca',
        'p_value_list', '#clusters', 'mcl_id_list', 'e_acc', 'e_acc_pair', 'cluster_context',
    ]
    # first the known genes, second the unknown genes; same layout for both
    for gene_no2p_gene_id_src in (known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
        for gene_no, p_gene_id_src_list in gene_no2p_gene_id_src.iteritems():
            self.output_one_gene(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
            writer.writerow(header)
            for p_gene_id_src in p_gene_id_src_list:
                unit = p_gene_id_src_map[p_gene_id_src]
                self.output_function_group(
                    curs, writer, unit, gene_no2gene_id,
                    go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
            writer.writerow([])
    del writer
    sys.stderr.write("Done\n")
def output1(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    03-15-05
        copied from output()

    Write the prediction table to outf as tab-delimited rows: all known
    genes first, then all unknown genes. Each gene gets its own section
    (output_one_gene1), a column-header row, one output_function_group1()
    call per entry of its p_gene_id_src list, and a closing empty row.

    Known and unknown genes use different header rows, and only the known
    genes pass prediction_pair2lca_list and gene_no to
    output_function_group1().
    """
    # lookup tables needed by the per-gene and per-function writers
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)
    from codense.common import get_prediction_pair2lca_list
    prediction_pair2lca_list = get_prediction_pair2lca_list(curs,p_gene_table=self.p_gene_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')

    known_header = ['go_id', 'go_name', 'is_correct_lca', 'lca_list', 'p_value_list', '#clusters', 'e_acc']
    unknown_header = ['go_id', 'go_name', 'p_value_list', '#clusters', 'e_acc']
    # positional arguments common to every output_function_group1() call
    shared_args = (gene_no2gene_id, go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)

    # first output the known genes
    for gene_no, p_gene_id_src_list in known_gene_no2p_gene_id_src.iteritems():
        self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
        writer.writerow(known_header)
        # NOTE: the arguments passed to this function is different between known and unknown genes.
        known_args = shared_args + (prediction_pair2lca_list, gene_no)
        for p_gene_id_src in p_gene_id_src_list:
            self.output_function_group1(curs, writer, p_gene_id_src_map[p_gene_id_src], *known_args)
        writer.writerow([])

    # second output the unknown genes
    for gene_no, p_gene_id_src_list in unknown_gene_no2p_gene_id_src.iteritems():
        self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
        writer.writerow(unknown_header)
        for p_gene_id_src in p_gene_id_src_list:
            self.output_function_group1(curs, writer, p_gene_id_src_map[p_gene_id_src], *shared_args)
        writer.writerow([])

    del writer
    sys.stderr.write("Done\n")
def run(self):
    """
    10-31-05
    2006-09-26 modify it to be compatible with the modified pipeline from haifeng
    2006-11-06 add type
    2006-12-13 use font_path and font_size

    Produce three output filenames from self.output_prefix and hand each to
    its drawing method:
        <prefix>.function_map.png       -> draw_function_map()
        <prefix>.gene_function_map.png  -> draw_gene_function_map()
        <prefix>.tf_map.png             -> draw_tf_map()

    self.type selects how the prediction list and mcl_id2go_no_set are read:
        1: get_no_of_p_funcs_gene_no_go_no_list_from_db(), using the p_gene
           ids returned by p_gene_id_set_from_gene_p_table()
        2: get_no_of_p_funcs_gene_no_go_no_list_from_file(self.inputfname)

    Raises:
        ValueError: self.type is neither 1 nor 2. It is raised before the
            schema tables are formed or the database is contacted;
            previously this surfaced later as a NameError on the unbound
            prediction variables.

        --form_schema_tables()
        --db_connect()
        --get_char_dimension()
        --get_no_of_p_funcs_gene_no_go_no_list()
        --get_recurrence_go_no_rec_array_cluster_id_ls()
        --get_go_no2name()
        --draw_function_map()
        --draw_gene_function_map()
        --get_recurrence_rec_array_bs_no_list()
        --get_mt_no2tf_name()
        --draw_tf_map()
    """
    # Reject an unsupported type up front instead of failing half-way through.
    if self.type not in (1, 2):
        raise ValueError("run(): unsupported type %r (expected 1 or 2)" % (self.type,))

    schema_instance = form_schema_tables(self.inputfname, self.acc_cutoff, self.lm_bit)
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    font = ImageFont.truetype(self.font_path, self.font_size)
    # size of a single character, passed to the drawing methods
    char_dimension = font.getsize('a')
    #char_dimension = get_char_dimension()

    #go_no2name = get_go_no2name(curs)
    # NOTE(review): despite its name this holds the result of get_go_id2name()
    go_no2name = get_go_id2name(curs)
    if self.type==1:
        go_no2go_id = get_go_no2go_id(curs)
        given_p_gene_set = p_gene_id_set_from_gene_p_table(curs, schema_instance.gene_p_table)
        no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_db(
            curs, schema_instance.p_gene_table, given_p_gene_set, go_no2go_id)
    elif self.type==2:
        no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_file(self.inputfname)

    recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence = \
        self.get_recurrence_go_no_rec_array_cluster_id_ls(curs, self.pattern_table, mcl_id2go_no_set)

    no_of_functions = len(recurrence_go_no_rec_array_cluster_id_ls)
    function_map_output_fname = '%s.function_map.png'%self.output_prefix
    go_no2index, function_name_region = self.draw_function_map(
        recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets,
        go_no2name, function_map_output_fname, self.function_name_length,
        char_dimension, no_of_functions, font)

    # the gene-function map reuses go_no2index and function_name_region from the function map
    gene_function_map_output_fname = '%s.gene_function_map.png'%self.output_prefix
    self.draw_gene_function_map(
        no_of_p_funcs_gene_no_go_no_list, go_no2index, function_name_region,
        gene_function_map_output_fname, self.function_name_length,
        char_dimension, no_of_functions, font)

    #tf_map requires mcl_id2enc_recurrence and no_of_datasets from above
    recurrence_rec_array_bs_no_list = self.get_recurrence_rec_array_bs_no_list(curs, self.cluster_bs_table, mcl_id2enc_recurrence)
    # NOTE(review): tax_id=9606 is presumably the taxonomy id for human -- confirm
    mt_no2tf_name = get_gene_id2gene_symbol(curs, tax_id=9606)
    #mt_no2tf_name = get_mt_no2tf_name()
    tf_map_output_fname = '%s.tf_map.png'%self.output_prefix
    self.draw_tf_map(
        recurrence_rec_array_bs_no_list, no_of_datasets, mt_no2tf_name,
        tf_map_output_fname, self.function_name_length, char_dimension, font)