def compare_gene_condition_vertex_set(curs, p_gene_table, gene_p_table, good_cluster_table, output_fname):
    """
    Dump, per gene, the result of comparing its clusters against cluster vertex sets.

    Steps:
      1. Join p_gene_table and gene_p_table on p_gene_id and collect, for every
         gene_no, the list of [mcl_id, go_no] pairs it appears with.
      2. Read (mcl_id, vertex_set, recurrence_array) from good_cluster_table and
         build mcl_id -> [recurrence_array, Set(vertex_set)].
      3. For every gene, call get_mcl_id_sharing() with its pair list and the
         cluster mapping, and write each returned row, prefixed with gene_no,
         to output_fname as a tab-delimited line.

    Arguments:
      curs                -- database cursor; must support server-side cursors
                             (DECLARE / FETCH / CLOSE).
      p_gene_table        -- table providing gene_no, mcl_id, go_no, p_gene_id.
      gene_p_table        -- table providing the p_gene_id values to keep.
      good_cluster_table  -- table providing mcl_id, vertex_set, recurrence_array.
      output_fname        -- path of the tab-delimited output file (overwritten).

    Progress and status messages go to stderr. Returns None.
    """
    import csv
    import sys
    # sets.Set (not the builtin set) is kept on purpose: the sets built here are
    # handed to get_mcl_id_sharing(), which is defined elsewhere in this file.
    from sets import Set
    from codense.common import pg_1d_array2python_ls

    sys.stderr.write("Getting gene_no2mcl_id_go_no_list ...\n")
    gene_no2mcl_id_go_no_list = {}
    curs.execute(
        "DECLARE crs1 CURSOR for select p.gene_no, p.mcl_id, p.go_no from %s p, %s g\
        where p.p_gene_id=g.p_gene_id" % (p_gene_table, gene_p_table)
    )
    counter = 0
    curs.execute("fetch 1000 from crs1")
    rows = curs.fetchall()
    while rows:
        for gene_no, mcl_id, go_no in rows:
            gene_no2mcl_id_go_no_list.setdefault(gene_no, []).append([mcl_id, go_no])
            counter += 1
        # Backspaces overwrite the previous counter on the terminal.
        sys.stderr.write("%s%s" % ("\x08" * 30, counter))
        curs.execute("fetch 1000 from crs1")
        rows = curs.fetchall()
    curs.execute("close crs1")
    sys.stderr.write("Done.\n")

    sys.stderr.write("Getting mcl_id2recurrence_array_vertex_set ...\n")
    mcl_id2recurrence_array_vertex_set = {}
    curs.execute("DECLARE crs0 CURSOR for select mcl_id, vertex_set, recurrence_array from %s" % good_cluster_table)
    counter = 0
    curs.execute("fetch 1000 from crs0")
    rows = curs.fetchall()
    while rows:
        for mcl_id, vertex_set, recurrence_array in rows:
            vertex_set = pg_1d_array2python_ls(vertex_set)
            recurrence_array = pg_1d_array2python_ls(recurrence_array)
            mcl_id2recurrence_array_vertex_set[mcl_id] = [recurrence_array, Set(vertex_set)]
            counter += 1
        sys.stderr.write("%s%s" % ("\x08" * 30, counter))
        curs.execute("fetch 1000 from crs0")
        rows = curs.fetchall()
    curs.execute("close crs0")
    sys.stderr.write("Done.\n")

    sys.stderr.write("Comparing gene condition and vertex_set ...\n")
    # Hold the file handle explicitly so it is flushed and closed even when
    # get_mcl_id_sharing() or a write raises.
    outf = open(output_fname, "w")
    try:
        writer = csv.writer(outf, delimiter="\t")
        for gene_no, mcl_id_go_no_list in gene_no2mcl_id_go_no_list.iteritems():
            cmp_condition_vertex_result = get_mcl_id_sharing(mcl_id_go_no_list, mcl_id2recurrence_array_vertex_set)
            for row in cmp_condition_vertex_result:
                writer.writerow([gene_no] + row)
    finally:
        outf.close()
    sys.stderr.write("Done.\n")
def get_prot_interaction_graph(self, curs, prot_interaction_table, tax_id):
    """
    2006-11-20
    2006-01-05 print the number of connected components in the end

    Build a graph from the rows of prot_interaction_table for one tax_id.

    Every row supplies a gene_id_array and an interaction_type_id. Each
    unordered pair of genes within one array becomes an edge carrying
    that interaction_type_id. Rows are pulled through a server-side
    cursor, 3000 at a time.

    Arguments:
      curs                    -- database cursor.
      prot_interaction_table  -- table with gene_id_array,
                                 interaction_type_id and tax_id columns.
      tax_id                  -- taxonomy id used to filter the rows.

    Returns the populated nx.XGraph. Node, edge and connected-component
    counts are written to stderr.
    """
    sys.stderr.write("Getting protein interaction graph ...\n")
    curs.execute("DECLARE crs0 CURSOR for select gene_id_array, interaction_type_id\
        from %s where tax_id=%s"%(prot_interaction_table, tax_id))
    curs.execute("fetch 3000 from crs0")
    batch = curs.fetchall()
    prot_interaction_graph = nx.XGraph()
    no_of_rows = 0
    while batch:
        for raw_gene_ids, interaction_type_id in batch:
            gene_ids = pg_1d_array2python_ls(raw_gene_ids)
            # Link each gene to every gene that follows it in the array;
            # arrays with fewer than two genes yield no pair.
            for position, gene_a in enumerate(gene_ids):
                for gene_b in gene_ids[position+1:]:
                    prot_interaction_graph.add_edge(gene_a, gene_b, interaction_type_id)
            no_of_rows += 1
        if self.report:
            # Backspaces overwrite the previous counter on the terminal.
            sys.stderr.write("%s%s"%('\x08'*20, no_of_rows))
        curs.execute("fetch 3000 from crs0")
        batch = curs.fetchall()
    curs.execute("close crs0")
    summary = (
        nx.number_of_nodes(prot_interaction_graph),
        nx.number_of_edges(prot_interaction_graph),
        nx.number_connected_components(prot_interaction_graph),
    )
    sys.stderr.write("%s nodes, %s edges and %s components, Done.\n"%summary)
    return prot_interaction_graph
def get_prom_seq_from_entrezgene_mapping_table(self, curs, prom_seq_table, entrezgene_mapping_table='entrezgene_mapping',
        annot_assembly_table = 'annot_assembly'):
    """
    Derive upstream and first-intron regions for each gene and submit them to prom_seq_table.

    Genes come from entrezgene_mapping_table joined to annot_assembly_table
    on genomic_gi = gi, fetched 10000 rows at a time. For each gene:

      - Segment coordinates come from the cds arrays if present, else the
        mrna arrays, else the single [start, stop] pair; they are then sorted.
      - strand '1' (stored as '+'): the upstream window is the 10000
        positions ending just before the first segment, clamped at 1.
      - strand '-1' (stored as '-'): the upstream window is the 10000
        positions starting just after the last segment.
      - Any other strand value: the gene is skipped.
      - The far end of the window is replaced by the value of
        self.return_closest_anchor(); if the window becomes empty, no
        upstream region is submitted.
      - With more than one segment, the gap next to the first (plus strand)
        or last (minus strand) segment is taken as the first intron.

    Upstream regions are submitted with prom_type_id 1, first introns with
    prom_type_id 5, both through self.submit_to_prom_seq().

    Arguments:
      curs                      -- database cursor.
      prom_seq_table            -- destination table, passed to
                                   self.submit_to_prom_seq().
      entrezgene_mapping_table  -- source table of gene coordinates.
      annot_assembly_table      -- table supplying the chromosome per gi.

    Reads self.report (progress output) and self.debug (extra logging, and
    stop after the first batch). Returns None.
    """
    sys.stderr.write("Getting prom_seq from entrezgene_mapping_table...\n")
    curs.execute("DECLARE crs CURSOR FOR SELECT e.gene_id, e.genomic_gi, e.tax_id, a.chromosome, e.strand,\
        e.start, e.stop, e.mrna_start, e.mrna_stop, e.cds_start, e.cds_stop from %s e, %s a \
        where e.genomic_gi=a.gi"%(entrezgene_mapping_table, annot_assembly_table))
    curs.execute("fetch 10000 from crs")
    rows = curs.fetchall()
    counter = 0
    while rows:
        for row in rows:
            gene_id, genomic_gi, tax_id, chromosome, strand, start, stop, mrna_start, mrna_stop, cds_start, cds_stop = row
            # Pick the most specific coordinate source available.
            if cds_start and cds_stop:
                seg_start_ls = pg_1d_array2python_ls(cds_start, int)
                seg_stop_ls = pg_1d_array2python_ls(cds_stop, int)
            elif mrna_start and mrna_stop:
                seg_start_ls = pg_1d_array2python_ls(mrna_start, int)
                seg_stop_ls = pg_1d_array2python_ls(mrna_stop, int)
            else:
                seg_start_ls = [start]
                seg_stop_ls = [stop]
            seg_loc_ls = [[seg_start_ls[i], seg_stop_ls[i]] for i in range(len(seg_start_ls))]
            seg_loc_ls.sort()    #some genes have reversed cds order

            ps_attr_instance = prom_seq_attr()
            ps_attr_instance.prom_acc = gene_id
            ps_attr_instance.chromosome = chromosome
            ps_attr_instance.organism = tax_id2org(tax_id)
            upstream_loc_ls = [0,0]
            instron_1st_loc_ls = []
            if strand=='1':    #plus strand
                ps_attr_instance.strand = '+'
                upstream_loc_ls[1] = seg_loc_ls[0][0]-1
                upstream_loc_ls[0] = upstream_loc_ls[1] - 9999
                if upstream_loc_ls[0]<1:    #in case exceed the chromosome boundary
                    upstream_loc_ls[0] = 1
                #check whether there's gene upstream
                upstream_loc_ls[0] = self.return_closest_anchor(curs, 'stop', upstream_loc_ls, gene_id, tax_id, genomic_gi, \
                    entrezgene_mapping_table)
                if upstream_loc_ls[0]>upstream_loc_ls[1]:    #No upstream
                    if self.debug:
                        sys.stderr.write("\tgene_id: %s no upstream\n"%gene_id)
                    upstream_loc_ls = []
                if len(seg_loc_ls)>1:    #the first intron
                    instron_1st_loc_ls.append(seg_loc_ls[0][1]+1)
                    instron_1st_loc_ls.append(seg_loc_ls[1][0]-1)
            elif strand=='-1':    #minus strand
                ps_attr_instance.strand = '-'
                upstream_loc_ls[0] = seg_loc_ls[-1][1]+1
                upstream_loc_ls[1] = upstream_loc_ls[0] + 9999
                #NOTE: exceeding the chromosome boundary is taken care of by get_sequence_segment()
                #check whether there's gene upstream
                upstream_loc_ls[1] = self.return_closest_anchor(curs, 'start', upstream_loc_ls, gene_id, tax_id, genomic_gi, \
                    entrezgene_mapping_table)
                if upstream_loc_ls[0]>upstream_loc_ls[1]:    #No upstream
                    # NOTE(review): unlike the plus-strand branch, this message is
                    # not guarded by self.debug -- confirm whether that is intended.
                    sys.stderr.write("\tgene_id: %s no upstream\n"%gene_id)
                    upstream_loc_ls = []
                if len(seg_loc_ls)>1:    #the first intron
                    instron_1st_loc_ls.append(seg_loc_ls[-2][1]+1)
                    instron_1st_loc_ls.append(seg_loc_ls[-1][0]-1)
            else:    #ignore genes with no strand info, some are not real genes
                continue

            #1st deal with upstream_loc_ls
            if upstream_loc_ls:
                ps_attr_instance.prom_genome_start = upstream_loc_ls[0]
                ps_attr_instance.prom_genome_end = upstream_loc_ls[1]
                ps_attr_instance.prom_type_id = 1
                ps_attr_instance.sequence = get_sequence_segment(curs, genomic_gi, upstream_loc_ls[0], upstream_loc_ls[1])
                self.submit_to_prom_seq(curs, prom_seq_table, ps_attr_instance)

            #2nd handle instron_1st_loc_ls, might not exist
            if instron_1st_loc_ls:
                if instron_1st_loc_ls[0]>instron_1st_loc_ls[1]:
                    # Only a warning: the region is still submitted below.
                    sys.stderr.write("\tgene_id: %s weird 1st intron %s.\n"%(gene_id, instron_1st_loc_ls))
                # The same attribute object is reused; only the region fields change.
                ps_attr_instance.prom_genome_start = instron_1st_loc_ls[0]
                ps_attr_instance.prom_genome_end = instron_1st_loc_ls[1]
                ps_attr_instance.prom_type_id = 5
                ps_attr_instance.sequence = get_sequence_segment(curs, genomic_gi, instron_1st_loc_ls[0], instron_1st_loc_ls[1])
                self.submit_to_prom_seq(curs, prom_seq_table, ps_attr_instance)
            counter += 1
        if self.report:
            sys.stderr.write("%s\t%s"%('\x08'*20, counter))
        if self.debug:    #enough
            break
        curs.execute("fetch 10000 from crs")
        rows = curs.fetchall()
    # Release the server-side cursor (also after the debug early exit) so a
    # later DECLARE crs in the same transaction does not fail.
    curs.execute("close crs")
    sys.stderr.write("Done getting prom_seq from entrezgene_mapping_table.\n")