def run(self):
    """
    Connect to the database, load the gene anchors and find the SNP contexts.

    Steps:
        --open a raw MySQLdb connection/cursor and a Stock_250kDB handle
        --get_entrezgene_annotated_anchor()
        --find_SNP_context()
    """
    import MySQLdb

    # 2008-08-13 Two elixir databases in one process are bad: tables end up
    # cross-created in both databases. The genome annotation is therefore
    # fetched through a plain MySQLdb cursor rather than through
    # transfac.src.GenomeDB.GenomeDatabase.
    connection = MySQLdb.connect(
        db=self.dbname,
        host=self.hostname,
        user=self.db_user,
        passwd=self.db_passwd)
    cursor = connection.cursor()

    stock_db = Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname)
    connection.autocommit(True)

    # Fully-qualified table names: both tables live in the 'genome' database.
    anchor_data = get_entrezgene_annotated_anchor(
        cursor,
        self.tax_id,
        'genome.entrezgene_mapping',
        'genome.annot_assembly')
    chromosome2anchor_gene_tuple_ls, gene_id2coord = anchor_data

    self.find_SNP_context(
        stock_db,
        cursor,
        Snps.table.name,
        SnpsContext.table.name,
        chromosome2anchor_gene_tuple_ls,
        gene_id2coord,
        max_upstream_distance=self.max_upstream_distance,
        max_downstream_distance=self.max_downstream_distance,
        need_commit=self.commit,
        debug=self.debug)
def ucsc_tfbs_conserved_parse(self, curs, inputfile, gene_symbol2gene_id, tax_id):
    """
    Parse a ucsc_tfbs_conserved file and submit every binding site found.

    Every data line must hold eight tab-separated columns:
    bin, chromosome, start, end, name, score, strand, zscore.
    Lines starting with '#' and blank lines are skipped, and so are
    chromosomes whose name contains 'random'. The first three characters
    of the chromosome column are dropped (presumably a 'chr' prefix --
    confirm against the input format).

    For each site the target genes are taken from return_target_gene_ls():
    genes touched by the site if there are any, otherwise the left + right
    upstream genes. A site with neither is skipped. One binding site record
    is submitted per target gene.

    A row that raises is reported (line number and raw line on stdout, the
    traceback on stderr) and parsing continues with the next row.

    Arguments:
        curs: database cursor handed to the query/submit helpers.
        inputfile: path of the file to parse.
        gene_symbol2gene_id: not used here; kept for interface compatibility.
        tax_id: taxonomy id used for the anchors, matrices and binding sites.

    2008-08-12 return_target_gene_ls() returns a different data structure.
    03-29-06
        --get_entrezgene_annotated_anchor()
        --submit2matrix()
        --return_target_gene_ls()
        --find_binding_site_disp_coord()
        --get_sequence_segment()
        --submit2binding_site()
    """
    import traceback    # only needed to report rows that fail

    sys.stderr.write("Parsing ucsc_tfbs_conserved ...\n")
    chromosome2anchor_gene_tuple_ls, gene_id2coord = get_entrezgene_annotated_anchor(curs, tax_id)
    mt_id_set = set()    # matrices already submitted
    line_no = 0
    inf = open(inputfile, 'r')
    try:
        for line in inf:
            line_no += 1
            if line.startswith('#') or not line.strip():
                # skip the comment lines and blank lines
                continue
            # strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character
            row = line.rstrip('\r\n').split('\t')
            try:
                bin_no, chromosome, start, end, name, score, strand, zscore = row
                chromosome = chromosome[3:]
                if 'random' in chromosome:
                    # 03-29-06 not random sequences
                    continue

                binding_site_attr = binding_site_attribute(tax_id=tax_id)
                binding_site_attr.chromosome = chromosome
                binding_site_attr.strand = strand
                mt_id = name
                if mt_id not in mt_id_set:
                    self.submit2matrix(curs, mt_id, tax_id)
                    mt_id_set.add(mt_id)
                binding_site_attr.mt_id = mt_id
                binding_site_attr.bs_genome_start = int(start)
                binding_site_attr.bs_genome_end = int(end)
                binding_site_attr.matrix_similarity_score = float(score)
                binding_site_attr.core_similarity_score = float(zscore)

                regulatory_coord = (binding_site_attr.chromosome,
                                    binding_site_attr.bs_genome_start,
                                    binding_site_attr.bs_genome_end)
                # 2008-08-12 return_target_gene_ls() returns an object with
                # one list per relationship; the old tuple-unpacking call
                # has been dropped.
                pdata = self.return_target_gene_ls(regulatory_coord,
                                                   chromosome2anchor_gene_tuple_ls,
                                                   gene_id2coord)
                if pdata.regulatory_touch_target_gene_ls:
                    target_gene_ls_type = 'touch'
                    target_gene_ls = pdata.regulatory_touch_target_gene_ls
                elif (pdata.regulatory_is_left_upstream_target_gene_ls
                        or pdata.regulatory_is_right_upstream_target_gene_ls):
                    target_gene_ls_type = 'upstream'
                    target_gene_ls = (pdata.regulatory_is_left_upstream_target_gene_ls
                                      + pdata.regulatory_is_right_upstream_target_gene_ls)
                else:
                    # no target gene: skip instead of reusing the previous
                    # row's genes (or hitting an unbound name on the first row)
                    continue

                binding_site_attr.comment = 'bin:%s. %s' % (bin_no, target_gene_ls_type)
                for target_gene_tuple in target_gene_ls:
                    anchor, gene_id = target_gene_tuple[:2]
                    binding_site_attr.prom_id = gene_id
                    gene_start, gene_stop, gene_strand, gene_genomic_gi = gene_id2coord[gene_id]
                    self.find_binding_site_disp_coord(binding_site_attr, gene_strand,
                                                      gene_start, gene_stop)
                    binding_site_attr.sequence = get_sequence_segment(
                        curs, gene_genomic_gi,
                        binding_site_attr.bs_genome_start,
                        binding_site_attr.bs_genome_end)
                    if strand == '-':
                        # reverse_complement
                        seq = Seq(binding_site_attr.sequence)
                        binding_site_attr.sequence = seq.reverse_complement().tostring()
                    self.submit2binding_site(curs, binding_site_attr)
            except Exception:
                # Report and go on with the next row. KeyboardInterrupt and
                # SystemExit are deliberately not caught.
                sys.stdout.write("line_no: %s\n" % line_no)
                sys.stdout.write(line + "\n")
                traceback.print_exc()
    finally:
        inf.close()
    sys.stderr.write("Done.\n")
def sgd_regulatory_parse(self, curs, inputfile, gene_symbol2gene_id, tax_id):
    """
    Parse a sgd_regulatory file and submit every TF binding site found.

    Every data line must hold nine tab-separated columns:
    seqname, source, feature, start, end, score, strand, frame, attribute.
    Lines starting with '#' and blank lines are skipped, and only rows whose
    feature is 'TF_binding_site' are processed. The first three characters
    of seqname are dropped and the rest is converted with
    calculate_rome_number(); if that yields nothing (e.g. 'Mito') the
    unconverted text is kept as the chromosome.

    For each site the target genes are taken from return_target_gene_ls():
    genes touched by the site if there are any, otherwise the left + right
    upstream genes. A site with neither is skipped. One binding site record
    is submitted per target gene.

    A row that raises is reported (line number and raw line on stdout, the
    traceback on stderr) and parsing continues with the next row.

    Arguments:
        curs: database cursor handed to the query/submit helpers.
        inputfile: path of the file to parse.
        gene_symbol2gene_id: not used here; kept for interface compatibility.
        tax_id: taxonomy id used for the anchors, matrices and binding sites.

    2008-08-12 return_target_gene_ls() returns a different data structure.
    12-17-05
        --get_entrezgene_annotated_anchor()
        --calculate_rome_number()
        --parse_sgd_regulatory_attribute()
        --submit2matrix()
        --return_target_gene_ls()
        --find_binding_site_disp_coord()
        --get_sequence_segment()
        --submit2binding_site()
    """
    import traceback    # only needed to report rows that fail

    sys.stderr.write("Parsing sgd_regulatory ...\n")
    chromosome2anchor_gene_tuple_ls, gene_id2coord = get_entrezgene_annotated_anchor(curs, tax_id)
    mt_id_set = set()    # matrices already submitted
    line_no = 0
    inf = open(inputfile, 'r')
    try:
        for line in inf:
            line_no += 1
            if line.startswith('#') or not line.strip():
                # skip the comment lines and blank lines
                continue
            # strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character
            row = line.rstrip('\r\n').split('\t')
            try:
                seqname, source, feature, start, end, score, strand, frame, attribute = row
                if feature != 'TF_binding_site':
                    continue

                chromosome_rome_number = seqname[3:]
                binding_site_attr = binding_site_attribute(tax_id=tax_id)
                chromosome = calculate_rome_number(chromosome_rome_number)
                if chromosome:
                    # varchar type in database
                    binding_site_attr.chromosome = repr(chromosome)
                else:
                    # it might be something like Mito, return None
                    binding_site_attr.chromosome = chromosome_rome_number
                binding_site_attr.strand = strand
                mt_id, dbxref = self.parse_sgd_regulatory_attribute(attribute)
                if mt_id not in mt_id_set:
                    self.submit2matrix(curs, mt_id, tax_id)
                    mt_id_set.add(mt_id)
                binding_site_attr.mt_id = mt_id
                binding_site_attr.bs_genome_start = int(start)
                binding_site_attr.bs_genome_end = int(end)

                regulatory_coord = (binding_site_attr.chromosome,
                                    binding_site_attr.bs_genome_start,
                                    binding_site_attr.bs_genome_end)
                pdata = self.return_target_gene_ls(regulatory_coord,
                                                   chromosome2anchor_gene_tuple_ls,
                                                   gene_id2coord)
                if pdata.regulatory_touch_target_gene_ls:
                    target_gene_ls_type = 'touch'
                    target_gene_ls = pdata.regulatory_touch_target_gene_ls
                elif (pdata.regulatory_is_left_upstream_target_gene_ls
                        or pdata.regulatory_is_right_upstream_target_gene_ls):
                    target_gene_ls_type = 'upstream'
                    target_gene_ls = (pdata.regulatory_is_left_upstream_target_gene_ls
                                      + pdata.regulatory_is_right_upstream_target_gene_ls)
                else:
                    # no target gene: skip instead of reusing the previous
                    # row's genes (or hitting an unbound name on the first row)
                    continue

                binding_site_attr.comment = '%s;%s' % (dbxref, target_gene_ls_type)
                for target_gene_tuple in target_gene_ls:
                    anchor, gene_id = target_gene_tuple[:2]
                    binding_site_attr.prom_id = gene_id
                    gene_start, gene_stop, gene_strand, gene_genomic_gi = gene_id2coord[gene_id]
                    self.find_binding_site_disp_coord(binding_site_attr, gene_strand,
                                                      gene_start, gene_stop)
                    binding_site_attr.sequence = get_sequence_segment(
                        curs, gene_genomic_gi,
                        binding_site_attr.bs_genome_start,
                        binding_site_attr.bs_genome_end)
                    if strand == '-':
                        # reverse_complement
                        seq = Seq(binding_site_attr.sequence)
                        binding_site_attr.sequence = seq.reverse_complement().tostring()
                    self.submit2binding_site(curs, binding_site_attr)
            except Exception:
                # Report and go on with the next row. KeyboardInterrupt and
                # SystemExit are deliberately not caught.
                sys.stdout.write("line_no: %s\n" % line_no)
                sys.stdout.write(line + "\n")
                traceback.print_exc()
    finally:
        inf.close()
    sys.stderr.write("Done.\n")