Exemple #1
0
	def run(self):
		"""
		Entry point: fetch the annotated gene anchors and run find_SNP_context().
		
		A raw MySQLdb connection (autocommit switched on) is opened next to the
		Stock_250kDB handle. Its cursor is used by get_entrezgene_annotated_anchor()
		to read the genome.entrezgene_mapping and genome.annot_assembly tables for
		self.tax_id and is then passed on to self.find_SNP_context() together with
		the names of the Snps and SnpsContext tables.
		
		The cursor and the connection are closed when find_SNP_context() returns
		or raises, so the raw connection is not leaked.
		
		2008-08-13 2 elixir dbs are bad. it causes tables to be cross-created in the
			two databases. That is why the genome data is not fetched through a second
			elixir handle (transfac.src.GenomeDB.GenomeDatabase +
			getEntrezgeneAnnotatedAnchor) but through the plain MySQLdb cursor.
		"""
		import MySQLdb
		mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
		try:
			mysql_conn.autocommit(True)
			mysql_curs = mysql_conn.cursor()
			try:
				db = Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
					hostname=self.hostname, database=self.dbname)
				entrezgene_mapping_table = 'genome.entrezgene_mapping'
				annot_assembly_table = 'genome.annot_assembly'
				chromosome2anchor_gene_tuple_ls, gene_id2coord = get_entrezgene_annotated_anchor(mysql_curs, self.tax_id, \
					entrezgene_mapping_table, annot_assembly_table)
				self.find_SNP_context(db, mysql_curs, Snps.table.name, SnpsContext.table.name, \
					chromosome2anchor_gene_tuple_ls, gene_id2coord, \
					max_upstream_distance=self.max_upstream_distance, \
					max_downstream_distance=self.max_downstream_distance, \
					need_commit=self.commit, debug=self.debug)
			finally:
				#release the cursor even if the annotation step failed
				mysql_curs.close()
		finally:
			mysql_conn.close()
Exemple #2
0
	def ucsc_tfbs_conserved_parse(self, curs, inputfile, gene_symbol2gene_id, tax_id):
		"""
		Parse a tab-delimited ucsc_tfbs_conserved file and submit its binding sites.
		
		Lines starting with '#' are skipped. Every other line must split into 8
		columns: bin, chromosome, start, end, name, score, strand, zscore.
		The first 3 characters of the chromosome column are dropped (presumably
		the 'chr' prefix -- confirm against the input file) and chromosomes whose
		name contains 'random' are ignored. 'name' is used as the matrix id and is
		handed to submit2matrix() the first time it is seen. For each target gene
		reported by return_target_gene_ls() ('touch' genes take precedence over
		'upstream' ones) one binding site is submitted via submit2binding_site().
		
		Arguments:
			curs: database cursor, passed on to the helper functions.
			inputfile: path of the file to parse.
			gene_symbol2gene_id: not used by this parser.
			tax_id: taxonomy id given to get_entrezgene_annotated_anchor(),
				binding_site_attribute() and submit2matrix().
		
		A line that cannot be processed is reported (line number and content on
		stdout, exception on stderr) and parsing goes on with the next line.
		
		Fixes over the previous version:
			- the leftover call that still unpacked the pre-2008-08-12 return value
				of return_target_gene_ls() is gone.
			- a binding site without any touch/upstream target gene is skipped
				rather than reusing the target genes of the previous line.
			- the input file is closed in all cases.
			- KeyboardInterrupt/SystemExit are no longer swallowed.
		
		2008-08-12
			return_target_gene_ls() returns a different data structure.
		03-29-06
			
			--get_entrezgene_annotated_anchor()
			
			--submit2matrix()
			--return_target_gene_ls()
			--find_binding_site_disp_coord()
			--get_sequence_segment()
			--submit2binding_site()
		"""
		sys.stderr.write("Parsing ucsc_tfbs_conserved ...\n")
		chromosome2anchor_gene_tuple_ls, gene_id2coord = get_entrezgene_annotated_anchor(curs, tax_id)
		mt_id_set = Set()
		line_no = 0
		inf = open(inputfile, 'r')
		try:
			for line in inf:
				line_no += 1
				if line.startswith('#'):	#skip the comment lines
					continue
				#strip only the newline, a last line without one keeps its final character
				row = line.rstrip('\n').split('\t')
				try:
					bin_no, chromosome, start, end, name, score, strand, zscore = row
					chromosome = chromosome[3:]
					if chromosome.find('random')!=-1:	#03-29-06 not random sequences
						continue
					binding_site_attr = binding_site_attribute(tax_id=tax_id)
					binding_site_attr.chromosome = chromosome
					binding_site_attr.strand = strand
					mt_id = name
					if mt_id not in mt_id_set:
						self.submit2matrix(curs, mt_id, tax_id)
						mt_id_set.add(mt_id)
					binding_site_attr.mt_id = mt_id
					binding_site_attr.bs_genome_start = int(start)
					binding_site_attr.bs_genome_end = int(end)
					binding_site_attr.matrix_similarity_score = float(score)
					binding_site_attr.core_similarity_score = float(zscore)
					
					regulatory_coord = (binding_site_attr.chromosome, \
						binding_site_attr.bs_genome_start, binding_site_attr.bs_genome_end)
					pdata = self.return_target_gene_ls(regulatory_coord, chromosome2anchor_gene_tuple_ls, gene_id2coord)
					if pdata.regulatory_touch_target_gene_ls:
						target_gene_ls_type = 'touch'
						target_gene_ls = pdata.regulatory_touch_target_gene_ls
					elif pdata.regulatory_is_left_upstream_target_gene_ls or pdata.regulatory_is_right_upstream_target_gene_ls:
						target_gene_ls_type = 'upstream'
						target_gene_ls = pdata.regulatory_is_left_upstream_target_gene_ls + pdata.regulatory_is_right_upstream_target_gene_ls
					else:
						#no target gene for this site. must not fall through with the previous line's genes.
						continue
					
					binding_site_attr.comment = 'bin:%s. %s'%(bin_no, target_gene_ls_type)
					#the same attribute object is updated and submitted once per target gene
					for target_gene_tuple in target_gene_ls:
						gene_id = target_gene_tuple[1]
						binding_site_attr.prom_id = gene_id
						gene_start, gene_stop, gene_strand, gene_genomic_gi = gene_id2coord[gene_id]
						self.find_binding_site_disp_coord(binding_site_attr, gene_strand, gene_start, gene_stop)
						binding_site_attr.sequence = get_sequence_segment(curs, gene_genomic_gi, \
							binding_site_attr.bs_genome_start, binding_site_attr.bs_genome_end)
						if strand == '-':	#reverse_complement
							seq = Seq(binding_site_attr.sequence)
							binding_site_attr.sequence = seq.reverse_complement().tostring()
						self.submit2binding_site(curs, binding_site_attr)
				except Exception:
					#best effort: report the offending line together with the reason and carry on
					exc_type, exc_value = sys.exc_info()[:2]
					sys.stdout.write("line_no: %s\n"%line_no)
					sys.stdout.write("%s\n"%line)
					sys.stderr.write("Exception %s: %s\n"%(exc_type.__name__, exc_value))
		finally:
			inf.close()
		sys.stderr.write("Done.\n")
Exemple #3
0
	def sgd_regulatory_parse(self, curs, inputfile, gene_symbol2gene_id, tax_id):
		"""
		Parse a tab-delimited sgd_regulatory file and submit its TF binding sites.
		
		Lines starting with '#' are skipped. Every other line must split into 9
		columns: seqname, source, feature, start, end, score, strand, frame,
		attribute. Only lines whose feature is 'TF_binding_site' are used.
		seqname minus its first 3 characters is given to calculate_rome_number();
		a true result is stored as repr(result), otherwise (e.g. 'Mito') the
		string itself is kept as the chromosome. The matrix id and dbxref come
		from parse_sgd_regulatory_attribute(); a matrix id is handed to
		submit2matrix() the first time it is seen. For each target gene reported
		by return_target_gene_ls() ('touch' genes take precedence over 'upstream'
		ones) one binding site is submitted via submit2binding_site().
		
		Arguments:
			curs: database cursor, passed on to the helper functions.
			inputfile: path of the file to parse.
			gene_symbol2gene_id: not used by this parser.
			tax_id: taxonomy id given to get_entrezgene_annotated_anchor(),
				binding_site_attribute() and submit2matrix().
		
		A line that cannot be processed is reported (line number and content on
		stdout, exception on stderr) and parsing goes on with the next line.
		
		Fixes over the previous version:
			- a binding site without any touch/upstream target gene is skipped
				rather than reusing the target genes of the previous line.
			- the input file is closed in all cases.
			- KeyboardInterrupt/SystemExit are no longer swallowed.
		
		2008-08-12
			return_target_gene_ls() returns a different data structure.
		12-17-05
			
			--get_entrezgene_annotated_anchor()
			--calculate_rome_number()
			
			--parse_sgd_regulatory_attribute()
			--submit2matrix()
			--return_target_gene_ls()
			--find_binding_site_disp_coord()
			--get_sequence_segment()
			--submit2binding_site()
		"""
		sys.stderr.write("Parsing sgd_regulatory ...\n")
		chromosome2anchor_gene_tuple_ls, gene_id2coord = get_entrezgene_annotated_anchor(curs, tax_id)
		mt_id_set = Set()
		line_no = 0
		inf = open(inputfile, 'r')
		try:
			for line in inf:
				line_no += 1
				if line.startswith('#'):	#skip the comment lines
					continue
				#strip only the newline, a last line without one keeps its final character
				row = line.rstrip('\n').split('\t')
				try:
					seqname, source, feature, start, end, score, strand, frame, attribute = row
					if feature!='TF_binding_site':
						continue
					chromosome_rome_number = seqname[3:]
					binding_site_attr = binding_site_attribute(tax_id=tax_id)
					chromosome = calculate_rome_number(chromosome_rome_number)
					if chromosome:
						binding_site_attr.chromosome = repr(chromosome)	#varchar type in database
					else:	#it might be something like Mito, return None
						binding_site_attr.chromosome = chromosome_rome_number
					binding_site_attr.strand = strand
					mt_id, dbxref = self.parse_sgd_regulatory_attribute(attribute)
					if mt_id not in mt_id_set:
						self.submit2matrix(curs, mt_id, tax_id)
						mt_id_set.add(mt_id)
					binding_site_attr.mt_id = mt_id
					binding_site_attr.bs_genome_start = int(start)
					binding_site_attr.bs_genome_end = int(end)
					
					regulatory_coord = (binding_site_attr.chromosome, \
						binding_site_attr.bs_genome_start, binding_site_attr.bs_genome_end)
					pdata = self.return_target_gene_ls(regulatory_coord, chromosome2anchor_gene_tuple_ls, gene_id2coord)
					if pdata.regulatory_touch_target_gene_ls:
						target_gene_ls_type = 'touch'
						target_gene_ls = pdata.regulatory_touch_target_gene_ls
					elif pdata.regulatory_is_left_upstream_target_gene_ls or pdata.regulatory_is_right_upstream_target_gene_ls:
						target_gene_ls_type = 'upstream'
						target_gene_ls = pdata.regulatory_is_left_upstream_target_gene_ls + pdata.regulatory_is_right_upstream_target_gene_ls
					else:
						#no target gene for this site. must not fall through with the previous line's genes.
						continue
					
					binding_site_attr.comment = '%s;%s'%(dbxref, target_gene_ls_type)
					#the same attribute object is updated and submitted once per target gene
					for target_gene_tuple in target_gene_ls:
						gene_id = target_gene_tuple[1]
						binding_site_attr.prom_id = gene_id
						gene_start, gene_stop, gene_strand, gene_genomic_gi = gene_id2coord[gene_id]
						self.find_binding_site_disp_coord(binding_site_attr, gene_strand, gene_start, gene_stop)
						binding_site_attr.sequence = get_sequence_segment(curs, gene_genomic_gi, \
							binding_site_attr.bs_genome_start, binding_site_attr.bs_genome_end)
						if strand == '-':	#reverse_complement
							seq = Seq(binding_site_attr.sequence)
							binding_site_attr.sequence = seq.reverse_complement().tostring()
						self.submit2binding_site(curs, binding_site_attr)
				except Exception:
					#best effort: report the offending line together with the reason and carry on
					exc_type, exc_value = sys.exc_info()[:2]
					sys.stdout.write("line_no: %s\n"%line_no)
					sys.stdout.write("%s\n"%line)
					sys.stderr.write("Exception %s: %s\n"%(exc_type.__name__, exc_value))
		finally:
			inf.close()
		sys.stderr.write("Done.\n")