def test_intron_annotation(self):
    """sequences annotated with Introns should return correct seq"""
    # each case: gene symbol, transcript stable id, intron rank and the
    # expected first / last 10 bases of that intron (given in lower case)
    cases = [
        ('IL2', 'ENST00000226730', 1, 'gtaagtatat', 'actttcttag'),
        ('IL13', 'ENST00000304506', 3, 'gtaaggcatc', 'tgtcctgcag'),
    ]
    for symbol, stable_id, rank, exp_seq5, exp_seq3 in cases:
        matching_genes = self.human.getGenesMatching(Symbol=symbol)
        gene = asserted_one(matching_genes)
        annotated = gene.getAnnotatedSeq(feature_types='gene')
        # intron annotations are looked up by '<transcript id>-<rank>'
        intron_name = '%s-%d' % (stable_id, rank)
        intron = asserted_one(
            annotated.getAnnotationsMatching('intron', intron_name))
        covering = annotated.getRegionCoveringAll(intron)
        intron_seq = str(covering.getSlice())
        self.assertEqual(intron_seq[:10], exp_seq5.upper())
        self.assertEqual(intron_seq[-10:], exp_seq3.upper())
def test_intron_number(self):
    """number of introns should be correct"""
    # each case: gene stable id, transcript stable id, expected intron count
    cases = [
        ('ENSG00000227268', 'ENST00000445946', 0),
        ('ENSG00000132199', 'ENST00000319815', 8),
        ('ENSG00000132199', 'ENST00000383578', 15),
    ]
    for gene_id, transcript_id, exp_number in cases:
        gene = asserted_one(self.human.getGenesMatching(StableId=gene_id))
        candidates = [t for t in gene.Transcripts
                      if t.StableId == transcript_id]
        transcript = asserted_one(candidates)
        if exp_number != 0:
            self.assertEqual(len(transcript.Introns), exp_number)
            continue
        # for a transcript without introns the attribute is expected to be
        # None, not an empty collection
        self.assertEqual(transcript.Introns, None)
def test_intron_annotation(self):
    """sequences annotated with Introns should return correct seq"""
    # (gene symbol, transcript stable id, intron rank,
    #  expected 5' ten bases, expected 3' ten bases)
    expectations = (
        ('IL2', 'ENST00000226730', 1, 'gtaagtatat', 'actttcttag'),
        ('IL13', 'ENST00000304506', 3, 'gtaaggcatc', 'tgtcctgcag'),
    )
    for symbol, stable_id, rank, exp_seq5, exp_seq3 in expectations:
        gene = asserted_one(self.human.getGenesMatching(Symbol=symbol))
        gene_seq = gene.getAnnotatedSeq(feature_types='gene')
        # the annotation name combines the transcript id and intron rank
        label = '%s-%d' % (stable_id, rank)
        annotations = gene_seq.getAnnotationsMatching('intron', label)
        intron = asserted_one(annotations)
        region = gene_seq.getRegionCoveringAll(intron)
        intron_seq = str(region.getSlice())
        five_prime = intron_seq[:10]
        three_prime = intron_seq[-10:]
        self.assertEqual(five_prime, exp_seq5.upper())
        self.assertEqual(three_prime, exp_seq3.upper())
def test_intron_number(self):
    """number of introns should be correct"""
    # (gene stable id, transcript stable id, expected number of introns)
    expectations = (
        ('ENSG00000227268', 'ENST00000445946', 0),
        ('ENSG00000132199', 'ENST00000583771', 5),
        ('ENSG00000132199', 'ENST00000340116', 14),
    )
    for gene_id, transcript_id, exp_number in expectations:
        genes = self.human.getGenesMatching(StableId=gene_id)
        gene = asserted_one(genes)
        matching = [tr for tr in gene.Transcripts
                    if tr.StableId == transcript_id]
        transcript = asserted_one(matching)
        if exp_number == 0:
            # an intron-less transcript is expected to expose None
            self.assertEqual(transcript.Introns, None)
            continue
        self.assertEqual(len(transcript.Introns), exp_number)
def _get_exon_record(self):
    """Store this exon's row from the 'exon' table in self._table_rows.

    The row is selected by self.exon_id; a single matching row is expected.
    """
    # this will be called by _Region parent class to make the location
    table = self.db.getTable('exon')
    matches_exon = table.c.exon_id == self.exon_id
    rows = sql.select([table], matches_exon).execute().fetchall()
    self._table_rows['exon'] = asserted_one(rows)
def _get_exon_stable_id_record(self):
    """Store this exon's stable_id row in self._table_rows.

    Only the stable_id column of the 'exon_stable_id' table is selected,
    for the row whose exon_id equals self.exon_id; a single matching row
    is expected.
    """
    table = self.db.getTable('exon_stable_id')
    columns = [table.c.stable_id]
    matches_exon = table.c.exon_id == self.exon_id
    rows = sql.select(columns, matches_exon).execute().fetchall()
    self._table_rows['exon_stable_id'] = asserted_one(rows)
def _get_flanking_seq_data(self):
    """Cache the up- and down-stream flanking sequences under 'FlankingSeq'.

    The flanking_sequence row for this variation is stored in
    self._table_rows. A flank recorded in that row is turned into a DNA
    sequence; a missing flank is read from the genome using a region
    obtained by resizing self.Location. The cached value is the tuple
    (last 300 of up, first 300 of down).
    """
    # maps to flanking_sequence through variation_feature_id
    # if this fails, we grab from genomic sequence
    variation_id = self._table_rows['variation_feature']['variation_id']
    table = self.flanking_sequence_table
    query = sql.select([table], table.c.variation_id == variation_id)
    record = asserted_one(query.execute())
    self._table_rows['flanking_sequence'] = record

    up_seq = record['up_seq']
    down_seq = record['down_seq']
    # the following is because -- wait for it -- someone has entered the
    # string 'NULL' instead of NULL in the MySQL tables!!!
    if up_seq == 'NULL':
        up_seq = None
    if down_seq == 'NULL':
        down_seq = None

    seqs = dict(up=up_seq, down=down_seq)
    for name, seq in seqs.items():
        if seq is None:
            is_down = name == 'down'
            # the resize offsets swap between up and down on the - strand
            if self.Location.Strand == -1:
                resized = (-301, -1) if is_down else (1, 301)
            else:
                resized = (1, 301) if is_down else (-301, -1)
            flank = self.Location.resized(*resized)
            seq = self.genome.getRegion(region=flank).Seq
        else:
            seq = DNA.makeSequence(seq)
        seqs[name] = seq

    self._cached['FlankingSeq'] = (seqs['up'][-300:], seqs['down'][:300])
def _get_simple_features(self, db, klass, target_coord, query_coord, where_feature):
    """returns feature_type records for the query_coord from the
    simple_feature table. The returned coord is referenced to
    target_coord. At present, only CpG islands being queried.

    This is a generator: one klass instance is yielded per record, built
    with the record's location and score.
    """
    table = db.getTable('simple_feature')
    feature_types = ['CpGisland']
    # analysis ids are passed to the query as strings
    feature_type_ids = []
    for feature_type in feature_types:
        feature_type_ids.append(str(self._feature_type_ids.get(feature_type)))
    # fix the following
    condition = sql.and_(
        table.c.analysis_id.in_(feature_type_ids),
        table.c.seq_region_id == query_coord.seq_region_id)
    query = sql.select([table], condition)
    query = location_query(table, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=query,
                           where=where_feature)
    for record in query.execute():
        location = Coordinate(self, CoordName=query_coord.CoordName,
                              Start=record['seq_region_start'],
                              End=record['seq_region_end'],
                              seq_region_id=record['seq_region_id'],
                              Strand=record['seq_region_strand'],
                              ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            conversions = get_coord_conversion(
                location, target_coord.CoordType, self.CoreDb)
            # element [1] of the single conversion replaces the location
            location = asserted_one(conversions)[1]
        # coord = coord.makeRelativeTo(query_coord)
        # TODO: fix here if query_coord and target_coord have different
        # coordName
        # coord = coord.makeRelativeTo(target_coord, False)
        yield klass(self, db, Location=location, Score=record['score'])
def _get_repeat_features(self, db, klass, target_coord, query_coord, where_feature):
    """returns Repeat region instances

    This is a generator: one klass instance is yielded per repeat_feature
    record on query_coord's seq_region, carrying the record's location,
    score and the record itself as data.
    """
    # we build repeats using coordinates from repeat_feature table
    # the repeat_consensus_id is required to get the repeat name, class
    # and type
    table = db.getTable("repeat_feature")
    on_region = table.c.seq_region_id == query_coord.seq_region_id
    query = location_query(
        table,
        query_coord.EnsemblStart,
        query_coord.EnsemblEnd,
        query=sql.select([table], on_region),
        where=where_feature,
    )
    for record in query.execute():
        location = Coordinate(
            self,
            CoordName=query_coord.CoordName,
            Start=record["seq_region_start"],
            End=record["seq_region_end"],
            seq_region_id=record["seq_region_id"],
            Strand=record["seq_region_strand"],
            ensembl_coord=True,
        )
        if query_coord.CoordName != target_coord.CoordName:
            conversions = get_coord_conversion(
                location, target_coord.CoordType, self.CoreDb)
            # element [1] of the single conversion replaces the location
            location = asserted_one(conversions)[1]
        # coord = coord.makeRelativeTo(query_coord)
        # TODO: fix here if query_coord and target_coord have different
        # coordName
        # coord = coord.makeRelativeTo(target_coord, False)
        yield klass(self, db, Location=location, Score=record["score"],
                    data=record)
def _get_repeat_features(self, db, klass, target_coord, query_coord, where_feature):
    """returns Repeat region instances

    Generator yielding one klass instance per repeat_feature record found
    on query_coord's seq_region; each is given the record's location,
    its score and the record itself as data.
    """
    # we build repeats using coordinates from repeat_feature table
    # the repeat_consensus_id is required to get the repeat name, class
    # and type
    repeats = db.getTable('repeat_feature')
    same_region = repeats.c.seq_region_id == query_coord.seq_region_id
    base_query = sql.select([repeats], same_region)
    query = location_query(repeats, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=base_query,
                           where=where_feature)
    for record in query.execute():
        feature_coord = Coordinate(self,
                                   CoordName=query_coord.CoordName,
                                   Start=record['seq_region_start'],
                                   End=record['seq_region_end'],
                                   seq_region_id=record['seq_region_id'],
                                   Strand=record['seq_region_strand'],
                                   ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            # keep element [1] of the single conversion result
            converted = asserted_one(
                get_coord_conversion(feature_coord, target_coord.CoordType,
                                     self.CoreDb))
            feature_coord = converted[1]
        # coord = coord.makeRelativeTo(query_coord)
        # TODO: fix here if query_coord and target_coord have different
        # coordName
        # coord = coord.makeRelativeTo(target_coord, False)
        yield klass(self, db, Location=feature_coord,
                    Score=record['score'], data=record)
def _get_gene_features(self, db, klass, target_coord, query_coord, where_feature):
    """returns all genes

    Generator yielding one klass instance per gene record found on
    query_coord's seq_region. Each instance receives the record as data
    and a Coordinate built from the record's seq_region columns. When
    query_coord and target_coord have different CoordName, that coordinate
    is converted to target_coord.CoordType first, as the other feature
    getters do.
    """
    # the xref table is only used for 'core' databases
    xref_table = db.getTable('xref') if db.Type == 'core' else None
    gene_table = db.getTable('gene')

    # after release 65, the gene_id_table is removed. The following is to
    # maintain support for earlier releases.
    if self.GeneralRelease >= 65:
        gene_id_table = None
    else:
        gene_id_table = db.getTable('gene_stable_id')

    # note gene records are at chromosome, not contig, level
    condition = gene_table.c.seq_region_id == query_coord.seq_region_id
    query = self._build_gene_query(db, condition, gene_table, gene_id_table,
                                   xref_table)
    query = location_query(gene_table, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=query,
                           where=where_feature)

    needs_conversion = query_coord.CoordName != target_coord.CoordName
    for record in query.execute():
        location = Coordinate(self, CoordName=query_coord.CoordName,
                              Start=record['seq_region_start'],
                              End=record['seq_region_end'],
                              Strand=record['seq_region_strand'],
                              seq_region_id=record['seq_region_id'],
                              ensembl_coord=True)
        if needs_conversion:
            # The previous code converted an unassigned name here, raising
            # UnboundLocalError, and then discarded the result. Convert the
            # coordinate just built and use it.
            location = asserted_one(
                get_coord_conversion(location, target_coord.CoordType,
                                     self.CoreDb))[1]
        yield klass(self, db, Location=location, data=record)
def _get_seq_region_record(self, seq_region_id):
    """Return the 'seq_region' table row whose seq_region_id matches.

    A single matching row is expected.
    """
    # should this be on a parent class? or a generic function in assembly?
    table = self.db.getTable('seq_region')
    matches_region = table.c.seq_region_id == seq_region_id
    result = sql.select([table], matches_region).execute()
    return asserted_one(result)
def _get_transcript_stable_id_record(self):
    """Store the row holding this transcript's stable id in self._table_rows.

    The table name comes from self._attr_ensembl_table_map['StableId'].
    Nothing is queried when a row for that table is already stored.
    """
    table_name = self._attr_ensembl_table_map['StableId']
    if table_name not in self._table_rows:
        transcript_id = self.transcript_id
        table = self.db.getTable(table_name)
        matches = table.c.transcript_id == transcript_id
        result = sql.select([table], matches).execute()
        self._table_rows[table_name] = asserted_one(result)
def _get_cpg_island_analysis_id(self):
    """Store and cache the analysis_id whose display_label is like '%CpG%'.

    The row from the core database's analysis_description table is kept in
    self._table_rows, then its analysis_id is cached as 'CpGisland'.
    """
    table = self.genome.CoreDb.getTable("analysis_description")
    label_matches = table.c.display_label.like("%CpG%")
    result = sql.select([table.c.analysis_id], label_matches).execute()
    self._table_rows["analysis_description"] = asserted_one(result)

    def quoted_limited(value):
        # display form: quoted and limited to two words
        return DisplayString(value, with_quotes=True, num_words=2)

    attr_column_map = [("CpGisland", "analysis_id", quoted_limited)]
    self._populate_cache_from_record(attr_column_map, "analysis_description")
def _get_cigar_record(self):
    """Fetch and cache the cigar_line for the cached genomic_align_id.

    Returns the fetched record; its cigar_line is also stored in
    self._cached['cigar_line'].
    """
    compara_db = self.parent.compara.ComparaDb
    table = compara_db.getTable('genomic_align')
    columns = [table.c.cigar_line]
    matches = table.c.genomic_align_id == self._cached['genomic_align_id']
    record = asserted_one(sql.select(columns, matches).execute())
    self._cached['cigar_line'] = record['cigar_line']
    return record
def _get_seq_region_id(self, CoordName):
    """returns the seq_region_id for the provided CoordName

    Only seq_region rows whose coord_system_id is among the non-string
    keys of the core database's CoordSystem are considered, and a single
    matching row is expected.
    """
    seq_region_table = self.CoreDb.getTable('seq_region')
    coord_systems = CoordSystem(core_db=self.CoreDb)
    # isinstance (rather than an exact type() comparison) also excludes
    # subclasses of the string types from the id list
    coord_system_ids = [k for k in coord_systems
                        if not isinstance(k, (unicode, str))]
    query = sql.select(
        [seq_region_table.c.seq_region_id],
        sql.and_(seq_region_table.c.name == CoordName,
                 seq_region_table.c.coord_system_id.in_(coord_system_ids)))
    record = asserted_one(query.execute().fetchall())
    return record['seq_region_id']
def _get_seq_region_id(self, CoordName):
    """returns the seq_region_id for the provided CoordName

    Only seq_region rows whose coord_system_id is among the non-string
    keys of the core database's CoordSystem are considered, and a single
    matching row is expected.
    """
    table = self.CoreDb.getTable('seq_region')
    coord_systems = CoordSystem(core_db=self.CoreDb)
    # string keys are skipped, leaving the values used as coord system ids
    coord_system_ids = []
    for key in coord_systems:
        if not isinstance(key, str):
            coord_system_ids.append(key)
    named = table.c.name == CoordName
    in_coord_systems = table.c.coord_system_id.in_(coord_system_ids)
    query = sql.select([table.c.seq_region_id],
                       sql.and_(named, in_coord_systems))
    row = asserted_one(query.execute().fetchall())
    return row['seq_region_id']
def _get_translation_record(self):
    """Store this transcript's 'translation' row in self._table_rows.

    When the lookup raises NoItemError, null values are set for
    'TranslatedExons' instead and nothing is stored.
    """
    transcript_id = self.transcript_id
    table = self.db.getTable('translation')
    query = sql.select([table], table.c.transcript_id == transcript_id)
    try:
        record = asserted_one(query.execute())
    except NoItemError:
        self._set_null_values(['TranslatedExons'], 'translation')
    else:
        self._table_rows['translation'] = record
def test_intron(self):
    """should get correct Intron sequence, regardless of strand"""
    # IL2 is on - strand, IL13 is on + strand, both have three introns
    # each expected intron: (rank, start, end, first 10 bases, last 10 bases)
    IL2_exp_introns = [
        (1, 122456203, 122456293, 'gtaagtatat', 'actttcttag'),
        (2, 122453853, 122456143, 'gtaagtacaa', 'attattctag'),
        (3, 122451862, 122453709, 'gtaaggcatt', 'tcttttatag'),
    ]
    IL13_exp_introns = [
        (1, 132658360, 132659417, 'gtgagtgtcg', 'gctcccacag'),
        (2, 132659471, 132659723, 'gtaaggacct', 'ctccccacag'),
        (3, 132659828, 132660174, 'gtaaggcatc', 'tgtcctgcag'),
    ]
    cases = [
        ('IL2', 'ENST00000226730', IL2_exp_introns),
        ('IL13', 'ENST00000304506', IL13_exp_introns),
    ]
    for symbol, stable_id, exp_introns in cases:
        gene = asserted_one(self.human.getGenesMatching(Symbol=symbol))
        strand = gene.Location.Strand
        candidates = [t for t in gene.Transcripts if t.StableId == stable_id]
        transcript = asserted_one(candidates)
        introns = transcript.Introns
        # equal lengths are asserted first, so pairing by position is safe
        self.assertEqual(len(introns), len(exp_introns))
        for intron, expected in zip(introns, exp_introns):
            loc = intron.Location
            start, end = loc.Start, loc.End
            seq = str(intron.Seq)
            exp_rank, exp_start, exp_end, exp_seq5, exp_seq3 = expected
            self.assertEqual(loc.Strand, strand)
            # test the order using rank
            self.assertEqual(intron.Rank, exp_rank)
            # test position
            self.assertEqual(start, exp_start)
            self.assertEqual(end, exp_end)
            # test sequence
            self.assertEqual(seq[:10], exp_seq5.upper())
            self.assertEqual(seq[-10:], exp_seq3.upper())
def _get_repeat_consensus_record(self):
    """Store the repeat_consensus row for this repeat and cache its attributes.

    The row is selected with the repeat_consensus_id held in the stored
    repeat_feature row. Symbol, RepeatClass, RepeatType and Consensus are
    then cached from it.
    """
    table = self.db.getTable('repeat_consensus')
    consensus_id = self._table_rows['repeat_feature']['repeat_consensus_id']
    query = sql.select([table], table.c.repeat_consensus_id == consensus_id)
    self._table_rows['repeat_consensus'] = asserted_one(
        query.execute().fetchall())

    def limit_length(value):
        # the consensus is displayed with repr_length=10
        return DisplayString(value, repr_length=10)

    attr_column_map = [
        ('Symbol', 'repeat_name', _quoted),
        ('RepeatClass', 'repeat_class', _quoted),
        ('RepeatType', 'repeat_type', _quoted),
        ('Consensus', 'repeat_consensus', limit_length),
    ]
    self._populate_cache_from_record(attr_column_map, 'repeat_consensus')
def _get_cpg_island_analysis_id(self):
    """Store and cache the analysis_id whose display_label is like '%CpG%'.

    The row from the core database's analysis_description table is kept in
    self._table_rows, then its analysis_id is cached as 'CpGisland'.
    """
    core_db = self.genome.CoreDb
    descriptions = core_db.getTable('analysis_description')
    is_cpg = descriptions.c.display_label.like('%CpG%')
    query = sql.select([descriptions.c.analysis_id], is_cpg)
    self._table_rows['analysis_description'] = asserted_one(query.execute())

    def quoted_limited(value):
        # display form: quoted and limited to two words
        return DisplayString(value, with_quotes=True, num_words=2)

    self._populate_cache_from_record(
        [('CpGisland', 'analysis_id', quoted_limited)],
        'analysis_description')
def _get_seq_region_record(self, CoordName):
    """Store the 'seq_region' row named CoordName in self._table_rows.

    Only rows whose coord_system_id is among the non-string keys of the
    core database's CoordSystem are considered, and a single matching row
    is expected.
    """
    # override the _Region class method, since, we take the provided Start
    # etc .. attributes
    # CoordName comes from seq_region_table.c.name
    # matched, by coord_system_id, to default coord system
    table = self.genome.db.getTable('seq_region')
    coord_systems = CoordSystem(core_db=self.genome.CoreDb)
    coord_system_ids = []
    for key in coord_systems:
        if not isinstance(key, str):
            coord_system_ids.append(key)
    condition = sql.and_(table.c.name == CoordName,
                         table.c.coord_system_id.in_(coord_system_ids))
    rows = sql.select([table], condition).execute().fetchall()
    self._table_rows['seq_region'] = asserted_one(rows)
def test_intron(self):
    """should get correct Intron sequence, regardless of strand"""
    # IL2 is on - strand, IL13 is on + strand, both have three introns
    # each expected intron: (rank, start, end, first 10 bases, last 10 bases)
    IL2_exp_introns = [
        (1, 123377358, 123377448, 'gtaagtatat', 'actttcttag'),
        (2, 123375008, 123377298, 'gtaagtacaa', 'attattctag'),
        (3, 123373017, 123374864, 'gtaaggcatt', 'tcttttatag'),
    ]
    IL13_exp_introns = [
        (1, 131994052, 131995109, 'gtgagtgtcg', 'gctcccacag'),
        (2, 131995163, 131995415, 'gtaaggacct', 'ctccccacag'),
        (3, 131995520, 131995866, 'gtaaggcatc', 'tgtcctgcag'),
    ]
    expectations = (
        ('IL2', 'ENST00000226730', IL2_exp_introns),
        ('IL13', 'ENST00000304506', IL13_exp_introns),
    )
    for symbol, stable_id, exp_introns in expectations:
        gene = asserted_one(self.human.getGenesMatching(Symbol=symbol))
        strand = gene.Location.Strand
        matching = [tr for tr in gene.Transcripts
                    if tr.StableId == stable_id]
        transcript = asserted_one(matching)
        introns = transcript.Introns
        # equal lengths are asserted first, so pairing by position is safe
        self.assertEqual(len(introns), len(exp_introns))
        for intron, expected in zip(introns, exp_introns):
            loc = intron.Location
            start, end = loc.Start, loc.End
            seq = str(intron.Seq)
            exp_rank, exp_start, exp_end, exp_seq5, exp_seq3 = expected
            self.assertEqual(loc.Strand, strand)
            # test the order using rank
            self.assertEqual(intron.Rank, exp_rank)
            # test position
            self.assertEqual(start, exp_start)
            self.assertEqual(end, exp_end)
            # test sequence
            self.assertEqual(seq[:10], exp_seq5.upper())
            self.assertEqual(seq[-10:], exp_seq3.upper())
def __init__(self, genome, db, StableId=None, Symbol=None, Location=None, data=None):
    """constructed by a genome instance

    When data is not supplied, the gene record is queried by StableId, or
    by Symbol if StableId is None. The record is then stored against each
    attribute's table and the corresponding populate method is called.
    """
    super(Gene, self).__init__(genome, db, Location=Location)

    if data is None:
        # StableId takes precedence over Symbol when both are given
        if StableId is None:
            args = dict(Symbol=Symbol)
        else:
            args = dict(StableId=StableId)
        assert args
        result = self.genome._get_gene_query(db, **args).execute()
        data = asserted_one(list(result))

    populators = [
        ('StableId', self._get_gene_stable_id_record),
        ('BioType', self._get_gene_record),
        ('Description', self._get_gene_record),
        ('Symbol', self._get_xref_record),
        ('Location', self._get_gene_record),
    ]
    for name, populate in populators:
        if name == 'Symbol' and 'display_label' not in data.keys():
            # For EST
            continue
        table_name = self._attr_ensembl_table_map[name]
        self._table_rows[table_name] = data
        populate()  # this populates the attributes
def assembly_exception_coordinate(loc):
    """returns a coordinate conversion for one with an assembly exception

    The assembly_exception row for loc's seq_region is joined to the name
    of its exc_seq_region and restricted to loc's Start..End. A single
    matching row is expected; the converted location derived from it is
    returned.
    """
    genome = loc.genome
    exceptions = genome.CoreDb.getTable('assembly_exception')
    seq_regions = genome.CoreDb.getTable('seq_region')
    on_loc_region = exceptions.c.seq_region_id == loc.seq_region_id
    joined = exceptions.c.exc_seq_region_id == seq_regions.c.seq_region_id
    query = sql.select([exceptions, seq_regions.c.name],
                       sql.and_(on_loc_region, joined))
    query = location_query(exceptions, loc.Start, loc.End, query=query)
    record = asserted_one(query.execute().fetchall())
    equivalent = _get_equivalent_coords(loc, record, "seq_region",
                                        "exc_seq_region", loc.CoordType)
    # the helper returns a pair; only the second element is used
    _, conv_loc = equivalent
    return conv_loc
def _make_location(self):
    """Build a Coordinate from the stored location row and cache it as 'Location'.

    Does nothing when the row for the 'Location' table is None. Column
    names are formed from self._location_column_prefix; the coordinate
    name is read from the 'seq_region' table.
    """
    table_name = self._attr_ensembl_table_map['Location']
    row = self._table_rows[table_name]
    if row is None:
        return

    prefix = self._location_column_prefix
    seq_region_id = row['%sid' % prefix]
    start = row['%sstart' % prefix]
    end = row['%send' % prefix]
    strand = row['%sstrand' % prefix]

    seq_regions = self.db.getTable('seq_region')
    query = sql.select([seq_regions.c.name],
                       seq_regions.c.seq_region_id == seq_region_id)
    name_row = asserted_one(query.execute().fetchall())
    self._cached['Location'] = Coordinate(genome=self.genome,
                                          CoordName=name_row['name'],
                                          Start=start, End=end,
                                          Strand=strand,
                                          seq_region_id=seq_region_id,
                                          ensembl_coord=True)
def _get_coord_type_and_seq_region_id(coord_name, core_db):
    """Return (seq_region row, coord type name) for coord_name.

    When asserted_one raises ValueError for the matching rows, the first
    row whose coord_system_id is in the species' CoordSystem is used.
    Raises ValueError if no such row exists.
    """
    table = core_db.getTable('seq_region')
    named = table.c.name == coord_name
    rows = sql.select([table]).where(named).execute().fetchall()
    species_coord_sys = CoordSystem(species=core_db.db_name.Species,
                                    core_db=core_db)
    try:
        selected_row = asserted_one(rows)
    except ValueError:
        # presumably more than one row matched -- confirm against
        # asserted_one
        for row in rows:
            # rows outside species_coord_sys are not a default_version
            if row['coord_system_id'] in species_coord_sys:
                selected_row = row
                break
        else:
            raise ValueError("Ambigous coordinate name: %s" % coord_name)
    coord_type = species_coord_sys[selected_row['coord_system_id']].name
    return selected_row, coord_type
def _get_coord_type_and_seq_region_id(coord_name, core_db):
    """Return (seq_region row, coord type name) for coord_name.

    coord_name is converted with str() for the lookup. When asserted_one
    raises ValueError for the matching rows, the first row whose
    coord_system_id is in the species' CoordSystem is used. Raises
    ValueError if no such row exists.
    """
    seq_regions = core_db.getTable('seq_region')
    name_matches = seq_regions.c.name == str(coord_name)
    query = sql.select([seq_regions]).where(name_matches)
    rows = query.execute().fetchall()
    species_coord_sys = CoordSystem(species=core_db.db_name.Species,
                                    core_db=core_db)
    try:
        selected_row = asserted_one(rows)
    except ValueError:
        # presumably more than one row matched -- confirm against
        # asserted_one
        for candidate in rows:
            # rows outside species_coord_sys are not a default_version
            if candidate['coord_system_id'] in species_coord_sys:
                selected_row = candidate
                break
        else:
            raise ValueError("Ambigous coordinate name: %s" % coord_name)
    coord_system = species_coord_sys[selected_row['coord_system_id']]
    return selected_row, coord_system.name
def _get_simple_features(self, db, klass, target_coord, query_coord, where_feature):
    """returns feature_type records for the query_coord from the
    simple_feature table. The returned coord is referenced to
    target_coord. At present, only CpG islands being queried.

    This is a generator: one klass instance is yielded per record, built
    with the record's location and score.
    """
    features = db.getTable('simple_feature')
    feature_types = ['CpGisland']
    feature_type_ids = []
    for feature_type in feature_types:
        feature_type_ids.append(self._feature_type_ids.get(feature_type))
    # fix the following
    wanted_analysis = features.c.analysis_id.in_(feature_type_ids)
    same_region = features.c.seq_region_id == query_coord.seq_region_id
    query = sql.select([features], sql.and_(wanted_analysis, same_region))
    query = location_query(features, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=query,
                           where=where_feature)
    for record in query.execute():
        feature_coord = Coordinate(self,
                                   CoordName=query_coord.CoordName,
                                   Start=record['seq_region_start'],
                                   End=record['seq_region_end'],
                                   seq_region_id=record['seq_region_id'],
                                   Strand=record['seq_region_strand'],
                                   ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            # keep element [1] of the single conversion result
            converted = asserted_one(
                get_coord_conversion(feature_coord, target_coord.CoordType,
                                     self.CoreDb))
            feature_coord = converted[1]
        # coord = coord.makeRelativeTo(query_coord)
        # TODO: fix here if query_coord and target_coord have different
        # coordName
        # coord = coord.makeRelativeTo(target_coord, False)
        yield klass(self, db, Location=feature_coord, Score=record['score'])
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Assemble the sequence for coord from 'dna' table fragments.

    coord.Strand is set to 1 as a side effect. coord is converted to the
    CoordSystem built with seq_level=True; each resulting fragment is
    sliced from the 'dna' table, reverse complemented when its target
    strand is -1, and the pieces are combined by _assemble_seq. Raises
    NoItemError when the conversion yields nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species, core_db=genome.CoreDb,
                             seq_level=True)
    if DEBUG:
        print('Created Coordinate:', coord, coord.EnsemblStart,
              coord.EnsemblEnd)
        print(coord.CoordType, coord_type)

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)
    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    seqs = []
    positions = []
    for q_loc, t_loc in assemblies:
        assert q_loc.Strand == 1
        length = len(t_loc)
        # get MySQL to do the string slicing via substr function
        sliced = substr(dna.c.sequence, t_loc.EnsemblStart, length)
        query = sql.select([sliced.label('sequence')],
                           dna.c.seq_region_id == t_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        fragment = DNA.makeSequence(record['sequence'])
        if t_loc.Strand == -1:
            fragment = fragment.rc()
        seqs.append(str(fragment))
        positions.append((q_loc.Start, q_loc.End))

    return _assemble_seq(seqs, coord.Start, coord.End, positions)
def _get_dnafrag_id_for_coord(self, coord):
    """returns the dnafrag_id for the coordinate

    The dnafrag row is matched on coord.CoordName and on the genome_db
    entry named after the coordinate's species. Raises RuntimeError when
    no row is found.
    """
    dnafrag_table = self.ComparaDb.getTable('dnafrag')
    genome_db_table = self.ComparaDb.getTable('genome_db')

    # column renamed between versions
    prefix = coord.genome.Species.lower()
    if int(self.Release) > 58:
        prefix = _Species.getEnsemblDbPrefix(prefix)

    query = sql.select(
        [dnafrag_table.c.dnafrag_id, dnafrag_table.c.coord_system_name],
        sql.and_(
            dnafrag_table.c.genome_db_id == genome_db_table.c.genome_db_id,
            genome_db_table.c.name == prefix,
            dnafrag_table.c.name == coord.CoordName))
    # only the lookup itself is guarded
    try:
        record = asserted_one(query.execute().fetchall())
    except NoItemError:
        # call form of raise: valid on both Python 2 and Python 3
        raise RuntimeError('No DNA fragment identified')
    return record['dnafrag_id']
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Assemble the sequence for coord from 'dna' table fragments.

    coord.Strand is set to 1 as a side effect. coord is converted to the
    CoordSystem built with seq_level=True; each resulting fragment is
    sliced from the 'dna' table, reverse complemented when its target
    strand is -1, and the pieces are combined by _assemble_seq. Raises
    NoItemError when the conversion yields nothing.

    With DEBUG true, the coordinate and coord types are printed.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species, core_db=genome.CoreDb,
                             seq_level=True)

    if DEBUG:
        # single pre-formatted argument: same space separated output as the
        # old print statements, and valid on both Python 2 and Python 3
        print('Created Coordinate: %s %s %s'
              % (coord, coord.EnsemblStart, coord.EnsemblEnd))
        print('%s %s' % (coord.CoordType, coord_type))

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)
    if not assemblies:
        # call form of raise: valid on both Python 2 and Python 3
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    seqs, positions = [], []
    for q_loc, t_loc in assemblies:
        assert q_loc.Strand == 1
        length = len(t_loc)
        # get MySQL to do the string slicing via substr function
        query = sql.select(
            [substr(dna.c.sequence, t_loc.EnsemblStart,
                    length).label('sequence')],
            dna.c.seq_region_id == t_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        seq = DNA.makeSequence(record['sequence'])
        if t_loc.Strand == -1:
            seq = seq.rc()
        seqs.append(str(seq))
        positions.append((q_loc.Start, q_loc.End))

    return _assemble_seq(seqs, coord.Start, coord.End, positions)