Ejemplo n.º 1
0
 def _get_repeat_features(self, db, klass, target_coord, query_coord,
                          where_feature):
     """Yield one klass instance per matching row of the repeat_feature table.

     Rows are selected on query_coord's seq_region_id and then narrowed by
     location_query using query_coord's Ensembl start/end and where_feature.
     Each row's coordinates become the Location of the yielded instance; the
     row's score and the row itself are passed along as Score and data.
     """
     # The repeat name, class and type are looked up later from the
     # repeat_consensus_id carried in the row (passed on as `data`).
     table = db.getTable('repeat_feature')
     same_region = table.c.seq_region_id == query_coord.seq_region_id
     base_select = sql.select([table], same_region)
     query = location_query(table,
                            query_coord.EnsemblStart,
                            query_coord.EnsemblEnd,
                            query=base_select,
                            where=where_feature)
     for row in query.execute():
         location = Coordinate(self,
                               CoordName=query_coord.CoordName,
                               Start=row['seq_region_start'],
                               End=row['seq_region_end'],
                               seq_region_id=row['seq_region_id'],
                               Strand=row['seq_region_strand'],
                               ensembl_coord=True)
         if query_coord.CoordName != target_coord.CoordName:
             # re-express the location in the target coordinate type;
             # exactly one conversion is expected and its second member is
             # the converted coordinate
             conversions = get_coord_conversion(location,
                                                target_coord.CoordType,
                                                self.CoreDb)
             location = asserted_one(conversions)[1]
         # TODO: the location is not yet made relative to query_coord /
         # target_coord (makeRelativeTo) when their coord names differ
         yield klass(self,
                     db,
                     Location=location,
                     Score=row['score'],
                     data=row)
Ejemplo n.º 2
0
 def _get_simple_features(self, db, klass, target_coord, query_coord,
                          where_feature):
     """Yield klass instances for simple_feature rows found on query_coord.

     Only the 'CpGisland' feature type is queried at present. Each yielded
     instance receives the row's coordinates as Location (converted to
     target_coord's coordinate type when the coord names differ) and the
     row's score as Score.
     """
     table = db.getTable('simple_feature')
     # analysis ids are handed to the query as strings
     # NOTE(review): a feature type missing from _feature_type_ids would
     # presumably become the string 'None' here -- confirm this is intended
     analysis_ids = [str(self._feature_type_ids.get(name))
                     for name in ['CpGisland']]
     # TODO: the original author flagged this select as needing a fix
     wanted = sql.and_(
         table.c.analysis_id.in_(analysis_ids),
         table.c.seq_region_id == query_coord.seq_region_id)
     base_select = sql.select([table], wanted)
     query = location_query(table, query_coord.EnsemblStart,
                            query_coord.EnsemblEnd, query=base_select,
                            where=where_feature)
     for row in query.execute():
         location = Coordinate(self, CoordName=query_coord.CoordName,
                               Start=row['seq_region_start'],
                               End=row['seq_region_end'],
                               seq_region_id=row['seq_region_id'],
                               Strand=row['seq_region_strand'],
                               ensembl_coord=True)
         if query_coord.CoordName != target_coord.CoordName:
             conversions = get_coord_conversion(location,
                                                target_coord.CoordType,
                                                self.CoreDb)
             location = asserted_one(conversions)[1]

         # TODO: the location is not yet made relative to query_coord /
         # target_coord (makeRelativeTo) when their coord names differ
         yield klass(self, db, Location=location, Score=row['score'])
Ejemplo n.º 3
0
 def _get_gene_features(self, db, klass, target_coord, query_coord,
                        where_feature):
     """Yield a klass instance for every gene record found on query_coord.

     Gene rows are selected on query_coord's seq_region_id and narrowed by
     location_query using query_coord's Ensembl start/end and where_feature.
     Each yielded instance receives the row's coordinates as Location
     (converted to target_coord's coordinate type when the coord names
     differ) and the row itself as data.
     """
     # the xref table is only used for core databases, so only fetch it then
     xref_table = db.getTable('xref') if db.Type == 'core' else None
     gene_table = db.getTable('gene')

     # after release 65, the gene_stable_id table is removed. The following
     # is to maintain support for earlier releases.
     if self.GeneralRelease >= 65:
         gene_id_table = None
     else:
         gene_id_table = db.getTable('gene_stable_id')

     # note gene records are at chromosome, not contig, level
     condition = gene_table.c.seq_region_id == query_coord.seq_region_id
     query = self._build_gene_query(db, condition, gene_table,
                                    gene_id_table, xref_table)
     query = location_query(gene_table, query_coord.EnsemblStart,
                            query_coord.EnsemblEnd, query=query,
                            where=where_feature)

     for record in query.execute():
         location = Coordinate(self, CoordName=query_coord.CoordName,
                               Start=record['seq_region_start'],
                               End=record['seq_region_end'],
                               Strand=record['seq_region_strand'],
                               seq_region_id=record['seq_region_id'],
                               ensembl_coord=True)
         if query_coord.CoordName != target_coord.CoordName:
             # Convert the record's own location. The previous code passed
             # an undefined name here (NameError) and never used the
             # converted result; this now mirrors the other
             # _get_*_features methods.
             location = asserted_one(
                 get_coord_conversion(location, target_coord.CoordType,
                                      self.CoreDb))[1]

         yield klass(self, db, Location=location, data=record)
Ejemplo n.º 4
0
 def _get_repeat_features(self, db, klass, target_coord, query_coord, where_feature):
     """Generate klass instances from repeat_feature rows located on query_coord.

     The select is restricted to query_coord's seq_region_id and then passed
     through location_query with query_coord's Ensembl start/end and
     where_feature. Every resulting row is turned into a Coordinate, which is
     converted to target_coord's coordinate type when the two coord names
     differ, and handed to klass together with the row's score and the row.
     """
     # Coordinates come from the repeat_feature table; the row's
     # repeat_consensus_id is what later resolves the repeat name, class
     # and type, hence the whole row is forwarded as `data`.
     repeats = db.getTable("repeat_feature")
     region_filter = repeats.c.seq_region_id == query_coord.seq_region_id
     query = location_query(
         repeats,
         query_coord.EnsemblStart,
         query_coord.EnsemblEnd,
         query=sql.select([repeats], region_filter),
         where=where_feature,
     )
     for row in query.execute():
         coord_kwargs = dict(
             CoordName=query_coord.CoordName,
             Start=row["seq_region_start"],
             End=row["seq_region_end"],
             seq_region_id=row["seq_region_id"],
             Strand=row["seq_region_strand"],
             ensembl_coord=True,
         )
         location = Coordinate(self, **coord_kwargs)
         if query_coord.CoordName != target_coord.CoordName:
             converted = get_coord_conversion(location, target_coord.CoordType, self.CoreDb)
             location = asserted_one(converted)[1]
         # TODO: the location is not yet made relative to query_coord /
         # target_coord (makeRelativeTo) when their coord names differ
         yield klass(self, db, Location=location, Score=row["score"], data=row)
Ejemplo n.º 5
0
    def getFeatures(self,
                    region=None,
                    feature_types=None,
                    where_feature=None,
                    CoordName=None,
                    Start=None,
                    End=None,
                    Strand=None,
                    ensembl_coord=False):
        """Yield feature instances of the requested types at a location.

        Arguments:
            - region: a coordinate, or an object with a Location attribute
              (its Location is used). When None, a Coordinate is built from
              CoordName, Start, End, Strand and ensembl_coord.
            - feature_types: a feature type name or a sequence of names,
              matched case-insensitively. Recognised names are 'cpg',
              'repeat', 'gene', 'est' and 'variation'.
            - where_feature: forwarded unchanged to get_coord_conversion and
              to the per-type query method.

        Raises TypeError if feature_types is None and RuntimeError if any
        requested type is not recognised. As this is a generator, errors
        surface when iteration starts.
        """
        if feature_types is None:
            raise TypeError('feature_types must be a feature type name or '
                            'a sequence of names, not None')
        if isinstance(feature_types, str):
            feature_types = [feature_types]
        feature_types = [ft.lower() for ft in feature_types]
        feature_coord_levels = self._get_feature_coord_levels(feature_types)

        if region is None:
            seq_region_id = self._get_seq_region_id(CoordName)
            region = Coordinate(self,
                                CoordName=CoordName,
                                Start=Start,
                                End=End,
                                Strand=convert_strand(Strand),
                                seq_region_id=seq_region_id,
                                ensembl_coord=ensembl_coord)
        elif hasattr(region, 'Location'):
            region = region.Location

        # for each feature type: the method that queries it and the class
        # used to represent the results
        target_coords_funcs = {
            'cpg': (self._get_simple_features, CpGisland),
            'repeat': (self._get_repeat_features, Repeat),
            'gene': (self._get_gene_features, Gene),
            'est': (self._get_gene_features, Est),
            'variation': (self._get_variation_features, Variation),
        }

        known_types = set(target_coords_funcs)
        # set difference: report only the requested names that are invalid
        unknown_types = set(feature_types) - known_types
        if unknown_types:
            raise RuntimeError(
                'Unknown feature[%s], valid feature_types are: %s' %
                (', '.join(sorted(unknown_types)),
                 ', '.join(sorted(known_types))))

        for feature_type in feature_types:
            target_func, target_class = target_coords_funcs[feature_type]
            # est features are read from the other-features database,
            # everything else from the core database
            db = self.CoreDb
            if feature_type == 'est':
                db = self.OtherFeaturesDb

            feature_coords = feature_coord_levels[feature_type].levels
            for feature_coord in feature_coords:
                chrom_other_coords = get_coord_conversion(region,
                                                          feature_coord,
                                                          db,
                                                          where=where_feature)
                for chrom_coord, other_coord in chrom_other_coords:
                    for feature in target_func(db, target_class, chrom_coord,
                                               other_coord, where_feature):
                        yield feature
Ejemplo n.º 6
0
 def test_get_coord_conversion(self):
     """converting a human location to contig level keeps the query-side
     coordinate on the same name and strand and within the original span"""
     # Only the first member of each result is checked; whether the contig
     # coordinates themselves are correct is not tested here.
     chrom, first, last, strand = '1', 1000, 1000000, 1
     query_loc = Coordinate(CoordName=chrom, Start=first, End=last,
                            Strand=strand, genome=human)
     for conversion in get_coord_conversion(query_loc, 'contig',
                                            human.CoreDb):
         mapped = conversion[0]
         self.assertTrue(mapped.CoordName == chrom)
         self.assertTrue(mapped.Start >= first)
         self.assertTrue(mapped.End <= last)
         self.assertTrue(mapped.Strand == strand)
Ejemplo n.º 7
0
 def test_get_coord_conversion(self):
     """each contig-level conversion of a human location must report a
     query-side coordinate contained in the location that was asked for"""
     # The contig side of each result is deliberately left unchecked.
     expected = dict(CoordName='1', Start=1000, End=1000000, Strand=1)
     human_loc = Coordinate(genome=human, **expected)
     conversions = get_coord_conversion(human_loc, 'contig', human.CoreDb)
     for result in conversions:
         self.assertTrue(result[0].CoordName == expected['CoordName'])
         self.assertTrue(result[0].Start >= expected['Start'])
         self.assertTrue(result[0].End <= expected['End'])
         self.assertTrue(result[0].Strand == expected['Strand'])
Ejemplo n.º 8
0
 def getFeatures(self, region=None, feature_types=None, where_feature=None,
                 CoordName=None, Start=None, End=None, Strand=None,
                 ensembl_coord=False):
     """Yield feature instances of the requested types at a location.

     Arguments:
         - region: a coordinate, or an object with a Location attribute
           (its Location is used). When None, a Coordinate is built from
           CoordName, Start, End, Strand and ensembl_coord.
         - feature_types: a feature type name or a sequence of names,
           matched case-insensitively. Recognised names are 'cpg',
           'repeat', 'gene', 'est' and 'variation'.
         - where_feature: forwarded unchanged to get_coord_conversion and
           to the per-type query method.

     Raises TypeError if feature_types is None and RuntimeError if any
     requested type is not recognised. As this is a generator, errors
     surface when iteration starts.
     """
     if feature_types is None:
         raise TypeError('feature_types must be a feature type name or '
                         'a sequence of names, not None')
     if isinstance(feature_types, str):
         feature_types = [feature_types]
     feature_types = [ft.lower() for ft in feature_types]
     feature_coord_levels = self._get_feature_coord_levels(feature_types)

     if region is None:
         seq_region_id = self._get_seq_region_id(CoordName)
         region = Coordinate(self, CoordName=CoordName, Start=Start,
                             End=End,
                             Strand=convert_strand(Strand),
                             seq_region_id=seq_region_id,
                             ensembl_coord=ensembl_coord)
     elif hasattr(region, 'Location'):
         region = region.Location

     # for each feature type: the method that queries it and the class
     # used to represent the results
     target_coords_funcs = {
         'cpg': (self._get_simple_features, CpGisland),
         'repeat': (self._get_repeat_features, Repeat),
         'gene': (self._get_gene_features, Gene),
         'est': (self._get_gene_features, Est),
         'variation': (self._get_variation_features, Variation),
     }

     known_types = set(target_coords_funcs)
     # set difference: report only the requested names that are invalid
     unknown_types = set(feature_types) - known_types
     if unknown_types:
         raise RuntimeError(
             'Unknown feature[%s], valid feature_types are: %s' %
             (', '.join(sorted(unknown_types)),
              ', '.join(sorted(known_types))))

     for feature_type in feature_types:
         target_func, target_class = target_coords_funcs[feature_type]
         # est features are read from the other-features database,
         # everything else from the core database
         db = self.CoreDb
         if feature_type == 'est':
             db = self.OtherFeaturesDb

         feature_coords = feature_coord_levels[feature_type].levels
         for feature_coord in feature_coords:
             chrom_other_coords = get_coord_conversion(region, feature_coord,
                                                       db,
                                                       where=where_feature)
             for chrom_coord, other_coord in chrom_other_coords:
                 for feature in target_func(db, target_class, chrom_coord,
                                            other_coord, where_feature):
                     yield feature
Ejemplo n.º 9
0
    def _get_simple_features(self, db, klass, target_coord, query_coord,
                             where_feature):
        """Generate klass instances from simple_feature rows on query_coord.

        The lookup is limited to the 'CpGisland' feature type for now. Each
        row's coordinates become the instance's Location, converted to
        target_coord's coordinate type when the coord names differ, and the
        row's score becomes its Score.
        """
        features = db.getTable('simple_feature')
        # NOTE(review): a feature type missing from _feature_type_ids would
        # presumably contribute None to this list -- confirm this is intended
        analysis_ids = [
            self._feature_type_ids.get(kind) for kind in ['CpGisland']
        ]
        # TODO: the original author flagged this select as needing a fix
        has_analysis = features.c.analysis_id.in_(analysis_ids)
        on_region = features.c.seq_region_id == query_coord.seq_region_id
        query = location_query(features,
                               query_coord.EnsemblStart,
                               query_coord.EnsemblEnd,
                               query=sql.select(
                                   [features],
                                   sql.and_(has_analysis, on_region)),
                               where=where_feature)
        for row in query.execute():
            location = Coordinate(self,
                                  CoordName=query_coord.CoordName,
                                  Start=row['seq_region_start'],
                                  End=row['seq_region_end'],
                                  seq_region_id=row['seq_region_id'],
                                  Strand=row['seq_region_strand'],
                                  ensembl_coord=True)
            if query_coord.CoordName != target_coord.CoordName:
                converted = get_coord_conversion(location,
                                                 target_coord.CoordType,
                                                 self.CoreDb)
                location = asserted_one(converted)[1]

            # TODO: the location is not yet made relative to query_coord /
            # target_coord (makeRelativeTo) when their coord names differ
            yield klass(self, db, Location=location, Score=row['score'])
Ejemplo n.º 10
0
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Fetch the plus-strand sequence for coord from its seq-level pieces.

    coord is converted to the sequence-level coordinate system; the DNA of
    every resulting piece is read from the dna table, reverse complemented
    when the piece lies on the minus strand, and the pieces are combined by
    _assemble_seq over coord.Start..coord.End.

    Note that coord.Strand is set to 1 on the object passed in. Raises
    NoItemError when the conversion yields nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # whatever strand the caller asked for, the + sequence is fetched first
    coord.Strand = 1
    seq_level_type = CoordSystem(species=genome.Species,
                                 core_db=genome.CoreDb,
                                 seq_level=True)

    if DEBUG:
        print('Created Coordinate:', coord, coord.EnsemblStart,
              coord.EnsemblEnd)
        print(coord.CoordType, seq_level_type)

    assemblies = get_coord_conversion(coord, seq_level_type, genome.CoreDb)

    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna_table = genome.CoreDb.getTable('dna')
    pieces = []
    spans = []
    for query_loc, target_loc in assemblies:
        assert query_loc.Strand == 1
        piece_length = len(target_loc)
        # let the database slice the stored sequence via substr
        fragment = substr(dna_table.c.sequence, target_loc.EnsemblStart,
                          piece_length).label('sequence')
        on_region = dna_table.c.seq_region_id == target_loc.seq_region_id
        rows = sql.select([fragment], on_region).execute().fetchall()
        piece = DNA.makeSequence(asserted_one(rows)['sequence'])
        if target_loc.Strand == -1:
            piece = piece.rc()
        pieces.append(str(piece))
        spans.append((query_loc.Start, query_loc.End))
    return _assemble_seq(pieces, coord.Start, coord.End, spans)
Ejemplo n.º 11
0
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Return the plus-strand sequence for coord, built from seq-level pieces.

    coord is converted to the sequence-level coordinate system; the DNA of
    every resulting piece is read from the dna table, reverse complemented
    when the piece lies on the minus strand, and the pieces are combined by
    _assemble_seq over coord.Start..coord.End.

    Note that coord.Strand is set to 1 on the object passed in. Raises
    NoItemError when the conversion yields nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species, core_db=genome.CoreDb,
                             seq_level=True)

    if DEBUG:
        # single pre-formatted strings give identical output whether print
        # is a statement (Python 2) or a function (Python 3)
        print('Created Coordinate: %s %s %s' %
              (coord, coord.EnsemblStart, coord.EnsemblEnd))
        print('%s %s' % (coord.CoordType, coord_type))

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)

    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    seqs, positions = [], []
    for q_loc, t_loc in assemblies:
        assert q_loc.Strand == 1
        length = len(t_loc)
        # get MySQL to do the string slicing via substr function
        query = sql.select([substr(dna.c.sequence,
                                   t_loc.EnsemblStart,
                                   length).label('sequence')],
                           dna.c.seq_region_id == t_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        seq = DNA.makeSequence(record['sequence'])
        if t_loc.Strand == -1:
            seq = seq.rc()
        seqs.append(str(seq))
        positions.append((q_loc.Start, q_loc.End))
    return _assemble_seq(seqs, coord.Start, coord.End, positions)
Ejemplo n.º 12
0
def get_lower_coord_conversion(coord, species, core_db):
    """Convert coord to the first lower-ranked coordinate system that works.

    Ranks strictly between that of coord's own coordinate type and that of
    the sequence-level coordinate system are tried in increasing order.
    Ranks with no named coordinate system are skipped. Returns the first
    truthy result of get_coord_conversion; otherwise the last (falsy)
    result obtained, or None when no conversion was attempted.
    """
    coord_system = CoordSystem(species=species, core_db=core_db)
    seq_level_coord_type = CoordSystem(species=species, core_db=core_db,
                                       seq_level=True)
    first_rank = coord_system[coord.CoordType].rank + 1
    stop_rank = coord_system[seq_level_coord_type].rank
    assemblies = None
    for rank in range(first_rank, stop_rank):
        # name of the first coordinate system holding this rank, if any
        coord_type = next((coord_system[key].name
                           for key in coord_system.keys()
                           if coord_system[key].rank == rank), None)
        if coord_type is None:
            continue

        assemblies = get_coord_conversion(coord, coord_type, core_db)
        if assemblies:
            break

    return assemblies
Ejemplo n.º 13
0
def get_lower_coord_conversion(coord, species, core_db):
    """Find a conversion of coord into an intermediate coordinate system.

    Every rank strictly between coord's coordinate type and the
    sequence-level coordinate system is visited in increasing order. For
    each rank that has a named coordinate system, get_coord_conversion is
    called, and the first truthy result is returned. If none is truthy the
    last result obtained is returned, or None when no conversion was tried.
    """
    systems = CoordSystem(species=species, core_db=core_db)
    seq_level = CoordSystem(species=species,
                            core_db=core_db,
                            seq_level=True)
    start_rank = systems[coord.CoordType].rank
    end_rank = systems[seq_level].rank
    assemblies = None
    for rank in range(start_rank + 1, end_rank):
        for key in systems.keys():
            if systems[key].rank == rank:
                break
        else:
            # no coordinate system holds this rank
            continue

        target_type = systems[key].name
        if target_type is None:
            continue

        assemblies = get_coord_conversion(coord, target_type, core_db)
        if assemblies:
            return assemblies

    return assemblies