def test_get_ensembl_format(self):
    """common and latin names should both map onto the ensembl db prefix"""
    # (name handed to the lookup, expected ensembl db prefix)
    expectations = (
        ("human", "homo_sapiens"),
        ("mouse", "mus_musculus"),
        ("Mus musculus", "mus_musculus"),
    )
    for query, db_prefix in expectations:
        self.assertEqual(Species.getEnsemblDbPrefix(query), db_prefix)
def test_get_name_type(self):
    """should return the (latin|common) name given a latin, common or
    ensembl db prefix names"""
    # latin name lookups: from a common name and from an ensembl db prefix
    self.assertEqual(Species.getSpeciesName("human"), "Homo sapiens")
    self.assertEqual(Species.getSpeciesName("homo_sapiens"), "Homo sapiens")
    # common name lookups: from a latin name and from an ensembl db prefix
    self.assertEqual(Species.getCommonName("Mus musculus"), "Mouse")
    self.assertEqual(Species.getCommonName("mus_musculus"), "Mouse")
def test_add_new_species(self):
    """a newly added species/common pair should be retrievable by either
    name, with the ensembl prefix inferred from the species name"""
    species_name, common_name = "Otolemur garnettii", "Bushbaby"
    Species.amendSpecies(species_name, common_name)
    db_prefix = "otolemur_garnettii"
    # (lookup, argument, expected result), checked in this order
    checks = [
        (Species.getSpeciesName, species_name, species_name),
        (Species.getSpeciesName, "Bushbaby", species_name),
        (Species.getSpeciesName, common_name, species_name),
        (Species.getCommonName, species_name, common_name),
        (Species.getCommonName, "Bushbaby", common_name),
        (Species.getEnsemblDbPrefix, "Bushbaby", db_prefix),
        (Species.getEnsemblDbPrefix, species_name, db_prefix),
        (Species.getEnsemblDbPrefix, common_name, db_prefix),
    ]
    for lookup, query, expected in checks:
        self.assertEqual(lookup(query), expected)
def __init__(self, Species, Release, account=None, pool_recycle=None):
    """Set up a genome for one species at one Ensembl release.

    Arguments:
        - Species: a species name, converted with _Species.getSpeciesName
        - Release: the release, stored as a string; must be truthy
        - account: when None, obtained from get_ensembl_account for Release
        - pool_recycle: stored unchanged on the instance
    """
    super(Genome, self).__init__()
    assert Release, 'invalid release specified'
    self._account = account = (get_ensembl_account(release=Release)
                               if account is None else account)
    self._pool_recycle = pool_recycle

    # TODO: check Release may not be necessary because: assert Release above
    if Release is None:
        Release = get_latest_release(account=account)
    self._gen_release = None

    # TODO make name and release immutable properties
    self.Species = _Species.getSpeciesName(Species)
    self.Release = str(Release)

    # the db connections, none of which are assigned here
    self._core_db = self._var_db = self._other_db = None

    self._feature_type_ids = FeatureTypeCache(self)
    self._feature_coord_levels = FeatureCoordLevels(self.Species)
def __repr__(self):
    """Return 'Type(common name,coord type,coord name,start-end,strand)'
    with all single quotes stripped from the text."""
    fields = (
        self.__class__.__name__,
        _Species.getCommonName(self.Species),
        self.CoordType,
        self.CoordName,
        self.Start,
        self.End,
        self.Strand,
    )
    text = '%s(%r,%r,%r,%d-%d,%d)' % fields
    # %r wraps strings in quotes, which are removed from the display
    return text.replace("'", "")
def _attach_genomes(self):
    """Build a Genome per entry of self.Species, storing each one in
    self._genomes and as an attribute named by _Species.getComparaName."""
    for latin_name in self.Species:
        compara_attr = _Species.getComparaName(latin_name)
        instance = Genome(
            Species=latin_name,
            Release=self.Release,
            account=self._account,
        )
        self._genomes[latin_name] = instance
        setattr(self, compara_attr, instance)
def __call__(self, species=None, core_db=None, feature_types=None,
             var_db=None, otherfeature_db=None):
    """Set the feature levels for a species and return the stored entry.

    The species recorded on core_db.db_name takes precedence over the
    species argument. A var_db is required whenever 'variation' is among
    feature_types.
    """
    wants_variation = 'variation' in feature_types
    if wants_variation:
        assert var_db is not None
    resolved = _Species.getSpeciesName(core_db.db_name.Species or species)
    self._set_species_feature_levels(
        resolved, core_db, feature_types, var_db, otherfeature_db)
    return self._species_feature_levels[resolved]
def make_db_name_pattern(species=None, db_type=None, release=None):
    """Build the single-quoted, %-delimited pattern used to match db names.

    Every truthy argument contributes one '%<value>' segment, in the order
    species, db_type, release, and a closing '%' is appended. The species
    is first converted with Species.getEnsemblDbPrefix. At least one
    argument must be truthy.
    """
    wildcard = r"%"
    pieces = []
    if species:
        pieces.append(Species.getEnsemblDbPrefix(species))
    pieces.extend(item for item in (db_type, release) if item)
    body = "".join("%s%s" % (wildcard, piece) for piece in pieces)
    assert body
    return "'%s%s'" % (body, wildcard)
def __init__(self, species, Release, account=None, pool_recycle=None,
             division=None):
    """Set up a compara instance for a collection of species.

    Arguments:
        - species: iterable of species names; the distinct entries are
          converted with _Species.getSpeciesName and stored sorted
        - Release: the release, stored as a string; must be truthy
        - account: when None, obtained from get_ensembl_account for Release
        - pool_recycle, division: stored unchanged on the instance
    """
    assert Release, "invalid release specified"
    self.Release = str(Release)
    if account is None:
        account = get_ensembl_account(release=Release)
    self._account = account
    self._pool_recycle = pool_recycle
    self._compara_db = None

    latin_names = [_Species.getSpeciesName(entry) for entry in set(species)]
    latin_names.sort()
    self.Species = tuple(latin_names)

    # genomes are attached only once Species, Release and account are set
    self._genomes = {}
    self._attach_genomes()

    self._species_id_map = None
    self._species_db_map = None
    self._species_set = None
    self._method_species_link = None
    self.division = division
def __call__(self, coord_type=None, core_db=None, species=None,
             seq_level=False):
    """coord_type can be coord_type or coord_system_id

    The species argument takes precedence over the species recorded on
    core_db.db_name. With seq_level set, the result of
    self._get_seq_level_system is returned; otherwise the stored entry for
    the species, narrowed to coord_type when one is given.
    """
    # TODO should only pass in core_db here, not that and Species, or just
    # the genome - what if someone wants to compare different ensembl
    # releases? keying by species is then a bad idea! better to key by
    # id(object)
    # change identifier to coord_system, handle either string val or int
    # (see MySQL table) as is this shouldn't be a __call__, see line 168
    # for reason why we should have a method to set data: setSpeciesCoord
    # call then then just returns the coords for the named species
    species = _Species.getSpeciesName(species or core_db.db_name.Species)
    self._set_species_system(core_db, species)
    if seq_level:
        return self._get_seq_level_system(species)
    systems = self._species_coord_systems[species]
    if coord_type:
        return systems[coord_type]
    return systems
def _get_dnafrag_id_for_coord(self, coord):
    """returns the dnafrag_id for the coordinate

    Raises RuntimeError when asserted_one signals NoItemError for the
    query result.
    """
    dnafrag = self.ComparaDb.getTable('dnafrag')
    genome_db = self.ComparaDb.getTable('genome_db')

    # column renamed between versions: for releases after 58 the genome
    # is matched on its ensembl db prefix, otherwise on the lower-cased
    # species name
    genome_name = coord.genome.Species.lower()
    if int(self.Release) > 58:
        genome_name = _Species.getEnsemblDbPrefix(genome_name)

    columns = [dnafrag.c.dnafrag_id, dnafrag.c.coord_system_name]
    condition = sql.and_(
        dnafrag.c.genome_db_id == genome_db.c.genome_db_id,
        genome_db.c.name == genome_name,
        dnafrag.c.name == coord.CoordName)
    query = sql.select(columns, condition)
    try:
        rows = query.execute().fetchall()
        return asserted_one(rows)['dnafrag_id']
    except NoItemError:
        raise RuntimeError('No DNA fragment identified')
def __init__(self, species, Release, account=None, pool_recycle=None,
             division=None):
    """Initialise the compara state for the given species and release.

    The distinct entries of species are converted with
    _Species.getSpeciesName and kept as a sorted tuple. Release must be
    truthy and is stored as a string. A missing account is obtained from
    get_ensembl_account for that release.
    """
    assert Release, 'invalid release specified'
    self.Release = str(Release)
    self._account = account = (get_ensembl_account(release=Release)
                               if account is None else account)
    self._pool_recycle = pool_recycle
    self._compara_db = None

    self.Species = tuple(
        sorted([_Species.getSpeciesName(name) for name in set(species)]))

    # one genome per species, attached before the lookup caches are reset
    self._genomes = {}
    self._attach_genomes()

    self._species_id_map = None
    self._species_db_map = None
    self._species_set = None
    self._method_species_link = None
    self.division = division
def __init__(self, species):
    """Store the species under the name given by _Species.getSpeciesName."""
    resolved_name = _Species.getSpeciesName(species)
    self.Species = resolved_name
def test_get_compara_name(self):
    """compara names should be valid for assignment onto objects"""
    # (name handed to the lookup, expected compara name)
    expectations = (
        ('pika', 'Pika'),
        ('C.elegans', 'Celegans'),
        ('Caenorhabditis elegans', 'Celegans'),
    )
    for query, compara_name in expectations:
        self.assertEqual(Species.getComparaName(query), compara_name)
def test_amend_existing(self):
    """amending an existing species should replace its common name while
    the latin name and ensembl prefix lookups keep working"""
    species_name = 'Ochotona princeps'
    first_common, second_common = 'american pika', 'pika'
    ensembl_pref = 'ochotona_princeps'

    Species.amendSpecies(species_name, first_common)
    self.assertEqual(Species.getCommonName(species_name), first_common)

    Species.amendSpecies(species_name, second_common)
    # (lookup, argument, expected result) after the second amendment
    checks = [
        (Species.getSpeciesName, second_common, species_name),
        (Species.getSpeciesName, ensembl_pref, species_name),
        (Species.getCommonName, species_name, second_common),
        (Species.getCommonName, ensembl_pref, second_common),
        (Species.getEnsemblDbPrefix, species_name, ensembl_pref),
        (Species.getEnsemblDbPrefix, second_common, ensembl_pref),
    ]
    for lookup, query, expected in checks:
        self.assertEqual(lookup(query), expected)
def test_amend_existing(self):
    """a second amendSpecies call for a species should supersede the
    common name given by the first"""
    latin = 'Ochotona princeps'
    old_common = 'american pika'
    new_common = 'pika'
    prefix = 'ochotona_princeps'

    Species.amendSpecies(latin, old_common)
    self.assertEqual(Species.getCommonName(latin), old_common)

    Species.amendSpecies(latin, new_common)
    # the latin name is reachable from the new common name and the prefix
    for alias in (new_common, prefix):
        self.assertEqual(Species.getSpeciesName(alias), latin)
    # the new common name is reported for the latin name and the prefix
    for alias in (latin, prefix):
        self.assertEqual(Species.getCommonName(alias), new_common)
    # the ensembl prefix is reachable from the latin and new common names
    for alias in (latin, new_common):
        self.assertEqual(Species.getEnsemblDbPrefix(alias), prefix)
def getSyntenicRegions(self, Species=None, CoordName=None, Start=None,
        End=None, Strand=1, ensembl_coord=False, region=None,
        align_method=None, align_clade=None, method_clade_id=None):
    """yields a SyntenicRegions instance per genomic align block

    Arguments:
        - Species: the species name
        - CoordName, Start, End, Strand: the coordinates for the region
        - ensembl_coord: whether the coordinates are in Ensembl form
        - region: a region instance or a location, in which case the
          CoordName etc .. arguments are ignored
        - align_method, align_clade: the alignment method and clade to use
          Note: the options for this instance can be found by printing
          the method_species_links attribute of this object.
        - method_clade_id: over-rides align_method/align_clade. The entry
          in method_species_links under method_link_species_set_id

    This is a generator, so the argument checks run on first iteration.
    """
    assert (align_method and align_clade) or method_clade_id, \
        'Must specify (align_method & align_clade) or method_clade_id'

    if method_clade_id is None:
        # case-insensitive substring match; the last matching row wins
        for link in self.method_species_links:
            method_hit = align_method.lower() in link['align_method'].lower()
            if method_hit and \
                    align_clade.lower() in link['align_clade'].lower():
                method_clade_id = link['method_link_species_set_id']

    if method_clade_id is None:
        raise RuntimeError(
            "Invalid align_method[%s] or align_clade specified[%s]"
            % (align_method, align_clade))

    if region is None:
        ref_genome = self._genomes[_Species.getSpeciesName(Species)]
        region = ref_genome.makeLocation(
            CoordName=CoordName, Start=Start, End=End, Strand=Strand,
            ensembl_coord=ensembl_coord)
    elif hasattr(region, 'Location'):
        region = region.Location

    # make sure the genome instances match
    ref_genome = self._genomes[region.genome.Species]
    if ref_genome is not region.genome:
        # recreate region from our instance
        region = ref_genome.makeLocation(
            CoordName=region.CoordName, Start=region.Start,
            End=region.End, Strand=region.Strand)

    ref_dnafrag_id = self._get_dnafrag_id_for_coord(region)
    blocks = self._get_genomic_align_blocks_for_dna_frag_id(
        method_clade_id, ref_dnafrag_id, region)

    for block in blocks:
        # we get joint records for this block identifier
        records = self._get_joint_genomic_align_dnafrag(
            block['genomic_align_block_id'])
        members = []
        ref_location = None
        for record in records:
            taxon_id = self.genome_taxon[record.genome_db_id]
            genome = self.taxon_id_species[taxon_id]
            # we have a case where we get back different coordinate system
            # results for the ref genome. We keep only those that match
            # the CoordName of region
            is_reference = genome is region.genome
            if is_reference and record.name == region.CoordName:
                # this is the ref species and we adjust the ref_location
                # for this block
                diff_start = record.dnafrag_start - region.EnsemblStart
                shift_start = diff_start if diff_start > 0 else 0
                diff_end = record.dnafrag_end - region.EnsemblEnd
                shift_end = 0 if diff_end > 0 else diff_end
                try:
                    ref_location = region.resized(shift_start, shift_end)
                except ValueError:
                    # we've hit some ref genome fragment that matches
                    # but whose coordinates aren't right
                    continue
            elif is_reference:
                continue
            members.append((genome, record))

        assert ref_location is not None, \
            "Failed to make the reference location"
        yield SyntenicRegions(self, members, ref_location=ref_location)
def __call__(self, species=None, core_db=None, feature_types=None,
             var_db=None, otherfeature_db=None):
    """Record the feature levels for a species, then return them.

    The species comes from core_db.db_name.Species when that is truthy,
    otherwise from the species argument. Asking for 'variation' features
    without a var_db trips an assertion.
    """
    if 'variation' in feature_types:
        assert var_db is not None

    named = core_db.db_name.Species or species
    key = _Species.getSpeciesName(named)
    self._set_species_feature_levels(key, core_db, feature_types, var_db,
                                     otherfeature_db)
    levels = self._species_feature_levels[key]
    return levels
def getSyntenicRegions(self, Species=None, CoordName=None, Start=None,
        End=None, Strand=1, ensembl_coord=False, region=None,
        align_method=None, align_clade=None, method_clade_id=None):
    """generator of SyntenicRegions instances for a reference region

    Arguments:
        - Species: the species name
        - CoordName, Start, End, Strand: the coordinates for the region
        - ensembl_coord: whether the coordinates are in Ensembl form
        - region: a region instance or a location, in which case the
          CoordName etc .. arguments are ignored
        - align_method, align_clade: the alignment method and clade to use
          Note: the options for this instance can be found by printing
          the method_species_links attribute of this object.
        - method_clade_id: over-rides align_method/align_clade. The entry
          in method_species_links under method_link_species_set_id

    One SyntenicRegions is yielded per genomic align block. Nothing is
    checked or queried until iteration starts.
    """
    assert (align_method and align_clade) or method_clade_id, \
        'Must specify (align_method & align_clade) or method_clade_id'

    if method_clade_id is None:
        # every row is inspected, so a later match replaces an earlier one
        for row in self.method_species_links:
            if align_method.lower() not in row['align_method'].lower():
                continue
            if align_clade.lower() in row['align_clade'].lower():
                method_clade_id = row['method_link_species_set_id']

    if method_clade_id is None:
        message = "Invalid align_method[%s] or align_clade specified[%s]"
        raise RuntimeError(message % (align_method, align_clade))

    if region is None:
        ref_genome = self._genomes[_Species.getSpeciesName(Species)]
        region = ref_genome.makeLocation(CoordName=CoordName, Start=Start,
                                         End=End, Strand=Strand,
                                         ensembl_coord=ensembl_coord)
    elif hasattr(region, 'Location'):
        region = region.Location

    # make sure the genome instances match
    ref_genome = self._genomes[region.genome.Species]
    if ref_genome is not region.genome:
        # recreate region from our instance
        region = ref_genome.makeLocation(CoordName=region.CoordName,
                                         Start=region.Start,
                                         End=region.End,
                                         Strand=region.Strand)

    ref_dnafrag_id = self._get_dnafrag_id_for_coord(region)
    blocks = self._get_genomic_align_blocks_for_dna_frag_id(
        method_clade_id, ref_dnafrag_id, region)

    for block in blocks:
        genomic_align_block_id = block['genomic_align_block_id']
        # we get joint records for these identifiers
        records = self._get_joint_genomic_align_dnafrag(
            genomic_align_block_id)
        members = []
        ref_location = None
        for record in records:
            genome = self.taxon_id_species[
                self.genome_taxon[record.genome_db_id]]
            # we have a case where we get back different coordinate system
            # results for the ref genome. We keep only those that match
            # the CoordName of region
            from_ref_genome = genome is region.genome
            if from_ref_genome and record.name == region.CoordName:
                # this is the ref species and we adjust the ref_location
                # for this block: only a positive start difference and
                # only a non-positive end difference are applied
                diff_start = record.dnafrag_start - region.EnsemblStart
                diff_end = record.dnafrag_end - region.EnsemblEnd
                shift_start = max(0, diff_start)
                shift_end = min(diff_end, 0)
                try:
                    ref_location = region.resized(shift_start, shift_end)
                except ValueError:
                    # we've hit some ref genome fragment that matches
                    # but whose coordinates aren't right
                    continue
            elif from_ref_genome:
                continue
            members.append((genome, record))

        assert ref_location is not None, \
            "Failed to make the reference location"
        yield SyntenicRegions(self, members, ref_location=ref_location)