def register(cls, latin_name, synonyms, reference_assemblies):
    """
    Build a Species from the given fields and index it in every
    class-level lookup table so it can later be found by latin name,
    by synonym, or by reference assembly name.

    Returns the newly created Species.
    """
    new_species = Species(
        latin_name=latin_name,
        synonyms=synonyms,
        reference_assemblies=reference_assemblies)

    cls._latin_names_to_species[new_species.latin_name] = new_species

    # No collision check is performed: a synonym that is already mapped
    # to another species is simply re-pointed to this one.
    for common_name in synonyms:
        cls._common_names_to_species[common_name] = new_species

    # Same overwrite semantics for reference assembly names.
    for assembly_name in reference_assemblies:
        cls._reference_names_to_species[assembly_name] = new_species

    return new_species
def test_species_to_json():
    """
    Serialising `human` with to_json and rebuilding it with
    Species.from_json must give back an object equal to the original.
    """
    serialized = human.to_json()
    restored = Species.from_json(serialized)
    eq_(human, restored)
def test_species_to_dict():
    """
    Converting `human` with to_dict and rebuilding it with
    Species.from_dict must give back an object equal to the original.
    """
    as_dict = human.to_dict()
    restored = Species.from_dict(as_dict)
    eq_(human, restored)
def collect_all_genomes():
    """
    Data-aware registration of Species objects.

    Scans the local pyensembl cache directory, which is globbed as
    ``<cache>/pyensembl/<assembly>/<release>/<files>``. Every
    assembly/release directory that holds more than four entries is treated
    as an installed genome and recorded as one row of a DataFrame with the
    columns 'latin name', 'release', 'synonymn' and 'assembly'.

    For every distinct latin name, ``Species.register`` is called with the
    species' synonyms and a mapping of assembly name to
    ``(min release, max release)``.

    No file is written here. The DataFrame is attached to the class as
    ``Species.dspecies`` so that it can be exported (e.g. as a tsv). Such a
    file could be used to install sets of genomes at once, a bit like a
    conda environment profile; export and import for it are not implemented
    in this function.

    :return: the ``Species`` class, with ``Species.dspecies`` set.
    :raises ValueError: if a release directory name contains no digits.
    """
    import re
    from glob import glob
    from os.path import basename, dirname

    import pandas as pd

    import datacache
    from pyensembl.species import normalize_species_name, Species

    def str2num(s, cat=False, force=True):
        """
        Converts string to integer eg. ensembl92 to 92

        :param s: string
        :param cat: Whether to concatenate detected integers. eg. 20,23 to 2023
        :param force: If True, ignores decimal point error.
        :return: an int when exactly one run of digits is found, or when
            ``cat`` is true; otherwise the list of digit strings found.
        :raises ValueError: if ``s`` has no digits, or contains a '.' while
            ``force`` is false.
        """
        if '.' in s and not force:
            raise ValueError(
                f"A string can only be converted to integeres, found a '.' in {s}"
            )
        digit_groups = re.findall(r'\d+', s)
        if not digit_groups:
            raise ValueError("No digits found in string {}".format(s))
        if len(digit_groups) == 1:
            return int(digit_groups[0])
        if cat:
            return int(''.join(digit_groups))
        # Several digit runs and no concatenation requested: hand back the
        # raw digit strings.
        return digit_groups

    # The pyensembl cache sits next to datacache's data directory,
    # eg. '/home/user/.cache/pyensembl'.
    # FIXME: genomes installed at other places than .cache are not found.
    pyensembl_cache_dir = f"{dirname(datacache.get_data_dir())}/pyensembl"

    # Every entry directly below the cache directory is taken as an assembly.
    assemblies = [basename(p) for p in glob(f"{pyensembl_cache_dir}/*")]

    # Dataframe that contains all the info (and can be exported as a tsv).
    dspecies = pd.DataFrame(
        columns=['latin name', 'release', 'synonymn', 'assembly'])

    # Populate the dataframe, one row per installed genome.
    genomei = 0
    for assembly in assemblies:
        releases = [
            basename(p) for p in glob(f"{pyensembl_cache_dir}/{assembly}/*")
        ]
        for release in releases:
            # FIXME: a release name that looks like a float (eg. '92.1')
            # yields a list of digit strings here, not a number.
            releasei = str2num(release)
            genome_dir = f"{pyensembl_cache_dir}/{assembly}/{release}"
            genome_files = glob(f"{genome_dir}/*")
            # FIXME: "more than 4 files" is only a heuristic for an
            # installed genome; a strict check would inspect the (.gz) files.
            if len(genome_files) <= 4:
                continue
            dspecies.loc[genomei, 'assembly'] = assembly
            dspecies.loc[genomei, 'release'] = releasei
            # The synonym is the part before the first '.' in the name of
            # the first file glob returned (glob order is not sorted).
            dspecies.loc[genomei, 'synonymn'] = basename(
                genome_files[0]).split('.')[0]
            dspecies.loc[genomei, 'latin name'] = normalize_species_name(
                dspecies.loc[genomei, 'synonymn'])
            genomei += 1

    # Register one Species per distinct latin name.
    for spc in dspecies['latin name'].unique():
        is_species = dspecies['latin name'] == spc
        assembly2releases = {}
        for assembly in dspecies.loc[is_species, 'assembly'].unique():
            d = dspecies.loc[is_species & (dspecies['assembly'] == assembly), :]
            # FIXME: MAX_ENSEMBL_RELEASE is not taken into account here.
            assembly2releases[assembly] = (d['release'].min(),
                                           d['release'].max())
        Species.register(
            latin_name=spc,
            synonyms=dspecies.loc[is_species, 'synonymn'].unique().tolist(),
            reference_assemblies=assembly2releases)

    Species.dspecies = dspecies
    return Species