コード例 #1
0
ファイル: species.py プロジェクト: rraadd88/pyensembl
 def register(cls, latin_name, synonyms, reference_assemblies):
     """
     Build a Species from the arguments and index it in every lookup dict.

     The new object is stored under its latin name, under each entry of
     `synonyms` and under each key of `reference_assemblies`. No collision
     check is performed: an existing entry with the same key is replaced.

     Returns the newly created Species.
     """
     new_species = Species(
         latin_name=latin_name,
         synonyms=synonyms,
         reference_assemblies=reference_assemblies,
     )
     cls._latin_names_to_species[new_species.latin_name] = new_species

     # Plain assignment on purpose: a later registration overwrites an
     # earlier one that used the same synonym.
     for common_name in synonyms:
         cls._common_names_to_species[common_name] = new_species

     # Same overwrite-on-collision policy for reference assembly names.
     for assembly_name in reference_assemblies:
         cls._reference_names_to_species[assembly_name] = new_species

     return new_species
コード例 #2
0
def test_species_to_json():
    """Round-trip `human` through to_json/from_json and expect equality."""
    serialized = human.to_json()
    restored = Species.from_json(serialized)
    eq_(human, restored)
コード例 #3
0
def test_species_to_dict():
    """Round-trip `human` through to_dict/from_dict and expect equality."""
    as_dict = human.to_dict()
    restored = Species.from_dict(as_dict)
    eq_(human, restored)
コード例 #4
0
ファイル: species.py プロジェクト: rraadd88/pyensembl
def collect_all_genomes():
    """
    Data-aware generation of Species objects.

    Scans the pyensembl cache directory, laid out as
    ``<cache>/<assembly>/<release>/<genome files>``, and treats every release
    directory holding more than 4 files as an installed genome. One row per
    installed genome is collected into a dataframe with the columns
    'latin name', 'release', 'synonymn' and 'assembly'; one Species is then
    registered per distinct latin name, with the (min, max) release seen for
    each of its assemblies.

    The dataframe is attached to the class as ``Species.dspecies`` (also when
    no genome was found, in which case it is empty). It can be exported as a
    tsv file listing all the genome info; such a file could be used to install
    sets of genomes at once (a bit like a conda environment profile).

    :return: the ``Species`` class itself, after registration.
    :raises ValueError: if the directory name of an installed release
        contains no digits.
    """
    import re
    from glob import glob
    from os.path import basename, dirname

    import pandas as pd
    import datacache
    from pyensembl.species import normalize_species_name, Species

    def str2num(s, cat=False, force=True):
        """
        Converts string to integer
        eg. ensembl92 to 92

        :param s: string
        :param cat: Whether to concatenate detected integers. eg. 20,23 to 2023
        :param force: If True, ignores decimal point error.
        :return: an int when `s` holds a single run of digits; with several
            runs, the concatenated int if `cat` is set, otherwise the list of
            digit strings as found.
        :raises ValueError: if `s` has no digits, or has a '.' while `force`
            is False.
        """
        if '.' in s and not force:
            raise ValueError(
                f"A string can only be converted to integeres, found a '.' in {s}"
            )
        digit_runs = re.findall(r'\d+', s)
        if not digit_runs:
            raise ValueError("No digits found in string {}".format(s))
        if len(digit_runs) == 1:
            return int(digit_runs[0])
        if cat:
            return int(''.join(digit_runs))
        return digit_runs

    # here's how I get the .cache directory eg. '/home/user/.cache/pyensembl'
    # FIXME if genomes are installed at other places than .cache
    pyensembl_cache_dir = f"{dirname(datacache.get_data_dir())}/pyensembl"

    columns = ['latin name', 'release', 'synonymn', 'assembly']
    rows = []
    for assembly_path in glob(f"{pyensembl_cache_dir}/*"):
        assembly = basename(assembly_path)
        for release_path in glob(f"{pyensembl_cache_dir}/{assembly}/*"):
            release = basename(release_path)
            genome_dir = f"{pyensembl_cache_dir}/{assembly}/{release}"
            # Sorted so that the file the synonym is derived from does not
            # depend on the filesystem's listing order.
            genome_files = sorted(glob(f"{genome_dir}/*"))
            # FIXME need more than 4 (.gz) files to be strict
            if len(genome_files) <= 4:
                continue
            # The release number is parsed only for installed genomes, so a
            # stray entry without digits in its name cannot abort the scan.
            # FIXME if release is a float (str2num then returns a list)
            release_number = str2num(release)
            # File names start with the species name, eg. 'Homo_sapiens.…'
            synonym = basename(genome_files[0]).split('.')[0]
            rows.append({
                'latin name': normalize_species_name(synonym),
                'release': release_number,
                'synonymn': synonym,
                'assembly': assembly,
            })

    # dataframe that contains all the info (and can be exported as a tsv).
    # dtype=object keeps the release numbers as plain Python ints.
    dspecies = pd.DataFrame(rows, columns=columns, dtype=object)

    # One Species per latin name; each assembly maps to its (min, max) release.
    for spc in dspecies['latin name'].unique():
        species_rows = dspecies.loc[dspecies['latin name'] == spc]
        assembly2releases = {}
        for assembly in species_rows['assembly'].unique():
            releases = species_rows.loc[
                species_rows['assembly'] == assembly, 'release']
            # FIXME if MAX_ENSEMBL_RELEASE very important and has to be used
            assembly2releases[assembly] = (releases.min(), releases.max())
        Species.register(
            latin_name=spc,
            synonyms=species_rows['synonymn'].unique().tolist(),
            reference_assemblies=assembly2releases)

    # Set once, after the loop, so it is available even when nothing is found.
    Species.dspecies = dspecies
    return Species
コード例 #5
0
def test_species_to_json():
    """Check that `human` survives a to_json/from_json round trip."""
    json_text = human.to_json()
    rebuilt = Species.from_json(json_text)
    eq_(human, rebuilt)
コード例 #6
0
def test_species_to_dict():
    """Check that `human` survives a to_dict/from_dict round trip."""
    state = human.to_dict()
    rebuilt = Species.from_dict(state)
    eq_(human, rebuilt)