예제 #1
0
    def select_species(self):
        import tempfile
        outfile = tempfile.mktemp() + '.txt.gz'
        try:
            self.logger.info('Downloading "dataset_names.txt.gz"...')
            out_f = open(outfile, 'wb')
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            ftp.login()
            species_file = '/pub/metazoa/release-%s/mysql/metazoa_mart_%s/dataset_names.txt.gz' % (
                self.release, self.release)
            ftp.retrbinary("RETR " + species_file, out_f.write)
            out_f.close()
            self.logger.info('Done.')

            #load saved file
            self.logger.info('Parsing "dataset_names.txt.gz"...')
            species_li = tab2list(outfile, (0, 4, 5), header=0)
            species_li = [[x[0]] + [x[2]] + [x[1]] for x in species_li]
            species_li = [
                x[:-1] + [is_int(x[-1]) and int(x[-1]) or None]
                for x in species_li
            ]
            self.logger.info('Done.')
        finally:
            os.remove(outfile)
            pass

        import pprint
        self.logger.error(pprint.pformat(species_li))
        return species_li
예제 #2
0
    def get_all_species(self):
        import tempfile
        outfile = tempfile.mktemp() + '.txt.gz'
        try:
            self.logger.info('Downloading "species.txt.gz"...')
            out_f = open(outfile, 'wb')
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            ftp.login()
            species_file = '/pub/release-%s/mysql/ensembl_production_%s/species.txt.gz' % (self.release, self.release)
            ftp.retrbinary("RETR " + species_file, out_f.write)
            out_f.close()
            self.logger.info('Done.')

            #load saved file
            self.logger.info('Parsing "species.txt.gz"...')
            species_li = tab2list(outfile, (1, 2, 7), header=0)   # db_name,common_name,taxid
            species_li = [x[:-1] + [is_int(x[-1]) and int(x[-1]) or None] for x in species_li]
            # as of ensembl 87, there are also mouse strains. keep only the "original" one
            species_li = [s for s in species_li if not s[0].startswith("mus_musculus_")]
            self.logger.info('Done.')
        finally:
            os.remove(outfile)
            pass

        return species_li
예제 #3
0
    def _select_species(self):
        """
        Return a list of tuple containing species to download data for.
        [(species_name1, common_name1, taxid1),(species_name2, common_name2, taxid2), ...]
        """
        import tempfile
        outfile = tempfile.mktemp() + '.txt.gz'
        try:
            self.logger.info('Downloading Species List...')
            out_f = open(outfile, 'wb')
            ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
            ftp.login()
            species_file = self.get_species_file()
            ftp.retrbinary("RETR " + species_file, out_f.write)
            out_f.close()
            self.logger.info('Done.')

            # load saved file
            self.logger.info('Loading Species List...')
            species_li = tab2list(outfile, (0, 4, 5), header=0)
            species_li = [[x[0]] + [x[2]] + [x[1]] for x in species_li]
            species_li = [
                x[:-1] + [is_int(x[-1]) and int(x[-1]) or None] for x in species_li]
            self.logger.info('Done.')
        finally:
            os.remove(outfile)

        import pprint
        self.logger.debug('\n %s', pprint.pformat(species_li))
        return species_li
예제 #4
0
    def load_data(self, data_folder):
        """
        Loads gene data from NCBI's refseq2gene.gz file.
        Parses it based on genomic position data and refseq status provided by the
        list of taxids from get_ref_microbe_taxids() as lookup table
        :return:
        """

        taxids_file = os.path.join(data_folder, "../ref_microbe_taxids.pyobj")
        datafile = os.path.join(data_folder, 'gene2refseq.gz')

        taxids = loadobj(taxids_file)
        taxid_set = set(taxids)

        def _includefn(ld):
            return ld[0] in taxid_set  # match taxid from taxid_set

        cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
        gene2genomic_pos_li = tab2list(datafile,
                                       cols_included,
                                       header=1,
                                       includefn=_includefn)
        count = 0
        last_id = None
        for gene in gene2genomic_pos_li:
            count += 1
            strand = 1 if gene[5] == '+' else -1
            _id = gene[1]

            mgi_dict = {
                '_id': _id,
                'genomic_pos': {
                    'entrezgene': _id,
                    'start': int(gene[3]),
                    'end': int(gene[4]),
                    'chr': gene[2],
                    'strand': strand
                }
            }
            if _id != last_id:
                # rows with dup _id will be skipped
                yield mgi_dict
            last_id = _id
예제 #5
0
def get_geneid_d(data_folder,
                 species_li=None,
                 load_cache=True,
                 save_cache=True,
                 only_for={}):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful, when other annotations were mapped to geneids may
       contain retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            only_for and ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: only_for and ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d