Esempio n. 1
0
 def __init__(self, gtf_index_path, species_id='Homo_sapiens:82'):
     """Load the indexed genes for one species from a GTF index file.

     The index is read through ``gtf.get_indexed_genes_for_identifier`` and the
     returned genes are additionally indexed by gene id through
     ``gtf.index_genes_by_gene_id``.

     Sets ``self.logger``, ``self.species_id``, ``self.gtf_path``,
     ``self.genes`` and ``self.genes_indexed_by_id``.

     :param gtf_index_path: path to GTF index file
     :type gtf_index_path: str
     :param species_id: Species code & version of GTF cache to use. E.g.: 'Homo_sapiens:75'
     :type species_id: str
     :raises AssertionError: if ``gtf_index_path`` does not exist or does not
         end with ``index``
     """
     # Validate with explicit raises rather than ``assert`` statements:
     # asserts are stripped under ``python -O``, which would silently disable
     # these checks. AssertionError is kept so callers see the same exception
     # type and message as before.
     if not os.path.exists(gtf_index_path):
         raise AssertionError("Provided path '{}' does not exist!".format(gtf_index_path))
     if not gtf_index_path.endswith('index'):
         raise AssertionError(
             "Provided index file '{}' does not have *.index suffix. Is it real index?".format(gtf_index_path))

     self.logger = logging.getLogger(__name__)
     # The helper returns a (species id, GTF path, genes) triple.
     self.species_id, self.gtf_path, self.genes = gtf.get_indexed_genes_for_identifier(
         gtf_index_path, self.logger, species_id)
     self.genes_indexed_by_id = gtf.index_genes_by_gene_id(self.genes)

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
if __name__ == "__main__":

    # Optionally list every species returned for the index, then stop.
    if options.print_species:
        for species in list_indexed_species(options.index):
            # The parenthesised single-argument form behaves identically as a
            # Python 2 print statement and as the Python 3 print function.
            print(species)
        sys.exit()

    #
    #   get genes previously parsed by index_gtf_files.py
    #
    if options.verbose >= 2:
        logger.log(MESSAGE, "Reading GTF data for %s from %s " % (options.species, options.index))
    species, gtf_file_name, genes = get_indexed_genes_for_identifier(options.index, logger, options.species)
    # Abort early when nothing usable was loaded for the requested species.
    if not genes:
        raise Exception("No genes for %s from %s" % (options.species, options.index))
    if "protein_coding" not in genes:
        raise Exception("No protein coding genes for %s" % (options.species,))

    # Optionally report the keys of `genes` (the gene types) to both stdout
    # and the log, then stop.
    if options.print_gene_types:
        logger.info("Valid gene types:")
        sys.stdout.write("Valid gene types:\n")
        for gene_type in sorted(genes.keys()):
            sys.stdout.write("    %s\n" % gene_type)
            logger.info("    %s\n" % gene_type)
        sys.exit()

    open_file_names = set()
Esempio n. 3
0
def get_genes(gtf_index_path='/Users/poldrack/data_unsynced/selftracking/vega/gtf.index',
              species_id='Homo_sapiens:59'):
    """Return the indexed genes for one species from a GTF index file.

    Both arguments default to the values that used to be hard-coded, so
    calling ``get_genes()`` with no arguments behaves exactly as before.

    :param gtf_index_path: path to the GTF index file to read
    :type gtf_index_path: str
    :param species_id: species identifier to look up in the index
    :type species_id: str
    :return: the third element of the tuple returned by
        ``gtf_to_genes.get_indexed_genes_for_identifier``
    """
    logger = logging.getLogger("test")
    # Only the genes are needed; the matched species and GTF path are discarded.
    _matched_species, _gtf_path, genes = gtf_to_genes.get_indexed_genes_for_identifier(
        gtf_index_path, logger, species_id)
    return genes
Esempio n. 4
0
        logger.info("Valid gene types in %s / %s:" % (species, gtf_file_name))
        sys.stdout.write("Valid gene types in %s (%s):\n" %
                         (species, os.path.split(gtf_file_name)[1]))
        for gene_type in sorted(gene_types):
            sys.stdout.write("    %s\n" % gene_type)
            logger.info("    %s\n" % gene_type)
        sys.exit()

    #
    #   get genes previously parsed by index_gtf_files.py
    #
    # The index named by options.index is queried for options.species; the
    # call yields a (species, gtf_file_name, genes) triple.
    if options.verbose >= 2:
        # Progress message is only emitted at verbosity level 2 or higher.
        logger.log(
            MESSAGE, "Reading GTF data for %s from %s " %
            (options.species, options.index))
    species, gtf_file_name, genes = get_indexed_genes_for_identifier(
        options.index, logger, options.species)
    # Abort when nothing was loaded for the requested species.
    if not genes:
        raise Exception("No genes for %s from %s" %
                        (options.species, options.index))
    # `genes` must contain a "protein_coding" entry
    # (presumably it is keyed by gene type -- confirm against the indexer).
    if not "protein_coding" in genes:
        raise Exception("No protein coding genes for %s" % (options.species, ))

    open_file_names = set()

    #
    #   output files
    #
    def open_file(file_name, open_files):
        # do not output
        if not file_name:
            return None