def __init__(self, gtf_index_path, species_id='Homo_sapiens:82'):
    """Load & parse GTF file.

    :param gtf_index_path: path to GTF index file
    :type gtf_index_path: str
    :param species_id: Species code & version of GTF cache to use. E.g.: 'Homo_sapiens:75'
    :type species_id: str
    :raises AssertionError: if ``gtf_index_path`` does not exist or does not end with ``index``
    """
    # Raise explicitly instead of using ``assert`` so the checks still run
    # under ``python -O``; AssertionError is kept so existing callers that
    # catch it keep working.
    if not os.path.exists(gtf_index_path):
        raise AssertionError(
            "Provided path '{}' does not exist!".format(gtf_index_path))
    if not gtf_index_path.endswith('index'):
        raise AssertionError(
            "Provided index file '{}' does not have *.index suffix. Is it real index?".format(gtf_index_path))

    self.logger = logging.getLogger(__name__)

    # The lookup returns a (species id, GTF path, genes) triple; all three
    # are stored on the instance.
    self.species_id, self.gtf_path, self.genes = gtf.get_indexed_genes_for_identifier(
        gtf_index_path, self.logger, species_id)
    self.genes_indexed_by_id = gtf.index_genes_by_gene_id(self.genes)
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
if __name__ == "__main__":
    # Script entry point: optionally list the indexed species, then load the
    # genes for the requested species and optionally list their gene types.
    if options.print_species:
        for species in list_indexed_species(options.index):
            # Function-call form prints the same on Python 2 and Python 3.
            print(species)
        sys.exit()

    #
    #   get genes previously parsed by index_gtf_files.py
    #
    if options.verbose >= 2:
        logger.log(MESSAGE, "Reading GTF data for %s from %s " % (options.species, options.index))
    species, gtf_file_name, genes = get_indexed_genes_for_identifier(options.index, logger, options.species)
    if not genes:
        raise Exception("No genes for %s from %s" % (options.species, options.index))
    if "protein_coding" not in genes:
        raise Exception("No protein coding genes for %s" % (options.species,))

    if options.print_gene_types:
        # The gene types are the keys of ``genes``; report each one both to
        # stdout and to the logger, then stop.
        logger.info("Valid gene types:")
        sys.stdout.write("Valid gene types:\n")
        for gene_type in sorted(genes):
            sys.stdout.write(" %s\n" % gene_type)
            logger.info(" %s\n" % gene_type)
        sys.exit()

    open_file_names = set()
def get_genes(index_path='/Users/poldrack/data_unsynced/selftracking/vega/gtf.index',
              species='Homo_sapiens:59'):
    """Return the indexed genes for one species from a GTF index file.

    :param index_path: path to the GTF index file; the default is the
        location that was previously hard-coded in this function
    :type index_path: str
    :param species: species identifier handed to the index lookup; the
        default is the value that was previously hard-coded
    :type species: str
    :returns: the third element of the tuple returned by
        ``gtf_to_genes.get_indexed_genes_for_identifier``
    """
    logger = logging.getLogger("test")
    # The lookup returns three values; only the last one (the genes) is
    # needed here.
    _, _, genes = gtf_to_genes.get_indexed_genes_for_identifier(
        index_path, logger, species)
    return genes
# NOTE(review): the line below is a whitespace-collapsed excerpt that is cut off at both ends.
# As far as can be read from it: it reports the valid gene types for `species` / `gtf_file_name`
# to the logger and to stdout and calls sys.exit(); it then loads the indexed genes with
# get_indexed_genes_for_identifier(), raises if the result is falsy or has no "protein_coding"
# key, creates the empty `open_file_names` set and begins the open_file() helper.
# The condition guarding the gene-type listing, the definition of `gene_types` and the remainder
# of open_file() are not visible here -- restore the original line breaks before changing logic.
logger.info("Valid gene types in %s / %s:" % (species, gtf_file_name)) sys.stdout.write("Valid gene types in %s (%s):\n" % (species, os.path.split(gtf_file_name)[1])) for gene_type in sorted(gene_types): sys.stdout.write(" %s\n" % gene_type) logger.info(" %s\n" % gene_type) sys.exit() # # get genes previously parsed by index_gtf_files.py # if options.verbose >= 2: logger.log( MESSAGE, "Reading GTF data for %s from %s " % (options.species, options.index)) species, gtf_file_name, genes = get_indexed_genes_for_identifier( options.index, logger, options.species) if not genes: raise Exception("No genes for %s from %s" % (options.species, options.index)) if not "protein_coding" in genes: raise Exception("No protein coding genes for %s" % (options.species, )) open_file_names = set() # # output files # def open_file(file_name, open_files): # do not output if not file_name: return None