def main(args=None):
    """Filter a FASTA file, keeping only sequences of selected chromosomes.

    The chromosome name of a sequence is the part of its name in front of
    the first space. A sequence is kept if the chromosome pattern matches
    at the start of that name; it is then renamed to the bare chromosome
    name and appended to the output. All other sequences are skipped and
    reported in the log.

    Parameters
    ----------
    args: object, optional
        Namespace-like object with the attributes `fasta_file`, `species`,
        `chromosome_pattern`, `output_file`, `log_file`, `quiet` and
        `verbose`. If not specified, the values are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Always 0.
    """
    if args is None:
        # no arguments handed in => parse the command line
        args = get_argument_parser().parse_args()

    fasta_path = args.fasta_file
    species = args.species
    pattern = args.chromosome_pattern
    out_path = args.output_file

    # Log to stdout, unless the filtered output itself is printed to
    # stdout ('-'); in that case log messages are redirected to stderr.
    logger = misc.get_logger(
        log_stream=sys.stderr if out_path == '-' else sys.stdout,
        log_file=args.log_file,
        quiet=args.quiet,
        verbose=args.verbose)

    # without an explicit pattern, use the one registered for the species
    if pattern is None:
        pattern = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(pattern)

    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with misc.smart_open_read(fasta_path, mode='r', encoding='ascii',
                              try_gzip=True) as fh:
        with misc.smart_open_write(out_path, mode='w',
                                   encoding='ascii') as ofh:
            for record in FastaReader(fh):
                chrom = record.name.split(' ', 1)[0]
                if chrom_re.match(chrom) is not None:
                    record.name = chrom
                    record.append_fasta(ofh)
                else:
                    logger.info('Ignoring chromosome "%s"...', chrom)

    return 0
def main(args=None):
    """Filter a FASTA file, keeping only sequences of selected chromosomes.

    The chromosome name of a sequence is the part of its name in front of
    the first space. A sequence is kept if the chromosome pattern matches
    at the start of that name; it is then renamed to the bare chromosome
    name and appended to the output. All other sequences are skipped and
    reported in the log.

    Parameters
    ----------
    args: object, optional
        Namespace-like object with the attributes `fasta_file`, `species`,
        `chromosome_pattern`, `output_file`, `log_file`, `quiet` and
        `verbose`. If not specified, the values are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Always 0.
    """
    if args is None:
        # no arguments handed in => parse the command line
        args = get_argument_parser().parse_args()

    fasta_path = args.fasta_file
    species = args.species
    pattern = args.chromosome_pattern
    out_path = args.output_file

    # Log to stdout, unless the filtered output itself is printed to
    # stdout ('-'); in that case log messages are redirected to stderr.
    logger = misc.get_logger(
        log_stream=sys.stderr if out_path == '-' else sys.stdout,
        log_file=args.log_file,
        quiet=args.quiet,
        verbose=args.verbose)

    # without an explicit pattern, use the one registered for the species
    if pattern is None:
        pattern = ensembl.species_chrompat[species]
    chrom_re = re.compile(pattern)

    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with misc.smart_open_read(fasta_path, mode='r', encoding='ascii',
                              try_gzip=True) as fh:
        with misc.smart_open_write(out_path, mode='w',
                                   encoding='ascii') as ofh:
            for record in FastaReader(fh):
                chrom = record.name.split(' ', 1)[0]
                if chrom_re.match(chrom) is not None:
                    record.name = chrom
                    record.append_fasta(ofh)
                else:
                    logger.info('Ignoring chromosome "%s"...', chrom)

    return 0
def write_entrez2gene(file_path, entrez2gene, logger):
    """Write the Entrez ID -> gene symbol mapping as tab-delimited text.

    One row is written per Entrez ID (ID first, gene symbol second). Rows
    are ordered by the integer value of the ID.

    Parameters
    ----------
    file_path: str
        The path of the output file.
    entrez2gene: dict
        The mapping of Entrez IDs to gene symbols. Every key has to be
        convertible with `int`, since that conversion is the sort key.
    logger: logger object
        Receives an info message naming the output file after writing.

    Returns
    -------
    None
    """
    with misc.smart_open_write(file_path, mode='wb') as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep)
        # numeric (not lexicographic) ordering of the IDs
        ordered_ids = sorted(entrez2gene.keys(), key=int)
        writer.writerows(
            [entrez_id, entrez2gene[entrez_id]] for entrez_id in ordered_ids
        )
    logger.info('Output written to file "%s".', file_path)
def main(args=None):
    """Extract annotation rows of protein-coding genes on valid chromosomes.

    Reads a tab-delimited annotation file and copies a row to the output
    if all of the following hold:

    - its third column equals `args.field_name`,
    - the "gene_biotype" entry of its parsed attribute column (ninth
      column) is "protein_coding" or "polymorphic_pseudogene",
    - the chromosome pattern matches at the start of its first column.

    Chromosome names that fail the last test are collected and reported
    in the log, together with the accepted chromosomes and row counts.

    Parameters
    ----------
    args: object, optional
        Namespace-like object with the attributes `annotation_file`,
        `output_file`, `species`, `chromosome_pattern`, `field_name`,
        `log_file`, `quiet` and `verbose`. If not specified, the values
        are obtained by parsing the command line with the parser from
        `get_argument_parser`.

    Returns
    -------
    int
        Exit code (always 0).
    """
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    # (if we print output to stdout, redirect log messages to stderr)
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # without an explicit pattern, use the one registered for the species
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(chrom_pat)
    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_re.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')
    chromosomes = set()
    excluded_chromosomes = set()
    num_lines = 0
    num_exons = 0

    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): presumably `None` makes smart_open_read use stdin
        # -- confirm against misc.smart_open_read
        input_file = None

    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        # With QUOTE_NONE and no escapechar, the csv writer raises an error
        # for fields containing the quotechar; '|' is presumably chosen so
        # that double quotes inside the attribute column pass through
        # unchanged -- TODO confirm.
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for row in reader:
            num_lines += 1

            # single-field rows (presumably comment lines) and rows of
            # other feature types are ignored
            if len(row) <= 1 or row[2] != field_name:
                continue

            attr = parse_attributes(row[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = row[0]
            if chrom_re.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(row)
            num_exons += 1

    logger.info('Done! (Parsed %d lines.)', num_lines)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', num_exons)

    return 0
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Scans a tab-delimited annotation file for rows whose third column
    equals `args.field_name`, whose "gene_biotype" attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome
    name (first column) is matched at its start by the chromosome pattern.
    For each such row, the "gene_id" -> "gene_name" association is
    recorded; rows without a "gene_name" attribute are skipped and
    counted. The associations are written as two-column rows, sorted by
    gene ID.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained
        by parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one Ensembl ID is associated with more than one gene symbol.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    # (if we print output to stdout, redirect log messages to stderr)
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # without an explicit pattern, use the one registered for the species
    if chrom_pat is None:
        chrom_pat = ensembl.species_chrompat[species]
    chrom_re = re.compile(chrom_pat)
    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_re.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')
    genes = {}  # Ensembl gene ID => gene name
    chromosomes = set()
    excluded_chromosomes = set()
    num_lines = 0
    missing = 0  # accepted rows that lack a "gene_name" attribute

    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): presumably `None` makes smart_open_read use stdin
        # -- confirm against misc.smart_open_read
        input_file = None

    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for row in reader:
            num_lines += 1

            # single-field rows (presumably comment lines) and rows of
            # other feature types are ignored
            if len(row) <= 1 or row[2] != field_name:
                continue

            attr = parse_attributes(row[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = row[0]
            match = chrom_re.match(chrom)
            if match is None:
                excluded_chromosomes.add(chrom)
                continue
            # only the matched part of the name is recorded
            chromosomes.add(match.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            if gene_id not in genes:
                genes[gene_id] = gene_name
            elif genes[gene_id] != gene_name:
                raise ValueError('Ensembl ID "%s" ' % (gene_id) +
                                 'associated with multiple gene symbols.')

    logger.info('Done! (Parsed %d lines.)', num_lines)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    if missing > 0:
        # make the silently dropped records visible
        logger.info('Skipped %d entries without a gene name.', missing)

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', len(genes))
    logger.info('No. of gene names: %d', len(set(genes.values())))

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for gene_id in sorted(genes):
            writer.writerow([gene_id, genes[gene_id]])

    return 0
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Scans a tab-delimited annotation file for rows whose third column
    equals `args.field_name`, whose "gene_biotype" attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome
    name (first column) is matched at its start by the chromosome pattern.
    For each such row, the "gene_id" -> "gene_name" association is
    recorded; rows without a "gene_name" attribute are skipped and
    counted. The associations are written as two-column rows, sorted by
    gene ID.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained
        by parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one Ensembl ID is associated with more than one gene symbol.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    # (if we print output to stdout, redirect log messages to stderr)
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # without an explicit pattern, use the one registered for the species
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(chrom_pat)
    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_re.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')
    genes = {}  # Ensembl gene ID => gene name
    chromosomes = set()
    excluded_chromosomes = set()
    num_lines = 0
    missing = 0  # accepted rows that lack a "gene_name" attribute

    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): presumably `None` makes smart_open_read use stdin
        # -- confirm against misc.smart_open_read
        input_file = None

    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for row in reader:
            num_lines += 1

            # single-field rows (presumably comment lines) and rows of
            # other feature types are ignored
            if len(row) <= 1 or row[2] != field_name:
                continue

            attr = parse_attributes(row[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = row[0]
            match = chrom_re.match(chrom)
            if match is None:
                excluded_chromosomes.add(chrom)
                continue
            # only the matched part of the name is recorded
            chromosomes.add(match.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            if gene_id not in genes:
                genes[gene_id] = gene_name
            elif genes[gene_id] != gene_name:
                raise ValueError('Ensembl ID "%s" ' % (gene_id) +
                                 'associated with multiple gene symbols.')

    logger.info('Done! (Parsed %d lines.)', num_lines)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    if missing > 0:
        # make the silently dropped records visible
        logger.info('Skipped %d entries without a gene name.', missing)

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', len(genes))
    logger.info('No. of gene names: %d', len(set(genes.values())))

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for gene_id in sorted(genes):
            writer.writerow([gene_id, genes[gene_id]])

    return 0
def main(args=None):
    """Extract annotation rows of protein-coding genes on valid chromosomes.

    Reads a tab-delimited annotation file and copies a row to the output
    if all of the following hold:

    - its third column equals `args.field_name`,
    - the "gene_biotype" entry of its parsed attribute column (ninth
      column) is "protein_coding" or "polymorphic_pseudogene",
    - the chromosome pattern matches at the start of its first column.

    Chromosome names that fail the last test are collected and reported
    in the log, together with the accepted chromosomes and row counts.

    Parameters
    ----------
    args: object, optional
        Namespace-like object with the attributes `annotation_file`,
        `output_file`, `species`, `chromosome_pattern`, `field_name`,
        `log_file`, `quiet` and `verbose`. If not specified, the values
        are obtained by parsing the command line with the parser from
        `get_argument_parser`.

    Returns
    -------
    int
        Exit code (always 0).
    """
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    # (if we print output to stdout, redirect log messages to stderr)
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # without an explicit pattern, use the one registered for the species
    if chrom_pat is None:
        chrom_pat = ensembl.species_chrompat[species]
    chrom_re = re.compile(chrom_pat)
    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_re.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')
    chromosomes = set()
    excluded_chromosomes = set()
    num_lines = 0
    num_exons = 0

    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): presumably `None` makes smart_open_read use stdin
        # -- confirm against misc.smart_open_read
        input_file = None

    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        # With QUOTE_NONE and no escapechar, the csv writer raises an error
        # for fields containing the quotechar; '|' is presumably chosen so
        # that double quotes inside the attribute column pass through
        # unchanged -- TODO confirm.
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for row in reader:
            num_lines += 1

            # single-field rows (presumably comment lines) and rows of
            # other feature types are ignored
            if len(row) <= 1 or row[2] != field_name:
                continue

            attr = parse_attributes(row[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = row[0]
            if chrom_re.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(row)
            num_exons += 1

    logger.info('Done! (Parsed %d lines.)', num_lines)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', num_exons)

    return 0