def main(args=None):
    """Extract gene-level expression data from StringTie output.

    The StringTie file is read twice. The first pass records, for every
    StringTie gene ID, the set of reference gene symbols (``ref_gene_name``)
    seen on its transcripts. The second pass adds up the FPKM and TPM values
    of all transcripts per gene. Transcripts without a reference gene symbol
    are either ignored (``no_novel_transcripts``) or assigned through their
    gene ID, provided that the ID maps to exactly one symbol. The result is
    written to a tab-delimited file with one row (gene, FPKM, TPM) per gene.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    """

    def percent(part, whole):
        # Percentage of `part` in `whole`; 0.0 when `whole` is zero, so that
        # empty input gets reported instead of causing a division by zero.
        return 100 * (part / float(whole)) if whole else 0.0

    def iter_transcript_attributes():
        # Yield the parsed attribute column of every transcript record.
        with open(stringtie_file) as fh:
            for fields in csv.reader(fh, dialect='excel-tab'):
                if not fields or fields[0].startswith('#'):
                    # skip blank lines and header/comment lines
                    continue
                assert len(fields) == 9
                if fields[2] != 'transcript':
                    # skip exon lines
                    continue
                yield parse_attributes(fields[8])

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    logger = misc.get_logger(log_file=args.log_file,
                             quiet=args.quiet,
                             verbose=args.verbose)

    # read list of gene symbols
    # NOTE(review): the lookup below uses misc.bisect_index, so this list is
    # presumably expected to be sorted -- confirm against misc.read_single.
    logger.info('Reading gene data...')
    genes = misc.read_single(gene_file)

    logger.info('Parsing StringTie output...')

    # first pass: gene ID => set of reference gene symbols
    logger.info('Associating StringTie gene IDs with gene symbols...')
    stringtie_genes = {}
    for attr in iter_transcript_attributes():
        try:
            ref_gene = attr['ref_gene_name']
        except KeyError:
            continue
        stringtie_genes.setdefault(attr['gene_id'], set()).add(ref_gene)

    num_ids = len(stringtie_genes)
    logger.info('Associated %d gene IDs with gene symbols.', num_ids)
    num_ambiguous = sum(1 for v in stringtie_genes.values() if len(v) > 1)
    logger.info('%d / %d associated with multiple gene symbols (%.1f%%).',
                num_ambiguous, num_ids, percent(num_ambiguous, num_ids))

    # second pass: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0
    for attr in iter_transcript_attributes():
        f = float(attr['FPKM'])

        try:
            g = attr['ref_gene_name']
        except KeyError:
            if no_novel_transcripts:
                # ignore this transcript
                fpkm_novel_trans += f
                continue

            # see if we can assign a gene name based on the gene ID
            try:
                assoc = stringtie_genes[attr['gene_id']]
            except KeyError:
                # gene_id not associated with any reference gene
                fpkm_novel_gene += f
                continue

            if len(assoc) > 1:
                # gene ID associated with multiple ref. genes => ignored
                fpkm_ambig += f
                continue

            # gene ID associated with exactly one ref. gene
            g = next(iter(assoc))

        try:
            idx = misc.bisect_index(genes, g)
        except ValueError:
            fpkm_unknown_gene_name += f
            logger.warning('Unknown gene name: "%s".', g)
            continue

        fpkm[idx] += f
        tpm[idx] += float(attr['TPM'])

    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info('Ignored %.1f / %.1f FPKM (%.1f%%)', ignored_fpkm, total_fpkm,
                percent(ignored_fpkm, total_fpkm))

    # The novel-gene and ambiguous counters can only be non-zero when novel
    # transcripts are not ignored, and vice versa.
    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info('Ignored %.1f FPKM from novel transcripts (%.1f%%).',
                        fpkm_novel_trans,
                        percent(fpkm_novel_trans, total_fpkm))
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts of novel genes '
                '(%.1f%%).', fpkm_novel_gene,
                percent(fpkm_novel_gene, total_fpkm))

        if fpkm_ambig > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts with ambiguous '
                'gene membership (%.1f%%).', fpkm_ambig,
                percent(fpkm_ambig, total_fpkm))

    if fpkm_unknown_gene_name > 0:
        logger.info(
            'Ignored %.1f FPKM from transcripts of genes with unknown '
            'names (%.1f%%).', fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm))

    # write output file (one row per gene: symbol, FPKM, TPM)
    E = np.c_[fpkm, tpm]
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ['%.5f' % e for e in E[i, :]])

    return 0
# Example #2
# 0
def get_gene_exons(gene_table, genome_annotation_file, chunksize=10000):
    """Parse a GTF file and collect the exon intervals of each gene.

    Only exons whose ``gene_id`` is contained in the index of `gene_table`
    are collected. (The original note says "only for protein-coding genes";
    presumably the gene table is restricted to those -- this function does
    not check the biotype itself.)

    Parameters
    ----------
    gene_table : `pandas.DataFrame`
        Table indexed by gene ID with the columns "chromosome" and
        "position". A negative position is treated as a separate group when
        sorting (per the original comment, this distinguishes the strand).
    genome_annotation_file : str
        Path of the GTF file (passed to `pandas.read_csv`).
    chunksize : int, optional
        Number of GTF lines to read per chunk. [10000]

    Returns
    -------
    `collections.OrderedDict`
        Maps each gene ID to a list of ``[start - 1, end]`` exon intervals.
        Genes are ordered by chromosome, sign of the position, and absolute
        position. Genes without any exon map to an empty list.
    """

    def sort_key(id_):
        position = gene_table.loc[id_, 'position']
        return [gene_table.loc[id_, 'chromosome'], position < 0,
                abs(position)]

    # sort genes by chromosome, strand, and then position
    sorted_gene_ids = sorted(gene_table.index, key=sort_key)
    gene_table = gene_table.loc[sorted_gene_ids]

    # dictionary for holding list of intervals for each gene
    gene_exons = OrderedDict((id_, []) for id_ in gene_table.index)

    valid = 0
    total = 0

    _LOGGER.info('Parsing GTF file "%s" in chunks...', genome_annotation_file)

    chunks = pd.read_csv(genome_annotation_file,
                         dtype={0: str},
                         sep='\t',
                         comment='#',
                         header=None,
                         chunksize=chunksize)
    for df in chunks:

        # select only exon entries
        df_sel = df.loc[df.iloc[:, 2] == 'exon']

        # extract gene IDs
        gene_ids = df_sel.iloc[:, 8].apply(
            lambda x: gtf.parse_attributes(x)['gene_id'])

        for id_, chrom, start, end in zip(gene_ids, df_sel.iloc[:, 0],
                                          df_sel.iloc[:, 3],
                                          df_sel.iloc[:, 4]):

            total += 1

            # gene_exons has exactly the gene table's IDs as keys, so this
            # lookup doubles as the "is the gene in the table" test.
            try:
                intervals = gene_exons[id_]
            except KeyError:
                # this gene is not contained in the gene table
                continue

            gene_chrom = gene_table.loc[id_, 'chromosome']
            if chrom != gene_chrom:
                _LOGGER.warning(
                    '%s exon ignored (wrong chromosome: '
                    '%s instead of %s).', id_, chrom, gene_chrom)
            else:
                valid += 1
                # store the start shifted by one, the end unchanged
                intervals.append([start - 1, end])

    # guard against a GTF file without any exon entries
    valid_percent = 100 * (valid / float(total)) if total else 0.0
    _LOGGER.info('%d / %d exons from valid genes (%.1f %%).', valid, total,
                 valid_percent)

    return gene_exons
# Example #3
# 0
def get_protein_coding_genes(
    path_or_buffer,
    chunksize=100000,
    chromosome_pattern=r"(?:\d\d?|MT|X|Y)$",
    include_polymorphic_pseudogenes=True,
    only_manual=False,
    remove_duplicates=True,
    fancy_sorting=True,
):
    r"""Get list of all protein-coding genes based on Ensembl GTF file.

    Parameters
    ----------
    path_or_buffer : str or buffer
        The GTF file (either the file path or a buffer)
    chunksize : int, optional
        Number of lines to read per chunk (passed to `pandas.read_csv`).
        [100000]
    chromosome_pattern : str, optional
        Regular expression specifying valid chromosomes. [r'(?:\d\d?|MT|X|Y)$']
    include_polymorphic_pseudogenes : bool, optional
        Whether to include genes annotated as "polymorphic pseudogenes".
        [True]
    only_manual : bool, optional
        Whether to exclude annotations with source "ensembl", which
        are based only on an automatic annotation pipeline. [False]
    remove_duplicates : bool, optional
        Whether to remove duplicate annotations, i.e. those with different
        Ensembl IDs for the same gene. [True]
    fancy_sorting : bool, optional
        Whether to sort chromosomes numerically, with "X", "Y", and "MT" at the
        end. [True]

    Returns
    -------
    `pandas.DataFrame`
        Table with rows corresponding to protein-coding genes and the columns
        "Gene", "Ensembl_ID", "Chromosome", "Position", "Lnegth" (sic),
        "Source" and "Type". "Position" is the start coordinate minus one for
        genes on the '+' strand and the negated end coordinate for genes on
        the '-' strand.

    Raises
    ------
    ValueError
        If a selected gene entry has a strand other than "+" or "-".

    Notes
    -----

    Annotation sources and redundant gene annotations
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    According to the Ensembl website (1), the Ensembl gene annotation
    GTF files for human, mouse, zebrafish, rat and pig essentially
    contain two sets of annotations:

    One set consists of all annotations with the "ensembl"
    source annotation (column 2). These annotations are the product of
    the automated Ensembl "genebuild" pipeline.

    The other set consists of genes that are manually annotated by
    the HAVANA team (source "havana"), some of which have been merged with the
    automatic annotations (source "ensembl_havana").

    There seems to be no overlap between genes annotated with "havana" and
    "ensembl_havana" sources, respectively. However, there are a few genes for
    which only annotations with source "ensembl" exist.

    Our policy is therefore to prefer annotations with source "ensembl_havana"
    and "havana" over those with source "ensembl", and to only keep annotations
    with source "ensembl" if there are no manually curated alternative
    annotations.

    A special case is represented by mitochondrial genes, which always have the
    source "insdc".

    (1) see http://www.ensembl.org/Help/Faq?id=152


    Removal of duplicates
    ~~~~~~~~~~~~~~~~~~~~~

    Unfortunately, the Ensembl gene annotations contain duplicates for a
    handful of genes. For example, for MATR3, there are ENSG00000015479 and
    ENSG00000280987, both of type "ensembl_havana". There seems to be no clear
    criterion by which we could rationally and automatically choose one ID
    over the other, at least based on information contained in the GTF file.

    We therefore remove duplicates according to following policy:
    - For genes on '+' strand, keep the gene with the left-most starting
      position.
    - For genes on '-' strand, keep the gene with the right-most starting
      position.
    (In case the starting positions are equal, we keep the one that occurs
    first in the GTF file.)

    We would like to use the pandas.DataFrame.drop_duplicates() function for
    this. So we're temporarily reordering genes using their signed position,
    and then we're using the original index (position) to restore the original
    order.
    """
    chrompat = re.compile(chromosome_pattern)

    num_valid = 0
    num_lines = 0
    num_chunks = 0

    t0 = time.time()
    reader = pd.read_csv(
        path_or_buffer, encoding="ascii", sep="\t", header=None, comment="#", dtype={0: str}, chunksize=chunksize
    )
    data = []
    # NOTE(review): "Lnegth" looks like a typo for "Length". It is kept
    # because the column name is part of the returned table and callers may
    # rely on it -- confirm before renaming.
    header = ["Gene", "Ensembl_ID", "Chromosome", "Position", "Lnegth", "Source", "Type"]

    valid_biotypes = {"protein_coding"}
    if include_polymorphic_pseudogenes:
        valid_biotypes.add("polymorphic_pseudogene")

    # "insdc" is required to catch the mitochondrial protein-coding genes
    valid_sources = {"ensembl_havana", "havana", "insdc"}
    if not only_manual:
        valid_sources.add("ensembl")

    excluded_chromosomes = set()

    for chunk in reader:
        num_chunks += 1
        num_lines += chunk.shape[0]
        sel = (chunk.iloc[:, 2] == "gene") & chunk.iloc[:, 1].isin(valid_sources)
        for _, row in chunk.loc[sel].iterrows():
            attr = gtf.parse_attributes(row[8].lstrip(" "))

            biotype = attr["gene_biotype"]
            if biotype not in valid_biotypes:
                continue

            chrom = str(row[0])
            source = row[1]
            if chrompat.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            num_valid += 1

            gene_name = attr["gene_name"]
            ensembl_id = attr["gene_id"]

            # The sign of the position encodes the strand. (A preceding
            # assert used to make the ValueError below unreachable.)
            strand = row[6]
            if strand == "+":
                pos = int(row[3]) - 1
            elif strand == "-":
                pos = -int(row[4])
            else:
                raise ValueError("Invalid strand information: %s" % str(strand))
            length = abs(int(row[4]) - int(row[3]))

            data.append([gene_name, ensembl_id, chrom, pos, length, source, biotype])

    t1 = time.time()

    df = pd.DataFrame(columns=header, data=data)

    if not only_manual:
        # keep only annotations with source "ensembl"
        # if no manual annotations are available
        sel = df["Source"] == "ensembl"
        redundant_ensembl_genes = set(df.loc[sel, "Gene"].values) & set(df.loc[~sel, "Gene"].values)
        sel = sel & df["Gene"].isin(redundant_ensembl_genes)
        num_genes_before = df.shape[0]
        df = df.loc[~sel]
        num_genes_after = df.shape[0]
        logger.info(
            'Removed %d gene annotations with source "ensembl" that ' "also had manual annotations.",
            num_genes_before - num_genes_after,
        )
    if remove_duplicates:
        # remove duplicate annotations (two or more Ensembl IDs for the same
        # gene)
        num_genes_before = df.shape[0]

        # Rebind instead of using inplace=True: `df` may be a slice of the
        # original table at this point.
        # sort by signed position value (stable, so file order breaks ties)
        df = df.sort_values("Position", kind="mergesort")

        # remove duplicates by keeping the first occurrence
        df = df.drop_duplicates(["Chromosome", "Gene"])

        # restore original order using the numeric index
        df = df.sort_index()

        num_genes_after = df.shape[0]
        logger.info("Removed %d duplicate gene entries", num_genes_before - num_genes_after)

    # sort normally (first by chromosome, then by absolute position)
    df_sort = pd.concat([df["Chromosome"], df["Position"].abs()], axis=1)
    df_sort = df_sort.sort_values(["Chromosome", "Position"], kind="mergesort")
    df = df.loc[df_sort.index]

    if fancy_sorting:
        # Perform "fancy sorting" of genes. Chromosomes with numbers (1-22)
        # are ordered numerically, and followed by the X, Y, and MT
        # chromosomes.
        def transform_chrom(chrom):
            try:
                number = int(chrom)
            except ValueError:
                # "_" sorts after the upper-case letters, putting MT last
                return "_MT" if chrom == "MT" else chrom
            return "%02d" % number

        chrom_for_sorting = df["Chromosome"].apply(transform_chrom)
        a = chrom_for_sorting.argsort(kind="mergesort")
        df = df.iloc[a]
        logger.info("Performed fancy sorting of chromosomes.")

    logger.info("Read %d lines (in %d chunks).", num_lines, num_chunks)
    logger.info("Found %d valid protein-coding gene entries.", num_valid)
    logger.info("Final number of unique protein-coding genes: %d", df.shape[0])
    logger.info("Parsing time: %.1f s", t1 - t0)

    # additional statistics
    all_chromosomes = list(df["Chromosome"].unique())
    logger.info("Valid chromosomes (%d): %s", len(all_chromosomes), ", ".join(all_chromosomes))
    logger.info("Excluded chromosomes (%d): %s", len(excluded_chromosomes), ", ".join(sorted(excluded_chromosomes)))

    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    logger.info("Sources:")
    for source, count in df["Source"].value_counts().items():
        logger.info("\t%s: %d", source, count)

    logger.info("Gene types:")
    for biotype, count in df["Type"].value_counts().items():
        logger.info("\t%s: %d", biotype, count)

    return df
def main(args=None):
    """Extract gene-level expression data from StringTie output.

    The StringTie file is read twice. The first pass records, for every
    StringTie gene ID, the set of reference gene symbols (``ref_gene_name``)
    seen on its transcripts. The second pass adds up the FPKM and TPM values
    of all transcripts per gene. Transcripts without a reference gene symbol
    are either ignored (``no_novel_transcripts``) or assigned through their
    gene ID, provided that the ID maps to exactly one symbol. The result is
    written to a tab-delimited file with one row (gene, FPKM, TPM) per gene.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    """

    def percent(part, whole):
        # Percentage of `part` in `whole`; 0.0 when `whole` is zero, so that
        # empty input gets reported instead of causing a division by zero.
        return 100 * (part / float(whole)) if whole else 0.0

    def iter_transcript_attributes():
        # Yield the parsed attribute column of every transcript record.
        with open(stringtie_file) as fh:
            for fields in csv.reader(fh, dialect="excel-tab"):
                if not fields or fields[0].startswith("#"):
                    # skip blank lines and header/comment lines
                    continue
                assert len(fields) == 9
                if fields[2] != "transcript":
                    # skip exon lines
                    continue
                yield parse_attributes(fields[8])

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet, verbose=args.verbose)

    # read list of gene symbols
    # NOTE(review): the lookup below uses misc.bisect_index, so this list is
    # presumably expected to be sorted -- confirm against misc.read_single.
    logger.info("Reading gene data...")
    genes = misc.read_single(gene_file)

    logger.info("Parsing StringTie output...")

    # first pass: gene ID => set of reference gene symbols
    logger.info("Associating StringTie gene IDs with gene symbols...")
    stringtie_genes = {}
    for attr in iter_transcript_attributes():
        try:
            ref_gene = attr["ref_gene_name"]
        except KeyError:
            continue
        stringtie_genes.setdefault(attr["gene_id"], set()).add(ref_gene)

    num_ids = len(stringtie_genes)
    logger.info("Associated %d gene IDs with gene symbols.", num_ids)
    num_ambiguous = sum(1 for v in stringtie_genes.values() if len(v) > 1)
    logger.info(
        "%d / %d associated with multiple gene symbols (%.1f%%).",
        num_ambiguous,
        num_ids,
        percent(num_ambiguous, num_ids),
    )

    # second pass: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0
    for attr in iter_transcript_attributes():
        f = float(attr["FPKM"])

        try:
            g = attr["ref_gene_name"]
        except KeyError:
            if no_novel_transcripts:
                # ignore this transcript
                fpkm_novel_trans += f
                continue

            # see if we can assign a gene name based on the gene ID
            try:
                assoc = stringtie_genes[attr["gene_id"]]
            except KeyError:
                # gene_id not associated with any reference gene
                fpkm_novel_gene += f
                continue

            if len(assoc) > 1:
                # gene ID associated with multiple ref. genes => ignored
                fpkm_ambig += f
                continue

            # gene ID associated with exactly one ref. gene
            g = next(iter(assoc))

        try:
            idx = misc.bisect_index(genes, g)
        except ValueError:
            fpkm_unknown_gene_name += f
            logger.warning('Unknown gene name: "%s".', g)
            continue

        fpkm[idx] += f
        tpm[idx] += float(attr["TPM"])

    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info("Ignored %.1f / %.1f FPKM (%.1f%%)", ignored_fpkm, total_fpkm, percent(ignored_fpkm, total_fpkm))

    # The novel-gene and ambiguous counters can only be non-zero when novel
    # transcripts are not ignored, and vice versa.
    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info(
                "Ignored %.1f FPKM from novel transcripts (%.1f%%).",
                fpkm_novel_trans,
                percent(fpkm_novel_trans, total_fpkm),
            )
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts of novel genes " "(%.1f%%).",
                fpkm_novel_gene,
                percent(fpkm_novel_gene, total_fpkm),
            )

        if fpkm_ambig > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts with ambiguous " "gene membership (%.1f%%).",
                fpkm_ambig,
                percent(fpkm_ambig, total_fpkm),
            )

    if fpkm_unknown_gene_name > 0:
        logger.info(
            "Ignored %.1f FPKM from transcripts of genes with unknown " "names (%.1f%%).",
            fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm),
        )

    # write output file (one row per gene: symbol, FPKM, TPM)
    E = np.c_[fpkm, tpm]
    with open(output_file, "w") as ofh:
        writer = csv.writer(ofh, dialect="excel-tab", lineterminator=os.linesep, quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ["%.5f" % e for e in E[i, :]])

    return 0
def main(args=None):
    """Extract all exon annotations of protein-coding genes.

    Copies every row of the annotation file to the output file whose third
    column equals ``args.field_name``, whose ``gene_biotype`` attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern. Rows are written unchanged.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    # configure root logger
    # if we print output to stdout, redirect log messages to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=args.log_file,
                             quiet=args.quiet,
                             verbose=args.verbose)

    # fall back to the species-specific default pattern
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        # None is handed to smart_open_read in place of "-"
        # (presumably selecting standard input -- confirm in misc)
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        # The attribute column contains double quotes. With QUOTE_NONE the
        # writer refuses fields containing the quote character, hence the
        # substitute quotechar.
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE,
                            quotechar='|')
        for l in reader:
            i += 1

            # Rows with fewer than three fields (e.g. comment lines) have no
            # feature column; the length check must cover index 2.
            if len(l) <= 2 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = l[0]
            if chrom_pat.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(l)
            exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Scans the annotation file for rows whose third column equals
    ``args.field_name``, whose ``gene_biotype`` attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern. For these rows, the ``gene_id`` and
    ``gene_name`` attributes are collected and written, sorted by gene ID,
    as two-column rows (gene ID, gene name).

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one Ensembl ID occurs with more than one gene name.
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    # configure logger
    # if we print output to stdout, redirect log messages to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout

    logger = misc.get_logger(log_stream = log_stream, log_file = args.log_file,
            quiet = args.quiet, verbose = args.verbose)

    # fall back to the species-specific default pattern
    if chrom_pat is None:
        chrom_pat = ensembl.species_chrompat[species]
    chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')

    # Ensembl gene ID => gene name
    genes = {}

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0
    missing = 0  # entries without a "gene_name" attribute
    logger.info('Parsing data...')
    if input_file == '-':
        # None is handed to smart_open_read in place of "-"
        # (presumably selecting standard input -- confirm in misc)
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh:
        reader = csv.reader(fh, dialect = 'excel-tab')
        for l in reader:

            i += 1

            # Rows with fewer than three fields (e.g. comment lines) have no
            # feature column; the length check must cover index 2.
            if len(l) <= 2 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            chrom = l[0]

            # test whether chromosome is valid
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(m.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            if gene_id not in genes:
                genes[gene_id] = gene_name
            elif genes[gene_id] != gene_name:
                raise ValueError(
                    'Ensembl ID "%s" associated with multiple gene symbols.'
                    % gene_id)

    logger.info('Done! (Parsed %d lines.)', i)

    if missing > 0:
        # previously these entries were dropped without any trace
        logger.warning('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    num_ids = len(genes)
    num_names = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', num_ids)
    logger.info('No. of gene names: %d', num_names)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect = 'excel-tab',
                lineterminator = os.linesep, quoting = csv.QUOTE_NONE)
        for g in sorted(genes):
            writer.writerow([g, genes[g]])

    return 0
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Scans the annotation file for rows whose third column equals
    ``args.field_name``, whose ``gene_biotype`` attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern. For these rows, the ``gene_id`` and
    ``gene_name`` attributes are collected and written, sorted by gene ID,
    as two-column rows (gene ID, gene name).

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one Ensembl ID occurs with more than one gene name.
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    # configure logger
    # if we print output to stdout, redirect log messages to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout

    logger = misc.get_logger(log_stream = log_stream, log_file = args.log_file,
            quiet = args.quiet, verbose = args.verbose)

    # fall back to the species-specific default pattern
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')

    # Ensembl gene ID => gene name
    genes = {}

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0
    missing = 0  # entries without a "gene_name" attribute
    logger.info('Parsing data...')
    if input_file == '-':
        # None is handed to smart_open_read in place of "-"
        # (presumably selecting standard input -- confirm in misc)
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh:
        reader = csv.reader(fh, dialect = 'excel-tab')
        for l in reader:

            i += 1

            # Rows with fewer than three fields (e.g. comment lines) have no
            # feature column; the length check must cover index 2.
            if len(l) <= 2 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            chrom = l[0]

            # test whether chromosome is valid
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(m.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            if gene_id not in genes:
                genes[gene_id] = gene_name
            elif genes[gene_id] != gene_name:
                raise ValueError(
                    'Ensembl ID "%s" associated with multiple gene symbols.'
                    % gene_id)

    logger.info('Done! (Parsed %d lines.)', i)

    if missing > 0:
        # previously these entries were dropped without any trace
        logger.warning('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    num_ids = len(genes)
    num_names = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', num_ids)
    logger.info('No. of gene names: %d', num_names)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect = 'excel-tab',
                lineterminator = os.linesep, quoting = csv.QUOTE_NONE)
        for g in sorted(genes):
            writer.writerow([g, genes[g]])

    return 0
def main(args=None):
    """Extract annotation rows of protein-coding genes from a tab-delimited file.

    A row is copied unchanged to the output if all of the following hold:

    - its third column equals ``args.field_name``;
    - the ``gene_biotype`` attribute parsed from its ninth column is
      ``protein_coding`` or ``polymorphic_pseudogene``;
    - its first column (the chromosome name) is matched by the chromosome
      pattern (``re.match``, i.e., anchored at the start of the name).

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments. The attributes read are
        `annotation_file`, `output_file`, `species`, `chromosome_pattern`,
        `field_name`, `log_file`, `quiet` and `verbose`.

    Returns
    -------
    int
        Exit code (always 0 when the function returns normally).

    """

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # fall back to the species-specific default pattern if none was given
    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.species_chrompat[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0  # number of rows read
    exons = 0  # number of rows written
    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): presumably `None` makes smart_open_read use stdin;
        # confirm against the helper's implementation.
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        # With QUOTE_NONE and no escape character, the writer rejects fields
        # containing the quote character; using '|' instead of the default
        # '"' lets fields with double quotes pass through unchanged.
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for l in reader:
            i += 1

            # Rows with fewer than three fields (e.g., comment lines) have
            # no feature column; skip them instead of failing on `l[2]`.
            if len(l) < 3 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = l[0]
            if chrom_pat.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(l)
            exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0