Ejemplo n.º 1
0
def main(args=None):
    """Write the signatures of a GO-PCA result to an Excel workbook.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 when the function runs to completion).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (version.major, version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(
        log_file=args.log_file, quiet=args.quiet, verbose=args.verbose)

    assert os.path.isfile(gopca_file)

    workbook_options = {'strings_to_numbers': True, 'in_memory': True}
    workbook = xlsxwriter.Workbook(output_file, workbook_options)
    workbook.set_properties({'title': 'GO-PCA Signatures'})
    header_format = workbook.add_format({'bold': True})
    sheet = workbook.add_worksheet()

    def sort_key(sig):
        # order by PC number, then by sign of the PC, then by descending
        # enrichment score
        return [abs(sig.pc), -sign(sig.pc), -sig.escore]

    signatures = sorted(
        util.read_gopca_result(gopca_file).signatures, key=sort_key)

    # header row: the keys of the first signature's ordered dictionary
    labels = list(signatures[0].get_ordered_dict().keys())
    sheet.write_row(0, 0, labels, cell_format=header_format)

    # track the longest text seen in each column, starting with the header
    max_width = np.float64([len(label) for label in labels])
    for row, sig in enumerate(signatures, start=1):
        vals = sig.get_ordered_dict().values()
        for col, value in enumerate(vals):
            max_width[col] = max(max_width[col], float(len(value)))
        sheet.write_row(row, 0, vals)

    # size every column to its longest entry plus a small padding
    for col in range(len(labels)):
        sheet.set_column(col, col, max_width[col]+0.43)

    workbook.close()

    logger.info('Wrote %d signatures to "%s".', len(signatures), output_file)

    return 0
Ejemplo n.º 2
0
def main(args=None):
    """Write the signature matrix of a GO-PCA result to a tab-delimited file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    # read the result file once (it used to be read twice, with the first
    # copy never being used)
    sig_matrix = util.read_gopca_result(gopca_file)

    # use the signature labels (without IDs) as the row names of the matrix
    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]

    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'

    # only let warnings (and above) from the expression module's logger
    # through while writing; always reset the level, even if writing fails
    exp_logger = logging.getLogger(expression.__name__)
    exp_logger.setLevel(logging.WARNING)
    try:
        matrix.write_tsv(output_file)
    finally:
        exp_logger.setLevel(logging.NOTSET)

    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
Ejemplo n.º 3
0
def main(args=None):
    """Copy the sequences of selected chromosomes from a FASTA file.

    The first space-delimited token of each sequence name is matched against
    a chromosome pattern (taken from the arguments or, if absent, looked up
    by species). Matching sequences are renamed to that token and written to
    the output; all others are logged and skipped.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 when the function runs to completion).
    """
    if args is None:
        # no arguments supplied => take them from the command line
        args = get_argument_parser().parse_args()

    source_path = args.fasta_file
    species = args.species
    pattern = args.chromosome_pattern
    target_path = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # log to stderr whenever the sequences themselves go to stdout ('-')
    log_stream = sys.stderr if target_path == '-' else sys.stdout
    logger = misc.get_logger(
        log_stream=log_stream, log_file=log_file,
        quiet=quiet, verbose=verbose)

    # fall back to the species-specific chromosome pattern
    if pattern is None:
        pattern = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(pattern)

    # note: every sequence is held in memory while it is processed, so the
    # memory footprint of this script is large
    with misc.smart_open_read(
            source_path, mode='r', encoding='ascii', try_gzip=True) as fh:
        with misc.smart_open_write(
                target_path, mode='w', encoding='ascii') as ofh:
            for seq in FastaReader(fh):
                chrom = seq.name.split(' ', 1)[0]
                if chrom_re.match(chrom) is not None:
                    seq.name = chrom
                    seq.append_fasta(ofh)
                else:
                    logger.info('Ignoring chromosome "%s"...', chrom)

    return 0
Ejemplo n.º 4
0
def main(args=None):
    """Filter a FASTA file by chromosome name.

    Each sequence is identified by the first space-delimited token of its
    name. Sequences whose token matches the chromosome pattern are renamed
    to the token and appended to the output file; the rest are reported in
    the log and dropped.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 when the function runs to completion).
    """
    if args is None:
        # parse command-line arguments
        args = get_argument_parser().parse_args()

    in_path = args.fasta_file
    species = args.species
    chrom_pattern = args.chromosome_pattern
    out_path = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # keep log messages out of stdout when the output is written there
    if out_path == '-':
        log_stream = sys.stderr
    else:
        log_stream = sys.stdout

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # without an explicit pattern, use the one registered for the species
    if chrom_pattern is None:
        chrom_pattern = ensembl.species_chrompat[species]
    matcher = re.compile(chrom_pattern)

    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with misc.smart_open_read(
            in_path, mode='r', encoding='ascii', try_gzip=True) as fh:
        with misc.smart_open_write(
                out_path, mode='w', encoding='ascii') as ofh:
            for record in FastaReader(fh):
                chrom = record.name.split(' ', 1)[0]
                if matcher.match(chrom) is not None:
                    record.name = chrom
                    record.append_fasta(ofh)
                else:
                    logger.info('Ignoring chromosome "%s"...', chrom)

    return 0
Ejemplo n.º 5
0
def main(args=None):
    """Plot the signature matrix of a GO-PCA result.

    The figure is obtained from the object returned by
    `util.read_gopca_result` and handed to `plot` together with the output
    file name.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 when the function runs to completion).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (version.major, version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # keyword arguments for the figure, in the order the options are read
    figure_kw = {}
    figure_kw['emin'] = args.min_val
    figure_kw['emax'] = args.max_val
    figure_kw['width'] = args.width
    figure_kw['height'] = args.height
    figure_kw['margin_left'] = args.margin_left
    figure_kw['margin_bottom'] = args.margin_bottom
    figure_kw['font_size'] = args.font_size
    figure_kw['font'] = args.font

    # the "no plotly js" switch turns the inclusion flag off
    include_plotlyjs = not args.no_plotly_js

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(
        log_file=log_file, quiet=quiet, verbose=verbose)

    figure_kw['heatmap_kw'] = {'colorbar_label': args.colorbar_label}

    sig_matrix = util.read_gopca_result(gopca_file)
    fig = sig_matrix.get_figure(**figure_kw)
    plot(fig,
         filename=output_file,
         image_filename='gopca_signature_matrix',
         auto_open=False,
         include_plotlyjs=include_plotlyjs)

    logger.info('Plotted  %d x %d signature matrix.',
                sig_matrix.p, sig_matrix.n)

    return 0
Ejemplo n.º 6
0
def main(args=None):
    """Read a gene2accession file and write an Entrez ID -> gene symbol map.

    The mapping is produced by `read_gene2acc` and written by
    `write_entrez2gene`.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (version.major, version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    source_path = args.gene2acc_file
    target_path = args.output_file
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # log to stderr when the mapping itself is written to stdout ('-')
    log_stream = sys.stderr if target_path == '-' else sys.stdout
    logger = misc.get_logger(
        log_stream=log_stream, log_file=log_file,
        quiet=quiet, verbose=verbose)

    mapping = read_gene2acc(source_path, logger)
    write_entrez2gene(target_path, mapping, logger)

    return 0
Ejemplo n.º 7
0
def main(args=None):
    """Write the signatures of a GO-PCA result to a tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet,
                             verbose=verbose)

    assert os.path.isfile(gopca_file)

    result = util.read_gopca_result(gopca_file)
    signatures = result.signatures

    # sort signatures first by PC, then by fold enrichment
    signatures = sorted(
        signatures, key=lambda s: [abs(s.pc), -sign(s.pc), -s.escore])

    # the header consists of the keys of the first signature
    labels = signatures[0].get_ordered_dict().keys()

    # csv.writer needs a binary file on Python 2, but a text file opened
    # with newline='' on Python 3 (a 'wb' file raises TypeError there)
    if sys.version_info[0] >= 3:
        ofh = open(output_file, 'w', newline='')
    else:
        ofh = open(output_file, 'wb')

    with ofh:
        writer = csv.writer(ofh, dialect='excel-tab', lineterminator='\n',
                            quoting=csv.QUOTE_NONE)

        writer.writerow(labels)

        for sig in signatures:
            writer.writerow(sig.get_ordered_dict().values())

    logger.info('Wrote %d signatures to "%s".',
                len(signatures), output_file)

    return 0
Ejemplo n.º 8
0
def main(args=None):
    """Export the signatures of a GO-PCA result as tab-separated values.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    assert os.path.isfile(gopca_file)

    result = util.read_gopca_result(gopca_file)
    signatures = result.signatures

    # sort signatures first by PC, then by fold enrichment
    signatures = sorted(signatures,
                        key=lambda s: [abs(s.pc), -sign(s.pc), -s.escore])

    # the header consists of the keys of the first signature
    labels = signatures[0].get_ordered_dict().keys()

    # On Python 3 the csv module requires a text-mode file opened with
    # newline=''; the binary mode is only correct on Python 2.
    if sys.version_info[0] >= 3:
        ofh = open(output_file, 'w', newline='')
    else:
        ofh = open(output_file, 'wb')

    with ofh:
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator='\n',
                            quoting=csv.QUOTE_NONE)

        writer.writerow(labels)

        for sig in signatures:
            writer.writerow(sig.get_ordered_dict().values())

    logger.info('Wrote %d signatures to "%s".', len(signatures), output_file)

    return 0
Ejemplo n.º 9
0
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Tests for the `GSEAnalysis` class."""

import pytest

# from genometools.expression import ExpGenome
from genometools import misc
from genometools.enrichment import GeneSetEnrichmentAnalysis

# Module-level logger for these tests, obtained from `misc.get_logger` under
# the name 'genometools' with verbose=True.
logger = misc.get_logger('genometools', verbose=True)


@pytest.fixture
def my_analysis(my_valid_genes, my_gene_set_coll):
    """Provide a `GeneSetEnrichmentAnalysis` built from the two fixtures."""
    return GeneSetEnrichmentAnalysis(my_valid_genes, my_gene_set_coll)


def test_basic(my_analysis, my_valid_genes):
    """Check type, text representations and gene list of the analysis."""
    assert isinstance(my_analysis, GeneSetEnrichmentAnalysis)

    # both repr() and str() have to produce a `str`
    for to_text in (repr, str):
        assert isinstance(to_text(my_analysis), str)

    # the analysis holds its own list of genes, of the same length
    assert my_analysis.valid_genes is not my_valid_genes
    assert len(my_valid_genes) == len(my_analysis.valid_genes)
def main(args=None):
    """Extracts gene-level expression data from StringTie output.

    The StringTie file is read twice: first to associate StringTie gene IDs
    with reference gene names, then to sum up FPKM and TPM values per gene.
    The result is written as a tab-delimited file with one row per gene
    (gene, FPKM, TPM).

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    """

    def percent(part, total):
        """Return `part` as a percentage of `total` (0.0 if `total` is 0)."""
        if not total:
            # avoids ZeroDivisionError / NaN for empty inputs
            return 0.0
        return 100 * (part / float(total))

    def is_skipped(row):
        """Tell whether a row is blank or a header ('#') line."""
        return not row or row[0].startswith('#')

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read list of gene symbols
    logger.info('Reading gene data...')
    genes = misc.read_single(gene_file)

    logger.info('Parsing StringTie output...')

    # first pass: collect the reference gene names seen for each gene ID
    logger.info('Associating StringTie gene IDs with gene symbols...')
    stringtie_genes = {}
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            if is_skipped(l):
                continue
            assert len(l) == 9
            if l[2] != 'transcript':
                continue
            attr = parse_attributes(l[8])
            try:
                ref_gene = attr['ref_gene_name']
            except KeyError:
                continue
            # entry has a "ref_gene_name" attribute
            stringtie_genes.setdefault(attr['gene_id'], set()).add(ref_gene)
    logger.info('Associated %d gene IDs with gene symbols.',
                len(stringtie_genes))

    gene_ids_ambiguous = [k for k, v in stringtie_genes.items() if len(v) > 1]
    n = len(gene_ids_ambiguous)
    logger.info('%d / %d associated with multiple gene symbols (%.1f%%).', n,
                len(stringtie_genes), percent(n, len(stringtie_genes)))

    # second pass: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            if is_skipped(l):
                # skip header and blank lines
                continue
            assert len(l) == 9

            if l[2] != 'transcript':
                # skip exon lines
                continue

            attr = parse_attributes(l[8])
            f = float(attr['FPKM'])

            try:
                g = attr['ref_gene_name']
            except KeyError:
                if no_novel_transcripts:
                    # ignore this transcript
                    fpkm_novel_trans += f
                    continue

                # see if we can assign a gene name based on the gene ID
                try:
                    assoc = stringtie_genes[attr['gene_id']]
                except KeyError:
                    # gene_id not associated with any reference gene
                    fpkm_novel_gene += f
                    continue

                if len(assoc) > 1:
                    # gene ID associated with multiple ref. genes
                    # => ignored
                    fpkm_ambig += f
                    continue

                # gene ID associated with exactly one ref. gene
                g = next(iter(assoc))

            try:
                idx = misc.bisect_index(genes, g)
            except ValueError:
                fpkm_unknown_gene_name += f
                logger.warning('Unknown gene name: "%s".', g)
                continue

            t = float(attr['TPM'])
            fpkm[idx] += f
            tpm[idx] += t

    # report how much expression could not be assigned to a known gene
    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info('Ignored %.1f / %.1f FPKM (%.1f%%)', ignored_fpkm, total_fpkm,
                percent(ignored_fpkm, total_fpkm))

    if no_novel_transcripts and fpkm_novel_trans > 0:
        logger.info('Ignored %.1f FPKM from novel transcripts (%.1f%%).',
                    fpkm_novel_trans, percent(fpkm_novel_trans, total_fpkm))

    else:
        if fpkm_novel_gene > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts of novel genes '
                '(%.1f%%).', fpkm_novel_gene,
                percent(fpkm_novel_gene, total_fpkm))

        if fpkm_ambig > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts with ambiguous '
                'gene membership (%.1f%%).', fpkm_ambig,
                percent(fpkm_ambig, total_fpkm))

    if fpkm_unknown_gene_name > 0:
        logger.info(
            'Ignored %.1f FPKM from transcripts of genes with unknown '
            'names (%.1f%%).', fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm))

    # write output file: one row per gene with its FPKM and TPM values
    E = np.c_[fpkm, tpm]
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ['%.5f' % e for e in E[i, :]])

    return 0
Ejemplo n.º 11
0
def main(args=None):
    """Convert an Entrez-ID-based expression matrix to gene symbols.

    Rows whose Entrez ID has no entry in the Entrez -> gene mapping, or
    whose gene symbol is not contained in the genome read from the gene
    file, are dropped (with a warning). The remaining rows are written to
    the output file as a new expression matrix.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo.major, vinfo.minor)
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove the "_at" suffix from Entrez IDs; IDs that do not carry the
        # suffix are left untouched instead of losing their last characters
        suffix = "_at"
        entrez = [e[:-len(suffix)] if e.endswith(suffix) else e
                  for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names, keeping the matching matrix rows
    f = 0
    genes = []
    X = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes))
    if f > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs to gene symbols (%.1f%%).",
            f,
            matrix.p,
            100 * (f / float(matrix.p)),
        )

    # filter for genes contained in the genome read from the gene file
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = np.nonzero([g in genome for g in genes])[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    f = p - sel.size
    if f > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of "
            "protein-coding genes (%.1f%%)",
            f,
            p,
            100 * (f / float(p)),
        )

    # generate new matrix
    logger.debug("Genes: %d, Samples: %d, matrix: %s",
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
Ejemplo n.º 12
0
"""Tests for the `ExpGenome` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import str as text

from copy import deepcopy
from collections import Iterable

import pytest

from genometools.misc import get_logger
from genometools.expression import ExpGene, ExpGenome

# Module-level logger named after this module, requested with verbose=True.
logger = get_logger(__name__, verbose=True)

def test_init(my_genome, my_exp_genes):
    """Check type, text representations, size and contents of the genome."""
    # type and textual representations
    assert isinstance(my_genome, ExpGenome)
    assert isinstance(repr(my_genome), str)
    assert isinstance(str(my_genome), str)
    assert isinstance(text(my_genome), text)
    assert isinstance(my_genome.hash, text)

    # container behaviour
    assert len(my_genome) == len(my_exp_genes)
    assert isinstance(my_genome, Iterable)

    # gene names and gene objects are exposed in the order they were given
    expected_names = [exp_gene.name for exp_gene in my_exp_genes]
    assert my_genome.genes == expected_names
    assert my_genome.exp_genes == my_exp_genes

    # the genome can be deep-copied
    genome_copy = deepcopy(my_genome)
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Rows of the annotation file whose third column equals the requested
    field name are inspected. Only entries with gene biotype
    "protein_coding" or "polymorphic_pseudogene" on chromosomes matching the
    chromosome pattern are kept. The output contains one line per gene ID
    (gene ID, gene name), sorted by gene ID.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one gene ID is found with two different gene names.
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # fall back to the species-specific pattern if none was given
    if chrom_pat is None:
        chrom_pat = ensembl.species_chrompat[species]
    chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    # gene ID -> gene name
    genes = {}

    # chromosomes that were accepted / rejected by the pattern
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0        # number of rows read
    missing = 0  # number of entries without a "gene_name" attribute
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            i += 1

            # the attributes live in the ninth column, so shorter rows
            # (comments, blank or truncated lines) cannot be used
            if len(l) < 9 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            type_ = attr['gene_biotype']
            if type_ not in ('protein_coding', 'polymorphic_pseudogene'):
                continue

            # test whether chromosome is valid
            chrom = l[0]
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue
            chromosomes.add(m.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            # remember the first name seen for an ID and insist that all
            # later occurrences agree with it
            if genes.setdefault(gene_id, gene_name) != gene_name:
                raise ValueError('Ensembl ID "%s" ' % (gene_id) +
                                 'associated with multiple gene symbols.')

    logger.info('Done! (Parsed %d lines.)', i)

    if missing > 0:
        logger.info('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for g in sorted(genes):
            writer.writerow([g, genes[g]])

    return 0
Ejemplo n.º 14
0
def main(args=None):
    """Extract GO annotations and store in tab-delimited text file.

    Gene sets are derived from the GO annotations parsed from the
    association file, optionally filtered by size, and written with
    `write_tsv`.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gene_file = args.gene_file
    gene_ontology_file = args.gene_ontology_file
    goa_association_file = args.goa_association_file
    output_file = args.output_file

    evidence_codes = args.evidence_codes
    min_genes = args.min_genes_per_term
    max_genes = args.max_genes_per_term

    # NOTE(review): the "part_of_cc_only" option is not applied anywhere in
    # this function - confirm whether that is intended.

    # logging parameters
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger exactly once, before the first message is
    # emitted, so that all messages honour the stream selection below
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=log_file,
                             quiet=quiet,
                             verbose=verbose)

    logger.info('Selected evidence codes: %s', ', '.join(evidence_codes))
    logger.info('Min. number of genes per gene set: %d', min_genes)
    logger.info('Max. number of genes per gene set: %d', max_genes)

    # checks
    assert os.path.isfile(gene_file)
    assert os.path.isfile(gene_ontology_file)
    assert os.path.isfile(goa_association_file)

    # read the genome from the tab-delimited gene file
    exp_genome = ExpGenome.read_tsv(gene_file)

    # parse Gene Ontology
    gene_ontology = GeneOntology.read_obo(gene_ontology_file)

    # parse the (gzip-compressed) gene association file
    with gzip.open(goa_association_file, 'rt', encoding='ascii') as fh:
        go_annotations = ontology.parse_gaf(fh,
                                            gene_ontology,
                                            ev_codes=evidence_codes,
                                            genome=exp_genome)

    # extract GO-based gene sets
    gene_sets = ontology.get_goa_gene_sets(go_annotations)
    logger.info('Generated %d GO-derived gene sets', len(gene_sets))

    # filter gene sets based on size (a limit of 0 disables the filter)
    if min_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(gs for gs in gene_sets
                                      if gs.size >= min_genes)
        logger.info('Excluded %d gene sets with too few genes.',
                    old_size - len(gene_sets))

    if max_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(gs for gs in gene_sets
                                      if gs.size <= max_genes)
        logger.info('Excluded %d gene sets with too many genes.',
                    old_size - len(gene_sets))

    # writing output file
    gene_sets.write_tsv(output_file)
    logger.info('Wrote %s GO-derived gene sets to output file "%s".',
                len(gene_sets), output_file)

    return 0
def main(args=None):
    """Extracts gene-level expression data from StringTie output.

    The StringTie file is read twice. The first pass collects, for every
    StringTie gene ID, the set of reference gene names ("ref_gene_name")
    found on its transcript lines. The second pass sums the FPKM and TPM
    values of all transcript lines per gene and keeps track of the FPKM that
    could not be assigned to a gene from the gene list. The result is written
    as a tab-delimited file with one row per gene (gene, FPKM, TPM).

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    """

    def percent(part, total):
        # Share of `part` in `total` as a percentage; 0.0 for an empty total
        # so that the summary log lines can never divide by zero.
        if not total:
            return 0.0
        return 100 * (part / float(total))

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read list of gene symbols
    logger.info("Reading gene data...")
    genes = misc.read_single(gene_file)

    # read StringTie output file and summarize FPKM and TPM per gene
    logger.info("Parsing StringTie output...")

    # first pass: map each StringTie gene ID to the set of reference gene
    # names that appear on its transcript lines
    logger.info("Associating StringTie gene IDs with gene symbols...")
    stringtie_genes = {}
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect="excel-tab")
        for l in reader:
            if not l or l[0].startswith("#"):
                # skip blank lines (csv yields [] for them) and header lines
                continue
            assert len(l) == 9
            if l[2] != "transcript":
                continue
            attr = parse_attributes(l[8])
            try:
                ref_gene = attr["ref_gene_name"]
            except KeyError:
                # transcript without a reference gene name
                continue
            stringtie_genes.setdefault(attr["gene_id"], set()).add(ref_gene)
    logger.info("Associated %d gene IDs with gene symbols.", len(stringtie_genes))

    gene_ids_ambiguous = [k for k, v in stringtie_genes.items() if len(v) > 1]
    n = len(gene_ids_ambiguous)
    logger.info(
        "%d / %d associated with multiple gene symbols (%.1f%%).",
        n,
        len(stringtie_genes),
        percent(n, len(stringtie_genes)),
    )

    # second pass: sum FPKM and TPM per gene, and account for the FPKM of
    # every transcript that is ignored (one counter per reason)
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect="excel-tab")
        for l in reader:
            if not l or l[0].startswith("#"):
                # skip blank lines and header lines
                continue
            assert len(l) == 9

            if l[2] != "transcript":
                # skip exon lines
                continue

            attr = parse_attributes(l[8])
            f = float(attr["FPKM"])

            try:
                g = attr["ref_gene_name"]
            except KeyError:
                if no_novel_transcripts:
                    # ignore this transcript
                    fpkm_novel_trans += f
                    continue

                # see if we can assign a gene name based on the gene ID
                try:
                    assoc = stringtie_genes[attr["gene_id"]]
                except KeyError:
                    # gene_id not associated with any reference gene
                    fpkm_novel_gene += f
                    continue

                if len(assoc) > 1:
                    # gene ID associated with multiple ref. genes => ignored
                    fpkm_ambig += f
                    continue

                # gene ID associated with exactly one ref. gene
                g = next(iter(assoc))

            # NOTE(review): bisect_index presumably requires `genes` to be
            # sorted -- confirm against misc.read_single / the gene file.
            try:
                idx = misc.bisect_index(genes, g)
            except ValueError:
                fpkm_unknown_gene_name += f
                logger.warning('Unknown gene name: "%s".', g)
                continue

            t = float(attr["TPM"])
            fpkm[idx] += f
            tpm[idx] += t

    # only the counters that can be non-zero in the selected mode are summed
    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info(
        "Ignored %.1f / %.1f FPKM (%.1f%%)",
        ignored_fpkm,
        total_fpkm,
        percent(ignored_fpkm, total_fpkm),
    )

    if no_novel_transcripts and fpkm_novel_trans > 0:
        logger.info(
            "Ignored %.1f FPKM from novel transcripts (%.1f%%).",
            fpkm_novel_trans,
            percent(fpkm_novel_trans, total_fpkm),
        )

    else:
        if fpkm_novel_gene > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts of novel genes " "(%.1f%%).",
                fpkm_novel_gene,
                percent(fpkm_novel_gene, total_fpkm),
            )

        if fpkm_ambig > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts with ambiguous " "gene membership (%.1f%%).",
                fpkm_ambig,
                percent(fpkm_ambig, total_fpkm),
            )

    if fpkm_unknown_gene_name > 0:
        logger.info(
            "Ignored %.1f FPKM from transcripts of genes with unknown " "names (%.1f%%).",
            fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm),
        )

    # write output file: one row per gene with its FPKM and TPM sums
    E = np.c_[fpkm, tpm]
    with open(output_file, "w") as ofh:
        writer = csv.writer(ofh, dialect="excel-tab", lineterminator=os.linesep, quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ["%.5f" % e for e in E[i, :]])

    return 0
Ejemplo n.º 16
0
def main(args=None):
    """Convert an Entrez-ID-indexed expression matrix to gene symbols.

    Reads a gene table, an expression matrix and an Entrez-ID-to-gene
    mapping, translates the matrix row labels to gene symbols, keeps only
    the rows whose symbol is contained in the gene table, and writes the
    resulting matrix to the output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    vinfo = sys.version_info
    if vinfo < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file,
                             quiet=args.quiet,
                             verbose=args.verbose)

    # read data
    genome = ExpGeneTable.read_tsv(args.gene_file)
    matrix = ExpMatrix.read_tsv(args.expression_file)
    e2g = dict(misc.read_all(args.entrez2gene_file))

    entrez = matrix.genes
    if args.strip_affy_suffix:
        # drop the last three characters (the "_at" suffix) of every ID
        entrez = [entrez_id[:-3] for entrez_id in entrez]
    logger.debug(str(entrez[:3]))

    # Entrez IDs must be unique
    assert len(entrez) == len(set(entrez))

    # translate Entrez IDs to gene names, dropping rows without a mapping
    num_unmapped = 0
    genes = []
    rows = []
    for row_index, entrez_id in enumerate(entrez):
        if entrez_id not in e2g:
            num_unmapped += 1
            continue
        genes.append(e2g[entrez_id])
        rows.append(matrix.X[row_index, :])

    # no two Entrez IDs may map to the same gene
    assert len(genes) == len(set(genes))
    if num_unmapped > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).', num_unmapped, matrix.p,
            100 * (num_unmapped / float(matrix.p)))

    # keep only genes that are listed in the gene table
    X = np.float64(rows)
    num_mapped = X.shape[0]
    logger.debug(str(X.shape))
    known = np.array([genes[k] in genome for k in range(num_mapped)],
                     dtype=np.bool_)
    keep = np.nonzero(known)[0]
    genes = [genes[k] for k in keep]
    X = X[keep, :]
    num_dropped = num_mapped - keep.size
    if num_dropped > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)', num_dropped, num_mapped,
            100 * (num_dropped / float(num_mapped)))

    # build the converted matrix (per the original comment, this also sorts
    # the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s', len(genes),
                 len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(args.output_file)

    return 0
def main(args=None):
    """Extract all exon annotations of protein-coding genes.

    Copies every row of the tab-delimited annotation file whose third column
    equals `args.field_name`, whose "gene_biotype" attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern, to the output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=log_file,
                             quiet=quiet,
                             verbose=verbose)

    # fall back to the species-specific pattern if none was given
    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): None presumably makes smart_open_read use stdin --
        # confirm against misc.smart_open_read.
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE,
                            quotechar='|')
        for l in reader:
            i += 1
            # the third column is read below, so require at least three
            # columns (a two-column row used to raise IndexError here)
            if len(l) > 2 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding', 'polymorphic_pseudogene']:

                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue

                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t%s', ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t%s', ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
"""Tests for functions in `cluster` module."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import str as text
from builtins import int as newint

import os

import pytest

from genometools import ensembl
from genometools import misc

logger = misc.get_logger()

@pytest.mark.online
def test_latest_release():
    """Check that the latest Ensembl release is reported as an integer."""
    latest = ensembl.get_latest_release()
    assert isinstance(latest, newint)
    logger.info('Current release: %d', latest)

@pytest.mark.online
@pytest.mark.linux
@pytest.mark.darwin
@pytest.mark.cygwin
def test_download(my_download_dir):
    assert isinstance(my_download_dir, text)

    species = [
Ejemplo n.º 19
0
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from genometools import misc
from pyaffy import rma


## set up directories
data_dir = '/nobackup/shilab/Data/StJude/Leukemia_Subtypes/'
output_dir = './Expression'

# create the output directory if it does not exist yet
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)


misc.get_logger(verbose = False)

#TODO: make sure sampleList file is correct.
sampleList = "/nobackup/shilab/Data/StJude/Leukemia_Subtypes/sampleList.txt"

# Maps the first column of the sample list to its second column (presumably
# sample name -> CEL file), in file order. It has to exist before the loop
# below fills it and must not be reset afterwards.
sample_cel_files = OrderedDict()

with open(sampleList, 'r') as samp:
    for line in samp:
        if not line.strip():
            # tolerate blank lines in the sample list
            continue
        fields = line.rstrip().split("\t")
        sample_cel_files[fields[0]] = fields[1]

# NOTE(review): `cdf_file` is not defined anywhere in the visible script; it
# must be set to the CDF file to use before this call can run.
genes, samples, X = rma(cdf_file, sample_cel_files)

def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Scans the tab-delimited annotation file for rows whose third column
    equals `args.field_name`, whose "gene_biotype" attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern. For each such row the "gene_id" and
    "gene_name" attributes are recorded, and the resulting ID/name pairs are
    written to the output file, sorted by ID.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If one Ensembl ID is associated with more than one gene symbol.
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream = log_stream, log_file = log_file,
            quiet = quiet, verbose = verbose)

    # fall back to the species-specific pattern if none was given
    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    # Ensembl gene ID => gene name
    genes = {}

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0
    missing = 0  # no. of entries skipped because they lack a gene name
    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): None presumably makes smart_open_read use stdin --
        # confirm against misc.smart_open_read.
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh:
        reader = csv.reader(fh, dialect = 'excel-tab')
        for l in reader:

            i += 1

            # the third column is read here, so require at least three
            # columns (a two-column row used to raise IndexError)
            if len(l) > 2 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']

                if type_ not in ['protein_coding', 'polymorphic_pseudogene']:
                    continue

                chrom = l[0]

                # test whether chromosome is valid
                m = chrom_pat.match(chrom)
                if m is None:
                    excluded_chromosomes.add(chrom)
                    continue

                # note: this records the matched part of the chromosome name
                chromosomes.add(m.group())

                gene_id = attr['gene_id']
                try:
                    gene_name = attr['gene_name']
                except KeyError:
                    missing += 1
                    continue

                if gene_id in genes:
                    if genes[gene_id] != gene_name:
                        raise ValueError('Ensembl ID "%s" ' %(gene_id) +
                                'associated with multiple gene symbols.')
                else:
                    genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)

    if missing > 0:
        logger.warning('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect = 'excel-tab',
                lineterminator = os.linesep, quoting = csv.QUOTE_NONE)
        for g in sorted(genes):
            writer.writerow([g, genes[g]])

    return 0
Ejemplo n.º 21
0
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Tests for functions in `misc` module."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from builtins import str as text

import os

import pytest

from genometools import misc

logger = misc.get_logger()


@pytest.mark.linux
@pytest.mark.darwin
def test_checksum(my_checksum_file):
    """Check both checksum helpers against the known checksum of the file."""
    expected = 2761
    assert misc.get_file_checksum(my_checksum_file) == expected
    assert misc.test_file_checksum(my_checksum_file, expected)


@pytest.mark.online
def test_ftp_download(my_readme_file):
    """Download the Ensembl README via `ftp_download` into the given file."""
    url = 'ftp://ftp.ensembl.org/pub/current_README'
    misc.ftp_download(url, my_readme_file)
Ejemplo n.º 22
0
"""Tests for the `GSEAnalysis` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import str as text

from string import ascii_lowercase

import pytest

# from genometools.expression import ExpGenome
from genometools import misc
from genometools.enrichment import GeneSetEnrichmentAnalysis

logger = misc.get_logger('genometools', verbose=True)


@pytest.fixture
def my_analysis(my_genome, my_gene_set_coll):
    """Provide an enrichment analysis built from the genome and gene sets."""
    return GeneSetEnrichmentAnalysis(my_genome, my_gene_set_coll)


def test_basic(my_analysis, my_genome):
    """Check the type, the string conversions and the `genes` attribute."""
    assert isinstance(my_analysis, GeneSetEnrichmentAnalysis)

    # string conversions
    as_repr = repr(my_analysis)
    assert isinstance(as_repr, str)
    as_str = str(my_analysis)
    assert isinstance(as_str, str)
    as_text = text(my_analysis)
    assert isinstance(as_text, text)

    gene_list = my_analysis.genes
    assert isinstance(gene_list, list)
Ejemplo n.º 23
0
#!/usr/bin/env python3
"""Gene expression quantification script for inDrop data."""

import sys
import argparse

from genometools import misc

from .. import expression

_LOGGER = misc.get_logger()


def get_argument_parser():

    desc = 'Quantify gene expression based on aligned inDrop reads.'

    parser = argparse.ArgumentParser(description=desc, add_help=False)

    g = parser.add_argument_group('Help')
    g.add_argument('-h',
                   '--help',
                   action='help',
                   help='Show this help message and exit.')

    g = parser.add_argument_group('Input and output files')

    g.add_argument('-a',
                   '--alignment-file',
                   type=str,
                   required=True,
def main(args=None):
    """Extract all exon annotations of protein-coding genes.

    Copies every row of the tab-delimited annotation file whose third column
    equals `args.field_name`, whose "gene_biotype" attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern, to the output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream = log_stream, log_file = log_file,
            quiet = quiet, verbose = verbose)

    # fall back to the species-specific pattern if none was given
    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.species_chrompat[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        # NOTE(review): None presumably makes smart_open_read use stdin --
        # confirm against misc.smart_open_read.
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect = 'excel-tab')
        writer = csv.writer(ofh, dialect = 'excel-tab', lineterminator = os.linesep,
                quoting = csv.QUOTE_NONE , quotechar = '|')
        for l in reader:
            i += 1
            # the third column is read below, so require at least three
            # columns (a two-column row used to raise IndexError here)
            if len(l) > 2 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding','polymorphic_pseudogene']:

                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue

                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t%s', ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t%s', ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0