def test_from_names(my_genes):
    """Build a genome from bare gene names and verify its contents.

    The genome must have one entry per input name, in the same order, and
    each entry must carry empty chromosome and Ensembl ID lists.
    """
    genome = ExpGenome.from_gene_names(my_genes)
    assert len(genome) == len(my_genes)
    for expected_name, exp_gene in zip(my_genes, genome):
        assert exp_gene.name == expected_name
        assert exp_gene.chromosomes == []
        assert exp_gene.ensembl_ids == []
def main(args=None):
    """Convert an Entrez-ID-indexed expression matrix to gene names.

    Reads the expression matrix, the gene list, and the Entrez-ID-to-gene
    mapping named in `args`; translates the Entrez ID of each matrix row to
    a gene name; keeps only the genes contained in the gene list; and writes
    the resulting matrix to the output file.

    Parameters
    ----------
    args: namespace-like object, optional
        The argument values. If not specified, they are obtained by calling
        ``get_argument_parser().parse_args()``.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the Entrez IDs, or the gene names they map to, are not unique.
    """
    vinfo = sys.version_info
    if vinfo < (2, 7):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo.major, vinfo.minor)
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file

    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix from Entrez IDs. Only IDs that actually end
        # in "_at" are shortened; blindly cutting three characters would
        # corrupt any ID without the suffix.
        # NOTE(review): assumes the IDs are strings -- confirm.
        entrez = [e[:-3] if e.endswith("_at") else e for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names, dropping rows without a mapping
    num_unmapped = 0
    genes = []
    X = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            num_unmapped += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes))
    if num_unmapped > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs to gene symbols (%.1f%%).",
            num_unmapped,
            matrix.p,
            100 * (num_unmapped / float(matrix.p)),
        )

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))
    in_genome = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(in_genome)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]
    num_unknown = p - sel.size
    if num_unknown > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of protein-coding genes (%.1f%%)",
            num_unknown,
            p,
            100 * (num_unknown / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug("Genes: %d, Samples: %d, matrix: %s", len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
Exemple #3
0
def my_genome():
    """Return a fake genome with one gene per lowercase ASCII letter."""
    letter_names = list(map(text, ascii_lowercase))
    return ExpGenome.from_gene_names(letter_names)
Exemple #4
0
    def run(self):
        """Perform GO-PCA.

        Validates all configurations, determines the number of principal
        components (PCs) to test, performs PCA on the expression matrix, and
        then generates signatures for every configuration.

        Returns
        -------
        `GOPCARun` or None
            The GO-PCA run, or ``None`` if the run failed.
        """
        t0 = time.time()  # remember the start time
        timestamp = str(datetime.datetime.utcnow())  # timestamp for the run

        ### Phase 1: Make sure all configurations are valid
        # Every configuration is checked (no early exit), so that each one
        # gets its parameters finalized and validated.
        all_configs_valid = True
        for config in self.configs:
            if not config.user_params.check_params():
                # problems with the configuration
                all_configs_valid = False
            config.finalize_params(self.matrix.p)
            if not config.params.check_params():
                all_configs_valid = False

        if not all_configs_valid:
            logger.error('Invalid configuration settings. '
                         'Aborting GO-PCA run.')
            return None

        # print some information
        p, n = self.matrix.shape
        logger.info('Timestamp: %s', timestamp)
        logger.info(
            'Size of expression matrix: p=%d genes x n=%d samples.', p, n)

        # Report hash values for expression matrix and configurations
        expression_hash = self.matrix.hash
        logger.info('Expression matrix hash: %s', expression_hash)
        config_hashes = []
        for i, config in enumerate(self.configs):
            config_hashes.append(config.hash)
            logger.info('Configuration #%d hash: %s', i + 1, config_hashes[-1])

        ### Phase 2: Determine the number of principal components
        num_components = self.num_components
        if num_components == 0:
            # estimate the number of non-trivial PCs using a permutation test
            num_components = self.estimate_num_components()
            if num_components == 0:
                logger.error('The estimated number of non-trivial '
                             'principal components is 0. '
                             'Aborting GO-PCA run.')
                return None
            if 0 < self.pc_max_components < num_components:
                num_components = self.pc_max_components
                logger.info('Limiting the number of PCs to test to %d.',
                            num_components)

        else:
            # determine the total number of principal components
            # (i.e., the number of dimensions spanned by the data)
            max_components = min(self.matrix.p, self.matrix.n - 1)
            if self.num_components > max_components:
                logger.error(
                    'The number of PCs to test was specified as '
                    '%d, but the data spans only %d dimensions. '
                    'Aborting GO-PCA run.', num_components, max_components)
                return None

        # Defensive guard: both branches above already reject a value of 0.
        if num_components == 0:
            logger.error('No principal components to test. '
                         'Aborting GO-PCA run.')
            return None

        ### Phase 3: Perform PCA
        logger.info('Performing PCA...')
        pca = PCA(n_components=num_components)
        # PCA is fitted on the transposed matrix
        Y = pca.fit_transform(self.matrix.X.T)

        # output fraction of variance explained for the PCs tested
        frac = pca.explained_variance_ratio_
        cum_frac = np.cumsum(frac)
        logger.info(
            'Fraction of total variance explained by the first '
            '%d PCs: %.1f%%', num_components, 100 * cum_frac[-1])

        ### Phase 4: Run GO-PCA for each configuration supplied
        enr_logger = logging.getLogger(enrichment.__name__)

        genome = ExpGenome.from_gene_names(self.matrix.genes.tolist())
        W = pca.components_.T  # the loadings matrix

        # per-PC progress is reported at DEBUG level unless `verbose` is set
        msg = logger.info if self.verbose else logger.debug

        all_signatures = []
        for k, config in enumerate(self.configs):

            logger.info(
                'Generating GO-PCA signatures for configuration '
                '%d...', k + 1)

            # Create the GeneSetEnrichmentAnalysis object with the enrichment
            # logger temporarily restricted to errors. The previous level is
            # restored afterwards -- even if the constructor raises -- rather
            # than being reset to NOTSET unconditionally.
            prev_level = enr_logger.level
            enr_logger.setLevel(logging.ERROR)
            try:
                gse_analysis = GeneSetEnrichmentAnalysis(
                    genome, config.gene_sets)
            finally:
                enr_logger.setLevel(prev_level)

            # generate signatures
            final_signatures = []
            var_expl = 0.0
            for d in range(num_components):
                var_expl += frac[d]
                msg('')
                msg('-' * 70)
                msg('PC %d explains %.1f%% of the variance.', d + 1,
                    100 * frac[d])
                msg(
                    'The new cumulative fraction of variance explained '
                    'is %.1f%%.', 100 * var_expl)

                # each PC is tested twice: with a positive and a negative
                # (1-based) PC index
                signatures_dsc = self._generate_pc_signatures(
                    self.matrix, config.params, gse_analysis, W, d + 1)
                signatures_asc = self._generate_pc_signatures(
                    self.matrix, config.params, gse_analysis, W, -(d + 1))
                signatures = signatures_dsc + signatures_asc
                msg('# signatures: %d', len(signatures))

                # apply global filter (if enabled)
                if not config.params.no_global_filter:
                    before = len(signatures)
                    signatures = self._global_filter(config.params, signatures,
                                                     final_signatures,
                                                     config.gene_ontology)
                    msg('Global filter: kept %d / %d signatures.',
                        len(signatures), before)

                final_signatures.extend(signatures)
                msg('Total no. of signatures generated so far: %d',
                    len(final_signatures))

            logger.info('')
            logger.info('=' * 70)
            logger.info(
                'GO-PCA for configuration #%d generated %d '
                'signatures.', k + 1, len(final_signatures))
            logger.info('-' * 70)
            self.print_signatures(final_signatures)
            logger.info('=' * 70)
            logger.info('')
            all_signatures.extend(final_signatures)

        ### Phase 5: Generate signature matrix and return a `GOPCARun` instance
        sig_matrix = GOPCASignatureMatrix.from_signatures(all_signatures)
        t1 = time.time()
        exec_time = t1 - t0
        logger.info('This GO-PCA run took %.2f s.', exec_time)
        gopca_run = GOPCARun(sig_matrix, gopca.__version__, timestamp,
                             exec_time, expression_hash, config_hashes,
                             self.matrix.genes, self.matrix.samples, W, Y)

        return gopca_run
Exemple #5
0
def my_genome(my_genome_file):
    """Return the genome read from the tab-separated file `my_genome_file`."""
    return ExpGenome.read_tsv(my_genome_file)
Exemple #6
0
    def run(self):
        """Perform GO-PCA.

        Validates all configurations, determines the number of principal
        components (PCs) to test, performs PCA on the expression matrix, and
        then generates signatures for every configuration.

        Returns
        -------
        `GOPCARun` or None
            The GO-PCA run, or ``None`` if the run failed.
        """
        t0 = time.time()  # remember the start time
        timestamp = str(datetime.datetime.utcnow())  # timestamp for the run

        ### Phase 1: Make sure all configurations are valid
        # Every configuration is checked (no early exit), so that each one
        # gets its parameters finalized and validated.
        all_configs_valid = True
        for config in self.configs:
            if not config.user_params.check_params():
                # problems with the configuration
                all_configs_valid = False
            config.finalize_params(self.matrix.p)
            if not config.params.check_params():
                all_configs_valid = False

        if not all_configs_valid:
            logger.error('Invalid configuration settings. '
                         'Aborting GO-PCA run.')
            return None

        # print some information
        p, n = self.matrix.shape
        logger.info('Timestamp: %s', timestamp)
        logger.info('Size of expression matrix: '
                    'p=%d genes x n=%d samples.', p, n)

        # Report hash values for expression matrix and configurations
        expression_hash = self.matrix.hash
        logger.info('Expression matrix hash: %s', expression_hash)
        config_hashes = []
        for i, config in enumerate(self.configs):
            config_hashes.append(config.hash)
            logger.info('Configuration #%d hash: %s', i+1, config_hashes[-1])

        ### Phase 2: Determine the number of principal components
        num_components = self.num_components
        if num_components == 0:
            # estimate the number of non-trivial PCs using a permutation test
            num_components = self.estimate_num_components()
            if num_components == 0:
                logger.error('The estimated number of non-trivial '
                             'principal components is 0. '
                             'Aborting GO-PCA run.')
                return None
            if 0 < self.pc_max_components < num_components:
                num_components = self.pc_max_components
                logger.info('Limiting the number of PCs to test to %d.',
                            num_components)

        else:
            # determine the total number of principal components
            # (i.e., the number of dimensions spanned by the data)
            max_components = min(self.matrix.p, self.matrix.n - 1)
            if self.num_components > max_components:
                logger.error('The number of PCs to test was specified as '
                             '%d, but the data spans only %d dimensions. '
                             'Aborting GO-PCA run.',
                             num_components, max_components)
                return None

        # Defensive guard: both branches above already reject a value of 0.
        if num_components == 0:
            logger.error('No principal components to test. '
                         'Aborting GO-PCA run.')
            return None

        ### Phase 3: Perform PCA
        logger.info('Performing PCA...')
        pca = PCA(n_components=num_components)
        # PCA is fitted on the transposed matrix
        Y = pca.fit_transform(self.matrix.X.T)

        # output fraction of variance explained for the PCs tested
        frac = pca.explained_variance_ratio_
        cum_frac = np.cumsum(frac)
        logger.info('Fraction of total variance explained by the first '
                    '%d PCs: %.1f%%', num_components, 100 * cum_frac[-1])

        ### Phase 4: Run GO-PCA for each configuration supplied
        enr_logger = logging.getLogger(enrichment.__name__)

        genome = ExpGenome.from_gene_names(self.matrix.genes.tolist())
        W = pca.components_.T  # the loadings matrix

        # per-PC progress is reported at DEBUG level unless `verbose` is set
        msg = logger.info if self.verbose else logger.debug

        all_signatures = []
        for k, config in enumerate(self.configs):

            logger.info('Generating GO-PCA signatures for configuration '
                        '%d...', k+1)

            # Create the GeneSetEnrichmentAnalysis object with the enrichment
            # logger temporarily restricted to errors. The previous level is
            # restored afterwards -- even if the constructor raises -- rather
            # than being reset to NOTSET unconditionally.
            prev_level = enr_logger.level
            enr_logger.setLevel(logging.ERROR)
            try:
                gse_analysis = GeneSetEnrichmentAnalysis(
                    genome, config.gene_sets)
            finally:
                enr_logger.setLevel(prev_level)

            # generate signatures
            final_signatures = []
            var_expl = 0.0
            for d in range(num_components):
                var_expl += frac[d]
                msg('')
                msg('-'*70)
                msg('PC %d explains %.1f%% of the variance.',
                    d+1, 100*frac[d])
                msg('The new cumulative fraction of variance explained '
                    'is %.1f%%.', 100*var_expl)

                # each PC is tested twice: with a positive and a negative
                # (1-based) PC index
                signatures_dsc = self._generate_pc_signatures(
                    self.matrix, config.params, gse_analysis, W, d+1)
                signatures_asc = self._generate_pc_signatures(
                    self.matrix, config.params, gse_analysis, W, -(d+1))
                signatures = signatures_dsc + signatures_asc
                msg('# signatures: %d', len(signatures))

                # apply global filter (if enabled)
                if not config.params.no_global_filter:
                    before = len(signatures)
                    signatures = self._global_filter(
                        config.params, signatures, final_signatures,
                        config.gene_ontology)
                    msg('Global filter: kept %d / %d signatures.',
                        len(signatures), before)

                final_signatures.extend(signatures)
                msg('Total no. of signatures generated so far: %d',
                    len(final_signatures))

            logger.info('')
            logger.info('='*70)
            logger.info('GO-PCA for configuration #%d generated %d '
                        'signatures.', k+1, len(final_signatures))
            logger.info('-'*70)
            self.print_signatures(final_signatures)
            logger.info('='*70)
            logger.info('')
            all_signatures.extend(final_signatures)

        ### Phase 5: Generate signature matrix and return a `GOPCARun` instance
        sig_matrix = GOPCASignatureMatrix.from_signatures(all_signatures)
        t1 = time.time()
        exec_time = t1 - t0
        logger.info('This GO-PCA run took %.2f s.', exec_time)
        gopca_run = GOPCARun(sig_matrix,
                             gopca.__version__, timestamp, exec_time,
                             expression_hash, config_hashes,
                             self.matrix.genes, self.matrix.samples, W, Y)

        return gopca_run
def main(args=None):
    """Extract GO annotations and store in tab-delimited text file.

    Reads a gene list, a Gene Ontology OBO file and a gzip-compressed GOA
    association file, derives gene sets from the annotations, optionally
    filters them by size, and writes them to the output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If one of the three input files does not exist.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gene_file = args.gene_file
    gene_ontology_file = args.gene_ontology_file
    goa_association_file = args.goa_association_file
    output_file = args.output_file

    evidence_codes = args.evidence_codes
    min_genes = args.min_genes_per_term
    max_genes = args.max_genes_per_term

    # NOTE(review): this option is read but never used below -- confirm
    # whether it should be passed on to the gene set extraction.
    part_of_cc_only = args.part_of_cc_only

    # logging parameters
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # Configure the root logger exactly once, before the first message is
    # emitted, so that every log message honors the stream redirect below.
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=log_file,
                             quiet=quiet,
                             verbose=verbose)

    logger.info('Selected evidence codes: %s', ', '.join(evidence_codes))
    logger.info('Min. number of genes per gene set: %d', min_genes)
    logger.info('Max. number of genes per gene set: %d', max_genes)

    # checks
    assert os.path.isfile(gene_file)
    assert os.path.isfile(gene_ontology_file)
    assert os.path.isfile(goa_association_file)

    # read the genes from the tab-delimited gene file
    exp_genome = ExpGenome.read_tsv(gene_file)

    # parse Gene Ontology
    gene_ontology = GeneOntology.read_obo(gene_ontology_file)

    # parse UniProt-GOA gene association file
    with gzip.open(goa_association_file, 'rt', encoding='ascii') as fh:
        go_annotations = ontology.parse_gaf(fh,
                                            gene_ontology,
                                            ev_codes=evidence_codes,
                                            genome=exp_genome)

    # extract GO-based gene sets
    gene_sets = ontology.get_goa_gene_sets(go_annotations)
    logger.info('Generated %d GO-derived gene sets', len(gene_sets))

    # filter gene sets based on size (a bound of 0 disables that filter)
    if min_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(gs for gs in gene_sets
                                      if gs.size >= min_genes)
        logger.info('Excluded %d gene sets with too few genes.',
                    old_size - len(gene_sets))

    if max_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(gs for gs in gene_sets
                                      if gs.size <= max_genes)
        logger.info('Excluded %d gene sets with too many genes.',
                    old_size - len(gene_sets))

    # writing output file
    gene_sets.write_tsv(output_file)
    logger.info('Wrote %s GO-derived gene sets to output file "%s".',
                len(gene_sets), output_file)

    return 0
def test_tsv(tmpdir, my_genome):
    """Write the genome to a TSV file, read it back, and expect equality."""
    tsv_path = str(tmpdir.join('_genome.tsv'))
    my_genome.write_tsv(tsv_path)
    reloaded = ExpGenome.read_tsv(tsv_path)
    assert my_genome == reloaded