def test_from_names(my_genes):
    """Check that a genome built from bare gene names carries only names."""
    genome = ExpGenome.from_gene_names(my_genes)
    assert len(genome) == len(my_genes)

    # Genes are compared pairwise with the input names, in iteration order;
    # no chromosome or Ensembl ID annotation may be attached to them.
    for expected_name, gene in zip(my_genes, genome):
        assert gene.name == expected_name
        assert gene.chromosomes == []
        assert gene.ensembl_ids == []
def main(args=None):
    """Convert an Entrez ID-based expression matrix to gene symbols.

    Reads an expression matrix whose rows are labeled with Entrez IDs,
    translates the IDs to gene names using a lookup file, keeps only the
    genes contained in the genome read from the gene file, and writes the
    resulting matrix to the output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained
        by parsing the command line arguments with the parser returned by
        `get_argument_parser`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the Entrez IDs, or the gene names they map to, are not unique.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (vinfo.major, vinfo.minor)
        )

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # Remove the "_at" suffix from Entrez IDs. Only IDs that actually
        # carry the suffix are shortened, so other IDs are left intact
        # instead of losing their last three characters.
        entrez = [e[:-3] if e.endswith("_at") else e for e in entrez]

    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez)), "Entrez IDs are not unique."

    # convert Entrez IDs to gene names, dropping rows without a mapping
    num_unmapped = 0
    genes = []
    X = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            num_unmapped += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])

    # multiple Entrez IDs pointing to the same gene are not supported
    assert len(genes) == len(set(genes)), (
        "Multiple Entrez IDs map to the same gene name."
    )

    if num_unmapped > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs "
            "to gene symbols (%.1f%%).",
            num_unmapped,
            matrix.p,
            100 * (num_unmapped / float(matrix.p)),
        )

    # filter for known protein-coding genes
    X = np.array(X, dtype=np.float64)
    p = X.shape[0]
    logger.debug(str(X.shape))

    # boolean mask over the converted genes, then the indices of the hits
    known = np.array([g in genome for g in genes], dtype=np.bool_)
    sel = np.nonzero(known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    num_unknown = p - sel.size
    if num_unknown > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of "
            "protein-coding genes (%.1f%%)",
            num_unknown,
            p,
            100 * (num_unknown / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug(
        "Genes: %d, Samples: %d, matrix: %s",
        len(genes),
        len(matrix.samples),
        str(X.shape),
    )
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def my_genome():
    """Return a fake genome with one gene per lowercase ASCII letter."""
    gene_names = list(map(text, ascii_lowercase))
    return ExpGenome.from_gene_names(gene_names)
def run(self):
    """Perform GO-PCA.

    The run validates all configurations, determines the number of
    principal components (PCs) to test, performs PCA on the transposed
    expression matrix, generates signatures for every configuration, and
    combines all signatures into one signature matrix.

    Returns
    -------
    `GOPCARun` or None
        The GO-PCA run, or ``None`` if the run failed (invalid
        configuration, or no / too many principal components to test).
    """
    t0 = time.time()  # remember the start time
    timestamp = str(datetime.datetime.utcnow())  # timestamp for the run

    ### Phase 1: Make sure all configurations are valid
    all_configs_valid = True
    for config in self.configs:
        if not config.user_params.check_params():
            # problems with the configuration
            all_configs_valid = False
        config.finalize_params(self.matrix.p)
        if not config.params.check_params():
            all_configs_valid = False

    if not all_configs_valid:
        logger.error('Invalid configuration settings. '
                     'Aborting GO-PCA run.')
        return None

    # print some information
    p, n = self.matrix.shape
    logger.info('Timestamp: %s', timestamp)
    logger.info(
        'Size of expression matrix: ' + 'p=%d genes x n=%d samples.', p, n)

    # Report hash values for expression matrix and configurations
    expression_hash = self.matrix.hash
    logger.info('Expression matrix hash: %s', expression_hash)
    config_hashes = []
    for i, config in enumerate(self.configs):
        config_hashes.append(config.hash)
        logger.info('Configuration #%d hash: %s', i + 1, config_hashes[-1])

    ### Phase 2: Determine the number of principal components
    num_components = self.num_components
    if num_components == 0:
        # estimate the number of non-trivial PCs using a permutation test
        num_components = self.estimate_num_components()
        if num_components == 0:
            logger.error('The estimated number of non-trivial '
                         'principal components is 0. '
                         'Aborting GO-PCA run.')
            return None
        if 0 < self.pc_max_components < num_components:
            num_components = self.pc_max_components
            logger.info('Limiting the number of PCs to test to %d.',
                        num_components)
    else:
        # determine the total number of principal components
        # (i.e., the number of dimensions spanned by the data)
        max_components = min(self.matrix.p, self.matrix.n - 1)
        if self.num_components > max_components:
            logger.error(
                'The number of PCs to test was specified as '
                '%d, but the data spans only %d dimensions. '
                'Aborting GO-PCA run.',
                num_components, max_components)
            return None

    # Defensive guard: both branches above are expected to leave a
    # non-zero number of components.
    if num_components == 0:
        logger.error('No principal components to test. '
                     'Aborting GO-PCA run.')
        return None

    ### Phase 3: Perform PCA
    logger.info('Performing PCA...')
    pca = PCA(n_components=num_components)
    Y = pca.fit_transform(self.matrix.X.T)

    # output fraction of variance explained for the PCs tested
    frac = pca.explained_variance_ratio_
    cum_frac = np.cumsum(frac)
    logger.info(
        'Fraction of total variance explained by the first '
        '%d PCs: %.1f%%', num_components, 100 * cum_frac[-1])

    ### Phase 4: Run GO-PCA for each configuration supplied
    enr_logger = logging.getLogger(enrichment.__name__)

    genome = ExpGenome.from_gene_names(self.matrix.genes.tolist())
    W = pca.components_.T  # the loadings matrix

    msg = logger.debug
    if self.verbose:
        # enable more verbose "INFO" messages
        msg = logger.info

    all_signatures = []
    for k, config in enumerate(self.configs):
        logger.info(
            'Generating GO-PCA signatures for configuration '
            '%d...', k + 1)

        # Create the GeneSetEnrichmentAnalysis object with the enrichment
        # logger limited to errors. The previous level is restored
        # afterwards (even on failure) rather than being reset to NOTSET.
        previous_level = enr_logger.level
        enr_logger.setLevel(logging.ERROR)
        try:
            gse_analysis = GeneSetEnrichmentAnalysis(genome,
                                                     config.gene_sets)
        finally:
            enr_logger.setLevel(previous_level)

        # generate signatures
        final_signatures = []
        var_expl = 0.0
        for d in range(num_components):
            var_expl += frac[d]
            msg('')
            msg('-' * 70)
            msg('PC %d explains %.1f%% of the variance.',
                d + 1, 100 * frac[d])
            msg(
                'The new cumulative fraction of variance explained '
                'is %.1f%%.', 100 * var_expl)

            # a positive PC index requests one direction of the PC,
            # a negative index the opposite direction
            signatures_dsc = self._generate_pc_signatures(
                self.matrix, config.params, gse_analysis, W, d + 1)
            signatures_asc = self._generate_pc_signatures(
                self.matrix, config.params, gse_analysis, W, -(d + 1))
            signatures = signatures_dsc + signatures_asc
            msg('# signatures: %d', len(signatures))

            # apply global filter (if enabled)
            if not config.params.no_global_filter:
                before = len(signatures)
                signatures = self._global_filter(config.params,
                                                 signatures,
                                                 final_signatures,
                                                 config.gene_ontology)
                msg('Global filter: kept %d / %d signatures.',
                    len(signatures), before)

            final_signatures.extend(signatures)
            msg('Total no. of signatures generated so far: %d',
                len(final_signatures))

        logger.info('')
        logger.info('=' * 70)
        logger.info(
            'GO-PCA for configuration #%d generated %d '
            'signatures.', k + 1, len(final_signatures))
        logger.info('-' * 70)
        self.print_signatures(final_signatures)
        logger.info('=' * 70)
        logger.info('')
        all_signatures.extend(final_signatures)

    ### Phase 5: Generate signature matrix and return a `GOPCARun` instance
    sig_matrix = GOPCASignatureMatrix.from_signatures(all_signatures)
    t1 = time.time()
    exec_time = t1 - t0
    logger.info('This GO-PCA run took %.2f s.', exec_time)
    gopca_run = GOPCARun(sig_matrix, gopca.__version__, timestamp,
                         exec_time, expression_hash, config_hashes,
                         self.matrix.genes, self.matrix.samples, W, Y)
    return gopca_run
def my_genome(my_genome_file):
    """Load the genome from `my_genome_file` via `ExpGenome.read_tsv`."""
    return ExpGenome.read_tsv(my_genome_file)
def run(self):
    """Perform GO-PCA.

    The run validates all configurations, determines the number of
    principal components (PCs) to test, performs PCA on the transposed
    expression matrix, generates signatures for every configuration, and
    combines all signatures into one signature matrix.

    Returns
    -------
    `GOPCARun` or None
        The GO-PCA run, or ``None`` if the run failed (invalid
        configuration, or no / too many principal components to test).
    """
    t0 = time.time()  # remember the start time
    timestamp = str(datetime.datetime.utcnow())  # timestamp for the run

    ### Phase 1: Make sure all configurations are valid
    all_configs_valid = True
    for config in self.configs:
        if not config.user_params.check_params():
            # problems with the configuration
            all_configs_valid = False
        config.finalize_params(self.matrix.p)
        if not config.params.check_params():
            all_configs_valid = False

    if not all_configs_valid:
        logger.error('Invalid configuration settings. '
                     'Aborting GO-PCA run.')
        return None

    # print some information
    p, n = self.matrix.shape
    logger.info('Timestamp: %s', timestamp)
    logger.info('Size of expression matrix: ' +
                'p=%d genes x n=%d samples.', p, n)

    # Report hash values for expression matrix and configurations
    expression_hash = self.matrix.hash
    logger.info('Expression matrix hash: %s', expression_hash)
    config_hashes = []
    for i, config in enumerate(self.configs):
        config_hashes.append(config.hash)
        logger.info('Configuration #%d hash: %s', i+1, config_hashes[-1])

    ### Phase 2: Determine the number of principal components
    num_components = self.num_components
    if num_components == 0:
        # estimate the number of non-trivial PCs using a permutation test
        num_components = self.estimate_num_components()
        if num_components == 0:
            logger.error('The estimated number of non-trivial '
                         'principal components is 0. '
                         'Aborting GO-PCA run.')
            return None
        if 0 < self.pc_max_components < num_components:
            num_components = self.pc_max_components
            logger.info('Limiting the number of PCs to test to %d.',
                        num_components)
    else:
        # determine the total number of principal components
        # (i.e., the number of dimensions spanned by the data)
        max_components = min(self.matrix.p, self.matrix.n - 1)
        if self.num_components > max_components:
            logger.error('The number of PCs to test was specified as '
                         '%d, but the data spans only %d dimensions. '
                         'Aborting GO-PCA run.',
                         num_components, max_components)
            return None

    # Defensive guard: both branches above are expected to leave a
    # non-zero number of components.
    if num_components == 0:
        logger.error('No principal components to test. '
                     'Aborting GO-PCA run.')
        return None

    ### Phase 3: Perform PCA
    logger.info('Performing PCA...')
    pca = PCA(n_components=num_components)
    Y = pca.fit_transform(self.matrix.X.T)

    # output fraction of variance explained for the PCs tested
    frac = pca.explained_variance_ratio_
    cum_frac = np.cumsum(frac)
    logger.info('Fraction of total variance explained by the first '
                '%d PCs: %.1f%%', num_components, 100 * cum_frac[-1])

    ### Phase 4: Run GO-PCA for each configuration supplied
    enr_logger = logging.getLogger(enrichment.__name__)

    genome = ExpGenome.from_gene_names(self.matrix.genes.tolist())
    W = pca.components_.T  # the loadings matrix

    msg = logger.debug
    if self.verbose:
        # enable more verbose "INFO" messages
        msg = logger.info

    all_signatures = []
    for k, config in enumerate(self.configs):
        logger.info('Generating GO-PCA signatures for configuration '
                    '%d...', k+1)

        # Create the GeneSetEnrichmentAnalysis object with the enrichment
        # logger limited to errors. The previous level is restored
        # afterwards (even on failure) rather than being reset to NOTSET.
        previous_level = enr_logger.level
        enr_logger.setLevel(logging.ERROR)
        try:
            gse_analysis = GeneSetEnrichmentAnalysis(genome,
                                                     config.gene_sets)
        finally:
            enr_logger.setLevel(previous_level)

        # generate signatures
        final_signatures = []
        var_expl = 0.0
        for d in range(num_components):
            var_expl += frac[d]
            msg('')
            msg('-'*70)
            msg('PC %d explains %.1f%% of the variance.',
                d+1, 100*frac[d])
            msg('The new cumulative fraction of variance explained '
                'is %.1f%%.', 100*var_expl)

            # a positive PC index requests one direction of the PC,
            # a negative index the opposite direction
            signatures_dsc = self._generate_pc_signatures(
                self.matrix, config.params, gse_analysis, W, d+1)
            signatures_asc = self._generate_pc_signatures(
                self.matrix, config.params, gse_analysis, W, -(d+1))
            signatures = signatures_dsc + signatures_asc
            msg('# signatures: %d', len(signatures))

            # apply global filter (if enabled)
            if not config.params.no_global_filter:
                before = len(signatures)
                signatures = self._global_filter(
                    config.params, signatures, final_signatures,
                    config.gene_ontology)
                msg('Global filter: kept %d / %d signatures.',
                    len(signatures), before)

            final_signatures.extend(signatures)
            msg('Total no. of signatures generated so far: %d',
                len(final_signatures))

        logger.info('')
        logger.info('='*70)
        logger.info('GO-PCA for configuration #%d generated %d '
                    'signatures.', k+1, len(final_signatures))
        logger.info('-'*70)
        self.print_signatures(final_signatures)
        logger.info('='*70)
        logger.info('')
        all_signatures.extend(final_signatures)

    ### Phase 5: Generate signature matrix and return a `GOPCARun` instance
    sig_matrix = GOPCASignatureMatrix.from_signatures(all_signatures)
    t1 = time.time()
    exec_time = t1 - t0
    logger.info('This GO-PCA run took %.2f s.', exec_time)
    gopca_run = GOPCARun(sig_matrix,
                         gopca.__version__, timestamp, exec_time,
                         expression_hash, config_hashes,
                         self.matrix.genes, self.matrix.samples, W, Y)
    return gopca_run
def main(args=None):
    """Extract GO annotations and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained
        by parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If one of the three input files does not exist.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gene_file = args.gene_file
    gene_ontology_file = args.gene_ontology_file
    goa_association_file = args.goa_association_file
    output_file = args.output_file

    evidence_codes = args.evidence_codes
    min_genes = args.min_genes_per_term
    max_genes = args.max_genes_per_term
    # NOTE(review): `args.part_of_cc_only` is not consulted anywhere in this
    # function -- confirm whether the option is meant to affect the output.

    # logging parameters
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # Configure the root logger exactly once, before the first message.
    # If we print output to stdout ("-"), log messages go to stderr so that
    # they cannot end up in the output stream.
    log_stream = sys.stdout
    if output_file == '-':
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    logger.info('Selected evidence codes: %s', ', '.join(evidence_codes))
    logger.info('Min. number of genes per gene set: %d', min_genes)
    logger.info('Max. number of genes per gene set: %d', max_genes)

    # checks
    assert os.path.isfile(gene_file), \
        'Gene file not found: %s' % gene_file
    assert os.path.isfile(gene_ontology_file), \
        'Gene Ontology file not found: %s' % gene_ontology_file
    assert os.path.isfile(goa_association_file), \
        'GO association file not found: %s' % goa_association_file

    # read the genome (list of genes) from the tab-delimited gene file
    exp_genome = ExpGenome.read_tsv(gene_file)

    # parse Gene Ontology
    gene_ontology = GeneOntology.read_obo(gene_ontology_file)

    # parse UniProt-GOA gene association file (gzip-compressed text)
    with gzip.open(goa_association_file, 'rt', encoding='ascii') as fh:
        go_annotations = ontology.parse_gaf(fh, gene_ontology,
                                            ev_codes=evidence_codes,
                                            genome=exp_genome)

    # extract GO-based gene sets
    gene_sets = ontology.get_goa_gene_sets(go_annotations)
    logger.info('Generated %d GO-derived gene sets', len(gene_sets))

    # filter gene sets based on size (a bound of 0 disables that filter)
    if min_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(
            gs for gs in gene_sets if gs.size >= min_genes)
        logger.info('Excluded %d gene sets with too few genes.',
                    old_size - len(gene_sets))

    if max_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(
            gs for gs in gene_sets if gs.size <= max_genes)
        logger.info('Excluded %d gene sets with too many genes.',
                    old_size - len(gene_sets))

    # writing output file
    gene_sets.write_tsv(output_file)
    logger.info('Wrote %s GO-derived gene sets to output file "%s".',
                len(gene_sets), output_file)

    return 0
def test_tsv(tmpdir, my_genome):
    """Check that a genome is unchanged by a TSV write/read round trip."""
    tsv_path = str(tmpdir.join('_genome.tsv'))
    my_genome.write_tsv(tsv_path)
    reloaded = ExpGenome.read_tsv(tsv_path)
    assert my_genome == reloaded