def main(args=None):
    """Export the signatures of a GO-PCA result to an Excel workbook.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    py_version = sys.version_info
    if py_version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (py_version.major, py_version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    assert os.path.isfile(gopca_file)

    workbook = xlsxwriter.Workbook(
        output_file, {'strings_to_numbers': True, 'in_memory': True})
    workbook.set_properties({'title': 'GO-PCA Signatures'})
    header_format = workbook.add_format({'bold': True})
    sheet = workbook.add_worksheet()

    def sort_key(s):
        # PC number first, positive before negative PCs, then descending
        # enrichment score
        return [abs(s.pc), -sign(s.pc), -s.escore]

    signatures = sorted(util.read_gopca_result(gopca_file).signatures,
                        key=sort_key)

    # header row (taken from the first signature)
    labels = list(signatures[0].get_ordered_dict().keys())
    sheet.write_row(0, 0, labels, cell_format=header_format)

    # one row per signature; track the widest entry of every column
    col_widths = np.float64([len(label) for label in labels])
    for row, sig in enumerate(signatures, start=1):
        vals = sig.get_ordered_dict().values()
        for col, v in enumerate(vals):
            col_widths[col] = max(col_widths[col], float(len(v)))
        sheet.write_row(row, 0, vals)

    for col in range(len(labels)):
        sheet.set_column(col, col, col_widths[col] + 0.43)

    workbook.close()

    logger.info('Wrote %d signatures to "%s".', len(signatures), output_file)

    return 0
def main(args=None):
    """Write the signature matrix of a GO-PCA result to a tab-delimited file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    # read the result file once (it used to be parsed twice, with the first
    # result never being used)
    sig_matrix = util.read_gopca_result(gopca_file)

    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]
    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'

    # raise the level of the expression logger while writing, and always
    # reset it afterwards, even if writing fails
    exp_logger = logging.getLogger(expression.__name__)
    exp_logger.setLevel(logging.WARNING)
    try:
        matrix.write_tsv(output_file)
    finally:
        exp_logger.setLevel(logging.NOTSET)

    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
def main(args=None):
    """Script body."""
    if args is None:
        # no arguments supplied => parse the command line
        args = get_argument_parser().parse_args()

    fasta_file = args.fasta_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    output_file = args.output_file

    # configure root logger; if we print output to stdout, log messages are
    # redirected to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=args.log_file,
                             quiet=args.quiet, verbose=args.verbose)

    # without an explicit pattern, use the one registered for the species
    pattern = chrom_pat
    if pattern is None:
        pattern = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(pattern)

    # filter the FASTA file
    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with misc.smart_open_read(
            fasta_file, mode='r', encoding='ascii', try_gzip=True) as fh:
        with misc.smart_open_write(
                output_file, mode='w', encoding='ascii') as ofh:
            for seq in FastaReader(fh):
                # the chromosome name is everything before the first space
                chrom = seq.name.partition(' ')[0]
                if chrom_re.match(chrom) is not None:
                    seq.name = chrom
                    seq.append_fasta(ofh)
                else:
                    logger.info('Ignoring chromosome "%s"...', chrom)

    return 0
def main(args=None):
    """Script body."""
    if args is None:
        # parse command-line arguments
        args = get_argument_parser().parse_args()

    fasta_file, output_file = args.fasta_file, args.output_file
    species, chrom_pat = args.species, args.chromosome_pattern
    log_file, quiet, verbose = args.log_file, args.quiet, args.verbose

    # configure root logger
    # (log messages go to stderr if the output is printed to stdout)
    if output_file == '-':
        log_stream = sys.stderr
    else:
        log_stream = sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # generate regular expression object from the chromosome pattern
    chrom_re = re.compile(
        ensembl.species_chrompat[species] if chrom_pat is None
        else chrom_pat)

    def chromosome_of(seq):
        # first whitespace-delimited token of the sequence name
        return seq.name.split(' ', 1)[0]

    # filter the FASTA file
    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with misc.smart_open_read(
            fasta_file, mode='r', encoding='ascii', try_gzip=True
    ) as fh, misc.smart_open_write(
            output_file, mode='w', encoding='ascii'
    ) as ofh:
        for seq in FastaReader(fh):
            chrom = chromosome_of(seq)
            if chrom_re.match(chrom) is None:
                logger.info('Ignoring chromosome "%s"...', chrom)
            else:
                seq.name = chrom
                seq.append_fasta(ofh)

    return 0
def main(args=None):
    """Plot the signature matrix of a GO-PCA result as a plotly figure.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    py_version = sys.version_info
    if py_version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (py_version.major, py_version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # figure layout options
    figure_kw = {
        'width': args.width,
        'height': args.height,
        'font_size': args.font_size,
        'font': args.font,
        'emin': args.min_val,
        'emax': args.max_val,
        'margin_left': args.margin_left,
        'margin_bottom': args.margin_bottom,
    }

    # the plotly.js library is embedded unless explicitly disabled
    include_plotlyjs = not args.no_plotly_js

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    figure_kw['heatmap_kw'] = {'colorbar_label': args.colorbar_label}

    sig_matrix = util.read_gopca_result(gopca_file)
    fig = sig_matrix.get_figure(**figure_kw)

    plot(fig, filename=output_file,
         image_filename='gopca_signature_matrix',
         auto_open=False, include_plotlyjs=include_plotlyjs)

    logger.info('Plotted %d x %d signature matrix.',
                sig_matrix.p, sig_matrix.n)

    return 0
def main(args=None):
    """Extract the Entrez ID -> gene symbol mapping and write it to a file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    py_version = sys.version_info
    if py_version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (py_version.major, py_version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    gene2acc_file = args.gene2acc_file
    output_file = args.output_file

    # configure logger; an output file of "-" moves the log to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout
    logger = misc.get_logger(log_stream=log_stream, log_file=args.log_file,
                             quiet=args.quiet, verbose=args.verbose)

    # read the mapping and write it straight back out
    write_entrez2gene(output_file,
                      read_gene2acc(gene2acc_file, logger),
                      logger)

    return 0
def main(args=None):
    """Write the signatures of a GO-PCA result to a tab-delimited text file.

    The first row holds the column labels (taken from the first signature),
    followed by one row per signature.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    assert os.path.isfile(gopca_file)

    result = util.read_gopca_result(gopca_file)

    # sort signatures first by PC, then by fold enrichment
    signatures = sorted(
        result.signatures,
        key=lambda s: [abs(s.pc), -sign(s.pc), -s.escore])

    labels = signatures[0].get_ordered_dict().keys()

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3 (writing to a file opened with 'wb'
    # raises a TypeError there).
    if vinfo[0] >= 3:
        open_kwargs = dict(mode='w', newline='')
    else:
        open_kwargs = dict(mode='wb')

    with open(output_file, **open_kwargs) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator='\n', quoting=csv.QUOTE_NONE)
        writer.writerow(labels)
        for sig in signatures:
            writer.writerow(sig.get_ordered_dict().values())

    logger.info('Wrote %d signatures to "%s".', len(signatures), output_file)

    return 0
def main(args=None):
    """Export GO-PCA signatures as a tab-delimited text file.

    Signatures are sorted by principal component and enrichment score and
    written one per row, preceded by a header row.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    assert os.path.isfile(gopca_file)

    result = util.read_gopca_result(gopca_file)
    signatures = result.signatures

    # sort signatures first by PC, then by fold enrichment
    signatures = sorted(signatures,
                        key=lambda s: [abs(s.pc), -sign(s.pc), -s.escore])

    labels = signatures[0].get_ordered_dict().keys()

    # csv.writer emits text on Python 3, so the file must be opened in text
    # mode there (with newline='' so that the line terminator is written
    # verbatim); Python 2 still requires binary mode.
    if sys.version_info[0] < 3:
        ofh = open(output_file, 'wb')
    else:
        ofh = open(output_file, 'w', newline='')

    with ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator='\n', quoting=csv.QUOTE_NONE)
        writer.writerow(labels)
        for sig in signatures:
            vals = sig.get_ordered_dict().values()
            writer.writerow(vals)

    logger.info('Wrote %d signatures to "%s".', len(signatures), output_file)

    return 0
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Tests for the `GSEAnalysis` class."""

import pytest

# from genometools.expression import ExpGenome
from genometools import misc
from genometools.enrichment import GeneSetEnrichmentAnalysis

logger = misc.get_logger('genometools', verbose=True)


@pytest.fixture
def my_analysis(my_valid_genes, my_gene_set_coll):
    """Provide an analysis object built from the gene and gene set fixtures."""
    return GeneSetEnrichmentAnalysis(my_valid_genes, my_gene_set_coll)


def test_basic(my_analysis, my_valid_genes):
    """Check the type, string conversions and gene list of the analysis."""
    assert isinstance(my_analysis, GeneSetEnrichmentAnalysis)

    # both string conversions must produce native strings
    for convert in (repr, str):
        assert isinstance(convert(my_analysis), str)

    # the analysis holds a distinct object of the same length
    analysis_genes = my_analysis.valid_genes
    assert analysis_genes is not my_valid_genes
    assert len(analysis_genes) == len(my_valid_genes)
def main(args=None):
    """Extracts gene-level expression data from StringTie output.

    The StringTie file is read twice: the first pass associates StringTie
    gene IDs with reference gene symbols, the second pass sums the FPKM and
    TPM values of all transcripts for each gene in the gene file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    def percent(part, whole):
        """Return `part` as a percentage of `whole` (0.0 if `whole` is 0)."""
        if not whole:
            return 0.0
        return 100 * (part / float(whole))

    def iter_transcripts(path):
        """Yield the parsed attributes of every transcript line in `path`."""
        with open(path) as fh:
            for fields in csv.reader(fh, dialect='excel-tab'):
                if not fields or fields[0].startswith('#'):
                    # skip blank lines and header lines
                    continue
                assert len(fields) == 9
                if fields[2] != 'transcript':
                    # skip exon lines
                    continue
                yield parse_attributes(fields[8])

    # read list of gene symbols
    logger.info('Reading gene data...')
    genes = misc.read_single(gene_file)

    logger.info('Parsing StringTie output...')

    # first pass: collect the reference gene symbols seen for each gene ID
    logger.info('Associating StringTie gene IDs with gene symbols...')
    stringtie_genes = {}
    for attr in iter_transcripts(stringtie_file):
        try:
            ref_gene = attr['ref_gene_name']
        except KeyError:
            continue
        stringtie_genes.setdefault(attr['gene_id'], set()).add(ref_gene)
    logger.info('Associated %d gene IDs with gene symbols.',
                len(stringtie_genes))

    n_ambiguous = sum(1 for v in stringtie_genes.values() if len(v) > 1)
    logger.info('%d / %d associated with multiple gene symbols (%.1f%%).',
                n_ambiguous, len(stringtie_genes),
                percent(n_ambiguous, len(stringtie_genes)))

    # second pass: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0

    for attr in iter_transcripts(stringtie_file):
        f = float(attr['FPKM'])

        try:
            g = attr['ref_gene_name']
        except KeyError:
            if no_novel_transcripts:
                # ignore this transcript
                fpkm_novel_trans += f
                continue
            # see if we can assign a gene name based on the gene ID
            try:
                assoc = stringtie_genes[attr['gene_id']]
            except KeyError:
                # gene_id not associated with any reference gene
                fpkm_novel_gene += f
                continue
            if len(assoc) > 1:
                # gene ID associated with multiple ref. genes => ignored
                fpkm_ambig += f
                continue
            # gene ID associated with exactly one ref. gene
            g = list(assoc)[0]

        try:
            idx = misc.bisect_index(genes, g)
        except ValueError:
            fpkm_unknown_gene_name += f
            logger.warning('Unknown gene name: "%s".', g)
            continue

        fpkm[idx] += f
        tpm[idx] += float(attr['TPM'])

    # report how much expression was not assigned to any gene
    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info('Ignored %.1f / %.1f FPKM (%.1f%%)', ignored_fpkm,
                total_fpkm, percent(ignored_fpkm, total_fpkm))

    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info('Ignored %.1f FPKM from novel transcripts (%.1f%%).',
                        fpkm_novel_trans,
                        percent(fpkm_novel_trans, total_fpkm))
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts of novel genes '
                '(%.1f%%).',
                fpkm_novel_gene, percent(fpkm_novel_gene, total_fpkm))
        if fpkm_ambig > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts with ambiguous '
                'gene membership (%.1f%%).',
                fpkm_ambig, percent(fpkm_ambig, total_fpkm))

    # unknown gene names count as ignored in both modes, so always report
    if fpkm_unknown_gene_name > 0:
        logger.info(
            'Ignored %.1f FPKM from transcripts of genes with unknown '
            'names (%.1f%%).',
            fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm))

    # write output file (gene, FPKM, TPM)
    E = np.c_[fpkm, tpm]
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ['%.5f' % e for e in E[i, :]])

    return 0
def main(args=None):
    """Convert an Entrez ID-based expression matrix to gene symbols.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    py_version = sys.version_info
    if py_version < (2, 7):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (py_version.major, py_version.minor)
        )

    if args is None:
        args = get_argument_parser().parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    # configure root logger
    logger = misc.get_logger(log_file=args.log_file, quiet=args.quiet,
                             verbose=args.verbose)

    # read data
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes
    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs
        entrez = [entrez_id[:-3] for entrez_id in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names, keeping only the convertible rows
    n_unmapped = 0
    genes = []
    rows = []
    for row_idx, entrez_id in enumerate(entrez):
        if entrez_id not in e2g:
            n_unmapped += 1
            continue
        genes.append(e2g[entrez_id])
        rows.append(matrix.X[row_idx, :])

    # no two Entrez IDs may point to the same gene
    assert len(genes) == len(set(genes))

    if n_unmapped > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs "
            "to gene symbols (%.1f%%).",
            n_unmapped,
            matrix.p,
            100 * (n_unmapped / float(matrix.p)),
        )

    # filter for known protein-coding genes
    X = np.float64(rows)
    p = X.shape[0]
    logger.debug(str(X.shape))
    sel = np.array(
        [idx for idx in range(p) if genes[idx] in genome], dtype=np.intp)
    genes = [genes[idx] for idx in sel]
    X = X[sel, :]

    n_unknown = p - sel.size
    if n_unknown > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of "
            "protein-coding genes (%.1f%%)",
            n_unknown,
            p,
            100 * (n_unknown / float(p)),
        )

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug("Genes: %d, Samples: %d, matrix: %s",
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
"""Tests for the `ExpGenome` class.""" from __future__ import (absolute_import, division, print_function, unicode_literals) from builtins import str as text from copy import deepcopy from collections import Iterable import pytest from genometools.misc import get_logger from genometools.expression import ExpGene, ExpGenome logger = get_logger(__name__, verbose=True) def test_init(my_genome, my_exp_genes): assert isinstance(my_genome, ExpGenome) assert isinstance(repr(my_genome), str) assert isinstance(str(my_genome), str) assert isinstance(text(my_genome), text) assert isinstance(my_genome.hash, text) assert len(my_genome) == len(my_exp_genes) assert isinstance(my_genome, Iterable) genes = [eg.name for eg in my_exp_genes] assert my_genome.genes == genes assert my_genome.exp_genes == my_exp_genes other = deepcopy(my_genome)
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Only entries whose feature type equals the selected field name, whose
    gene biotype is "protein_coding" or "polymorphic_pseudogene", and whose
    chromosome name matches the chromosome pattern are considered.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If an Ensembl ID is associated with more than one gene symbol.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=args.log_file,
                             quiet=args.quiet, verbose=args.verbose)

    if chrom_pat is None:
        chrom_pat = ensembl.species_chrompat[species]
    chrom_pat = re.compile(chrom_pat)
    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    valid_biotypes = ('protein_coding', 'polymorphic_pseudogene')

    genes = {}  # Ensembl gene ID => gene name
    chromosomes = set()
    excluded_chromosomes = set()

    n_lines = 0
    missing = 0  # matching entries without a "gene_name" attribute

    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    # NOTE(review): the file is requested in binary mode ('rb') but handed to
    # csv.reader -- confirm that smart_open_read yields text on Python 3.
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for fields in reader:
            n_lines += 1

            # fields 0, 1, 2 and 8 are accessed below, so require all nine
            # columns (this also skips header lines)
            if len(fields) < 9 or fields[2] != field_name:
                continue

            attr = parse_attributes(fields[8])
            if attr['gene_biotype'] not in valid_biotypes:
                continue

            # test whether chromosome is valid
            chrom = fields[0]
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue
            chromosomes.add(m.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            # every Ensembl ID must map to exactly one gene symbol
            if genes.setdefault(gene_id, gene_name) != gene_name:
                raise ValueError('Ensembl ID "%s" ' % (gene_id) +
                                 'associated with multiple gene symbols.')

    logger.info('Done! (Parsed %d lines.)', n_lines)
    if missing > 0:
        logger.warning('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', len(genes))
    logger.info('No. of gene names: %d', len(set(genes.values())))

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for gene_id in sorted(genes):
            writer.writerow([gene_id, genes[gene_id]])

    return 0
def main(args=None):
    """Extract GO annotations and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gene_file = args.gene_file
    gene_ontology_file = args.gene_ontology_file
    goa_association_file = args.goa_association_file
    output_file = args.output_file

    evidence_codes = args.evidence_codes
    min_genes = args.min_genes_per_term
    max_genes = args.max_genes_per_term
    # NOTE(review): `args.part_of_cc_only` used to be read here but was never
    # used by this function -- confirm whether the option should be applied.

    # configure root logger exactly once, and before the first message, so
    # that nothing is logged to stdout when the output is printed there
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=args.log_file,
                             quiet=args.quiet, verbose=args.verbose)

    logger.info('Selected evidence codes: %s', ', '.join(evidence_codes))
    logger.info('Min. number of genes per gene set: %d', min_genes)
    logger.info('Max. number of genes per gene set: %d', max_genes)

    # checks
    assert os.path.isfile(gene_file)
    assert os.path.isfile(gene_ontology_file)
    assert os.path.isfile(goa_association_file)

    # read the list of genes
    exp_genome = ExpGenome.read_tsv(gene_file)

    # parse Gene Ontology
    gene_ontology = GeneOntology.read_obo(gene_ontology_file)

    # parse UniProt-GOA gene association file (gzip-compressed)
    with gzip.open(goa_association_file, 'rt', encoding='ascii') as fh:
        go_annotations = ontology.parse_gaf(
            fh, gene_ontology, ev_codes=evidence_codes, genome=exp_genome)

    # extract GO-based gene sets
    gene_sets = ontology.get_goa_gene_sets(go_annotations)
    logger.info('Generated %d GO-derived gene sets', len(gene_sets))

    # filter gene sets based on size (a limit of 0 disables the filter)
    if min_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(
            gs for gs in gene_sets if gs.size >= min_genes)
        logger.info('Excluded %d gene sets with too few genes.',
                    old_size - len(gene_sets))

    if max_genes > 0:
        old_size = len(gene_sets)
        gene_sets = GeneSetCollection(
            gs for gs in gene_sets if gs.size <= max_genes)
        logger.info('Excluded %d gene sets with too many genes.',
                    old_size - len(gene_sets))

    # writing output file
    gene_sets.write_tsv(output_file)
    logger.info('Wrote %s GO-derived gene sets to output file "%s".',
                len(gene_sets), output_file)

    return 0
def main(args=None):
    """Extracts gene-level expression data from StringTie output.

    Transcript-level FPKM and TPM values are summed per gene and written to
    a tab-delimited file with one row (gene, FPKM, TPM) per gene of the gene
    file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read list of gene symbols
    logger.info("Reading gene data...")
    genes = misc.read_single(gene_file)

    # read StringTie output file and summarize FPKM and TPM per gene
    logger.info("Parsing StringTie output...")

    # pass 1: map every StringTie gene ID to the reference gene names seen
    logger.info("Associating StringTie gene IDs with gene symbols...")
    stringtie_genes = {}
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect="excel-tab")
        for l in reader:
            if not l or l[0].startswith("#"):
                # skip blank lines and header lines
                continue
            assert len(l) == 9
            if l[2] != "transcript":
                continue
            attr = parse_attributes(l[8])
            if "ref_gene_name" not in attr:
                continue
            # entry has a "ref_gene_name" attribute
            stringtie_genes.setdefault(attr["gene_id"], set()).add(
                attr["ref_gene_name"]
            )
    n_ids = len(stringtie_genes)
    logger.info("Associated %d gene IDs with gene symbols.", n_ids)

    n_ambiguous = len([k for k, v in stringtie_genes.items() if len(v) > 1])
    logger.info(
        "%d / %d associated with multiple gene symbols (%.1f%%).",
        n_ambiguous,
        n_ids,
        # avoid a ZeroDivisionError if no gene ID could be associated
        100 * (n_ambiguous / float(n_ids)) if n_ids else 0.0,
    )

    # pass 2: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0

    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect="excel-tab")
        for l in reader:
            if not l or l[0].startswith("#"):
                # skip blank lines and header lines
                continue
            assert len(l) == 9

            if l[2] != "transcript":
                # skip exon lines
                continue

            attr = parse_attributes(l[8])
            f = float(attr["FPKM"])

            try:
                g = attr["ref_gene_name"]
            except KeyError:
                if no_novel_transcripts:
                    # ignore this transcript
                    fpkm_novel_trans += f
                    continue
                # see if we can assign a gene name based on the gene ID
                try:
                    assoc = stringtie_genes[attr["gene_id"]]
                except KeyError:
                    # gene_id not associated with any reference gene
                    fpkm_novel_gene += f
                    continue
                if len(assoc) > 1:
                    # gene ID associated with multiple ref. genes => ignored
                    fpkm_ambig += f
                    continue
                # gene ID associated with exactly one ref. gene
                g = list(assoc)[0]

            try:
                idx = misc.bisect_index(genes, g)
            except ValueError:
                fpkm_unknown_gene_name += f
                logger.warning('Unknown gene name: "%s".', g)
                continue

            t = float(attr["TPM"])
            fpkm[idx] += f
            tpm[idx] += t

    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm

    def share(value):
        # percentage of the total FPKM (0.0 if the total is zero)
        return 100 * (value / total_fpkm) if total_fpkm else 0.0

    logger.info(
        "Ignored %.1f / %.1f FPKM (%.1f%%)",
        ignored_fpkm,
        total_fpkm,
        share(ignored_fpkm),
    )

    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info(
                "Ignored %.1f FPKM from novel transcripts (%.1f%%).",
                fpkm_novel_trans,
                share(fpkm_novel_trans),
            )
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts of novel genes "
                "(%.1f%%).",
                fpkm_novel_gene,
                share(fpkm_novel_gene),
            )
        if fpkm_ambig > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts with ambiguous "
                "gene membership (%.1f%%).",
                fpkm_ambig,
                share(fpkm_ambig),
            )

    # unknown gene names are part of the ignored FPKM in both modes
    if fpkm_unknown_gene_name > 0:
        logger.info(
            "Ignored %.1f FPKM from transcripts of genes with unknown "
            "names (%.1f%%).",
            fpkm_unknown_gene_name,
            share(fpkm_unknown_gene_name),
        )

    # write output file
    E = np.c_[fpkm, tpm]
    with open(output_file, "w") as ofh:
        writer = csv.writer(
            ofh,
            dialect="excel-tab",
            lineterminator=os.linesep,
            quoting=csv.QUOTE_NONE,
        )
        for i, g in enumerate(genes):
            writer.writerow([g] + ["%.5f" % e for e in E[i, :]])

    return 0
def main(args=None):
    """Translate the Entrez IDs of an expression matrix into gene symbols.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (version.major, version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes
    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs
        entrez = [ident[:-3] for ident in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names
    failed = 0
    genes = []
    data = []
    for pos, ident in enumerate(entrez):
        try:
            symbol = e2g[ident]
        except KeyError:
            failed += 1
            continue
        genes.append(symbol)
        data.append(matrix.X[pos, :])

    # multiple Entrez IDs must not point to the same gene
    assert len(genes) == len(set(genes))

    if failed > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).',
            failed, matrix.p, 100 * (failed / float(matrix.p)))

    # filter for known protein-coding genes
    X = np.float64(data)
    p = X.shape[0]
    logger.debug(str(X.shape))
    known = np.zeros(p, dtype=np.bool_)
    for pos in range(p):
        known[pos] = genes[pos] in genome
    sel = np.flatnonzero(known)
    genes = [genes[pos] for pos in sel]
    X = X[sel, :]

    dropped = p - sel.size
    if dropped > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)',
            dropped, p, 100 * (dropped / float(p)))

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s',
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def main(args=None):
    """Extract annotations of protein-coding genes from an annotation file.

    Reads a tab-delimited annotation file, selects every row whose third
    column equals ``args.field_name``, whose ``gene_biotype`` attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern, and writes those rows unchanged to the
    output file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        # fall back to the species-specific default pattern
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    valid_types = ('protein_coding', 'polymorphic_pseudogene')
    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for l in reader:
            i += 1

            # Columns 3 (feature) and 9 (attributes) are accessed below, so
            # a row needs at least nine fields; shorter rows (e.g., header
            # comments) are skipped instead of raising an IndexError.
            if len(l) <= 8 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_types:
                continue

            # test whether chromosome is valid
            chrom = l[0]
            if chrom_pat.match(chrom) is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(l)
            exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
"""Tests for functions in `cluster` module.""" from __future__ import (absolute_import, division, print_function, unicode_literals) from builtins import str as text from builtins import int as newint import os import pytest from genometools import ensembl from genometools import misc logger = misc.get_logger() @pytest.mark.online def test_latest_release(): release = ensembl.get_latest_release() assert isinstance(release, newint) logger.info('Current release: %d', release) @pytest.mark.online @pytest.mark.linux @pytest.mark.darwin @pytest.mark.cygwin def test_download(my_download_dir): assert isinstance(my_download_dir, text) species = [
from genometools import misc
from pyaffy import rma
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import OrderedDict

## set up directories
data_dir = '/nobackup/shilab/Data/StJude/Leukemia_Subtypes/'
output_dir = './Expression'
# create the output directory on first use
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

misc.get_logger(verbose = False)

#TODO: make sure sampleList file is correct.
sampleList = "/nobackup/shilab/Data/StJude/Leukemia_Subtypes/sampleList.txt"

# The mapping has to exist before it is filled; the ordered dictionary keeps
# the samples in the order in which they are listed in the file.
sample_cel_files = OrderedDict([])
with open(sampleList, 'r') as samp:
    for line in samp:
        if not line.strip():
            # skip blank lines (e.g., a trailing newline at end of file)
            continue
        # first column is used as the key, second column as the value
        line = line.rstrip().split("\t")
        sample_cel_files[line[0]] = line[1]

# NOTE(review): `cdf_file` is not defined anywhere in the visible part of
# this script -- it must be set to the CDF file path before this call.
genes, samples, X = rma(cdf_file, sample_cel_files)
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Every row of the annotation file whose third column equals
    ``args.field_name``, whose ``gene_biotype`` attribute is
    "protein_coding" or "polymorphic_pseudogene", and whose chromosome name
    matches the chromosome pattern contributes one (gene ID, gene name)
    pair. The pairs are written to the output file sorted by gene ID.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If an Ensembl ID is associated with more than one gene symbol.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        # fall back to the species-specific default pattern
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    valid_types = ('protein_coding', 'polymorphic_pseudogene')

    # mapping of Ensembl gene ID => gene name
    genes = {}

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0
    missing = 0  # number of selected entries without a "gene_name" attribute
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            i += 1

            # Columns 3 (feature) and 9 (attributes) are accessed below, so
            # a row needs at least nine fields; shorter rows (e.g., header
            # comments) are skipped instead of raising an IndexError.
            if len(l) <= 8 or l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            if attr['gene_biotype'] not in valid_types:
                continue

            # test whether chromosome is valid
            chrom = l[0]
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue
            chromosomes.add(m.group())

            gene_id = attr['gene_id']
            try:
                gene_name = attr['gene_name']
            except KeyError:
                missing += 1
                continue

            # each Ensembl ID must map to exactly one gene symbol
            if gene_id in genes:
                if genes[gene_id] != gene_name:
                    raise ValueError('Ensembl ID "%s" ' % (gene_id) +
                                     'associated with multiple gene symbols.')
            else:
                genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)

    if missing > 0:
        logger.warning('Skipped %d entries without a gene name.', missing)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of genes IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for g in sorted(genes):
            writer.writerow([g, genes[g]])

    return 0
# # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. """Tests for functions in `misc` module.""" from __future__ import (absolute_import, division, print_function, unicode_literals) from builtins import str as text import os import pytest from genometools import misc logger = misc.get_logger() @pytest.mark.linux @pytest.mark.darwin def test_checksum(my_checksum_file): """Tests functions that calculate checksums using Unix "sum" utility.""" assert misc.get_file_checksum(my_checksum_file) == 2761 assert misc.test_file_checksum(my_checksum_file, 2761) @pytest.mark.online def test_ftp_download(my_readme_file): """Tests `ftp_download` function.""" misc.ftp_download('ftp://ftp.ensembl.org/pub/current_README', my_readme_file)
"""Tests for the `GSEAnalysis` class.""" from __future__ import (absolute_import, division, print_function, unicode_literals) from builtins import str as text from string import ascii_lowercase import pytest # from genometools.expression import ExpGenome from genometools import misc from genometools.enrichment import GeneSetEnrichmentAnalysis logger = misc.get_logger('genometools', verbose=True) @pytest.fixture def my_analysis(my_genome, my_gene_set_coll): analysis = GeneSetEnrichmentAnalysis(my_genome, my_gene_set_coll) return analysis def test_basic(my_analysis, my_genome): assert isinstance(my_analysis, GeneSetEnrichmentAnalysis) assert isinstance(repr(my_analysis), str) assert isinstance(str(my_analysis), str) assert isinstance(text(my_analysis), text) assert isinstance(my_analysis.genes, list)
#!/usr/bin/env python3 """Gene expression quantification script for inDrop data.""" import sys import argparse from genometools import misc from .. import expression _LOGGER = misc.get_logger() def get_argument_parser(): desc = 'Quantify gene expression based on aligned inDrop reads.' parser = argparse.ArgumentParser(description=desc, add_help=False) g = parser.add_argument_group('Help') g.add_argument('-h', '--help', action='help', help='Show this help message and exit.') g = parser.add_argument_group('Input and output files') g.add_argument('-a', '--alignment-file', type=str, required=True,
def main(args=None):
    """Extract annotations of protein-coding genes from an annotation file.

    Copies to the output file every row of the tab-delimited annotation
    file that satisfies all of the following: its third column equals
    ``args.field_name``, its ``gene_biotype`` attribute is "protein_coding"
    or "polymorphic_pseudogene", and its chromosome name matches the
    chromosome pattern.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger; if we print output to stdout, redirect log
    # messages to stderr
    log_stream = sys.stderr if output_file == '-' else sys.stdout

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        # NOTE(review): other blocks in this file read the default pattern
        # from `ensembl.SPECIES_CHROMPAT` -- confirm which name exists.
        chrom_pat = re.compile(ensembl.species_chrompat[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    accepted_biotypes = ('protein_coding', 'polymorphic_pseudogene')
    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for l in reader:
            i += 1

            # The feature (column 3) and attribute (column 9) fields are
            # read below; rows with fewer than nine fields (e.g., header
            # comments) are skipped rather than triggering an IndexError.
            if len(l) < 9:
                continue
            if l[2] != field_name:
                continue

            attr = parse_attributes(l[8])
            type_ = attr['gene_biotype']
            if type_ not in accepted_biotypes:
                continue

            # test whether chromosome is valid
            chrom = l[0]
            m = chrom_pat.match(chrom)
            if m is None:
                excluded_chromosomes.add(chrom)
                continue

            chromosomes.add(chrom)
            writer.writerow(l)
            exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0