def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--gtf-file", dest="filename_gtf", type="string", help="filename with gene models in gtf format [%default]") parser.add_option( "-m", "--filename-mismapped", dest="filename_mismapped", type="string", help="output bam file for mismapped reads [%default]") parser.add_option( "-j", "--junctions-bed-file", dest="filename_junctions", type="string", help="bam file with reads mapped across junctions [%default]") parser.add_option( "-r", "--filename-regions", dest="filename_regions", type="string", help="filename with regions to remove in bed format [%default]") parser.add_option( "-t", "--transcripts-gtf-file", dest="filename_transcriptome", type="string", help="bam file with reads mapped against transcripts [%default]") parser.add_option( "-p", "--map-tsv-file", dest="filename_map", type="string", help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) [%default]") parser.add_option( "-s", "--filename-stats", dest="filename_stats", type="string", help="filename to output stats to [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option( "-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option( "-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.add_option("--output-sam", dest="output_sam", action="store_true", help="output in sam format [%default]") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = IOTools.read_map( IOTools.open_file(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.items()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(IOTools.open_file(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.open_file(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile) else: output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.AlignmentFile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.AlignmentFile(options.filename_junctions, "rb") else: junctions_samfile = None c = bams2bam_filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = IOTools.open_file(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.stop()
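
# A closing entry-point guard, following the usual convention for
# command-line scripts of this kind; a minimal sketch, assuming main()
# returns an exit status (or None) suitable for sys.exit().
if __name__ == "__main__":
    sys.exit(main(sys.argv))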
def test_cmdline():
    '''test style of scripts'''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.start

    # read the first two columns
    map_option2action = IOTools.read_map(
        IOTools.open_file(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #     'scripts/check_db.py',
    #     'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name

        # check if script uses getopt
        with IOTools.open_file(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_,
                       "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_,
                   "module could not be imported: %s\n" % script_name)
            continue

        E.start = LocalStart

        try:
            module.main(argv=["dummy", "--help"])
        except AttributeError:
            yield (fail_,
                   "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_,
                   "script does not use E.start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clean up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seem to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
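
# A hypothetical reconstruction of the monkey-patching hooks the test above
# relies on (ORIGINAL_START, LocalStart, DummyError and PARSER are
# module-level globals defined elsewhere in this test module; this sketch
# assumes "import copy" and is shown only to make the control flow explicit,
# not taken from the source):
class DummyError(Exception):
    """raised by LocalStart to abort a script once its parser is built."""


def LocalStart(parser, *args, **kwargs):
    """stub for E.start: capture the parser built by a script's main(),
    then abort before the script does any real work."""
    global PARSER
    PARSER = copy.copy(parser)
    raise DummyError()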
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("translate",
                 "translate-to-stop",
                 "truncate-at-stop",
                 "back-translate",
                 "mark-codons",
                 "apply-map",
                 "build-map",
                 "pseudo-codons",
                 "filter",
                 "interleaved-codons",
                 "map-codons",
                 "remove-gaps",
                 "mask-seg",
                 "mask-bias",
                 "mask-codons",
                 "mask-incomplete-codons",
                 "mask-stops",
                 "mask-soft",
                 "map-identifier",
                 "nop",
                 "remove-stops",
                 "upper",
                 "lower",
                 "reverse-complement",
                 "sample",
                 "shuffle"),
        help="method to apply to sequences.")

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="parameter stack for methods that require one "
        "[default=%default].")

    parser.add_option(
        "-x", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default].")

    parser.add_option(
        "--sample-proportion", dest="sample_proportion", type="float",
        help="sample proportion [default=%default].")

    parser.add_option(
        "--exclude-pattern", dest="exclude_pattern", type="string",
        help="exclude all sequences with ids matching pattern "
        "[default=%default].")

    parser.add_option(
        "--include-pattern", dest="include_pattern", type="string",
        help="include only sequences with ids matching pattern "
        "[default=%default].")

    parser.add_option(
        "--filter-method", dest="filter_methods", type="string",
        action="append",
        help="filtering methods to apply [default=%default].")

    parser.add_option(
        "-t", "--sequence-type", dest="type", type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na). This option determines "
        "which characters are used for masking [default=%default].")

    parser.add_option(
        "-l", "--template-identifier", dest="template_identifier",
        type="string",
        help="template for numerical identifiers [default=%default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_option(
        "--map-tsv-file", dest="input_filename_map", type="string",
        help="input filename with map for identifiers. "
        "The first row is a header.")

    parser.add_option(
        "--fold-width", dest="fold_width", type="int",
        help="fold width for sequence output. 0 is unfolded [%default]")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
        input_filename_fasta="-",
        input_filename_map=None,
        fold_width=80)

    (options, args) = E.start(parser)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    # running sequence number, used by --method=build-map
    nseq = 0
    map_seq2nid = {}

    map_identifier = ("apply-map" in options.methods or
                      "map-identifier" in options.methods)
    if map_identifier:
        if options.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with IOTools.open_file(options.input_filename_map) as infile:
            map_identifier = IOTools.read_map(infile, has_header=True)

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.read_map(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or \
            "back-translate" in options.methods:
        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]
        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [line[:-1] for line in
                              IOTools.open_file(f.split("=")[1])]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             title)

    iterator = pysam.FastxFile(options.input_filename_fasta)

    c = E.Counter()

    fold_width = options.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(
                    "[ %s]" % options.gap_chars, "", other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        other_record.title)

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        codon = options.gap_char * 3
                    else:
                        codon = other_sequence[x:x + 3]
                        x += 3
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = " ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                # use a separate name for the character buffer so that the
                # global counter ``c`` is not clobbered
                chars = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    chars.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for x in chars:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += chars

                        chars = []
                        codon = []

                new_sequence += chars

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # get next hard-masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break

                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # check that unmasked and hard-masked sequences have the
                # same length
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % record.name)

                # where the hard-masked sequence contains a repeat (N),
                # replace it with the lowercased unmasked sequence
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []
                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)
                sequence = "".join(seq)

            elif method == "remove-gaps":
                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)
                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)
                sequence = " ".join([sequence[x:x + 3]
                                     for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match(r"^(\S+)", record.name).groups()[0]
                if id in map_seq2nid:
                    rest = record.name[len(id):]
                    record.name = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match(r"^(\S+)", record.name).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([ch for ch in seq[x:x + 3] if ch in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths "
                        "%i - %i" % (record.name, len(other_sequence) * 3,
                                     len(sequence)))

                seq = list(sequence)
                # position in the nucleotide sequence; do not reuse ``c``,
                # which is the global counter
                pos = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[pos:pos + 3] = \
                                [options.na_mask_char.upper()] * 3
                        else:
                            seq[pos:pos + 3] = \
                                [options.na_mask_char.lower()] * 3
                    pos += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
                l < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
                l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                options.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                options.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            options.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.open_file(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info(c)
    E.stop()
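
# Example invocations (hypothetical script and file names; the script reads
# fasta from stdin by default, or from a positional filename argument):
#
#   cat in.fasta | python fasta2fasta.py --method=translate > peptides.fasta
#   python fasta2fasta.py in.fasta --method=sample --sample-proportion=0.1
#   python fasta2fasta.py in.fasta --method=build-map --parameters=map.tsv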
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--fasta", dest="input_filename_fasta", type="string",
        help="filename with fasta sequences.")

    parser.add_option(
        "-o", "--output-filename-sequences",
        dest="output_filename_sequences", type="string",
        help="output per-sequence information to filename")

    parser.add_option(
        "-m", "--method", dest="methods", action="append", type="choice",
        choices=["map-identifier", "nop"],
        help="method to apply")

    parser.add_option(
        "--input-filename-map", dest="input_filename_map", type="string",
        help="input filename with map for identifiers. "
        "The first row is a header.")

    parser.add_option(
        "--fold-width", dest="fold_width", type="int",
        help="fold width for sequence output. 0 is unfolded [%default]")

    parser.set_defaults(
        input_filename_fasta="-",
        methods=[],
        input_filename_map=None,
        fold_width=80)

    (options, args) = E.start(parser, argv=argv)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    if not options.methods:
        raise ValueError("please specify at least one method")

    map_identifier = "map-identifier" in options.methods
    if map_identifier:
        if options.input_filename_map is None:
            raise ValueError(
                "for method=map-identifier use --input-filename-map")
        with IOTools.open_file(options.input_filename_map) as infile:
            map_identifier = IOTools.read_map(infile, has_header=True)

    iterator = pysam.FastxFile(options.input_filename_fasta)

    c = E.Counter()
    fold_width = options.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.input += 1

        if map_identifier:
            record.name = map_identifier.get(record.name, record.name)

        if fold_width >= 0:
            if record.comment:
                options.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                options.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            options.stdout.write(str(record) + "\n")

        c.output += 1

    E.info(c)
    E.stop()
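
# The identifier map read via --input-filename-map is a two-column TSV with
# a header row, mapping old names to new names; a minimal sketch of the
# expected format (column names are illustrative):
#
#   old     new
#   seq1    ID000001
#   seq2    ID000002
#
# The fold() helper above splits a sequence into fixed-width lines; a worked
# example of its logic:
#
#   >>> fold("ACGTACGTAC", 4)
#   'ACGT\nACGT\nAC'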
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--species", dest="species", type="string",
        help="species to use [default=%default].")

    parser.add_option(
        "-i", "--slims", dest="filename_slims", type="string",
        help="filename with GO SLIM categories [default=%default].")

    parser.add_option(
        "-g", "--genes-tsv-file", dest="filename_genes", type="string",
        help="filename with genes to analyse [default=%default].")

    parser.add_option(
        "-b", "--background-tsv-file", dest="filename_background",
        type="string",
        help="filename with background genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-m", "--min-counts", dest="minimum_counts", type="int",
        help="minimum count - ignore all categories that have "
        "fewer than # genes [default=%default].")

    parser.add_option(
        "-o", "--sort-order", dest="sort_order", type="choice",
        choices=("fdr", "pvalue", "ratio"),
        help="output sort order [default=%default].")

    parser.add_option(
        "--ontology", dest="ontology", type="string", action="append",
        help="go ontologies to analyze. Ontologies are tested "
        "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option(
        "--filename-dump", dest="filename_dump", type="string",
        help="dump GO category assignments into a flatfile "
        "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option(
        "--filename-input", dest="filename_input", type="string",
        help="read GO category assignments from a flatfile "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample", type="int",
        help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern for output filenames "
        "(should contain: %(go)s and %(section)s) [default=%default]")

    parser.add_option(
        "--fdr", dest="fdr", action="store_true",
        help="calculate and filter by FDR [default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option(
        "--gene-pattern", dest="gene_pattern", type="string",
        help="pattern to transform identifiers to GO gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-map-slims", dest="filename_map_slims", type="string",
        help="write mapping between GO categories and GOSlims "
        "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes with a certain GO id [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists "
        "[default=%default].")

    # parser.add_option(
    #     "--fdr-lambda", dest="qvalue_lambda", type="float",
    #     help="fdr computation: lambda [default=%default].")
    # parser.add_option(
    #     "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #     choices=("smoother", "bootstrap"),
    #     help="fdr computation: method for estimating pi0 "
    #     "[default=%default].")

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies to GO
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % options.filename_dump)

        dbhandle = Database.connect(url=options.database_url)

        outfile = IOTools.open_file(options.filename_dump, "w",
                                    create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               options.filename_input)
        infile = IOTools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.open_file(options.filename_gene2name)
        gene2name = IOTools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % options.filename_ontology)

        infile = IOTools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId, go_type=go.mNameSpace, description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for foreground in %i gene lists" %
           (len(input_foreground), len(genelists)))
    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforeground",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], \
                go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle = Database.connect(url=options.database_url)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = \
            GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([x for x, y in counts_per_category.items()
                             if y < options.minimum_counts])
            E.info("removing %i categories with fewer than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" %
                   (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a tuple;
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))
                    background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset?
            # missing = set(genes).difference(set(gene2go.keys()))
            # assert len(missing) == 0, \
            #     "%i genes in foreground set without GO annotation: %s" % (
            #         len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.open_file(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims,
                                         ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis "
                       "aborted" % genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
                for goid, result in pairs:
                    result.mQValue = fdrs[goid][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(
                outfile, pairs, go2info, options, fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs,
                                                         options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs
                                if x[1].mRatio > 1])
            nselected_down = len([x for x in filtered_pairs
                                  if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n"
                % (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground "
                "with GO assignments\n" % len(go_results.mSampleGenes))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background "
                "with GO assignments\n" % len(go_results.mBackgroundGenes))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes "
                "in sample with GO assignments\n" %
                IOTools.pretty_percent(len(go_results.mSampleGenes),
                                       len(foreground), "%5.2f"))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes "
                "in background with GO assignments\n" %
                IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                       nbackground, "%5.2f"))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" %
                nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results "
                "reported\n" % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results "
                "reported\n" % nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" %
                options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.pretty_percent(len(go_results.mSampleGenes),
                                       len(foreground), "%5.2f"),
                IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                       nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files

            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()
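
# Example invocation (hypothetical script and file names; the option names
# are the ones defined above, and the output pattern must contain
# %(go)s and %(section)s):
#
#   python go_analysis.py \
#       --genes-tsv-file=foreground.tsv \
#       --background-tsv-file=background.tsv \
#       --filename-input=gene2go.tsv \
#       --fdr --threshold=0.05 \
#       --output-filename-pattern="go_%(go)s_%(section)s.tsv" \
#       > summary.tsv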