def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file", type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information.")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar="bed",
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format", type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features", type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts'.")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate.")

    parser.add_argument(
        "-c", "--counter", dest="counters", type=str, action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", "neighbours", "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input.")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output.")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=("all", "ignore", "weight"),
                        help="how to treat multi-mapping reads in bam-files. "
                        "Requires the NH flag to be set by the mapper.")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="use barcodes to count unique UMIs. "
                        "UMIs are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When set, unique counts are returned. "
                        "Currently only compatible with read-counts.")

    parser.add_argument("--sample-probability", dest="sample_probability",
                        type=float,
                        help="probability with which any given read or "
                        "read pair in a bam file is counted. "
                        "Currently only compatible with read-counts.")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters.")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file.")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a lower "
                        "quality score will be ignored.")

    parser.set_defaults(
        genome_file=None,
        reporter="genes",
        with_values=True,
        sections=[],
        counters=[],
        filename_gff=[],
        filename_format=None,
        gff_features=[],
        gff_sources=[],
        add_gtf_source=False,
        proximal_distance=10000,
        bam_files=None,
        multi_mapping='all',
        library_type='fr-unstranded',
        prefixes=[],
        minimum_mapping_quality=0,
        use_barcodes=False,
        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    args = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    # instantiate one counter object per requested counter (and per section,
    # source and feature where applicable)
    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if quality is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args, prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(
                    bam_files, options=args, prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args, prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(
                    bigwig_file, options=args, prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args, prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args, prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args, prefix=prefix))
        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))
        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args, prefix=prefix))

    # select the iterator and output key according to the reporting level
    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    # write header and apply all counters to each gene/transcript in the
    # input GTF stream
    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        # skip the record only if every counter flagged it
        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")
        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
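
# Hypothetical entry point (an assumption, following the usual convention for
# scripts of this kind): the counter script is driven from the command line
# and reads its GTF input from stdin, e.g.
#   zcat genes.gtf.gz | python <script>.py --genome-file=<genome> --counter=length
# The script name and file names above are placeholders.
if __name__ == "__main__":
    sys.exit(main(sys.argv))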
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice", choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms. This is a comma-separated list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Benchmarking options")
    group.add_option("-b", "--benchmark", dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations", type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size", type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify", dest="verify", type="string",
                     help="verify against other database "
                     "[default=%default].")
    group.add_option("--verify-iterations", dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurrences "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, "Compression options")
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c", "--compression", dest="compression",
                     type="choice", choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability "
                     "on the system "
                     "[default=%%default]." % ", ".join(compression_choices))
    group.add_option("--random-access-points", dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")
    group.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.start(parser)

    # build a contig synonym lookup of the form {primary: [aliases]}
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    # replace the translator name with the corresponding translator object
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        # extract a single region from an existing database and write it
        # as fasta
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        # time random fragment retrieval from an existing database
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from cgat import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%f\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        # cross-check two databases against each other in both directions
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        # compress the index of an existing database
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # default action: build a new database from the given fasta files
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in list(synonyms.items()):
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.stop()