import sys

# Imports assume the CGAT tool collection; exact module paths vary between
# CGAT releases (e.g. CGAT.Experiment vs cgatcore.experiment).
from cgatcore import experiment as E
import cgat.Bed as Bed
import cgat.IndexedFasta as IndexedFasta
import cgat.Masker as Masker


# Entry point: read intervals in bed format from stdin and output the
# corresponding (optionally masked) sequences from an indexed genome.
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only "
                      "covers the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return the reverse complement "
        "for intervals located on the negative strand "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        # the loop below requires an indexed genome; fail early instead
        # of raising a NameError on the first interval
        raise ValueError("please supply a genome with --genome-file")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1
        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif options.output_mode in ("intervals", "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand,
                                          bed.start, bed.end))

        elif options.output_mode == "leftright":
            length = bed.end - bed.start

            # upstream window of the same length, clipped at position 0
            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # downstream window, clipped at the end of the contig
            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y)
                   for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.stop()
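
# Example invocation (a sketch: the script name bed2fasta.py and the
# database name hg38 are assumptions, not taken from this file):
#
#   cat intervals.bed |
#       python bed2fasta.py --genome-file=hg38 --masker=dustmasker \
#           --output-mode=leftright --use-strand > flanks.fasta
#
# With --output-mode=leftright, an interval chr1:1000-1200 (length 200)
# produces two records: the upstream window chr1:800-1000 (id suffix
# "_l") and the downstream window chr1:1200-1400 (id suffix "_r"),
# each clipped at the contig boundaries.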

# Second entry point: build, query and validate an indexed fasta database.
# Note that a second ``def main`` shadows the first when both are kept in
# one module; the two functions presumably belong to separate scripts.
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms. This is a comma-separated list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Benchmarking options")
    group.add_option("-b", "--benchmark", dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations", type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size", type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify", dest="verify", type="string",
                     help="verify against other database "
                     "[default=%default].")
    group.add_option("--verify-iterations", dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further "
                      "occurrences of an identifier are suffixed by an "
                      "'_%i' [default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from the fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, "Compression options")
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2",
                           "debug")
    group.add_option("-c", "--compression", dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability "
                     "on the system "
                     "[default=%%default]." % ", ".join(compression_choices))
    group.add_option("--random-access-points", dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")
    group.add_option(
        "--compress-index", dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from cgat import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # report the time as fractional seconds rather than truncating it
        options.stdout.write("%i\t%i\t%f\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# applying the following synonyms:\n")
                for k, v in list(synonyms.items()):
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.stop()