def main(argv=None):
    """Script main: annotate windows, contig by contig.

    Parses command line options in sys.argv, unless *argv* is given.

    The function reads a set of windows (required), optionally reads a
    second file with data, works out a size for every contig (from the
    indexed genome if one is given, otherwise from the largest window end
    coordinate on the contig), writes a tab-separated header line to
    ``args.stdout`` and then calls ``annotateWindows`` once per contig.

    Raises
    ------
    ValueError
        If no windows file has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w", "--windows-bed-file", dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d", "--filename-data", dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        action="append", choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c", "--decorator", dest="decorator", type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score",
                                 "stddev-score", "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e", "--skip-empty", dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    # NOTE(review): the long option is registered with a trailing "=".
    # It is kept unchanged so existing command lines keep working -
    # confirm whether plain "--transform" was intended.
    parser.add_argument(
        "-t", "--transform=", dest="transform", type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    # Hand argv on so that an explicitly supplied argument list is honoured.
    args = E.start(parser, argv=argv)

    # test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    # The context manager closes the handle once the intervals are read.
    with iotools.open_file(args.filename_windows, "r") as infile:
        windows = GTF.readAsIntervals(GTF.iterator(infile))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        # gtf and gff input are read the same way; the former separate
        # branches differed only by a misspelt module name.
        with iotools.open_file(args.filename_data, "r") as infile:
            gff_data = GTF.readFromFile(infile)

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        # Without a genome, use the largest window end per contig as its size.
        for contig, values in windows.items():
            map_contig2size[contig] = max((x[1] for x in values), default=0)
        fasta = None

    contigs = sorted(map_contig2size.keys())

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        ("contig", "start", "end",
         "ngenes", "ntranscripts",
         "n1", "l1", "n2", "l2",
         "score",
         "extra_info")) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        # A contig can be counted in both skip counters.
        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            first, last = data_ranges[contig][0], data_ranges[contig][1]
            annotateWindows(contig,
                            windows[contig],
                            gff_data[first:last],
                            fasta,
                            args)
        else:
            annotateWindows(contig,
                            windows[contig],
                            [],
                            fasta,
                            args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i" %
        (len(windows), noutput_contigs, len(contigs), ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
def main(argv=None):
    """Script main: write FASTA records for features read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Features are read from ``options.stdin`` and grouped (by transcript
    for gtf input, otherwise by ``GTF.joined_iterator`` or
    ``GTF.chunk_iterator``). For every group the intervals are optionally
    truncated by mask regions, optionally extended, fetched from the
    indexed genome, passed through ``Masker.maskSequences`` and written to
    ``options.stdout`` as one FASTA record.

    Raises
    ------
    ValueError
        If no genome file has been supplied.
    NotImplementedError
        If a group lies on a contig that has mask regions and
        ``--remove-masked-regions`` was not given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    # Hand argv on so that an explicitly supplied argument list is honoured.
    (options, args) = E.start(parser, argv=argv)

    # Sequences and contig sizes both come from the genome; fail early with
    # a clear message instead of a NameError once the first entry is read.
    if not options.genome_file:
        raise ValueError(
            "please supply an indexed genome file (--genome-file).")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            mask_intervals = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for mask_contig, regions in mask_intervals.items():
            intersector = quicksect.IntervalTree()
            for start, end in regions:
                intersector.add(start, end)
            masks[mask_contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        elif options.naming_attribute:
            # Parse "key=value" fields; fields without "=" (for example the
            # empty field after a trailing ";") are ignored.
            attr_dict = {}
            for field in chunk[0].attributes.split(";"):
                key, sep, value = field.partition("=")
                if sep:
                    attr_dict[key.strip()] = value
            name = attr_dict[options.naming_attribute]
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = sorted((x.start, x.end) for x in chunk)

        if masks and contig in masks:
            masked_regions = []
            for start, end in intervals:
                masked_regions += [
                    (x.start, x.end) for x in
                    masks[contig].find(quicksect.Interval(start, end))]

            masked_regions = Intervals.combine(masked_regions)
            if len(masked_regions):
                nmasked += 1

            if options.remove_masked_regions:
                intervals = Intervals.truncate(intervals, masked_regions)
            else:
                raise NotImplementedError("unimplemented")

            if len(intervals) == 0:
                nskipped_masked += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# skipped because fully masked: "
                        "%s: regions=%s masks=%s\n" %
                        (name,
                         str([(x.start, x.end) for x in chunk]),
                         masked_regions))
                continue

        # ``out`` holds the coordinates printed in the FASTA header. It is
        # the same list object as ``intervals`` here, so the in-place
        # extension for "5"/"3"/"both" shows up in the header, whereas
        # "5only"/"3only" rebind ``intervals`` and leave ``out`` untouched.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # Mirror the coordinates on the contig length and reverse order.
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum(len(x) for x in s)
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = options.extend_with * options.extend_by

            # The 5' padding belongs in front of the first segment.
            if options.extend_at in ("5", "both"):
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        coordinates = ";".join(["%i-%i" % x for x in out])

        if options.header_attr:
            # %s formatting copes with attribute values that are not strings.
            attributes = " ".join(
                ["%s:%s" % (ax, ay) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" % (name,
                                                      contig,
                                                      strand,
                                                      coordinates,
                                                      chunk[0].feature,
                                                      attributes,
                                                      seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" % (name,
                                        contig,
                                        strand,
                                        coordinates,
                                        seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.stop()
def cropGFF(gffs, filename_gff):
    """Crop intervals in gff file.

    Generator. Entries from *gffs* that do not overlap any interval read
    from *filename_gff* are yielded unchanged. For an overlapping entry the
    overlapped positions are removed and the entry is yielded once per
    remaining segment; an entry that is covered completely is dropped.

    Note that one entry object is reused for all of its segments: its
    ``start`` and ``end`` are overwritten before each yield, so a caller
    that stores the yielded objects has to copy them first.

    Arguments
    ---------
    gffs : iterable
        Entries providing ``contig``, ``start`` and ``end`` attributes.
    filename_gff : string
        Name of the file with the regions to crop with.
    """

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    # The context manager closes the handle once the intervals are read.
    with iotools.open_file(filename_gff, "r") as infile:
        cropper = GTF.readAsIntervals(GTF.iterator(infile))

    ntotal = 0
    for contig in list(cropper.keys()):
        intersector = quicksect.IntervalTree()
        for start, end in cropper[contig]:
            intersector.add(start, end)
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(quicksect.Interval(start, end))

            if overlaps:

                # One flag per position of the entry: 1 = keep, 0 = cropped.
                length = end - start
                keep = numpy.ones(length)
                for overlap in overlaps:
                    first = max(0, overlap.start - start)
                    last = min(length, overlap.end - start)
                    keep[first:last] = 0

                segments = Intervals.fromArray(keep)
                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                # Segment coordinates are relative to the entry start.
                for seg_start, seg_end in segments:
                    gff.start, gff.end = seg_start + start, seg_end + start
                    noutput += 1
                    yield gff

                continue

        noutput += 1
        yield gff

    # Only reached once the caller has consumed the generator completely.
    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))