def addIntergenicSegment(last, this, fasta, options):
    """Annotate the region between two neighbouring features.

    *last* is the upstream feature and *this* the downstream one. At a
    contig end one of them is ``None``: a flank next to the existing
    feature plus a "telomeric" segment covering the rest are added.
    Between two features either two flanks of ``options.flank`` bases and
    an "intergenic" segment are added, or - if the gap is not larger than
    two flanks - a single short segment covering the whole gap.

    Returns the sum of the values returned by the ``addFlank`` and
    ``addSegment`` calls (0 if both features are missing, or if the
    contig is unknown and ``options.ignore_missing`` is set).
    """
    if not (this or last):
        return 0

    if not this:
        # region between the last feature and the end of the contig
        try:
            contig_length = fasta.getLength(last.contig)
        except KeyError as msg:
            if options.ignore_missing:
                return 0
            raise KeyError(msg)
        flank_end = min(last.end + options.flank, contig_length)
        return sum((
            addFlank(last.end, flank_end, last, options),
            addSegment("telomeric", flank_end, contig_length, last, options),
        ))

    if not last:
        # region between the start of the contig and the first feature
        flank_start = max(0, this.start - options.flank)
        return sum((
            addSegment("telomeric", 0, flank_start, this, options),
            addFlank(flank_start, this.start, this, options),
        ))

    # region between two features
    gap = this.start - last.end
    flank = options.flank
    if gap > flank * 2:
        inner_start = last.end + flank
        inner_end = this.start - flank
        return sum((
            addFlank(last.end, inner_start, last, options),
            addSegment("intergenic", inner_start, inner_end,
                       (last, this), options),
            addFlank(inner_end, this.start, this, options),
        ))

    # Short gap shared by both genes. If the two genes do not agree on
    # the directionality, the generic "flank" label is used.
    upstream_positive = Genomics.IsPositiveStrand(last.strand)
    downstream_positive = Genomics.IsPositiveStrand(this.strand)
    if bool(upstream_positive) == bool(downstream_positive):
        key = "flank"
    elif upstream_positive:
        key = "3flank"
    else:
        key = "5flank"
    return sum((
        addSegment(key, last.end, this.start, (last, this), options),
    ))
def toSequence(chunk, fasta):
    """Build one sequence from a list of gff entries.

    All entries must lie on the same contig and strand. Their intervals
    are merged with ``Intervals.combine`` and the pieces are fetched from
    *fasta* and concatenated. For features that are not on the positive
    strand the intervals are mirrored against the contig length and
    processed in reverse order, so that the pieces are joined in
    transcript order.

    Returns the empty string for an empty *chunk*.
    """
    if not len(chunk):
        return ""

    first = chunk[0]
    contig, strand = first.contig, first.strand

    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    merged = Intervals.combine([(entry.start, entry.end) for entry in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # convert to coordinates counted from the other end of the contig
        merged = [(contig_length - last, contig_length - first_pos)
                  for first_pos, last in merged][::-1]

    return "".join(
        fasta.getSequence(contig, strand, first_pos, last)
        for first_pos, last in merged)
def addFlank(start, end, template, options):
    """Add a flank segment next to *template*.

    The segment is labelled "5flank" if it lies before the template on
    the positive strand or after it on the negative strand, and "3flank"
    otherwise. Returns whatever ``addSegment`` returns.
    """
    on_positive_strand = Genomics.IsPositiveStrand(template.strand)
    lies_before = end <= template.start
    # a flank is 5' exactly when "before the feature" and "positive
    # strand" are both true or both false
    if bool(lies_before) == bool(on_positive_strand):
        label = "5flank"
    else:
        label = "3flank"
    return addSegment(label, start, end, template, options)
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use same coordinate
    system (and strand) as the transcript.

    Each variant in *variants* is read through its ``pos``, ``genotype``
    and ``reference`` attributes and converted into an
    ``ExtendedVariant`` built from the tuple
    ``(start, end, reference, action, has_wildtype, variantseqs)``.

    *action* is set as follows:

    ``=``
        genotype of length 1; alleles come from
        ``Genomics.decodeGenotype`` and the range is ``(pos, pos + 1)``.
    ``-``
        genotype contains ``-`` only; range is
        ``(pos + 1, pos + lvariant + 1)``.
    ``+``
        genotype contains ``+`` only; range is ``(pos, pos + 2)``.
    ``>`` / ``<``
        genotype contains both ``+`` and ``-``; ``>`` if the genotype
        starts with ``+``, ``<`` otherwise.

    For multi-character genotypes the alleles are the "/"-separated
    parts of the genotype with their first character removed. If
    *phased* is false, empty alleles are dropped.

    If *strand* is not the positive strand, reference and alleles are
    reverse-complemented and the range is mirrored using *lcontig*.

    Returns the list of converted variants in input order.
    '''

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        # NOTE(review): under Python 3, bytes() applied to a str raises
        # TypeError, and the str literals used below ("/", "+", "-", "*")
        # do not mix with a bytes genotype - this looks like Python 2 era
        # code; confirm the types of variant.genotype/variant.reference.
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1
        # (the correction above is disabled: pos is used as given)

        if len(genotype) == 1:
            # single character genotype: a substitution
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:
            # indel: strip the leading character of each allele
            variantseqs = [x[1:] for x in genotype.split("/")]
            # length of the longest allele, computed before empty
            # alleles are removed
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            # NOTE(review): a multi-character genotype containing neither
            # "+" nor "-" matches none of the branches below, leaving
            # action/start/end unset for this variant (or stale from the
            # previous one) - confirm such genotypes can not occur.
            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    # pad the second allele to the longest allele
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    # pad the first allele to the longest allele
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [
                    x + "-" * (lvariant - len(x)) for x in variantseqs
                ]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.reverse_complement(reference)
            variantseqs = [
                Genomics.reverse_complement(x.upper()) for x in variantseqs
            ]
            # mirror the half-open range against the contig length
            start, end = lcontig - end, lcontig - start

        new_variants.append(
            ExtendedVariant._make((start, end, reference.upper(), action,
                                   has_wildtype, variantseqs)))

    return new_variants
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    Every position of every contig starts out as *default_code*. Each
    item of *iterator* is a list of gtf entries (one transcript):

    * entries whose source is a key of ``MAP_ENSEMBL`` are painted with
      the mapped code,
    * entries with source ``protein_coding`` get UTR codes ("u" for the
      5' UTR, "v" for the 3' UTR), introns, and CDS codes; their splice
      junctions are written to the "junctions" output file,
    * transcripts on contigs unknown to *fasta* and protein coding
      transcripts without CDS are skipped and counted,
    * all other sources are ignored.

    Afterwards counts are written to the "counts" output file and the
    annotation of each contig is written in fasta format to
    ``options.stdout``.

    *options* needs to provide ``report_step``, ``max_frameshift_length``
    and ``stdout``.
    """
    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # a list with one code per base (python 3 replacement for a
        # mutable string)
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    try:
        outfile_junctions.write(
            "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

        for gtfs in iterator:

            counter.input += 1

            if counter.input % options.report_step == 0:
                E.info("iteration %i" % counter.input)

            try:
                contig = fasta.getToken(gtfs[0].contig)
            except KeyError:
                E.warn("contig %s not found - annotation ignored" %
                       gtfs[0].contig)
                counter.skipped_contig += 1
                continue

            lcontig = fasta.getLength(contig)

            # make sure that exons are sorted by coordinate
            gtfs.sort(key=lambda x: x.start)

            is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
            source = gtfs[0].source

            # process non-coding data
            if source in MAP_ENSEMBL:
                code = MAP_ENSEMBL[source]
                intervals = [(x.start, x.end) for x in gtfs]
                addSegments(annotations[contig], intervals,
                            is_positive, code)

            elif source == "protein_coding":

                # collect exons for utr
                exons = [(x.start, x.end)
                         for x in gtfs if x.feature == "exon"]
                cds_entries = [x for x in gtfs if x.feature == "CDS"]
                cds = [(x.start, x.end) for x in cds_entries]
                if not cds:
                    counter.skipped_transcripts += 1
                    E.warn("protein-coding transcript %s without CDS - skipped" %
                           gtfs[0].transcript_id)
                    continue

                exons = Intervals.truncate(exons, cds)
                start, end = cds[0][0], cds[-1][1]

                # NOTE(review): an exon piece ending exactly at the CDS
                # start is excluded by "<" while ">=" is used at the
                # other end - confirm against Intervals.truncate.
                UTR5 = [x for x in exons if x[1] < start]
                UTR3 = [x for x in exons if x[0] >= end]

                # on the negative strand the 5' UTR has the larger
                # coordinates
                if not is_positive:
                    UTR5, UTR3 = UTR3, UTR5

                addSegments(annotations[contig], UTR5, is_positive, "u")
                addIntrons(annotations[contig], UTR5, is_positive,
                           options.max_frameshift_length)

                addSegments(annotations[contig], UTR3, is_positive, "v")
                addIntrons(annotations[contig], UTR3, is_positive,
                           options.max_frameshift_length)

                # output CDS according to frame; a copy is passed so the
                # list used for the junctions below stays untouched
                addCDS(annotations[contig], list(cds_entries), is_positive)

                # add introns between CDS
                addIntrons(annotations[contig], cds, is_positive,
                           options.max_frameshift_length)

                # output splice junctions
                # apply corrections for 1-past end coordinates
                # to point between residues within CDS
                if is_positive:
                    out_positive = "+"
                    boundaries = [(c.start, c.end - 1, c)
                                  for c in cds_entries]
                else:
                    out_positive = "-"
                    boundaries = [(lcontig - c.end, lcontig - c.start - 1, c)
                                  for c in reversed(cds_entries)]

                # a junction connects the last base of one CDS part with
                # the first base of the next one
                for previous, current in zip(boundaries, boundaries[1:]):
                    junction_start = previous[1]
                    junction_end, c = current[0], current[2]
                    outfile_junctions.write(
                        "%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                            contig,
                            out_positive,
                            junction_start,
                            junction_end,
                            c.frame,
                            c.gene_id,
                            c.transcript_id,
                        ))
    finally:
        # close the junctions file even if annotation fails half-way
        outfile_junctions.close()

    E.info("finished reading genes: %s" % str(counter))

    E.info("started counting")
    outfile = E.open_output_file("counts")
    try:
        outputCounts(outfile, annotations)
    finally:
        outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf entries from ``options.stdin``, groups them (by
    transcript for gtf input, otherwise by joined or consecutive
    entries), optionally filters by feature, removes masked regions,
    extends the intervals, and writes one fasta record per group to
    ``options.stdout``.

    Raises ValueError if no genome file has been given and
    NotImplementedError if mask regions are supplied without
    ``--remove-masked-regions``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions", action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    # hand argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    # sequences can not be extracted without a genome; fail early instead
    # of with a NameError inside the main loop
    if not options.genome_file:
        raise ValueError("please supply a genome with --genome-file")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        elif options.naming_attribute:
            # entries without "=" (for example the empty string after a
            # trailing ";") carry no key/value pair and are ignored
            attr_dict = {
                x.split("=")[0]: x.split("=")[1]
                for x in chunk[0].attributes.split(";")
                if "=" in x
            }
            name = attr_dict[options.naming_attribute]
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [
                        (x.start, x.end) for x in masks[contig].find(
                            quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # coordinates reported in the fasta header; this is the same list
        # object as ``intervals`` until ``intervals`` is rebound below
        out = intervals

        if options.extend_at and not options.extend_with:
            # extend with genomic sequence
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # mirror against the contig length and process in reverse order
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum([len(x) for x in s])
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            # extend with a fixed character instead of genomic sequence
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' end is the first segment (index 0, not 1)
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.stop()
def _parseFilterRange(text):
    """Parse a range given as 'contig:strand:from..to', 'contig:from..to'
    or 'from,to' ('..', '-' and - in the last form - ',' separate the
    coordinates).

    Returns a tuple ``(contig, strand, (start, end))`` in which contig
    and strand are None if they are not part of *text*.

    Raises ValueError if *text* is missing or matches none of the forms.
    """
    if text is None:
        raise ValueError("please supply a range with --filter-range")

    match = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", text)
    if match:
        contig, strand, start, _, end = match.groups()
        return contig, strand, (int(start), int(end))

    match = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)", text)
    if match:
        contig, start, _, end = match.groups()
        return contig, None, (int(start), int(end))

    match = re.match(r"(\d+)(\.\.|\,|\-)(\d+)", text)
    if match:
        start, _, end = match.groups()
        return None, None, (int(start), int(end))

    raise ValueError("can not parse range %s" % text)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf entries from ``args.stdin``, applies the transformation
    selected with ``--method`` and writes the resulting entries to
    ``args.stdout``. Without a method the entries are passed through
    (after AGP mapping, if an agp file was given).

    Raises ValueError for inconsistent or unparsable options.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m", "--method", dest="method", type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups",
                 "combine-groups", "filter-range", "join-features",
                 "merge-features", "sanitize", "to-forward-coordinates",
                 "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand", dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c", "--contigs-tsv-file",
                        dest="input_filename_contigs", type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file", dest="input_filename_agp", type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file", dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    # the original literal mixed triple and plain quotes and leaked
    # quote characters into the help text
    parser.add_argument(
        "--group-field", dest="group_field", type=str,
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .")

    parser.add_argument(
        "--filter-range", dest="filter_range", type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method", dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method", dest="flank_method", type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing", dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern", dest="contig_pattern", type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report", dest="assembly_report", type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids", dest="assembly_report_hasIDs", type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras", dest="assembly_extras", type=str,
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream", dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream", type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance", dest="min_distance", type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance", dest="max_distance", type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features", dest="min_features", type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features", dest="max_features", type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file", dest="rename_chr_file", type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    # The command line offers the "to-forward-*" spellings; the legacy
    # underscore spellings are still accepted for programmatic callers.
    forward_coordinates_methods = ("to-forward-coordinates",
                                   "forward_coordinates")
    forward_strand_methods = ("to-forward-strand", "forward_strand")
    flank_methods = ("add-flank", "add-upstream-flank",
                     "add-downstream-flank")

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("Empty mapping dictionnary")

    # always bound, so that --assembly-extras and the sanitize method
    # work whether or not a mapping is built from an assembly report
    assembly_dict = {}

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report, please specify sanitize method as either "ucsc" or "ensembl" to specify direction of conversion ''')

    if args.assembly_extras is not None:
        # extras are given as "old-new" pairs separated by commas
        assembly_extras = args.assembly_extras.split(",")
        for item in assembly_extras:
            item = item.split("-")
            assembly_dict[item[0]] = item[1]

    needs_contigs = (forward_coordinates_methods +
                     forward_strand_methods + flank_methods)
    if args.method in needs_contigs and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in flank_methods:

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # grow the outermost existing features; upstream is at
                # the start of the group on the positive strand and at
                # its end on the negative strand
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # add flanks as new features next to the group
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            # output in 5' to 3' order
            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            # one "intron" per gap between consecutive features
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            # one "segment" from the first start to the end of the last
            # feature
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = _parseFilterRange(args.filter_range)

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(contig_id):
            """Map a contig name using the assembly dictionary."""
            if contig_id in assembly_dict:
                return assembly_dict[contig_id]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful user
            # modified gff files that contain additional contigs
            if args.sanitize_method == "ucsc":
                if not contig_id.startswith(("contig", "chr")):
                    return "chr%s" % contig_id
            elif args.sanitize_method == "ensembl":
                if contig_id.startswith("contig"):
                    return contig_id[len("contig"):]
                if contig_id.startswith("chr"):
                    return contig_id[len("chr"):]
            return contig_id

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        # compile the removal patterns once instead of once per entry
        if args.contig_pattern:
            to_remove = [re.compile(x)
                         for x in args.contig_pattern.split(",")]
        else:
            to_remove = []

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if any(x.search(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(outofrange_contigs.keys())),
                   str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method in forward_coordinates_methods:
                gff.invert(contigs[gff.contig])

            if args.method in forward_strand_methods:
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()