def main(argv=None):
    """Transform a GFF/GTF stream read from stdin and write to stdout.

    The operation is selected with ``--method`` (flank addition, cropping,
    group complement/combination, range filtering, feature joining/merging,
    contig-name sanitization, or coordinate/strand conversion).

    :param argv: command line arguments; defaults to ``sys.argv``.
    :raises ValueError: on missing required companion options
        (genome file, assembly report) or an unparsable ``--filter-range``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features",
                 "sanitize", "to-forward-coordinates", "to-forward-strand"),
        help="method to apply [%default]")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c", "--contigs-tsv-file",
                      dest="input_filename_contigs", type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option("--crop-gff-file", dest="filename_crop_gff",
                      type="string", help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].""")

    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option("--sanitize-method", dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing", dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report", dest="assembly_report", type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-hasids", dest="assembly_report_hasIDs",
        type="int",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-extras", dest="assembly_extras", type="str",
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(
        input_filename_contigs=False,
        filename_crop_gff=None,
        input_filename_agp=False,
        genome_file=None,
        add_up_flank=None,
        add_down_flank=None,
        complement_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        min_distance=0,
        max_distance=0,
        min_features=1,
        max_features=0,
        extension_upstream=1000,
        extension_downstream=1000,
        sanitize_method="ucsc",
        flank_method="add",
        output_format="%06i",
        skip_missing=False,
        is_gtf=False,
        group_field=None,
        contig_pattern=None,
        assembly_report=None,
        assembly_report_hasIDs=1,
        assembly_report_ensemblcol=4,
        assembly_report_ucsccol=9,
        assembly_extras=None)

    (options, args) = E.Start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in columnn 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            # BUG FIX: ``.ix`` has been removed from pandas; ``.loc`` is the
            # label-based replacement and is equivalent here.
            df.loc[df[1] == "assembled-molecule", ensemblcol] = df.loc[
                df[1] == "assembled-molecule", 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either "ucsc" or "ensembl"
                to specify direction of conversion ''')
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    # BUG FIX: the original tested "forward_coordinates"/"forward_strand",
    # which are not valid values of --method (the declared choices are
    # "to-forward-coordinates"/"to-forward-strand"), so the contig check
    # could never fire for those methods.
    if options.method in ("to-forward-coordinates", "to-forward-strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # stretch the terminal features in place
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # add dedicated flank features instead of extending
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            # emit the gaps (introns) between consecutive features
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            # collapse the group to a single spanning segment
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        # try 'contig:strand:from..to' first
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        # then 'contig:from..to'
        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        # finally bare 'from..to' / 'from,to' / 'from-to'
        if not contig:
            try:
                # BUG FIX: the pattern has three groups; the original
                # two-name unpacking raised ValueError on every match.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a string is illegal in Python 3.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            # map via assembly report when known ...
            if id in assembly_dict:
                id = assembly_dict[id]
            # ... if not in dict, the contig name is forced
            # into the desired convention, this is helpful user
            # modified gff files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                # drop entries extending past the contig end
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [re.compile(x)
                             for x in options.contig_pattern.split(",")]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are out "
                   "of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    else:
        for gff in gffs:
            # BUG FIX: use the declared choice names; the original compared
            # against "forward_coordinates"/"forward_strand", which --method
            # can never take, making these branches dead code.
            if options.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.Stop()
def main(argv=None):
    """Legacy gff2gff entry point (flag-per-operation interface).

    NOTE(review): this second ``main`` definition shadows the method-based
    ``main`` defined earlier in the file when the module is loaded; it is
    presumably kept for reference only — confirm before removing.

    :param argv: command line arguments; defaults to ``sys.argv``.
    :raises ValueError: on missing genome file or an unparsable
        ``--filter-range``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2gff.py 2868 2010-03-03 10:19:52Z andreas $")

    parser.add_option("-f", "--forward-coordinates",
                      dest="forward_coordinates",
                      help="translate to forward coordinates.",
                      action="store_true")

    parser.add_option("--forward-strand", dest="forward_strand",
                      help="convert to forward strand.", action="store_true")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "--add-up-flank", dest="add_up_flank", type="int",
        help="add an upstream flanking segment to first exon of a group.")

    parser.add_option(
        "--add-down-flank", dest="add_down_flank", type="int",
        help="add a downstream flanking segment to last segment of a group.")

    parser.add_option("--extend", dest="extend",
                      help="extend the existing features.",
                      action="store_true")

    parser.add_option("-c", "--contigs", dest="input_filename_contigs",
                      type="string",
                      help="filename with contig lenghts.")

    parser.add_option(
        "--filename-agp", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option(
        "--complement-groups", dest="complement_groups",
        action="store_true",
        help="""complement groups. Will write introns from exons [%default].""")

    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="""gff field/attribute to group by such as gene_id,
        transrcipt_id, ... [%default].""")

    parser.add_option("--combine-groups", dest="combine_groups",
                      action="store_true",
                      help="""combine groups.""")

    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="""extract all elements overlapping a range. A range is
        specified by eithor 'contig:from..to', 'contig:+:from..to',
        or 'from,to' .""")

    parser.add_option(
        "--join-features", dest="join_features", type="string",
        help="join features into a single transcript. Consecutive features are grouped "
        " into the same transcript/gene. This metdo expects a string of for numbers ``a,b,c,d`` "
        " as input with:"
        " a,b=minimum/maximum distance between features, "
        " c,d=minimum,maximum number of features."
        "")

    parser.add_option(
        "--merge-features", dest="merge_features", type="string",
        help="merge features. Consecutive features are merged into a single feature. "
        "This method expects a string of four numbers ``a,b,c,d`` as input; "
        "a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--crop-unique", dest="crop_unique", action="store_true",
        help="crop overlapping intervals, keeping only intervals that are unique [default=%default]")

    parser.add_option(
        "--crop", dest="crop", type="string",
        help="""crop features in gff file with features in another file. If
        a feature falls in the middle of another, two entries will be output.""")

    parser.add_option(
        "--sanitize", dest="sanitize", type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help="sanitize chr names for ucsc or ensembl or use the genome translator [%default].")

    parser.add_option(
        "--skip-missing", dest="skip_missing", action="store_true",
        help="skip entries on missing contigs. Otherwise an exception is raised [%default].")

    parser.add_option(
        "--remove-contigs", dest="remove_contigs", type="string",
        action="store",
        help="a comma separated list of regular expressions specifying contigs to be removed when runnnig sanitize [%default].")

    parser.set_defaults(
        forward_coordinates=False,
        forward_strand=False,
        input_filename_contigs=False,
        input_filename_agp=False,
        genome_file=None,
        sanitize=None,
        add_up_flank=None,
        add_down_flank=None,
        extend=False,
        complement_groups=False,
        combine_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        join_features=None,
        merge_features=None,
        output_format="%06i",
        skip_missing=False,
        remove_contigs=None,
        is_gtf=False,
        group_field=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    # BUG FIX: ``contigs`` was referenced below even when neither --contigs
    # nor --genome-file was supplied, raising NameError instead of the
    # intended ValueError.
    contigs = None
    if options.input_filename_contigs:
        contigs = Genomics.ReadContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()
    else:
        genome_fasta = None

    if (options.forward_coordinates or options.forward_strand) \
            and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.add_up_flank or options.add_down_flank:

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            # BUG FIX: Python-2-only ``sort(cmp)`` with ``cmp()``;
            # equivalent key-based sort works on Python 3.
            chunk.sort(key=lambda x: x.start)
            lcontig = contigs[chunk[0].contig]

            if options.extend:
                if options.add_up_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_up_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_up_flank)
                if options.add_down_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_down_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_down_flank)
            else:
                if options.add_up_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(
                            0, gff.start - options.add_up_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(
                            lcontig, gff.end + options.add_up_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if options.add_down_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        # BUG FIX: the downstream flank was sized with
                        # ``add_up_flank`` (copy-paste error).
                        gff.end = min(
                            lcontig, gff.end + options.add_down_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        # BUG FIX: same copy-paste error on the negative
                        # strand.
                        gff.start = max(
                            0, gff.start - options.add_down_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.complement_groups:

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort()
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.combine_groups:

        iterator = GTF.joined_iterator(gffs)
        for chunk in iterator:
            chunk.sort()
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.join_features:
        combineGFF(gffs, options, merge=False)

    elif options.merge_features:
        combineGFF(gffs, options, merge=True)

    elif options.crop:
        cropGFF(gffs, options)

    elif options.crop_unique:
        cropGFFUnique(gffs, options)

    elif options.filter_range:

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # BUG FIX: three regex groups cannot unpack into two names;
                # the original raised ValueError on every successful match.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a string is illegal in Python 3.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        if options.loglevel >= 2:
            options.stdlog.write(
                "# filter: contig=%s, strand=%s, interval=%s\n" %
                (str(contig), str(strand), str(interval)))
            options.stdlog.flush()

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.sanitize:

        def toUCSC(id):
            # force 'chrN'-style names, leaving 'contig*' untouched
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            # strip 'contig'/'chr' prefixes
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using --sanitize=genome")
            f = genome_fasta.getToken
        elif options.sanitize == "ucsc":
            f = toUCSC
        elif options.sanitize == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            # BUG FIX: ``except KeyError, msg`` is Python-2-only syntax
            # (SyntaxError under Python 3); ``msg`` was unused anyway.
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.remove_contigs:
                to_remove = [re.compile(x)
                             for x in options.remove_contigs.split(",")]
                if any([x.match(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are out "
                   "of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(outofrange_contigs),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs),
                    str(filtered_contigs)))