def annotateExons(iterator, fasta, options):
    """Write one record per distinct exon interval of each gene.

    Entries from *iterator* are grouped by gene through
    ``GTF.gene_iterator``. For every gene the exons of each transcript
    are ordered in transcription direction (reversed on the negative
    strand) and every distinct ``(start, end)`` interval is written to
    ``options.stdout`` with these attributes:

    ntranscripts
        number of transcripts in the gene
    nused
        number of transcripts containing the interval as an exon
    pos
        comma-separated ``rank:total`` pairs, one per use

    Genes whose distinct exon intervals overlap each other are counted
    as ``noverlapping``. A summary is written to ``options.stdlog`` if
    ``options.loglevel`` is at least 1. *fasta* is not used.
    """
    ninput = noutput = noverlapping = 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        first = gene[0][0]
        ntranscripts = len(gene)
        on_negative = Genomics.IsNegativeStrand(first.strand)

        # (start, end) -> [(rank of exon in transcript, exons in transcript)]
        usage = collections.defaultdict(list)
        for transcript in gene:
            # order exons along the direction of transcription
            transcript.sort(key=lambda x: x.start)
            if on_negative:
                transcript.reverse()
            total = len(transcript)
            for rank, exon in enumerate(transcript, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        template = GTF.Entry()
        template.fromGTF(first, first.gene_id, first.gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        records = []
        for span, positions in usage.items():
            record = GTF.Entry().copy(template)
            record.start, record.end = span
            record.addAttribute("nused", len(positions))
            record.addAttribute(
                "pos", ",".join(["%i:%i" % p for p in positions]))
            records.append(record)

        records.sort(key=lambda x: x.start)
        for record in records:
            options.stdout.write("%s\n" % str(record))

        # a gene has overlapping exons if merging reduces the interval count
        spans = [(record.start, record.end) for record in records]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def update(self, bed):
    """Feed a single bed interval to the classifier.

    The interval is converted into a GTF entry with feature ``exon``
    and handed, wrapped in a one-element list, to
    ``GeneModelAnalysis.Classifier.update``.
    """
    exon = GTF.Entry()
    exon.fromBed(bed)
    exon.feature = 'exon'
    GeneModelAnalysis.Classifier.update(self, [exon])
def _add(interval, anno):
    """Append an entry of feature *anno* spanning *interval* to ``results``.

    Contig, strand, gene_id and transcript_id are taken from the first
    entry of ``transcript``. Both ``transcript`` and ``results`` come
    from the enclosing scope.
    """
    template = transcript[0]
    entry = GTF.Entry()
    entry.contig = template.contig
    entry.gene_id = template.gene_id
    entry.transcript_id = template.transcript_id
    entry.strand = template.strand
    entry.feature = anno
    entry.start, entry.end = interval
    results.append(entry)
def test_entry(frame, strand, xfrom, xto, start, end, ref):
    """Run ``transform_third_codon`` on one synthetic entry and report mismatches.

    An entry with the given *frame* and *strand* spanning *xfrom* to
    *xto* is passed as the single interval ``(xfrom, xto, entry)``
    together with *start* and *end*. A message is printed if the result
    differs from *ref*.
    """
    probe = GTF.Entry()
    probe.frame, probe.strand = frame, strand
    probe.start, probe.end = xfrom, xto

    observed = transform_third_codon(start, end, [(xfrom, xto, probe)])
    if ref != observed:
        print("failed:", ref != observed)
def annotateTTS(iterator, fasta, options):
    """Write a termination-site window for each transcript in *iterator*.

    Entries are grouped by gene through ``GTF.gene_iterator``. For each
    transcript a window of ``options.promotor`` bases is placed at the
    transcript end: before the smallest exon start on the negative
    strand, after the largest exon end otherwise. Windows are kept
    within the contig, whose length is looked up in *fasta*.

    If ``options.merge_promotors`` is set, the windows of a gene are
    merged with ``Intervals.combine`` and numbered from 1; otherwise
    each window keeps the transcript_id of its transcript. Records are
    written to ``options.stdout`` with source ``tts``. A summary goes
    to ``options.stdlog`` if ``options.loglevel`` is at least 1.
    """
    ngenes = ntranscripts = npromotors = 0
    flank = options.promotor

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        first = gene[0][0]
        on_negative = Genomics.IsNegativeStrand(first.strand)
        contig_length = fasta.getLength(first.contig)

        windows = []
        names = []
        for transcript in gene:
            ntranscripts += 1
            lowest = min([x.start for x in transcript])
            highest = max([x.end for x in transcript])
            names.append(transcript[0].transcript_id)
            # a window touching the start or end of the contig lies
            # within an exon; otherwise it lies outside the transcript
            if on_negative:
                window = (max(0, lowest - flank), max(flank, lowest))
            else:
                window = (min(highest, contig_length - flank),
                          min(contig_length, highest + flank))
            windows.append(window)

        if options.merge_promotors:
            # merging may change number and order, so rename sequentially
            windows = Intervals.combine(windows)
            names = ["%i" % (n + 1) for n in range(len(windows))]

        record = GTF.Entry()
        record.fromGTF(first, first.gene_id, first.gene_id)
        record.source = "tts"

        for (start, end), name in zip(windows, names):
            record.start, record.end = start, end
            record.transcript_id = name
            options.stdout.write("%s\n" % str(record))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def convert_set(gffs, gene_pattern, transcript_pattern, options):
    '''Set gene_id and transcript_id from format patterns and write GTF.

    For every entry in *gffs*, ``gene_id`` is built from *gene_pattern*
    and ``transcript_id`` from *transcript_pattern*. Both are
    ``%``-format strings filled from the entry's ``asDict()`` fields.
    The entry is then copied into a ``GTF.Entry`` and written to
    ``options.stdout``. A ``Parent`` attribute, if present, is joined
    into a comma-separated string first.

    Previously the transcript_id was also built from *gene_pattern*,
    so *transcript_pattern* had no effect.
    '''
    gene_template = str(gene_pattern)
    transcript_template = str(transcript_pattern)

    for gff in gffs:
        gff.gene_id = gene_template % gff.asDict()
        # asDict() is called again on purpose, as in the original, in
        # case the fields it returns reflect the gene_id just set
        gff.transcript_id = transcript_template % gff.asDict()

        gtf_entry = GTF.Entry()
        gtf_entry.copy(gff)
        if "Parent" in gtf_entry:
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")
def buildRepeatTrack(infile, outfile):
    '''Build a repeat track as negative control.

    A random sample of ``PARAMS["ancestral_repeats_samplesize"]``
    entries is drawn from the gzip-compressed file *infile* and written
    to the gzip-compressed file *outfile*. Sampled entries get their
    zero-based position in *infile*, formatted as ``%08i``, as both
    gene_id and transcript_id.

    Both files are opened in text mode, because ``gzip.open`` with
    mode ``"w"`` is binary in Python 3 and rejects ``str``. Handles are
    closed through context managers.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` if the sample size exceeds the
        number of entries in *infile*.
    '''
    # first pass: count entries so that indices can be sampled
    with gzip.open(infile, "rt") as inf:
        nrepeats = sum(1 for _ in GTF.iterator(inf))

    sample = set(
        random.sample(range(nrepeats),
                      PARAMS["ancestral_repeats_samplesize"]))

    # second pass: write the sampled entries
    gtf = GTF.Entry()
    with gzip.open(infile, "rt") as inf, gzip.open(outfile, "wt") as outf:
        for x, gff in enumerate(GTF.iterator(inf)):
            if x not in sample:
                continue
            gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
            outf.write("%s\n" % str(gtf))

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
def addSegment(feature, start, end, template, options):
    """Write a segment of type *feature* spanning *start* to *end*.

    *template* is either an entry, or a tuple of two entries. The
    first (or only) entry is copied and its attributes are cleared. If
    a tuple is given, the gene_id of its second entry is recorded as
    attribute ``downstream_gene_id``. For features other than exon,
    CDS, UTR, UTR3 and UTR5 the score is set to ``"."``.

    Returns 1 if a segment was written to ``options.stdout`` and 0 if
    the segment is empty (``start >= end``).
    """
    if start >= end:
        return 0

    paired = isinstance(template, tuple)
    source = template[0] if paired else template

    segment = GTF.Entry()
    segment.copy(source)
    segment.clearAttributes()
    if paired:
        segment.addAttribute("downstream_gene_id", template[1].gene_id)

    segment.start, segment.end = start, end
    segment.feature = feature
    if feature not in ("exon", "CDS", "UTR", "UTR3", "UTR5"):
        segment.score = "."

    options.stdout.write(str(segment) + "\n")
    return 1
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    For every transcript one ``exon`` record per exon is written with
    frame ``"."``. Where an exon overlaps the coding region, it is
    followed by a ``CDS`` record clipped to that region.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, transcripts that map to more than one location are
       removed.
    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    SQLResult = collections.namedtuple(
        'Result',
        '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    outf = iotools.open_file(outfile, "w")
    # try/finally closes the output handle even if a row fails to convert
    try:
        cc = dbhandle.execute(statement)

        for r in map(SQLResult._make, cc.fetchall()):

            if r.transcript_id in duplicates:
                continue

            # the comma-separated lists end with a comma, so the last
            # (empty) field is dropped
            starts = list(map(int, r.starts.split(",")[:-1]))
            ends = list(map(int, r.ends.split(",")[:-1]))
            frames = list(map(int, r.frames.split(",")[:-1]))

            gtf = GTF.Entry()
            gtf.contig = r.contig
            gtf.source = "protein_coding"
            gtf.strand = r.strand
            gtf.gene_id = r.gene_id
            gtf.transcript_id = r.transcript_id
            gtf.addAttribute("protein_id", r.protein_id)
            gtf.addAttribute("transcript_name", r.transcript_id)
            gtf.addAttribute("gene_name", r.gene_name)

            assert len(starts) == len(ends) == len(frames)

            # number exons in direction of transcription
            if gtf.strand == "-":
                starts.reverse()
                ends.reverse()
                frames.reverse()

            counts.transcripts += 1
            i = 0
            for start, end, frame in zip(starts, ends, frames):
                gtf.feature = "exon"
                counts.exons += 1
                i += 1
                gtf.addAttribute("exon_number", i)
                # frame of utr exons is set to -1 in UCSC
                gtf.start, gtf.end, gtf.frame = start, end, "."
                outf.write("%s\n" % str(gtf))

                # r.start/r.end hold cdsStart/cdsEnd: clip exon to CDS
                cds_start, cds_end = max(r.start, start), min(r.end, end)
                if cds_start >= cds_end:
                    # UTR exons have no CDS
                    # do not expect any in UCSC
                    continue
                gtf.feature = "CDS"
                # invert the frame
                frame = (3 - frame % 3) % 3
                gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
                outf.write("%s\n" % str(gtf))
    finally:
        outf.close()

    E.info("%s" % str(counts))
def main(argv=None):
    '''Convert between gtf/gff3 and tab-separated tables.

    Depending on the options, input is read from ``options.stdin`` and
    one of the following is written to ``options.stdout``:

    * ``--attributes-as-columns``: a table with one column per
      attribute (optionally only the attributes),
    * ``--invert``: gtf records rebuilt from a tab-separated table,
    * ``--output-map``: a two-column map between identifiers,
    * otherwise: a header line followed by the input entries.

    Arguments
    ---------
    argv : list
        Command line arguments; ``sys.argv`` is used if None.
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    # the help text used to claim that the input is gtf
    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gff3 format instead of gtf.")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.start(parser, argv=argv)

    fixed_columns = ["contig", "source", "feature", "start", "end",
                     "score", "strand", "frame"]

    if options.output_full:
        # output full table with column for each attribute
        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes.update(gtf.keys())
        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes.update(gff.attributes)

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])
        attributes = sorted(attributes)

        # select columns depending on input format
        if options.is_gtf:
            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = fixed_columns + \
                    ["gene_id", "transcript_id"] + attributes
        else:
            if options.only_attributes:
                header = attributes
            else:
                header = fixed_columns + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                values = []
                for a in header:
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        val = ""
                    values.append(str(val))
                options.stdout.write("\t".join(values) + "\n")
        else:
            for gff in data:
                values = []
                # fixed columns are only written if they are in the
                # header; previously they were written even with
                # --output-only-attributes, so rows did not match it
                if not options.only_attributes:
                    values.extend((gff.contig, gff.source, gff.feature,
                                   gff.start, gff.end, gff.score,
                                   gff.strand, gff.frame))
                for a in attributes:
                    try:
                        val = gff.attributes[a]
                    except (AttributeError, KeyError):
                        val = ''
                    values.append(val)
                options.stdout.write(
                    "\t".join(str(v) for v in values) + "\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            # rstrip keeps the last character of a final line that
            # has no trailing newline
            data = line.rstrip("\n").split("\t")
            if not header:
                header = data
                map_header2column = dict(
                    [(y, x) for x, y in enumerate(header)])
                continue

            # fill gtf entry with data; coordinates are used as given
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg)) from msg

            if gtf.frame is None:
                gtf.frame = "."

            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        # map name -> (source attribute, target attribute, header line)
        maps = {
            "transcript2gene": ("transcript_id", "gene_id",
                                "transcript_id\tgene_id\n"),
            "peptide2gene": ("protein_id", "gene_id",
                             "peptide_id\tgene_id\n"),
            "peptide2transcript": ("protein_id", "transcript_id",
                                   "peptide_id\ttranscript_id\n"),
        }
        fr, to, title = maps[options.output_map]
        options.stdout.write(title)

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            # entries lacking one of the two attributes are skipped
            try:
                map_fr2to[getattr(gtf, fr)] = getattr(gtf, to)
            except (AttributeError, KeyError):
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id",
                  "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            # NOTE(review): this attribute string is built but never
            # written; the row below is str(gtf), whose columns may not
            # match the header above - confirm the intended format
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            # a missing frame is written as "."
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.stop()
def buildTerritories(iterator, fasta, method, options):
    """build gene territories.

    Overlapping entries, as grouped by ``GTF.iterator_overlaps``, form
    one segment. Each segment is enlarged by ``options.radius`` on
    both sides. If the gap to the neighbouring segment is smaller than
    twice the radius, the segment is extended by half of the gap
    instead, so that neighbouring territories meet at the midpoint.
    The first and last segment of a contig are bounded by position 0
    and the contig length from *fasta*.

    If *method* is ``gene``, gene territories will be built. If
    *method* is ``tss``, each segment is first reduced to a single
    base at its strand-dependent start.

    Territories are written to ``options.stdout``. gene_id and
    transcript_id are the sorted, ``:``-joined gene_ids of the
    entries in the segment. Segments built from more than one entry
    get an ``ambiguous`` attribute.

    ``ninput`` in the log counts the input entries; previously it was
    never incremented.
    """

    ninput, noutput, nambiguous = 0, 0, 0

    assert method in ("gene", "tss")

    dr = 2 * options.radius

    def _extent(matches):
        """return (start, end) of a group, restricted to the tss if requested."""
        start = min([x.start for x in matches])
        end = max([x.end for x in matches])
        if method == "tss":
            # restrict to tss
            if matches[0].strand == "+":
                end = start + 1
            else:
                start = end - 1
        return start, end

    def _iterator(iterator):
        """yield gene plus the locations of the end of the previous gene
        and start of next gene"""

        last_end, prev_end = 0, 0
        last_contig = None
        last = None
        for matches in GTF.iterator_overlaps(iterator):

            this_start, this_end = _extent(matches)
            this_contig = matches[0].contig

            if last_contig != this_contig:
                # flush the last group of the previous contig; its
                # territory may extend to the end of that contig
                if last:
                    yield prev_end, last, fasta.getLength(last_contig)
                last_end, prev_end = 0, 0
            else:
                yield prev_end, last, this_start

            prev_end = last_end
            last_end = this_end
            last = matches
            last_contig = this_contig

        if last:
            yield prev_end, last, fasta.getLength(last_contig)

    for last_end, matches, next_start in _iterator(iterator):

        ninput += len(matches)

        gff = GTF.Entry().copy(matches[0])

        start, end = _extent(matches)

        # extend upstream: full radius, or half the gap if too narrow
        d = start - last_end
        if d < dr:
            start -= d // 2
        else:
            start -= options.radius

        # extend downstream in the same way
        d = next_start - end
        if d < dr:
            end += d // 2
        else:
            end += options.radius

        gff.gene_id = ":".join(sorted(set([x.gene_id for x in matches])))
        gff.transcript_id = gff.gene_id
        gff.start, gff.end = start, end

        nsegments = len(matches)
        if nsegments > 1:
            gff.addAttribute("ambiguous", nsegments)
            nambiguous += 1

        assert gff.start < gff.end, "invalid segment: %s" % str(gff)
        options.stdout.write(str(gff) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nambiguous=%i" %
           (ninput, noutput, nambiguous))
def convert_hierarchy(first_gffs, second_gffs, options):
    '''Convert GFF to GTF by resolving the parent hierarchy.

    A first pass over *first_gffs* records for every ``ID`` its
    feature type, its parents and its candidate gene_id and
    transcript_id (taken from the attributes named by
    ``options.gene_field_or_pattern`` and
    ``options.transcript_field_or_pattern``, falling back to the ID).
    If ``options.parent`` is not ``Parent``, the ``Parent`` attribute
    is first filled from that attribute, split at commas.

    A second pass over *second_gffs* calls ``search_hierarchy`` for
    each entry and writes one GTF record to ``options.stdout`` per
    (gene_id, transcript_id) pair found.

    If ``options.missing_gene`` is set, a missing transcript_id is
    replaced by the possible transcript_id reported by
    ``search_hierarchy`` and, if that is missing too, by the gene_id.
    Otherwise a missing transcript_id raises ValueError.

    If ``options.discard`` is set, entries of type
    ``options.gene_type`` or ``options.transcript_type`` are skipped,
    as are, with ``options.missing_gene``, entries without the parent
    attribute.

    An assertion fails if ``search_hierarchy`` returns no gene_ids or
    no transcript_ids.
    '''

    parent_field = options.parent
    hierarchy = {}

    for node in first_gffs:
        if parent_field != "Parent":
            if parent_field in node.asDict():
                node['Parent'] = node[parent_field].split(",")
            else:
                node['Parent'] = []

        node_id = node['ID']
        hierarchy[node_id] = {
            "type": node.feature,
            "Parent": node.asDict().get("Parent", []),
            "gene_id": node.attributes.get(
                options.gene_field_or_pattern, node_id),
            "transcript_id": node.attributes.get(
                options.transcript_field_or_pattern, node_id)}

    for gff in second_gffs:

        if options.discard:
            orphan = options.missing_gene and parent_field not in gff
            if orphan or gff.feature in (options.gene_type,
                                         options.transcript_type):
                continue

        gene_ids, transcript_ids, poss_transcript_ids = search_hierarchy(
            gff['ID'], hierarchy, options)

        assert len(gene_ids) > 0 and len(transcript_ids) > 0

        if options.missing_gene:
            # fall back to the possible transcript_id, then to gene_id
            resolved = []
            for found, poss, gid in zip(transcript_ids,
                                        poss_transcript_ids,
                                        gene_ids):
                if found is None:
                    found = poss
                if found is None:
                    found = gid
                resolved.append(found)
            transcript_ids = resolved
        elif None in transcript_ids:
            raise ValueError("failed to find transcript id for %s" % gff['ID'])

        for gene_id, transcript_id in zip(gene_ids, transcript_ids):
            gff.gene_id = gene_id
            gff.transcript_id = transcript_id

            record = GTF.Entry()
            record.copy(gff)
            if "Parent" in record:
                record['Parent'] = ",".join(record['Parent'])

            options.stdout.write(str(record) + "\n")
def annotateRegulons(iterator, fasta, tss, options):
    """Write a regulon window for each transcript in *iterator*.

    Entries are grouped by gene through ``GTF.gene_iterator``. For
    each transcript a window reaching ``options.upstream`` bases
    upstream and ``options.downstream`` bases downstream of an anchor
    is built, taking the strand of the gene into account. The anchor
    is the transcript start if *tss* is true and the transcript end
    otherwise. Windows are clipped to the contig, whose length is
    looked up in *fasta*.

    If ``options.merge_promotors`` is set, the windows of a gene are
    merged with ``Intervals.combine`` and numbered from 1; otherwise
    each window keeps the transcript_id of its transcript. Records are
    written to ``options.stdout`` with source ``regulon``.
    """
    ngenes = ntranscripts = nregulons = 0
    upstream, downstream = options.upstream, options.downstream

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        first = gene[0][0]
        on_negative = Genomics.IsNegativeStrand(first.strand)
        contig_length = fasta.getLength(first.contig)

        def clip(position):
            # keep a coordinate within [0, contig length]
            return min(contig_length, max(0, position))

        windows = []
        names = []
        for transcript in gene:
            ntranscripts += 1
            lowest = min([x.start for x in transcript])
            highest = max([x.end for x in transcript])

            # the anchor is the tss or tts; on the negative strand
            # upstream lies towards larger coordinates
            if tss:
                anchor = highest if on_negative else lowest
            else:
                anchor = lowest if on_negative else highest

            if on_negative:
                window = (anchor - downstream, anchor + upstream)
            else:
                window = (anchor - upstream, anchor + downstream)

            windows.append((clip(window[0]), clip(window[1])))
            names.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merging may change number and order, so rename sequentially
            windows = Intervals.combine(windows)
            names = ["%i" % (n + 1) for n in range(len(windows))]

        record = GTF.Entry()
        record.fromGTF(first, first.gene_id, first.gene_id)
        record.source = "regulon"

        for (start, end), name in zip(windows, names):
            record.start, record.end = start, end
            record.transcript_id = name
            options.stdout.write("%s\n" % str(record))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def _parse_filter_range(spec):
    """Parse a ``--filter-range`` value into ``(contig, strand, interval)``.

    Accepted forms are ``contig:strand:from..to``, ``contig:from..to``
    (``-`` is accepted instead of ``..`` in both) and ``from..to``,
    ``from,to`` or ``from-to``. Parts that are not given are returned
    as None; ``interval`` is a tuple of two integers.

    Raises ValueError if *spec* is None or matches none of the forms.
    """
    if spec is None:
        raise ValueError("method filter-range requires --filter-range")

    found = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", spec)
    if found:
        contig, strand, start, _sep, end = found.groups()
        return contig, strand, (int(start), int(end))

    found = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)", spec)
    if found:
        contig, start, _sep, end = found.groups()
        return contig, None, (int(start), int(end))

    found = re.match(r"(\d+)(\.\.|\,|\-)(\d+)", spec)
    if found:
        # the separator is a group of its own and must be unpacked
        start, _sep, end = found.groups()
        return None, None, (int(start), int(end))

    raise ValueError("can not parse range %s" % spec)


def main(argv=None):
    """Apply one of several transformations to a gff/gtf stream.

    Entries are read from ``args.stdin`` and written to
    ``args.stdout``. The transformation is chosen with ``--method``;
    without a method, entries are passed through, optionally mapped
    through an AGP file.

    Arguments
    ---------
    argv : list
        Command line arguments; ``sys.argv`` is used if None.

    Raises
    ------
    ValueError
        If required input for the chosen method is missing (contig
        sizes, mapping file, genome file, assembly report), if the
        mapping table is malformed or if ``--filter-range`` can not
        be parsed.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m", "--method", dest="method", type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups",
                 "combine-groups", "filter-range", "join-features",
                 "merge-features", "sanitize", "to-forward-coordinates",
                 "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand", dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c", "--contigs-tsv-file",
                        dest="input_filename_contigs", type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file", dest="input_filename_agp", type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file", dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    # the help text used to contain stray quote characters
    parser.add_argument(
        "--group-field", dest="group_field", type=str,
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .")

    parser.add_argument(
        "--filter-range", dest="filter_range", type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method", dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method", dest="flank_method", type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing", dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern", dest="contig_pattern", type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report", dest="assembly_report", type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids", dest="assembly_report_hasIDs", type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras", dest="assembly_extras", type=str,
        help="additional mismatches between gtf and fasta to fix when "
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream", dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream", type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance", dest="min_distance", type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance", dest="max_distance", type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features", dest="min_features", type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features", dest="max_features", type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file", dest="rename_chr_file", type=str,
        help="mapping table between old and new chromosome names. "
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    # The command line spells these methods with a "to-" prefix and
    # hyphens. The code used to test for the underscore names only, so
    # the methods were silently ignored. Both spellings are accepted.
    forward_coordinates = ("to-forward-coordinates", "forward_coordinates")
    forward_strand = ("to-forward-strand", "forward_strand")
    flank_methods = ("add-flank", "add-upstream-flank",
                     "add-downstream-flank")

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionnary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report, comment="#",
                         header=None, sep="\t")
        # the columns are needed below whatever the value of hasIDs;
        # they used to be undefined unless hasIDs was 1
        ucsccol = args.assembly_report_ucsccol
        ensemblcol = args.assembly_report_ensemblcol
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in columnn 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            assembled = df[1] == "assembled-molecule"
            df.loc[assembled, ensemblcol] = df.loc[assembled, 0]
        if args.sanitize_method == "ucsc":
            assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
        elif args.sanitize_method == "ensembl":
            assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
        else:
            raise ValueError(''' When using assembly report, please specify sanitize method as either "ucsc" or "ensembl" to specify direction of conversion ''')
    else:
        assembly_dict = {}

    if args.assembly_extras is not None:
        # extras are given as comma-separated "from-to" pairs
        for item in args.assembly_extras.split(","):
            item = item.split("-")
            assembly_dict[item[0]] = item[1]

    if args.method in forward_coordinates + forward_strand + flank_methods \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in flank_methods:

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # enlarge the outermost features of the group
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # add new features next to the outermost features
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            # output in direction of transcription
            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            # one record per gap between consecutive features
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = _parse_filter_range(args.filter_range)

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(contig):
            """translate a contig name using the assembly report."""
            if contig in assembly_dict:
                contig = assembly_dict[contig]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful for user
            # modified gff files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not contig.startswith(("contig", "chr")):
                    contig = "chr%s" % contig
            elif args.sanitize_method == "ensembl":
                if contig.startswith("contig"):
                    return contig[len("contig"):]
                elif contig.startswith("chr"):
                    return contig[len("chr"):]
            return contig

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        # compile the removal patterns once, not once per entry
        to_remove = []
        if args.contig_pattern:
            to_remove = [re.compile(x)
                         for x in args.contig_pattern.split(",")]

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if any(x.search(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s" %
                (sum(outofrange_contigs.values()),
                 len(outofrange_contigs),
                 str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs),
                    str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method in forward_coordinates:
                gff.invert(contigs[gff.contig])

            if args.method in forward_strand:
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
def combineGFF(gffs,
               min_distance,
               max_distance,
               min_features,
               max_features,
               merge=True,
               output_format="%06i"):
    """Join neighbouring intervals of a position-sorted GFF stream.

    Consecutive entries are collected into a group.  A group is closed
    and a new one started when the next entry

    * lies on a different contig,
    * starts more than ``max_distance`` after the end of the previous
      entry,
    * starts less than ``min_distance`` after the end of the previous
      entry, or
    * would make the group larger than ``max_features``.

    A value of 0/``None`` for ``max_distance``, ``min_distance`` or
    ``max_features`` disables the respective test.  Groups holding
    fewer than ``min_features`` entries are discarded.

    Note: strandedness is ignored.

    Arguments
    ---------
    gffs : iterable
        Entries with ``contig``, ``start`` and ``end`` attributes,
        sorted by contig and start position.  Any iterable is
        accepted, not only iterators.
    min_distance, max_distance : int
        Bounds on the gap between consecutive entries (see above).
    min_features, max_features : int
        Bounds on the number of entries per group (see above).
    merge : bool
        If true, emit one entry per group; otherwise emit every
        member of every retained group.
    output_format : str
        ``%``-format applied to the running group number.  The
        resulting label is passed as both identifier arguments of
        ``GTF.Entry.fromGTF``.

    Yields
    ------
    GTF.Entry
        With ``merge``: one entry per group, initialised from the
        first member and spanning from the first start to the largest
        end in the group.  Without ``merge``: one entry per member,
        all members of a group sharing the group's label.

    Raises
    ------
    AssertionError
        If two consecutive entries on the same contig are not sorted
        by start position.
    """

    E.info("joining features: min distance=%i, max_distance=%i, "
           "at least %i and at most %i features." %
           (min_distance, max_distance, min_features, max_features))

    def iterate_chunks(entries):
        """Yield lists of consecutive entries that form one group."""
        entries = iter(entries)
        try:
            last = next(entries)
        except StopIteration:
            # Empty input. Under PEP 479 a StopIteration escaping a
            # generator body turns into RuntimeError, so end explicitly.
            return

        to_join = [last]

        for gff in entries:
            d = gff.start - last.end
            if gff.contig == last.contig:
                assert gff.start >= last.start, "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" % (
                    d, last, gff)

            if gff.contig != last.contig or \
                    (max_distance and d > max_distance) or \
                    (min_distance and d < min_distance) or \
                    (max_features and len(to_join) >= max_features):
                # Same size filter as for the final group below; the
                # previous ``min_features or ...`` test was always true.
                if len(to_join) >= min_features:
                    yield to_join
                to_join = []

            last = gff
            to_join.append(gff)

        if len(to_join) >= min_features:
            yield to_join
        # A plain fall-through ends the generator; ``raise
        # StopIteration`` is an error since Python 3.7.

    group_id = 1
    ninput, noutput, nfeatures = 0, 0, 0

    for to_join in iterate_chunks(gffs):
        ninput += 1
        label = output_format % group_id

        if merge:
            y = GTF.Entry()
            y.fromGTF(to_join[0], label, label)
            y.start = to_join[0].start
            # Entries are sorted by start only, so the last member need
            # not reach furthest (e.g. an interval nested in an earlier
            # one); take the largest end to cover the whole group.
            y.end = max(x.end for x in to_join)
            yield y
            nfeatures += 1
        else:
            for x in to_join:
                y = GTF.Entry()
                y.fromGTF(x, label, label)
                yield y
                nfeatures += 1

        noutput += 1
        group_id += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i" %
           (ninput, noutput, nfeatures))
def main(argv=sys.argv):
    """Convert BED records read from stdin into GFF/GTF lines on stdout.

    A single ``GTF.Entry`` is reused for all records.  For every BED
    record the contig, start and end are copied over.  The optional BED
    fields are used as follows:

    * third field: strand (``"."`` if absent),
    * second field: score (left untouched if absent),
    * first field: gene and transcript id with ``--as-gtf``, otherwise
      the source column.

    With ``--as-gtf`` and no optional fields, a running number
    formatted with ``--id-format`` is used as gene and transcript id.

    Note that ``argv`` is accepted but not forwarded to ``E.start``.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-a", "--as-gtf", dest="as_gtf", action="store_true",
        help="output as gtf.")

    parser.add_argument(
        "-f", "--id-format", dest="id_format", type=str,
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file .")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    args = E.start(parser, add_pipe_options=True)

    emit_gtf = args.as_gtf
    id_template = args.id_format

    # one entry object, overwritten for every record
    record = GTF.Entry()
    record.source = "bed"
    record.feature = "exon"

    ninput = noutput = nskipped = 0
    serial = 0

    for bed in Bed.iterator(args.stdin):
        ninput += 1

        record.contig = bed.contig
        record.start = bed.start
        record.end = bed.end

        extras = bed.fields
        n_extras = len(extras) if extras else 0

        record.strand = extras[2] if n_extras >= 3 else "."

        if n_extras >= 2:
            record.score = extras[1]

        if not extras:
            # no name column: only gtf output needs a generated id
            if emit_gtf:
                serial += 1
                record.gene_id = id_template % serial
                record.transcript_id = id_template % serial
        elif emit_gtf:
            record.gene_id = extras[0]
            record.transcript_id = extras[0]
        else:
            record.source = extras[0]

        args.stdout.write(str(record) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
    E.stop()
def annotateGREATDomains(iterator, fasta, options):
    """Build GREAT-style regulatory domains, one per gene.

    The function works in two passes:

    1. For every transcript a basal region is placed around its TSS
       (smallest start on the forward strand, largest end on the
       reverse strand), reaching ``options.upstream`` bases upstream
       and ``options.downstream`` bases downstream and clipped to
       ``[0, contig length]``.  The basal region of a gene spans from
       the smallest start to the largest end of its transcripts'
       basal regions, so it can be larger than upstream + downstream.
    2. Basal regions are grouped with ``GTF.iterator_overlaps`` and
       each region is extended by up to ``options.radius`` bases.  The
       extension stops at the largest basal end of the previous group
       and at the smallest basal start of the next group on the same
       contig (or at the contig boundaries).  The downstream side is
       always extended; the upstream side only if the region's
       upstream boundary does not lie inside another basal region of
       the same group.

    Arguments
    ---------
    iterator :
        GTF entries, consumed through ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``.
    options :
        Object providing ``upstream``, ``downstream``, ``radius`` and
        ``stdout``.

    Side effects
    ------------
    Extended domains are written to ``options.stdout``.  The basal
    regions are additionally written to ``test.gff`` in the current
    working directory, and the counters are logged via ``E.info``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:

        counter.genes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)

        lcontig = fasta.getLength(template.contig)
        regulons = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            # add range to both sides of tss
            if is_negative_strand:
                tss = max(x.end for x in transcript)
                interval = tss - downstream, tss + upstream
            else:
                tss = min(x.start for x in transcript)
                interval = tss - upstream, tss + downstream

            # clip to the contig
            regulons.append((min(lcontig, max(0, interval[0])),
                             min(lcontig, max(0, interval[1]))))

        # take first/last entry
        start = min(x[0] for x in regulons)
        end = max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    # Sorted once; the list is not modified again before grouping.
    regions.sort(key=lambda x: (x.contig, x.start))

    # NOTE(review): this unconditionally overwrites "test.gff" in the
    # working directory and looks like leftover debugging output.  It
    # is kept for backward compatibility; the context manager makes
    # sure the handle is closed even if a write fails.
    with iotools.open_file("test.gff", "w") as outf:
        for x in regions:
            outf.write(str(x) + "\n")

    ####################################################################
    # extend basal regions

    def overlapsBasalRegion(intervals, pos):
        """True if pos lies strictly inside one of the basal intervals.

        Intervals that merely start or end at pos are ignored, which
        skips the interval pos was taken from.  The early exit relies
        on the intervals being ordered by start.
        """
        for start, end in intervals:
            if start == pos or end == pos:
                continue
            if start <= pos < end:
                return True
            if start > pos:
                return False
        return False

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0

    for region_id, group in enumerate(groups):

        # snapshot of the basal intervals, taken before any entry of
        # the group is extended below
        intervals = [(x.start, x.end) for x in group]

        # deal with boundary cases - end of contig
        reset = True
        next_start = None
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min(x.start for x in nxt)
                reset = False
        if reset:
            # last group on this contig: extend up to the contig end
            next_start = fasta.getLength(group[0].contig)

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend region to previous/next group. Always extend
        # downstream, but upstream only extend if basal region of an
        # interval is not already overlapping another basal region
        # within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(intervals, gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(intervals, gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        # the next group starts a new contig if this one was the last
        last_end = 0 if reset else save_end

    E.info("%s" % str(counter))
def annotateGenes(iterator, fasta, options):
    """Annotate gene structures.

    For every transcript this function derives intervals for exons,
    introns and the upstream/downstream flanks:

    * exons are labelled ``first_exon``/``middle_exon``/``last_exon``
      if ``"exons"`` is in ``options.detail``, otherwise ``exon``;
    * introns (the gaps between consecutive sorted exons) are labelled
      ``first_intron``/``middle_intron``/``last_intron`` if
      ``"introns"`` is in ``options.detail``, otherwise ``intron``;
    * flanks are tiled in windows of ``options.increment`` bases up to
      ``options.flank`` bases and labelled ``upstream_<distance>`` /
      ``downstream_<distance>``; windows leaving the contig are
      dropped.

    First/last and upstream/downstream are relative to the strand of
    the gene.  Within a gene, intervals sharing a label are merged with
    ``Intervals.combine`` before they are written, sorted by start, to
    ``options.stdout``.

    This method annotates per transcript. In order to achieve a unique
    tiling, use only a single transcript per gene and remove any
    overlap between genes.

    Note: an earlier version of this docstring also listed UTRs; no
    UTR intervals are produced by this function.

    Genes on contigs unknown to ``fasta`` (``getLength`` raises
    ``KeyError``) and transcripts without any ``exon`` feature are
    skipped and counted in ``nskipped``.

    Arguments
    ---------
    iterator :
        GTF entries, consumed through ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``.
    options :
        Object providing ``increment``, ``detail``, ``flank`` and
        ``stdout``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:
            template = transcript[0]

            def _add(interval, anno, template=template):
                """Append an entry for interval labelled anno."""
                gtf = GTF.Entry()
                gtf.contig = template.contig
                gtf.gene_id = template.gene_id
                gtf.transcript_id = template.transcript_id
                gtf.strand = template.strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = sorted((x.start, x.end)
                           for x in transcript if x.feature == "exon")
            if not exons:
                # Nothing to annotate; previously execution fell
                # through and failed with IndexError on exons[0].
                nskipped += 1
                continue

            # introns are the gaps between consecutive exons
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank: windows ordered from nearest to farthest
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for _ in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                # make first/last and upstream/downstream strand-aware
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # groupby only groups adjacent items, so sort by label first.
        # The sort is stable: within a label, entries keep the order
        # in which the transcripts added them.
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = Intervals.combine([(x.start, x.end) for x in v])

            for start, end in intervals:
                # merged intervals inherit all other fields from the
                # first entry carrying this label
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))