def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications

    Writes one tab-separated row per transcript found in *infile*
    (transcript_id, gene_id and the ``source`` field of the transcript's
    first entry) to a temporary file in the current directory and hands
    that file to ``P.load`` with *outfile* as the target.

    The input handle is always closed and the temporary file is always
    removed, even if reading or loading fails.
    '''
    temp = P.getTempFile(".")
    try:
        inf = iotools.openFile(infile)
        try:
            # just load each transcript with its classification
            for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
                first = transcript[0]
                temp.write("%s\t%s\t%s\n" % (first.transcript_id,
                                             first.gene_id,
                                             first.source))
        finally:
            inf.close()
            # close (and thereby flush) before P.load reads the file
            temp.close()

        P.load(temp.name,
               outfile,
               options="--header-names=transcript_id,gene_id,class "
               "--add-index=transcript_id "
               "--add-index=gene_id")
    finally:
        os.unlink(temp.name)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds the option parser, requires ``--genome-file`` and passes the
    transcripts read from ``options.stdin`` together with the indexed
    genome to ``annotateGenome``.
    """
    argv = argv or sys.argv

    # setup command line parser
    cli = E.OptionParser(
        version="%prog version: "
        "$Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    cli.add_option(
        "-g", "--genome-file",
        dest="genome_file",
        type="string",
        help="filename with genome [default=%default].")

    cli.add_option(
        "-i", "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help="Ignore transcripts on contigs that are not "
        "in the genome-file [default=%default].")

    cli.add_option(
        "--min-intron-length",
        dest="min_intron_length",
        type="int",
        help="minimum intron length. If the distance between two "
        "consecutive exons is smaller, the region will be marked "
        "'unknown' [default=%default].")

    cli.add_option(
        "-m", "--method",
        dest="method",
        type="choice",
        choices=("full", ),
        help="method to apply [default=%default].")

    default_values = dict(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )
    cli.set_defaults(**default_values)

    # add common options (-h/--help, ...) and parse command line
    options, args = E.start(cli, argv=argv, add_output_options=True)

    genome_path = options.genome_file
    if not genome_path:
        raise ValueError("an indexed genome is required.")

    genome = IndexedFasta.IndexedFasta(genome_path)
    transcripts = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(transcripts, genome, options)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Exactly one positional argument is required: the BAM file with reads
    mapped against the genome.  Its reads are passed through
    ``bams2bam_filter`` together with the optional transcriptome BAM,
    junctions BAM, gene models and regions to remove.  Filtered reads
    are written to ``"-"`` (SAM with header if ``--output-sam`` is set,
    BAM otherwise); mismapped reads and filter statistics are written to
    the files given on the command line, if any.

    Raises:
        ValueError: if the number of positional arguments is not one.
        IOError: if the mismapped output file exists and ``--force`` is
            not set.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions",
        type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option(
        "-u", "--unique", dest="unique", action="store_true",
        help="remove reads not matching uniquely [%default]")

    parser.add_option(
        "--output-sam", dest="output_sam", action="store_true",
        help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        # invert the map: look up by value, yield the original key
        id_map = {y: x for x, y in id_map.items()}
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    # transcripts without a mapping are not indexed
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        with iotools.open_file(options.filename_stats, "w") as outf:
            outf.write("category\tcounts\n%s\n" % c.asTable())

    # close every alignment file that was opened, including the
    # junctions file
    for samfile in (transcripts_samfile,
                    junctions_samfile,
                    genome_samfile,
                    output_samfile,
                    output_mismapped):
        if samfile is not None:
            samfile.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=sys.argv):
    """script main.

    Converts GFF/GTF entries read from ``args.stdin`` into BED lines
    written to ``args.stdout``.  With ``--bed12-from-transcripts`` the
    input is grouped into transcripts and each transcript is converted
    with ``transcript2bed12``; otherwise each entry is converted with
    ``Bed.fromGTF`` using ``--set-name`` as the name field.  With
    ``--track`` all input is read into memory, sorted and grouped by the
    chosen field, and a ``track name=...`` line precedes each group.

    *argv* is forwarded to ``E.start`` so that an explicitly supplied
    argument list is actually the one that gets parsed.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input file is in gtf format")

    parser.add_argument("--set-name", dest="name", type=str,
                        help="field from the GFF/GTF file to use as the "
                        "name field in the BED file ",
                        choices=("gene_id", "transcript_id", "class",
                                 "family", "feature", "source", "repName",
                                 "gene_biotype"))

    parser.add_argument("--track", dest="track", type=str,
                        choices=("feature", "source", None),
                        help="use feature/source field to define BED tracks ")

    parser.add_argument(
        "--bed12-from-transcripts", dest="bed12", action="store_true",
        default=False,
        help="Process GTF file into Bed12 entries, with blocks as exons "
        "and thick/thin as coding/non-coding")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    (args) = E.start(parser, argv=argv, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(args.stdin)
    if args.bed12:
        iterator = GTF.transcript_iterator(iterator)

    # a single Bed object is refilled for every non-bed12 entry
    reusable_bed = Bed.Bed()

    def _to_bed(gff):
        """Return the BED record for one entry (or one transcript)."""
        if args.bed12:
            return transcript2bed12(gff)
        reusable_bed.fromGTF(gff, name=args.name)
        return reusable_bed

    if args.track:
        track_field = args.track

        if args.bed12:
            # in bed12 mode each item is a list of entries forming one
            # transcript, so the track is taken from its first entry
            def grouper(x):
                return getattr(x[0], track_field)
        else:
            def grouper(x):
                return getattr(x, track_field)

        all_input = list(iterator)
        # groupby only merges adjacent items, hence the sort on the same key
        all_input.sort(key=grouper)

        for key, vals in itertools.groupby(all_input, grouper):
            args.stdout.write("track name=%s\n" % key)
            for gff in vals:
                ninput += 1
                args.stdout.write(str(_to_bed(gff)) + "\n")
                noutput += 1
    else:
        for gff in iterator:
            ninput += 1
            args.stdout.write(str(_to_bed(gff)) + "\n")
            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF entries from ``options.stdin``, groups them (by
    transcript for GTF, by chunk or joined intervals for GFF), fetches
    the sequence of each group from the indexed genome and writes one
    FASTA record per group to ``options.stdout``.  Groups can be
    filtered by feature, truncated by mask regions, extended at either
    end, masked and filtered by total length.

    Note that ``fasta`` and ``contigs`` are only defined when
    ``--genome-file`` is given; they are used for every group that
    reaches sequence retrieval.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    # forward argv so that an explicitly supplied argument list is parsed
    (options, args) = E.start(parser, argv=argv)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # coordinates reported in the FASTA header; note that this is the
        # same list object as ``intervals`` until ``intervals`` is rebound
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum([len(x) for x in s])
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' padding belongs in front of the FIRST segment
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds the argument parser, requires ``--genome-file`` and passes
    the transcripts read from ``args.stdin`` together with the indexed
    genome to ``annotateGenome``.
    """
    argv = argv or sys.argv

    # setup command line parser
    cli = E.ArgumentParser(description=__doc__)

    cli.add_argument("--version", action='version', version="1.0")

    cli.add_argument(
        "-g", "--genome-file",
        dest="genome_file",
        type=str,
        help="filename with genome")

    cli.add_argument(
        "-i", "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help="Ignore transcripts on contigs that are not "
        "in the genome-file.")

    cli.add_argument(
        "--min-intron-length",
        dest="min_intron_length",
        type=int,
        help="minimum intron length. If the distance between two "
        "consecutive exons is smaller, the region will be marked "
        "'unknown")

    cli.add_argument(
        "-m", "--method",
        dest="method",
        type=str,
        choices=["full"],
        help="method to apply")

    default_values = dict(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )
    cli.set_defaults(**default_values)

    # add common options (-h/--help, ...) and parse command line
    args = E.start(cli, argv=argv, add_output_options=True)

    genome_path = args.genome_file
    if not genome_path:
        raise ValueError("an indexed genome is required.")

    genome = IndexedFasta.IndexedFasta(genome_path)
    transcripts = GTF.transcript_iterator(GTF.iterator(args.stdin))

    annotateGenome(transcripts, genome, args)

    # write footer and output benchmark information.
    E.stop()
def _make_bed(intervals, nfields, transcript_entry, name, thick_start=None):
    """Build a Bed record from *intervals* for the given transcript entry.

    The record gets *nfields* placeholder fields, the contig and strand
    of *transcript_entry*, the given *name* and, if supplied,
    *thick_start* as its ``thickStart``.
    """
    bed = Bed.Bed()
    bed.fields = ['.'] * nfields
    bed.fromIntervals(intervals)
    bed.contig = transcript_entry.contig
    bed["name"] = name
    bed["strand"] = transcript_entry.strand
    if thick_start is not None:
        bed["thickStart"] = thick_start
    return bed


def _in_exon(position, exons):
    """Return True if *position* lies within any (start, end) of *exons*."""
    return any(e[0] <= position <= e[1] for e in exons)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every transcript read from ``options.stdin`` the matching
    reference gene is looked up in the class table (``--class-file``)
    and the gene table built from ``--reffile``.  The transcript is
    compared against each reference transcript of that gene whose start
    codon falls inside one of its exons; if the intron chains within the
    reference CDS are identical, introns located in the 3' UTR are
    collected and written to the requested BED files:

    * ``--outfile``: one record per novel/reference transcript pair
    * ``--indivfile``: one record per 3' UTR intron
    * ``--partfile`` / ``--indivpartfile``: introns absent from the
      matched reference transcript (only when the reference has no
      3' UTR intron missing from the novel transcript)
    * ``--novel-file``: introns whose start and end are both absent
      from every reference intron of the gene

    ``--novel-transcript`` / ``--target-transcript`` enable debug output
    for one specific pair.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r", "--reffile", dest="reffile", type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d", "--class-file", dest="classfile", type="string",
                      help="Supply database name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="Supply output bed file name")

    parser.add_option("-u", "--indivfile", dest="indivfile", type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p", "--partfile", dest="partfile", type="string",
                      help="Supply output bed file name for partnered utrons")

    parser.add_option(
        "-q", "--indivpartfile", dest="indivpartfile", type="string",
        help="Supply output bed file name for individual partnered utrons")

    parser.add_option("-n", "--novel-file", dest="novelfile", type="string",
                      help="Supply output bed file name for novel introns")

    parser.add_option(
        "--novel-transcript", dest="novel_id", type="string",
        help="DEBUG: Output info for this transcript from the STDIN")

    parser.add_option(
        "--target-transcript", dest="target_id", type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")
    enshashtable = getGeneTable(options.reffile)

    for novel_transcript in GTF.transcript_iterator(
            GTF.iterator(options.stdin)):

        # Why do it on a gene by gene basis rather than transcript by
        # transcript basis?
        first_entry = novel_transcript[0]
        transcript_id = first_entry.transcript_id
        output_novel = transcript_id == options.novel_id

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            ref_introns = GTF.toIntronIntervals(ref_transcript)
            all_ref_introns.update(ref_introns)

        # starts/ends of every reference intron of this gene; these do
        # not depend on the reference transcript examined below
        ens_starts = {intron[0] for intron in all_ref_introns}
        ens_ends = {intron[1] for intron in all_ref_introns}

        # Identify comparison set
        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        # (a previous version selected a single start codon here:
        #  max() of the starts on the "-" strand, min() otherwise)

        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:

            output_ref = (output_novel and
                          ref_transcript_id == options.target_id)

            second = ens_gene["models"][ref_transcript_id]
            pair_name = transcript_id + ":" + second[0].transcript_id

            ens_CDS = GTF.asRanges(second, "CDS")

            # ensure only protein-coding transcripts
            if len(ens_CDS) == 0:
                if output_ref:
                    E.debug("%s is not coding" % ref_transcript_id)
                continue

            ens_exons = GTF.asRanges(second, "exon")

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            # introns lying strictly inside the reference CDS span
            first_CDSintrons = set(
                intron for intron in first_introns
                if (intron[0] > ens_CDS[0][0] and
                    intron[1] < ens_CDS[-1][1]))
            second_CDSintrons = set(
                intron for intron in second_introns
                if (intron[0] > ens_CDS[0][0] and
                    intron[1] < ens_CDS[-1][1]))

            # match CDS intron chain
            if not first_CDSintrons == second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    first_CDSintrons = sorted(list(first_CDSintrons))
                    second_CDSintrons = sorted(list(second_CDSintrons))
                    output = "\n".join(
                        map(str, zip(first_CDSintrons, second_CDSintrons)))
                    E.debug(output)
                continue

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            # ensure pruned transcript doesn't have
            # introns overlapping start or stop codons in ensembl
            # transcript
            found = False
            for intron in first_introns:
                if (intron[0] < ens_CDS[-1][1] and
                        intron[1] > ens_CDS[-1][1]) or \
                        (intron[0] < ens_CDS[0][0] and
                         intron[1] > ens_CDS[0][0]):
                    found = True
                    break

            if found:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            if second[0].strand == "+":
                ens_stop = ens_CDS[-1][1]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= ens_CDS[-1][1] and
                    intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= ens_CDS[-1][1] and
                    intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = ens_CDS[0][0]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[1] <= ens_CDS[0][0] and
                    intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[1] <= ens_CDS[0][0] and
                    intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            # get output for each transcript
            outlines.append(
                _make_bed(UTR3introns, 9, first_entry, transcript_id))

            # get output for each intron
            for item in UTR3introns:
                individuals.append(
                    _make_bed([item], 4, first_entry, transcript_id,
                              thick_start=ens_stop))

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)
            ref_only_UTR3introns = secondUTR3introns - UTR3introns

            if output_ref and len(ref_only_UTR3introns) > 0:
                E.debug("Following introns in UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                E.debug(ref_only_UTR3introns)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and len(ref_only_UTR3introns) == 0:
                partnered.append(
                    _make_bed(extraUTR3introns, 9, first_entry, pair_name))

                for item in extraUTR3introns:
                    individualpartnered.append(
                        _make_bed([item], 4, first_entry, pair_name,
                                  thick_start=ens_stop))

            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                novel.append(
                    _make_bed([item], 4, first_entry, pair_name,
                              thick_start=ens_stop))

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    if options.indivfile is not None:
        with IOTools.open_file(options.indivfile, "w") as outf2:
            for line in individuals:
                outf2.write(str(line) + "\n")

    if options.partfile is not None:
        with IOTools.open_file(options.partfile, "w") as outf3:
            for line in partnered:
                outf3.write(str(line) + "\n")

    if options.indivpartfile is not None:
        with IOTools.open_file(options.indivpartfile, "w") as outf4:
            for line in individualpartnered:
                outf4.write(str(line) + "\n")

    if options.novelfile is not None:
        with IOTools.open_file(options.novelfile, "w") as outf5:
            for line in novel:
                outf5.write(str(line) + "\n")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF entries from ``options.stdin``, groups them (exons by
    transcript for GTF, joined intervals for GFF) and writes one
    ``Blat.Match`` line per group to ``options.stdout``.  Each group's
    combined intervals are laid end to end on the query.

    Subject and query lengths are taken from the genome / queries fasta
    files when these are given and contain the identifier; otherwise
    the end of the alignment (``getColTo`` / ``getRowTo``) is used, so
    both lengths are always set.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: "
        "$Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # forward argv so that an explicitly supplied argument list is parsed
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    # read from options.stdin (set up by E.start) rather than sys.stdin
    # so that input redirection via the pipe options is honoured
    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(options.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        # consecutive intervals are placed back to back on the query
        xstart = 0
        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds one or more meta-gene counters (selected with ``--method``) on
    top of a range counter chosen from the suffix of the first input file
    (``.bam``, ``.bed.gz`` or ``.bw``), accumulates counts over the
    features of a GTF file (or loads them from ``--counts-tsv-file``),
    writes the profile matrix and length statistics for each counter and
    finally plots the profiles.

    Arguments
    ---------
    argv : list, optional
        Command line arguments including the program name at position 0.
        Defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If no GTF file or no bam/wig/bed file has been specified.
    NotImplementedError
        If the input file type cannot be determined from its suffix, or
        if ``--counts-tsv-file`` is combined with more than one counter.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("geneprofile",
                               "tssprofile",
                               "utrprofile",
                               "intervalprofile",
                               "midpointprofile",
                               "geneprofilewithintrons",
                               "geneprofileabsolutedistancefromthreeprimeend",
                               "separateexonprofile",
                               "separateexonprofilewithintrons",
                               ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b", "--bam-file", "--bedfile", "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c", "--control-bam-file", dest="controlfiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g", "--gtf-file", dest="gtffile", type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max",
                               "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice", action="append",
                      choices=("all", "none", "area", "counts",
                               "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r", "--reporter", dest="reporter", type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i", "--shift-size", dest="shifts", type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u", "--use-base-accuracy", dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extends", type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream", dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds", dest="resolution_cds", type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon", dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon", dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns", dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward", dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward", dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length", dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor", dest="control_factor", type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles", dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file", dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.add_option(
        "--output-res",
        dest="resolution_images",
        type="int",
        help="the output dpi for the figure plot - will default to "
        "[%default]")

    parser.add_option(
        "--image-format",
        dest="image_format",
        type="string",
        help="The output format for the figure plot - defaults to "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance of seeing long enough 3 prime bias and not
        # omit too many genes. Tim 31th Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns is only for assess the noise level, thus do not need a
        # long region, a long region has the side effect of omit more
        # genes. Tim 31th Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
        resolution_images=None,
        image_format="png",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility: two positional arguments are
    # interpreted as <infile> <gtf>.
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    # from here on options.gtffile is an open stream, not a filename
    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = iotools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implemented any methods that you do not want the
        # spliced out introns or exons appear to be covered by
        # non-existent reads, it is better you let those methods imply
        # --use-base-accuracy by adding them here.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type. Only the first input file is
    # inspected; all files are expected to be of the same type.
    first_infile = options.infiles[0]

    if first_infile.endswith(".bam"):
        bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]
        if options.controlfiles:
            controlfiles = [
                pysam.AlignmentFile(x, "rb") for x in options.controlfiles
            ]
        else:
            controlfiles = None

        if options.merge_pairs:
            # NOTE: the keyword is ``controlfiles`` as in every other
            # call below; it was previously misspelt in this branch.
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                shifts=options.shifts,
                extends=options.extends,
                merge_pairs=options.merge_pairs,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.shifts or options.extends:
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                shifts=options.shifts,
                extends=options.extends,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.base_accuracy:
            range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                bamfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        else:
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

    elif first_infile.endswith(".bed.gz"):
        bedfiles = [pysam.Tabixfile(x) for x in options.infiles]
        if options.controlfiles:
            controlfiles = [
                pysam.Tabixfile(x) for x in options.controlfiles
            ]
        else:
            controlfiles = None

        range_counter = geneprofile.RangeCounterBed(
            bedfiles,
            controlfiles=controlfiles,
            control_factor=options.control_factor)

    elif first_infile.endswith(".bw"):
        # BigWig is a binary format, so open the files in binary mode.
        # The handles must stay open for the lifetime of the counter.
        wigfiles = [BigWigFile(file=open(x, "rb")) for x in options.infiles]
        range_counter = geneprofile.RangeCounterBigWig(wigfiles)

    else:
        raise NotImplementedError("can't determine file type for %s" %
                                  str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31th Aug 2013: a possible feature for future, if five
            # prime bias is of your interest.
            # (you need to create another class). It is not very difficult
            # to derive from this class, but is not implemented yet
            # This future feature is slightly different the TSS profile
            # already implemented, because in this future feature introns
            # are skipped,
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(range_counter,
                                       options.extension_outward,
                                       options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(range_counter,
                                          options.resolution_upstream,
                                          options.resolution_cds,
                                          options.resolution_downstream,
                                          options.extension_upstream,
                                          options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(range_counter,
                                            options.resolution_upstream,
                                            options.resolution_downstream,
                                            options.extension_upstream,
                                            options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for reach gene
        # gtf should be sorted gene-position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                iotools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(
            iotools.open_file(options.input_filename_counts),
            sep='\t', header=0, index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output:
        # one row per bin, one column per normalization
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with iotools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            # expand region names and within-region bin indices so that
            # they line up with the rows of the matrix
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" %
                              ("\t".join([str(x) for x in cols[-1]])))

        with iotools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        def save_figure(section):
            # write the current figure to the output file for *section*
            fn = E.get_output_file(section) + "." + options.image_format
            plt.savefig(os.path.expanduser(fn),
                        format=options.image_format,
                        dpi=options.resolution_images)

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile",
                          "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                # first figure: one panel per region on a common y-scale
                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                save_figure(counter.name + ".full")

                # second figure: all regions concatenated, with region
                # boundaries marked and labels at the region midpoints
                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                save_figure(counter.name + ".detail")

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(range(-options.extension_outward,
                               options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(range(-options.extension_inward,
                               options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(range(-options.extension_outward,
                               options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(range(-options.extension_inward,
                               options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                save_figure(counter.name)

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                save_figure(counter.name)

    # write footer and output benchmark information.
    E.stop()