def readAndIndex(iterator, with_value=True):
    '''Read entries from a gtf stream and store them in an interval index.

    Arguments
    ---------
    iterator : iterable
        Yields entries exposing ``contig``, ``start`` and ``end``.
    with_value : bool
        If true, every entry is stored together with its interval.
        Otherwise only the interval coordinates are indexed.

    Returns
    -------
    index : an object of type :class:`IndexedGenome.IndexedGenome`
        when `with_value` is true, otherwise an object of type
        :class:`IndexedGenome.Simple`.
    '''
    if not with_value:
        # coordinates only, no payload attached to the intervals
        coordinate_index = IndexedGenome.Simple()
        for entry in iterator:
            coordinate_index.add(entry.contig, entry.start, entry.end)
        return coordinate_index

    # the entry itself is kept as the value of its interval
    value_index = IndexedGenome.IndexedGenome()
    for entry in iterator:
        value_index.add(entry.contig, entry.start, entry.end, entry)
    return value_index
def buildQuicksectMask(bed_file):
    '''Return a Quicksect object containing the regions to mask.

    Arguments
    ---------
    bed_file : string
        Filename of a bed file listing the regions to mask.

    Returns
    -------
    mask : an object of type :class:`IndexedGenome.Quicksect`
        Each region of `bed_file` is added with the value ``1`` after
        being extended by one base on either side.
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    # the context manager makes sure the input handle is released again,
    # even if parsing fails half-way through the file
    with iotools.openFile(bed_file) as inf:
        for bed in Bed.iterator(inf):
            # it is necessary to extend the region to make an accurate mask
            mask.add(bed.contig, bed.start - 1, bed.end + 1, 1)
            n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return mask
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''Compute correlation of interval properties between sets.

    For every track the column `field` is read from the database table
    ``<track>_intervals`` and indexed by genomic position. For each
    interval in `reference` the maximum indexed value overlapping it is
    reported per track.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.bed.gz``; the track name is the filename
        with that suffix removed.
    outfile : string
        Filename of the tab-separated output table. Its columns are
        ``contig``, ``start``, ``end``, ``id`` and one column per track.
    field : string
        Name of the database column to collect.
    reference : string
        Filename of a bed file with the reference intervals.
    '''
    tracks, idx = [], []

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    try:
        for infile in infiles:
            track = P.snip(infile, ".bed.gz")
            tablename = "%s_intervals" % P.tablequote(track)
            # column and table names are identifiers and can not be
            # passed as bound SQL parameters, hence the string formatting
            statement = "SELECT contig, start, end, %s FROM %s" % (
                field, tablename)
            cc = dbhandle.cursor()
            cc.execute(statement)

            ix = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in cc:
                ix.add(contig, start, end, peakval)
            idx.append(ix)
            tracks.append(track)
    finally:
        # all values have been copied into the indices, so the
        # connection is released whether or not the queries succeeded
        dbhandle.close()

    with iotools.openFile(outfile, "w") as outs:
        outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

        with iotools.openFile(reference, "r") as ref:
            for bed in Bed.iterator(infile=ref):
                row = []
                for ix in idx:
                    try:
                        intervals = list(
                            ix.get(bed.contig, bed.start, bed.end))
                    except KeyError:
                        # the lookup failed with a KeyError for this
                        # contig: report an empty field
                        row.append("")
                        continue

                    if intervals:
                        # third element of each hit is the stored value
                        row.append(str(max(x[2] for x in intervals)))
                    else:
                        row.append("")

                outs.write(str(bed) + "\t" + "\t".join(row) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly one positional argument, the bam file with reads
    mapped against the genome. The filtered reads are written to stdout
    in bam format, or in sam format with ``--output-sam``.

    Raises
    ------
    ValueError
        If the number of positional arguments is not one.
    IOError
        If the output file for mismapped reads exists and
        ``--force-output`` has not been given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions",
        type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option(
        "-u", "--unique", dest="unique", action="store_true",
        help="remove reads not matching uniquely [%default]")

    parser.add_option(
        "--output-sam", dest="output_sam", action="store_true",
        help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        # the next two are read below; spell out their defaults like the
        # other filename options instead of relying on the parser
        filename_regions=None,
        filename_stats=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        with iotools.open_file(options.filename_map) as inf:
            id_map = iotools.read_map(inf, has_header=True)
        # invert the map so that it can be queried with the identifiers
        # found in the gtf file
        id_map = {y: x for x, y in id_map.items()}
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        with iotools.open_file(options.filename_gtf) as inf:
            for gtf in GTF.transcript_iterator(GTF.iterator(inf)):
                gtf.sort(key=lambda x: x.start)
                transcript_id = gtf[0].transcript_id
                if id_map:
                    try:
                        transcript_id = id_map[transcript_id]
                        mapped += 1
                    except KeyError:
                        # transcripts absent from the map are dropped
                        missed += 1
                        continue
                transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        with iotools.open_file(options.filename_regions) as inf:
            for bed in Bed.iterator(inf):
                regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    # the genome bam file provides the header for all output files
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        with iotools.open_file(options.filename_stats, "w") as outf:
            outf.write("category\tcounts\n%s\n" % c.asTable())

    # release all alignment files, including the junctions file
    if transcripts_samfile is not None:
        transcripts_samfile.close()
    if junctions_samfile is not None:
        junctions_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped is not None:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    The entries in `gff_data` are indexed by position. For each window
    the overlapping entries are collected and summarised, and one
    tab-separated line is written to ``options.stdout`` with the fields:
    contig, start, end, number of genes, number of transcripts, number
    and total length of overlapping intervals before transformation,
    number and total length after transformation, score and extra
    information.

    Arguments
    ---------
    contig : string
        Contig that all `windows` are located on.
    windows : iterable
        ``(start, end)`` tuples.
    gff_data : iterable
        Entries exposing ``contig``, ``start``, ``end`` and ``score``,
        plus ``gene_id`` and ``transcript_id`` if ``options.is_gtf`` is
        set.
    fasta : object
        Passed through unchanged to the interval based decorators.
    options : object
        Reads ``is_gtf``, ``transform``, ``decorator``, ``skip_empty``,
        ``loglevel``, ``stdlog`` and ``stdout``.

    Raises
    ------
    ValueError
        If ``options.transform`` or ``options.decorator`` is unknown.
    """

    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    is_gtf = options.is_gtf

    def _coordinates_only(window_start, window_end, hits):
        # "none" transform: drop the values, keep (start, end)
        return [(hit[0], hit[1]) for hit in hits]

    if options.transform == "none":
        transform = _coordinates_only
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # decorators either work on the transformed intervals or on the
    # scores of the overlapping entries
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0

        values, intervals_with_gff, genes, transcripts = \
            [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    transcripts.add(value.transcript_id)
        except KeyError:
            # a failed lookup is treated like a window without overlaps
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # report the untransformed intervals; the transformed
                # ones do not exist yet at this point
                options.stdlog.write(
                    "# intervals in window %i:%i before "
                    "transformation: %s\n" %
                    (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after "
                    "transformation: %s\n" %
                    (start, end, str(intervals)))

            score, extra_info = decorator(
                intervals, start, end, contig, fasta)

        else:
            # n2 and l2 stay at 0, no transformation is applied
            if len(values) > 0:
                values = list(map(float, values))
                score, extra_info = decorator(values, start, end, contig)
            else:
                score, extra_info = 0, None

        if is_gtf:
            ngenes, ntranscripts = len(genes), len(transcripts)
        else:
            ngenes, ntranscripts = 0, 0

        if extra_info:
            # keep the output tab-separated
            extra_info = re.sub("\t", ";", extra_info)

        options.stdout.write("\t".join(
            map(str, (contig, start, end,
                      ngenes, ntranscripts,
                      n1, l1,
                      n2, l2,
                      score,
                      extra_info))) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    All bed files are concatenated and merged. For every merged interval
    the number of input files with at least one entry inside it is
    written to ``args.stdout`` as a table with the columns ``contig``,
    ``start``, ``end`` and ``count``.

    Raises
    ------
    ValueError
        If no bed file has been supplied.
    """
    # local import: needed for the clean-up of the temporary files
    import os

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--bed-file", dest="infiles", type=str,
                        metavar="bed",
                        help="supply list of bed files",
                        action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    # positional arguments are treated as additional bed files
    args.infiles.extend(unknown)
    if len(args.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    # only the name of the second file is needed, it is written by name
    tmp_merge.close()

    try:
        with tmp:
            for inf in args.infiles:
                with iotools.open_file(inf) as handle:
                    for bed in Bed.iterator(handle):
                        tmp.write("%s\n" % bed)

        E.info("merging bed entries")
        # merge the bed entries in the file
        tmp_bed = pybedtools.BedTool(tmp.name)
        tmp_bed.sort().merge().saveas(tmp_merge.name)

        E.info("indexing bed entries")
        # index the bed entries
        merged = IndexedGenome.Simple()
        with iotools.open_file(tmp_merge.name) as handle:
            for bed in Bed.iterator(handle):
                merged.add(bed.contig, bed.start, bed.end)
    finally:
        # the files were created with delete=False and have to be
        # removed explicitly
        for path in (tmp.name, tmp_merge.name):
            try:
                os.unlink(path)
            except OSError:
                pass

    counts = collections.defaultdict(int)
    # list of samples
    samples = args.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        # every merged interval is counted at most once per sample
        found = set()
        with iotools.open_file(sample) as handle:
            for bed in Bed.iterator(handle):
                if not merged.contains(bed.contig, bed.start, bed.end):
                    continue
                # the first overlapping merged interval identifies the hit
                first = list(merged.get(bed.contig, bed.start, bed.end))[0]
                # tuple of interval description as key -
                # (contig, start, end)
                key = (bed.contig, first[0], first[1])
                if key in found:
                    continue
                found.add(key)
                counts[key] += 1

    # open outfile
    args.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        args.stdout.write(
            "\t".join(map(str, interval)) + "\t" + str(count) + "\n")

    # write footer and output benchmark information.
    E.stop()
def __call__(self, track, slice=None):
    '''Return ROC data for every field in ``self.mFields``.

    The intervals of all replicates of `track` are merged into a common
    set of regions. Each region is labelled with the maximum value of
    the field over all replicates and with a flag stating whether every
    replicate has an interval in the region. The sorted labels are
    passed to :func:`Stats.computeROC`.

    Returns
    -------
    result : odict
        Maps each field to an odict with the keys ``FPR`` and the field
        name.
    '''

    def _merge_sorted(records):
        # sort in place and collapse overlapping or adjacent records
        # into (contig, start, end) regions
        records.sort()
        cur_contig, cur_start, cur_end, _ = records[0]
        for contig, start, end, _value in records[1:]:
            if contig != cur_contig or cur_end < start:
                yield (cur_contig, cur_start, cur_end)
                cur_contig, cur_start, cur_end = contig, start, end
            else:
                cur_end = max(cur_end, end)
        yield (cur_contig, cur_start, cur_end)

    result = odict()
    merged = None

    for field in self.mFields:
        data = []
        for replicate in EXPERIMENTS.getTracks(track):
            statement = (
                "SELECT contig, start, end,%(field)s "
                "FROM %(replicate)s_intervals" %
                {"field": field, "replicate": replicate})
            data.append(self.get(statement))

        # one index per replicate
        idx = []
        for rows in data:
            replicate_index = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in rows:
                replicate_index.add(contig, start, end, peakval)
            idx.append(replicate_index)

        # the merged regions are built once and re-used for later fields
        if not merged:
            pooled = list(itertools.chain(*data))
            merged = list(_merge_sorted(pooled))

        roc_data = []
        for contig, start, end in merged:
            intervals = []
            for replicate_index in idx:
                try:
                    hits = list(replicate_index.get(contig, start, end))
                except KeyError:
                    continue
                intervals.append(hits)

            if len(intervals) == 0:
                continue

            # reproducible if every replicate has a hit in the region
            n_with_hits = sum(1 for hits in intervals if hits)
            is_repro = n_with_hits == len(data)
            value = max(
                hit[2] for hit in itertools.chain.from_iterable(intervals))

            # fpr, tpr
            roc_data.append((value, is_repro))

        roc_data.sort()
        roc_data.reverse()

        roc = list(zip(*Stats.computeROC(roc_data)))
        result[field] = odict((("FPR", roc[0]), (field, roc[1])))

    return result