def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.
    Intervals are renumbered starting from 1.

    The work done depends on the number of input files:

    * one file: the file is copied unchanged to *outfile*.
    * two files: intervals of the first file that overlap an interval
      of the second file are kept (``intersectBed -u``). The files
      are not normalized in this case.
    * three or more files: the files are normalized with ``mergeBed``
      (overlapping intervals within a file are merged) and
      intersected one after the other through a temporary file.

    With two or more input files, *outfile* is created through
    ``P.touch`` without running any command if one of the inputs is
    empty. Otherwise the output is compressed with ``bgzip``.

    Arguments
    ---------
    infiles : list
        Filenames of the :term:`bed` files to intersect.
    outfile : string
        Filename of the output file.
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    # The intersection with an empty file is empty. Every input is
    # checked before any temporary file is created, so nothing is
    # left behind on this early exit.
    if any(IOTools.isEmpty(x) for x in infiles):
        P.touch(outfile)
        return

    if len(infiles) == 2:
        # the input filenames are inserted here, %(outfile)s is left
        # for P.run() to fill in
        statement = (
            " intersectBed -u -a %s -b %s"
            " | cut -f 1,2,3,4,5"
            """ | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'"""
            " | bgzip > %%(outfile)s "
        ) % (infiles[0], infiles[1])
        P.run()
        return

    # need to merge incrementally
    # P.run() fills the %(...)s placeholders from the local variables
    # of this function, hence ``fn``, ``tmpfile`` and ``outfile``
    # must keep their names.
    fn = infiles[0]
    tmpfile = P.getTempFilename(".")
    try:
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            statement = (
                "mergeBed -i %(fn)s"
                " | intersectBed -u -a %(tmpfile)s -b stdin"
                " > %(tmpfile)s.tmp;"
                " mv %(tmpfile)s.tmp %(tmpfile)s"
            )
            P.run()

        statement = (
            "cat %(tmpfile)s"
            " | cut -f 1,2,3,4,5"
            """ | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'"""
            " | bgzip > %(outfile)s "
        )
        P.run()
    finally:
        # remove the temporary file even if one of the commands failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def subtractBedFiles(infile, subtractfile, outfile):
    '''remove intervals overlapping *subtractfile* from *infile*
    and save the remainder in *outfile*.

    Intervals of *infile* without any overlap in *subtractfile* are
    kept (``intersectBed -v``), cut down to the first five columns
    and renumbered from 1 in column 4. The result is compressed with
    ``bgzip`` and indexed with ``tabix``.

    Two special cases are handled without running a command:

    * *subtractfile* is empty: *infile* is copied unchanged to
      *outfile*. This check is made first.
    * *infile* is empty: *outfile* is created through ``P.touch``.

    Arguments
    ---------
    infile : string
        Filename of the :term:`bed` file to subtract from.
    subtractfile : string
        Filename of the :term:`bed` file with intervals to remove.
    outfile : string
        Filename of the output file.
    '''

    # nothing to subtract - pass the input through as it is
    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return

    # nothing to subtract from
    if IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    # P.run() fills the %(...)s placeholders from the local variables
    # of this function.
    statement = (
        " intersectBed -v -a %(infile)s -b %(subtractfile)s"
        " | cut -f 1,2,3,4,5"
        """ | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'"""
        " | bgzip > %(outfile)s ; tabix -p bed %(outfile)s "
    )
    P.run()
def _filterZinbaPeaks(infile, infilename, track, bamfile, controlfile,
                      outtemp, counter):
    '''write the Zinba peaks in *infilename* that pass filtering
    to *outtemp*.

    Peaks with an FDR above ``PARAMS["zinba_fdr_threshold"]`` are
    dropped. If *controlfile* is given, peaks whose peak value in the
    control exceeds half the estimated tag size of the control are
    dropped as well. Removed and written peaks are tallied in
    *counter* (``removed_qvalue``, ``removed_control``, ``output``).

    The BAM files are closed before returning, also on error.
    '''
    shift = getPeakShiftFromZinba(infile)
    assert shift is not None, \
        "could not determine peak shift from Zinba file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    # stays empty (and thus false) when there is no control
    controlfiles = []
    try:
        # floor division keeps the offset an integer on Python 3,
        # in line with the control threshold computed below
        offsets = [shift // 2]

        if controlfile:
            controlfiles.append(pysam.Samfile(controlfile, "rb"))
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak "
                   "higher than %i reads" % control_max_peakval)

        interval_id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    (npeaks, peakcenter, length,
                     avgval, peakval, nreads) = countPeaks(
                         peak.contig,
                         peak.refined_start,
                         peak.refined_end,
                         controlfiles,
                         offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak, values are re-computed from the bamfile
                (npeaks, peakcenter, length,
                 avgval, peakval, nreads) = countPeaks(
                     peak.contig,
                     peak.refined_start,
                     peak.refined_end,
                     samfiles,
                     offsets)

                outtemp.write("\t".join(map(str, (
                    interval_id,
                    peak.contig,
                    peak.refined_start,
                    peak.refined_end,
                    npeaks,
                    peakcenter,
                    length,
                    avgval,
                    peakval,
                    nreads,
                    1.0 - peak.posterior,
                    1.0,
                    peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                interval_id += 1
                counter.output += 1
    finally:
        for samfile in samfiles + controlfiles:
            samfile.close()


def loadZinba(infile, outfile, bamfile, tablename=None, controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by
    q-value (``zinba_fdr_threshold``) and re-calculates peakcenter,
    peakval, ... using the supplied *bamfile*.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end in
    :file:`.zinba`. Peaks are read from :file:`<infile>.peaks`.

    If that file is missing or empty, or no peak passes filtering, a
    warning is issued and a table without rows is loaded.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering.

    This method uses the refined peak locations.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus, if *controlfile* is given, this method applies
    a filtering step removing all intervals in which there is a peak
    of more than readlength / 2 height in the control.

    In the table, ``pvalue`` is ``1 - posterior`` of the peak,
    ``fold`` is always 1.0, ``macs_summit`` is the refined start plus
    the summit offset minus one and ``macs_nprobes`` is the peak
    height reported by Zinba.

    .. note::

       Zinba peaks can be overlapping. This method does not merge
       overlapping intervals.

    Arguments
    ---------
    infile : string
        Zinba output file, expected to end in :file:`.zinba`.
    outfile : string
        Receives the output of the database load.
    bamfile : string
        :term:`bam` file used to re-compute the peak statistics.
    tablename : string
        Name of the database table. Derived from *infile* if None.
    controlfile : string
        Optional :term:`bam` file of the control.
    '''

    track = P.snip(os.path.basename(infile), ".zinba")

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    try:
        outtemp.write("\t".join((
            "interval_id",
            "contig", "start", "end",
            "npeaks", "peakcenter",
            "length",
            "avgval",
            "peakval",
            "nprobes",
            "pvalue", "fold", "qvalue",
            "macs_summit", "macs_nprobes",
        )) + "\n")

        counter = E.Counter()

        if not os.path.exists(infilename):
            E.warn("could not find %s" % infilename)
        elif IOTools.isEmpty(infilename):
            E.warn("no data in %s" % infilename)
        else:
            # filter peaks
            _filterZinbaPeaks(infile, infilename, track,
                              bamfile, controlfile,
                              outtemp, counter)

        # flush the table to disk before it is read by csv2db
        outtemp.close()

        # output filtering summary
        with IOTools.openFile("%s.tsv.gz" % outfile, "w") as outf:
            outf.write("category\tcounts\n")
            outf.write("%s\n" % counter.asTable())

        E.info("%s filtering: %s" % (track, str(counter)))
        if counter.output == 0:
            E.warn("%s: no peaks found" % track)

        # load data into table
        if tablename is None:
            tablename = "%s_intervals" % track

        # P.run() fills the %(...)s placeholders from the local
        # variables of this function (and the pipeline parameters).
        statement = (
            " cgat csv2db %(csv2db_options)s"
            " --allow-empty-file"
            " --add-index=interval_id"
            " --add-index=contig,start"
            " --table=%(tablename)s"
            " < %(tmpfilename)s > %(outfile)s "
        )
        P.run()
    finally:
        # closing twice is harmless; make sure the temporary file
        # does not outlive a failure
        outtemp.close()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.
    Intervals are renumbered starting from 1.

    The work done depends on the number of input files:

    * one file: the file is copied unchanged to *outfile*.
    * two files: intervals of the first file that overlap an interval
      of the second file are kept (``intersectBed -u``). The files
      are not normalized in this case.
    * three or more files: the files are normalized with ``mergeBed``
      (overlapping intervals within a file are merged) and
      intersected one after the other through a temporary file.

    With two or more input files, *outfile* is created through
    ``IOTools.touch_file`` without running any command if one of the
    inputs is empty. The output is not compressed.

    Arguments
    ---------
    infiles : list
        Filenames of the :term:`bed` files to intersect.
    outfile : string
        Filename of the output file.
    '''
    # NOTE(review): presumably picked up by P.run() from the local
    # variables of this function - confirm it is still honoured.
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    # The intersection with an empty file is empty. Every input is
    # checked before any temporary file is created, so nothing is
    # left behind on this early exit.
    if any(IOTools.is_empty(x) for x in infiles):
        IOTools.touch_file(outfile)
        return

    if len(infiles) == 2:
        # the input filenames are inserted here, %(outfile)s is left
        # for P.run() to fill in
        statement = (
            " intersectBed -u -a %s -b %s"
            " | cut -f 1,2,3,4,5"
            """ | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'"""
            " > %%(outfile)s "
        ) % (infiles[0], infiles[1])
        P.run(statement)
        return

    # need to merge incrementally
    # P.run() fills the %(...)s placeholders from the local variables
    # of this function, hence ``fn``, ``tmpfile`` and ``outfile``
    # must keep their names.
    fn = infiles[0]
    tmpfile = P.get_temp_filename(".")
    try:
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            statement = (
                "mergeBed -i %(fn)s"
                " | intersectBed -u -a %(tmpfile)s -b stdin"
                " > %(tmpfile)s.tmp;"
                " mv %(tmpfile)s.tmp %(tmpfile)s"
            )
            P.run(statement)

        statement = (
            "cat %(tmpfile)s"
            " | cut -f 1,2,3,4,5"
            """ | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'"""
            " > %(outfile)s "
        )
        P.run(statement)
    finally:
        # remove the temporary file even if one of the commands failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)