def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering by fdr.

    For each file in *infiles* the peaks in
    :file:`<infile>_peaks.xls.gz` are read and, for every fdr threshold
    in ``numpy.arange(0, 1.05, 0.05)``, the number of peaks with an fdr
    less than or equal to that threshold is counted.

    *outfile* receives a tab-separated table with a header row
    (``track`` followed by the thresholds) and one row per input file.
    The track name is the basename of the input file with the
    :file:`.macs` suffix removed by ``P.snip``.
    '''
    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    # Context managers make sure that neither the output table nor any of
    # the per-track peak files stay open, even if parsing fails half-way.
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

        for infile in infiles:
            track = P.snip(os.path.basename(infile), ".macs")
            infilename = infile + "_peaks.xls.gz"

            # Only the fdr of each peak is needed for the summary, so the
            # file can be closed as soon as these have been collected.
            with IOTools.openFile(infilename) as inf:
                fdrs = [peak.fdr for peak in WrapperMACS.iteratePeaks(inf)]

            called = [sum(1 for fdr in fdrs if fdr <= threshold)
                      for threshold in fdr_thresholds]

            outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results in *tablename*

    This method loads only positive peaks. Peaks are filtered by
    q-value (``macs_max_qvalue``) and p-value (``macs_min_pvalue``).
    For every peak that passes, peakcenter, peakval, ... are
    re-calculated by ``countPeaks`` using the supplied bamfile.

    If *tablename* is not given, it will be
    :file:`<track>_macs_intervals` where track is derived from
    ``infile`` and assumed to end in :file:`.macs`.

    If :file:`<track>_peaks.xls` does not exist, a warning is issued,
    *outfile* is touched and nothing is loaded.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create the MACS peak-shift plot :file:`<track>_model.pdf`. If
      *infile* lives in a sub-directory, the plot is moved into that
      directory.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering.

    Raises an AssertionError if the peak shift can not be determined
    from *infile* or if a peak has ``start >= end``.
    '''
    # NOTE(review): P.run() is called without arguments; it presumably
    # picks up ``statement`` and the names interpolated into it
    # (filename_r, filename_rlog, filename_pdf, folder, tablename,
    # tmpfilename, outfile, filename_diag) from this function's local
    # scope. Do not rename these variables without confirming.
    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)

    # os.path.join() returns the bare file name for an empty folder, so
    # a single code path covers both the "sub-directory" and the
    # "current directory" case.
    infilename = os.path.join(folder, track + "_peaks.xls")
    filename_diag = os.path.join(folder, track + "_diag.xls")
    filename_r = os.path.join(folder, track + "_model.r")
    filename_rlog = os.path.join(folder, track + ".r.log")
    # The pdf name carries no directory; the plot is moved into
    # ``folder`` after R has run (see below).
    filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if folder:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''

        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    samfiles = [pysam.Samfile(bamfile, "rb")]
    # NOTE(review): under Python 3 this is a true division and yields a
    # float - confirm that countPeaks accepts a non-integer offset.
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    counter = E.Counter()
    interval_id = 0

    # The temporary file and the BAM file are closed in the ``finally``
    # clause so that no handle is leaked if parsing or counting fails.
    try:
        outtemp.write("\t".join((
            "interval_id",
            "contig", "start", "end",
            "npeaks", "peakcenter",
            "length",
            "avgval",
            "peakval",
            "nprobes",
            "pvalue", "fold", "qvalue",
            "macs_summit", "macs_nprobes",
        )) + "\n")

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperMACS.iteratePeaks(ins):

                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue
                elif peak.pvalue < min_pvalue:
                    counter.removed_pvalue += 1
                    continue

                assert peak.start < peak.end

                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                    peak.contig, peak.start, peak.end, samfiles, offsets)

                outtemp.write("\t".join(map(str, (
                    interval_id,
                    peak.contig, peak.start, peak.end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    peak.pvalue, peak.fold, peak.fdr,
                    peak.start + peak.summit - 1,
                    peak.tags))) + "\n")
                interval_id += 1
                counter.output += 1
    finally:
        outtemp.close()
        for samfile in samfiles:
            samfile.close()

    # output filtering summary
    with IOTools.openFile("%s.tsv.gz" % outfile, "w") as outf:
        outf.write("category\tcounts\n")
        outf.write("%s\n" % counter.asTable())

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track

    statement = '''cgat csv2db %(csv2db_options)s --allow-empty-file --add-index=interval_id --add-index=contig,start --table=%(tablename)s < %(tmpfilename)s > %(outfile)s '''

    P.run()

    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track

        # sed replaces the "FC range" header line of the diagnostic file
        # with tab-separated column names before loading.
        statement = ''' cat %(filename_diag)s | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" | cgat csv2db %(csv2db_options)s --map=fc:str --table=%(tablename)s >> %(outfile)s '''

        P.run()