Example #1
0
def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering
    by fdr.
    '''

    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

    for infile in infiles:
        called = []
        track = P.snip(os.path.basename(infile), ".macs")
        infilename = infile + "_peaks.xls.gz"
        inf = IOTools.openFile(infilename)
        peaks = list(WrapperMACS.iteratePeaks(inf))

        for threshold in fdr_thresholds:
            called.append(len([x for x in peaks if x.fdr <= threshold]))

        outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))

    outf.close()
def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering
    by fdr.
    '''

    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

    for infile in infiles:
        called = []
        track = P.snip(os.path.basename(infile), ".macs")
        infilename = infile + "_peaks.xls.gz"
        inf = IOTools.openFile(infilename)
        peaks = list(WrapperMACS.iteratePeaks(inf))

        for threshold in fdr_thresholds:
            called.append(len([x for x in peaks if x.fdr <= threshold]))

        outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))

    outf.close()
Example #3
0
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS 
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")
    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with IOTools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track
    statement = '''cgat csv2db %(csv2db_options)s 
                       --allow-empty-file
                       --add-index=interval_id 
                       --add-index=contig,start
                       --table=%(tablename)s 
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s 
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" 
        | cgat csv2db %(csv2db_options)s 
                  --map=fc:str 
                  --table=%(tablename)s 
        >> %(outfile)s
        '''
        P.run()
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS 
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")
    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with IOTools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track
    statement = '''cgat csv2db %(csv2db_options)s 
                       --allow-empty-file
                       --add-index=interval_id 
                       --add-index=contig,start
                       --table=%(tablename)s 
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s 
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" 
        | cgat csv2db %(csv2db_options)s 
                  --map=fc:str 
                  --table=%(tablename)s 
        >> %(outfile)s
        '''
        P.run()