Esempio n. 1
0
def extractScore(benchDir, benchInputBedPath, args, repSuffix = ""):
    """ Reduce entire benchmark output into a single score value """

    # comparison file lives next to the benchmark output, named after the
    # input bed with a "_comp.txt" suffix (plus an optional replicate suffix)
    baseName = os.path.splitext(os.path.basename(benchInputBedPath))[0]
    compPath = os.path.join(benchDir, baseName + "_comp.txt" + repSuffix)
    baseStats, intStats, weightedStats = extractCompStatsFromFile(compPath)
    # interval-level stats by default; base-level stats when --base was given
    stats = baseStats if args.base is True else intStats
    scores = []
    for state in args.states.split(","):
        if state not in stats:
            # missing state contributes a zero rather than aborting the run
            logger.warning("State %s not found in intstats %s. giving 0" % (
                state, str(stats)))
            scores.append(0)
            continue

        prec = stats[state][0]
        # recall can be skewed to bias the F1 toward recall or precision
        rec = stats[state][1] * args.recallSkew
        denom = prec + rec
        f1 = 2. * ((prec * rec) / denom) if denom > 0 else 0
        if args.score == "prec":
            scores.append(prec)
        elif args.score == "rec":
            scores.append(rec)
        else:
            scores.append(f1)

    # single summary value: unweighted mean over the requested states
    return np.mean(scores)
Esempio n. 2
0
    runParallelShellCommands(compCmds, args.proc)

    # munging ############
    def prettyAcc(acc, spec):
        """ Format a (precision, recall) pair and a specificity value into
        4-decimal strings, along with the derived F1 score.

        acc: (precision, recall) tuple
        spec: specificity value
        Returns (prec, rec, f1, spec) as "%.4f"-formatted strings.

        Note: replaces the Python-2-only tuple-parameter signature
        (removed by PEP 3113), so this also runs under Python 3.
        """
        prec, rec = acc
        f1 = 0.
        if prec + rec > 0:
            # harmonic mean of precision and recall
            f1 = (2. * prec * rec) / (prec + rec)
        return ("%.4f" % prec, "%.4f" % rec, "%.4f" % f1, "%.4f" % spec)
    
    #table in memory
    table = dict()
    for i in xrange(len(tests)):
        for j in xrange(len(truths)):
            opath = os.path.join(args.workDir, "%s_vs_%s.txt" % (testNames[i], truthNames[j]))
            stats = extractCompStatsFromFile(opath)[0]
            if args.state not in stats:
                stats[args.state] = (0,0)
            specificity = extract2ClassSpecificityFromFile(opath, args.state)
            table[(i, j)] = prettyAcc(stats[args.state], specificity)

    csvFile = open(args.outCSV, "w")
    
    header = "test"
    for name in truthNames:
        header += ", F1 " + name
    csvFile.write(header + "\n")

    for i in xrange(len(tests)):
        line = testNames[i]
        for j in xrange(len(truths)):
Esempio n. 3
0
        for j, truthName in enumerate(truthNames):
            cmd = "compareBedStates.py %s %s --tl %s %s > %s" % (getTruthPath(j), predPaths[i], compTracksPath, maskFlags, getCompPath(j, i))
            compCmds.append(cmd)
    runParallelShellCommands(compCmds, 10)

# munging ############
def prettyAcc(prec, rec):
    """ Render precision, recall and the derived F1 as 4-decimal strings. """
    denom = prec + rec
    # F1 is the harmonic mean; guard against a zero denominator
    f1 = (2. * prec * rec) / denom if denom > 0 else 0.
    return ["%.4f" % v for v in (prec, rec, f1)]

if startPoint <= 6:
    # step 6: summarize per-prediction accuracy against each truth set
    statsPath = "stats.csv"
    statsFile = open(statsPath, "w")
    # one Prec/Rec/F1 column triple per truth set, after the fixed columns
    colHeads = "".join(",%s Prec, %s Rec, %s F1" % (t, t, t)
                       for t in truthNames)
    statsFile.write(",Fit,Interpolate" + colHeads + "\n")

    for predIdx, predName in enumerate(predNames):
        row = "%s, %s, %s" % (predName, fits[predIdx],
                              interpolations[predIdx])
        for truthIdx, truthName in enumerate(truthNames):
            stats = extractCompStatsFromFile(
                getCompPath(truthIdx, predIdx))[compIdx]
            # absent TE state counts as zero precision/recall
            teStats = stats.get("TE", (0, 0))
            row += ", " + ",".join(prettyAcc(teStats[0], teStats[1]))
        statsFile.write(row + "\n")
    statsFile.close()
Esempio n. 4
0
        compPath = os.path.join(workPath, "%s_%s_comp.txt" % compSet)
        truthPathUnfiltered = bedPath(compSet[0], "gap_te")
        queryPathUnfiltered = bedPath(compSet[1], "gap_te")
        truthPath=bedPath(compSet[0], "gap_te_%s" % compName)
        queryPath = bedPath(compSet[1], "gap_te_%s" % compName)

        runShellCommand("filterBedScores.py %s --names TE %f %f"
                        " --rename 0 > %s" % (
                            truthPathUnfiltered, minScore, maxScore, truthPath))
        runShellCommand("filterBedScores.py %s --names TE %f %f"
                        " --rename 0 > %s" % (
                            queryPathUnfiltered, minScore, maxScore,queryPath))

        runShellCommand("compareBedStates.py %s %s > %s" % (
            truthPath, queryPath, compPath))
        baseStats, intervalStats, weightedStats = extractCompStatsFromFile(compPath)
        if "TE" not in baseStats:
            logger.warning("No TE elements in %s" % compPath)
            baseStats["TE"] = (-1., -1.)
            intervalStats["TE"] = (-1, -1.)
        assert "TE" in baseStats
        if compSet[0] not in prMap:
            prMap[compSet[0]] = dict()
            prIntMap[compSet[0]] = dict()
        prMap[compSet[0]][compSet[1]] = baseStats["TE"]
        prIntMap[compSet[0]][compSet[1]] = intervalStats["TE"]
        if compSet[1] not in prMap:
            prMap[compSet[1]] = dict()
            prIntMap[compSet[1]] = dict()
        prMap[compSet[1]][compSet[0]] = baseStats["TE"][1], baseStats["TE"][0]
        prIntMap[compSet[1]][compSet[0]] = (intervalStats["TE"][1],
Esempio n. 5
0
    # CSV header: fixed columns, then one precision/recall/f1/specificity
    # quadruplet per FDR-fit setting
    header = "states, trainSize, precision, recall, f1, specificity"
    for fdr in fdrs:
        header += ", fdrfit%.3f_precision, fdrfit%.3f_recall, fdrfit%.3f_f1, fdrfit%.3f_specificity" % (
            fdr, fdr, fdr, fdr)
    if len(fdrs) > 1:
        # second header row repeating just the fdr values under each group
        header += "\n,,,,"
        for fdr in fdrs:
            header += ", %.3f, %.3f, %.3f, %.3f" % (fdr, fdr, fdr, fdr)
    outFile.write(header + "\n")

    for bed in args.beds:
        # normalize "." separators to "_" then tokenize the file name;
        # assumes token 1 encodes the training size and token 3 the state
        # count -- TODO confirm against the bed-naming convention upstream
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        comp = os.path.join(args.outDir,
                            os.path.basename(bed).replace(".bed", "_comp.txt"))
        stats = extractCompStatsFromFile(comp)[0]
        if "TE" not in stats:
            # no TE calls at all: report zero precision/recall
            stats["TE"] = (0, 0)
        specificity = extract2ClassSpecificityFromFile(comp, "TE")
        line = "%d, %d" % (nStates, tSize) + "," + prettyAcc(
            stats["TE"], specificity)
        # append one column group per FDR-fit variant of the comparison
        for fdr in fdrs:
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            statsFdr = extractCompStatsFromFile(compFdr)[0]
            specFdr = extract2ClassSpecificityFromFile(compFdr, "TE")
            if "TE" not in statsFdr:
                statsFdr["TE"] = (0, 0)
            line += ", " + prettyAcc(statsFdr["TE"], specFdr)
        line += "\n"

        outFile.write(line)
Esempio n. 6
0
        compPath = os.path.join(workPath, "%s_%s_comp.txt" % compSet)
        truthPathUnfiltered = bedPath(compSet[0], "gap_te")
        queryPathUnfiltered = bedPath(compSet[1], "gap_te")
        truthPath = bedPath(compSet[0], "gap_te_%s" % compName)
        queryPath = bedPath(compSet[1], "gap_te_%s" % compName)

        runShellCommand("filterBedScores.py %s --names TE %f %f"
                        " --rename 0 > %s" %
                        (truthPathUnfiltered, minScore, maxScore, truthPath))
        runShellCommand("filterBedScores.py %s --names TE %f %f"
                        " --rename 0 > %s" %
                        (queryPathUnfiltered, minScore, maxScore, queryPath))

        runShellCommand("compareBedStates.py %s %s > %s" %
                        (truthPath, queryPath, compPath))
        baseStats, intervalStats, weightedStats = extractCompStatsFromFile(
            compPath)
        if "TE" not in baseStats:
            logger.warning("No TE elements in %s" % compPath)
            baseStats["TE"] = (-1., -1.)
            intervalStats["TE"] = (-1, -1.)
        assert "TE" in baseStats
        if compSet[0] not in prMap:
            prMap[compSet[0]] = dict()
            prIntMap[compSet[0]] = dict()
        prMap[compSet[0]][compSet[1]] = baseStats["TE"]
        prIntMap[compSet[0]][compSet[1]] = intervalStats["TE"]
        if compSet[1] not in prMap:
            prMap[compSet[1]] = dict()
            prIntMap[compSet[1]] = dict()
        prMap[compSet[1]][compSet[0]] = baseStats["TE"][1], baseStats["TE"][0]
        prIntMap[compSet[1]][compSet[0]] = (intervalStats["TE"][1],
Esempio n. 7
0
        return "%.4f, %.4f, %.4f, %.4f" % (prec, rec, f1, spec)

    # CSV header: fixed columns, then one precision/recall/f1/specificity
    # quadruplet per FDR-fit setting
    header = "states, trainSize, precision, recall, f1, specificity"
    for fdr in fdrs:
        header += ", fdrfit%.3f_precision, fdrfit%.3f_recall, fdrfit%.3f_f1, fdrfit%.3f_specificity" % (fdr, fdr, fdr, fdr)
    if len(fdrs) > 1:
        # second header row repeating just the fdr values under each group
        header += "\n,,,,"
        for fdr in fdrs:
            header += ", %.3f, %.3f, %.3f, %.3f" % (fdr, fdr, fdr, fdr)
    outFile.write(header + "\n")

    for bed in args.beds:
        # normalize "." separators to "_" then tokenize the file name;
        # assumes token 1 encodes the training size and token 3 the state
        # count -- TODO confirm against the bed-naming convention upstream
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        comp = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_comp.txt"))
        stats = extractCompStatsFromFile(comp)[0]
        if "TE" not in stats:
            # no TE calls at all: report zero precision/recall
            stats["TE"] = (0,0)
        specificity = extract2ClassSpecificityFromFile(comp, "TE")
        line = "%d, %d" % (nStates, tSize) + "," + prettyAcc(stats["TE"], specificity) 
        # append one column group per FDR-fit variant of the comparison
        for fdr in fdrs:
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            statsFdr = extractCompStatsFromFile(compFdr)[0]
            specFdr = extract2ClassSpecificityFromFile(compFdr, "TE")
            if "TE" not in statsFdr:
                statsFdr["TE"] = (0,0)
            line += ", " + prettyAcc(statsFdr["TE"], specFdr)
        line += "\n"

        outFile.write(line)
Esempio n. 8
0
    runParallelShellCommands(compCmds, args.proc)

    # munging ############
    def prettyAcc(acc, spec):
        """ Format a (precision, recall) pair and a specificity value into
        4-decimal strings, along with the derived F1 score.

        acc: (precision, recall) tuple
        spec: specificity value
        Returns (prec, rec, f1, spec) as "%.4f"-formatted strings.

        Note: replaces the Python-2-only tuple-parameter signature
        (removed by PEP 3113), so this also runs under Python 3.
        """
        prec, rec = acc
        f1 = 0.
        if prec + rec > 0:
            # harmonic mean of precision and recall
            f1 = (2. * prec * rec) / (prec + rec)
        return ("%.4f" % prec, "%.4f" % rec, "%.4f" % f1, "%.4f" % spec)

    #table in memory
    table = dict()
    for i in xrange(len(tests)):
        for j in xrange(len(truths)):
            opath = os.path.join(
                args.workDir, "%s_vs_%s.txt" % (testNames[i], truthNames[j]))
            stats = extractCompStatsFromFile(opath)[0]
            if args.state not in stats:
                stats[args.state] = (0, 0)
            specificity = extract2ClassSpecificityFromFile(opath, args.state)
            table[(i, j)] = prettyAcc(stats[args.state], specificity)

    csvFile = open(args.outCSV, "w")

    header = "test"
    for name in truthNames:
        header += ", F1 " + name
    csvFile.write(header + "\n")

    for i in xrange(len(tests)):
        line = testNames[i]
        for j in xrange(len(truths)):
Esempio n. 9
0
            compCmds.append(cmd)
    runParallelShellCommands(compCmds, 10)


# munging ############
def prettyAcc(prec, rec):
    """ Return [precision, recall, f1], each formatted to 4 decimals. """
    if prec + rec > 0:
        # harmonic mean of precision and recall
        f1 = (2. * prec * rec) / (prec + rec)
    else:
        f1 = 0.
    return ["%.4f" % prec, "%.4f" % rec, "%.4f" % f1]


if startPoint <= 6:
    # step 6: summarize per-prediction accuracy against each truth set
    statsPath = "stats.csv"
    statsFile = open(statsPath, "w")
    # one Prec/Rec/F1 column triple per truth set, after the fixed columns
    colHeads = "".join(",%s Prec, %s Rec, %s F1" % (t, t, t)
                       for t in truthNames)
    statsFile.write(",Fit,Interpolate" + colHeads + "\n")

    for predIdx, predName in enumerate(predNames):
        row = "%s, %s, %s" % (predName, fits[predIdx],
                              interpolations[predIdx])
        for truthIdx, truthName in enumerate(truthNames):
            stats = extractCompStatsFromFile(
                getCompPath(truthIdx, predIdx))[compIdx]
            # absent TE state counts as zero precision/recall
            teStats = stats.get("TE", (0, 0))
            row += ", " + ",".join(prettyAcc(teStats[0], teStats[1]))
        statsFile.write(row + "\n")
    statsFile.close()
Esempio n. 10
0
def harvestStats(genomePath, regions, outDir, modelerPath, truthPaths,
                 outStatsName, compIdx):
    """ Write a CSV of precision/recall/F1 accuracy for each region.

    For every region, three comparison files per truth target are read
    (modeler-vs-truth baseline, semisupervised modeler-fit, and a "cheat"
    truth-fit) and their TE precision/recall/F1 become column groups.  A
    final row holds base-count-weighted averages over all regions.

    genomePath: genome bed used to derive the fit beds
    regions: iterable of region descriptors (one CSV row each)
    outDir: directory containing comparison files; receives the CSV
    modelerPath: modeler prediction bed (names the baseline, counts bases)
    truthPaths: list of truth beds (one column group each)
    outStatsName: output CSV basename (".csv" is appended)
    compIdx: index into extractCompStatsFromFile()'s result tuple
             (presumably 0=base, 1=interval, 2=weighted -- confirm)
    """
    statPath = os.path.join(outDir, outStatsName) + ".csv"
    rows = []
    header = []
    totalBases = 0
    for i, region in enumerate(regions):
        # header cells are only accumulated while building the first row
        firstRow = len(rows) == 0
        if firstRow:
            header.append("region")
        row = []
        regionName = getRegionName(region, i)
        row.append(regionName)
        fitTgts = truthPaths
        modelerInputBed = getOutPath(modelerPath, outDir, regionName)
        modelerName = os.path.splitext(
            os.path.basename(modelerInputBed))[0].replace(regionName, "")
        # NOTE(review): modelerFitBed appears unused below -- kept in case
        # getOutPath has side effects; confirm and remove if it does not
        modelerFitBed = getOutPath(genomePath, outDir, regionName,
                                   "unsup_eval_fit_%s" % modelerName)
        if firstRow:
            header.append("numBases")
        numBases = countBases(modelerInputBed)
        totalBases += numBases
        row.append(numBases)

        for fitTgt in fitTgts:
            fitInputBed = getOutPath(fitTgt, outDir, regionName)
            fitName = os.path.splitext(
                os.path.basename(fitInputBed))[0].replace(regionName, "")
            # "Cheat" comparison where we fit with truth
            fitBed = getOutPath(genomePath, outDir, regionName,
                                "unsup_eval_fit_%s" % fitName)
            # NOTE(review): "comp_cheat"/"comp_modfit" lack the leading "_"
            # that "_comp_baseline" has -- assumed to match the names used
            # when these comparison files were generated; confirm upstream
            cheatCompFile = fitBed.replace(".bed", "comp_cheat.txt")
            # "Semisupervised" comparison where we fit with modeler
            compFile = fitBed.replace(".bed", "comp_modfit.txt")
            # Modeler comparison with truth (baseline)
            modCompFile = fitBed.replace(".bed", "_comp_baseline.txt")

            stats = extractCompStatsFromFile(compFile)[compIdx]
            statsCheat = extractCompStatsFromFile(cheatCompFile)[compIdx]
            statsMod = extractCompStatsFromFile(modCompFile)[compIdx]
            if "TE" not in stats:
                # no TE calls at all: score the whole group as 0 accuracy
                stats["TE"] = (0, 0)
                statsCheat["TE"] = (0, 0)
                statsMod["TE"] = (0, 0)

            # NOTE(review): labels mix [:17] and [:12] truncations of
            # fitName -- looks unintentional but kept for compatibility
            if firstRow:
                header += [
                    fitName[:17] + "_ModPrec", fitName[:12] + "_ModRec",
                    fitName[:17] + "_ModF1"
                ]
            row += prettyAcc(statsMod["TE"][0], statsMod["TE"][1])

            if firstRow:
                header += [
                    fitName[:17] + "_Prec", fitName[:12] + "_Rec",
                    fitName[:17] + "_F1"
                ]
            row += prettyAcc(stats["TE"][0], stats["TE"][1])

            if firstRow:
                header += [
                    fitName[:17] + "_PrecCheat", fitName[:12] + "_RecCheat",
                    fitName[:17] + "_F1Cheat"
                ]
            row += prettyAcc(statsCheat["TE"][0], statsCheat["TE"][1])

        if firstRow:
            rows.append(header)
        rows.append(row)

    # final row: each stat column averaged, weighted by region base count
    # (range instead of xrange keeps this runnable on both Python 2 and 3)
    row = ["total", totalBases]
    for col in range(2, len(rows[0])):
        val = sum((float(rows[i][1]) / float(totalBases)) * float(rows[i][col])
                  for i in range(1, len(rows)))
        row.append("%.4f" % val)
    rows.append(row)

    # "with" guarantees the file is closed even if a write fails
    with open(statPath, "w") as statFile:
        for row in rows:
            statFile.write(",".join(str(x) for x in row) + "\n")
Esempio n. 11
0
def harvestStats(genomePath, regions, outDir, modelerPath, truthPaths,
                 outStatsName, compIdx):
    """ Write a CSV of precision/recall/F1 accuracy for each region.

    For every region, three comparison files per truth target are read
    (modeler-vs-truth baseline, semisupervised modeler-fit, and a "cheat"
    truth-fit) and their TE precision/recall/F1 become column groups.  A
    final row holds base-count-weighted averages over all regions.

    genomePath: genome bed used to derive the fit beds
    regions: iterable of region descriptors (one CSV row each)
    outDir: directory containing comparison files; receives the CSV
    modelerPath: modeler prediction bed (names the baseline, counts bases)
    truthPaths: list of truth beds (one column group each)
    outStatsName: output CSV basename (".csv" is appended)
    compIdx: index into extractCompStatsFromFile()'s result tuple
             (presumably 0=base, 1=interval, 2=weighted -- confirm)
    """
    statPath = os.path.join(outDir, outStatsName) + ".csv"
    rows = []
    header = []
    totalBases = 0
    for i, region in enumerate(regions):
        # header cells are only accumulated while building the first row
        firstRow = len(rows) == 0
        if firstRow:
            header.append("region")
        row = []
        regionName = getRegionName(region, i)
        row.append(regionName)
        fitTgts = truthPaths
        modelerInputBed = getOutPath(modelerPath, outDir, regionName)
        modelerName = os.path.splitext(
            os.path.basename(modelerInputBed))[0].replace(regionName, "")
        # NOTE(review): modelerFitBed appears unused below -- kept in case
        # getOutPath has side effects; confirm and remove if it does not
        modelerFitBed = getOutPath(genomePath, outDir, regionName,
                                   "unsup_eval_fit_%s" % modelerName)
        if firstRow:
            header.append("numBases")
        numBases = countBases(modelerInputBed)
        totalBases += numBases
        row.append(numBases)

        for fitTgt in fitTgts:
            fitInputBed = getOutPath(fitTgt, outDir, regionName)
            fitName = os.path.splitext(
                os.path.basename(fitInputBed))[0].replace(regionName, "")
            # "Cheat" comparison where we fit with truth
            fitBed = getOutPath(genomePath, outDir, regionName,
                                "unsup_eval_fit_%s" % fitName)
            # NOTE(review): "comp_cheat"/"comp_modfit" lack the leading "_"
            # that "_comp_baseline" has -- assumed to match the names used
            # when these comparison files were generated; confirm upstream
            cheatCompFile = fitBed.replace(".bed", "comp_cheat.txt")
            # "Semisupervised" comparison where we fit with modeler
            compFile = fitBed.replace(".bed", "comp_modfit.txt")
            # Modeler comparison with truth (baseline)
            modCompFile = fitBed.replace(".bed", "_comp_baseline.txt")

            stats = extractCompStatsFromFile(compFile)[compIdx]
            statsCheat = extractCompStatsFromFile(cheatCompFile)[compIdx]
            statsMod = extractCompStatsFromFile(modCompFile)[compIdx]
            if "TE" not in stats:
                # no TE calls at all: score the whole group as 0 accuracy
                stats["TE"] = (0, 0)
                statsCheat["TE"] = (0, 0)
                statsMod["TE"] = (0, 0)

            # NOTE(review): labels mix [:17] and [:12] truncations of
            # fitName -- looks unintentional but kept for compatibility
            if firstRow:
                header += [
                    fitName[:17] + "_ModPrec", fitName[:12] + "_ModRec",
                    fitName[:17] + "_ModF1"
                ]
            row += prettyAcc(statsMod["TE"][0], statsMod["TE"][1])

            if firstRow:
                header += [
                    fitName[:17] + "_Prec", fitName[:12] + "_Rec",
                    fitName[:17] + "_F1"
                ]
            row += prettyAcc(stats["TE"][0], stats["TE"][1])

            if firstRow:
                header += [
                    fitName[:17] + "_PrecCheat", fitName[:12] + "_RecCheat",
                    fitName[:17] + "_F1Cheat"
                ]
            row += prettyAcc(statsCheat["TE"][0], statsCheat["TE"][1])

        if firstRow:
            rows.append(header)
        rows.append(row)

    # final row: each stat column averaged, weighted by region base count
    # (range instead of xrange keeps this runnable on both Python 2 and 3)
    row = ["total", totalBases]
    for col in range(2, len(rows[0])):
        val = sum((float(rows[i][1]) / float(totalBases)) * float(rows[i][col])
                  for i in range(1, len(rows)))
        row.append("%.4f" % val)
    rows.append(row)

    # "with" guarantees the file is closed even if a write fails
    with open(statPath, "w") as statFile:
        for row in rows:
            statFile.write(",".join(str(x) for x in row) + "\n")