Code Example #1
File: tsdFinder.py Project: glennhickey/teHmm
def buildSeqTable(args, bedIntervals):
    """build table of sequence indexes from input bed file to quickly read 
    while sorting.  Table maps sequence name to range of indexes in 
    bedIntervals.  This only works if bedIntervals are sorted (and should 
    raise an assertion error if that's not the case. 
    """
    logger.info("building sequence name index of %d bed intervals" %
                len(bedIntervals))
    bedSeqTable = dict()
    prevName = None
    prevIdx = 0
    for i, interval in enumerate(bedIntervals):
        seqName = interval[0]
        if seqName != prevName:
            assert seqName not in bedSeqTable
            if prevName is not None:
                bedSeqTable[prevName] = (prevIdx, i)
                prevIdx = i
        prevName = seqName

    seqName = bedIntervals[-1][0]
    assert seqName not in bedSeqTable
    bedSeqTable[seqName] = (prevIdx, len(bedIntervals))
    logger.debug("index has %d unique sequences" % len(bedSeqTable))
    return bedSeqTable
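For reference, a minimal sketch of how the returned table can be used to slice out one sequence's intervals (the interval data below is hypothetical, and the module's logger is assumed to be configured; args is unused by the function body, so None is passed):

sortedIntervals = [("chr1", 0, 10), ("chr1", 20, 30), ("chr2", 5, 15)]
seqTable = buildSeqTable(None, sortedIntervals)
# seqTable == {"chr1": (0, 2), "chr2": (2, 3)}
lo, hi = seqTable["chr1"]
chr1Intervals = sortedIntervals[lo:hi]  # just the chr1 intervals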
Code Example #2
File: tsdFinder.py Project: glennhickey/teHmm
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets. """
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))
        
    # chunk up BED input
    numIntervals = 0
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            numIntervals += 1
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" % (
        numIntervals, args.numProc, jobSize))
    tempBeds = []
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            if curSize >= jobSize:
                if curFile is not None:
                    curFile.close()
                tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
                tempBeds.append(tempBed)
                curFile = open(tempBed, "w")
                curSize = 0
            curFile.write("\t".join([str(s) for s in interval]))
            curFile.write("\n")
            curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for tempBed in tempBeds:
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc,"--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)
        
    runParallelShellCommands(jobCmds, args.numProc)

    # reduce
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
Code Example #3
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name",
                        help="Set ID field (column 4 instead of 5)",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make temporary tracks list with just our track so we can keep using
    # tracks list interface but not read unnecessary data.
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath,
                            mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals,
                            obFile, args)

    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
Code Example #4
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("chromSizes",
                        help="2-column chrom sizes file as needed"
                        " by bedGraphToBigWig")
    parser.add_argument("queryBed", help="Region(s) to apply scaling to")
    parser.add_argument("outputDir", help="Output directory")
    parser.add_argument("--tracks",
                        help="Comma-separated list of tracks "
                        "to process. If not set, all tracks with a scaling"
                        " attribute are processed",
                        default=None)
    parser.add_argument("--skip",
                        help="Comma-separated list of tracks to "
                        "skip.",
                        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    mergedIntervals = getMergedBedIntervals(args.queryBed)

    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals)
    trackList = trackData.getTrackList()

    for track in trackList:
        if track.getName() not in skipNames and\
          (track.getName() in trackNames or len(trackNames) == 0):
            if track.getScale() is not None or\
              track.getLogScale() is not None or\
              track.getShift() is not None or\
              track.getDelta() is True:
                logger.info("Writing scaled track %s" % track.getName())
                writeScaledTrack(trackData, track, args)

    cleanBedTool(tempBedToolPath)
Code Example #5
def findTsds(args, bedIntervals):
    """ search through input bed intervals, loading up the FASTA sequence
    for each one """

    # index for quick lookups in bed file (to be used while scanning fasta file)
    seqTable = buildSeqTable(args, bedIntervals)
    outTsds = []
    faFile = open(args.fastaSequence, "r")
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))
    for seqNameFa, sequence in fastaRead(faFile):
        if args.sequences is not None and seqNameFa not in args.sequences and\
          seqNameFa.split()[0] not in args.sequences:
            # skip unflagged sequences when option specified
            continue

        # try name from Fasta as well as name with everything after first
        # whitespace skipped
        if seqNameFa in seqTable:
            seqName = seqNameFa
        else:
            seqName = seqNameFa.split()[0]
        if seqName in seqTable:
            logger.info("Scanning FASTA sequence %s" % seqName)
            bedRange = seqTable[seqName]
            for bedIdx in xrange(bedRange[0], bedRange[1]):
                bedInterval = bedIntervals[bedIdx]
                name = None
                if len(bedInterval) > 3:
                    name = bedInterval[3]
                if nameSet is None or name in nameSet:
                    # we make the sequence lower case below because we don't
                    # care about soft masking
                    outTsds += intervalTsds(args, sequence.lower(),
                                            bedInterval)
        else:
            logger.debug("Skipping FASTA sequence %s because no intervals "
                         "found" % seqName)

    return outTsds
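The two-step lookup above exists because FASTA headers often carry a description after the sequence name; a quick illustration with hypothetical values:

seqNameFa = "chr1 assembled from contig_42"  # hypothetical FASTA header
seqTable = {"chr1": (0, 100)}                # hypothetical index
seqName = seqNameFa if seqNameFa in seqTable else seqNameFa.split()[0]
assert seqName == "chr1"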
Code Example #6
def setTrackScale(track, numBins, allIntervals, noLog):
    """ Modify the track XML element in place with the heuristically
    computed scaling paramaters below """
    data = readTrackIntoFloatArray(track, allIntervals)
    if len(data) > numBins:
        scaleType, scaleParam, shift = computeScale(data, numBins, noLog)
        # round down so xml file doesn't look too ugly
        if scaleParam > 1e-4:
            scaleParam = float("%.4f" % scaleParam)
        if scaleType == "scale":
            logger.info("Setting track %s scale to %f" % (track.getName(),
                                                          scaleParam))
            track.setScale(scaleParam)
        elif scaleType == "logScale":
            logger.info("Setting track %s logScale to %f" % (track.getName(),
                                                             scaleParam))
            track.setLogScale(scaleParam)
        logger.info("Setting track %s shift to %f" % (track.getName(),
                                                      shift))
        track.setShift(shift)
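The rounding step above works because "%.4f" formats (and rounds) the value to four decimal places before it is parsed back into a float; for example:

scaleParam = 0.123456789
scaleParam = float("%.4f" % scaleParam)  # -> 0.1235
assert scaleParam == 0.1235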
Code Example #7
File: teHmmTrain.py Project: glennhickey/teHmm
def trainModel(randomSeed, trackData, catMap, userTrans, truthIntervals,
               args):
    """ Run the whole training pipeline
    """
    # activate the random seed
    randGen = np.random.RandomState(randomSeed)

    # create the independent emission model
    logger.info("creating emission model")
    numSymbolsPerTrack = trackData.getNumSymbolsPerTrack()
    logger.debug("numSymbolsPerTrack=%s" % numSymbolsPerTrack)
    # only randomize model if using Baum-Welch 
    randomize = args.supervised is False and args.flatEm is False
    emissionModel = IndependentMultinomialAndGaussianEmissionModel(
        args.numStates,
        numSymbolsPerTrack,
        trackData.getTrackList(),
        normalizeFac=args.emFac,
        randomize=randomize,
        effectiveSegmentLength = args.segLen,
        random_state = randGen,
        randRange = args.emRandRange)

    # create the model
    if not args.cfg:
        logger.info("creating hmm model")
        model = MultitrackHmm(emissionModel, n_iter=args.iter,
                              state_name_map=catMap,
                              fixTrans = args.fixTrans,
                              fixEmission = args.fixEm,
                              fixStart = args.fixStart,
                              forceUserEmissions = args.forceEmProbs,
                              forceUserTrans = args.forceTransProbs,
                              random_state = randGen,
                              thresh = args.emThresh,
                              transMatEpsilons = args.transMatEpsilons,
                              maxProb = args.maxProb,
                              maxProbCut = args.maxProbCut)
    else:
        pairEM = PairEmissionModel(emissionModel, [args.saPrior] *
                                   emissionModel.getNumStates())
        assert args.supervised is True
        nestStates = []
        if args.pairStates is not None:
            pairStates = args.pairStates.split(",")
            nestStates = map(lambda x: catMap.getMap(x), pairStates)
        logger.info("Creating cfg model")
        model = MultitrackCfg(emissionModel, pairEM, nestStates,
                              state_name_map=catMap)

    # initialize the user specified transition probabilities now if necessary
    if args.initTransProbs is not None:
        with open(args.initTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # initialize the user specified emission probabilities now if necessary
    if args.initEmProbs is not None:
        with open(args.initEmProbs) as f:
            # can't apply emissions without a track list! 
            model.trackList = trackData.getTrackList()
            model.applyUserEmissions(f.readlines())

    # initialize the user specified start probabilities now if necessary
    if args.initStartProbs is not None:
        with open(args.initStartProbs) as f:
            model.applyUserStarts(f.readlines())

    # make sure initialization didn't screw up
    model.validate()

    # do the training
    if args.supervised is False:
        logger.info("training via EM")
        model.train(trackData)
    else:
        logger.info("training from input bed states")
        model.supervisedTrain(trackData, truthIntervals)

    # reset the user specified transition probabilities now if necessary
    if args.forceTransProbs is not None:
        with open(args.forceTransProbs) as f:
            model.applyUserTrans(f.readlines())

    # reset the user specified emission probabilities now if necessary
    if args.forceEmProbs is not None:
        with open(args.forceEmProbs) as f:
            model.applyUserEmissions(f.readlines())

    return model
Code Example #8
File: teHmmTrain.py Project: glennhickey/teHmm
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on.  If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used for "
                        "interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type = int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type = int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model.  Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action = "store_true", default = False)
    parser.add_argument("--cfg", help="Use Context Free Grammar insead of "
                        "HMM.  Only works with --supervised for now",
                        action = "store_true", default = False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG.  Probability of pair emission "
                        "is multiplied by this number if the bases are aligned"
                        " and its complement if bases are not aligned. Must"
                        " be between [0,1].", default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as pair-emitors"
                        " for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks.  k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ".  This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default = None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ".  This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ".  This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default = None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability" 
                        ". These transition probabilities will override any "
                        " learned probabilities after each training iteration"
                        " (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed)" ,
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after each training iteration "
                        "(unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.)" ,
                        default = None) 
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline.  If not specified, the initial emission "
                        "distribution will be randomized by default.  Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ".  The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing an"
                        " emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers).  Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py).  IMPORTANT: this file must cover "
                        "the same regions as the traininBed file. Unless in "
                        "supervised mode, probably best to use same bed file "
                        " as both traingBed and --segment argument.  Otherwise"
                        " use intersectBed to make sure the overlap is exact",
                        default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                         " random initializations) to run. The replicate"
                         " with the highest likelihood will be chosen for the"
                         " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --rep) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training.  IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN.  There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration.  Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true", default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training.  Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs, "
                               "--fixEm are not currently compatible with "
                               "--cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and/or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction with"
                           " --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here. now overridden with above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segments option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction).  Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans, truthIntervals=truthIntervals,
                          args=args)
    
    modelList = runParallelShellCommands(argList=seeds, numProc = args.numThreads,
                                         execFunction = trainClosure,
                                         useThreads = True)

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]
        
    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)                
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
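A hypothetical invocation built from the arguments defined above (all file names are placeholders):

    teHmmTrain.py tracks.xml train.bed out.mod --numStates 10 --iter 50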
Code Example #9
File: bootstrapModel.py Project: glennhickey/teHmm
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        " that be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions will be "
        " derived by an already-trained model.  This tool is written to"
        " allow combining supervised and unsupervised training.  IE a "
        " supervised model is created (teHmmTrain.py with --supervised "
        " option).  This tool can then be used to create the necessary "
        " files to bootstrap an unsupervised training run with a subset"
        " of the parameters.")

    parser.add_argument("inputModel",
                        help="Path of input model to use "
                        "for bootstrap parameter creation")
    parser.add_argument("outTransProbs",
                        help="File to write transition model"
                        " to (for use with teHmmTrain.py --initTransProbs and"
                        " --forceTransProbs)")
    parser.add_argument("outEmProbs",
                        help="File to write emission model to"
                        " (for use with teHmmTrain.py --initEmProbs and "
                        " --forceEmProbs)")
    parser.add_argument("--ignore",
                        help="comma-separated list of states to ignore from"
                        " inputModel",
                        default=None)
    parser.add_argument("--numAdd",
                        help="Number of \"unlabeled\" states to add"
                        " to the model.",
                        default=0,
                        type=int)
    parser.add_argument("--numTotal",
                        help="Add unlabeled states such that"
                        " output model has given number of states.  If input "
                        "model already has a greater number of states then"
                        " none added",
                        default=0,
                        type=int)
    parser.add_argument("--stp",
                        help="Self-transition probality assigned to"
                        " added states.",
                        default=0.9,
                        type=float)
    parser.add_argument("--allTrans",
                        help="By default only self-transitions"
                        " are written.  Use this option to write entire "
                        "transition matrix (excluding ignroed states)",
                        default=False,
                        action="store_true")

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.numAdd != 0 and args.numTotal != 0:
        raise RuntimeError("--numAdd and --numTotal mutually exclusive")

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    # parse ignore states
    if args.ignore is None:
        args.ignore = set()
    else:
        args.ignore = set(args.ignore.split(","))

    # make sure we have a state name for every state (should really
    # be done within hmm...)
    stateMap = model.getStateNameMap()
    if stateMap is None:
        stateMap = CategoryMap(reserved=0)
        for i in xrange(model.getEmissionModel().getNumStates()):
            stateMap.getMap(str(i), update=True)

    # write the transition probabilities
    writeTransitions(model, stateMap, args)

    # write the emission probabilities
    writeEmissions(model, stateMap, args)
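As with the trainer above, a hypothetical command line (file names are placeholders):

    bootstrapModel.py in.mod trans.txt em.txt --numTotal 20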
Code Example #10
File: tsdFinder.py Project: glennhickey/teHmm
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Find candidate TSDs (exact forward matches) flanking given"
        "BED intervals.  Score is distance between TSD and bed interval.")
    parser.add_argument("fastaSequence", help="DNA sequence in FASTA format")
    parser.add_argument("inBed", help="BED file with TEs whose flanking regions "
                        "we wish to search")
    parser.add_argument("outBed", help="BED file containing (only) output TSDs")
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=4, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=6, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller.  The score  is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=7, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=7, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=3, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default="L_TSD")
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default="R_TSD")
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list.  If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)
    parser.add_argument("--sequences", help="Only process given sequences of input"
                        " FASTA file (comma-separated list).",  default=None)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    assert os.path.exists(args.inBed)
    assert os.path.exists(args.fastaSequence)
    assert args.min <= args.max
    args.nextId = 0

    if args.sequences is not None:
        args.sequences = set(args.sequences.split(","))

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.inBed)
    bedIntervals = readBedIntervals(args.inBed, ncol=4, sort=True)
    if bedIntervals is None or len(bedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.inBed)

    if args.numProc > 1:
        runParallel(args, bedIntervals)
        return 0
    
    tsds = findTsds(args, bedIntervals)

    writeBedIntervals(tsds, args.outBed)
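Again, a hypothetical command line (file names are placeholders):

    tsdFinder.py genome.fa tes.bed tsds.bed --min 4 --max 6 --numProc 4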
Code Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they "
        " best match the true annotation.  Usees same logic as "
        " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument(
        "--intThresh",
        help="Threshold to consider interval from"
        " tgtBed covered by predBed.  If not specified, then base"
        " level statistics will be used. Value in range (0,1]",
        type=float,
        default=None)
    parser.add_argument("--noFrag",
                        help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--qualThresh",
        help="Minimum match ratio between truth"
        " and prediction to relabel prediction.  Example, if"
        " predicted state X overlaps target state LTR 25 pct of "
        "the time, then qualThresh must be at least 0.25 to "
        "label X as LTR in the output.  Value in range (0, 1]",
        type=float,
        default=0.1)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore (in prediction)",
                        default=None)
    parser.add_argument("--ignoreTgt",
                        help="Comma-separated list of stateNames to"
                        " ignore (int target)",
                        default=None)
    parser.add_argument("--tgt",
                        help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument(
        "--unique",
        help="If more than one predicted state maps"
        " to the same target state, add a unique id (numeric "
        "suffix) to the output so that they can be distinguished",
        action="store_true",
        default=False)
    parser.add_argument("--model",
                        help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)",
                        default=None)
    parser.add_argument("--noMerge",
                        help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.",
                        action="store_true",
                        default=False)
    parser.add_argument("--hm",
                        help="Write confusion matrix as heatmap in PDF"
                        " format to specified file",
                        default=None)
    parser.add_argument("--old",
                        help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization",
                        action="store_true",
                        default=False)
    parser.add_argument("--fdr",
                        help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float,
                        default=None)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument(
        "--colOrder",
        help="List of states used to force"
        " ordering in heatmap (otherwise alphabetical) columns. These"
        " states will correspond to the tgtBed when --old used and"
        " --predBed otherwise.",
        default=None)
    parser.add_argument(
        "--hmCovRow",
        help="Path to write 1-row heatmap of "
        "state coverage (fraction of bases). only works with --hm",
        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
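
The heavy lifting above happens in getStateMapFromConfMatrix / getStateMapFromConfMatrix_simple, which this example does not show. As a rough sketch only (the dict-of-dicts confusion matrix shape and the helper name are assumptions, not the project's actual API), the --old "biggest overlap" labeling combined with --qualThresh amounts to:

def simpleStateMap(confMat, qualThresh=0.1):
    """ confMat (assumed shape): predicted state -> {target state: overlap}. """
    stateMap = dict()
    for pred, row in confMat.items():
        total = float(sum(row.values()))
        tgt, count = max(row.items(), key=lambda kv: kv[1])
        if total > 0 and count / total >= qualThresh:
            stateMap[pred] = tgt  # relabel pred as its best-overlapping target
    return stateMap

# X overlaps LTR 75% of the time, above the 0.1 default threshold -> LTR
assert simpleStateMap({"X": {"LTR": 75, "Outside": 25}}) == {"X": "LTR"}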
コード例 #19
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and "
        "optionally output the most likely sequence of hidden states.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        "teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi "
                        "output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value.  Most "
                        "useful when model is a CFG to keep memory down. "
                        "When 0, no slicing is done",
                        type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluattion.  Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)    
    parser.add_argument("--maxPost", help="Use maximum posterior decoding instead"
                        " of Viterbi for evaluation", action="store_true",
                        default=False)
    parser.add_argument("--pd", help="Output BED file for posterior distribution. Must"
                        " be used in conjunction with --pdStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state names to use"
                        " for computing posterior distribution.  For example: "
                        " --pdStates inside,LTR_left,LTR_right will compute the probability"
                        ", for each observation, that the hidden state is inside OR LTR_left"
                        " OR LTR_right.  Must be used with --pd to specify output "
                        "file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score"
                        " in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission distribution. Must"
                        " be used in conjunction with --edStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state names to use"
                        " for computing emission distribution.  For example: "
                        " --edStates inside,LTR_left for each obsercation the probability "
                        " that inside emitted that observaiton plus the probabillity that"
                        " LTR_left emitted it. If more than one state is selected, this "
                        " is not a distribution, but a sum of distributions (and values"
                        " can exceed 1).  Mostly for debugging purposes. Note output in LOG",
                         default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    if args.slice <= 0:
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible at "
                           "this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction with"
                           " --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun 
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0
    
    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
            raise RuntimeError("--maxPost not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen
        
    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = [x for x in slicedIntervals(mergedIntervals, args.slice)]

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals, 
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # do the viterbi algorithm
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    tableIndex = 0
    totalDatapoints = 0

    # Note: in general there's room to save memory by only computing a single
    # track table at a time (just need to add a table-by-table interface to
    # the hmm...)
    
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    
    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(decodeFunction(trackData,
                                                numThreads=args.numThreads)):
        totalScore += vitLogProb
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            trackTable = trackData.getTrackTableList()[tableIndex]
            tableIndex += 1
            statesToBed(trackTable,
                        vitStates, vitOutFile, posteriors[i], posteriorsMask,
                        posteriorsFile, emProbs[i], emissionsMask, emissionsFile)
            totalDatapoints += len(vitStates) * trackTable.getNumTracks()

    print "Viterbi (log) score: %f" % totalScore
    if isinstance(model, MultitrackHmm) and model.current_iteration is not None:
        print "Number of EM iterations: %d" % model.current_iteration
    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        bicFile = open(args.bic, "w")
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except Exception:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0 
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        bicFile.write("%f\n" % bic)
        bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                      "# where lnL=%f  k=%d (%d states)  N=%d (%d obs * %d tracks)  lnN=%f\n" % (
            lnL, int(k), model.getEmissionModel().getNumStates(), int(totalDatapoints),
            totalDatapoints / model.getEmissionModel().getNumTracks(),
            model.getEmissionModel().getNumTracks(), np.log(n)))
        bicFile.close()

    cleanBedTool(tempBedToolPath)
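
To make the BIC bookkeeping above concrete, here is the same formula evaluated on made-up numbers (nothing below comes from a real run):

import numpy as np

lnL = -12345.6   # total log likelihood reported by the decoder
k = 25.0         # free parameters in the model
n = 100000.0     # total data points (observations * tracks)
bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
print(bic)       # lower BIC is better when comparing models on the same data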
コード例 #20
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with resepect to the hmm. ie each segment emits a single"
        " state. Mask tracks always cut.  "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh",
                        help="Number of tracks that can change "
                        "before a new segment formed.  Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int,
                        default=1)
    parser.add_argument("--cutTracks",
                        help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comman-separated list), overriding --thresh options"
                        " if necessary.  For example, --cutTracks tsd,chaux"
                        " would invoke a new segment everytime the value at"
                        "either of these tracks changed",
                        default=None)
    parser.add_argument("--cutUnscaled",
                        help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial",
                        help="Cut non-gaussian, non-binary"
                        " tracks everytime",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian",
                        help="Cut all but guassian tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--comp",
                        help="Strategy for comparing columns for the "
                        "threshold cutoff.  Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore",
                        help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate",
                        default="sequence")
    parser.add_argument("--maxLen",
                        help="Maximum length of a segment (<= 0 means"
                        " no max length applied",
                        type=int,
                        default=0)
    parser.add_argument(
        "--fixLen",
        help="Just make segments of specifed fixed "
        "length ignoring other parameters and logic (<= 0 means"
        " no fixed length applied",
        type=int,
        default=0)
    parser.add_argument("--stats",
                        help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct> "
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        " is the average perecentage of all such differences "
                        "accounted for by the track",
                        default=None)
    parser.add_argument(
        "--delMask",
        help="Entirely remove intervals from "
        "mask tracks that are > given length (otherwise "
        "they would just be ignored by HMM tools). The difference"
        " here is that removed intervals will break contiguity.",
        type=int,
        default=None)
    parser.add_argument(
        "--chroms",
        help="list of chromosomes, or regions, to run in parallel"
        " (in BED format).  input regions will be intersected with each line"
        " in this file, and the result will correspsond to an individual job",
        default=None)
    parser.add_argument(
        "--proc",
        help="number of processes (use in conjunction with --chroms)",
        type=int,
        default=1)
    parser.add_argument(
        "--co",
        help="count offset for segment labels.  only used internally",
        type=int,
        default=0)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo,
                            mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    cutList = np.zeros((len(trackList)), np.int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), np.int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                if name is not "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) cant be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
              track.logScale is None and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
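
The actual segmentation logic lives in segmentTracks(), which this example does not include. A minimal sketch, assuming a column is a tuple of per-track values and that cutList/thresh carry the meanings documented in the options above, of when a new segment would start:

def needCut(prevCol, curCol, cutList, thresh):
    """ Hypothetical helper: cut if any 'cut' track changes, or if at least
    thresh tracks differ from the comparison column. """
    numDiff = 0
    for trackNo, (a, b) in enumerate(zip(prevCol, curCol)):
        if a != b:
            if cutList[trackNo] == 1:
                return True   # a cut track changed: always start a new segment
            numDiff += 1
    return numDiff >= thresh

# track 1 is a cut track and it changed -> new segment
assert needCut((0, 1, 2), (0, 9, 2), (0, 1, 0), 2) is True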
コード例 #22
0
ファイル: trackDump.py プロジェクト: glennhickey/teHmm
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Write track data into ASCII dump.  Row i corresponds"
        " to the ith position found when scanning query BED IN SORTED ORDER."
        "Column j corresponds to the jth track in the XML file. --map option"
        " used to write internal integer format used by HMM.  Unobserved values"
        " written as \"None\" if default attribute not specified or track not"
        " binary.  Rounding can occur if scaling parameters present.\n\n"
        "IMPORTANT: values stored in 8bit integers internally.  Any track with"
        " more than 256 different values will get clamped (with a warning)")

    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("query", help="BED region(s) to dump. SCANNED IN"
                        " SORTED ORDER")
    parser.add_argument("output", help="Path of file to write output to")
    parser.add_argument("--map", help="Apply name mapping, including"
                        " transformation specified in scale, logScale"
                        ", etc. attributes, that HMM uses internally"
                        ". Important to note that resulting integers"
                        " are just unique IDs.  ID_1 > ID_2 does not"
                        " mean anything", action="store_true",
                        default=False)
    parser.add_argument("--segment", help="Treat each interval in query"
                        " as a single segment (ie with only one data point)"
                        ".  In this case, query should probably have been"
                        " generated with segmentTracks.py",
                        action="store_true",
                        default=False)
    parser.add_argument("--noPos", help="Do not print genomic position"
                        " (first 2 columnts)", action="store_true",
                        default=False)
    parser.add_argument("--noMask", help="Ignore mask tracks",
                        default=False, action="store_true")
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # make sure output writeable
    outFile = open(args.output, "w")

    # need to remember to fix this, disable as precaution for now
    assert args.noMask is True or args.segment is False
    
    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.query)
    mergedIntervals = getMergedBedIntervals(args.query, ncol=3)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.query)

    # read the segment intervals from the (same) bed file
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.query)
        segIntervals = readBedIntervals(args.query, sort=True)

    # read all data from track xml
    logger.info("loading tracks %s" % args.tracks)
    trackData = TrackData()
    trackData.loadTrackData(args.tracks, mergedIntervals,
                            segmentIntervals=segIntervals,
                            applyMasking=not args.noMask)

    # dump the data to output
    dumpTrackData(trackData, outFile, args.map, not args.noPos)
    outFile.close()
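
Reading the dump back is not covered by trackDump.py itself; a hedged sketch, assuming whitespace-separated columns, --noPos (so no leading position columns), and unobserved values spelled literally as "None" per the description above:

def readDump(path):
    """ Return the dump as a list of rows; values stay strings, with None
    marking unobserved entries. """
    rows = []
    with open(path) as f:
        for line in f:
            rows.append([None if v == "None" else v for v in line.split()])
    return rows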
コード例 #24
0
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        "from a candidate BED annotation, which can"
        " be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions created here"
        " are extremely simple, but this can be a good shortcut to at least "
        "getting the state names into the init files, which can be further "
        "tweeked by hand.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trackName", help="Name of Track to use as initial"
                        " annotation")
    parser.add_argument("queryBed", help="Bed file with regions to query")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to")
    parser.add_argument("outEmProbs", help="File to write emission model to")
    parser.add_argument("--numOut", help="Number of \"outside\" states to add"
                        " to the model.", default=1, type=int)
    parser.add_argument("--numTot", help="Add x \"outside\" states such "
                        "that total states is this. (overrieds --numOut)",
                        default=0, type=int)
    parser.add_argument("--outName", help="Name of outside states (will have"
                        " numeric suffix if more than 1)", default="Outside")
    parser.add_argument("--mode", help="Strategy for initializing the "
                        "transition graph: {\'star\': all states are connected"
                        " to the oustide state(s) but not each other; "
                        " \'data\': transitions estimated from input bed; "
                        " \'full\': dont write edges and let teHmmTrain.py "
                        "initialize as a clique}", default="star")
    parser.add_argument("--selfTran", help="This script will always write all"
                        " the self-transition probabilities to the output file. "
                        "They will all be set to the specified value using this"
                        " option, or estimated from the data if -1", default=-1.,
                        type=float)
    parser.add_argument("--em", help="Emission probability for input track ("
                        "ie probability that state emits itself)",
                        type=float, default=0.95)
    parser.add_argument("--outEmNone", help="Add None emission probabilities"
                        " for target track for Outside states",
                        action="store_true", default=None)
                        
    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.mode == "star" and args.numOut < 1:
        raise RuntimeError("--numOut must be at least 1 if --mode star is used")
    if args.mode != "star" and args.mode != "data" and args.mode != "full":
        raise RuntimeError("--mode must be one of {star, data, full}")
    if args.mode == "data":
        raise RuntimeError("--data not implemented yet")
    assert os.path.isfile(args.tracksInfo)
    
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # Read the tracks info
    trackList = TrackList(args.tracksInfo)
    # Extract the track we want
    track = trackList.getTrackByName(args.trackName)
    if track is None:
        raise RuntimeError("Track %s not found in tracksInfo" % args.trackName)
    trackPath = track.getPath()
    if track.getDist() != "multinomial" and track.getDist() != "gaussian":
        raise RuntimeError("Track %s does not have multinomial or "
                           "gaussian distribution" % args.trackName)
    if track.getScale() is not None or track.getLogScale() is not None:
        raise RuntimeError("Track %s must not have scale" % args.trackName)
    
    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.queryBed)
    mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.queryBed)

    # read the track, while intersecting with query intervals
    # (track is saved as temp XML file for sake not changing interface)
    bedIntervals = []
    for queryInterval in mergedIntervals:
        bedIntervals += readBedIntervals(trackPath,
                                         ncol=track.getValCol() + 1,
                                         chrom=queryInterval[0],
                                         start=queryInterval[1],
                                         end=queryInterval[2])

    # 1st pass to collect set of names
    nameMap = CategoryMap(reserved=0)
    for interval in bedIntervals:
        nameMap.update(interval[track.getValCol()])
    outNameMap = CategoryMap(reserved=0)
    if args.numTot > 0:
        args.numOut = max(0, args.numTot - len(nameMap))
    for i in xrange(args.numOut):
        outName = args.outName
        if args.numOut > 1:
            outName += str(i)
        assert nameMap.has(outName) is False
        outNameMap.update(outName)

    # write the transition model for use with teHmmTrain.py --initTransProbs    
    writeTransitions(bedIntervals, nameMap, outNameMap, args)

    # write the emission model for use with teHmmTrain.py --initEmProbs
    writeEmissions(bedIntervals, nameMap, outNameMap, args)

    cleanBedTool(tempBedToolPath)
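
The format written by writeTransitions/writeEmissions is not shown in this example. Purely as a conceptual sketch of what --mode star means (starTransitions is a hypothetical stand-in, with --selfTran fixed and the leftover mass split evenly):

def starTransitions(states, outStates, selfTran=0.9):
    """ Annotation states exchange probability only with the Outside state(s). """
    trans = dict()
    for s in states:
        rest = (1.0 - selfTran) / len(outStates)
        trans[s] = {s: selfTran}
        trans[s].update((o, rest) for o in outStates)
    for o in outStates:
        rest = (1.0 - selfTran) / len(states)
        trans[o] = {o: selfTran}
        trans[o].update((s, rest) for s in states)
    return trans

trans = starTransitions(["LTR", "inside"], ["Outside"])
assert abs(sum(trans["LTR"].values()) - 1.0) < 1e-9  # rows are distributions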