def findOutcastOfLength(seqs, length, shiftStep=0.1, norm="each", mdl=False):
    """Return the highest-scoring outcast of the given length, or None.

    A motif of `length` is located with findMotifOfLengthFast; its two
    indices are expanded into candidate seed indices, and every in-bounds
    seed is scored with findOutcastInstances (or findOutcastInstancesMDL
    when `mdl` is true). The result with the largest `.score` wins.

    Args:
        seqs: sequence(s) to search. A shallow copy (`seqs[:]`) is handed to
            findMotifOfLengthFast; only `len(seqs[0])` is used for bounds
            checking, so only the first sequence is accounted for there.
        length: subsequence length to search at.
        shiftStep: forwarded to computeAllSeedIdxsFromPair.
        norm: forwarded to findMotifOfLengthFast as `norm`.
        mdl: if true, score seeds with findOutcastInstancesMDL instead of
            findOutcastInstances.

    Returns:
        The best result object produced by the scoring function, or None if
        there were no in-bounds seeds or no seed produced a truthy result.
    """
    # NOTE(review): an identical definition of this function follows later
    # in the file; the later definition is the one bound at import time.
    motif = findMotifOfLengthFast(seqs[:], length, norm=norm)
    Xnorm = motif.Xnorm

    # NOTE(review): learnFF() calls computeAllSeedIdxsFromPair with
    # (pairOfIdxs, numShifts, stepLen); this call passes
    # (idx1, idx2, shiftStep). Confirm which signature is current.
    seedIdxs = computeAllSeedIdxsFromPair(motif.idx1, motif.idx2, shiftStep)
    # XXX this assumes only one seq
    seedIdxs, _ = startsAndEndsWithinBounds(seedIdxs, length, len(seqs[0]))

    findInstances = findOutcastInstancesMDL if mdl else findOutcastInstances

    bestScore = -np.inf
    bestOutcast = None
    for idx in seedIdxs:
        info = findInstances(Xnorm, Xnorm[idx], length)
        if info and info.score > bestScore:
            bestScore = info.score
            bestOutcast = info

    # bestOutcast is still None when no seed yielded a result; guard so the
    # diagnostic print cannot raise AttributeError. A single-argument
    # print(...) produces the same output on Python 2 and Python 3.
    if bestOutcast is None:
        print("bestOutcast idxs at length {}: None".format(length))
    else:
        print("bestOutcast idxs at length {}: {}".format(
            length, bestOutcast.idxs))
    return bestOutcast
def findOutcastOfLength(seqs, length, shiftStep=.1, norm='each', mdl=False):
    """Find the best-scoring outcast of a fixed length.

    Steps: (1) find a motif of `length` using findMotifOfLengthFast,
    (2) turn the motif's two indices into a set of candidate seed indices,
    (3) drop seeds that fall out of bounds, and (4) score each remaining
    seed, returning the result whose `.score` is largest.

    Args:
        seqs: sequence(s) to search; `seqs[:]` is passed to
            findMotifOfLengthFast and `len(seqs[0])` bounds the seeds.
        length: subsequence length to search at.
        shiftStep: forwarded to computeAllSeedIdxsFromPair.
        norm: normalization mode forwarded to findMotifOfLengthFast.
        mdl: selects findOutcastInstancesMDL (True) or
            findOutcastInstances (False) as the scoring function.

    Returns:
        The winning result object, or None when no seed produced a truthy
        result (including the case of no in-bounds seeds).
    """
    # NOTE(review): this definition shadows an identical one that appears
    # earlier in the file.
    motif = findMotifOfLengthFast(seqs[:], length, norm=norm)
    Xnorm = motif.Xnorm

    # NOTE(review): the argument layout here (idx1, idx2, shiftStep) differs
    # from the (pairOfIdxs, numShifts, stepLen) layout used in learnFF();
    # confirm against computeAllSeedIdxsFromPair's definition.
    seedIdxs = computeAllSeedIdxsFromPair(motif.idx1, motif.idx2, shiftStep)
    # XXX this assumes only one seq
    seedIdxs, _ = startsAndEndsWithinBounds(seedIdxs, length, len(seqs[0]))

    bestScore = -np.inf
    bestOutcast = None
    for idx in seedIdxs:
        seed = Xnorm[idx]
        if mdl:
            info = findOutcastInstancesMDL(Xnorm, seed, length)
        else:
            info = findOutcastInstances(Xnorm, seed, length)
        if info and info.score > bestScore:
            bestScore = info.score
            bestOutcast = info

    # Only dereference bestOutcast when something was actually found; the
    # unguarded print used to raise AttributeError on None. Parenthesized
    # single-argument print is valid and equivalent in Python 2 and 3.
    if bestOutcast is None:
        print("bestOutcast idxs at length {}: None".format(length))
    else:
        print("bestOutcast idxs at length {}: {}".format(
            length, bestOutcast.idxs))
    return bestOutcast
def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1,
            padBothSides=True,
            **generalizeKwargs):
    """Generate seed windows, generalize from them, and extract instances.

    The function (1) optionally zero-pads X and Xblur on both sides,
    (2) produces candidate seed start indices with the strategy named by
    `generateSeedsAlgo`, (3) discards seeds whose window would not fit in
    X, (4) calls findInstancesUsingSeedLocs on the surviving windows, and
    (5) calls extractTrueLocs on the result.

    Args:
        seq: the raw sequence. `len(seq)` is used for sizing; the 'walk'
            strategy additionally indexes it as a 2D array (`seq[:, dim]`).
        X: 2D array whose second axis (`X.shape[1]`) is the one seeds index.
        Xblur: array padded identically to X and forwarded alongside it.
        Lmin: minimum length; used by the 'pair' and 'walk' strategies and
            forwarded downstream.
        Lmax: maximum length; determines the step and window sizes.
        Lfilt: forwarded to findInstancesUsingSeedLocs.
        generateSeedsAlgo: one of 'pair', 'all', 'random', 'walk'.
        generalizeSeedsAlgo: forwarded to findInstancesUsingSeedLocs.
        extractTrueLocsAlgo: forwarded to extractTrueLocs.
        generateSeedsStep: fraction of Lmax used as the shift step; also
            sets the number of shifts as int(1 / step) + 1.
        padBothSides: if true, add (len(seq) - X.shape[1]) // 2 zero
            columns to each side of X and Xblur.
        **generalizeKwargs: extra keyword arguments for
            findInstancesUsingSeedLocs; 'windowLen' is always overwritten.

    Returns:
        Tuple (startIdxs, endIdxs, bsfFilt): the two values returned by
        extractTrueLocs plus the filter from findInstancesUsingSeedLocs.

    Raises:
        NotImplementedError: if `generateSeedsAlgo` is not recognized.
    """
    # NOTE(review): an identical definition of learnFF follows later in the
    # file; the later definition is the one bound at import time.

    # time.clock() was removed in Python 3.8; prefer it where it exists so
    # timings are unchanged on older interpreters, else use perf_counter.
    timer = getattr(time, 'clock', None) or time.perf_counter

    # NOTE(review): integer division drops one column when the length
    # difference is odd, so padded X may be one column short of len(seq).
    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    tStartSeed = timer()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print("seedIdxs from motif:  {}".format(seedIdxs))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug
    elif generateSeedsAlgo == 'random':
        # np.random.choice samples with replacement by default, so the two
        # indices may coincide.
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random
        # walks when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(
                    dimData, Lmin // divideBy, std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        # Disqualify every start index within Lmin of bestIdx, *including
        # bestIdx itself*. The previous window ended at bestIdx (exclusive),
        # so the second argmax simply returned bestIdx again. -inf is used
        # so a masked entry can never outrank a genuine score.
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), bestIdx + Lmin)
        scores[start:end] = -np.inf
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Supported seed-generation algos are 'pair', 'all', 'random', "
            "'walk'; got unrecognized algo {}".format(generateSeedsAlgo))

    # compute start and end indices of seeds to try, keeping only windows
    # that lie entirely inside X
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
        seedStartIdxs))
    print("learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq))))

    tEndSeed = timer()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    # the best-so-far score is returned first but is not needed here
    _, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo, **generalizeKwargs)

    startIdxs, endIdxs = extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = timer()
    print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

    return startIdxs, endIdxs, bsfFilt
def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1,
            padBothSides=True,
            **generalizeKwargs):
    """Pick seed windows in X, generalize from them, and return instances.

    Pipeline:
      1. If `padBothSides`, zero-pad X and Xblur by
         (len(seq) - X.shape[1]) // 2 columns on each side.
      2. Build candidate seed start indices using `generateSeedsAlgo`
         ('pair', 'all', 'random', or 'walk').
      3. Keep only seeds in [0, X.shape[1] - windowLen), where
         windowLen = Lmax + int(Lmax * generateSeedsStep).
      4. Run findInstancesUsingSeedLocs, then extractTrueLocs.

    Args:
        seq: raw sequence; sized with len() and, for 'walk', indexed as a
            2D array via `seq[:, dim]`.
        X: 2D array; seeds index its second axis.
        Xblur: companion array, padded and forwarded the same way as X.
        Lmin: minimum length, used for seed search and forwarded on.
        Lmax: maximum length; sets stepLen and windowLen.
        Lfilt: forwarded to findInstancesUsingSeedLocs.
        generateSeedsAlgo: name of the seed-generation strategy.
        generalizeSeedsAlgo: forwarded to findInstancesUsingSeedLocs.
        extractTrueLocsAlgo: forwarded to extractTrueLocs.
        generateSeedsStep: step size as a fraction of Lmax.
        padBothSides: whether to zero-pad X and Xblur.
        **generalizeKwargs: forwarded to findInstancesUsingSeedLocs, with
            'windowLen' set (overwritten) by this function.

    Returns:
        (startIdxs, endIdxs, bsfFilt).

    Raises:
        NotImplementedError: for an unrecognized `generateSeedsAlgo`.
    """
    # NOTE(review): this definition shadows an identical learnFF that
    # appears earlier in the file.

    # time.clock() no longer exists on Python >= 3.8; use it when present
    # (unchanged behavior on old interpreters), otherwise perf_counter.
    timer = getattr(time, 'clock', None) or time.perf_counter

    # NOTE(review): floor division means an odd length difference leaves
    # the padded arrays one column narrower than len(seq).
    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    tStartSeed = timer()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # One-argument print(...) is equivalent under Python 2 and Python 3.
    print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print("seedIdxs from motif:  {}".format(seedIdxs))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug
    elif generateSeedsAlgo == 'random':
        # Sampling is with replacement (numpy default), so both picks can
        # be the same index.
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random
        # walks when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(
                    dimData, Lmin // divideBy, std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        # Mask the neighbourhood [bestIdx - Lmin, bestIdx + Lmin) before
        # choosing the runner-up. Previously the mask stopped just short of
        # bestIdx, so the runner-up was bestIdx itself whenever
        # bestIdx >= Lmin. Masking with -inf (not -1) keeps masked entries
        # below any real score.
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), bestIdx + Lmin)
        scores[start:end] = -np.inf
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Supported seed-generation algos are 'pair', 'all', 'random', "
            "'walk'; got unrecognized algo {}".format(generateSeedsAlgo))

    # compute start and end indices of seeds to try; a seed is valid only
    # if its whole window fits inside X
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
        seedStartIdxs))
    print("learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq))))

    tEndSeed = timer()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    # first return value (best-so-far score) is unused by this function
    _, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo, **generalizeKwargs)

    startIdxs, endIdxs = extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = timer()
    print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

    return startIdxs, endIdxs, bsfFilt