def _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt, generateSeedsStep=.1):
    """Pick two seed locations in `seq` and use them to find instances in `X`.

    Outline:
      1. Pad `X` and `Xblur` on both sides (via `ar.addZeroCols`) by half
         the difference between ``len(seq)`` and ``X.shape[1]``.
      2. Score every index of `seq` with `_seedScores`, take the best index
         and the best index that is more than `Lmin` away from it.
      3. Expand that pair into shifted seed windows with
         `_computeAllSeedIdxsFromPair`, then delegate to
         `_findInstancesUsingSeedLocs` and `_extractTrueLocs`.

    NOTE(review): SOURCE contains a second, later definition of `_learnFF`;
    at module level the later one replaces this one. Confirm which copy is
    meant to survive and delete the other.

    Args:
        seq: 2D array indexed as ``seq[:, dim]``; ``len(seq)`` is the number
            of time steps (assumed -- confirm against callers).
        X: 2D array whose columns (``X.shape[1]``) are padded up towards
            ``len(seq)``.
        Xblur: array padded exactly like `X`.
        Lmin: window length used for scoring and for the exclusion zone
            around the best seed.
        Lmax: used to derive the seed step and the seed window length.
        Lfilt: forwarded untouched to `_findInstancesUsingSeedLocs`.
        generateSeedsStep: fraction of `Lmax` between successive shifted
            seeds; also fixes the number of shifts (``int(1/step) + 1``).

    Returns:
        Tuple ``(startIdxs, endIdxs, bsfFilt)``: the two values returned by
        `_extractTrueLocs` and the filter returned by
        `_findInstancesUsingSeedLocs`.
    """
    padLen = (len(seq) - X.shape[1]) // 2
    X = ar.addZeroCols(X, padLen, prepend=True)
    X = ar.addZeroCols(X, padLen, prepend=False)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    # time.clock() no longer exists on Python >= 3.8; fall back to it only
    # where perf_counter is unavailable (Python 2), keeping old behaviour.
    timer = getattr(time, "perf_counter", None) or time.clock
    timeStartSeed = timer()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # score all subseqs based on how much they don't look like random walks
    # when examined using different sliding window lengths
    scores = _seedScores(seq, Lmin)

    # figure out optimal pair based on scores of all subseqs
    bestIdx = np.argmax(scores)
    # Disqualify every index within Lmin of bestIdx, on BOTH sides and
    # including bestIdx itself. The previous code derived `end` from `start`,
    # so the masked range stopped just before bestIdx and the second argmax
    # simply returned bestIdx again.
    start = max(0, bestIdx - Lmin)
    end = min(len(scores), bestIdx + Lmin + 1)
    scores[start:end] = -np.inf  # can never win, whatever the score range
    secondBestIdx = np.argmax(scores)

    # compute all seed idxs from this pair
    seedIdxs = [bestIdx, secondBestIdx]
    maxIdx = X.shape[1] - windowLen - 1
    seedStartIdxs = _computeAllSeedIdxsFromPair(
        seedIdxs, numShifts, stepLen, maxIdx)
    seedEndIdxs = seedStartIdxs + windowLen

    timeEndSeed = timer()

    bsfScore, bsfLocs, bsfFilt = _findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        windowLen=windowLen)

    startIdxs, endIdxs = _extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax)

    timeEndFF = timer()
    # Parenthesised single argument: prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("learnFF(): seconds to find seeds, regions, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        timeEndSeed - timeStartSeed,
        timeEndFF - timeEndSeed,
        timeEndFF - timeStartSeed))

    return startIdxs, endIdxs, bsfFilt
def _seedScores(seq, Lmin):
    """Accumulate random-walk window scores for every column of `seq`.

    For each column, `feat.windowScoresRandWalk` is evaluated at the window
    lengths ``Lmin``, ``Lmin // 2``, ``Lmin // 4`` and ``Lmin // 8``, and the
    results are summed into a single array.

    Args:
        seq: 2D array indexed as ``seq[:, dim]``.
        Lmin: largest window length; the shorter ones are derived from it by
            integer division.

    Returns:
        1D float array of length ``len(seq)``. Each partial result is added
        starting at index 0 (aligned to the front, not centered), so trailing
        entries that a given window length does not reach get no contribution
        from that length.
    """
    totals = np.zeros(len(seq))
    windowLens = [Lmin // divisor for divisor in (1, 2, 4, 8)]

    for col in range(seq.shape[1]):
        series = seq[:, col].flatten()
        # std of the first differences: computed once per column and shared
        # by every window length
        stepStd = np.std(series[1:] - series[:-1])

        for winLen in windowLens:
            partial = feat.windowScoresRandWalk(series, winLen, stepStd)
            totals[:len(partial)] += partial

    return totals
def _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt, generateSeedsStep=.1):
    """Pick two seed locations in `seq` and use them to find instances in `X`.

    Outline:
      1. Pad `X` and `Xblur` on both sides (via `ar.addZeroCols`) by half
         the difference between ``len(seq)`` and ``X.shape[1]``.
      2. Score every index of `seq` with `_seedScores`, take the best index
         and the best index that is more than `Lmin` away from it.
      3. Expand that pair into shifted seed windows with
         `_computeAllSeedIdxsFromPair`, then delegate to
         `_findInstancesUsingSeedLocs` and `_extractTrueLocs`.

    NOTE(review): SOURCE also contains an earlier definition of `_learnFF`;
    at module level this later one replaces it. Confirm which copy is meant
    to survive and delete the other.

    Args:
        seq: 2D array indexed as ``seq[:, dim]``; ``len(seq)`` is the number
            of time steps (assumed -- confirm against callers).
        X: 2D array whose columns (``X.shape[1]``) are padded up towards
            ``len(seq)``.
        Xblur: array padded exactly like `X`.
        Lmin: window length used for scoring and for the exclusion zone
            around the best seed.
        Lmax: used to derive the seed step and the seed window length.
        Lfilt: forwarded untouched to `_findInstancesUsingSeedLocs`.
        generateSeedsStep: fraction of `Lmax` between successive shifted
            seeds; also fixes the number of shifts (``int(1/step) + 1``).

    Returns:
        Tuple ``(startIdxs, endIdxs, bsfFilt)``: the two values returned by
        `_extractTrueLocs` and the filter returned by
        `_findInstancesUsingSeedLocs`.
    """
    padLen = (len(seq) - X.shape[1]) // 2
    X = ar.addZeroCols(X, padLen, prepend=True)
    X = ar.addZeroCols(X, padLen, prepend=False)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    # time.clock() no longer exists on Python >= 3.8; fall back to it only
    # where perf_counter is unavailable (Python 2), keeping old behaviour.
    timer = getattr(time, "perf_counter", None) or time.clock
    timeStartSeed = timer()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # score all subseqs based on how much they don't look like random walks
    # when examined using different sliding window lengths
    scores = _seedScores(seq, Lmin)

    # figure out optimal pair based on scores of all subseqs
    bestIdx = np.argmax(scores)
    # Disqualify every index within Lmin of bestIdx, on BOTH sides and
    # including bestIdx itself. The previous code derived `end` from `start`,
    # so the masked range stopped just before bestIdx and the second argmax
    # simply returned bestIdx again.
    start = max(0, bestIdx - Lmin)
    end = min(len(scores), bestIdx + Lmin + 1)
    scores[start:end] = -np.inf  # can never win, whatever the score range
    secondBestIdx = np.argmax(scores)

    # compute all seed idxs from this pair
    seedIdxs = [bestIdx, secondBestIdx]
    maxIdx = X.shape[1] - windowLen - 1
    seedStartIdxs = _computeAllSeedIdxsFromPair(
        seedIdxs, numShifts, stepLen, maxIdx)
    seedEndIdxs = seedStartIdxs + windowLen

    timeEndSeed = timer()

    bsfScore, bsfLocs, bsfFilt = _findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        windowLen=windowLen)

    startIdxs, endIdxs = _extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax)

    timeEndFF = timer()
    # Parenthesised single argument: prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("learnFF(): seconds to find seeds, regions, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        timeEndSeed - timeStartSeed,
        timeEndFF - timeEndSeed,
        timeEndFF - timeStartSeed))

    return startIdxs, endIdxs, bsfFilt