Example #1
import time

import numpy as np

# note: Python 2 code (print statement, time.clock). `ar` (zero-padding helpers)
# and `feat` (window scoring) are project-local modules; _computeAllSeedIdxsFromPair,
# _findInstancesUsingSeedLocs, and _extractTrueLocs are defined elsewhere in the module.
def _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt, generateSeedsStep=.1):

	padLen = (len(seq) - X.shape[1]) // 2
	X = ar.addZeroCols(X, padLen, prepend=True)
	X = ar.addZeroCols(X, padLen, prepend=False)
	Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
	Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

	timeStartSeed = time.clock()

	# find seeds; i.e., candidate instance indices from which to generalize
	numShifts = int(1. / generateSeedsStep) + 1
	stepLen = int(Lmax * generateSeedsStep)
	windowLen = Lmax + stepLen

	# score all subseqs based on how much they don't look like random walks
	# when examined using different sliding window lengths
	scores = np.zeros(len(seq))
	for dim in range(seq.shape[1]):
		# compute these just once, not once per length
		dimData = seq[:, dim].flatten()
		std = np.std(dimData[1:] - dimData[:-1])
		for divideBy in [1, 2, 4, 8]:
			partialScores = feat.windowScoresRandWalk(dimData, Lmin // divideBy, std)
			scores[:len(partialScores)] += partialScores

	# figure out optimal pair based on scores of all subseqs
	bestIdx = np.argmax(scores)
	start = max(0, bestIdx - Lmin)
	end = min(len(scores), bestIdx + Lmin)
	scores[start:end] = -1 # disqualify idxs within Lmin of bestIdx (else argmax would return bestIdx again)
	secondBestIdx = np.argmax(scores)

	# compute all seed idxs from this pair
	seedIdxs = [bestIdx, secondBestIdx]
	maxIdx = X.shape[1] - windowLen - 1
	seedStartIdxs = _computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen, maxIdx)
	seedEndIdxs = seedStartIdxs + windowLen

	timeEndSeed = time.clock()

	bsfScore, bsfLocs, bsfFilt = _findInstancesUsingSeedLocs(X, Xblur,
		seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt, windowLen=windowLen)

	startIdxs, endIdxs = _extractTrueLocs(X, Xblur, bsfLocs, bsfFilt,
		windowLen, Lmin, Lmax)

	timeEndFF = time.clock()
	print "learnFF(): seconds to find seeds, regions, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
		timeEndSeed - timeStartSeed, timeEndFF - timeEndSeed, timeEndFF - timeStartSeed)

	return startIdxs, endIdxs, bsfFilt
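
A minimal usage sketch for _learnFF, assuming the project-local ar and feat modules and the _compute*/_find*/_extract* helpers are importable. The input shapes and feature matrices below are hypothetical stand-ins, chosen only to satisfy the padding arithmetic at the top of the function (X and Xblur have fewer columns than seq has samples):

import numpy as np

# hypothetical inputs: a 2-dimensional signal and sparse feature matrices
n, d = 5000, 2
seq = np.cumsum(np.random.randn(n, d), axis=0)   # random-walk-like signal
nFeatures = 40
X = (np.random.randn(nFeatures, n - 64) > 1.5).astype(np.float64)
Xblur = X.copy()                                 # stand-in for the blurred feature matrix

Lmin, Lmax, Lfilt = 100, 200, 100
startIdxs, endIdxs, bsfFilt = _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt)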
Example #2
import numpy as np

# assumes the project-local `feat` module provides windowScoresRandWalk
def _seedScores(seq, Lmin):
    # score each index by how little the surrounding windows resemble a random walk
    scores = np.zeros(len(seq))
    for dim in range(seq.shape[1]):
        # compute these just once, not once per length
        dimData = seq[:, dim].flatten()
        std = np.std(dimData[1:] - dimData[:-1])
        for divideBy in [1, 2, 4, 8]:
            length = Lmin // divideBy
            partialScores = feat.windowScoresRandWalk(dimData, length, std)
            # partialScores is shorter than seq; accumulate it into the prefix of scores
            scores[:len(partialScores)] += partialScores
    return scores
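
feat.windowScoresRandWalk is not shown on this page. A common way to score how little a window looks like a random walk is to compare the window's net displacement with the spread a random walk with the same step standard deviation would produce (roughly std * sqrt(windowLen)). The function below is only an illustrative stand-in built on that assumption, not the feat module's actual implementation:

import numpy as np

def windowScoresRandWalkSketch(x, windowLen, std):
    # one score per window start: net displacement over the window, normalized
    # by the expected displacement of a random walk with step stddev `std`
    numWindows = len(x) - windowLen + 1
    if numWindows <= 0 or std == 0:
        return np.zeros(max(numWindows, 0))
    displacements = np.abs(x[windowLen - 1:] - x[:numWindows])
    return displacements / (std * np.sqrt(windowLen))

In _seedScores these per-window scores are accumulated at window lengths Lmin, Lmin // 2, Lmin // 4, and Lmin // 8, so an index is rewarded for non-random structure at several scales.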
Example #3
def _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt, generateSeedsStep=.1):

    padLen = (len(seq) - X.shape[1]) // 2
    X = ar.addZeroCols(X, padLen, prepend=True)
    X = ar.addZeroCols(X, padLen, prepend=False)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
    Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    timeStartSeed = time.clock()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # score all subseqs based on how much they don't look like random walks
    # when examined using different sliding window lengths
    scores = np.zeros(len(seq))
    for dim in range(seq.shape[1]):
        # compute these just once, not once per length
        dimData = seq[:, dim].flatten()
        std = np.std(dimData[1:] - dimData[:-1])
        for divideBy in [1, 2, 4, 8]:
            partialScores = feat.windowScoresRandWalk(dimData,
                                                      Lmin // divideBy, std)
            scores[:len(partialScores)] += partialScores

    # figure out optimal pair based on scores of all subseqs
    bestIdx = np.argmax(scores)
    start = max(0, bestIdx - Lmin)
    end = min(len(scores), bestIdx + Lmin)
    scores[start:end] = -1  # disqualify idxs within Lmin of bestIdx (else argmax would return bestIdx again)
    secondBestIdx = np.argmax(scores)

    # compute all seed idxs from this pair
    seedIdxs = [bestIdx, secondBestIdx]
    maxIdx = X.shape[1] - windowLen - 1
    seedStartIdxs = _computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen,
                                                maxIdx)
    seedEndIdxs = seedStartIdxs + windowLen

    timeEndSeed = time.clock()

    bsfScore, bsfLocs, bsfFilt = _findInstancesUsingSeedLocs(
        X,
        Xblur,
        seedStartIdxs,
        seedEndIdxs,
        Lmin,
        Lmax,
        Lfilt,
        windowLen=windowLen)

    startIdxs, endIdxs = _extractTrueLocs(X, Xblur, bsfLocs, bsfFilt,
                                          windowLen, Lmin, Lmax)

    timeEndFF = time.clock()
    print "learnFF(): seconds to find seeds, regions, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        timeEndSeed - timeStartSeed, timeEndFF - timeEndSeed,
        timeEndFF - timeStartSeed)

    return startIdxs, endIdxs, bsfFilt