Beispiel #1
0
def learnFF(seq,
            X,
            Xblur,
            Lmin,
            Lmax,
            Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1,
            padBothSides=True,
            **generalizeKwargs):
    """Pick seed windows, generalize them into a filter, and extract locations.

    Seed start indices are generated according to `generateSeedsAlgo`, reduced
    to those that leave room for a full window inside `X`, handed to
    `findInstancesUsingSeedLocs`, and the result is refined by
    `extractTrueLocs`. Progress and timing information is printed to stdout.

    Parameters
    ----------
    seq : array-like
        The raw sequence. Its length is always used; the 'walk' seed algorithm
        additionally indexes it as a 2D array (`seq[:, dim]`), and the 'pair'
        algorithm passes it to `findMotifOfLengthFast`.
    X, Xblur : 2D arrays
        Matrices whose column count is compared against `len(seq)`
        (presumably a feature matrix and a blurred copy of it with one column
        per time step -- TODO confirm against callers).
    Lmin, Lmax : int
        Length bounds forwarded to the helpers. `Lmax` also fixes the window
        length (`Lmax + int(Lmax * generateSeedsStep)`), and `Lmin` sets the
        window sizes and exclusion radius used by the 'walk' algorithm.
    Lfilt
        Forwarded unchanged to `findInstancesUsingSeedLocs`.
    generateSeedsAlgo : {'pair', 'all', 'random', 'walk'}
        How the initial seed indices are chosen.
    generalizeSeedsAlgo, extractTrueLocsAlgo
        Forwarded to `findInstancesUsingSeedLocs` and `extractTrueLocs`.
    generateSeedsStep : float
        Step between shifted seeds, as a fraction of `Lmax`.
    padBothSides : bool
        If true, zero columns are added to both ends of `X` and `Xblur`;
        `(len(seq) - X.shape[1]) // 2` columns are requested per side.
    **generalizeKwargs
        Extra keyword arguments for `findInstancesUsingSeedLocs`; a
        'windowLen' entry is always added (overwriting any supplied one).

    Returns
    -------
    (startIdxs, endIdxs, bsfFilt)
        The two values returned by `extractTrueLocs` and the filter returned
        by `findInstancesUsingSeedLocs`.

    Raises
    ------
    NotImplementedError
        If `generateSeedsAlgo` is not one of the supported names.
    """
    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    # time.clock() no longer exists on Python >= 3.8; only fall back to it on
    # interpreters that lack time.perf_counter (i.e., Python 2)
    clock = getattr(time, 'perf_counter', None) or time.clock
    tStartSeed = clock()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen
    # single-argument print() emits the same text on Python 2 and Python 3
    print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print("seedIdxs from motif:  {}".format(seedIdxs))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug

    elif generateSeedsAlgo == 'random':
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random walks
        # when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(dimData,
                                                     Lmin // divideBy,
                                                     std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        # Exclude every start position within Lmin of the best one -- the
        # best position included -- before picking the runner-up. Masking
        # only [bestIdx - Lmin, bestIdx) left bestIdx itself eligible, so the
        # second argmax simply returned the first seed again. -inf is used so
        # the mask is below any real score, whatever its sign.
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), bestIdx + Lmin)
        scores[start:end] = -np.inf
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Supported algos to generate seeds are 'pair', 'all', 'random' "
            "and 'walk'; got unrecognized algo {}".format(generateSeedsAlgo))

    # compute start and end indices of seeds to try, discarding any seed
    # whose window would not fit inside X
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
        seedStartIdxs))
    print("learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq))))

    tEndSeed = clock()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    # the best-so-far score is returned by the helper but not needed here
    _bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X,
        Xblur,
        seedStartIdxs,
        seedEndIdxs,
        Lmin,
        Lmax,
        Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo,
        **generalizeKwargs)

    startIdxs, endIdxs = extractTrueLocs(
        X,
        Xblur,
        bsfLocs,
        bsfFilt,
        windowLen,
        Lmin,
        Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = clock()
    print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

    return startIdxs, endIdxs, bsfFilt
Beispiel #2
0
def neighborSims1D(seq, length, numNeighbors=100, samplingAlgo='walk',
	similarityAlgo='meanOnly', maxDist=.25, localMaxFilter=False,
	spacedMaxFilter=False, tryNumNeighbors=-1, **sink):
	"""Similarity of every sliding window of `seq` to sampled "neighbor" windows.

	The flattened sequence is cut into windows of `length` samples via
	`window.sliding_window_1D`. A subset of windows is drawn with `randChoice`
	(called with `replace=False` and weights derived from `samplingAlgo`), and
	every window is compared to each drawn neighbor using the mean-removed
	squared distance, divided by `length` and by the neighbor's variance.

	Parameters
	----------
	seq : array
		Input data; flattened before use.
	length : int
		Window length.
	numNeighbors : int
		Number of neighbors to return. Values below 1 or above the number of
		windows mean "one per window".
	samplingAlgo : {'walk', 'std', 'var', 'unif'}
		How the sampling weight of each window is computed.
	similarityAlgo : {'meanOnly'}
		Similarity measure; only 'meanOnly' is implemented.
	maxDist : float
		Normalized distances above this value give a similarity of 0.
	localMaxFilter, spacedMaxFilter : bool
		If set, only the entries selected by `ar.idxsOfRelativeExtrema` or by
		`nonOverlappingMaxima(..., length // 2)` respectively are kept for
		each neighbor; `localMaxFilter` takes precedence.
	tryNumNeighbors : int
		Number of candidate neighbors to evaluate, clamped to lie between
		`numNeighbors` and the number of windows. When more candidates than
		`numNeighbors` are evaluated, the ones with the largest summed
		similarity over non-overlapping maxima are retained.
	**sink
		Ignored.

	Returns
	-------
	2D array
		Transposed similarity matrix: one row per window, one column per
		retained neighbor. Neighbors with variance below 1e-4 contribute an
		all-zero column.

	Raises
	------
	ValueError
		If `samplingAlgo` or `similarityAlgo` is not recognized.
	"""
	flatSeq = seq.flatten()
	windows = window.sliding_window_1D(flatSeq, length)
	numWindows = windows.shape[0]

	# an out-of-range request means "use every window"
	if numNeighbors > numWindows or numNeighbors < 1:
		numNeighbors = numWindows

	# unnormalized sampling weight for each window
	if samplingAlgo == 'walk':
		weights = windowScoresRandWalk(flatSeq, length)
	elif samplingAlgo == 'unif':
		weights = np.ones(numWindows)
	elif samplingAlgo == 'var':
		weights = np.var(windows, axis=1)
	elif samplingAlgo == 'std':
		weights = np.std(windows, axis=1)
	else:
		raise ValueError("Unrecognized sampling algorithm {}".format(samplingAlgo))

	# must assess at least as many subseqs as we want to return, and no more
	# than the largest number possible
	numTried = min(max(tryNumNeighbors, numNeighbors), numWindows)

	# draw the candidate neighbors
	# NOTE(review): if every weight is zero this normalization is 0/0 and the
	# probabilities become NaN -- confirm callers never hit that case
	weights /= np.sum(weights)
	chosenIdxs = randChoice(np.arange(numWindows), numTried, replace=False,
		p=weights)
	candidates = windows[chosenIdxs]

	# remove each window's own mean so only its shape is compared
	centered = windows - np.mean(windows, axis=1, keepdims=True)
	candidates = candidates - np.mean(candidates, axis=1, keepdims=True)

	# one row per candidate; rows of skipped (flat) candidates stay zero
	sims = np.zeros((numTried, numWindows))

	if similarityAlgo != 'meanOnly':
		raise ValueError("Unrecognized similarity algorithm {}".format(
			similarityAlgo))

	halfLength = length // 2
	for rowIdx, candidate in enumerate(candidates):
		candidateVar = np.var(candidate)
		if candidateVar < .0001:
			continue

		residuals = centered - candidate
		# mean squared difference, in units of the candidate's variance
		dists = np.sum(np.square(residuals), axis=1) / length / candidateVar

		# anything farther than maxDist is clamped to zero similarity
		dists[dists > maxDist] = np.inf
		candidateSims = np.maximum(0, 1. - dists)

		if localMaxFilter:
			keepIdxs = ar.idxsOfRelativeExtrema(candidateSims.ravel(), maxima=True)
		elif spacedMaxFilter:
			keepIdxs = nonOverlappingMaxima(candidateSims, halfLength)
		else:
			sims[rowIdx] = candidateSims
			continue
		sims[rowIdx, keepIdxs] = candidateSims[keepIdxs]

	if numTried > numNeighbors:
		# too many candidates: greedily keep the rows with the most total
		# similarity, counting trivial (overlapping) matches only once
		rowScores = np.array([
			np.sum(row[nonOverlappingMaxima(row, halfLength)]) for row in sims])
		bestFirst = np.argsort(rowScores)[::-1]
		sims = sims[bestFirst[:numNeighbors]]

	return sims.T
Beispiel #3
0
def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
	generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
	generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
	extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
	generateSeedsStep=.1, padBothSides=True, **generalizeKwargs):
	"""Pick seed windows, generalize them into a filter, and extract locations.

	Seed start indices are generated according to `generateSeedsAlgo`, reduced
	to those that leave room for a full window inside `X`, handed to
	`findInstancesUsingSeedLocs`, and the result is refined by
	`extractTrueLocs`. Progress and timing information is printed to stdout.

	Parameters
	----------
	seq : array-like
		The raw sequence. Its length is always used; the 'walk' seed algorithm
		additionally indexes it as a 2D array (`seq[:, dim]`), and the 'pair'
		algorithm passes it to `findMotifOfLengthFast`.
	X, Xblur : 2D arrays
		Matrices whose column count is compared against `len(seq)`
		(presumably a feature matrix and a blurred copy of it with one column
		per time step -- TODO confirm against callers).
	Lmin, Lmax : int
		Length bounds forwarded to the helpers. `Lmax` also fixes the window
		length (`Lmax + int(Lmax * generateSeedsStep)`), and `Lmin` sets the
		window sizes and exclusion radius used by the 'walk' algorithm.
	Lfilt
		Forwarded unchanged to `findInstancesUsingSeedLocs`.
	generateSeedsAlgo : {'pair', 'all', 'random', 'walk'}
		How the initial seed indices are chosen.
	generalizeSeedsAlgo, extractTrueLocsAlgo
		Forwarded to `findInstancesUsingSeedLocs` and `extractTrueLocs`.
	generateSeedsStep : float
		Step between shifted seeds, as a fraction of `Lmax`.
	padBothSides : bool
		If true, zero columns are added to both ends of `X` and `Xblur`;
		`(len(seq) - X.shape[1]) // 2` columns are requested per side.
	**generalizeKwargs
		Extra keyword arguments for `findInstancesUsingSeedLocs`; a
		'windowLen' entry is always added (overwriting any supplied one).

	Returns
	-------
	(startIdxs, endIdxs, bsfFilt)
		The two values returned by `extractTrueLocs` and the filter returned
		by `findInstancesUsingSeedLocs`.

	Raises
	------
	NotImplementedError
		If `generateSeedsAlgo` is not one of the supported names.
	"""
	padLen = (len(seq) - X.shape[1]) // 2
	if padBothSides:
		X = ar.addZeroCols(X, padLen, prepend=True)
		X = ar.addZeroCols(X, padLen, prepend=False)
		Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
		Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

	# time.clock() no longer exists on Python >= 3.8; only fall back to it on
	# interpreters that lack time.perf_counter (i.e., Python 2)
	clock = getattr(time, 'perf_counter', None) or time.clock
	tStartSeed = clock()

	# find seeds; i.e., candidate instance indices from which to generalize
	numShifts = int(1. / generateSeedsStep) + 1
	stepLen = int(Lmax * generateSeedsStep)
	windowLen = Lmax + stepLen
	# single-argument print() emits the same text on Python 2 and Python 3
	print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

	if generateSeedsAlgo == 'pair':
		searchLen = (Lmin + Lmax) // 2
		motif = findMotifOfLengthFast([seq], searchLen)
		seedIdxs = [motif.idx1, motif.idx2]
		print("seedIdxs from motif:  {}".format(seedIdxs))
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

	elif generateSeedsAlgo == 'all':
		seedIdxs = np.arange(X.shape[1] - windowLen) # TODO remove after debug

	elif generateSeedsAlgo == 'random':
		seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

	elif generateSeedsAlgo == 'walk':
		# score all subseqs based on how much they don't look like random walks
		# when examined using different sliding window lengths
		scores = np.zeros(len(seq))
		for dim in range(seq.shape[1]):
			# compute these just once, not once per length
			dimData = seq[:, dim].ravel()
			diffs = dimData[1:] - dimData[:-1]
			std = np.std(diffs)
			for divideBy in [1, 2, 4, 8]:
				partialScores = windowScoresRandWalk(dimData, Lmin // divideBy,
					std=std)
				scores[:len(partialScores)] += partialScores

		# figure out optimal seeds based on scores of all subseqs
		bestIdx = np.argmax(scores)
		# Exclude every start position within Lmin of the best one -- the
		# best position included -- before picking the runner-up. Masking
		# only [bestIdx - Lmin, bestIdx) left bestIdx itself eligible, so the
		# second argmax simply returned the first seed again. -inf is used so
		# the mask is below any real score, whatever its sign.
		start = max(0, bestIdx - Lmin)
		end = min(len(scores), bestIdx + Lmin)
		scores[start:end] = -np.inf
		secondBestIdx = np.argmax(scores)

		seedIdxs = [bestIdx, secondBestIdx]
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
	else:
		raise NotImplementedError(
			"Supported algos to generate seeds are 'pair', 'all', 'random' "
			"and 'walk'; got unrecognized algo {}".format(generateSeedsAlgo))

	# compute start and end indices of seeds to try, discarding any seed
	# whose window would not fit inside X
	seedStartIdxs = np.sort(np.array(seedIdxs))
	seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
	seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
	seedEndIdxs = seedStartIdxs + windowLen

	print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
		seedStartIdxs))
	print("learnFF(): fraction of idxs used as seeds: {}".format(
		len(seedStartIdxs) / float(len(seq))))

	tEndSeed = clock()

	generalizeKwargs['windowLen'] = windowLen # TODO remove after prototype

	# the best-so-far score is returned by the helper but not needed here
	_bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(X, Xblur,
		seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
		generalizeSeedsAlgo=generalizeSeedsAlgo,
		**generalizeKwargs)

	startIdxs, endIdxs = extractTrueLocs(X, Xblur, bsfLocs, bsfFilt, windowLen,
		Lmin, Lmax, extractTrueLocsAlgo=extractTrueLocsAlgo)

	tEndFF = clock()
	print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
		tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

	return startIdxs, endIdxs, bsfFilt
Beispiel #4
0
def neighborSims1D(seq,
                   length,
                   numNeighbors=100,
                   samplingAlgo='walk',
                   similarityAlgo='meanOnly',
                   maxDist=.25,
                   localMaxFilter=False,
                   spacedMaxFilter=False,
                   tryNumNeighbors=-1,
                   **sink):
    """Similarity of every sliding window of `seq` to sampled "neighbor" windows.

    The flattened sequence is cut into windows of `length` samples via
    `window.sliding_window_1D`. A subset of windows is drawn with `randChoice`
    (called with `replace=False` and weights derived from `samplingAlgo`), and
    every window is compared to each drawn neighbor using the mean-removed
    squared distance, divided by `length` and by the neighbor's variance.

    Parameters
    ----------
    seq : array
        Input data; flattened before use.
    length : int
        Window length.
    numNeighbors : int
        Number of neighbors to return. Values below 1 or above the number of
        windows mean "one per window".
    samplingAlgo : {'walk', 'std', 'var', 'unif'}
        How the sampling weight of each window is computed.
    similarityAlgo : {'meanOnly'}
        Similarity measure; only 'meanOnly' is implemented.
    maxDist : float
        Normalized distances above this value give a similarity of 0.
    localMaxFilter, spacedMaxFilter : bool
        If set, only the entries selected by `ar.idxsOfRelativeExtrema` or by
        `nonOverlappingMaxima(..., length // 2)` respectively are kept for
        each neighbor; `localMaxFilter` takes precedence.
    tryNumNeighbors : int
        Number of candidate neighbors to evaluate, clamped to lie between
        `numNeighbors` and the number of windows. When more candidates than
        `numNeighbors` are evaluated, the ones with the largest summed
        similarity over non-overlapping maxima are retained.
    **sink
        Ignored.

    Returns
    -------
    2D array
        Transposed similarity matrix: one row per window, one column per
        retained neighbor. Neighbors with variance below 1e-4 contribute an
        all-zero column.

    Raises
    ------
    ValueError
        If `samplingAlgo` or `similarityAlgo` is not recognized.
    """
    flatSeq = seq.flatten()
    windows = window.sliding_window_1D(flatSeq, length)
    numWindows = windows.shape[0]

    # an out-of-range request means "use every window"
    if numNeighbors > numWindows or numNeighbors < 1:
        numNeighbors = numWindows

    # unnormalized sampling weight for each window
    if samplingAlgo == 'walk':
        weights = windowScoresRandWalk(flatSeq, length)
    elif samplingAlgo == 'unif':
        weights = np.ones(numWindows)
    elif samplingAlgo == 'var':
        weights = np.var(windows, axis=1)
    elif samplingAlgo == 'std':
        weights = np.std(windows, axis=1)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    numTried = min(max(tryNumNeighbors, numNeighbors), numWindows)

    # draw the candidate neighbors
    # NOTE(review): if every weight is zero this normalization is 0/0 and the
    # probabilities become NaN -- confirm callers never hit that case
    weights /= np.sum(weights)
    chosenIdxs = randChoice(np.arange(numWindows),
                            numTried,
                            replace=False,
                            p=weights)
    candidates = windows[chosenIdxs]

    # remove each window's own mean so only its shape is compared
    centered = windows - np.mean(windows, axis=1, keepdims=True)
    candidates = candidates - np.mean(candidates, axis=1, keepdims=True)

    # one row per candidate; rows of skipped (flat) candidates stay zero
    sims = np.zeros((numTried, numWindows))

    if similarityAlgo != 'meanOnly':
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    halfLength = length // 2
    for rowIdx, candidate in enumerate(candidates):
        candidateVar = np.var(candidate)
        if candidateVar < .0001:
            continue

        residuals = centered - candidate
        # mean squared difference, in units of the candidate's variance
        dists = np.sum(np.square(residuals), axis=1) / length / candidateVar

        # anything farther than maxDist is clamped to zero similarity
        dists[dists > maxDist] = np.inf
        candidateSims = np.maximum(0, 1. - dists)

        if localMaxFilter:
            keepIdxs = ar.idxsOfRelativeExtrema(candidateSims.ravel(),
                                                maxima=True)
        elif spacedMaxFilter:
            keepIdxs = nonOverlappingMaxima(candidateSims, halfLength)
        else:
            sims[rowIdx] = candidateSims
            continue
        sims[rowIdx, keepIdxs] = candidateSims[keepIdxs]

    if numTried > numNeighbors:
        # too many candidates: greedily keep the rows with the most total
        # similarity, counting trivial (overlapping) matches only once
        rowScores = np.array([
            np.sum(row[nonOverlappingMaxima(row, halfLength)]) for row in sims
        ])
        bestFirst = np.argsort(rowScores)[::-1]
        sims = sims[bestFirst[:numNeighbors]]

    return sims.T