Example 1
def old_findAllInstancesMAP(X,
                            Xblur,
                            seedStartIdx,
                            seedEndIdx,
                            Lmin,
                            Lmax,
                            Lfilt,
                            p0=-1,
                            p0blur=-1,
                            logs_0=None,
                            bsfScore=0,
                            **sink):

    # print "map(): X, Xblur shape", X.shape, Xblur.shape

    # ------------------------ stats
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive

    if p0 <= 0.:
        p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
        p0blur *= 2
    if logs_0 is None:
        # featureMeans = np.mean(Xblur, axis=1).reshape((-1, 1))
        # featureMeans = np.mean(Xblur, axis=1).reshape((-1, 1)) * 2
        # theta_0 = np.ones((Xblur.shape[0], windowLen)) * featureMeans
        theta_0 = np.zeros((Xblur.shape[0], windowLen)) + p0blur
        # theta_0 = np.zeros((Xblur.shape[0], windowLen)) + 2 * p0blur
        theta_0c = 1. - theta_0
        logs_0 = np.log(theta_0)
        logs_0c = np.log(theta_0c)
        # logs_0 = np.zeros((Xblur.shape[0], windowLen)) + np.log(p0blur) # overzealous
        # logs_0c = np.zeros((Xblur.shape[0], windowLen)) + np.log(1. - p0blur)
        # logs_0 = np.zeros((Xblur.shape[0], windowLen)) + np.log(p0) # works
    # print "logs_0 nan at:", np.where(np.isnan(logs_0))[0]

    # lamda = -2 * np.log(p0blur) - .001 # just below what we get with 2 instances
    # lamda = -np.log(p0blur) - .001 # actually, including log(theta_0), should be this
    lamda = 0
    # print "lambda", lamda

    # minSim = p0
    # expectedOnesPerWindow = p0blur * X.shape[0] * windowLen
    # noiseSz = p0blur * expectedOnesPerWindow # num ones to begin with

    # ------------------------ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print "map() {}-{}: empty".format(seedStartIdx, seedEndIdx)
        return -1, None, None
    # dotProds = dotProdsWithAllWindows(x0, X)
    dotProds = dotProdsWithAllWindows(x0, Xblur)
    # assert(np.min(dotProds) >= 0.)
    # dotProds[dotProds < noiseSz] = 0. # don't even consider places worse than noise

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)
    # bestIdxs = sub.optimalAlignment(dotProds, Lmin)

    bestProds = dotProds[bestIdxs]
    # keepIdxs = np.where(bestProds > noiseSz)[0]
    # bestIdxs = bestIdxs[keepIdxs]
    # bestProds = bestProds[keepIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]

    # ------------------------ now figure out which idxs should be instances
    # avg together the 2 best windows and compute the score
    idx1, idx2 = idxs[:2]  # assumes at least two candidate locations
    window1 = X[:, idx1:idx1 + windowLen]
    window2 = X[:, idx2:idx2 + windowLen]
    counts = window1 + window2
    window1blur = Xblur[:, idx1:idx1 + windowLen]
    window2blur = Xblur[:, idx2:idx2 + windowLen]
    countsBlur = window1blur + window2blur

    # compute best pair filter compared to noise
    k = 2.
    theta_1 = countsBlur / k
    # logs_1 = np.zeros(theta_1.shape)
    logs_1 = np.log(theta_1)
    logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
    # logs_1c = np.log(1. - theta_1)
    # logs_1c[np.isneginf(logs_1c)] = -99999.
    # gains = k * (logs_1 - logs_0) - lamda
    # gains = counts * (logs_1 - logs_0) - lamda
    # filt = gains * (gains > 0)
    # logOdds = np.sum(filt)
    logDiffs = (logs_1 - logs_0)
    gains = counts * logDiffs
    threshMask = gains > lamda
    # gains += (k - counts) * (logs_1c - logs_0c)
    filt = logDiffs * threshMask
    # logOdds = np.sum(filt) - lamda * np.count_nonzero(filt)
    # logOdds = np.sum(filt)

    # subtract1 = np.minimum(window1, theta_1)
    # subtract2 = np.minimum(window1, theta_1)
    # subtracts = [subtract1, subtract2]
    # for idx, subVal in zip(idxs[:2], subtracts):
    # 	Xblur[:, idx:idx+windowLen] -= subVal

    # # compute best pair filter compared to nearest enemy
    # if len(idxs) > 2:
    # 	idx = idxs[k]
    # 	nextWindow = Xblur[:, idx:idx+windowLen]
    # 	# theta_e = nextWindow
    # 	theta_e = np.maximum(nextWindow, theta_0)
    # 	logs_e = np.log(theta_e)
    # 	logs_e[np.isneginf(logs_e)] = -99999.
    # 	# logs_ec = np.log(1. - theta_e)
    # 	# logs_ec[np.isneginf(logs_ec)] = -99999.
    # 	gains_e = counts * (logs_1 - logs_e)
    # 	# gains_e = nextWindow * (logs_1 - logs_e)
    # 	filt_e = gains * (gains_e > lamda)
    # 	# logOdds_e = np.sum(filt_e) - lamda * np.count_nonzero(filt_e)
    # 	logOdds_e = np.sum(filt_e)
    # 	# if logOdds_e < logOdds:
    # 	if True:
    # 		# print("k=2; enemy log odds {} < noise log odds {}".format(
    # 		# 	logOdds_e, logOdds))
    # 		logOdds = logOdds_e

    # logOdds = np.sum(filt * window1) - lamda * np.count_nonzero(filt)
    # logOdds = np.sum(filt * window1)

    logOdds = np.sum(filt * X[:, idx1:idx1 + windowLen])
    # compute best pair filter compared to nearest enemy
    if k < len(idxs):
        idx = idxs[int(k)]  # cast: k is a float, invalid as an array index
        # nextWindow = Xblur[:, idx:idx+windowLen]
        nextWindow = X[:, idx:idx + windowLen]
        nextWindowOdds = np.sum(filt * nextWindow)
        # nextWindowOdds -= lamda * np.count_nonzero(filt)
        logOdds -= nextWindowOdds

    # logOdds = np.sum(filt * counts) - lamda * np.count_nonzero(filt)
    # # compute best pair filter compared to nearest enemy
    # if k < len(idxs):
    # 	idx = idxs[k]
    # 	nextWindowBlur = Xblur[idx:idx+windowLen]
    # 	theta_e = nextWindowBlur
    # 	logs_e = np.log(theta_e)
    # 	logs_e[np.isneginf(logs_e)] = -99999.
    # 	filt = logs_e
    # 	# logDiffs = (logs_1 - logs_0)
    # 	# gains = counts * logDiffs
    # 	# threshMask = gains > lamda
    # 	# # gains += (k - counts) * (logs_1c - logs_0c)
    # 	# filt = logDiffs * threshMask
    # 	gains_e =
    # 	logOdds_e =

    # if np.mean(x0) > (p0blur * 2):
    # 	fig, axes = plt.subplots(2, 5, figsize=(10,8))
    # 	axes = axes.flatten()
    # 	plt.suptitle("{}-{}".format(idx1, idx2))
    # 	axes[0]
    # 	viz.imshowBetter(window1, ax=axes[0])
    # 	viz.imshowBetter(window2, ax=axes[1])
    # 	viz.imshowBetter(counts, ax=axes[2])
    # 	viz.imshowBetter(gains, ax=axes[3])
    # 	viz.imshowBetter(filt, ax=axes[4])
    # 	axes[4].set_title("{}".format(logOdds))

    # 	viz.imshowBetter(X[:, idx1:idx1+windowLen], ax=axes[5])
    # 	viz.imshowBetter(X[:, idx2:idx2+windowLen], ax=axes[6])

    bestOdds = logOdds
    bestFilt = filt
    bestLocs = idxs[:2]

    # print "map() {} -> {} {}:".format(seedStartIdx, idx1, idx2), logOdds, bestLocs
    # print "map() {}: k=2, total prob: {}".format(seedStartIdx, np.sum(theta_1))
    # return logOdds, bestLocs, bestFilt # TODO remove after debug

    # Alright, so what I want the right answer to be is the biggest patch
    # of black I can get, subtracting off the next-biggest patch of black
    # (weighting each by the filter or something)
    # -so maybe that's the log odds of the Nth loc - the log odds of the (N+1)th loc
    #	-and note that these are log odds *of the loc being an instance*, not
    # 	the log odds of the filter as a whole
    #		-although pretty sure there's some formulation of filter log odds
    #		such that these are equivalent--it just isn't my current logOdds var

    # Xblur = np.copy(Xblur)
    # Xblur[:, idx1:idx1+windowLen] = np.maximum(0, Xblur[:, idx1:idx1+windowLen] - theta_1)
    # Xblur[:, idx2:idxs2+windowLen] = np.maximum(0, Xblur[:, idx2:idx2+windowLen] - theta_1)

    # try adding the rest of the idxs
    for i, idx in enumerate(idxs[2:]):
        k += 1
        window = Xblur[:, idx:idx + windowLen]

        # filter and odds vs noise
        counts += window
        theta_1 = counts / k
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
        # logs_1c = np.log(1. - theta_1)
        # logs_1c[np.isneginf(logs_1c)] = -99999.
        # gains = k * (logs_1 - logs_0) - lamda
        # gains = counts * (logs_1 - logs_0) - lamda
        # filt = gains * (gains > 0)
        # logOdds = np.sum(filt)
        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # recompute with the updated counts
        threshMask = gains > lamda
        # gains += (k - counts) * (logs_1c - logs_0c)
        filt = logDiffs * threshMask
        # logOdds = np.sum(filt) - lamda * np.count_nonzero(filt)
        # logOdds = np.sum(filt)

        # logOdds = np.sum(filt * window) - lamda * np.count_nonzero(filt)
        # logOdds = np.sum(filt * window)

        logOdds = np.sum(filt * X[:, idx:idx + windowLen])
        # compute best pair filter compared to nearest enemy
        # randomOdds = np.sum(filt) # Wait, was this even intentional
        randomOdds = np.sum(filt) * p0blur
        nextWindowOdds = 0
        if k < len(idxs):
            idx = idxs[int(k)]  # cast: k is a float
            # nextWindow = Xblur[:, idx:idx+windowLen]
            nextWindow = X[:, idx:idx + windowLen]
            nextWindowOdds = np.sum(filt * nextWindow)
            # nextWindowOdds -= lamda * np.count_nonzero(filt)
            # logOdds -= nextWindowOdds
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        # subtract = np.minimum(window, theta_1)
        # subtracts.append(subtract)
        # Xblur[:, idx:idx+windowLen] -= subtract

        # # filter and odds vs nearest enemy
        # if k < len(idxs):
        # 	idx = idxs[k]
        # 	nextWindow = Xblur[:, idx:idx+windowLen]
        # 	# theta_e = nextWindow
        # 	theta_e = np.maximum(nextWindow, theta_0)
        # 	logs_e = np.log(theta_e)
        # 	logs_e[np.isneginf(logs_e)] = -99999.
        # 	# logs_ec = np.log(1. - theta_e)
        # 	# logs_ec[np.isneginf(logs_ec)] = -99999.
        # 	gains_e = counts * (logs_1 - logs_e)
        # 	# gains_e = nextWindow * (logs_1 - logs_e)
        # 	# gains_e += (k - counts) * (logs_1c - logs_ec)
        # 	filt_e = gains * (gains_e > lamda)
        # 	# logOdds_e = np.sum(filt_e) - lamda * np.count_nonzero(filt_e)
        # 	logOdds_e = np.sum(filt_e)
        # 	# logOdds = min(logOdds, logOdds_e)
        # 	# if logOdds_e < logOdds:
        # 	if True:
        # 		# print("k={}; enemy log odds {} < noise log odds {}".format(
        # 		# 	k, logOdds_e, logOdds))
        # 		logOdds = logOdds_e

        # Xblur[:, idx:idx+windowLen] = np.maximum(0, Xblur[:, idx:idx+windowLen] - theta_1)

        # print "map() {}: k={}, total prob: {}".format(seedStartIdx, k, np.sum(theta_1))

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:int(k)]
            # print("k={}; log odds {}".format(k, logOdds))

    # print "map() {}: odds, locs {} {}".format(seedStartIdx, bestOdds, len(bestLocs))

    # for idx, subVal in zip(idxs[:len(subtracts)], subtracts):
    # 	Xblur[:, idx:idx+windowLen] += subVal

    return bestOdds, bestLocs, bestFilt
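
Both MAP routines assume `import numpy as np` at module level plus helpers defined elsewhere. One of them, dotProdsWithAllWindows, never appears in this section; judging from the call sites, it returns the dot product of the seed window against every same-length window of the data. A minimal sketch under that assumption (name and semantics inferred from usage; a real implementation would likely use FFT-based correlation for speed):

import numpy as np

def dotProdsWithAllWindows(x0, X):
    # x0: seed window, shape (D, L); X: full data, shape (D, N) with N >= L.
    # Entry t of the result is the elementwise dot product of x0 with
    # X[:, t:t+L], so the output has length N - L + 1.
    D, L = x0.shape
    N = X.shape[1]
    out = np.empty(N - L + 1)
    for t in range(N - L + 1):
        out[t] = np.sum(x0 * X[:, t:t + L])
    return out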
Example 2
def findAllInstancesMAP(X,
                        Xblur,
                        seedStartIdx,
                        seedEndIdx,
                        Lmin,
                        Lmax,
                        Lfilt,
                        p0=-1,
                        p0blur=-1,
                        logs_0=None,
                        bsfScore=0,
                        **sink):

    assert (np.min(X) >= 0.)
    assert (np.max(X) <= 1.)
    assert (np.min(Xblur) >= 0.)
    assert (np.max(Xblur) <= 1.)
    assert (np.all(np.sum(X, axis=1) > 0))
    assert (np.all(np.sum(Xblur, axis=1) > 0))

    # ================================ variable initialization
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive

    if p0 <= 0.:
        p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
    if logs_0 is None:
        # theta_0 = np.zeros(windowShape) + p0blur
        featureMeans = np.mean(Xblur, axis=1, keepdims=True)
        # featureSums = np.sum(Xblur, axis=1, keepdims=True) + 1. # regularize
        # featureMeans = featureSums / X.shape[1]
        theta_0 = np.ones((Xblur.shape[0], windowLen)) * featureMeans
        logs_0 = np.log(theta_0)

    lamda = 0

    # ================================ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print "map() {}-{}: empty".format(seedStartIdx, seedEndIdx)
        return -1, None, None
    dotProds = dotProdsWithAllWindows(x0, X)

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)

    bestProds = dotProds[bestIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]

    # ================================ now figure out which idxs should be instances

    # initialize counts
    idx = idxs[0]
    counts = np.copy(X[:, idx:idx + windowLen])
    countsBlur = np.copy(Xblur[:, idx:idx + windowLen])

    bestOdds = -np.inf
    bestFilt = None
    bestLocs = None
    for i, idx in enumerate(idxs[1:]):
        k = i + 2.

        # update counts
        window = X[:, idx:idx + windowLen]
        windowBlur = Xblur[:, idx:idx + windowLen]
        counts += window
        countsBlur += windowBlur

        # our params
        theta_1 = countsBlur / k
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(
            logs_1)] = -999  # any non-inf number--will be masked by counts

        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # *must* use this so -999 is masked
        threshMask = gains > lamda
        # threshMask = logDiffs > 0
        threshMask *= theta_1 > .5
        # threshMask = theta_1 > .5
        # gains += (k - counts) * (logs_1c - logs_0c)
        filt = logDiffs * threshMask

        logOdds = np.sum(counts * filt)

        randomOdds = np.sum(filt) * p0blur * k
        nextWindowOdds = -np.inf
        if k < len(idxs):
            idx = idxs[int(k)]  # cast: k is a float, invalid as an array index
            nextWindow = X[:, idx:idx + windowLen]
            # nextWindowOdds = np.sum(filt * nextWindow)
            nextWindowOdds = np.sum(filt * nextWindow) * k
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:int(k)]
            # print("k={}; log odds {}".format(k, logOdds))

    # print "map() {}: odds, locs {} {}".format(seedStartIdx, bestOdds, len(bestLocs))
    return bestOdds, bestLocs, bestFilt
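
The candidate-generation step in both functions also leans on nonOverlappingMaxima(dotProds, Lmin), which is not shown here. A plausible greedy reading, assumed from the name and from how the result is used (indices into dotProds, later re-sorted by score), is:

import numpy as np

def nonOverlappingMaxima(x, minSpacing):
    # Greedily accept indices in decreasing order of x's value, skipping any
    # index closer than minSpacing to one already accepted. Returned in
    # positional order; the callers above re-sort them by score.
    order = np.argsort(x)[::-1]
    chosen = []
    for idx in order:
        if all(abs(idx - c) >= minSpacing for c in chosen):
            chosen.append(idx)
    return np.array(sorted(chosen), dtype=int)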
Example 3
def neighborSims1D(seq,
                   length,
                   numNeighbors=100,
                   samplingAlgo='walk',
                   similarityAlgo='meanOnly',
                   maxDist=.25,
                   localMaxFilter=False,
                   spacedMaxFilter=False,
                   tryNumNeighbors=-1,
                   **sink):
    # spacedMaxFilter=True, tryNumNeighbors=-1, **sink):

    # print "neighborSims1D(); seq shape, requested len, requested count"
    # print seq.shape, length, numNeighbors

    seq = seq.flatten()
    X = window.sliding_window_1D(seq, length)
    numSubseqs = X.shape[0]

    if numNeighbors < 1 or numNeighbors > numSubseqs:
        numNeighbors = numSubseqs
        # origNumNeighbors = numNeighbors
    # elif baseLength:
    # 	origNumNeighbors = numNeighbors
    # 	numNeighbors = int(numNeighbors * float(length) / baseLength)

    if samplingAlgo == 'std':
        probs = np.std(X, axis=1)
    elif samplingAlgo == 'var':
        probs = np.var(X, axis=1)
    elif samplingAlgo == 'unif':
        probs = np.ones(numSubseqs)
    elif samplingAlgo == 'walk':
        probs = windowScoresRandWalk(seq, length)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    tryNumNeighbors = max(tryNumNeighbors, numNeighbors)
    tryNumNeighbors = min(tryNumNeighbors, numSubseqs)

    # print "neighborSims1D(); X shape ", X.shape

    # print np.var(X, axis=1)

    # allDists = pairwiseDists(X)
    # # allDists = pairwiseDists(X) / length
    # # import matplotlib.pyplot as plt
    # # from ..viz import viz_utils as viz
    # # plt.figure()
    # # viz.imshowBetter(allDists)
    # # plt.show()
    # # import sys
    # # sys.exit()

    # # closeEnough = (allDists < maxDist).astype(np.int)
    # # closeEnough = allDists < maxDist
    # closeEnough = allDists < (maxDist * length)
    # neighborCounts = np.sum(closeEnough, axis=1)
    # print neighborCounts
    # eligibleIdxs = np.where(neighborCounts > 2)[0] # self isn't a neighbor
    # # print eligibleIdxs
    # numEligibleIdxs = len(eligibleIdxs)

    # print "numSubseqs, numEligibleIdxs ", numSubseqs, numEligibleIdxs

    # select random subseqs
    probs /= np.sum(probs)
    allIdxs = np.arange(numSubseqs)
    startIdxs = randChoice(allIdxs, tryNumNeighbors, replace=False, p=probs)
    # minSpacing = length // 2
    # startIdxs = randIdxs(numSubseqs, numNeighbors, minSpacing=minSpacing,
    # 	probabilities=probs, reduceSpacingIfNeeded=True)
    # 	probabilities=probs, reduceNumIfNeeded=True)
    neighbors = X[startIdxs]

    # mean normalize all subseqs
    X = X - np.mean(X, axis=1, keepdims=True)
    neighbors = neighbors - np.mean(neighbors, axis=1, keepdims=True)

    # zNorm = True # TODO remove
    # if zNorm:
    # 	X = ar.zNormalizeRows(X)
    # 	neighbors = ar.zNormalizeRows(neighbors)

    # SELF: pick up here by ensuring sufficient features
    # import dist
    # Xsort, projDistsSort, projVects, unsortIdxs = dist.buildOrderline(X,
    # 	referenceVectAlgo='sample', norm=None)

    # allVariances = np.var(X, axis=1)
    # sortIdxs = np.argsort(allVariances)
    # allVariances = allVariances[sortIdxs]

    # sims = np.zeros((origNumNeighbors, numSubseqs)) # extra rows for uniform output
    sims = np.zeros(
        (tryNumNeighbors, numSubseqs))  # extra rows for uniform output

    if similarityAlgo == 'meanOnly':
        for i, neighbor in enumerate(neighbors):
            variance = np.var(neighbor)
            if variance < .0001:
                continue

            diffs = X - neighbor
            dists = np.sum(diffs * diffs, axis=1) / length
            dists /= variance  # would be within [0, 2] if znormed

            dists[dists > maxDist] = np.inf
            neighborSims = np.maximum(0, 1. - dists)

            # print "i, sims shape", i, neighborSims.shape

            if localMaxFilter:
                idxs = ar.idxsOfRelativeExtrema(neighborSims.ravel(),
                                                maxima=True)
                sims[i, idxs] = neighborSims[idxs]
            elif spacedMaxFilter:
                idxs = nonOverlappingMaxima(neighborSims, length // 2)
                # idxs = nonOverlappingMaxima(neighborSims, 2) # spacing of 2
                sims[i, idxs] = neighborSims[idxs]
            else:
                sims[i] = neighborSims

    else:
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    if tryNumNeighbors > numNeighbors:  # need to remove some neighbors
        # greedily take rows with most total similarity, but only counting
        # trivial matches once
        scores = np.zeros(len(sims))
        for i, row in enumerate(sims):
            maximaIdxs = nonOverlappingMaxima(row, length // 2)
            scores[i] = np.sum(row[maximaIdxs])
        sortIdxs = np.argsort(scores)[::-1]
        sims = sims[sortIdxs[:numNeighbors]]

    return sims.T
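
neighborSims1D similarly depends on window.sliding_window_1D and randChoice from elsewhere in the codebase. Minimal stand-ins consistent with the call sites (both bodies are assumptions, not the originals), followed by a toy invocation:

import numpy as np

def sliding_window_1D(seq, length):
    # One row per overlapping length-`length` window of a 1D array; stands in
    # for window.sliding_window_1D as called above.
    numSubseqs = len(seq) - length + 1
    return np.vstack([seq[i:i + length] for i in range(numSubseqs)])

def randChoice(a, size, replace=False, p=None):
    # Thin wrapper over np.random.choice matching the call site above.
    return np.random.choice(a, size=size, replace=replace, p=p)

# Toy usage: similarities of 100 sampled windows of a noisy sine wave.
# samplingAlgo='unif' avoids the windowScoresRandWalk dependency.
seq = np.sin(np.linspace(0, 20 * np.pi, 2000)) + 0.1 * np.random.randn(2000)
sims = neighborSims1D(seq, length=50, numNeighbors=100, samplingAlgo='unif')
print(sims.shape)  # (numSubseqs, numNeighbors): one column per neighbor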