def old_findAllInstancesMAP(X, Xblur, seedStartIdx, seedEndIdx, Lmin, Lmax,
                            Lfilt, p0=-1, p0blur=-1, logs_0=None, bsfScore=0,
                            **sink):
    """Deprecated MAP-based instance finder; superseded by findAllInstancesMAP.

    Greedily grows a set of candidate instance locations for the seed window
    X[:, seedStartIdx:seedEndIdx], scoring each set size k by a log-odds
    criterion against a uniform-noise model, and returns the best scoring
    (odds, locations, filter) triple.

    Parameters
    ----------
    X : 2D array; rows are features, cols are time steps
    Xblur : 2D array, blurred version of X (same shape)
    seedStartIdx, seedEndIdx : bounds of the seed window (end exclusive)
    Lmin, Lmax, Lfilt : length parameters; only Lmin is used here (minimum
        spacing between candidate locations)
    p0, p0blur : priors on the fraction of 1s; estimated from data if <= 0
    logs_0 : optional precomputed log of the noise model theta_0
    bsfScore : unused; kept for interface compatibility
    **sink : swallows additional unused kwargs

    Returns
    -------
    (bestOdds, bestLocs, bestFilt), or (-1, None, None) if the seed window
    is empty or fewer than two candidate locations exist.
    """
    # ------------------------ stats
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
        p0blur *= 2
    if logs_0 is None:
        # noise model: every cell of the window is "on" with prob p0blur
        theta_0 = np.zeros((Xblur.shape[0], windowLen)) + p0blur
        logs_0 = np.log(theta_0)
    lamda = 0  # per-cell gain threshold; 0 keeps any positive evidence

    # ------------------------ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print("map() {}-{}: empty".format(seedStartIdx, seedEndIdx))
        return -1, None, None
    dotProds = dotProdsWithAllWindows(x0, Xblur)

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)
    bestProds = dotProds[bestIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]
    if len(idxs) < 2:  # need at least a pair to form an initial model
        return -1, None, None

    # ------------------------ now figure out which idxs should be instances
    # avg together the 2 best windows and compute the score
    idx1, idx2 = idxs[:2]
    window1 = X[:, idx1:idx1 + windowLen]
    window2 = X[:, idx2:idx2 + windowLen]
    counts = window1 + window2
    window1blur = Xblur[:, idx1:idx1 + windowLen]
    window2blur = Xblur[:, idx2:idx2 + windowLen]
    countsBlur = window1blur + window2blur

    # compute best pair filter compared to noise
    k = 2  # int so it can index arrays below; divisions cast explicitly
    theta_1 = countsBlur / float(k)
    logs_1 = np.log(theta_1)
    logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
    logDiffs = (logs_1 - logs_0)
    gains = counts * logDiffs
    threshMask = gains > lamda
    filt = logDiffs * threshMask
    logOdds = np.sum(filt * window1)

    # compare best pair filter to the nearest enemy (next-best window)
    if k < len(idxs):
        nextIdx = idxs[k]
        nextWindow = X[:, nextIdx:nextIdx + windowLen]
        logOdds -= np.sum(filt * nextWindow)

    bestOdds = logOdds
    bestFilt = filt
    bestLocs = idxs[:2]

    # try adding the rest of the idxs
    for i, idx in enumerate(idxs[2:]):
        k += 1
        # NOTE(review): the pairwise init above accumulates *unblurred* X
        # windows into `counts`, but this loop adds *blurred* windows and
        # then treats counts/k as theta_1 -- preserved as-is from original.
        window = Xblur[:, idx:idx + windowLen]

        # filter and odds vs noise
        counts += window
        theta_1 = counts / float(k)
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # BUG FIX: was reusing stale gains from k=2
        threshMask = gains > lamda
        filt = logDiffs * threshMask
        logOdds = np.sum(filt * X[:, idx:idx + windowLen])

        # penalize by the larger of chance agreement and the nearest enemy
        randomOdds = np.sum(filt) * p0blur
        nextWindowOdds = 0
        if k < len(idxs):
            nextIdx = idxs[k]
            nextWindow = X[:, nextIdx:nextIdx + windowLen]
            nextWindowOdds = np.sum(filt * nextWindow)
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:k]

    return bestOdds, bestLocs, bestFilt
def findAllInstancesMAP(X, Xblur, seedStartIdx, seedEndIdx, Lmin, Lmax, Lfilt,
                        p0=-1, p0blur=-1, logs_0=None, bsfScore=0, **sink):
    """Find the set of windows most likely to be instances of the pattern
    seeded at X[:, seedStartIdx:seedEndIdx], scored by a MAP-style log odds.

    Parameters
    ----------
    X : 2D array with values in [0, 1]; rows are features, cols are time steps
    Xblur : 2D array, blurred version of X (same shape, values in [0, 1])
    seedStartIdx, seedEndIdx : bounds of the seed window (end exclusive)
    Lmin, Lmax, Lfilt : length parameters; only Lmin is used here (minimum
        spacing between candidate locations)
    p0 : prior probability of a 1 in X; estimated from X if <= 0
    p0blur : prior probability of a 1 in Xblur; estimated from Xblur if <= 0
    logs_0 : optional precomputed log of the noise model theta_0
    bsfScore : unused; kept for interface compatibility
    **sink : swallows additional unused kwargs

    Returns
    -------
    (bestOdds, bestLocs, bestFilt), or (-1, None, None) if the seed window
    is empty. bestLocs/bestFilt are None if fewer than 2 candidates exist.
    """
    assert np.min(X) >= 0.
    assert np.max(X) <= 1.
    assert np.min(Xblur) >= 0.
    assert np.max(Xblur) <= 1.
    assert np.all(np.sum(X, axis=1) > 0)
    assert np.all(np.sum(Xblur, axis=1) > 0)

    # ================================ variable initialization
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive
    if p0 <= 0.:
        p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
    if logs_0 is None:
        # noise model: each feature fires with its marginal probability
        featureMeans = np.mean(Xblur, axis=1, keepdims=True)
        theta_0 = np.ones((Xblur.shape[0], windowLen)) * featureMeans
        logs_0 = np.log(theta_0)
    lamda = 0  # gain threshold; 0 keeps any positive evidence

    # ================================ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print("map() {}-{}: empty".format(seedStartIdx, seedEndIdx))
        return -1, None, None
    dotProds = dotProdsWithAllWindows(x0, X)

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)
    bestProds = dotProds[bestIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]

    # ================================ now figure out which idxs should be instances
    # initialize counts with the best-matching window
    idx = idxs[0]
    counts = np.copy(X[:, idx:idx + windowLen])
    countsBlur = np.copy(Xblur[:, idx:idx + windowLen])

    bestOdds = -np.inf
    bestFilt = None
    bestLocs = None
    for i, idx in enumerate(idxs[1:]):
        # BUG FIX: k was a float (i + 2.), which breaks numpy array indexing
        # (idxs[k], idxs[:k]) on NumPy >= 1.12; keep it an int and cast
        # explicitly where float arithmetic is required.
        k = i + 2

        # update counts
        window = X[:, idx:idx + windowLen]
        windowBlur = Xblur[:, idx:idx + windowLen]
        counts += window
        countsBlur += windowBlur

        # our params
        theta_1 = countsBlur / float(k)  # float() guards py2 int division
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(logs_1)] = -999  # any non-inf number--will be masked by counts

        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # *must* use this so -999 is masked
        threshMask = gains > lamda
        threshMask *= theta_1 > .5  # also require majority agreement
        filt = logDiffs * threshMask

        logOdds = np.sum(counts * filt)
        # penalize by the larger of chance agreement and the nearest enemy
        randomOdds = np.sum(filt) * p0blur * k
        nextWindowOdds = -np.inf
        if k < len(idxs):
            nextIdx = idxs[k]
            nextWindow = X[:, nextIdx:nextIdx + windowLen]
            nextWindowOdds = np.sum(filt * nextWindow) * k
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:k]
        # print("k={}; log odds {}".format(k, logOdds))

    return bestOdds, bestLocs, bestFilt
def old_findAllInstancesMAP(X, Xblur, seedStartIdx, seedEndIdx, Lmin, Lmax,
                            Lfilt, p0=-1, p0blur=-1, logs_0=None, bsfScore=0,
                            **sink):
    """Deprecated MAP-based instance finder; superseded by findAllInstancesMAP.

    Greedily grows a set of candidate instance locations for the seed window
    X[:, seedStartIdx:seedEndIdx], scoring each set size k by a log-odds
    criterion against a uniform-noise model, and returns the best scoring
    (odds, locations, filter) triple.

    Parameters
    ----------
    X : 2D array; rows are features, cols are time steps
    Xblur : 2D array, blurred version of X (same shape)
    seedStartIdx, seedEndIdx : bounds of the seed window (end exclusive)
    Lmin, Lmax, Lfilt : length parameters; only Lmin is used here (minimum
        spacing between candidate locations)
    p0, p0blur : priors on the fraction of 1s; estimated from data if <= 0
    logs_0 : optional precomputed log of the noise model theta_0
    bsfScore : unused; kept for interface compatibility
    **sink : swallows additional unused kwargs

    Returns
    -------
    (bestOdds, bestLocs, bestFilt), or (-1, None, None) if the seed window
    is empty or fewer than two candidate locations exist.
    """
    # ------------------------ stats
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
        p0blur *= 2
    if logs_0 is None:
        # noise model: every cell of the window is "on" with prob p0blur
        theta_0 = np.zeros((Xblur.shape[0], windowLen)) + p0blur
        logs_0 = np.log(theta_0)
    lamda = 0  # per-cell gain threshold; 0 keeps any positive evidence

    # ------------------------ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print("map() {}-{}: empty".format(seedStartIdx, seedEndIdx))
        return -1, None, None
    dotProds = dotProdsWithAllWindows(x0, Xblur)

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)
    bestProds = dotProds[bestIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]
    if len(idxs) < 2:  # need at least a pair to form an initial model
        return -1, None, None

    # ------------------------ now figure out which idxs should be instances
    # avg together the 2 best windows and compute the score
    idx1, idx2 = idxs[:2]
    window1 = X[:, idx1:idx1 + windowLen]
    window2 = X[:, idx2:idx2 + windowLen]
    counts = window1 + window2
    window1blur = Xblur[:, idx1:idx1 + windowLen]
    window2blur = Xblur[:, idx2:idx2 + windowLen]
    countsBlur = window1blur + window2blur

    # compute best pair filter compared to noise
    k = 2  # int so it can index arrays below; divisions cast explicitly
    theta_1 = countsBlur / float(k)
    logs_1 = np.log(theta_1)
    logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
    logDiffs = (logs_1 - logs_0)
    gains = counts * logDiffs
    threshMask = gains > lamda
    filt = logDiffs * threshMask
    logOdds = np.sum(filt * window1)

    # compare best pair filter to the nearest enemy (next-best window)
    if k < len(idxs):
        nextIdx = idxs[k]
        nextWindow = X[:, nextIdx:nextIdx + windowLen]
        logOdds -= np.sum(filt * nextWindow)

    bestOdds = logOdds
    bestFilt = filt
    bestLocs = idxs[:2]

    # try adding the rest of the idxs
    for i, idx in enumerate(idxs[2:]):
        k += 1
        # NOTE(review): the pairwise init above accumulates *unblurred* X
        # windows into `counts`, but this loop adds *blurred* windows and
        # then treats counts/k as theta_1 -- preserved as-is from original.
        window = Xblur[:, idx:idx + windowLen]

        # filter and odds vs noise
        counts += window
        theta_1 = counts / float(k)
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(logs_1)] = -99999.  # big negative num
        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # BUG FIX: was reusing stale gains from k=2
        threshMask = gains > lamda
        filt = logDiffs * threshMask
        logOdds = np.sum(filt * X[:, idx:idx + windowLen])

        # penalize by the larger of chance agreement and the nearest enemy
        randomOdds = np.sum(filt) * p0blur
        nextWindowOdds = 0
        if k < len(idxs):
            nextIdx = idxs[k]
            nextWindow = X[:, nextIdx:nextIdx + windowLen]
            nextWindowOdds = np.sum(filt * nextWindow)
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:k]

    return bestOdds, bestLocs, bestFilt
def findAllInstancesMAP(X, Xblur, seedStartIdx, seedEndIdx, Lmin, Lmax, Lfilt,
                        p0=-1, p0blur=-1, logs_0=None, bsfScore=0, **sink):
    """Find the set of windows most likely to be instances of the pattern
    seeded at X[:, seedStartIdx:seedEndIdx], scored by a MAP-style log odds.

    Parameters
    ----------
    X : 2D array with values in [0, 1]; rows are features, cols are time steps
    Xblur : 2D array, blurred version of X (same shape, values in [0, 1])
    seedStartIdx, seedEndIdx : bounds of the seed window (end exclusive)
    Lmin, Lmax, Lfilt : length parameters; only Lmin is used here (minimum
        spacing between candidate locations)
    p0 : prior probability of a 1 in X; estimated from X if <= 0
    p0blur : prior probability of a 1 in Xblur; estimated from Xblur if <= 0
    logs_0 : optional precomputed log of the noise model theta_0
    bsfScore : unused; kept for interface compatibility
    **sink : swallows additional unused kwargs

    Returns
    -------
    (bestOdds, bestLocs, bestFilt), or (-1, None, None) if the seed window
    is empty. bestLocs/bestFilt are None if fewer than 2 candidates exist.
    """
    assert np.min(X) >= 0.
    assert np.max(X) <= 1.
    assert np.min(Xblur) >= 0.
    assert np.max(Xblur) <= 1.
    assert np.all(np.sum(X, axis=1) > 0)
    assert np.all(np.sum(Xblur, axis=1) > 0)

    # ================================ variable initialization
    windowLen = seedEndIdx - seedStartIdx  # assume end idx not inclusive
    if p0 <= 0.:
        p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    if p0blur <= 0.:
        p0blur = np.mean(Xblur)
    if logs_0 is None:
        # noise model: each feature fires with its marginal probability
        featureMeans = np.mean(Xblur, axis=1, keepdims=True)
        theta_0 = np.ones((Xblur.shape[0], windowLen)) * featureMeans
        logs_0 = np.log(theta_0)
    lamda = 0  # gain threshold; 0 keeps any positive evidence

    # ================================ candidate location generation
    x0 = Xblur[:, seedStartIdx:seedEndIdx]
    if np.sum(x0) <= 0.:
        print("map() {}-{}: empty".format(seedStartIdx, seedEndIdx))
        return -1, None, None
    dotProds = dotProdsWithAllWindows(x0, X)

    # compute best locations to try and then sort them in decreasing order
    bestIdxs = nonOverlappingMaxima(dotProds, Lmin)
    bestProds = dotProds[bestIdxs]
    sortIdxs = np.argsort(bestProds)[::-1]
    idxs = bestIdxs[sortIdxs]

    # ================================ now figure out which idxs should be instances
    # initialize counts with the best-matching window
    idx = idxs[0]
    counts = np.copy(X[:, idx:idx + windowLen])
    countsBlur = np.copy(Xblur[:, idx:idx + windowLen])

    bestOdds = -np.inf
    bestFilt = None
    bestLocs = None
    for i, idx in enumerate(idxs[1:]):
        # BUG FIX: k was a float (i + 2.), which breaks numpy array indexing
        # (idxs[k], idxs[:k]) on NumPy >= 1.12; keep it an int and cast
        # explicitly where float arithmetic is required.
        k = i + 2

        # update counts
        window = X[:, idx:idx + windowLen]
        windowBlur = Xblur[:, idx:idx + windowLen]
        counts += window
        countsBlur += windowBlur

        # our params
        theta_1 = countsBlur / float(k)  # float() guards py2 int division
        logs_1 = np.log(theta_1)
        logs_1[np.isneginf(logs_1)] = -999  # any non-inf number--will be masked by counts

        logDiffs = (logs_1 - logs_0)
        gains = counts * logDiffs  # *must* use this so -999 is masked
        threshMask = gains > lamda
        threshMask *= theta_1 > .5  # also require majority agreement
        filt = logDiffs * threshMask

        logOdds = np.sum(counts * filt)
        # penalize by the larger of chance agreement and the nearest enemy
        randomOdds = np.sum(filt) * p0blur * k
        nextWindowOdds = -np.inf
        if k < len(idxs):
            nextIdx = idxs[k]
            nextWindow = X[:, nextIdx:nextIdx + windowLen]
            nextWindowOdds = np.sum(filt * nextWindow) * k
        penalty = max(randomOdds, nextWindowOdds)
        logOdds -= penalty

        if logOdds > bestOdds:
            bestOdds = logOdds
            bestFilt = np.copy(filt)
            bestLocs = idxs[:k]
        # print("k={}; log odds {}".format(k, logOdds))

    return bestOdds, bestLocs, bestFilt
def neighborSims1D(seq, length, numNeighbors=100, samplingAlgo='walk',
                   similarityAlgo='meanOnly', maxDist=.25, localMaxFilter=False,
                   spacedMaxFilter=False, tryNumNeighbors=-1, **sink):
    """Sample neighbor subsequences of `seq` and score every sliding window
    against each of them.

    Parameters
    ----------
    seq : 1D array (flattened if not already)
    length : subsequence length
    numNeighbors : number of neighbor rows to return; clamped to the number
        of subsequences when < 1 or too large
    samplingAlgo : one of 'std', 'var', 'unif', 'walk'; weights used to
        sample neighbor start positions
    similarityAlgo : only 'meanOnly' is supported
    maxDist : normalized squared-distance cutoff beyond which similarity is 0
    localMaxFilter : keep only relative maxima of each similarity row
    spacedMaxFilter : keep only maxima spaced at least length // 2 apart
    tryNumNeighbors : number of candidate neighbors to assess before keeping
        the best numNeighbors of them
    **sink : swallows additional unused kwargs

    Returns
    -------
    2D array of shape (numSubseqs, numNeighbors): column j holds the
    similarity of every subsequence to sampled neighbor j.
    """
    seq = seq.flatten()
    X = window.sliding_window_1D(seq, length)
    numSubseqs = X.shape[0]
    if numNeighbors < 1 or numNeighbors > numSubseqs:
        numNeighbors = numSubseqs

    # sampling weights for choosing which subsequences become neighbors
    if samplingAlgo == 'std':
        sampleWeights = np.std(X, axis=1)
    elif samplingAlgo == 'var':
        sampleWeights = np.var(X, axis=1)
    elif samplingAlgo == 'unif':
        sampleWeights = np.ones(numSubseqs)
    elif samplingAlgo == 'walk':
        sampleWeights = windowScoresRandWalk(seq, length)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    tryNumNeighbors = min(max(tryNumNeighbors, numNeighbors), numSubseqs)

    # sample neighbor start positions without replacement
    sampleWeights = sampleWeights / np.sum(sampleWeights)
    candidateIdxs = np.arange(numSubseqs)
    startIdxs = randChoice(candidateIdxs, tryNumNeighbors, replace=False,
                           p=sampleWeights)
    neighbors = X[startIdxs]

    # mean-normalize all subseqs
    X = X - np.mean(X, axis=1, keepdims=True)
    neighbors = neighbors - np.mean(neighbors, axis=1, keepdims=True)

    if similarityAlgo != 'meanOnly':
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    # extra rows for uniform output; trimmed to numNeighbors below
    sims = np.zeros((tryNumNeighbors, numSubseqs))
    for i, neighbor in enumerate(neighbors):
        variance = np.var(neighbor)
        if variance < .0001:  # flat neighbor -> leave its row all zeros
            continue
        diffs = X - neighbor
        dists = np.sum(diffs * diffs, axis=1) / length
        dists /= variance  # would be within [0, 2] if znormed
        dists[dists > maxDist] = np.inf  # too far -> similarity 0 below
        neighborSims = np.maximum(0, 1. - dists)
        if localMaxFilter:
            keepIdxs = ar.idxsOfRelativeExtrema(neighborSims.ravel(),
                                                maxima=True)
            sims[i, keepIdxs] = neighborSims[keepIdxs]
        elif spacedMaxFilter:
            keepIdxs = nonOverlappingMaxima(neighborSims, length // 2)
            sims[i, keepIdxs] = neighborSims[keepIdxs]
        else:
            sims[i] = neighborSims

    if tryNumNeighbors > numNeighbors:  # need to remove some neighbors
        # greedily take rows with most total similarity, but only counting
        # trivial matches once
        rowScores = np.zeros(len(sims))
        for i, row in enumerate(sims):
            maximaIdxs = nonOverlappingMaxima(row, length // 2)
            rowScores[i] = np.sum(row[maximaIdxs])
        bestRows = np.argsort(rowScores)[::-1][:numNeighbors]
        sims = sims[bestRows]

    return sims.T
def neighborSims1D(seq, length, numNeighbors=100, samplingAlgo='walk',
                   similarityAlgo='meanOnly', maxDist=.25, localMaxFilter=False,
                   spacedMaxFilter=False, tryNumNeighbors=-1, **sink):
    """Sample neighbor subsequences of `seq` and score every sliding window
    against each of them.

    Parameters
    ----------
    seq : 1D array (flattened if not already)
    length : subsequence length
    numNeighbors : number of neighbor rows to return; clamped to the number
        of subsequences when < 1 or too large
    samplingAlgo : one of 'std', 'var', 'unif', 'walk'; weights used to
        sample neighbor start positions
    similarityAlgo : only 'meanOnly' is supported
    maxDist : normalized squared-distance cutoff beyond which similarity is 0
    localMaxFilter : keep only relative maxima of each similarity row
    spacedMaxFilter : keep only maxima spaced at least length // 2 apart
    tryNumNeighbors : number of candidate neighbors to assess before keeping
        the best numNeighbors of them
    **sink : swallows additional unused kwargs

    Returns
    -------
    2D array of shape (numSubseqs, numNeighbors): column j holds the
    similarity of every subsequence to sampled neighbor j.
    """
    seq = seq.flatten()
    X = window.sliding_window_1D(seq, length)
    numSubseqs = X.shape[0]
    if numNeighbors < 1 or numNeighbors > numSubseqs:
        numNeighbors = numSubseqs

    # sampling weights for choosing which subsequences become neighbors
    if samplingAlgo == 'std':
        sampleWeights = np.std(X, axis=1)
    elif samplingAlgo == 'var':
        sampleWeights = np.var(X, axis=1)
    elif samplingAlgo == 'unif':
        sampleWeights = np.ones(numSubseqs)
    elif samplingAlgo == 'walk':
        sampleWeights = windowScoresRandWalk(seq, length)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    tryNumNeighbors = min(max(tryNumNeighbors, numNeighbors), numSubseqs)

    # sample neighbor start positions without replacement
    sampleWeights = sampleWeights / np.sum(sampleWeights)
    candidateIdxs = np.arange(numSubseqs)
    startIdxs = randChoice(candidateIdxs, tryNumNeighbors, replace=False,
                           p=sampleWeights)
    neighbors = X[startIdxs]

    # mean-normalize all subseqs
    X = X - np.mean(X, axis=1, keepdims=True)
    neighbors = neighbors - np.mean(neighbors, axis=1, keepdims=True)

    if similarityAlgo != 'meanOnly':
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    # extra rows for uniform output; trimmed to numNeighbors below
    sims = np.zeros((tryNumNeighbors, numSubseqs))
    for i, neighbor in enumerate(neighbors):
        variance = np.var(neighbor)
        if variance < .0001:  # flat neighbor -> leave its row all zeros
            continue
        diffs = X - neighbor
        dists = np.sum(diffs * diffs, axis=1) / length
        dists /= variance  # would be within [0, 2] if znormed
        dists[dists > maxDist] = np.inf  # too far -> similarity 0 below
        neighborSims = np.maximum(0, 1. - dists)
        if localMaxFilter:
            keepIdxs = ar.idxsOfRelativeExtrema(neighborSims.ravel(),
                                                maxima=True)
            sims[i, keepIdxs] = neighborSims[keepIdxs]
        elif spacedMaxFilter:
            keepIdxs = nonOverlappingMaxima(neighborSims, length // 2)
            sims[i, keepIdxs] = neighborSims[keepIdxs]
        else:
            sims[i] = neighborSims

    if tryNumNeighbors > numNeighbors:  # need to remove some neighbors
        # greedily take rows with most total similarity, but only counting
        # trivial matches once
        rowScores = np.zeros(len(sims))
        for i, row in enumerate(sims):
            maximaIdxs = nonOverlappingMaxima(row, length // 2)
            rowScores[i] = np.sum(row[maximaIdxs])
        bestRows = np.argsort(rowScores)[::-1][:numNeighbors]
        sims = sims[bestRows]

    return sims.T