def notSoRandWalkSeq(n=500, exampleLengths=None, noiseStd=.5):
    """Generate a `synth.notSoRandomWalk` sequence and embed examples in it.

    Parameters
    ----------
    n : int
        Passed to ``synth.notSoRandomWalk`` as its first argument
        (presumably the sequence length -- confirm against ``synth``).
    exampleLengths : list of int, optional
        Forwarded unchanged to ``embedExamples``. Defaults to a fresh
        ``[55, 60, 65]`` on every call.
    noiseStd : float
        Passed to ``synth.notSoRandomWalk`` as ``std``.

    Returns
    -------
    Whatever ``embedExamples(seq, exampleLengths)`` returns.
    """
    # A list literal in the signature would be created once and shared by
    # every call; build it here so a helper that mutates it cannot leak
    # state into later calls.
    if exampleLengths is None:
        exampleLengths = [55, 60, 65]
    seq = synth.notSoRandomWalk(n, std=noiseStd,
                                trendFilterLength=80, lpfLength=16)
    return embedExamples(seq, exampleLengths)
def randWalkSeq(n=500, exampleLengths=None, noiseStd=.5):
    """Generate a `synth.randwalk` sequence and embed examples in it.

    Parameters
    ----------
    n : int
        Passed to ``synth.randwalk`` as its first argument (presumably the
        sequence length -- confirm against ``synth``).
    exampleLengths : list of int, optional
        Forwarded unchanged to ``embedExamples``. Defaults to a fresh
        ``[55, 60, 65]`` on every call.
    noiseStd : float
        Passed to ``synth.randwalk`` as ``std``.

    Returns
    -------
    Whatever ``embedExamples(seq, exampleLengths)`` returns.
    """
    # A list literal in the signature would be created once and shared by
    # every call; build it here so a helper that mutates it cannot leak
    # state into later calls.
    if exampleLengths is None:
        exampleLengths = [55, 60, 65]
    seq = synth.randwalk(n, std=noiseStd)
    return embedExamples(seq, exampleLengths)
def main():
    """Run the similarity-matrix pattern search on one recording and plot it.

    Pipeline, in order:

    1. Build a synthetic sequence (kept from the original script even though
       it is immediately replaced by the recording loaded next).
    2. Load one recording via ``read_msrc``, keep columns 24:27, downsample
       the rows, and pad both ends with copies of the first / last row.
    3. Compute the similarity matrix ``X``, zero entries below ``minSim``,
       local-max filter it, and build a blurred copy ``Xblur`` clipped to 1.
    4. For each window location, pick candidate matching locations with
       ``sub.optimalAlignment``, then greedily add them in descending order
       of similarity, scoring each prefix of ``k`` locations.
    5. Use ``maxSubarray`` on the column sums of the best averaged window to
       pick the pattern extent, and draw rectangles on the plots.

    Takes no arguments and returns ``None``; output is printed text and
    matplotlib figures. If no candidate set ever beats a score of 0 the
    figures produced so far are shown and the function returns early.

    NOTE(review): another ``def main()`` appears later in this file; if both
    live in the same module, the later definition shadows this one.
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    n = 500  # also tried: 1000, 300
    length = 32  # also tried: 8, 16, 50
    exampleLengths = [55, 60, 65]  # also tried: [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = .5  # loose cutoff for what counts as similar

    answerIdxs = None

    # ------------------------ synthetic data
    # NOTE: this sequence is overwritten by the recording below; the calls
    # are retained so any side effects of the helpers stay unchanged.
    # alternatives tried: synth.randconst(n, std=noiseStd);
    #   synth.notSoRandomWalk(..., lpfLength=4)
    seq = synth.notSoRandomWalk(n, std=noiseStd,
                                trendFilterLength=80, lpfLength=16)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc
    from ..datasets import read_msrc as msrc
    idxs = [2]  # also tried: [0]
    downsampleBy = 2
    recordings = msrc.getRecordings(idxs=idxs)
    r = list(recordings)[0]
    # other column subsets tried: all, [:, :40], [:, 20:23], [:, 20:27]
    seq = r.data[:, 24:27]
    print("orig seq shape {}".format(seq.shape))
    seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
    print("downsampled seq shape {}".format(seq.shape))

    # `//` keeps these lengths integral on Python 3 as well as Python 2;
    # they are used as array shapes below. `length` deliberately uses the
    # *old* Lmin, exactly as the original statement order did.
    length = max(8, Lmin // 2)
    Lmin = len(seq) // 20
    Lmax = len(seq) // 10  # also tried: len(seq) / 20
    minSim = .5
    # Left as `/` on purpose: only used to draw vertical lines, and the
    # dtype of gestureIdxs is not visible here -- TODO confirm.
    answerIdxs = r.gestureIdxs / downsampleBy

    prePadLen = Lmax - length
    postPadLen = length - 1
    first = np.tile(seq[0], (prePadLen, 1))
    last = np.tile(seq[-1], (postPadLen, 1))
    # pad with fixed val to allow all window positions
    # ^ TODO pad simMat with zeros instead--this introduces fake subseqs
    seq = np.vstack((first, seq, last))
    answerIdxs += prePadLen

    # ================================ simMat
    X = computeSimMat(seq, length)
    X[X < minSim] = 0.
    X = ff2.localMaxFilterSimMat(X)
    # alternative tried: filter length Lmin-1 instead of length-1
    Xblur = ff2.filterSimMat(X, length - 1, 'hamming',
                             scaleFilterMethod='max1')
    Xblur = np.minimum(Xblur, 1.)

    print("simMat dims: {}".format(X.shape))
    Xnonzeros = np.count_nonzero(X)
    print("simMat nonzeros, total, frac =  {} {} {}".format(
        Xnonzeros, X.size, Xnonzeros / float(X.size)))

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)
    Xpad = synth.appendZeros(X, length - 1)
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    axSeq.set_title("Time Series")
    axSim.set_title("Similarities Matrix")

    # ================================ science
    # ------------------------ derived stats
    # Round to nearest. The original `int(X.shape[1] / Lmin + .5)` floored
    # under Python 2 integer division, so the +.5 had no effect.
    kMax = int(X.shape[1] / float(Lmin) + .5)
    windowLen = Lmax - length + 1
    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)

    # alternative tried: colSims = np.dot(X.T, X)
    colSims = np.dot(X.T, Xblur)
    # zeros except 1s on diag: convolving with this sums colSims along
    # diagonals of length windowLen
    filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen))
    windowSims = sig.convolve2d(colSims, filt, mode='valid')

    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    plt.figure()
    plt.imshow(windowSims, interpolation='nearest', aspect='auto')

    # ------------------------ find stuff
    #
    # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection.
    #
    # (Commented-out earlier variants were dropped for readability: one
    # where k is given via sub.optimalAlignK, one scoring the plain
    # intersection, one using nearest-enemy distance with intersections,
    # and one that did not sort the indices but cared about overlap.)
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))

        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:  # highest score is kMax identical locs
            continue

        # best combination of idxs such that none are within Lmin of each
        # other; uses the full row, which goes past end of ts, but better
        idxs = sub.optimalAlignment(row, Lmin)

        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]

        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early
        # abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)
        nextSz = np.sum(intersection)
        # builtin float: the np.float alias was removed from NumPy
        nextFilt = np.array(intersection, dtype=float)
        nextFiltSum = np.array(nextFilt, dtype=float)
        for k in range(1, numIdxs + 1):
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                nextIdx = sortedIdxs[k]
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                # avg value of each feature in intersections
                nextFilt = nextFiltSum / (k + 1)
                # (np.sum(nextFilt) was rejected: big even if like no
                # intersection.) Only count entries above the cutoff.
                bigEnoughIntersection = nextIntersection[
                    nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0

            score = (sz - nextSz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(
                    i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(
                    str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score,
            # which is the case exactly when the intersection is so small
            # that perfect matches at all future locations still wouldn't
            # be good enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                #   -can't abandon using this test, but pretty sure there's
                #   a lower bound to be had here somewhere
                break

    # Nothing ever scored above 0: bsfLocs / bsfIntersection are still None
    # and the recovery code below would raise TypeError.
    if bsfLocs is None:
        print("no pattern found (no candidate set scored above 0)")
        plt.show()
        return

    # ------------------------ recover original ts
    bsfIntersection *= bsfIntersection >= minSim
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)

    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # alternative tried: p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[1] * 2
    sums -= expectedOnesPerCol

    plt.plot(sums)

    start, end, _ = maxSubarray(sums)
    patStart, patEnd = start, end + 1 + length

    # ================================ show output
    print("bestScore = {}".format(bsfScore))
    print("bestLocations = {}".format(str(bsfLocs)))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowLen)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()
def main():
    """Run the feature-matrix pattern search on one sequence and plot it.

    Pipeline, in order:

    1. Build a synthetic random-walk sequence with embedded examples.
    2. If ``USE_MSRC`` is set, replace it with one recording loaded via
       ``read_msrc`` (columns 24:27, rows downsampled), derive ``Lmin``,
       ``Lmax`` and ``length`` from its length, and pad both ends with
       copies of the first / last row.
    3. Build the feature matrix ``X`` by stacking the output of
       ``rep.multiNormalizeAndSparseQuantize`` and the thresholded output of
       ``rep.multiSparseLineProject``, local-max filter it, weight each row
       by ``-log2`` of its mean, and build a blurred copy ``Xblur``.
    4. For each window location, pick candidate matching locations with
       ``sub.optimalAlignment``, then greedily add them in descending order
       of similarity, scoring each prefix of ``k`` locations against the
       larger of the next candidate's size and a noise floor ``noiseSz``.
    5. Use ``maxSubarray`` on the column sums of the best averaged window to
       pick the pattern extent, and draw rectangles on the plots.

    Takes no arguments and returns ``None``; output is printed text and
    matplotlib figures. If no candidate set ever beats a score of 0 the
    figures produced so far are shown and the function returns early.

    NOTE(review): ``def main()`` appears more than once in this file; if the
    definitions live in the same module, the last one shadows the others.
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    n = 500  # also tried: 1000, 300
    exampleLengths = [55, 60, 65]  # also tried: [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = 20  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = 0.  # also tried: .5
    length = Lmin // 2  # also tried: Lmin // 4, 3

    answerIdxs = None

    USE_MSRC = True

    # ================================ data
    # ------------------------ synthetic data
    # alternatives tried: synth.randconst, synth.notSoRandomWalk with
    # lpfLength=4 or 16
    seq = synth.randwalk(n, std=noiseStd)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc
    # NOTE(review): the original source was whitespace-collapsed, so where
    # this `if` body ended is a reconstruction. The padding is kept inside
    # it because `answerIdxs += prePadLen` would fail on None otherwise.
    if USE_MSRC:
        from ..datasets import read_msrc as msrc
        # Notes on individual recordings:
        #   7:  length 1500, but instances of length like 20
        #   8:  gets owned on this one cuz patterns of length like 100
        #   9:  missing an annotation, it appears
        #   10: something crazy about feature rep here  # TODO fix
        #   11: crap cuz bad, low-variance signals
        #   12: has garbagey sections like [10]
        #   13: empty feature mat  # TODO
        # also tried: 0, 1, 2, 14
        idxs = [10]
        downsampleBy = 2  # also tried: 1
        recordings = msrc.getRecordings(idxs=idxs)
        r = list(recordings)[0]
        # other column subsets tried: all, [:, :40], [:, 20:23], [:, 20:27]
        seq = r.data[:, 24:27]
        print("orig seq shape {}".format(seq.shape))
        seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
        print("downsampled seq shape {}".format(seq.shape))
        Lmin = len(seq) // 20
        Lmax = len(seq) // 10  # also tried: len(seq) // 8, len(seq) / 20
        length = Lmin // 2
        # Left as `/` on purpose: only used to draw vertical lines, and the
        # dtype of gestureIdxs is not visible here -- TODO confirm.
        answerIdxs = r.gestureIdxs / downsampleBy

        prePadLen = Lmax - length
        postPadLen = Lmax - length  # previously: length - 1
        first = np.tile(seq[0], (prePadLen, 1))
        last = np.tile(seq[-1], (postPadLen, 1))
        # pad with fixed val to allow all window positions
        # ^ TODO pad simMat with zeros instead--this introduces fake subseqs
        seq = np.vstack((first, seq, last))
        answerIdxs += prePadLen

    # ================================ feature construction
    # subsequence lengths: powers of two from 2^floor(log2(Lmin)) up to
    # 2^floor(log2(Lmax))
    logMaxLength = int(np.floor(np.log2(Lmax)))
    logMinLength = int(np.floor(np.log2(Lmin)))
    lengths = np.arange(logMinLength, logMaxLength + 1)
    lengths = 2 ** lengths

    cardinality = 8
    breakpoints = rep.saxBreakpoints(cardinality)

    X = rep.multiNormalizeAndSparseQuantize(seq, lengths, breakpoints)

    lengths2 = lengths  # TODO uncomment after debug
    breakpoints2 = rep.defaultSparseLineBreakpoints(seq)
    X2 = rep.multiSparseLineProject(seq, lengths2, breakpoints2)
    X2 = X2 > 0.  # ignore correlations (also tried: X2 > minSim)
    X = np.vstack((X, X2))

    X = localMaxFilterSimMat(X)
    featureMeans = np.mean(X, axis=1).reshape((-1, 1))
    # variable encoding costs for rows
    # NOTE(review): a row whose mean is 0 gives -log2(0) = inf and
    # 0 * inf = nan here; confirm upstream never yields all-zero rows.
    X *= -np.log2(featureMeans)
    Xblur = filterSimMat(X, length - 1, 'hamming', scaleFilterMethod='max1')

    print("featureMat dims: {}".format(X.shape))
    Xnonzeros = np.count_nonzero(X)
    print("featureMat nonzeros, total, frac =  {} {} {}".format(
        Xnonzeros, X.size, Xnonzeros / float(X.size)))

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    # pad X with zero columns so it spans the same width as seq
    padLen = len(seq) - X.shape[1]
    Xpad = synth.appendZeros(X, padLen)
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    axSeq.set_title("Time Series")
    axSim.set_title("Feature Matrix")

    # ================================ science
    # ------------------------ derived stats
    # Round to nearest. The original `int(X.shape[1] / Lmin + .5)` floored
    # under Python 2 integer division, so the +.5 had no effect.
    kMax = int(X.shape[1] / float(Lmin) + .5)
    windowLen = Lmax - length + 1
    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    minSim = p0
    # (noiseSz = p0 * X.shape[0] * windowLen was way too hard to beat)
    expectedOnesPerWindow = p0 * X.shape[0] * windowLen
    noiseSz = p0 * expectedOnesPerWindow  # num ones to begin with

    # alternative tried: colSims = np.dot(X.T, X)
    colSims = np.dot(X.T, Xblur)
    # zeros except 1s on diag: convolving with this sums colSims along
    # diagonals of length windowLen
    filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen))
    windowSims = sig.convolve2d(colSims, filt, mode='valid')

    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    # ------------------------ find stuff
    #
    # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection.
    #
    # (A commented-out variant that did not sort the indices but cared
    # about overlap was dropped for readability.)
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))

        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:  # highest score is kMax identical locs
            continue

        # best combination of idxs such that none are within Lmin of each
        # other; uses the full row, which goes past end of ts, but better
        idxs = sub.optimalAlignment(row, Lmin)

        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]

        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early
        # abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)
        nextSz = np.sum(intersection)
        # builtin float: the np.float alias was removed from NumPy
        nextFilt = np.array(intersection, dtype=float)
        nextFiltSum = np.array(nextFilt, dtype=float)
        for k in range(1, numIdxs + 1):
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                nextIdx = sortedIdxs[k]
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                # avg value of each feature in intersections
                nextFilt = nextFiltSum / (k + 1)
                # (np.sum(nextFilt) was rejected: big even if like no
                # intersection.) Only count entries above the cutoff.
                bigEnoughIntersection = nextIntersection[
                    nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0

            # the "enemy" is whichever is larger: the next candidate's
            # size or the noise floor
            enemySz = max(nextSz, noiseSz)
            score = (sz - enemySz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(
                    i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(
                    str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score,
            # which is the case exactly when the intersection is so small
            # that perfect matches at all future locations still wouldn't
            # be good enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                #   -can't abandon using this test, but pretty sure there's
                #   a lower bound to be had here somewhere
                break
            elif noiseSz > nextSz:
                # the next candidate is already below the noise floor
                break

    # Nothing ever scored above 0: bsfLocs / bsfIntersection are still None
    # and the recovery code below would raise TypeError.
    if bsfLocs is None:
        print("no pattern found (no candidate set scored above 0)")
        plt.show()
        return

    # ------------------------ recover original ts
    bsfIntersection *= bsfIntersection >= minSim
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)

    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # alternatives tried: p0 * X.shape[1] * 2, p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[0]
    sums -= expectedOnesPerCol

    start, end, _ = maxSubarray(sums)
    # alternatives tried: (start, end + 1 + length),
    #   (start + length // 2, end + 1 + length)
    patStart, patEnd = start, end + 1

    # ================================ show output
    print("bestScore = {}".format(bsfScore))
    print("bestLocations = {}".format(str(bsfLocs)))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowLen)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()
def main():
    """Run the similarity-matrix pattern-discovery demo on one MSRC recording.

    Steps, in order:
      1. Load one MSRC recording (a synthetic sequence is generated first but
         is overwritten by the recording), downsample it, and pad both ends
         with copies of the first / last sample.
      2. Build the similarity matrix ``X`` with ``computeSimMat``, zero the
         entries below ``minSim``, local-max filter it, and build a
         Hamming-filtered copy ``Xblur`` clipped to 1.
      3. Score every pair of window positions by summing ``X.T . Xblur``
         along diagonals of length ``windowLen``.
      4. For each candidate window, take the best set of matches that are at
         least ``Lmin`` apart (``sub.optimalAlignment``), add them in order
         of decreasing similarity, and remember the set of locations that
         maximizes ``(size - nextSize) * k``.
      5. Estimate the pattern extent inside the window with ``maxSubarray``
         and plot the sequence, the matrix, and the chosen locations.

    Takes no arguments and returns None; results are printed and plotted.
    If no candidate ever scores above 0, a message is printed, the figures
    built so far are shown, and the function returns early.
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    # length = 8
    # length = 16
    length = 32
    # length = 50
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = .5  # loose cutoff for what counts as similar

    answerIdxs = None

    # ------------------------ synthetic data
    # NOTE: this sequence is replaced by the MSRC recording below; it is
    # still generated so that the calls made (and any random numbers they
    # draw) stay the same.
    # seq = synth.randconst(n, std=noiseStd)
    seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80,
                                lpfLength=16)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc
    from ..datasets import read_msrc as msrc
    idxs = [2]
    # idxs = [0]
    downsampleBy = 2
    recordings = msrc.getRecordings(idxs=idxs)
    r = list(recordings)[0]
    # seq = r.data
    # seq = r.data[:, :40]
    # seq = r.data[:, 20:23]
    seq = r.data[:, 24:27]
    # seq = r.data[:, 20:27]
    print("orig seq shape {}".format(seq.shape))
    seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
    print("downsampled seq shape {}".format(seq.shape))
    # length is derived from the Lmin chosen above, *before* Lmin itself is
    # re-derived from the recording length on the next line.
    length = max(8, Lmin // 2)
    Lmin = len(seq) // 20
    Lmax = len(seq) // 10
    # Lmax = len(seq) // 20
    minSim = .5
    answerIdxs = r.gestureIdxs / downsampleBy

    # pad with fixed val to allow all window positions
    # TODO pad simMat with zeros instead--this introduces fake subseqs
    prePadLen = Lmax - length
    postPadLen = length - 1
    first = np.tile(seq[0], (prePadLen, 1))
    last = np.tile(seq[-1], (postPadLen, 1))
    seq = np.vstack((first, seq, last))
    answerIdxs += prePadLen  # keep annotations aligned with the padded seq

    # ================================ simMat
    X = computeSimMat(seq, length)
    X[X < minSim] = 0.
    X = ff2.localMaxFilterSimMat(X)
    Xblur = ff2.filterSimMat(X, length - 1, 'hamming',
                             scaleFilterMethod='max1')
    Xblur = np.minimum(Xblur, 1.)

    print("simMat dims: {}".format(X.shape))
    Xnonzeros = np.count_nonzero(X)
    # (two spaces after '=' reproduce the output of the original statement)
    print("simMat nonzeros, total, frac =  {} {} {}".format(
        Xnonzeros, X.size, Xnonzeros / float(X.size)))

    # ================================ plotting
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)
    Xpad = synth.appendZeros(X, length - 1)
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    axSeq.set_title("Time Series")
    axSim.set_title("Similarities Matrix")

    # ================================ science

    # ------------------------ derived stats
    # NOTE(review): if `/` is integer division here (Python 2 without
    # `from __future__ import division`), the `+ .5` never rounds up --
    # confirm which behavior is intended.
    kMax = int(X.shape[1] / Lmin + .5)
    windowLen = Lmax - length + 1

    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)

    # Summing colSims along diagonals of length windowLen gives the
    # similarity between every pair of window positions.
    colSims = np.dot(X.T, Xblur)
    diagKernel = np.eye(windowLen)  # zeros except 1s on diag
    windowSims = sig.convolve2d(colSims, diagKernel, mode='valid')

    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    plt.figure()
    plt.imshow(windowSims, interpolation='nearest', aspect='auto')

    # ------------------------ find stuff
    #
    # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:
            # highest score is kMax identical locs
            continue

        # best combination of idxs such that none are within Lmin of each
        # other; uses the whole row (goes past end of ts, but better)
        idxs = sub.optimalAlignment(row, Lmin)

        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]

        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early
        # abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)

        nextSz = np.sum(intersection)
        nextFilt = np.array(intersection, dtype=float)
        nextFiltSum = np.array(nextFilt, dtype=float)
        for k in range(1, numIdxs + 1):
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                nextIdx = sortedIdxs[k]
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                # avg value of each feature in intersections
                nextFilt = nextFiltSum / (k + 1)
                # only entries above minSim count towards the next size
                # (summing nextFilt is big even if like no intersection)
                bigEnoughIntersection = nextIntersection[
                    nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0

            score = (sz - nextSz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(
                    i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(
                    str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that
            # perfect matches at all future locations still wouldn't be good
            # enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                #   -can't abandon using this test, but pretty sure there's
                #   a lower bound to be had here somewhere
                break

    # Nothing beat the initial score of 0: bsfLocs / bsfIntersection are
    # still None and the code below cannot run on them.
    if bsfLocs is None:
        print("no repeated pattern found; nothing to plot")
        plt.show()
        return

    # ------------------------ recover original ts
    bsfIntersection *= bsfIntersection >= minSim
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)

    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[1] * 2
    sums -= expectedOnesPerCol

    # NOTE(review): this draws onto the current figure (the windowSims image
    # created above) -- confirm that is intended.
    plt.plot(sums)

    start, end, _ = maxSubarray(sums)
    patStart, patEnd = start, end + 1 + length

    # ================================ show output
    print("bestScore = {}".format(bsfScore))
    print("bestLocations = {}".format(str(bsfLocs)))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowLen)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()
def main():
    """Run the feature-matrix pattern-discovery demo and plot the result.

    Steps, in order:
      1. Generate a synthetic random walk with embedded examples; when
         ``USE_MSRC`` is true, replace it with one downsampled MSRC
         recording padded at both ends with copies of its first / last
         sample.
      2. Build a feature matrix ``X`` by stacking the output of
         ``rep.multiNormalizeAndSparseQuantize`` and the (binarized) output
         of ``rep.multiSparseLineProject`` at power-of-two lengths between
         ``Lmin`` and ``Lmax``; local-max filter it, weight each row by
         ``-log2`` of its mean, and build a Hamming-filtered copy ``Xblur``.
      3. Score every pair of window positions by summing ``X.T . Xblur``
         along diagonals of length ``windowLen``.
      4. For each candidate window, take the best set of matches that are at
         least ``Lmin`` apart (``sub.optimalAlignment``), add them in order
         of decreasing similarity, and remember the set of locations that
         maximizes ``(size - max(nextSize, noiseSz)) * k``.
      5. Estimate the pattern extent inside the window with ``maxSubarray``
         and plot the sequence, the matrix, and the chosen locations.

    Takes no arguments and returns None; results are printed and plotted.
    If no candidate ever scores above 0, a message is printed, the figures
    built so far are shown, and the function returns early.
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = 20  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    # minSim = .5
    minSim = 0.
    length = Lmin // 2
    # length = Lmin // 4
    # length = 3

    answerIdxs = None

    USE_MSRC = True
    # USE_MSRC = False

    # ================================ data

    # ------------------------ synthetic data
    # seq = synth.randconst(n, std=noiseStd)
    seq = synth.randwalk(n, std=noiseStd)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80,
    #                             lpfLength=16)
    seq = embedExamples(seq, exampleLengths)
    # seq = synth.appendZeros(seq, Lmax)

    # ------------------------ msrc
    if USE_MSRC:
        from ..datasets import read_msrc as msrc
        # Notes on individual recordings:
        # idxs = [0]
        # idxs = [1]
        # idxs = [2]
        # idxs = [7]  # length 1500, but instances of length like 20
        # idxs = [8]  # gets owned on this one cuz patterns of length like 100
        # idxs = [9]  # missing an annotation, it appears
        idxs = [10]  # something crazy about feature rep here  # TODO fix
        # idxs = [11]  # crap cuz bad, low-variance signals
        # idxs = [12]  # has garbagey sections like [10]
        # idxs = [13]  # empty feature mat  # TODO
        # idxs = [14]
        downsampleBy = 2
        # downsampleBy = 1
        recordings = msrc.getRecordings(idxs=idxs)
        r = list(recordings)[0]

        # seq = r.data
        # seq = r.data[:, :40]
        # seq = r.data[:, 20:23]
        seq = r.data[:, 24:27]
        # seq = r.data[:, 20:27]
        print("orig seq shape {}".format(seq.shape))
        seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
        print("downsampled seq shape {}".format(seq.shape))
        Lmin = len(seq) // 20
        # Lmax = len(seq) // 8
        Lmax = len(seq) // 10
        length = Lmin // 2
        answerIdxs = r.gestureIdxs / downsampleBy

        # pad with fixed val to allow all window positions
        # TODO pad simMat with zeros instead--this introduces fake subseqs
        # NOTE(review): the padding is assumed to belong to the MSRC branch
        # because it shifts answerIdxs, which is only set here -- confirm.
        prePadLen = Lmax - length
        # postPadLen = length - 1
        postPadLen = Lmax - length
        first = np.tile(seq[0], (prePadLen, 1))
        last = np.tile(seq[-1], (postPadLen, 1))
        seq = np.vstack((first, seq, last))
        answerIdxs += prePadLen  # keep annotations aligned with padded seq

    # ================================ feature construction
    # power-of-two subsequence lengths from ~Lmin up to ~Lmax
    logMaxLength = int(np.floor(np.log2(Lmax)))
    # logMaxLength = int(np.ceil(np.log2(Lmax)))
    # logMinLength = 3  # -> length 8
    # logMinLength = 4  # -> length 16
    logMinLength = int(np.floor(np.log2(Lmin)))
    lengths = np.arange(logMinLength, logMaxLength + 1)
    lengths = 2**lengths
    # lengths = [16]
    cardinality = 8
    breakpoints = rep.saxBreakpoints(cardinality)
    X = rep.multiNormalizeAndSparseQuantize(seq, lengths, breakpoints)

    # lengths2 = np.arange(3, logMaxLength + 1)
    # lengths2 = 2 ** lengths2
    lengths2 = lengths  # TODO uncomment after debug
    # lengths2 = [8, 32]
    # breakpoints2 = rep.defaultSparseLineBreakpoints(seq, scaleHowMany=2)
    breakpoints2 = rep.defaultSparseLineBreakpoints(seq)
    X2 = rep.multiSparseLineProject(seq, lengths2, breakpoints2)
    # X2 = X2 > minSim
    X2 = X2 > 0.  # ignore correlations

    X = np.vstack((X, X2))

    X = localMaxFilterSimMat(X)
    featureMeans = np.mean(X, axis=1).reshape((-1, 1))
    X *= -np.log2(featureMeans)  # variable encoding costs for rows
    # X /= -np.log(featureMeans)
    # Xblur = localMaxFilterSimMat(X)  # try only maxFiltering Xblur
    Xblur = filterSimMat(X, length - 1, 'hamming', scaleFilterMethod='max1')

    print("featureMat dims: {}".format(X.shape))
    Xnonzeros = np.count_nonzero(X)
    # (two spaces after '=' reproduce the output of the original statement)
    print("featureMat nonzeros, total, frac =  {} {} {}".format(
        Xnonzeros, X.size, Xnonzeros / float(X.size)))

    # ================================ plotting
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    # zero-pad X so its width matches the sequence length when drawn
    padLen = len(seq) - X.shape[1]
    Xpad = synth.appendZeros(X, padLen)
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    axSeq.set_title("Time Series")
    axSim.set_title("Feature Matrix")

    # ================================ science

    # ------------------------ derived stats
    # NOTE(review): if `/` is integer division here (Python 2 without
    # `from __future__ import division`), the `+ .5` never rounds up --
    # confirm which behavior is intended.
    kMax = int(X.shape[1] / Lmin + .5)
    windowLen = Lmax - length + 1

    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    # p0 = 2 * np.mean(X)  # lambda for l0 reg based on features being
    #                      # bernoulli at 2 locs
    minSim = p0
    # noiseSz = p0 * X.shape[0] * windowLen  # way too hard to beat
    expectedOnesPerWindow = p0 * X.shape[0] * windowLen
    noiseSz = p0 * expectedOnesPerWindow  # num ones to begin with

    # Summing colSims along diagonals of length windowLen gives the
    # similarity between every pair of window positions.
    # colSims = np.dot(X.T, X)
    colSims = np.dot(X.T, Xblur)
    diagKernel = np.eye(windowLen)  # zeros except 1s on diag
    windowSims = sig.convolve2d(colSims, diagKernel, mode='valid')

    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    # ------------------------ find stuff
    #
    # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:
            # highest score is kMax identical locs
            continue

        # best combination of idxs such that none are within Lmin of each
        # other; uses the whole row (goes past end of ts, but better)
        idxs = sub.optimalAlignment(row, Lmin)

        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]

        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early
        # abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)

        nextSz = np.sum(intersection)
        nextFilt = np.array(intersection, dtype=float)
        nextFiltSum = np.array(nextFilt, dtype=float)
        for k in range(1, numIdxs + 1):
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                nextIdx = sortedIdxs[k]
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                # avg value of each feature in intersections
                nextFilt = nextFiltSum / (k + 1)
                # only entries above minSim count towards the next size
                # (summing nextFilt is big even if like no intersection)
                bigEnoughIntersection = nextIntersection[
                    nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0
                # nextSz = -1
            # the "enemy" is whichever is larger: the next match or noise
            enemySz = max(nextSz, noiseSz)

            score = (sz - enemySz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(
                    i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(
                    str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that
            # perfect matches at all future locations still wouldn't be good
            # enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                #   -can't abandon using this test, but pretty sure there's
                #   a lower bound to be had here somewhere
                break
            elif noiseSz > nextSz:
                # the next match is already below the noise floor
                break

    # Nothing beat the initial score of 0: bsfLocs / bsfIntersection are
    # still None and the code below cannot run on them.
    if bsfLocs is None:
        print("no repeated pattern found; nothing to plot")
        plt.show()
        return

    # ------------------------ recover original ts
    bsfIntersection *= bsfIntersection >= minSim
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)

    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1] * 2
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[0]
    sums -= expectedOnesPerCol

    start, end, _ = maxSubarray(sums)
    # patStart, patEnd = start, end + 1 + length
    patStart, patEnd = start, end + 1
    # patStart, patEnd = start + length // 2, end + 1 + length

    # ================================ show output
    print("bestScore = {}".format(bsfScore))
    print("bestLocations = {}".format(str(bsfLocs)))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowLen)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()