import numpy as np


def removeCorrelatedRows(X, thresh, accumulate=False):
    """Greedily removes rows of X whose (mean-normalized) cosine similarity
    to an already-kept row is >= thresh. If accumulate is True, each kept
    row is weighted by the number of rows it absorbed."""
    X = X[nonzeroRows(X)]
    Xnorm = meanNormalizeRows(X)
    Xnorm = normalizeRows(Xnorm)

    keepIdxs = np.array([0])
    multipliers = np.array([1])
    for i, row in enumerate(Xnorm[1:]):
        Xkeep = Xnorm[keepIdxs]
        sims = np.dot(Xkeep, row)
        if np.max(sims) >= 1.0001:
            raise ValueError("max similarity {} > 1; rows not normalized? "
                             "sims = {}".format(np.max(sims), sims))
        if np.min(sims) <= -1.0001:
            raise ValueError("min similarity {} < -1; rows not normalized? "
                             "sims = {}".format(np.min(sims), sims))
        if np.all(sims < thresh):
            # we're enumerating Xnorm[1:], so `row` is row i + 1 of Xnorm;
            # appending i here (the original code's behavior) kept the wrong
            # rows, which is why this sometimes discarded everything but
            # the first row even when it clearly shouldn't have
            keepIdxs = np.r_[keepIdxs, i + 1]
            multipliers = np.r_[multipliers, 1]
        elif accumulate:
            bestMatchIdx = np.argmax(sims)
            multipliers[bestMatchIdx] += 1  # weight by number of occurrences

    return X[keepIdxs] * multipliers.reshape((-1, 1))
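# Usage sketch (illustrative, not from the original code; it assumes the
# nonzeroRows / meanNormalizeRows / normalizeRows helpers in this module
# subtract row means and divide by L2 norms, respectively). Rows 0 and 2
# below are perfectly correlated after mean-normalization, so with
# thresh=.9 only row 0 survives; with accumulate=True, the survivor is
# additionally weighted by how many near-duplicates it absorbed.
#
# >>> A = np.array([[1., 2., 3.],
# ...               [9., 1., 5.],
# ...               [2., 4., 6.]])
# >>> removeCorrelatedRows(A, thresh=.9)
# array([[ 1.,  2.,  3.],
#        [ 9.,  1.,  5.]])
# >>> removeCorrelatedRows(A, thresh=.9, accumulate=True)
# array([[ 2.,  4.,  6.],
#        [ 9.,  1.,  5.]])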
def _neighborSimsMat(seq, lengths, numNeighbors):
    seq = seq if len(seq.shape) > 1 else seq.reshape((-1, 1))  # ensure 2D
    mats = []
    for dim, col in enumerate(seq.T):
        if np.var(col) < ar.DEFAULT_NONZERO_THRESH:  # ignore flat dims
            continue
        mat = np.zeros((len(lengths) * numNeighbors, len(seq)))
        for i, m in enumerate(lengths):
            sims = _neighborSims1D(col, m, numNeighbors=numNeighbors)

            # we preallocated a matrix of the appropriate dimensions,
            # so we need to calculate where in that matrix to dump
            # the similarities computed at this length
            rowStart = i * numNeighbors
            rowEnd = rowStart + numNeighbors
            colStart = (mat.shape[1] - sims.shape[1]) // 2
            colEnd = colStart + sims.shape[1]
            mat[rowStart:rowEnd, colStart:colEnd] = sims

            # populate data past the ends of sims with the median of each
            # row; better than 0 padding so that overly frequent stuff
            # remains overly frequent (and thus gets removed below)
            medians = np.median(mat[rowStart:rowEnd], axis=1, keepdims=True)
            mat[rowStart:rowEnd, :colStart] = medians
            mat[rowStart:rowEnd, colEnd:] = medians

        # remove rows in which no features happened
        mat = mat[ar.nonzeroRows(mat)]

        # remove rows that are mostly nonzero, since this means the feature
        # is happening more often than not and thus isn't very informative
        minorityOnesRows = np.where(np.mean(mat > 0, axis=1) < .5)[0]
        mats.append(mat[minorityOnesRows])

    return np.vstack(mats)
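# Layout sketch (illustrative, inferred from the indexing above): with
# lengths=[8, 16] and numNeighbors=2, each non-flat dimension gets a
# 4 x len(seq) block before row filtering:
#
#   rows 0..1 -> neighbor similarities at window length 8
#   rows 2..3 -> neighbor similarities at window length 16
#
# _neighborSims1D appears to return fewer than len(seq) columns (presumably
# it can only compare full windows), so each block of sims is centered and
# the leftover columns on both sides are filled with that row's median.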
def learnFFfromSeq(seq, Lmin, Lmax, Lfilt=0):
    """Finds the repeating pattern in `seq`.

    Parameters
    ----------
    seq : 2D array
        2D array whose rows are time steps and whose columns are dimensions
        of the time series. If there is only one dimension, ensure that it
        is a column vector (so that it is 2D).
    Lmin : int > 0, or float in (0, 1)
        The minimum length that an instance of the pattern could be.
        Must be > Lmax / 2. A float in (0, 1) is interpreted as a
        fraction of len(seq).
    Lmax : int > 0, or float in (0, 1)
        The maximum length that an instance of the pattern could be.
        Must be < 2 * Lmin. A float in (0, 1) is interpreted as a
        fraction of len(seq).
    Lfilt : int, optional
        The width of the Hamming filter used to blur the feature matrix.
        A float in (0, 1) is interpreted as a fraction of len(seq); if
        0 or negative, defaults to Lmin.

    Returns
    -------
    startIdxs : 1D array
        The times (rows) in seq at which the estimated pattern instances
        begin (inclusive).
    endIdxs : 1D array
        The times (rows) in seq at which the estimated pattern instances
        end (non-inclusive).
    model : 2D array
        A learned model of the pattern. Can be seen as a digital filter
        that, when multiplied with a window of data, yields the log odds
        of that window being an instance of the pattern instead of iid
        Bernoulli random variables (ignoring overlap constraints). In
        practice, this is returned so that it can be plotted to show
        which features are selected.
    X : 2D array
        The feature matrix.
    Xblur : 2D array
        The blurred feature matrix.
    """
    Lmin = int(len(seq) * Lmin) if Lmin < 1. else Lmin
    Lmax = int(len(seq) * Lmax) if Lmax < 1. else Lmax
    Lfilt = int(len(seq) * Lfilt) if Lfilt < 1. else Lfilt
    if not Lfilt or Lfilt < 0:
        Lfilt = Lmin

    # extend the first and last values outward so that features using
    # longer windows are present at more locations
    padLen = Lmax
    seq = feat.extendSeq(seq, padLen, padLen)

    # build the feature matrix and blurred feature matrix
    X = feat.buildFeatureMat(seq, Lmin, Lmax)
    X, Xblur = feat.preprocessFeatureMat(X, Lfilt)

    # undo padding after constructing the feature matrix
    X = X[:, padLen:-padLen]
    Xblur = Xblur[:, padLen:-padLen]
    seq = seq[padLen:-padLen]

    # catch edge case where all nonzeros in a row were in the padding
    keepRowIdxs = ar.nonzeroRows(X, thresh=1.)
    X = X[keepRowIdxs]
    Xblur = Xblur[keepRowIdxs]

    # feature matrices must satisfy these (if you plan on using your own)
    assert np.min(X) >= 0.
    assert np.max(X) <= 1.
    assert np.min(Xblur) >= 0.
    assert np.max(Xblur) <= 1.
    assert np.all(np.sum(X, axis=1) > 0)
    assert np.all(np.sum(Xblur, axis=1) > 0)

    startIdxs, endIdxs, bsfFilt = _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt)
    return startIdxs, endIdxs, bsfFilt, X, Xblur
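# Usage sketch (illustrative; the synthetic data and parameter values here
# are assumptions, not from the original code; it assumes the feat / ar
# modules used above are importable):
#
# >>> t = np.arange(1000)
# >>> seq = np.sin(2 * np.pi * t / 100.) + .1 * np.random.randn(1000)
# >>> seq = seq.reshape((-1, 1))  # must be a 2D column vector
# >>> starts, ends, model, X, Xblur = learnFFfromSeq(seq, Lmin=80, Lmax=150)
# >>> zip(starts, ends)  # (start, end) row index pairs for each instance
#
# Note that Lmax=150 < 2 * Lmin=160, as required. Lmin and Lmax could
# equivalently be passed as fractions of len(seq), e.g. Lmin=.08, Lmax=.15,
# per the rescaling at the top of the function.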