Example #1
import numpy as np

def removeCorrelatedRows(X, thresh, accumulate=False):
    # nonzeroRows, meanNormalizeRows, and normalizeRows are helpers from
    # the surrounding module
    X = X[nonzeroRows(X)]
    Xnorm = meanNormalizeRows(X)
    Xnorm = normalizeRows(Xnorm)
    keepIdxs = np.array([0])
    multipliers = np.array([1])
    for i, row in enumerate(Xnorm[1:]):
        idx = i + 1  # enumerate counts from 0, but we skipped row 0
        Xkeep = Xnorm[keepIdxs]
        sims = np.dot(Xkeep, row)
        # rows are centered and unit-norm, so dot products are correlations
        # and must lie in [-1, 1] (up to floating-point error)
        assert np.max(sims) < 1.0001, "max sim too high: {}".format(sims)
        assert np.min(sims) > -1.0001, "min sim too low: {}".format(sims)
        if np.all(sims < thresh):
            keepIdxs = np.r_[keepIdxs, idx]
            multipliers = np.r_[multipliers, 1]
        elif accumulate:
            bestMatchIdx = np.argmax(sims)
            multipliers[bestMatchIdx] += 1  # weight by num times it happens

    return X[keepIdxs] * multipliers.reshape((-1, 1))
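
Below is a minimal, self-contained usage sketch. The three helpers are plausible stand-ins for the module's own nonzeroRows, meanNormalizeRows, and normalizeRows, which are not shown in this snippet; the real implementations may differ.

import numpy as np

def nonzeroRows(A, thresh=1e-10):
    # stand-in: indices of rows with nonzero norm
    return np.where(np.linalg.norm(A, axis=1) > thresh)[0]

def meanNormalizeRows(A):
    # stand-in: subtract each row's mean
    return A - np.mean(A, axis=1, keepdims=True)

def normalizeRows(A):
    # stand-in: scale each row to unit L2 norm
    return A / np.linalg.norm(A, axis=1, keepdims=True)

X = np.array([[1., 2., 3.],
              [2., 4., 6.],    # perfectly correlated with row 0
              [3., -1., 0.]])  # not strongly correlated with row 0
print(removeCorrelatedRows(X, thresh=0.99, accumulate=True))
# row 1 is absorbed into row 0, which then carries weight 2:
# [[ 2.  4.  6.]
#  [ 3. -1.  0.]]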
Example #2
import numpy as np

def _neighborSimsMat(seq, lengths, numNeighbors):
    # ar and _neighborSims1D are helpers from the surrounding module
    seq = seq if seq.ndim > 1 else seq.reshape((-1, 1))  # ensure 2d

    mats = []
    for col in seq.T:  # one pass per dimension (column)
        if np.var(col) < ar.DEFAULT_NONZERO_THRESH:  # ignore flat dims
            continue
        mat = np.zeros((len(lengths) * numNeighbors, len(seq)))
        for i, m in enumerate(lengths):
            sims = _neighborSims1D(col, m, numNeighbors=numNeighbors)

            # we preallocated a matrix of the appropriate dimensions,
            # so we need to calculate where in that matrix to dump
            # the similarities computed at this length
            rowStart = i * numNeighbors  # first row of this length's block
            rowEnd = rowStart + numNeighbors
            colStart = (mat.shape[1] - sims.shape[1]) // 2
            colEnd = colStart + sims.shape[1]

            mat[rowStart:rowEnd, colStart:colEnd] = sims

            # populate data past end of sims with median of each row;
            # better than 0 padding so that overly frequent stuff remains
            # overly frequent (so we'll remove it below)
            medians = np.median(mat[rowStart:rowEnd], axis=1, keepdims=True)
            mat[rowStart:rowEnd, :colStart] = medians
            mat[rowStart:rowEnd, colEnd:] = medians

        # remove rows where no features happened
        mat = mat[ar.nonzeroRows(mat)]

        # remove rows that are mostly nonzero, since this means the feature
        # is happening more often than not and thus isn't very informative
        minorityOnesRows = np.where(np.mean(mat > 0, axis=1) < .5)[0]
        mats.append(mat[minorityOnesRows])

    return np.vstack(mats)
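
The placement arithmetic in the inner loop (center the sims block, then median-pad the flanks) can be seen in isolation in the self-contained sketch below; the toy numbers are illustrative only.

import numpy as np

mat = np.zeros((2, 8))                       # preallocated block for one length
sims = np.array([[0., 1., 0., 1., 0., 1.],
                 [1., 1., 1., 1., 1., 1.]])  # toy similarities, 6 cols wide
colStart = (mat.shape[1] - sims.shape[1]) // 2  # = 1: center the block
colEnd = colStart + sims.shape[1]               # = 7
mat[:, colStart:colEnd] = sims
medians = np.median(mat, axis=1, keepdims=True)  # over the whole row
mat[:, :colStart] = medians
mat[:, colEnd:] = medians
print(mat)
# [[0. 0. 1. 0. 1. 0. 1. 0.]    <- sparse row: median 0, padded with 0
#  [1. 1. 1. 1. 1. 1. 1. 1.]]   <- frequent row: median 1, stays frequent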
Example #3
import numpy as np

# feat, ar, and _learnFF are helpers from the surrounding module

def learnFFfromSeq(seq, Lmin, Lmax, Lfilt=0):
    """
	Finds the repeating pattern in `seq`.

	Parameters
	----------
	seq : 2D array
		2D array whose rows are time steps and whose columns are dimensions
		of the time series. If there is only one dimension, ensure that it
		is a column vector (so that it is 2D).
	Lmin : int > 0
		The minimum length that an instance of the pattern could be. Must
		be > Lmax / 2.
	Lmax : int > 0
		The maximum length that an instance of the pattern could be. Must be
		< 2 * Lmin.
	Lfilt : int, optional
		The width of the Hamming filter used to blur the feature matrix.
		If set to < 1, it defaults to Lmin.

	Returns
	-------
	startIdxs : 1D array
		The times (rows) in seq in which the estimated pattern instances
		begin (inclusive).
	endIdxs : 1D array
		The times (rows) in seq in which the estimated pattern instances
		end (non-inclusive).
	model : 2D array
		A learned model of the pattern. Can be seen as a digital filter
		that, when multiplied with a window of data, yields the log odds of
		that window being an instance of the pattern instead of iid
		Bernoulli random variables (ignoring overlap constraints). In
		practice, this is returned so that it can be plotted to show what
		features are selected.
	X : 2D array
		The feature matrix
	Xblur : 2D array
		The blurred feature matrix
	"""

    Lmin = int(len(seq) * Lmin) if Lmin < 1. else Lmin
    Lmax = int(len(seq) * Lmax) if Lmax < 1. else Lmax
    Lfilt = int(len(seq) * Lfilt) if Lfilt < 1. else Lfilt

    if not Lfilt or Lfilt < 0:
        Lfilt = Lmin

    # extend the first and last values out so that features using
    # longer windows are present for more locations
    padLen = Lmax
    seq = feat.extendSeq(seq, padLen, padLen)

    # build the feature matrix and blurred feature matrix
    X = feat.buildFeatureMat(seq, Lmin, Lmax)
    X, Xblur = feat.preprocessFeatureMat(X, Lfilt)

    # undo padding after constructing feature matrix
    X = X[:, padLen:-padLen]
    Xblur = Xblur[:, padLen:-padLen]
    seq = seq[padLen:-padLen]

    # catch edge case where all nonzeros in a row were in the padding
    keepRowIdxs = ar.nonzeroRows(X, thresh=1.)
    X = X[keepRowIdxs]
    Xblur = Xblur[keepRowIdxs]

    # feature matrices must satisfy these (if you plan on using your own)
    assert np.min(X) >= 0.
    assert np.max(X) <= 1.
    assert np.min(Xblur) >= 0.
    assert np.max(Xblur) <= 1.
    assert np.all(np.sum(X, axis=1) > 0)
    assert np.all(np.sum(Xblur, axis=1) > 0)

    startIdxs, endIdxs, bsfFilt = _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt)
    return startIdxs, endIdxs, bsfFilt, X, Xblur
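
A hypothetical invocation is sketched below. It assumes learnFFfromSeq and its feat, ar, and _learnFF dependencies are importable from the surrounding module, and it uses the fractional convention above to give Lmin and Lmax as fractions of the sequence length; the values are illustrative only.

import numpy as np

# synthetic series with a repeating shape, as a column vector
t = np.linspace(0, 8 * np.pi, 1000)
seq = np.sin(t).reshape((-1, 1))

# instances assumed to span 6-10% of the series; note 0.06 > 0.1 / 2,
# satisfying the Lmin > Lmax / 2 requirement from the docstring
startIdxs, endIdxs, model, X, Xblur = learnFFfromSeq(seq, Lmin=0.06, Lmax=0.1)
for s, e in zip(startIdxs, endIdxs):
    print("pattern instance in rows [{}, {})".format(s, e))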
Example #4
import time

import numpy as np

# feat, ar, and _learnFF are helpers from the surrounding module

def learnFFfromSeq(seq, Lmin, Lmax, Lfilt=0):
    """
	Finds the repeating pattern in `seq`.

	Parameters
	----------
	seq : 2D array
		2D array whose rows are time steps and whose columns are dimensions
		of the time series. If there is only one dimension, ensure that it
		is a column vector (so that it is 2D).
	Lmin : int > 0
		The minimum length that an instance of the pattern could be. Must
		be > Lmax / 2.
	Lmax : int > 0
		The maximum length that an instance of the pattern could be. Must be
		< 2 * Lmin.
	Lfilt : int, optional
		The width of the Hamming filter used to blur the feature matrix.
		If set to < 1, it defaults to Lmin.

	Returns
	-------
	startIdxs : 1D array
		The times (rows) in seq in which the estimated pattern instances
		begin (inclusive).
	endIdxs : 1D array
		The times (rows) in seq in which the estimated pattern instances
		end (non-inclusive).
	model : 2D array
		A learned model of the pattern. Can be seen as a digital filter
		that, when multiplied with a window of data, yields the log odds of
		that window being an instance of the pattern instead of iid
		Bernoulli random variables (ignoring overlap constraints). In
		practice, this is returned so that it can be plotted to show what
		features are selected.
	X : 2D array
		The feature matrix
	Xblur : 2D array
		The blurred feature matrix
	"""

    Lmin = int(len(seq) * Lmin) if Lmin < 1.0 else Lmin
    Lmax = int(len(seq) * Lmax) if Lmax < 1.0 else Lmax
    Lfilt = int(len(seq) * Lfilt) if Lfilt < 1.0 else Lfilt

    if not Lfilt or Lfilt < 0:
        Lfilt = Lmin

    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8

    # extend the first and last values out so that features using
    # longer windows are present for more locations
    padLen = Lmax
    seq = feat.extendSeq(seq, padLen, padLen)

    # build the feature matrix and blurred feature matrix
    X = feat.buildFeatureMat(seq, Lmin, Lmax)
    X, Xblur = feat.preprocessFeatureMat(X, Lfilt)

    # undo padding after constructing feature matrix
    X = X[:, padLen:-padLen]
    Xblur = Xblur[:, padLen:-padLen]
    seq = seq[padLen:-padLen]

    # catch edge case where all nonzeros in a row were in the padding
    keepRowIdxs = ar.nonzeroRows(X, thresh=1.0)
    X = X[keepRowIdxs]
    Xblur = Xblur[keepRowIdxs]

    t1 = time.perf_counter()

    # feature matrices must satisfy these (if you plan on using your own)
    assert np.min(X) >= 0.0
    assert np.max(X) <= 1.0
    assert np.min(Xblur) >= 0.0
    assert np.max(Xblur) <= 1.0
    assert np.all(np.sum(X, axis=1) > 0)
    assert np.all(np.sum(Xblur, axis=1) > 0)

    print("learnFFfromSeq(): feature construction time:\n\t{}".format(t1 - t0))

    startIdxs, endIdxs, bsfFilt = _learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt)
    return startIdxs, endIdxs, bsfFilt, X, Xblur