Ejemplo n.º 1
0
def pairwiseDists(seqs,
                  length,
                  norm='each',
                  tieDims=False,
                  pad=True,
                  removeZeros=True,
                  k=-1):
    """Compute per-dimension squared Euclidean distances between subseqs.

    All windows of `length` samples are extracted from `seqs` by
    `window.flattened_subseqs_of_length`. Within each dimension, every
    window is compared with every other window of that same dimension.

    Args:
        seqs: one array or a collection of arrays (wrapped by
            `asListOrTuple`). If the first array has fewer than 2 axes, or
            `tieDims` is True, everything is treated as a single dimension;
            otherwise each column of each array is handled as its own 1D
            sequence.
        length: window (subsequence) length.
        norm: normalization mode forwarded to
            `window.flattened_subseqs_of_length` in the single-dimension
            path. The per-column path always passes 'each'.
        tieDims: if True, do not split columns into separate dimensions.
        pad: if True, `length - 1` zero columns are appended to each
            distance matrix.
        removeZeros: if True, a row whose sum of squares is below 1e-6 is
            assigned the maximum distance to every other row.
        k: if positive, only the `k` smallest entries of each column are
            kept; all others are overwritten with the maximum distance.

    Returns:
        A tuple `(Dtensor, Xnorm)`. `Dtensor` has shape
        `(nDims, rowsPerDim, rowsPerDim + paddingLen)`. `Xnorm` is the
        subsequence matrix the distances were computed from; in the
        per-column path the sequences are handed to the windowing helper
        grouped by dimension (column 0 of every seq, then column 1, ...).
    """
    seqs = asListOrTuple(seqs)
    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(seqs,
                                                         length,
                                                         norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # Treat each column of each seq as a separate 1D seq. The columns
        # are ordered dimension-major (all seqs' column 0, then all seqs'
        # column 1, ...) because the loop below slices Xnorm into one
        # contiguous block of rows per dimension. Ordering them seq-major
        # would mix dimensions whenever more than one seq is given.
        colsPerSeq = [colsAsList(X) for X in seqs]
        flatSeqs = [cols[dim].flatten()  # col vects -> 1D arrays
                    for dim in range(nDims)
                    for cols in colsPerSeq]
        Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs,
                                                         length,
                                                         norm='each')

    nSamples, m = Xnorm.shape
    # Floor division keeps this an int (usable as a shape and slice bound)
    # regardless of whether true division is in effect.
    rowsPerDim = nSamples // nDims
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    paddingLen = length - 1 if pad else 0

    # Zero padding at the end of each row; only the first rowsPerDim
    # columns are ever written below.
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # NOTE(review): 4*m looks like the largest squared distance between two
    # z-normalized length-m vectors -- confirm against the normalization
    # actually applied by the windowing helper.
    maxPossibleDist = 2**2 * m
    for dim in range(nDims):
        # Rows of Xnorm belonging to this dimension.
        minIdx = dim * rowsPerDim
        Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue

            diffs = Xdim - row
            Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

        # Only keep the k lowest dists in each column.
        if k > 0:
            for j in range(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm
Ejemplo n.º 2
0
def uniqueSubseqsInSignals(signal, length, maxDist, norm='each', tree=None):
    """Greedily group the subsequences of `signal` around representatives.

    Subsequences are visited in order. Each one is compared with its
    closest already-stored representative in the kd tree: if that distance
    is below `maxDist`, the subsequence is recorded as another occurrence
    of the representative; otherwise it becomes a new representative and is
    added to the tree.

    Args:
        signal: input forwarded to `window.flattened_subseqs_of_length`.
        length: subsequence (window) length.
        maxDist: distance threshold (in whatever units `tree.search_knn`
            reports) below which a subsequence counts as a repeat.
        norm: normalization mode forwarded to
            `window.flattened_subseqs_of_length`.
        tree: optional existing kd tree to search and extend; a new one is
            created when None.

    Returns:
        A tuple `(signalOccurIdxs, Xnorm)`. `signalOccurIdxs` maps a
        representative's start index to the list of start indices at which
        it occurs. `Xnorm` is the matrix returned by `zNormalizeRows`,
        returned for convenience.

    Raises:
        AssertionError: if the tree search yields no usable neighbor.
    """
    X, _, _ = window.flattened_subseqs_of_length(signal, length, norm=norm)
    Xnorm = zNormalizeRows(X, removeZeros=False)

    if len(Xnorm) == 0:
        return {}, Xnorm

    # The tree starts without the signal's data because we only want to
    # search through seqs that have already been added to the dictionary.
    if tree is None:
        tree = kd.create(dimensions=Xnorm.shape[1])

    # The first subsequence seeds the tree, so it must be registered as a
    # representative as well; otherwise matches against it are lost.
    signalOccurIdxs = {0: [0]}
    tree.add(Xnorm[0], 0)
    for startIdx, subseq in enumerate(Xnorm[1:], 1):
        if np.sum(subseq * subseq) < .001:  # ignore zero seqs
            continue

        # Pick the closest neighbor that isn't the query itself. The
        # minimum is taken explicitly so the result does not depend on the
        # order in which search_knn returns its hits.
        neighbors = tree.search_knn(subseq, 2)
        neighborIdx = -1
        neighborDist = np.inf
        for node, dist in neighbors:
            idx = node.metadata
            if idx == startIdx:
                continue
            if neighborIdx < 0 or dist < neighborDist:
                neighborIdx = idx
                neighborDist = dist
        if neighborIdx < 0:
            # Raised explicitly (not via `assert`) so the check survives -O.
            raise AssertionError(
                "ERROR: knn returned <2 neighbors... "
                "Neighbors returned: %r" % (neighbors,))

        if neighborDist < maxDist:
            # Store that the representative happened at this idx too.
            # setdefault keeps the list even when the representative came
            # from a pre-populated tree and has no entry yet.
            signalOccurIdxs.setdefault(neighborIdx, []).append(startIdx)
        else:
            signalOccurIdxs[startIdx] = [startIdx]
            tree.add(subseq, startIdx)

        # Rebalance if startIdx is a power of 2, so we do so log(N) times.
        # NOTE(review): the return value is discarded; if this kd module's
        # rebalance() returns a new root instead of rebalancing in place,
        # this call has no effect -- confirm.
        if startIdx & (startIdx - 1) == 0:
            tree.rebalance()

    return signalOccurIdxs, Xnorm
Ejemplo n.º 3
0
def allZNormalizedSubseqs(seqs, length):
    """Return all length-`length` subsequences of `seqs`, row-normalized.

    The windows come from `window.flattened_subseqs_of_length` (with
    norm='each') and are then passed through `zNormalizeRows` with
    `removeZeros=False`.
    """
    subseqs, _, _ = window.flattened_subseqs_of_length(
        seqs, length, norm='each')
    normalized = zNormalizeRows(subseqs, removeZeros=False)
    return normalized
Ejemplo n.º 4
0
def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
	removeZeros=True, k=-1):
	"""Compute per-dimension squared Euclidean distances between subseqs.

	All windows of `length` samples are extracted from `seqs` by
	`window.flattened_subseqs_of_length`. Within each dimension, every
	window is compared with every other window of that same dimension.

	Args:
		seqs: one array or a collection of arrays (wrapped by
			`asListOrTuple`). If the first array has fewer than 2 axes, or
			`tieDims` is True, everything is treated as a single dimension;
			otherwise each column of each array is handled as its own 1D
			sequence.
		length: window (subsequence) length.
		norm: normalization mode forwarded to
			`window.flattened_subseqs_of_length` in the single-dimension
			path. The per-column path always passes 'each'.
		tieDims: if True, do not split columns into separate dimensions.
		pad: if True, `length - 1` zero columns are appended to each
			distance matrix.
		removeZeros: if True, a row whose sum of squares is below 1e-6 is
			assigned the maximum distance to every other row.
		k: if positive, only the `k` smallest entries of each column are
			kept; all others are overwritten with the maximum distance.

	Returns:
		A tuple `(Dtensor, Xnorm)`. `Dtensor` has shape
		`(nDims, rowsPerDim, rowsPerDim + paddingLen)`. `Xnorm` is the
		subsequence matrix the distances were computed from; in the
		per-column path the sequences are handed to the windowing helper
		grouped by dimension (column 0 of every seq, then column 1, ...).
	"""
	seqs = asListOrTuple(seqs)
	nDims = 1
	if len(seqs[0].shape) < 2 or tieDims:
		Xnorm, _, _ = window.flattened_subseqs_of_length(seqs, length, norm=norm)
	else:
		nDims = seqs[0].shape[1]
		# Treat each column of each seq as a separate 1D seq. The columns
		# are ordered dimension-major (all seqs' column 0, then all seqs'
		# column 1, ...) because the loop below slices Xnorm into one
		# contiguous block of rows per dimension. Ordering them seq-major
		# would mix dimensions whenever more than one seq is given.
		colsPerSeq = [colsAsList(X) for X in seqs]
		flatSeqs = [cols[dim].flatten()  # col vects -> 1D arrays
			for dim in range(nDims)
			for cols in colsPerSeq]
		Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs, length, norm='each')

	nSamples, m = Xnorm.shape
	# Floor division keeps this an int (usable as a shape and slice bound)
	# regardless of whether true division is in effect.
	rowsPerDim = nSamples // nDims
	print("----- pairwiseDists")
	print("length %s" % (length,))
	print("origSeqs[0] shape %s" % (seqs[0].shape,))
	print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
	print("-----")

	paddingLen = length - 1 if pad else 0

	# Zero padding at the end of each row; only the first rowsPerDim
	# columns are ever written below.
	Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

	# NOTE(review): 4*m looks like the largest squared distance between two
	# z-normalized length-m vectors -- confirm against the normalization
	# actually applied by the windowing helper.
	maxPossibleDist = 2**2 * m
	for dim in range(nDims):
		# Rows of Xnorm belonging to this dimension.
		minIdx = dim * rowsPerDim
		Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

		for i, row in enumerate(Xdim):
			if removeZeros and np.sum(row * row) < 1.e-6:
				Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
				continue

			diffs = Xdim - row
			Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

		# Only keep the k lowest dists in each column.
		if k > 0:
			for j in range(rowsPerDim):
				col = Dtensor[dim, :, j]
				highestIdxs = np.argsort(col)[k:]
				Dtensor[dim, highestIdxs, j] = maxPossibleDist

	return Dtensor, Xnorm
Ejemplo n.º 5
0
def uniqueSubseqsInSignals(signal, length, maxDist, norm='each', tree=None):
	"""Greedily group the subsequences of `signal` around representatives.

	Subsequences are visited in order. Each one is compared with its
	closest already-stored representative in the kd tree: if that distance
	is below `maxDist`, the subsequence is recorded as another occurrence
	of the representative; otherwise it becomes a new representative and is
	added to the tree.

	Args:
		signal: input forwarded to `window.flattened_subseqs_of_length`.
		length: subsequence (window) length.
		maxDist: distance threshold (in whatever units `tree.search_knn`
			reports) below which a subsequence counts as a repeat.
		norm: normalization mode forwarded to
			`window.flattened_subseqs_of_length`.
		tree: optional existing kd tree to search and extend; a new one is
			created when None.

	Returns:
		A tuple `(signalOccurIdxs, Xnorm)`. `signalOccurIdxs` maps a
		representative's start index to the list of start indices at which
		it occurs. `Xnorm` is the matrix returned by `zNormalizeRows`,
		returned for convenience.

	Raises:
		AssertionError: if the tree search yields no usable neighbor.
	"""
	X, _, _ = window.flattened_subseqs_of_length(signal, length, norm=norm)
	Xnorm = zNormalizeRows(X, removeZeros=False)

	if len(Xnorm) == 0:
		return {}, Xnorm

	# The tree starts without the signal's data because we only want to
	# search through seqs that have already been added to the dictionary.
	if tree is None:
		tree = kd.create(dimensions=Xnorm.shape[1])

	# The first subsequence seeds the tree, so it must be registered as a
	# representative as well; otherwise matches against it are lost.
	signalOccurIdxs = {0: [0]}
	tree.add(Xnorm[0], 0)
	for startIdx, subseq in enumerate(Xnorm[1:], 1):
		if np.sum(subseq * subseq) < .001:  # ignore zero seqs
			continue

		# Pick the closest neighbor that isn't the query itself. The
		# minimum is taken explicitly so the result does not depend on the
		# order in which search_knn returns its hits.
		neighbors = tree.search_knn(subseq, 2)
		neighborIdx = -1
		neighborDist = np.inf
		for node, dist in neighbors:
			idx = node.metadata
			if idx == startIdx:
				continue
			if neighborIdx < 0 or dist < neighborDist:
				neighborIdx = idx
				neighborDist = dist
		if neighborIdx < 0:
			# Raised explicitly (not via `assert`) so the check survives -O.
			raise AssertionError(
				"ERROR: knn returned <2 neighbors... "
				"Neighbors returned: %r" % (neighbors,))

		if neighborDist < maxDist:
			# Store that the representative happened at this idx too.
			# setdefault keeps the list even when the representative came
			# from a pre-populated tree and has no entry yet.
			signalOccurIdxs.setdefault(neighborIdx, []).append(startIdx)
		else:
			signalOccurIdxs[startIdx] = [startIdx]
			tree.add(subseq, startIdx)

		# Rebalance if startIdx is a power of 2, so we do so log(N) times.
		# NOTE(review): the return value is discarded; if this kd module's
		# rebalance() returns a new root instead of rebalancing in place,
		# this call has no effect -- confirm.
		if startIdx & (startIdx - 1) == 0:
			tree.rebalance()

	return signalOccurIdxs, Xnorm
Ejemplo n.º 6
0
def allZNormalizedSubseqs(seqs, length):
	"""Return all length-`length` subsequences of `seqs`, row-normalized.

	The windows come from `window.flattened_subseqs_of_length` (with
	norm='each') and are then passed through `zNormalizeRows` with
	`removeZeros=False`.
	"""
	subseqs, _, _ = window.flattened_subseqs_of_length(
		seqs, length, norm='each')
	normalized = zNormalizeRows(subseqs, removeZeros=False)
	return normalized