Example 1
0
def uniqueSubseqs(seqs, length, maxDist, tieDims=False):
    """Greedily deduplicate the length-`length` subseqs of `seqs`.

    Return a set of subseqs such that no subseq is within maxDist of any
    other subseq and all original subseqs are within maxDist of one of the
    subseqs returned; basically a greedy unique operator where uniqueness
    is defined by squared-L2 distance within maxDist.

    Parameters
    ----------
    seqs : sequence of 1D or 2D arrays
    length : int
        Subsequence length.
    maxDist : float
        Squared-L2 distance threshold for "duplicate".
    tieDims : bool
        If True (or input is 1D), treat each seq as a single signal
        instead of deduplicating each dimension independently.

    Returns
    -------
    (occurIdxs, Xnorms) pair; in the per-dimension path each element is a
    list with one entry per dimension.
    """
    seqs = asListOrTuple(seqs)
    is1D = len(seqs[0].shape) == 1

    if tieDims or is1D:
        # zip trick so that we always return two lists (occurIdxs and Xnorms);
        # materialize with list() so callers really get an indexable result
        # rather than a one-shot zip iterator (Python 3 zip is lazy)
        return list(zip(
            *[uniqueSubseqsInSignals(seqs, length, maxDist, norm='each_mean')]))
    else:
        nDims = seqs[0].shape[1]
        occurIdxs = []
        Xnorms = []
        for dim in range(nDims):  # for each dimension
            print("finding unique seqs in dim %d" % (dim, ))
            # list comprehension instead of map(): under Python 3, map()
            # returns a one-shot iterator, which would silently yield no
            # data if the helper iterated the signals more than once
            signals = [seq[:, dim] for seq in seqs]
            signalOccurIdxs, Xnorm = uniqueSubseqsInSignals(signals,
                                                            length,
                                                            maxDist,
                                                            norm='each_mean')
            print("found %d unique seqs" % (len(signalOccurIdxs), ))
            occurIdxs.append(signalOccurIdxs)
            Xnorms.append(Xnorm)

        return occurIdxs, Xnorms
Example 2
0
def uniqueSubseqs(seqs, length, maxDist, tieDims=False):
	"""Greedy dedup of length-`length` subseqs: no two returned subseqs lie
	within maxDist (squared L2) of each other, and every original subseq
	lies within maxDist of some returned subseq."""

	seqs = asListOrTuple(seqs)

	# 1D input (or tied dims): one pass over the signals as-is
	if tieDims or len(seqs[0].shape) == 1:
		# zip trick so that this path also yields two collections
		# (occurIdxs and Xnorms), mirroring the per-dimension path below
		result = uniqueSubseqsInSignals(seqs, length, maxDist, norm='each_mean')
		return zip(*[result])

	# ND input: deduplicate each dimension independently
	allOccurIdxs, allXnorms = [], []
	for d in range(seqs[0].shape[1]):
		print("finding unique seqs in dim %d" % (d,))
		signals = map(lambda seq: seq[:, d], seqs)
		idxsForDim, XnormForDim = uniqueSubseqsInSignals(
			signals, length, maxDist, norm='each_mean')
		print("found %d unique seqs" % (len(idxsForDim),))
		allOccurIdxs.append(idxsForDim)
		allXnorms.append(XnormForDim)

	return allOccurIdxs, allXnorms
Example 3
0
def pairwiseDists(seqs,
                  length,
                  norm='each',
                  tieDims=False,
                  pad=True,
                  removeZeros=True,
                  k=-1):
    """Compute all-pairs squared-L2 distances between length-`length`
    subsequences of `seqs`, one distance matrix per dimension.

    Parameters
    ----------
    seqs : sequence of 1D or 2D arrays
    length : int
        Subsequence length.
    norm : str
        Normalization passed to window.flattened_subseqs_of_length.
    tieDims : bool
        If True, all dims of a seq are flattened into one subseq instead
        of being treated as independent 1D signals.
    pad : bool
        If True, zero-pad each distance matrix's last axis by length-1
        columns so sample positions line up downstream.
    removeZeros : bool
        If True, (near-)all-zero subseq rows are assigned the maximum
        possible distance instead of their true distances.
    k : int
        If > 0, keep only the k smallest distances in each column; the
        rest are set to the maximum possible distance.

    Returns
    -------
    Dtensor : ndarray of shape (nDims, rowsPerDim, rowsPerDim + padding)
    Xnorm : ndarray
        The normalized subsequence matrix the distances were computed on.
    """

    seqs = asListOrTuple(seqs)
    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(seqs,
                                                         length,
                                                         norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # bypass flattening--each dim of each seq is treated as a separate
        # 1D seq; we end up with a long list whose elements are 1D vectors,
        # each of which was originally a column within some ND array in seqs
        #
        # note that this may do weird things if there's more than one seq
        # because the dims for each seq are sequential, rather than the seqs
        # for each dim
        #
        # list comprehensions rather than map() so this also works under
        # Python 3, where map() returns a one-shot iterator
        separatedByDim = [colsAsList(X) for X in seqs]
        flatSeqs = flattenListOfLists(separatedByDim)
        flatSeqs = [v.flatten() for v in flatSeqs]  # col vects -> 1D arrays
        Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs,
                                                         length,
                                                         norm='each')

    nSamples, m = Xnorm.shape
    # explicit floor division: nSamples is nDims blocks of equal size, and
    # plain `/` would produce a float (breaking np.zeros) under Python 3
    rowsPerDim = nSamples // nDims
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    # 0 pad at end so samples line up across offsets
    paddingLen = length - 1 if pad else 0
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # used as "infinity"; presumably an upper bound on the squared L2 dist
    # between normalized rows -- TODO confirm against the normalization in
    # window.flattened_subseqs_of_length
    maxPossibleDist = 2**2 * m
    maxIdx = 0
    for dim in range(nDims):
        # extract the contiguous block of subseqs belonging to this dim
        minIdx = maxIdx
        maxIdx += rowsPerDim
        Xdim = Xnorm[minIdx:maxIdx]
        # compute dists from each subseq to all subseqs in this dim
        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                # near-zero (flat) subseq: mark it as maximally distant
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue

            diffs = Xdim - row
            dists = np.sum(diffs * diffs, axis=1)
            Dtensor[dim, i, :rowsPerDim] = dists
        # only keep the k lowest dists in each column
        if k > 0:
            for j in np.arange(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm
Example 4
0
def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
	removeZeros=True, k=-1):
	"""Compute all-pairs squared-L2 distances between length-`length`
	subsequences of `seqs`, one distance matrix per dimension, stacked
	into a 3D tensor.

	seqs -- sequence of 1D or 2D arrays
	length -- subsequence length
	norm -- normalization passed to window.flattened_subseqs_of_length
	tieDims -- if True, all dims of a seq are flattened into one subseq
	pad -- if True, zero-pad each dist matrix's last axis by length-1 cols
	removeZeros -- if True, (near-)all-zero rows get maxPossibleDist
	k -- if > 0, keep only the k smallest dists per column

	Returns (Dtensor, Xnorm) where Dtensor has shape
	(nDims, rowsPerDim, rowsPerDim + padding).

	NOTE(review): Python 2 only -- uses print statements and relies on
	`/` being integer division for rowsPerDim.
	"""

	seqs = asListOrTuple(seqs)
	nDims = 1
	if len(seqs[0].shape) < 2 or tieDims:
		Xnorm, _, _ = window.flattened_subseqs_of_length(seqs, length, norm=norm)
	else:
		nDims = seqs[0].shape[1]
		# bypass flattening--each dim of each seq is treated as a separate
		# 1D seq; we end up with a long list whose elements are 1D vectors,
		# each of which was originally a column within some ND array in seqs
		#
		# note that this may do weird things if there's more than one seq
		# because the dims for each seq are sequential, rather than the seqs
		# for each dim
		separatedByDim = map(lambda X: colsAsList(X), seqs)
		flatSeqs = flattenListOfLists(separatedByDim)
		flatSeqs = map(lambda v: v.flatten(), flatSeqs) # col vects -> 1D arrays
		Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs, length, norm='each')

	nSamples, m = Xnorm.shape
	# integer division under Python 2; would need `//` for Python 3
	rowsPerDim = nSamples / nDims
	print "----- pairwiseDists"
	print "length", length
	print "origSeqs[0] shape", seqs[0].shape
	print "nsamples, m, rowsPerDim", Xnorm.shape, rowsPerDim
	print "-----"

	if pad:
		paddingLen = length - 1
	else:
		paddingLen = 0

	# print "Xnorm stats:", np.mean(Xnorm, axis=1), np.std(Xnorm, axis=1)

	# D = np.zeros((nSamples, nSamples+paddingLen*nDims)) # 0 pad at end so samples line up
	Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim+paddingLen))
	# D = np.zeros((nSamples, nSamples))

	# used as "infinity" below; presumably an upper bound on the squared
	# L2 dist between normalized rows -- TODO confirm vs. normalization
	maxPossibleDist = 2**2 * m
	maxIdx = 0
	for dim in range(nDims):
		# extract subseqs associated with this dim (contiguous row block)
		minIdx = maxIdx
		maxIdx += rowsPerDim
		Xdim = Xnorm[minIdx:maxIdx]
		# compute dists from each subseq to all subseqs in this dim
		for i, row in enumerate(Xdim):
			if removeZeros:
				# near-zero (flat) subseq: mark it as maximally distant
				if np.sum(row*row) < 1.e-6:
					Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
					continue

			diffs = Xdim - row
			diffs_sq = diffs * diffs
			# dMinIdx = minIdx + dim*paddingLen
			# dMaxIdx = dMinIdx + rowsPerDim
			dists = np.sum(diffs_sq, axis=1)

			# D[minIdx + i, dMinIdx:dMaxIdx] = dists
			Dtensor[dim, i,:rowsPerDim] = dists
		# only keep k lowest dists in each column
		if k > 0:
			for j in np.arange(rowsPerDim):
				col = Dtensor[dim, :, j]
				highestIdxs = np.argsort(col)[k:]
				Dtensor[dim, highestIdxs, j] = maxPossibleDist

	# return Dtensor, D, Xnorm
	return Dtensor, Xnorm