def uniqueSubseqs(seqs, length, maxDist, tieDims=False):
    """Greedily select subsequences that are "unique" up to maxDist.

    Returns a set of subseqs such that no subseq is within maxDist of any
    other returned subseq and all original subseqs are within maxDist of one
    of the subseqs returned; basically a greedy unique operator where
    uniqueness is defined by L2^2 distance within maxDist. The actual search
    is delegated to uniqueSubseqsInSignals, always with norm='each_mean'.

    Args:
        seqs: a single array or a list/tuple of arrays (normalized with
            asListOrTuple). Only ``seqs[0].shape`` is inspected here to
            decide whether the data is 1D.
        length: subsequence length, forwarded to uniqueSubseqsInSignals.
        maxDist: distance threshold, forwarded to uniqueSubseqsInSignals.
        tieDims: if True, the sequences are passed on unchanged instead of
            being split into one signal per column.

    Returns:
        Two collections, ``occurIdxs`` and ``Xnorms``, that can be unpacked
        as ``occurIdxs, Xnorms = uniqueSubseqs(...)``:

        * tied or 1D input: the list ``[(occurIdxs,), (Xnorm,)]``, i.e. each
          result wrapped in a one-element tuple so the shape mirrors the
          multi-dimensional case;
        * otherwise: a tuple of two lists holding one entry per column of
          the input arrays.
    """
    seqs = asListOrTuple(seqs)
    is1D = len(seqs[0].shape) == 1

    if tieDims or is1D:
        signalOccurIdxs, Xnorm = uniqueSubseqsInSignals(
            seqs, length, maxDist, norm='each_mean')
        # Wrap each result so callers always get two collections back. This
        # is exactly what the old `zip(*[result])` produced on Python 2, but
        # built eagerly so it is not a one-shot iterator on Python 3.
        return [(signalOccurIdxs,), (Xnorm,)]

    nDims = seqs[0].shape[1]
    occurIdxs = []
    Xnorms = []
    for dim in range(nDims):  # for each dimension
        print("finding unique seqs in dim %d" % (dim,))
        # A real list (not a lazy `map` object) so the callee can index or
        # iterate it more than once regardless of Python version.
        signals = [seq[:, dim] for seq in seqs]
        signalOccurIdxs, Xnorm = uniqueSubseqsInSignals(
            signals, length, maxDist, norm='each_mean')
        print("found %d unique seqs" % (len(signalOccurIdxs),))
        occurIdxs.append(signalOccurIdxs)
        Xnorms.append(Xnorm)
    return occurIdxs, Xnorms
def uniqueSubseqs(seqs, length, maxDist, tieDims=False):
    """Greedily select subsequences that are "unique" up to maxDist.

    Returns a set of subseqs such that no subseq is within maxDist of any
    other returned subseq and all original subseqs are within maxDist of one
    of the subseqs returned; basically a greedy unique operator where
    uniqueness is defined by L2^2 distance within maxDist. The actual search
    is delegated to uniqueSubseqsInSignals, always with norm='each_mean'.

    Args:
        seqs: a single array or a list/tuple of arrays (normalized with
            asListOrTuple). Only ``seqs[0].shape`` is inspected here to
            decide whether the data is 1D.
        length: subsequence length, forwarded to uniqueSubseqsInSignals.
        maxDist: distance threshold, forwarded to uniqueSubseqsInSignals.
        tieDims: if True, the sequences are passed on unchanged instead of
            being split into one signal per column.

    Returns:
        Two collections, ``occurIdxs`` and ``Xnorms``, that can be unpacked
        as ``occurIdxs, Xnorms = uniqueSubseqs(...)``:

        * tied or 1D input: the list ``[(occurIdxs,), (Xnorm,)]``, i.e. each
          result wrapped in a one-element tuple so the shape mirrors the
          multi-dimensional case;
        * otherwise: a tuple of two lists holding one entry per column of
          the input arrays.
    """
    seqs = asListOrTuple(seqs)
    is1D = len(seqs[0].shape) == 1

    if tieDims or is1D:
        signalOccurIdxs, Xnorm = uniqueSubseqsInSignals(
            seqs, length, maxDist, norm='each_mean')
        # Wrap each result so callers always get two collections back. This
        # is exactly what the old `zip(*[result])` produced on Python 2, but
        # built eagerly so it is not a one-shot iterator on Python 3.
        return [(signalOccurIdxs,), (Xnorm,)]

    nDims = seqs[0].shape[1]
    occurIdxs = []
    Xnorms = []
    for dim in range(nDims):  # for each dimension
        print("finding unique seqs in dim %d" % (dim,))
        # A real list (not a lazy `map` object) so the callee can index or
        # iterate it more than once regardless of Python version.
        signals = [seq[:, dim] for seq in seqs]
        signalOccurIdxs, Xnorm = uniqueSubseqsInSignals(
            signals, length, maxDist, norm='each_mean')
        print("found %d unique seqs" % (len(signalOccurIdxs),))
        occurIdxs.append(signalOccurIdxs)
        Xnorms.append(Xnorm)
    return occurIdxs, Xnorms
def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
                  removeZeros=True, k=-1):
    """Compute squared-L2 distances between all normalized subsequences.

    Subsequences of the given length are extracted (and normalized) with
    window.flattened_subseqs_of_length, then compared pairwise within each
    dimension.

    Args:
        seqs: a single array or a list/tuple of arrays (normalized with
            asListOrTuple).
        length: subsequence length.
        norm: normalization mode forwarded to
            window.flattened_subseqs_of_length. NOTE(review): it is only
            honoured for 1D input or when tieDims is True; the per-dimension
            branch always uses 'each' -- confirm this is intentional.
        tieDims: if True, multi-dimensional sequences are windowed as a
            whole rather than one column at a time.
        pad: if True, each distance row gets ``length - 1`` extra trailing
            columns, which are left at zero.
        removeZeros: if True, any subsequence whose squared norm is below
            1e-6 has its entire row of distances set to the maximum value
            instead of being compared.
        k: if positive, only the k smallest entries of every column (within
            a dimension) are kept; all others are overwritten with the
            maximum value.

    Returns:
        (Dtensor, Xnorm) where Dtensor has shape
        ``(nDims, rowsPerDim, rowsPerDim + paddingLen)`` and
        ``Dtensor[d, i, j]`` is the squared distance between subsequences i
        and j of dimension d, and Xnorm is the subsequence matrix returned
        by window.flattened_subseqs_of_length.
    """
    seqs = asListOrTuple(seqs)
    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(
            seqs, length, norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # bypass flattening--each dim of each seq is treated as a separate
        # 1D seq; we end up with a long list whose elements are 1D vectors,
        # each of which was originally a column within some ND array in seqs
        #
        # note that this may do weird things if there's more than one seq
        # because the dims for each seq are sequential, rather than the seqs
        # for each dim
        #
        # List comprehensions (not `map`) so the helpers receive real lists
        # on Python 3 as well as Python 2.
        separatedByDim = [colsAsList(X) for X in seqs]
        flatSeqs = flattenListOfLists(separatedByDim)
        flatSeqs = [v.flatten() for v in flatSeqs]  # col vects -> 1D arrays
        Xnorm, _, _ = window.flattened_subseqs_of_length(
            flatSeqs, length, norm='each')

    nSamples, m = Xnorm.shape
    # Floor division: the result is used as an array dimension and as a
    # slice bound, so it must stay an int under Python 3 too.
    rowsPerDim = nSamples // nDims

    # Single-argument print calls emit the same text on Python 2 and 3.
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    paddingLen = length - 1 if pad else 0

    # 0 pad at end so samples line up
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # Value used to mark "as far away as possible"; presumably an upper
    # bound for normalized subsequences of m samples -- TODO confirm.
    maxPossibleDist = 2**2 * m

    for dim in range(nDims):
        # extract subseqs associated with this dim
        minIdx = dim * rowsPerDim
        Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

        # compute dists from each subseq to every subseq in this dim
        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue
            diffs = Xdim - row
            Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

        # only keep k lowest dists in each (non-padding) column
        if k > 0:
            for j in range(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm
def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
                  removeZeros=True, k=-1):
    """Compute squared-L2 distances between all normalized subsequences.

    Subsequences of the given length are extracted (and normalized) with
    window.flattened_subseqs_of_length, then compared pairwise within each
    dimension.

    Args:
        seqs: a single array or a list/tuple of arrays (normalized with
            asListOrTuple).
        length: subsequence length.
        norm: normalization mode forwarded to
            window.flattened_subseqs_of_length. NOTE(review): it is only
            honoured for 1D input or when tieDims is True; the per-dimension
            branch always uses 'each' -- confirm this is intentional.
        tieDims: if True, multi-dimensional sequences are windowed as a
            whole rather than one column at a time.
        pad: if True, each distance row gets ``length - 1`` extra trailing
            columns, which are left at zero.
        removeZeros: if True, any subsequence whose squared norm is below
            1e-6 has its entire row of distances set to the maximum value
            instead of being compared.
        k: if positive, only the k smallest entries of every column (within
            a dimension) are kept; all others are overwritten with the
            maximum value.

    Returns:
        (Dtensor, Xnorm) where Dtensor has shape
        ``(nDims, rowsPerDim, rowsPerDim + paddingLen)`` and
        ``Dtensor[d, i, j]`` is the squared distance between subsequences i
        and j of dimension d, and Xnorm is the subsequence matrix returned
        by window.flattened_subseqs_of_length.
    """
    seqs = asListOrTuple(seqs)
    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(
            seqs, length, norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # bypass flattening--each dim of each seq is treated as a separate
        # 1D seq; we end up with a long list whose elements are 1D vectors,
        # each of which was originally a column within some ND array in seqs
        #
        # note that this may do weird things if there's more than one seq
        # because the dims for each seq are sequential, rather than the seqs
        # for each dim
        #
        # List comprehensions (not `map`) so the helpers receive real lists
        # on Python 3 as well as Python 2.
        separatedByDim = [colsAsList(X) for X in seqs]
        flatSeqs = flattenListOfLists(separatedByDim)
        flatSeqs = [v.flatten() for v in flatSeqs]  # col vects -> 1D arrays
        Xnorm, _, _ = window.flattened_subseqs_of_length(
            flatSeqs, length, norm='each')

    nSamples, m = Xnorm.shape
    # Floor division: the result is used as an array dimension and as a
    # slice bound, so it must stay an int under Python 3 too.
    rowsPerDim = nSamples // nDims

    # Single-argument print calls emit the same text on Python 2 and 3.
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    paddingLen = length - 1 if pad else 0

    # 0 pad at end so samples line up
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # Value used to mark "as far away as possible"; presumably an upper
    # bound for normalized subsequences of m samples -- TODO confirm.
    maxPossibleDist = 2**2 * m

    for dim in range(nDims):
        # extract subseqs associated with this dim
        minIdx = dim * rowsPerDim
        Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

        # compute dists from each subseq to every subseq in this dim
        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue
            diffs = Xdim - row
            Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

        # only keep k lowest dists in each (non-padding) column
        if k > 0:
            for j in range(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm