def findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=.1,
                         fromSeq=None):
    """Greedily select the instances of the pattern represented by ``seed``.

    Candidate instances are the non-overlapping minima (as returned by
    ``nonOverlappingMinima``) of the distances from ``seed`` to the entries
    of ``Xnorm``, visited from closest to farthest. Each candidate is folded
    into a running centroid, and the accumulated pattern cost is compared
    with the cheaper of two rival explanations:

    * a "random walk" model that charges a constant cost per instance, and
    * the "nearest enemy", i.e. the next-closest candidate not yet included.

    The prefix of candidates with the largest (rival - pattern) gap wins.

    Args:
        Xnorm: array indexed by candidate position along its first axis
            (presumably normalized subsequences -- confirm with the caller).
        seed: 1D array for the query subsequence. It is NOT modified.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.

    Returns:
        OutcastInfo whose ``score`` is the best gap and whose ``idxs`` are
        the chosen indices in order of increasing distance. When fewer than
        two minima exist, ``score`` is ``-np.inf`` and ``idxs`` is ``None``.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)

    # visit the relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(dists[minimaIdxs])]
    numCandidates = len(idxs)

    # Accumulate into private copies: summing into ``seed`` itself would
    # silently corrupt the caller's array (and Xnorm when seed is a view).
    centroidSums = np.copy(seed)
    centroid = np.copy(seed)
    vectLen = float(len(seed))
    avgDistToRandWalk = 1.

    distSum_pattern = 0
    bestGap = -np.inf
    bestIdxs = None
    # k is the number of instances in the pattern once ``idx`` is included;
    # it must stay an int because it is also used to index/slice ``idxs``.
    for k, idx in enumerate(idxs[1:], 2):
        # pattern model: cost of the new instance relative to the centroid
        # as it stood *before* this instance was added
        x = Xnorm[idx]
        diff = centroid - x
        distSum_pattern += np.dot(diff, diff) / vectLen
        centroidSums += x
        centroid = centroidSums / float(k)

        # random walk: constant cost per instance
        distSum_walk = avgDistToRandWalk * k

        # nearest enemy: the closest candidate that is not yet included;
        # +inf when there is none, so that min() falls back to the walk
        distSum_enemy = np.inf
        if k < numCandidates:
            diff_enemy = centroid - Xnorm[idxs[k]]
            distSum_enemy = np.dot(diff_enemy, diff_enemy) / vectLen * k

        gap = min(distSum_walk, distSum_enemy) - distSum_pattern
        if gap > bestGap:
            bestGap = gap
            bestIdxs = idxs[:k]

    return OutcastInfo(score=bestGap, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)
def old_findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=0.1,
                             fromSeq=None):
    """Legacy instance finder scored by ratios of consecutive distances.

    The non-overlapping minima of the distances from ``seed`` are sorted by
    distance. Starting from the two closest, instances are added one at a
    time; each prefix is scored as
    ``(distance of next candidate / distance of last member) * log(size)``
    and the best-scoring prefix is returned.

    The three closest minima are indexed directly, so at least three minima
    must exist or an IndexError is raised. The last minimum only ever serves
    as the "next candidate" and is never itself included.

    Args:
        Xnorm: data passed to ``distsToRows`` together with ``seed``.
        seed: query subsequence.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.

    Returns:
        OutcastInfo with the best score and the chosen indices sorted in
        increasing index order.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    # TODO use a min heap, since that's O(n) and this is O(nlgn)
    sortIdxs = np.argsort(minimaDists)
    minimaIdxs = minimaIdxs[sortIdxs]
    minimaDists = minimaDists[sortIdxs]

    # initialize with best pair so we don't return anomalies
    idxs = [minimaIdxs[0], minimaIdxs[1]]
    dist = minimaDists[1]
    nextIdx, nextDist = minimaIdxs[2], minimaDists[2]
    bestScore = (nextDist / dist) * np.log(len(idxs))
    bestIdxs = idxs[:]

    # Debug output. The reduced precision is applied only for this print and
    # then restored, so the process-wide NumPy print options are not left
    # changed. Formatting through str.format keeps the printed text the same
    # on Python 2 and Python 3.
    savedPrintOpts = np.get_printoptions()
    np.set_printoptions(precision=0)
    try:
        minimaDiffs = np.r_[0, minimaDists[1:] - minimaDists[:-1]]
        print("minima diffs: {}".format(minimaDiffs))
    finally:
        np.set_printoptions(**savedPrintOpts)

    for i in range(2, len(minimaIdxs) - 1):
        idx, dist = nextIdx, nextDist
        nextIdx, nextDist = minimaIdxs[i + 1], minimaDists[i + 1]
        idxs.append(idx)
        score = (nextDist / dist) * np.log(len(idxs))
        if score > bestScore:
            bestScore = score
            bestIdxs = idxs[:]

    bestIdxs = sorted(bestIdxs)
    return OutcastInfo(score=bestScore, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)
def findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=0.1,
                         fromSeq=None):
    """Greedily select the instances of the pattern represented by ``seed``.

    Candidate instances are the non-overlapping minima (as returned by
    ``nonOverlappingMinima``) of the distances from ``seed`` to the entries
    of ``Xnorm``, visited from closest to farthest. Each candidate is folded
    into a running centroid, and the accumulated pattern cost is compared
    with the cheaper of two rival explanations:

    * a "random walk" model that charges a constant cost per instance, and
    * the "nearest enemy", i.e. the next-closest candidate not yet included.

    The prefix of candidates with the largest (rival - pattern) gap wins.

    Args:
        Xnorm: array indexed by candidate position along its first axis
            (presumably normalized subsequences -- confirm with the caller).
        seed: 1D array for the query subsequence. It is NOT modified.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.

    Returns:
        OutcastInfo whose ``score`` is the best gap and whose ``idxs`` are
        the chosen indices in order of increasing distance. When fewer than
        two minima exist, ``score`` is ``-np.inf`` and ``idxs`` is ``None``.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)

    # visit the relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(dists[minimaIdxs])]
    numCandidates = len(idxs)

    # Accumulate into private copies: summing into ``seed`` itself would
    # silently corrupt the caller's array (and Xnorm when seed is a view).
    centroidSums = np.copy(seed)
    centroid = np.copy(seed)
    vectLen = float(len(seed))
    avgDistToRandWalk = 1.0

    distSum_pattern = 0
    bestGap = -np.inf
    bestIdxs = None
    # k is the number of instances in the pattern once ``idx`` is included;
    # it must stay an int because it is also used to index/slice ``idxs``.
    for k, idx in enumerate(idxs[1:], 2):
        # pattern model: cost of the new instance relative to the centroid
        # as it stood *before* this instance was added
        x = Xnorm[idx]
        diff = centroid - x
        distSum_pattern += np.dot(diff, diff) / vectLen
        centroidSums += x
        centroid = centroidSums / float(k)

        # random walk: constant cost per instance
        distSum_walk = avgDistToRandWalk * k

        # nearest enemy: the closest candidate that is not yet included;
        # +inf when there is none, so that min() falls back to the walk
        distSum_enemy = np.inf
        if k < numCandidates:
            diff_enemy = centroid - Xnorm[idxs[k]]
            distSum_enemy = np.dot(diff_enemy, diff_enemy) / vectLen * k

        gap = min(distSum_walk, distSum_enemy) - distSum_pattern
        if gap > bestGap:
            bestGap = gap
            bestIdxs = idxs[:k]

    return OutcastInfo(score=bestGap, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)
def findOutcastInstancesMDL(Xnorm, seed, length, maxOverlapFraction=0.1,
                            fromSeq=None, mdlBits=6, useEnemy=True):
    """Select pattern instances with a description-length (MDL) heuristic.

    Candidate instances are the non-overlapping minima of the distances from
    ``seed``, visited from closest to farthest. The data are quantized to
    ``2 ** mdlBits`` levels per row; a candidate is accepted when encoding
    the instances as differences from their centroid (plus the centroid
    itself) saves more bits than the best saving seen so far. The saving is
    credited with the cost of the cheaper rival explanation: a noise model
    (half the coded entropy) or, optionally, the "nearest enemy" (the
    next-closest candidate used as the reference instead of the centroid).

    ``entropy`` is a helper defined elsewhere; it is called both on single
    rows and with ``axis=1`` on 2D arrays.

    Args:
        Xnorm: 2D array; rows are candidate subsequences (min/max are taken
            along axis 1).
        seed: query subsequence passed to ``distsToRows``.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.
        mdlBits: number of bits used to quantize each row.
        useEnemy: when False, only the noise model is used as the rival.

    Returns:
        OutcastInfo with the best bit saving as ``score`` and the accepted
        indices sorted in increasing index order.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)

    # visit the relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(dists[minimaIdxs])]
    numCandidates = len(idxs)

    # compute quantized subsequences, e.g. 8 bits -> {0..255}
    numLevels = int(2 ** mdlBits)
    mins = np.min(Xnorm, axis=1).reshape((-1, 1))
    maxs = np.max(Xnorm, axis=1).reshape((-1, 1))
    ranges = maxs - mins
    # constant rows have zero range; map them to level 0 instead of NaN
    ranges[ranges == 0] = 1
    Xquant = (Xnorm - mins) / ranges * (numLevels - 1)
    # builtin int: the ``np.int`` alias no longer exists in current NumPy
    Xquant = Xquant.astype(int)

    # initialize MDL stats
    row = Xquant[idxs[0]]
    centroidSums = np.copy(row)
    origEnt = entropy(row)
    bitsave = -np.inf  # ensure 2nd subseq gets added
    instanceIdxs = [idxs[0]]

    # k is the 1-based position of ``idx`` among the sorted candidates, so
    # idxs[k] is the next candidate; it must be an int to be a valid index.
    for k, idx in enumerate(idxs[1:], 2):
        subseq = Xquant[idx]

        # compute original entropy of this instance along with current ones
        newOrigEnt = origEnt + entropy(subseq)

        # compute centroid when this instance is added; divide by the number
        # of instances actually summed, which is smaller than k once any
        # earlier candidate has been rejected
        newInstanceIdxs = instanceIdxs + [idx]
        newCentroidSums = centroidSums + subseq
        numInstances = float(len(newInstanceIdxs))
        newCentroid = (newCentroidSums / numInstances).astype(int)

        # compute coded entropy when this instance is added
        diffs = Xquant[newInstanceIdxs] - newCentroid
        newCodedEnt = np.sum(entropy(diffs, axis=1))

        # compute total bitsave if this instance is added
        newCodingSave = newOrigEnt - newCodedEnt
        newHypothesisEnt = entropy(newCentroid)
        newBitsave = newCodingSave - newHypothesisEnt

        # divide by 2 as heuristic to reduce entropy, since description
        # length doesn't correspond to any obvious probabilistic model
        noiseCodedEnt = newCodedEnt / 2

        # +inf when there is no enemy, so that min() falls back to the noise
        # model instead of forcing the bit saving to -inf
        enemyCodedEnt = np.inf
        if useEnemy and k < numCandidates:
            enemySubseq = Xquant[idxs[k]]
            enemyDiffs = Xquant[newInstanceIdxs] - enemySubseq
            enemyCodedEnt = np.sum(entropy(enemyDiffs, axis=1))

        newBitsave += min(noiseCodedEnt, enemyCodedEnt)

        if newBitsave > bitsave:
            bitsave = newBitsave
            origEnt = newOrigEnt
            centroidSums = newCentroidSums
            instanceIdxs = newInstanceIdxs

    bestIdxs = sorted(instanceIdxs)
    return OutcastInfo(score=bitsave, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)
def old_findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=.1,
                             fromSeq=None):
    """Legacy instance finder scored by ratios of consecutive distances.

    The non-overlapping minima of the distances from ``seed`` are sorted by
    distance. Starting from the two closest, instances are added one at a
    time; each prefix is scored as
    ``(distance of next candidate / distance of last member) * log(size)``
    and the best-scoring prefix is returned.

    The three closest minima are indexed directly, so at least three minima
    must exist or an IndexError is raised. The last minimum only ever serves
    as the "next candidate" and is never itself included.

    Args:
        Xnorm: data passed to ``distsToRows`` together with ``seed``.
        seed: query subsequence.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.

    Returns:
        OutcastInfo with the best score and the chosen indices sorted in
        increasing index order.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    # TODO use a min heap, since that's O(n) and this is O(nlgn)
    sortIdxs = np.argsort(minimaDists)
    minimaIdxs = minimaIdxs[sortIdxs]
    minimaDists = minimaDists[sortIdxs]

    # initialize with best pair so we don't return anomalies
    idxs = [minimaIdxs[0], minimaIdxs[1]]
    dist = minimaDists[1]
    nextIdx, nextDist = minimaIdxs[2], minimaDists[2]
    bestScore = (nextDist / dist) * np.log(len(idxs))
    bestIdxs = idxs[:]

    # Debug output. The reduced precision is applied only for this print and
    # then restored, so the process-wide NumPy print options are not left
    # changed. Formatting through str.format keeps the printed text the same
    # on Python 2 and Python 3.
    savedPrintOpts = np.get_printoptions()
    np.set_printoptions(precision=0)
    try:
        minimaDiffs = np.r_[0, minimaDists[1:] - minimaDists[:-1]]
        print("minima diffs: {}".format(minimaDiffs))
    finally:
        np.set_printoptions(**savedPrintOpts)

    for i in range(2, len(minimaIdxs) - 1):
        idx, dist = nextIdx, nextDist
        nextIdx, nextDist = minimaIdxs[i + 1], minimaDists[i + 1]
        idxs.append(idx)
        score = (nextDist / dist) * np.log(len(idxs))
        if score > bestScore:
            bestScore = score
            bestIdxs = idxs[:]

    bestIdxs = sorted(bestIdxs)
    return OutcastInfo(score=bestScore, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)
def findOutcastInstancesMDL(Xnorm, seed, length, maxOverlapFraction=.1,
                            fromSeq=None, mdlBits=6, useEnemy=True):
    """Select pattern instances with a description-length (MDL) heuristic.

    Candidate instances are the non-overlapping minima of the distances from
    ``seed``, visited from closest to farthest. The data are quantized to
    ``2 ** mdlBits`` levels per row; a candidate is accepted when encoding
    the instances as differences from their centroid (plus the centroid
    itself) saves more bits than the best saving seen so far. The saving is
    credited with the cost of the cheaper rival explanation: a noise model
    (half the coded entropy) or, optionally, the "nearest enemy" (the
    next-closest candidate used as the reference instead of the centroid).

    ``entropy`` is a helper defined elsewhere; it is called both on single
    rows and with ``axis=1`` on 2D arrays.

    Args:
        Xnorm: 2D array; rows are candidate subsequences (min/max are taken
            along axis 1).
        seed: query subsequence passed to ``distsToRows``.
        length: subsequence length; sets the minimum spacing between
            instances and is passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two instances
            are allowed to overlap.
        fromSeq: forwarded to ``nonOverlappingMinima`` and to the result.
        mdlBits: number of bits used to quantize each row.
        useEnemy: when False, only the noise model is used as the rival.

    Returns:
        OutcastInfo with the best bit saving as ``score`` and the accepted
        indices sorted in increasing index order.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)
    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)

    # visit the relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(dists[minimaIdxs])]
    numCandidates = len(idxs)

    # compute quantized subsequences, e.g. 8 bits -> {0..255}
    numLevels = int(2**mdlBits)
    mins = np.min(Xnorm, axis=1).reshape((-1, 1))
    maxs = np.max(Xnorm, axis=1).reshape((-1, 1))
    ranges = maxs - mins
    # constant rows have zero range; map them to level 0 instead of NaN
    ranges[ranges == 0] = 1
    Xquant = (Xnorm - mins) / ranges * (numLevels - 1)
    # builtin int: the ``np.int`` alias no longer exists in current NumPy
    Xquant = Xquant.astype(int)

    # initialize MDL stats
    row = Xquant[idxs[0]]
    centroidSums = np.copy(row)
    origEnt = entropy(row)
    bitsave = -np.inf  # ensure 2nd subseq gets added
    instanceIdxs = [idxs[0]]

    # k is the 1-based position of ``idx`` among the sorted candidates, so
    # idxs[k] is the next candidate; it must be an int to be a valid index.
    for k, idx in enumerate(idxs[1:], 2):
        subseq = Xquant[idx]

        # compute original entropy of this instance along with current ones
        newOrigEnt = origEnt + entropy(subseq)

        # compute centroid when this instance is added; divide by the number
        # of instances actually summed, which is smaller than k once any
        # earlier candidate has been rejected
        newInstanceIdxs = instanceIdxs + [idx]
        newCentroidSums = centroidSums + subseq
        numInstances = float(len(newInstanceIdxs))
        newCentroid = (newCentroidSums / numInstances).astype(int)

        # compute coded entropy when this instance is added
        diffs = Xquant[newInstanceIdxs] - newCentroid
        newCodedEnt = np.sum(entropy(diffs, axis=1))

        # compute total bitsave if this instance is added
        newCodingSave = newOrigEnt - newCodedEnt
        newHypothesisEnt = entropy(newCentroid)
        newBitsave = newCodingSave - newHypothesisEnt

        # divide by 2 as heuristic to reduce entropy, since description
        # length doesn't correspond to any obvious probabilistic model
        noiseCodedEnt = newCodedEnt / 2

        # +inf when there is no enemy, so that min() falls back to the noise
        # model instead of forcing the bit saving to -inf
        enemyCodedEnt = np.inf
        if useEnemy and k < numCandidates:
            enemySubseq = Xquant[idxs[k]]
            enemyDiffs = Xquant[newInstanceIdxs] - enemySubseq
            enemyCodedEnt = np.sum(entropy(enemyDiffs, axis=1))

        newBitsave += min(noiseCodedEnt, enemyCodedEnt)

        if newBitsave > bitsave:
            bitsave = newBitsave
            origEnt = newOrigEnt
            centroidSums = newCentroidSums
            instanceIdxs = newInstanceIdxs

    bestIdxs = sorted(instanceIdxs)
    return OutcastInfo(score=bitsave, idxs=bestIdxs,
                       length=length, fromSeq=fromSeq)