Exemple #1
0
def findOutcastInstances(Xnorm,
                         seed,
                         length,
                         maxOverlapFraction=.1,
                         fromSeq=None):
    """Grow a set of instances around ``seed`` and return the best prefix.

    Candidate rows are the indices returned by ``nonOverlappingMinima`` for
    the values returned by ``distsToRows(Xnorm, seed)``, visited in
    increasing order of that value. Each time a candidate is added, the
    running "pattern" cost grows by the squared difference (divided by
    ``len(seed)``) between the candidate and the centroid of the rows added
    before it. That cost is compared with the cheaper of two rivals:

    * a "random walk" model costing a fixed amount per instance, and
    * the "nearest enemy": the squared difference between the updated
      centroid and the next candidate that has not been added yet, scaled
      by the number of instances.

    The prefix of candidates with the largest ``rival - pattern`` gap wins.

    Args:
        Xnorm: 2D array indexed by row; presumably one normalized
            subsequence per row -- confirm against caller.
        seed: 1D vector the instances are grown from. It is not modified.
        length: subsequence length, used to derive the minimum spacing
            between candidates and passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two candidates
            may overlap.
        fromSeq: forwarded unchanged to ``nonOverlappingMinima`` and to the
            result.

    Returns:
        ``OutcastInfo`` with ``score`` set to the best gap and ``idxs`` set
        to the winning candidates in increasing order of distance. When
        fewer than two candidates exist the loop never runs and the result
        carries ``score=-inf`` and ``idxs=None``.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)

    # candidate indices in increasing order of distance
    idxs = minimaIdxs[np.argsort(dists[minimaIdxs])]
    numCandidates = len(idxs)

    # Copy the seed: the running sum is updated in place below and must not
    # write through to the caller's array (or to a row of Xnorm if the seed
    # is a view of one).
    centroidSums = np.copy(seed)
    centroid = np.copy(seed)
    distSum_pattern = 0

    vectLen = len(seed)

    # cost charged per instance by the random-walk rival model
    AVG_DIST_TO_RAND_WALK = 1.

    bestGap = -np.inf
    bestIdxs = None
    # k = number of instances once this candidate is included. It is kept
    # as an int because it is also used as an index and a slice bound.
    for k, idx in enumerate(idxs[1:], start=2):
        # pattern model: distance to the centroid *before* adding this row
        x = Xnorm[idx]
        diff = centroid - x
        distSum_pattern += np.dot(diff, diff) / vectLen

        centroidSums += x
        centroid = centroidSums / float(k)

        # random walk
        distSum_walk = AVG_DIST_TO_RAND_WALK * k

        # nearest enemy: idxs[k] is the first candidate not yet included
        distSum_enemy = np.inf
        if k < numCandidates:
            nextX = Xnorm[idxs[k]]
            diff_enemy = centroid - nextX
            distSum_enemy = np.dot(diff_enemy, diff_enemy) / vectLen * k

        rivalSum = min(distSum_walk, distSum_enemy)
        gap = rivalSum - distSum_pattern
        if gap > bestGap:
            bestGap = gap
            bestIdxs = idxs[:k]

    return OutcastInfo(score=bestGap,
                       idxs=bestIdxs,
                       length=length,
                       fromSeq=fromSeq)
Exemple #2
0
def old_findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=0.1, fromSeq=None):
    """Legacy instance selection based on ratios of consecutive distances.

    Candidates are the indices returned by ``nonOverlappingMinima`` for the
    values returned by ``distsToRows(Xnorm, seed)``, sorted by increasing
    value. For every prefix of ``count >= 2`` candidates the score is::

        (value of the next candidate / value of the last included one)
            * log(count)

    and the prefix with the highest score is returned. The first prefix
    (the best pair) is always the starting point, so a single instance is
    never returned.

    A "minima diffs" debug line is printed to stdout.

    Args:
        Xnorm: data passed to ``distsToRows``; presumably one subsequence
            per row -- confirm against caller.
        seed: vector passed to ``distsToRows``.
        length: subsequence length, used to derive the minimum spacing
            between candidates and passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two candidates
            may overlap.
        fromSeq: forwarded unchanged to ``nonOverlappingMinima`` and to the
            result.

    Returns:
        ``OutcastInfo`` whose ``idxs`` is a list of the selected candidate
        indices sorted in ascending order.

    Note:
        Position 2 of the sorted candidates is read unconditionally, so the
        function fails (IndexError for array inputs) when fewer than three
        candidates are found.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    # TODO use a min heap, since that's O(n) and this is O(nlgn)
    sortIdxs = np.argsort(minimaDists)
    minimaIdxs = minimaIdxs[sortIdxs]
    minimaDists = minimaDists[sortIdxs]

    # initialize with best pair so we don't return anomalies
    bestCount = 2
    bestScore = (minimaDists[2] / minimaDists[1]) * np.log(bestCount)

    # Format the debug line with a local precision instead of calling
    # np.set_printoptions, which would change array printing for the whole
    # process. The single-argument print works on Python 2 and 3 alike.
    minimaDiffs = np.r_[0, minimaDists[1:] - minimaDists[:-1]]
    print("minima diffs: " + np.array2string(minimaDiffs, precision=0))

    # A prefix of `count` candidates is scored by how far the first excluded
    # candidate is from the last included one; the last candidate can never
    # be included because nothing follows it.
    for count in range(3, len(minimaIdxs)):
        score = (minimaDists[count] / minimaDists[count - 1]) * np.log(count)
        if score > bestScore:
            bestScore = score
            bestCount = count

    bestIdxs = sorted(minimaIdxs[:bestCount])

    return OutcastInfo(score=bestScore, idxs=bestIdxs, length=length, fromSeq=fromSeq)
Exemple #3
0
def findOutcastInstances(Xnorm, seed, length, maxOverlapFraction=0.1, fromSeq=None):
    """Pick the set of rows that is best explained as repeats of ``seed``.

    ``distsToRows(Xnorm, seed)`` supplies one value per row and
    ``nonOverlappingMinima`` selects the candidate row indices; candidates
    are then added one at a time in increasing order of that value. Three
    running costs are tracked:

    * pattern: sum over added rows of the squared difference to the
      centroid of the rows added before them, divided by ``len(seed)``;
    * random walk: a constant cost per instance;
    * nearest enemy: squared difference between the current centroid and
      the next not-yet-added candidate, divided by ``len(seed)`` and
      multiplied by the instance count (infinite when no candidate is left).

    The score of a prefix is ``min(random walk, enemy) - pattern`` and the
    prefix with the highest score is returned.

    Args:
        Xnorm: 2D array indexed by row; presumably one normalized
            subsequence per row -- confirm against caller.
        seed: 1D vector used as the initial centroid. Left unmodified.
        length: subsequence length; determines the minimum spacing between
            candidates and is copied into the result.
        maxOverlapFraction: allowed overlap between candidates, as a
            fraction of ``length``.
        fromSeq: passed through to ``nonOverlappingMinima`` and the result.

    Returns:
        ``OutcastInfo`` with the best score and the corresponding candidate
        indices (ordered by increasing distance). With fewer than two
        candidates the score is ``-inf`` and ``idxs`` is ``None``.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(minimaDists)]
    numIdxs = len(idxs)

    # The sum is accumulated with "+=", so it needs its own buffer;
    # binding it directly to `seed` would overwrite the caller's data.
    centroidSums = np.copy(seed)
    centroid = np.copy(seed)
    distSum_pattern = 0

    vectLen = len(seed)

    # per-instance cost of the random-walk rival
    AVG_DIST_TO_RAND_WALK = 1.0

    bestGap = -np.inf
    bestIdxs = None
    for i, idx in enumerate(idxs[1:]):
        # number of instances including this one; an int so that it is a
        # valid index and slice bound further down
        k = i + 2

        # pattern model
        x = Xnorm[idx]
        diff = centroid - x
        distSum_pattern += np.dot(diff, diff) / vectLen

        centroidSums += x
        centroid = centroidSums / float(k)

        # random walk
        distSum_walk = AVG_DIST_TO_RAND_WALK * k

        # nearest enemy
        distSum_enemy = np.inf
        if k < numIdxs:
            nextIdx = idxs[k]
            nextX = Xnorm[nextIdx]
            diff_enemy = centroid - nextX
            distSum_enemy = np.dot(diff_enemy, diff_enemy) / vectLen * k

        rivalSum = min(distSum_walk, distSum_enemy)
        gap = rivalSum - distSum_pattern
        if gap > bestGap:
            bestGap = gap
            bestIdxs = idxs[:k]

    return OutcastInfo(score=bestGap, idxs=bestIdxs, length=length, fromSeq=fromSeq)
Exemple #4
0
def findOutcastInstancesMDL(Xnorm, seed, length, maxOverlapFraction=0.1, fromSeq=None, mdlBits=6, useEnemy=True):
    """Select instances of ``seed`` with a description-length style score.

    Candidate rows come from ``nonOverlappingMinima`` applied to
    ``distsToRows(Xnorm, seed)`` and are visited in increasing order of
    that value. Every row of ``Xnorm`` is quantized to ``2 ** mdlBits``
    integer levels over its own min..max range. A candidate is accepted
    when it raises the running "bitsave", computed with the external
    ``entropy`` helper as::

        sum of entropies of the accepted rows
          - sum of entropies of (accepted rows - integer centroid)
          - entropy of the centroid
          + min(noise rival, enemy rival)

    where the noise rival is half the coded entropy and the enemy rival is
    the coded entropy of the accepted rows relative to the next candidate.

    Args:
        Xnorm: 2D array, quantized row by row; presumably one normalized
            subsequence per row -- confirm against caller.
        seed: vector passed to ``distsToRows``.
        length: subsequence length; determines the minimum spacing between
            candidates and is copied into the result.
        maxOverlapFraction: allowed overlap between candidates, as a
            fraction of ``length``.
        fromSeq: passed through to ``nonOverlappingMinima`` and the result.
        mdlBits: number of bits used for quantization.
        useEnemy: NOTE(review): accepted for interface compatibility but
            never read; the enemy rival is always used.

    Returns:
        ``OutcastInfo`` with ``score`` set to the final bitsave and ``idxs``
        set to the accepted candidate indices as an ascending list.
    """
    minSpacing = max(int((1.0 - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    idxs = minimaIdxs[np.argsort(minimaDists)]
    numCandidates = len(idxs)

    # compute quantized subsequences
    numLevels = int(2 ** mdlBits)
    mins = np.min(Xnorm, axis=1).reshape((-1, 1))
    maxs = np.max(Xnorm, axis=1).reshape((-1, 1))
    ranges = maxs - mins
    # a constant row has zero range; divide by 1 so it quantizes to level 0
    # instead of producing NaN that is then cast to an arbitrary integer
    ranges = np.where(ranges > 0, ranges, 1)
    Xquant = (Xnorm - mins) / ranges * (numLevels - 1)  # e.g. 6 bits -> {0..63}
    Xquant = Xquant.astype(int)  # builtin int: the np.int alias no longer exists

    # initialize MDL stats
    row = Xquant[idxs[0]]
    centroidSums = np.copy(row)
    origEnt = entropy(row)
    bitsave = -np.inf  # ensure 2nd subseq gets added

    instanceIdxs = [idxs[0]]
    # k is the 1-based position of this candidate in `idxs`; idxs[k] is
    # therefore the candidate right after it. Kept as an int for indexing.
    for k, idx in enumerate(idxs[1:], start=2):
        subseq = Xquant[idx]

        # compute original entropy of this instance along with current ones
        newOrigEnt = origEnt + entropy(subseq)

        # compute coded entropy when this instance is added
        newInstanceIdxs = instanceIdxs + [idx]

        # compute centroid when this instance is added; the divisor is the
        # number of rows actually summed, which is smaller than k whenever
        # an earlier candidate was rejected
        newCentroidSums = centroidSums + subseq
        newCentroid = (newCentroidSums / float(len(newInstanceIdxs))).astype(int)

        diffs = Xquant[newInstanceIdxs] - newCentroid
        newCodedEnt = np.sum(entropy(diffs, axis=1))

        # compute total bitsave if this instance is added
        newCodingSave = newOrigEnt - newCodedEnt
        newHypothesisEnt = entropy(newCentroid)
        newBitsave = newCodingSave - newHypothesisEnt

        # divide by 2 as heuristic to reduce entropy, since description length
        # doesn't correspond to any obvious probabilistic model
        noiseCodedEnt = newCodedEnt / 2

        # NOTE(review): with no candidate left the enemy rival stays -inf,
        # so the last candidate can never be accepted -- confirm this is
        # intended rather than a sign error (+inf would ignore the enemy).
        enemyCodedEnt = -np.inf
        if k < numCandidates:
            enemySubseq = Xquant[idxs[k]]
            enemyDiffs = Xquant[newInstanceIdxs] - enemySubseq
            enemyCodedEnt = np.sum(entropy(enemyDiffs, axis=1))
        rivalEnt = min(noiseCodedEnt, enemyCodedEnt)
        newBitsave += rivalEnt

        if newBitsave > bitsave:
            bitsave = newBitsave
            origEnt = newOrigEnt
            centroidSums = newCentroidSums
            instanceIdxs = newInstanceIdxs

    bestIdxs = sorted(instanceIdxs)
    return OutcastInfo(score=bitsave, idxs=bestIdxs, length=length, fromSeq=fromSeq)
Exemple #5
0
def old_findOutcastInstances(Xnorm,
                             seed,
                             length,
                             maxOverlapFraction=.1,
                             fromSeq=None):
    """Older instance finder: choose the prefix with the biggest distance jump.

    The indices returned by ``nonOverlappingMinima`` (computed on the output
    of ``distsToRows(Xnorm, seed)``) are sorted by increasing value. Starting
    from the best pair, every longer prefix is scored as the ratio between
    the value of the first candidate left out and the value of the last
    candidate kept, multiplied by the log of the prefix size. The
    highest-scoring prefix is returned.

    A "minima diffs" debug line is written to stdout.

    Args:
        Xnorm: data passed to ``distsToRows``; presumably one subsequence
            per row -- confirm against caller.
        seed: vector passed to ``distsToRows``.
        length: subsequence length; determines the minimum spacing between
            candidates and is copied into the result.
        maxOverlapFraction: allowed overlap between candidates, as a
            fraction of ``length``.
        fromSeq: passed through to ``nonOverlappingMinima`` and the result.

    Returns:
        ``OutcastInfo`` with the best score and the selected candidate
        indices as an ascending list.

    Note:
        At least three candidates are required: the third sorted candidate
        is read unconditionally to score the initial pair, which fails
        (IndexError for array inputs) when it does not exist.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    # TODO use a min heap, since that's O(n) and this is O(nlgn)
    sortIdxs = np.argsort(minimaDists)
    minimaIdxs = minimaIdxs[sortIdxs]
    minimaDists = minimaDists[sortIdxs]

    # initialize with best pair so we don't return anomalies
    numKept = 2
    bestScore = (minimaDists[2] / minimaDists[1]) * np.log(numKept)

    # Debug output. The precision is applied to this one string only;
    # np.set_printoptions would leave every later array print in the
    # process rounded to 0 decimals. print(<one string>) behaves the same
    # on Python 2 and Python 3.
    gaps = np.r_[0, minimaDists[1:] - minimaDists[:-1]]
    print("minima diffs: " + np.array2string(gaps, precision=0))

    # Try each larger prefix; `size` candidates kept means candidate
    # number `size` (0-based) is the first one excluded.
    for size in range(3, len(minimaIdxs)):
        score = (minimaDists[size] / minimaDists[size - 1]) * np.log(size)
        if score > bestScore:
            bestScore = score
            numKept = size

    bestIdxs = sorted(minimaIdxs[:numKept])

    return OutcastInfo(score=bestScore,
                       idxs=bestIdxs,
                       length=length,
                       fromSeq=fromSeq)
Exemple #6
0
def findOutcastInstancesMDL(Xnorm,
                            seed,
                            length,
                            maxOverlapFraction=.1,
                            fromSeq=None,
                            mdlBits=6,
                            useEnemy=True):
    """Greedy, entropy-based selection of the instances of ``seed``.

    Rows of ``Xnorm`` are quantized to ``2 ** mdlBits`` integer levels, each
    row over its own min..max range. Candidates (indices from
    ``nonOverlappingMinima`` on the output of ``distsToRows``) are visited
    in increasing order of distance, and a candidate is kept only if it
    increases the running "bitsave":

        entropy of the kept rows taken individually
        minus entropy of the kept rows coded as offsets from their
              integer centroid
        minus entropy of the centroid itself
        plus  the smaller of two rival costs: half the coded entropy
              ("noise"), or the entropy of the kept rows coded as offsets
              from the next candidate ("enemy").

    All entropies are computed by the external ``entropy`` helper.

    Args:
        Xnorm: 2D array, quantized row by row; presumably one normalized
            subsequence per row -- confirm against caller.
        seed: vector passed to ``distsToRows``.
        length: subsequence length, used to derive the minimum spacing
            between candidates and passed through to the result.
        maxOverlapFraction: fraction of ``length`` by which two candidates
            may overlap.
        fromSeq: forwarded unchanged to ``nonOverlappingMinima`` and to the
            result.
        mdlBits: number of bits used for quantization.
        useEnemy: NOTE(review): currently unused by the body; the enemy
            rival is always evaluated.

    Returns:
        ``OutcastInfo`` with ``score`` set to the final bitsave and ``idxs``
        set to the kept candidate indices, sorted ascending, as a list.
    """
    minSpacing = max(int((1. - maxOverlapFraction) * length), 1)

    dists = distsToRows(Xnorm, seed)
    minimaIdxs = nonOverlappingMinima(dists, minSpacing, fromSeq=fromSeq)
    minimaDists = dists[minimaIdxs]

    # sort indices of relative minima in increasing order of distance
    sortIdxs = np.argsort(minimaDists)
    idxs = minimaIdxs[sortIdxs]
    numIdxs = len(idxs)

    # compute quantized subsequences
    numLevels = int(2**mdlBits)
    mins = np.min(Xnorm, axis=1).reshape((-1, 1))
    maxs = np.max(Xnorm, axis=1).reshape((-1, 1))
    ranges = (maxs - mins)
    # guard flat rows: a zero range would turn the row into NaN before the
    # integer cast; dividing by 1 maps such a row to level 0
    ranges = np.where(ranges > 0, ranges, 1)
    Xquant = (Xnorm - mins) / ranges * (numLevels - 1)  # e.g. 6 bits -> {0..63}
    # np.int was only an alias of the builtin and has been removed from NumPy
    Xquant = Xquant.astype(int)

    # initialize MDL stats
    row = Xquant[idxs[0]]
    centroidSums = np.copy(row)
    origEnt = entropy(row)
    bitsave = -np.inf  # ensure 2nd subseq gets added

    instanceIdxs = [idxs[0]]
    for i, idx in enumerate(idxs[1:]):
        # 1-based position of this candidate; idxs[k] is the one after it.
        # Must be an int because it is used as an index below.
        k = i + 2
        subseq = Xquant[idx]

        # compute original entropy of this instance along with current ones
        newOrigEnt = origEnt + entropy(subseq)

        newInstanceIdxs = instanceIdxs[:]
        newInstanceIdxs.append(idx)

        # compute centroid when this instance is added. Rejected candidates
        # are not part of the sum, so divide by the number of kept rows
        # rather than by the candidate position k.
        newCentroidSums = centroidSums + subseq
        numKept = float(len(newInstanceIdxs))
        newCentroid = (newCentroidSums / numKept).astype(int)

        # compute coded entropy when this instance is added
        diffs = Xquant[newInstanceIdxs] - newCentroid
        newCodedEnt = np.sum(entropy(diffs, axis=1))

        # compute total bitsave if this instance is added
        newCodingSave = newOrigEnt - newCodedEnt
        newHypothesisEnt = entropy(newCentroid)
        newBitsave = newCodingSave - newHypothesisEnt

        # divide by 2 as heuristic to reduce entropy, since description length
        # doesn't correspond to any obvious probabilistic model
        noiseCodedEnt = newCodedEnt / 2

        # NOTE(review): when this is the last candidate the enemy rival
        # remains -inf, which makes the bitsave -inf and rejects the
        # candidate -- verify that this is the intended behaviour.
        enemyCodedEnt = -np.inf
        if k < numIdxs:
            nextIdx = idxs[k]
            enemySubseq = Xquant[nextIdx]
            enemyDiffs = Xquant[newInstanceIdxs] - enemySubseq
            enemyCodedEnt = np.sum(entropy(enemyDiffs, axis=1))
        rivalEnt = min(noiseCodedEnt, enemyCodedEnt)
        newBitsave += rivalEnt

        if newBitsave > bitsave:
            bitsave = newBitsave
            origEnt = newOrigEnt
            centroidSums = newCentroidSums
            instanceIdxs = newInstanceIdxs

    bestIdxs = sorted(instanceIdxs)
    return OutcastInfo(score=bitsave,
                       idxs=bestIdxs,
                       length=length,
                       fromSeq=fromSeq)