Beispiel #1
0
def computeNoise(background, foreground):
    """Load two corpora and compute the noise shared between them.

    Both arguments are handed unchanged to ``read(..., 'dump')``; the results
    are passed through ``counter`` and the two frequency distributions are
    given to ``incrementalSweep`` (foreground first) with a window span of
    200 and a noisiness of 0.05.

    Args:
        background: identifier of the background corpus, as accepted by
            ``read`` (presumably a file path -- confirm against ``read``).
        foreground: identifier of the foreground corpus, same form.

    Returns:
        A two-element list ``[noise, freqDistForeground]``: the value returned
        by ``incrementalSweep`` and the foreground frequency distribution.
    """
    # Single-argument print calls emit the same text under the Python 2
    # print statement and the Python 3 print function.
    print('filtering irrelevant words...')
    backgroundData = read(background, 'dump')
    foregroundData = read(foreground, 'dump')
    freqDistBackground = counter(backgroundData)
    freqDistForeground = counter(foregroundData)
    print('done.\ncomputing noise...')
    noise = incrementalSweep(freqDistForeground, freqDistBackground, 200, 0.05)
    print('done.')
    return [noise, freqDistForeground]
Beispiel #2
0
def incrementalSweep(freqDistCorpus, freqDistBackground, windowSpan, noisiness):
    """Slide a window over two ranked vocabularies until it stops being noisy.

    The second element of every pair in each distribution is extracted into a
    flat list. Starting at offset 0, a ``Window`` of ``windowSpan`` is moved
    one step at a time; at every step ``inCommon`` and ``evaluate`` decide
    whether the window still counts as noisy. Once it does not, the items up
    to ``window.end`` of both lists are pooled and those that ``counter``
    reports with a first element greater than 1 are returned.

    Args:
        freqDistCorpus: iterable of pairs; element [1] of each pair is used
            (presumably (count, word) -- confirm against ``counter``).
        freqDistBackground: iterable of pairs of the same form.
        windowSpan: span passed to every ``Window`` that is created.
        noisiness: threshold forwarded to ``evaluate``.

    Returns:
        A set of the selected items.

    Raises:
        SystemExit: with the message 'could not create a language model.'
            when ``window.end`` reaches the length of either list.
    """
    oneSpace = [pair[1] for pair in freqDistCorpus]
    otherSpace = [pair[1] for pair in freqDistBackground]
    window = Window(0, windowSpan)
    noisy = True
    # The return type of ``evaluate`` is not visible here, so the explicit
    # comparison against True is kept rather than relying on truthiness.
    while noisy == True:
        sharedWords = inCommon(window, oneSpace, otherSpace)
        noisy = evaluate(sharedWords, noisiness)
        if window.end >= len(oneSpace) or window.end >= len(otherSpace):
            # Raise SystemExit directly instead of calling the exit() helper
            # injected by the ``site`` module: that helper is meant for the
            # interactive shell and is missing when ``site`` is not loaded.
            raise SystemExit('could not create a language model.')
        # NOTE(review): the window is advanced (and the bounds check above is
        # applied) even on the iteration where ``evaluate`` reports the window
        # as no longer noisy, so the slice below uses the window *after* the
        # one that was evaluated. Kept as-is; confirm this is intended.
        window = Window(window.start + 1, windowSpan)
    pooled = oneSpace[:window.end] + otherSpace[:window.end]
    return set(pair[1] for pair in counter(pooled) if pair[0] > 1)
Beispiel #3
0
def purge(ranks, clusters):
    """Assign feature sets to ranked clusters and filter out overlapping ones.

    First pass: for every entry in ``ranks`` the referenced cluster gets a
    ``features`` attribute -- the set of element [1] of each ``counter`` pair
    whose element [0] exceeds the value returned by ``findConfluence``.

    Second pass: the first entry of ``ranks`` is repeatedly compared with all
    later entries. A later entry is carried over to the next round when

    * it shares features with the first entry, but the shared part is below
      50% of both feature sets, or
    * it shares nothing with the first entry and has a non-empty feature set.

    The first entry itself is kept only when it has features and at least one
    later entry was carried over.

    Args:
        ranks: sequence of entries whose element [1] is a key of ``clusters``.
        clusters: mapping from key to a cluster object exposing ``space``;
            ``features`` is written onto each referenced cluster.

    Returns:
        A list with the entries of ``ranks`` that were kept, in order.
    """
    for rank in ranks:
        cluster = clusters[rank[1]]
        freqDist = counter(cluster.space)
        tippingPoint = findConfluence(freqDist)
        cluster.features = set(
            count[1] for count in freqDist if count[0] > tippingPoint)

    newRanks = []
    # Formatting into one string keeps the output identical under the
    # Python 2 print statement and the Python 3 print function.
    print('%d incoming clusters and...' % len(ranks))
    while ranks:
        head = ranks[0]
        headFeatures = clusters[head[1]].features
        remaining = []
        for other in ranks[1:]:
            otherFeatures = clusters[other[1]].features
            shared = headFeatures.intersection(otherFeatures)
            if shared:
                # Overlap below 50% of each side, written as an integer
                # cross-multiplication so the result does not depend on
                # whether "/" floors (Python 2) or not (Python 3). A non-empty
                # intersection implies both feature sets are non-empty.
                distinct = (len(shared) * 100 < 50 * len(headFeatures) and
                            len(shared) * 100 < 50 * len(otherFeatures))
            else:
                distinct = bool(otherFeatures)
            if distinct:
                remaining.append(other)
        # NOTE(review): requiring a non-empty ``remaining`` means the last
        # entry examined (including a sole incoming entry) is never kept.
        # Behavior preserved from the original; confirm this is intended.
        if headFeatures and remaining:
            newRanks.append(head)
        ranks = remaining
    print('... %d outgoing clusters.' % len(newRanks))
    return newRanks