def computeNoise(background, foreground, windowSpan=200, noisiness=0.05):
    """Compute the noise vocabulary of a foreground corpus against a background.

    Both inputs are loaded with ``read(<arg>, 'dump')``, turned into frequency
    distributions with ``counter`` and handed to ``incrementalSweep``.

    Parameters:
        background: passed to ``read(background, 'dump')``.
        foreground: passed to ``read(foreground, 'dump')``.
        windowSpan: window span forwarded to ``incrementalSweep``
            (defaults to 200, the value that used to be hard-coded).
        noisiness: noisiness threshold forwarded to ``incrementalSweep``
            (defaults to 0.05, the value that used to be hard-coded).

    Returns:
        A two-item list ``[noise, freqDistForeground]``: the result of
        ``incrementalSweep`` and the foreground frequency distribution.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('filtering irrelevant words...')
    backgroundData = read(background, 'dump')
    foregroundData = read(foreground, 'dump')
    freqDistBackground = counter(backgroundData)
    freqDistForeground = counter(foregroundData)
    print('done.\ncomputing noise...')
    noise = incrementalSweep(freqDistForeground, freqDistBackground,
                             windowSpan, noisiness)
    print('done.')
    return [noise, freqDistForeground]
def incrementalSweep(freqDistCorpus, freqDistBackground, windowSpan, noisiness):
    """Slide a window over two frequency distributions until it stops being noisy.

    Item 1 of every pair in each distribution is collected into a list
    (presumably the word of a (count, word) pair -- confirm against
    ``counter``).  A ``Window`` starting at 0 is then moved forward one
    position at a time; for each position ``inCommon`` and ``evaluate`` decide
    whether the window is still noisy.

    Parameters:
        freqDistCorpus: iterable of indexable pairs for the corpus.
        freqDistBackground: iterable of indexable pairs for the background.
        windowSpan: second argument given to every ``Window``.
        noisiness: threshold forwarded to ``evaluate``.

    Returns:
        The set of items that ``counter`` reports with a count above 1 in the
        concatenation of both lists, each truncated at the final ``window.end``.

    Raises:
        SystemExit: with the message 'could not create a language model.' when
            the window end reaches the length of either list.
    """
    # Local import: the bare exit() used previously is injected by the `site`
    # module for interactive use and is not guaranteed to exist in a program.
    import sys

    oneSpace = [pair[1] for pair in freqDistCorpus]
    otherSpace = [pair[1] for pair in freqDistBackground]
    window = Window(0, windowSpan)
    noisy = True
    # The explicit comparison is kept on purpose: evaluate()'s return type is
    # not visible here, so truthiness might not be equivalent.
    while noisy == True:
        sharedWords = inCommon(window, oneSpace, otherSpace)
        noisy = evaluate(sharedWords, noisiness)
        # debugging aid: inspect window.start, window.end, noisy and
        # sharedWords.shared at this point.
        # NOTE(review): the bounds check and the advance below also run for the
        # window that was just judged clean, so the final window is one step
        # past it -- confirm this is intended.
        if window.end >= len(oneSpace) or window.end >= len(otherSpace):
            sys.exit('could not create a language model.')
        window = Window(window.start + 1, windowSpan)

    cutoff = window.end
    pooled = counter(oneSpace[:cutoff] + otherSpace[:cutoff])
    # Items counted more than once in the pooled prefixes form the noise set.
    return set(pair[1] for pair in pooled if pair[0] > 1)
def purge(ranks, clusters):
    """Drop ranked clusters whose feature sets overlap too much with a better one.

    First, every cluster referenced by ``ranks`` gets a ``features`` attribute:
    the set of item-1 values from ``counter(cluster.space)`` whose item-0 value
    exceeds ``findConfluence(freqDist)``.

    Then the ranks are consumed front to back.  The head entry is compared with
    every later entry; a later entry survives to the next round when

    * it shares features with the head, but the shared part is below 50% of
      the head's features AND below 50% of its own features, or
    * it shares nothing with the head and its own feature set is not empty.

    The head is emitted when its feature set is not empty and at least one
    later entry survived the comparison.

    NOTE(review): when a single entry is left there is nothing to compare it
    with, so it is never emitted -- the last surviving cluster is always
    dropped.  Confirm this is intended.

    Parameters:
        ranks: sequence of indexable entries; item 1 of each is a key into
            ``clusters``.  The sequence itself is not modified.
        clusters: mapping from key to cluster object exposing ``space``; the
            ``features`` attribute of each ranked cluster is overwritten.

    Returns:
        List of the entries from ``ranks`` that were kept, in their original
        relative order.
    """
    # Pass 1: attach a feature set to every ranked cluster.
    for rank in ranks:
        cluster = clusters[rank[1]]
        freqDist = counter(cluster.space)
        tippingPoint = findConfluence(freqDist)
        cluster.features = set(
            [count[1] for count in freqDist if count[0] > tippingPoint])

    newRanks = []
    # %-formatting reproduces the space-separated output of the former
    # multi-argument print statement on both Python 2 and Python 3.
    print('%s incoming clusters and...' % len(ranks))

    # Pass 2: repeatedly take the head and filter the rest against it.
    while ranks:
        head = ranks[0]
        headFeatures = clusters[head[1]].features
        validated = False
        remaining = []

        for other in ranks[1:]:
            otherFeatures = clusters[other[1]].features
            shared = headFeatures.intersection(otherFeatures)
            if shared:
                # "shared is under 50% of the set" written without division, so
                # the result does not depend on integer vs. true division.  A
                # non-empty intersection implies both sets are non-empty.
                keep = (len(shared) * 2 < len(headFeatures) and
                        len(shared) * 2 < len(otherFeatures))
            else:
                # Disjoint clusters are kept unless the other one is empty.
                keep = bool(otherFeatures)
            if keep:
                remaining.append(other)
                validated = True

        if headFeatures and validated:
            newRanks.append(head)
        ranks = remaining

    print('... %s outgoing clusters.' % len(newRanks))
    return newRanks