/
collocate.py
61 lines (41 loc) · 1.24 KB
/
collocate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from nltk.collocations import *
from nltk.util import ngrams
from nltk.metrics import BigramAssocMeasures
from string import punctuation as punct
def collocRecursively(corp, interp, constructor, threshhold, addUnrelated, addBigram, filters=None):
    """Repeatedly merge high-scoring adjacent token pairs until none remain.

    Each pass scores the bigrams of the current corpus with the
    likelihood-ratio measure, then walks the corpus left to right. A pair
    whose score is strictly greater than ``threshhold`` is handed to
    ``addBigram`` and both tokens are consumed; otherwise the left token is
    handed to ``addUnrelated`` and the walk advances by one. Passes repeat on
    the rebuilt corpus until a pass merges nothing.

    The passes run in a loop rather than by self-recursion, so the number of
    passes is not bounded by the interpreter's recursion limit.

    Args:
        corp: Indexable sequence of tokens (``len`` and ``[]`` are used).
        interp: Callable mapping a token to the form used as a key in the
            scored bigrams; scores are looked up as
            ``(interp(left), interp(right))``.
        constructor: Callable taking the corpus and returning an object that
            provides ``score_ngrams(measure)`` (presumably an nltk bigram
            collocation finder; confirm against the caller).
        threshhold: Score a pair must exceed to be merged.
        addUnrelated: Callable ``(new_corpus, token)`` that records a token
            that was not merged.
        addBigram: Callable ``(new_corpus, (left, right))`` that records a
            merged pair.
        filters: Optional iterable of callables applied to the finder via
            ``applyFilters`` before scoring.

    Returns:
        The list produced by the first pass in which no pair was merged.
    """
    # NOTE(review): termination relies on the callbacks producing a corpus in
    # which scores eventually stop exceeding the threshold (e.g. addBigram
    # appending one merged token) -- confirm against the callers.
    while True:
        bgFinder = constructor(corp)
        if filters:
            bgFinder = applyFilters(bgFinder, filters)
        bgScores = dict(
            bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)
        )
        # Diagnostic output kept from the original: the six highest-scoring
        # bigrams of this pass, in ascending score order.
        print(sorted(bgScores.items(), key=lambda tup: tup[1])[-6:])

        N = len(corp)
        newCorp = []
        merged = False
        idx = 0
        while idx < N - 1:
            bg = (corp[idx], corp[idx + 1])
            key = (interp(bg[0]), interp(bg[1]))
            if bgScores.get(key, 0) > threshhold:
                addBigram(newCorp, bg)
                idx += 2  # both tokens of the pair are consumed
                merged = True
            else:
                addUnrelated(newCorp, bg[0])
                idx += 1
        # A final token is left over when the last pair was not merged
        # (or when the corpus holds a single token).
        if idx == N - 1:
            addUnrelated(newCorp, corp[idx])

        if not merged:
            return newCorp
        corp = newCorp
def applyFilters(bigrammer, filterList):
    """Call every filter in ``filterList`` on ``bigrammer``, in order.

    Each filter is invoked with ``bigrammer`` as its only argument and its
    return value is ignored, so filters are expected to act on the object
    itself. The same ``bigrammer`` object is returned afterwards.
    """
    for apply_filter in filterList:
        apply_filter(bigrammer)

    return bigrammer
def makeCollocated(corp, interpFunc, outfile):
    """Split a flat token stream into documents at ``"$|$"`` delimiters.

    Tokens are accumulated into the current document until a token whose
    ``interpFunc`` value equals ``"$|$"`` is seen; that token closes the
    current document (even an empty one) and is not itself stored.

    Args:
        corp: Iterable of tokens.
        interpFunc: Callable mapping a token to the value compared against
            the ``"$|$"`` delimiter.
        outfile: Accepted for interface compatibility; this function does
            not read or write it. NOTE(review): looks like output writing
            was planned but never implemented -- confirm intended use.

    Returns:
        A list of documents, each a list of the original tokens. Tokens that
        follow the last delimiter form a final document when there are any.
    """
    newCorp = []
    curDoc = []
    for word in corp:
        if interpFunc(word) == "$|$":
            newCorp.append(curDoc)
            curDoc = []
        else:
            curDoc.append(word)
    # Keep tokens after the last delimiter instead of silently dropping them.
    if curDoc:
        newCorp.append(curDoc)
    return newCorp