# keyphrase.py
from dumbo import *
import tfidf
import nltk

# Program variables
minLength = 3    # minimum acceptable word length
maxLength = 40   # maximum acceptable word length
maxGram = 4      # maximum length of a candidate n-gram
separator = ','  # separates terms in the output
# Used when tokenizing words
sentence_re = r'''(?x)        # set flag to allow verbose regexps
      [A-Z](?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*            # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                  # ellipsis
    | [][.,;"'?():_`-]        # these are separate tokens (the dash goes last
                              # so it is a literal, not a character range)
'''
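# Illustrative sketch of the tokens this pattern yields; the sample sentence
# is an assumption, not from the original code:
#   nltk.regexp_tokenize('The U.S.A. spent $12.40, up 82%.', sentence_re)
#   -> ['The', 'U.S.A.', 'spent', '$12.40', ',', 'up', '82%', '.']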
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Load the pre-trained POS tagger pickled alongside this script
from cPickle import load
with open('pos_tag.pkl', 'rb') as f:
    postagger = load(f)
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}    # nouns and adjectives, terminated with nouns

    NP:
        {<NBAR><IN><NBAR>}    # two NBARs connected with in/of/etc.
                              # (tried first, so these chunk as one NP)
        {<NBAR>}              # a lone NBAR
"""
chunker = nltk.RegexpParser(grammar)
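# Hedged sketch of the chunker's behaviour (the POS tags here are assumed,
# since real tags come from the pickled tagger):
#   chunker.parse([('the', 'DT'), ('rate', 'NN'),
#                  ('of', 'IN'), ('interest', 'NN')])
# produces a tree with a single NP subtree spanning "rate of interest",
# because the <NBAR><IN><NBAR> rule is tried before the lone <NBAR> rule.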
def leaves(tree):
    """Finds NP (noun phrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
        yield subtree.leaves()
def normalise(word):
    """Normalises a word to lowercase and stems it."""
    word = word.lower()
    word = stemmer.stem_word(word)
    return word
def acceptableWord(word):
    """Checks conditions for an acceptable word: length and not a stopword."""
    from mycorpus import stopwords
    accepted = bool(minLength <= len(word) <= maxLength
                    and word.lower() not in stopwords)
    return accepted
def acceptableGram(gram):
    """Checks that the n-gram is an appropriate length."""
    return bool(1 <= len(gram) <= maxGram)
@opt("addpath", "yes")
def termMapper((docname, lineNum), line):
    """
    Tokenizes, lemmatizes, POS-tags and chunks each line to find noun phrases.
    Output: (docname, candidate), (payload, 1)
    A payload is information we want to carry through all MapReduce phases
    until we need it, e.g. whether the term is in the title, or its position.
    """
    toks = nltk.regexp_tokenize(line, sentence_re)
    toks = [lemmatizer.lemmatize(t) for t in toks]
    postoks = postagger.tag(toks)
    tree = chunker.parse(postoks)
    position = 0
    for leaf in leaves(tree):
        term = [normalise(w) for w, t in leaf if acceptableWord(w)]
        if not acceptableGram(term):
            continue
        # Titles appear on line 0
        payload = (lineNum == 0, position)
        yield (docname, term), (payload, 1)
        position += 1
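# Hedged example of a single mapper emission. The input line is an assumption,
# the tagger is assumed to mark both tokens as nouns, and the Porter stemmer
# reduces "mining" to "mine"; line 0 marks a title:
#   termMapper(('doc1', 0), 'Data mining')
#   -> yields (('doc1', ['data', 'mine']), ((True, 0), 1))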
def reducePayloads(a, b):
    """
    Reduces two payloads (corresponding to the same term), for use in a
    reducer/combiner.
    The reduction returns whether the term appears in the title, and its
    earliest position. E.g. for a term that appears once in the title at
    position 5, and once elsewhere at position 17:
    >>> reducePayloads( (True, 5), (False, 17) )
    (True, 5)
    """
    return a[0] or b[0], min(a[1], b[1])
def termReducer((docname, term), values):
    """Reduces the payload and sums the count over terms per document.
    Can be used as a combiner too.
    Output: (docname, candidate), (payload, # of occurrences)
    """
    values = list(values)
    payload = reduce(reducePayloads, [p for p, n in values])
    n = sum(n for p, n in values)
    yield (docname, term), (payload, n)
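# Hedged example (the term and payloads are assumptions): three occurrences
# of one term, one of them in the title, reduce to a single record:
#   termReducer(('doc1', ['data', 'mine']),
#               [((True, 2), 1), ((False, 7), 1), ((False, 4), 1)])
#   -> yields (('doc1', ['data', 'mine']), ((True, 2), 3))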
class FinalReducer:
    """
    Ranks and outputs candidate terms. All title terms are output, and an
    upper fraction (ranked by score) of the remaining terms is selected too.
    """
    # Upper fraction of scored terms to output
    upper_fraction = 0.5

    def __init__(self):
        # requires -param doccount D
        self.doccount = float(self.params['doccount'])

    def __call__(self, docname, values):
        terms = []
        fd = nltk.probability.FreqDist()
        for (term, (inTitle, position), n, N, d) in values:
            #relativePos = float(position)/m
            term_str = ' '.join(term)
            if inTitle:
                terms.append(term_str)
            else:
                score = tfidf.tfidf(n, N, d, self.doccount)
                #score *= relativePos
                fd.inc(term_str, score)
        # Top upper_fraction of scored terms; FreqDist keys are sorted by
        # decreasing score, so a slice takes the best-scoring terms
        n = int(self.upper_fraction * len(fd))
        terms += fd.keys()[:n]
        yield docname, separator.join(terms)
def runner(job):
    job.additer(termMapper, termReducer, combiner=termReducer)
    tfidf.add_iters(job)
    job.additer(identitymapper, FinalReducer)

if __name__ == "__main__":
    main(runner)
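# Minimal sketch of launching this job with dumbo. The Hadoop home, the
# input/output paths and the document count are assumptions, and the exact
# flag syntax may vary with your dumbo version:
#   dumbo start keyphrase.py -hadoop /usr/lib/hadoop \
#       -input docs -output keyphrases \
#       -param doccount=1000 -file pos_tag.pkl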