# -*- coding: utf-8 -*-
"""
Created on Wed Oct 19 12:06:40 2016
@author: elliott
"""
from numpy import prod
from collections import Counter
from nltk import sent_tokenize, ngrams, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import perceptron
import utils
import phrase_similarity
import numpy as np
import pickle
tagger = perceptron.PerceptronTagger()
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
thermometers = ['democrats', 'republicans', 'protestants', 'catholics', 'jews', 'blacks', 'whites', 'southerners',
'big business', 'labor unions', 'liberals', 'conservatives', 'military', 'policemen',
'black militants', 'civil rights leaders', 'chicanos hispanics', 'democratic party',
'middle class people', 'people on welfare', 'political independents', 'political parties',
'poor people', 'republican party', 'womens right activist', 'young people', 'asian-americans', 'congress',
'environmentalists', 'anti abortionists', 'federal government', 'illegal aliens',
'christian fundamentalists', 'radical students', 'farmers', 'feminists', 'evangelical groups',
'elderly', 'supreme court', 'women']
def word_normalize(word,stemmer=None):
w = word.lower()
if stemmer == 'porter':
w = porter.stem(w)
elif stemmer == 'snowball':
w = snowball.stem(w)
elif stemmer == 'lemma':
w = lemmatizer.lemmatize(w)
return w
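# For example, word_normalize('Voting') should give 'voting', word_normalize('Voting', stemmer='porter')
# should give 'vote', and word_normalize('Voting', stemmer='lemma') should give 'voting'
# (the WordNet lemmatizer defaults to the noun part of speech).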
# Normalize Penn tags
tagdict = {'NN':'N',
'NNS':'N',
'NNP':'N',
'NNPS':'N',
'JJ':'A',
'JJR':'A',
'JJS':'A',
'VBG':'A',
'RB':'A', # adverbs treated as adjectives
'DT':'D',
'IN':'P',
'TO':'P',
'VB':'V',
'VBD':'V',
'VBN':'V',
'VBP':'V',
'VBZ':'V',
'MD': 'V', # modals treated as verbs
'RP': 'V', # particles treated as verbs
'CC': 'C'}
# Allowed sequences of tag patterns (from Ash 2016)
tagpatterns = {'A','N','J',
'AN','NN', 'VN', 'VV', 'NV',
'VP',
'NNN','AAN','ANN','NAN','NPN',
'VAN','VNN', 'AVN', 'VVN',
'VPN','ANV','NVV','VDN', 'VVV', 'NNV',
'VVP','VAV','VVN',
'NCN','VCV', 'ACA',
'PAN',
               'NCVN','ANNN','NNNN','NPNN', 'AANN', 'ANPN','NNPN','NPAN',
'ACAN', 'NCNN', 'NNCN', 'ANCN', 'NCAN',
'PDAN', 'PNPN',
'VDNN', 'VDAN','VVDN'}
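# For example, 'equal protection' is tagged A N -> 'AN', which is an allowed pattern,
# while 'of the' is tagged P D -> 'PD', which is not and so is never counted as a phrase.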
def tagsentence(sent,stemmer=None,vocab=None):
# convert to one-letter tags if applicable,
# replace with none if word not in vocab
# replace with none if tag not in tagdict
tagwords = []
for x in tagger.tag(sent):
if (vocab is None or x[0] in vocab) and x[1] in tagdict:
normword = word_normalize(x[0],stemmer=stemmer)
normtag = tagdict[x[1]]
tagwords.append((normword,normtag))
else:
tagwords.append(None)
return tagwords
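# A rough illustration: tagging ['a', 'reasonable', 'doubt'] should yield something like
# [('a', 'D'), ('reasonable', 'A'), ('doubt', 'N')], with None in place of any word that is
# out of vocab or whose Penn tag is not in tagdict.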
def gmean(phrase, termfreqs):
    """Geometric mean association score of an n-gram phrase."""
    words = phrase.split('_')
    n = len(words)
    p = [termfreqs[w] for w in words]
    pg = termfreqs[phrase]
    return pg / (prod(p) ** (1 / n))
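# For a bigram 'w1_w2' this reduces to count('w1_w2') / sqrt(count('w1') * count('w2')),
# so higher scores indicate words that co-occur more often than their separate counts would suggest.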
def train_phraser(max_phrase_length=3, stemmer=None, vocab=None,
min_doc_freq=None, min_gmean=None):
# take documents and get POS-gram dictionary
numdocs = 0
docfreqs = Counter()
termfreqs = Counter()
root_Directory = 'data/clean_Mar_20'
list_of_dirs = utils.getDirectoryList(root_Directory)
for directory in list_of_dirs:
if not directory.endswith('zip'):
print(directory)
utils.createDirectory("similarities")
utils.createDirectory("similarities/" + directory)
files = utils.getFilesListFromDir(directory)
for file_name in files:
para_list = utils.getParaListFromFile(file_name, directory)
for para in para_list:
numdocs += 1
docgrams = set()
# split into sentences
sentences = sent_tokenize(para)
for sentence in sentences:
# split into words and get POS tags
words = sentence.split()
tagwords = tagsentence(words, stemmer, vocab)
for n in range(1, max_phrase_length + 1):
rawgrams = ngrams(tagwords, n)
for rawgram in rawgrams:
# skip grams that have words not in vocab
if None in rawgram:
continue
gramtags = ''.join([x[1][0] for x in rawgram])
if gramtags in tagpatterns:
# if tag sequence is allowed, add to counter
gram = '_'.join([x[0] for x in rawgram])
termfreqs[gram] += 1
docgrams.add(gram)
docfreqs.update(docgrams)
# filter vocabulary based on document frequency and make gram ids
gram2id = {}
id2gram = {}
if min_doc_freq is None:
min_doc_freq = round(numdocs / 200) + 1
i = 0
for (phrase,v) in docfreqs.most_common():
if v < min_doc_freq:
break
        if min_gmean is not None:
            # check geometric mean association for multi-word phrases
            n = phrase.count('_') + 1
            if n >= 2:
                gscore = gmean(phrase, termfreqs)
                if gscore < min_gmean[n]:
                    continue
gram2id[phrase] = i
id2gram[i] = phrase
i += 1
return gram2id, id2gram
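# A usage sketch (the min_gmean thresholds are assumed to be keyed by phrase length in words):
#   gram2id, id2gram = train_phraser(max_phrase_length=3, stemmer='snowball',
#                                    min_doc_freq=5, min_gmean={2: 5.0, 3: 5.0})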
def apply_phraser(words, gram2id, max_phrase_length=3):
""""apply phraser method to sentence."
Input should be list of lower-case (stemmed) words"""
sentlength = len(words)
skip = 0
new_s = []
for i in range(sentlength):
if skip > 0:
skip -= 1
continue
if words[i] is None:
continue
for n in reversed(range(1,max_phrase_length+1)):
if i+n > sentlength:
continue
gram = words[i:i+n]
if None in gram:
continue
gram_word = '_'.join(gram)
if gram_word in gram2id:
new_s.append(gram2id[gram_word])
skip = n-1
break
return new_s
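# A small illustration: if 'reasonable_doubt' is in gram2id, then
# apply_phraser(['a', 'reasonable', 'doubt'], gram2id) returns the id of 'a' (if that unigram
# is in gram2id) followed by gram2id['reasonable_doubt'], matching the longest n-gram at each position first.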
'''
documents = ['This is a test document sentence. This is the second sentence.',
'This is a second test document.',
'Beyond a reasonable doubt.']
'''
root_Directory = 'data/clean_Mar_20'
list_of_dirs = utils.getDirectoryList(root_Directory)
# Training the phraser
phrase2id, id2phrase = train_phraser()
# Phrase vector of the thermometers
thermometer_vector = [phrase_similarity.PhraseVector(thermometer) for thermometer in thermometers]
# Getting data from the phraser
for directory in list_of_dirs:
if not directory.endswith('zip'):
print(directory)
utils.createDirectory("similarities")
utils.createDirectory("similarities/" + directory)
files = utils.getFilesListFromDir(directory)
for file_name in files:
para_list = utils.getParaListFromFile(file_name, directory)
caseLevelParaSimilarityVectorsCombined = []
            for para in para_list:
                # collect the unique phrase ids that occur in this paragraph
                para_phrase_ids = set()
                sentences = sent_tokenize(para)
                for sentence in sentences:
                    # split into lower-cased words and look up phrases
                    words = [w.lower() for w in sentence.split()]
                    para_phrase_ids.update(apply_phraser(words, phrase2id))
                # one similarity vector (one entry per thermometer group) per unique phrase
                similarity_vectors = [np.array([thermometer.CosineSimilarity(phrase_similarity.PhraseVector(id2phrase[phraseId]))
                                                for thermometer in thermometer_vector])
                                      for phraseId in para_phrase_ids]
                paraLevelSimilarityVector = np.mean(similarity_vectors, axis=0)
                caseLevelParaSimilarityVectorsCombined.append(paraLevelSimilarityVector)
            with open("similarities/" + directory + "/" + file_name, "wb") as out_file:
                pickle.dump(caseLevelParaSimilarityVectorsCombined, out_file)
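# A usage note: each pickled object is a list with one numpy similarity vector per paragraph
# (one entry per thermometer group). It can be read back with, for example:
#   with open("similarities/<directory>/<file_name>", "rb") as f:
#       paragraph_vectors = pickle.load(f)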