-
Notifications
You must be signed in to change notification settings - Fork 1
/
queryparser.py
executable file
·487 lines (451 loc) · 17.4 KB
/
queryparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
"""
queryparser.py
author: Sophia van Valkenburg
This script retrieves a query string, user, and password.
If the user and password are correct, parse the query and
return a JSON string of the parsed values.
"""
import json
import datetime
from dateutil import parser as dp
from dateutil import relativedelta as rd
from flask import Flask, request
from nltk import RegexpParser, word_tokenize, pos_tag
from nltk.tree import Tree,ParentedTree
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
# Sentinel tags shared by the chunk grammars and slot assignment.
UNK = "UNK"    # token not captured by any grammar rule
SKIP = "SKIP"  # token recognized but deliberately excluded from the slots
# Characters that preprocess() spaces out into separate tokens.
PUNCTUATION = "~`!@#$%^&*()_+-={}|[]\\:\";'<>?,./"
app = Flask(__name__)


@app.route('/parse', methods=['GET','POST'])
def receive_parse_request():
    """
    HTTP entry point: parse the 'text' form field after checking 'auth'.

    Expects 'text' (and optionally 'networks') in the form body and
    'auth' in the query string. Responds with a JSON document holding
    either the parse result or an {"error": ...} object.
    """
    query = request.form.get('text')
    networks = request.form.get('networks')
    auth_code = request.args.get('auth')
    if not (query and auth_code):
        payload = {"error": "must specify 'text' and 'auth' fields"}
    elif not authenticate(auth_code):
        payload = {"error": "auth code is incorrect"}
    else:
        payload = parse(query, networks)
    return json.dumps(payload)
def authenticate(auth):
    """
    Validate an auth code.

    Placeholder implementation: every code is accepted.
    TODO: implement real credential checking.
    """
    return True
def parse(query_text, networks_json):
    """
    Run the full parsing pipeline over a raw query string.

    Pipeline: normalize text -> tokenize -> chunk with the word-level
    grammar -> normalize dates -> POS-tag -> tag domain words (MEDIA,
    NETWORK, ...) -> chunk with the tag-level grammar -> assign slots.

    query_text -- raw user query
    networks_json -- JSON-encoded list of network names (may be None)

    Returns a dict with "parse" (slot name -> value) and "interpreted"
    (currently always "" -- make_sentence is a stub).
    """
    query_text = preprocess(query_text)
    tokens = word_tokenize(query_text)
    # word grammar operates on (word, word) pairs: each word is its own tag
    double_tokens = [ (w, w) for w in tokens ]
    wg = word_grammar()
    w_cp = RegexpParser(compile_grammar(wg))
    word_result = w_cp.parse(double_tokens)
    word_result = convert_dates(word_result)
    # rebuild the token list -- convert_dates may have rewritten date spans
    # (Python 2 only: zip(...)[0] is not subscriptable on Python 3)
    new_tokens = list(zip(*(word_result.leaves()))[0])
    tagged = pos_tag(new_tokens)
    domain_tagged = tag_domains(tagged, networks_json)
    tg = tag_grammar()
    t_cp = RegexpParser(compile_grammar(tg))
    tagged_result = t_cp.parse(domain_tagged)
    slots = assign_slots(new_tokens, tagged_result, word_result)
    interpreted_input = make_sentence(slots)
    # NOTE(review): debug prints left in -- consider logging instead
    print 'tagged-result = ',tagged_result
    print 'word-result = ',word_result
    return {"parse":slots, "interpreted":interpreted_input}
def preprocess(string):
    """
    Normalize a raw query string for tokenization.

    - strip surrounding whitespace and lowercase
    - space out punctuation so each mark becomes its own token
      (so "2/13/2014" becomes "2 / 13 / 2014")
    - replace '<' / '>' with 'lt' / 'gt' because these symbols are reserved

    Returns the normalized string.
    """
    string = string.strip().lower()
    # build in a list and join once: repeated `new_string += ch` is O(n^2)
    pieces = []
    for ch in string:
        if ch in PUNCTUATION:
            if ch == '>':
                ch = 'gt'
            elif ch == '<':
                ch = 'lt'
            ch = " " + ch + " "
        pieces.append(ch)
    return "".join(pieces)
def make_sentence(slots):
    """
    Render the parsed slots back into a human-readable sentence.

    Stub: always returns the empty string.
    """
    return ""
def convert_dates(tree):
    """
    Convert date chunks in the parse tree to one MM-DD-YYYY string and
    resolve relative dates ("two weeks ago") against today's date.
    (Output format is %m-%d-%Y.)

    SDATE subtrees are parsed directly; RDATE subtrees are interpreted
    item by item and, once resolved, relabeled as SDATE with their
    leaves replaced by the formatted date. Mutates and returns *tree*.
    """
    today = datetime.datetime.today()
    # default is jan 1 -- fills date fields the text leaves unspecified
    default_date = datetime.datetime( today.year, 1, 1)
    for subtree in tree.subtrees():
        if subtree.node == 'SDATE':
            # years=1: a parsed date in the future is assumed to be last year
            date = parse_raw_date(subtree, default_date, today, years=1)
            # NOTE(review): parse_raw_date can return None, which would make
            # strftime raise here -- verify unparseable SDATEs cannot occur
            date_str = datetime.date.strftime(date, '%m-%d-%Y')
            subtree[:] = [(date_str, date_str)]
        elif subtree.node == 'RDATE':
            date = None
            last_number = None
            for item in subtree:
                if not isinstance(item, Tree):
                    # it's just a word
                    if item[0] == 'yesterday':
                        date = today - rd.relativedelta(days=1)
                    elif item[0] == 'today':
                        date = today
                else:
                    if item.node == 'MONTH':
                        # TODO: this should really be a range
                        date = parse_raw_date(item,default_date,today,years=1)
                    elif item.node == 'DAY_OF_WEEK':
                        # weeks=1: a future weekday means last week's
                        date = parse_raw_date(item, today, today, weeks=1)
                    elif item.node == 'NUM':
                        # TODO: this should really be a range
                        parsed_num = parse_raw_number(item)
                        # if it's 4 digits it's probably a year
                        # (Python 2: None from parse_raw_number compares less
                        # than ints, so unknown words fall through harmlessly)
                        if parsed_num >= 1900: # only want recent years
                            yeardiff = parsed_num-today.year
                            date = parse_raw_date(item, default_date, today,
                                    years=yeardiff)
                        else:
                            # remember the count for a following DATE_UNIT
                            last_number = parsed_num
                    elif item.node == 'DATE_UNIT':
                        if not last_number:
                            last_number = 1
                        # NOTE(review): joins the leaves of the WHOLE tree,
                        # not just this subtree -- confirm that is intended.
                        # (Python 2 only: zip(...)[0] breaks on Python 3.)
                        leaves = zip(*tree.leaves())[0]
                        raw_unit = ' '.join(leaves)
                        if 'day' in raw_unit:
                            date = today - rd.relativedelta(days=last_number)
                        elif 'week' in raw_unit:
                            date = today - rd.relativedelta(weeks=last_number)
                        elif 'month' in raw_unit:
                            date = today - rd.relativedelta(months=last_number)
                        elif 'year' in raw_unit:
                            date = today - rd.relativedelta(years=last_number)
                        elif 'decade' in raw_unit:
                            date = today - rd.relativedelta(
                                    years=10*last_number)
            if date:
                # relabel the resolved relative date as a specific date
                date_str = datetime.date.strftime(date, '%m-%d-%Y')
                subtree.node = 'SDATE'
                subtree[:] = [(date_str, date_str)]
    return tree
def parse_raw_date(tree, default_date, today, **rd_kwargs):
    """
    Join the tree's leaf words and parse them with dateutil.

    default_date fills in fields missing from the text; if the parsed
    date lands after *today* it is pushed back by
    relativedelta(**rd_kwargs) (e.g. years=1 maps a future "march 5"
    to last year's march 5).

    Returns a datetime, or None if dateutil cannot parse the text.
    NOTE(review): callers pass the result straight to strftime without
    a None check -- verify unparseable dates cannot reach them.
    """
    # Python 2 only: zip(...)[0] is not subscriptable on Python 3
    leaves = zip(*tree.leaves())[0]
    raw_date = ' '.join(leaves)
    try:
        date = dp.parse(raw_date, default=default_date)
    except ValueError:
        print "Error parsing date %s"%raw_date
        return None
    if date > today:
        # don't want to be future date
        date = date - rd.relativedelta(**rd_kwargs)
    return date
def parse_raw_number(tree):
    """
    Convert the number held in a NUM chunk into an int.

    Joins the chunk's leaf words and tries int() first; failing that,
    falls back to a small word->number table ("one" .. "ten").

    Returns the int, or None when the text is neither digits nor a
    known number word.
    """
    # take the word element of each (word, tag, ...) leaf tuple
    # (fixed: the previous zip(*leaves)[0] subscripting is Python-2-only)
    raw_num = ' '.join(leaf[0] for leaf in tree.leaves())
    try:
        return int(raw_num)
    except ValueError:
        # for now, just use a limited dict to convert to number
        num_dict = {
            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
            'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        }
        return num_dict.get(raw_num)
def assign_slots(tokens, tag_tree, word_tree):
    """
    Map each token to a slot label and collect the slot values.

    tokens -- token list in sentence order
    tag_tree -- chunk tree from the tag-level grammar
    word_tree -- chunk tree from the word-level grammar

    The first USER/MEDIA/NETWORK token becomes the SEARCH focus; UNK
    tokens become KEYWORDs (minus stopwords/punctuation); repeated tags
    have their words joined with spaces. Returns {slot_name: value}
    where KEYWORDS maps to a list and every other slot to a string.
    """
    # set membership is O(1); the nltk stopword list is a plain list
    stopword_set = set(stopwords.words('english'))
    tokens_with_slot_tags = []
    word_tree = ParentedTree.convert(word_tree)
    tag_tree = ParentedTree.convert(tag_tree)
    word_tree_with_cats = tag_words_with_categories(word_tree)
    tag_tree_with_cats = tag_words_with_categories(tag_tree)
    for i, word in enumerate(tokens):
        tag = finalize_tags(i, word, tag_tree_with_cats, word_tree_with_cats)
        tokens_with_slot_tags.append((word, tag))
    found_query_focus = False
    for i, item in enumerate(tokens_with_slot_tags):
        word, tag = item
        if tag in ['USER', 'MEDIA', 'NETWORK'] and not found_query_focus:
            tokens_with_slot_tags[i] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tokens_with_slot_tags[i] = (word, 'KEYWORD')
    slots = {}
    for word, tag in tokens_with_slot_tags:
        # use the SKIP constant (was the literal 'SKIP'), matching UNK above
        if tag == SKIP:
            continue
        elif tag == 'KEYWORD':
            keywords = slots.setdefault('KEYWORDS', [])
            # `word in PUNCTUATION` is a substring test -- it only filters
            # single-character punctuation tokens, which is what preprocess
            # produces
            if word not in stopword_set and word not in PUNCTUATION:
                keywords.append(word)
        else:
            if tag not in slots:
                slots[tag] = word
            else:
                slots[tag] = ' '.join([slots[tag], word])
    return slots
def finalize_tags(i, word, tt, wt):
    """
    Decide the slot tag for token *i* by combining the tag-grammar tree
    (tt) and the word-grammar tree (wt), whose leaves carry
    (word, pos, category) after tag_words_with_categories.

    Returns one of the slot labels ('DATE_EXACT', 'DATE_FROM', 'DATE_TO',
    'LENGTH_NUM', 'LENGTH_UNIT', 'MEDIA', 'NETWORK', 'NETWORK_NAME',
    'USER', 'CREATOR'), or SKIP / UNK.
    """
    tt_ind = tt.leaf_treeposition(i)
    wt_ind = wt.leaf_treeposition(i)
    tt_pos = tt[tt_ind][1]
    tt_category = tt[tt_ind][2]
    wt_category = wt[wt_ind][2]
    if wt_category.endswith('DATE'):
        return 'DATE_EXACT'
    elif wt_category.endswith('DATE_FROM'):
        # keep the date words, drop the preposition itself
        if tt_pos != 'IN':
            return 'DATE_FROM'
        else:
            return SKIP
    elif wt_category.endswith('DATE_TO'):
        if tt_pos != 'IN' and tt_pos != 'TO':
            return 'DATE_TO'
        else:
            return SKIP
    elif wt_category == 'LENGTH':
        if tt_pos == 'CD' or tt_pos == 'LS':
            return 'LENGTH_NUM'
        elif tt_pos == 'NN' or tt_pos == 'NNS' or tt_pos == 'JJ':
            return 'LENGTH_UNIT'
        else:
            return SKIP
    elif tt_pos == 'MEDIA':
        return 'MEDIA'
    elif tt_pos == 'NETWORK':
        return 'NETWORK'
    elif tt_pos == 'NETWORK_NAME':
        return 'NETWORK_NAME'
    elif tt_pos == 'USER':
        return 'USER'
    elif tt_category == "PosP":
        # the possessor in "bob's videos"; len(word) > 1 drops bare "'s"
        if tt_pos != 'POS' and len(word) > 1:
            return 'CREATOR'
        else:
            return SKIP
    elif tt_category == 'PP':
        if tt_pos != 'IN' and tt_pos != 'TO':
            # scan left for the preposition governing this phrase
            j = i
            tt_pos_j = tt_pos
            # BUG FIX: the second comparison was 'T0' (tee-zero), so the
            # scan never stopped at a 'TO'-tagged token
            while j > 0 and tt_pos_j != 'IN' and tt_pos_j != 'TO':
                j = j-1
                tt_ind_j = tt.leaf_treeposition(j)
                tt_pos_j = tt[tt_ind_j][1]
            wt_ind_j = wt.leaf_treeposition(j)
            wt_category_j = wt[wt_ind_j][2]
            if wt_category_j == 'BY' or wt_category_j == 'FROM':
                return 'CREATOR'
            else:
                return UNK
        else:
            return SKIP
    else:
        return UNK
def tag_words_with_categories(tree):
    """
    tag each word in the tree with the category it is listed under
    for example, (LENGTH (NUM 1/1) minute/minute ) should have leaves
    1/1/LENGTH minute/minute/LENGTH

    Mutates and returns *tree*.
    """
    # only direct children of the root are visited; leaves sitting directly
    # under the root keep their 2-tuples (no category appended).
    # NOTE(review): finalize_tags indexes [2] unconditionally -- presumably
    # the grammars' catch-all rules guarantee every word is chunked; verify.
    for subtree in tree:
        if isinstance(subtree, Tree):
            for pos in subtree.treepositions('leaves'):
                # append this chunk's label (old-NLTK .node API) to each leaf
                subtree[pos] += (subtree.node,)
    return tree
def compile_grammar(glist):
    """
    Turn a list of (tag, rule, ...) entries into an nltk RegexpParser
    grammar string, one "TAG: rules \n" section per distinct tag.

    Entries repeating a tag with an identical rule string are collapsed;
    repeating a tag with a different rule raises ValueError.
    """
    gdict = {}
    # collect sections in a list and join once instead of += on a string
    sections = []
    for entry in glist:
        tag, rules = entry[0], entry[1:]
        rule_str = '\n'.join(rules)
        if tag in gdict:
            if gdict[tag] != rule_str:
                raise ValueError("%s has two rules: '%s' and '%s'"
                        %(tag, gdict[tag], rule_str) )
            # exact duplicate -- keep the first occurrence only
            continue
        gdict[tag] = rule_str
        sections.append("%s: %s \n"%(tag, rule_str))
    return "".join(sections)
def tag_grammar():
    """
    Chunk grammar applied to POS/domain-tagged tokens: media phrases,
    network phrases, prepositional phrases, possessives and quotes,
    with UNK as the catch-all for everything else.
    """
    return [
        ("MedP", "{<DT>?<JJ>*<MEDIA>+<NN.*|NetP|NETWORK.*>*}"),
        ("NetP", "{<DT>?<JJ>*<NETWORK.*>+<NN.*|MedP|MEDIA>*}"),
        ("PP", "{<.*>+}", "<.*>}{<RB>?<IN|TO>", "}<WDT><.*>+{"),
        ("PosP", "{<.*><POS><NNS>?}"),
        ("QUOTE", "{<``><.*>*<''>}"),
        (UNK, "{<.*>}", "}<MedP|NetP|PP|PosP|QUOTE>{"),
    ]
def word_grammar():
    """
    Chunk grammar applied to raw word tokens: numbers, lengths, specific
    and relative dates plus their from/to variants, preposition-like
    terminals, and a final UNK catch-all.
    """
    num_rules = num_grammar("NUM")
    date_rules = date_grammar("SDATE", "RDATE")
    terminals = [
        ("CMP", "{<over|under|at least|at most|atleast|atmost|"
                "more than|greater than|less than|fewer than|gt|lt>}"),
        ("UNIT", "{<sec|min|hr|second|minute|hour|day|seconds|minutes|hours|"
                 "days>}"),
        ("FROM", "{<after|starting|since|beginning|from>}"),
        ("TO", "{<before|ending|to>}"),
        ("ON", "{<on>}"),
        ("IN", "{<in>}"),
        ("OF", "{<of>}"),
        ("ABOUT", "{<about>}"),
        ("BY", "{<by>}"),
        ("WITH", "{<with>}"),
    ]
    length_rule = ("LENGTH", "{<CMP>?<NUM|a><->?<UNIT><long>?}")
    date_extras = [
        ("SDATE_FROM", "{<FROM><SDATE>}"),
        ("SDATE_TO", "{<TO><SDATE>}"),
        ("RDATE_FROM", "{<FROM><RDATE>}"),
        ("RDATE_TO", "{<TO><RDATE>}"),
        (UNK, "{<.*>}",
         "}<LENGTH|.*DATE.*|CMP|UNIT|FROM|TO|ON|IN|OF|ABOUT|BY|WITH>{"),
    ]
    return terminals + num_rules + [length_rule] + date_rules + date_extras
def num_grammar(tag="NUM"):
    """
    Build the single chunk rule that matches numeric tokens -- digit
    strings, spelled-out number words, and simple fractions -- under the
    given chunk label.

    Returns a one-element grammar list [(tag, rule)].
    """
    digit_words = "<one|two|three|four|five|six|seven|eight|nine>"
    teen_words = ("<ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|sevente"
                  "en|eighteen|nineteen>")
    ten_words = "<twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety>"
    large = "(<a>?<hundred|thousand|million|billion|trillion>)"
    number_words = "((%s|%s|%s)?%s?)" % (digit_words, teen_words, ten_words,
                                         large)
    number_chars = r"<\.>?<\d+>(<\.|/><\d+>)?"
    fraction = "(<and><a><half|quarter>|<half>)"
    rule = "{%s|%s%s?|%s}" % (number_chars, number_words, fraction, fraction)
    return [(tag, rule)]
def date_grammar(specific_tag="SDATE", relative_tag="RDATE"):
    """
    Grammar rules for specific dates (e.g. "march 5 , 2014") and relative
    dates (e.g. "two weeks ago"), plus the NUM / unit / weekday / month
    terminals they depend on.

    specific_tag -- chunk label for absolute dates
    relative_tag -- chunk label for relative dates
    """
    NUM = num_grammar("NUM")
    DAY_OF_WEEK = ("{<monday|mon|tuesday|tues|wednesday|wed|thursday|thur|"
            "friday|fri|saturday|sat|sunday|sun>}")
    # NOTE(review): "may" is absent from MONTH -- possibly deliberate since
    # "may" is also a modal verb; confirm before adding it.
    MONTH =("{<january|jan|february|feb|march|mar|april|apr|june|jun|july|jul|"
            "august|aug|september|sep|sept|october|oct|november|nov|december|"
            "dec>}")
    # BUG FIX: "seventeeth" / "eigteenth" were misspelled, so the correctly
    # spelled tokens coming from user text could never match these ordinals.
    ORD = ("<first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|"
        "eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|"
        "seventeenth|eighteenth|nineteenth|twentieth|thirtieth>")
    ORDCHARS = "<\d\d?(st|nd|rd|th)>"
    DAY = "{" + ORD + "|" + ORDCHARS +"}"
    # optional separator between date parts (".", "/", "-", ",")
    SEP = "<\.|/|-|,>?"
    SPECIFIC_DATE = ( "{<DAY_OF_WEEK>?" + SEP +
            "( <NUM>" + SEP + "<NUM> (" + SEP + "<NUM> )? |"
            + "<MONTH>" + SEP + "<DAY|NUM> (" + SEP + "<NUM> )? |"
            + "<DAY|NUM>" + SEP + "<MONTH> (" + SEP + "<NUM> )? |"
            + "<DAY|NUM><of><MONTH>)"
            + SEP + "<DAY_OF_WEEK>?}" )
    MOD = "<the last|last|this|this past|the past>"
    AGO = "<ago|before|previous|prior|previously|since|old>"
    UNIT = "{<day|week|month|year|decade|days|weeks|months|years|decades>}"
    RELATIVE_DATE = ("{ <yesterday>|<today>|<NUM><DATE_UNIT>"+AGO+
        "|"+MOD+"<NUM>?<DATE_UNIT>" + "|"+MOD+"?<MONTH|DAY_OF_WEEK> |"+
        "<NUM> }")
    return NUM + [("DATE_UNIT", UNIT), ("DAY_OF_WEEK", DAY_OF_WEEK),
            ("DAY", DAY), ("MONTH", MONTH),
            (specific_tag, SPECIFIC_DATE), (relative_tag, RELATIVE_DATE)]
def matches_domain(word, domain_synset, thresh=0.5):
    """
    check whether word is a part of the domain using max synset similarity
    in the future, can use machine learning approach
    word -- input word
    domain_synset -- list of synsets representing the domain
    thresh -- minimum path similarity required for a match
    """
    word_synsets = get_synsets([word])
    max_similarity_score = 0
    for synset1 in domain_synset:
        for synset2 in word_synsets:
            this_score = synset1.path_similarity(synset2)
            # path_similarity returns None when no path connects the synsets;
            # skip those explicitly (max(0, None) raises on Python 3, and on
            # Python 2 it silently returned 0 -- same result, made explicit)
            if this_score is not None:
                max_similarity_score = max( max_similarity_score, this_score)
    return max_similarity_score >= thresh
def tag_domains(tagged, networks_json):
    """
    Re-tag noun tokens whose meaning falls in one of our domains.

    Returns a new list of (word, tag) pairs where nouns may be re-tagged
    as MEDIA, NETWORK, NETWORK_NAME (exact match against the supplied
    network list) or USER, based on wordnet path similarity between the
    word's synsets and each domain's synsets. Other tokens pass through
    unchanged.

    In the future, we can add more features to the domain classifier.
    We should also look at the next and previous words to match network
    names instead of just checking one word (the current method will not
    work for networks that are more than one word long).
    """
    media_syns = media_synsets()
    network_syns = network_synsets()
    user_syns = user_synsets()
    known_networks = get_network_names(networks_json)
    result = []
    for word, tag in tagged:
        # only noun tokens (NN, NNS, NNP, ...) are domain candidates
        if tag.startswith('N'):
            if matches_domain(word, media_syns):
                tag = 'MEDIA'
            elif matches_domain(word, network_syns):
                tag = 'NETWORK'
            elif word in known_networks:
                tag = 'NETWORK_NAME'
            elif matches_domain(word, user_syns):
                tag = 'USER'
        result.append((word, tag))
    return result
def media_synsets():
    """Wordnet synsets for every word in the hardcoded media vocabulary."""
    vocabulary = [
        "media", "video", "photo", "audio", "clip", "movie", "news",
        "content", "advertisement", "podcast", "footage", "story",
        "coverage", "tv", "program", "upload", "file", "radio", "segment",
    ]
    return get_synsets(vocabulary)
def network_synsets():
    """
    returns wordnet synsets for all words in networks list.
    """
    # BUG FIX: "affliate" was misspelled, so wn.synsets() found nothing for
    # it and affiliate-like words could never match the NETWORK domain.
    words = ("network|channel|affiliate|broadcaster|distributor|provider|"
            "telecast|communications")
    wordlist = words.split("|")
    return get_synsets(wordlist)
def user_synsets():
    """Wordnet synsets for every word in the hardcoded user vocabulary."""
    return get_synsets(["user", "member", "username"])
def get_synsets(wordlist):
    """
    Flatten the wordnet synsets of every word in *wordlist* into one list.

    In the future, perhaps the caller word lists can be generated in a
    better way.
    """
    return [synset for word in wordlist for synset in wn.synsets(word)]
def get_network_names(networks_json=None):
    """
    gets a list of network names from the given json list

    Each name is run through preprocess() so it matches tokenized query
    text. Returns [] when the JSON is malformed.
    NOTE(review): only ValueError is caught -- a None argument reaches
    json.loads and may raise TypeError instead; verify callers always
    pass a string or that this path cannot occur.
    """
    networks = []
    try:
        networks = [ preprocess(s) for s in json.loads(networks_json) ]
    except ValueError:
        print "Unable to parse JSON networks list"
    return networks
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug interactive debugger,
    # which allows arbitrary code execution -- never run this in production.
    app.debug = True
    app.run()