# infoextract2.py
import sys,os
import re
import incident_predictor
import preprocess
import utility
import pattern_extractor
import matching
from meta import proc_meta
import expt
from answer_stats import answr_dict, get_weapon, get_perp_indiv, get_perp_org, get_target, get_victim
# Set DEBUG to True to print intermediate information while processing.
#DEBUG=True
DEBUG = False

# REGEX PATTERNS
# Written as raw strings so regex escapes (\d, \s, \-) are passed to the
# regex engine untouched instead of being treated as (invalid) Python string
# escapes. "\n" inside a raw pattern is still matched as a newline by `re`.

# Document header ID such as "DEV-MUC3-0001"; group(1) is the whole ID.
PATTERN = r"((DEV|TST1|TST2)\-MUC\d\-\d{4})"
EMPTY_LINE = r"\s*\n\s*$"  # checks if a line is empty
NEWLINE = r"\n(?=.)"  # selects a new line if it occurs before some char
COLL_SPACES = r"\s+"  # a run of one or more whitespace characters
SPACES_REPL = " "  # what a whitespace run is collapsed to
SPATT = "'S"  # possessive marker to strip from text
# The input file is the first (required) command-line argument.
if len(sys.argv) < 2:
    print("please enter an input file ")
    # Non-zero status so shells and scripts can detect the usage error.
    sys.exit(1)
input_file = sys.argv[1]
# The templates are written next to the input: "<input>.templates".
output_file = input_file + ".templates"
# opening the output file for writing (shared by print_outf, closed in main)
f_out = open(output_file, 'w')
def _format_slot(label, values):
    """Return the template text for one slot.

    An empty (or None) *values* gives the placeholder line "<label>: -".
    Otherwise the first value is written on the "<label>: " line and every
    further value on its own continuation line prefixed by a single space.
    """
    if not values:
        return "%s: -\n" % (label,)
    lines = []
    for position, value in enumerate(values):
        if position == 0:
            lines.append("%s: %s\n" % (label, value))
        else:
            # One-element tuple so a tuple-valued item cannot be mistaken
            # for the format-argument tuple itself.
            lines.append(" %s\n" % (value,))
    return "".join(lines)


def print_outf(id_name,incident,weapon_l,perp_indiv_l,perp_org_l,target_l,victim_l):
    """Write one answer template to the module-level output file ``f_out``.

    The template consists of the ID and INCIDENT lines, followed by the
    WEAPON, PERP INDIV, PERP ORG, TARGET and VICTIM slots (in that order)
    and a terminating blank line.

    Args:
        id_name: document ID string.
        incident: incident type string.
        weapon_l, perp_indiv_l, perp_org_l, target_l, victim_l: iterables of
            slot values; an empty or None value is written as "-".
    """
    slots = (
        ("WEAPON", weapon_l),
        ("PERP INDIV", perp_indiv_l),
        ("PERP ORG", perp_org_l),
        ("TARGET", target_l),
        ("VICTIM", victim_l),
    )
    f_out.write("ID: "+id_name+"\n")
    f_out.write("INCIDENT: "+incident+"\n")
    for label, values in slots:
        f_out.write(_format_slot(label, values))
    # blank line separates consecutive templates
    f_out.write("\n")
def _top_answer(ranked):
    """Return the first candidate of *ranked* as a one-element slot list.

    The candidate is read as ``ranked[0][0]``. An empty or None result is
    mapped to the '-' placeholder instead of raising IndexError/TypeError.
    NOTE(review): assumes the answer_stats getters return a sequence of
    sequences with the best candidate first — confirm against answer_stats.
    """
    if not ranked:
        return ['-']
    return [ranked[0][0]]


# the main function that processes each MUC text and produces the answer key
def process_input_text(file_text,id_name):
    """Build the answer template for one document and write it with print_outf.

    Args:
        file_text: raw text of one document (everything collected for its ID).
        id_name: the document ID written on the template's ID line.

    Returns:
        None. If preprocess.split_text yields an empty meta or main part, an
        error message is printed and nothing is written for this document.
    """
    (meta, body) = preprocess.split_text(file_text)
    if not meta or not body:
        # Single-argument call form: valid under both Python 2 and Python 3.
        print("ERROR IN SPLITTING MAIN AND META")
        return

    # remove the \n from in between the lines
    flat_text = re.sub(NEWLINE, " ", body)

    if DEBUG:
        print("processing text", body)
        print("")

    ### BEGIN EXPERIMENTAL ###
    # The incident predictor receives the body as split (newlines intact),
    # while the slot getters below receive the flattened text.
    incident_type = incident_predictor.get_predicted_event(body)
    # TODO NER CALL A FUNCTION THAT returns NER DICT
    d = answr_dict()
    weapon_l = _top_answer(get_weapon(flat_text, d))
    perp_org_l = _top_answer(get_perp_org(flat_text, d))
    # The earlier pattern-based extraction of PERP INDIV / TARGET / VICTIM is
    # disabled. It read the *_out_patterns_regex2 files with utility.f_read,
    # matched each sentence via pattern_extractor.get_victims / get_targets /
    # get_perpi, and cleaned the results with utility.remove_subsets,
    # remove_syn, rmv_flagged_np, first_word_flag, first_word_rmv,
    # one_word_cleaner and victim_hacks. Until it is restored these slots are
    # emitted as the '-' placeholder.
    p_new_list = ['-']
    t_new_list = ['-']
    v_new_list = ['-']
    ### END EXPERIMENTAL ###

    print_outf(id_name, incident_type, weapon_l, p_new_list, perp_org_l, t_new_list, v_new_list)
def _split_documents(lines, header_re):
    """Yield ``(id_name, text)`` for every document found in *lines*.

    A line matching *header_re* starts a new document whose ID is group(1);
    the header line itself is not part of any document's text. Lines seen
    before the first header are kept and become part of the first document's
    text. Nothing is yielded when no header line is present.
    """
    current_id = None
    chunks = []
    for line in lines:
        match = header_re.search(line)
        if match is None:
            # collect the line into the current document's text
            chunks.append(line)
            continue
        if current_id is not None:
            # a new header closes the previous document
            yield current_id, "".join(chunks)
            chunks = []
        current_id = match.group(1)  # group(0) is the whole match
    # captures last text
    if current_id is not None:
        yield current_id, "".join(chunks)


def process_file():
    """Split ``input_file`` into documents and process each one in order.

    Reads the module-level ``input_file`` line by line, uses PATTERN
    (case-insensitive) to find document header lines, and passes each
    document's text and ID to process_input_text. The file is closed even if
    processing raises. An input without any header line produces no output.
    """
    # compile the regex pattern
    header_re = re.compile(PATTERN, re.IGNORECASE)
    with open(input_file) as f:
        for id_name, text in _split_documents(f, header_re):
            process_input_text(text, id_name)
def main():
    """Process the input file, then close the shared template output file."""
    try:
        process_file()
    finally:
        # close the answer.templates file, even when processing fails,
        # so that whatever was already written is flushed to disk
        f_out.close()


if __name__ == "__main__":
    main()