-
Notifications
You must be signed in to change notification settings - Fork 1
/
infoextract.py
152 lines (125 loc) · 3.62 KB
/
infoextract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import sys
import re
import incident_predictor
import preprocess
from answer_stats import answr_dict, get_weapon, get_perp_indiv, get_perp_org, get_target, get_victim
from nltk import sent_tokenize, word_tokenize, pos_tag, RegexpParser, ne_chunk
# Set to True to echo the text of every story while it is processed.
DEBUG = False

# Story header id, e.g. "DEV-MUC3-0001"; group 1 spans the whole id.
# Raw strings keep the regex escapes (\-, \d, \s) from being read as
# (invalid) Python string escapes; the compiled patterns are unchanged.
PATTERN = r"((DEV|TST1|TST2)\-MUC\d\-\d{4})"
EMPTY_LINE = r"\s*\n\s*$"  # checks if a line is empty
NEWLINE = r"\n(?=.)"  # selects a new line if it occurs before some char

# Answer-key cache (story id -> WEAPON value); populated by make_key().
KEY = None

# The input file is a required command-line argument.
if len(sys.argv) == 1:
    print("please enter an input file ")
    # Non-zero status so calling scripts can detect the usage error.
    sys.exit(1)

input_file = sys.argv[1]
output_file = input_file + ".templates"

# opening the output file for writing (closed again in main())
f_out = open(output_file, 'w')
# TODO (from the original note): this should also handle a slot whose value
# spans multiple lines.
def print_out(id_name, incident, weapon, perp_indiv, perp_org, target, victim):
    """Append one answer template to the module-level output file ``f_out``.

    Every slot is written on its own line as ``LABEL: value``, in the fixed
    order ID, INCIDENT, WEAPON, PERP INDIV, PERP ORG, TARGET, VICTIM, and the
    template is terminated by an empty line.

    All arguments must be strings: they are concatenated onto their labels.
    """
    slots = (
        ("ID: ", id_name),
        ("INCIDENT: ", incident),
        ("WEAPON: ", weapon),
        ("PERP INDIV: ", perp_indiv),
        ("PERP ORG: ", perp_org),
        ("TARGET: ", target),
        ("VICTIM: ", victim),
    )
    for label, value in slots:
        f_out.write(label + value + "\n")
    # blank separator line between consecutive templates
    f_out.write("\n")
def make_key(path='developset/test_set/answerkeys/AGGREGATE'):
    """Load the WEAPON answers from an answer-key file into the global KEY.

    The file is read line by line.  An ``ID: <id>`` line selects the current
    story id and a ``WEAPON: <value>`` line stores ``<value>`` under that id
    (a later WEAPON line for the same id overwrites the earlier one).  All
    other lines, including lines without a colon, are ignored.

    Args:
        path: answer-key file to read; defaults to the aggregate key of the
            development test set.

    Returns:
        The dict mapping story id to WEAPON value, which is also stored in
        the module-level ``KEY``.

    Raises:
        IOError/OSError: if ``path`` cannot be opened.
    """
    global KEY
    key = {}
    lastid = ''
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(path) as f:
        for line in f:
            # Split on the first colon only, so a value that itself contains
            # a colon is kept whole.
            field, sep, value = line.partition(':')
            if not sep:
                continue
            if field == 'ID':
                lastid = value.strip()
            elif field == 'WEAPON':
                key[lastid] = value.strip()
    KEY = key
    return key
# the main function that processes each MUC text and produces the answer key
def process_input_text(file_text, id_name):
    """Extract the template slots for one story and write them via print_out.

    The raw story is split into a meta part and a main part with
    ``preprocess.split_text``.  Newlines inside the main part are replaced by
    spaces before weapon extraction; the incident type is predicted from the
    unmodified main part.  The weapon candidates, the story id and a
    comparison of the answer-key weapon ("C") with the chosen weapon ("D")
    are printed to stdout as diagnostics.

    Args:
        file_text: raw text of one story (without its id header line).
        id_name: story id used for the ID slot and the answer-key lookup.

    Returns:
        None.  If the text cannot be split into meta and main parts an error
        is printed and no template is written for this story.
    """
    global KEY
    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print("ERROR IN SPLITTING MAIN AND META")
        return

    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        print("processing text %s" % main)
        print("")

    d = answr_dict()
    if not KEY:
        # answer key is loaded on first use
        make_key()

    # NOTE: an NP-chunking pass (RegexpParser over POS-tagged sentences) was
    # experimented with here but is currently disabled.

    weapons = get_weapon(file_text, d)
    print(weapons)
    # Assumes get_weapon returns a sequence whose first element's first item
    # is the preferred weapon string - TODO confirm.  Fall back to the empty
    # slot marker '-' instead of crashing when nothing was found.
    weapon = weapons[0][0] if weapons else '-'
    print(id_name)
    # .get(): a story missing from the answer key must not abort extraction
    # just because of this diagnostic line.
    print("C %s \nD %s" % (KEY.get(id_name), weapon))
    print("")

    # TODO: PERP INDIV, PERP ORG, TARGET and VICTIM extraction
    # (get_perp_indiv, get_perp_org, get_target, get_victim) is not wired in
    # yet; these slots are emitted as '-'.
    perpindiv = '-'
    perporg = '-'
    target = '-'
    victim = '-'

    incident_type = incident_predictor.get_predicted_event(main)
    print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
def process_file():
    """Split the input file into stories and process each one.

    A line containing a story id (``PATTERN``, matched case-insensitively)
    starts a new story; the header line itself is not part of the story
    text.  All following lines up to the next header are collected and
    handed to ``process_input_text`` together with the id.

    Lines that appear before the first header belong to no story and are
    discarded, and an input without any header produces no calls at all.
    """
    # compile the regex pattern once for the whole file
    compiled_pattern = re.compile(PATTERN, re.IGNORECASE)

    current_id = None   # id of the story being collected, None before the first header
    story_lines = []    # lines of the story being collected

    # ``with`` closes the input file even if processing a story raises.
    with open(input_file) as f:
        for line in f:
            m = compiled_pattern.search(line)
            if m:
                if current_id is not None:
                    # a new header ends the previous story: process it
                    process_input_text("".join(story_lines), current_id)
                # group(1) is the full id (the outer group wraps the whole pattern)
                current_id = m.group(1)
                story_lines = []
            else:
                # collect the line into the current story
                story_lines.append(line)

    # captures the last story, which has no following header to trigger it
    if current_id is not None:
        process_input_text("".join(story_lines), current_id)
def main():
    """Process the input file and close the output file ``f_out``.

    The output file is closed even when processing fails, so templates that
    were already written are flushed to disk.
    """
    try:
        process_file()
    finally:
        f_out.close()


if __name__ == "__main__":
    main()