forked from helloram52/automated-event-extraction
/
Main.py
203 lines (166 loc) · 7.45 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import nltk, sys, re
from nltk.corpus import wordnet
from enchant.checker import SpellChecker
from autocorrect import spell
import timex, Utilities
from Event import Event
from nltk.tag import StanfordNERTagger
from Features import Features
# Event categories the pipeline tries to detect in free text.
KEYWORDS = ['marriage', 'birthday', 'meeting', 'anniversary', 'seminar']
# keyword -> list of WordNet synonyms; populated by setupKeywords() during initialize().
SYNONYMS_FOR_KEYWORDS = {}
# Penn Treebank POS tags treated as past tense (simple past, past participle).
PAST_TENSE_TAGS = ['VBD','VBN']
# Closing marker emitted by the timex tagger; used to select temporally-tagged lines.
TIMEX_TAG = "</TIMEX2>"
#STANFORD_NER_ROOT = "/Users/vads/Downloads/stanford-ner-2014-06-16/"
# NOTE(review): machine-specific absolute path — consider an env var or config file.
STANFORD_NER_ROOT = "/home/ram/Downloads/stanford-ner-2014-06-16/"
STANFORD_NER_PATH = STANFORD_NER_ROOT + 'stanford-ner.jar'
# Output rows accumulated by the main script; one row per detected upcoming event.
RESULT = []
# Column headers matching the rows appended to RESULT.
RESULT_HEADER = ["Event", "When", "Where", "Original Text", "Lexical-Tokens", "Lexical-SpellCorrection", "Syntactic-POS tags", "Syntactic-Temporal tag", "Semantic-Synonym", "Semantic-Location" ]
# Matches a grounded <TIMEX2 ...>...</TIMEX2> span in tagger output.
TIMEX_TAG_REGEX = r'<TIMEX2 .+>.+?</TIMEX2>'
def initialize():
    """One-time startup: build the keyword->synonyms map and set up logging.

    Order matters: setupKeywords() must run before the manual 'lecture'
    extension of the 'seminar' synonym list below.
    """
    setupKeywords()
    # 'lecture' is not a WordNet synonym of 'seminar'; add it by hand.
    SYNONYMS_FOR_KEYWORDS['seminar'].append('lecture')
    Utilities.setupLog()
def performSpellCorrection(featureObj):
    """Spell-correct the object's raw text and store the corrected string.

    Runs an enchant SpellChecker over the text, replacing each flagged word
    with the autocorrect suggestion, then records the corrected text on the
    object's lexical features. Returns the same object for chaining.
    """
    corrector = SpellChecker("en_US", featureObj.getText())
    # iterating positions the checker at each misspelled word in turn
    for err in corrector:
        err.replace(spell(err.word))
    featureObj.getLexicalFeatures().setSpellCorrection(corrector.get_text())
    return featureObj
def getSynonyms(word):
    """Return the unique WordNet lemma names for *word*.

    Underscores in multi-word lemmas are replaced with spaces so the
    synonyms can be matched against plain text.
    """
    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            found.add(lemma.name().replace("_", " "))
    return list(found)
def setupKeywords():
    """Fill SYNONYMS_FOR_KEYWORDS with the WordNet synonyms of every keyword."""
    global SYNONYMS_FOR_KEYWORDS
    SYNONYMS_FOR_KEYWORDS.update(
        (keyword, getSynonyms(keyword)) for keyword in KEYWORDS)
def isRequiredEvent(obj, dict):
    """Check whether any keyword synonym occurs in the object's text.

    On a match, records the full synonym list on the object's semantic
    features and returns (True, keyword); otherwise returns (False, "").
    NOTE(review): the parameter shadows the builtin `dict`; kept for
    interface compatibility.
    """
    text = obj.getText().lower()
    for keyword, synonyms in dict.items():
        hit = next((s for s in synonyms if s in text), None)
        if hit is not None:
            obj.getSemanticFeatures().setSynonym(str(synonyms))
            return True, keyword
    return False, ""
def getCommandLineArgs():
    """Return (inputFileName, outputFileName) from the command line.

    Exits with status 1 and a usage message when either argument is missing.
    Bug fix: the original checked len(sys.argv) < 2 but read sys.argv[2],
    so a single-argument invocation crashed with IndexError instead of
    printing the usage message.
    """
    if len(sys.argv) < 3:
        print("ERROR: Usage: Main.py <input> <output>")
        exit(1)
    return sys.argv[1], sys.argv[2]
def preProcessData(input):
    """Parse the input file and spell-correct every parsed feature object.

    input: path to the input file.
    Bug fix: the original ignored the parameter and read the module-level
    global `inputFileName` (set only in __main__), so the function broke
    when called from anywhere else. It now uses its own argument.
    NOTE(review): parameter name shadows the builtin `input`; kept for
    interface compatibility.
    """
    # read and parse the input file
    inputObjects = Utilities.parseInputFile(input)
    # perform spell correction on each parsed object
    return [performSpellCorrection(obj) for obj in inputObjects]
def performTagging(featureObjects):
    """Run TIMEX temporal tagging over each object's spell-corrected text.

    Objects whose text yields a temporal expression get their syntactic
    TemporalTag set to the first <TIMEX2 ...> match; every object is
    returned regardless of tagging success (filtering happens later).
    Bug fix: the bare `except:` also swallowed SystemExit and
    KeyboardInterrupt; narrowed to `except Exception` while keeping the
    deliberate best-effort fallback to an empty tag.
    """
    taggedLines = []
    for obj in featureObjects:
        taggedLine = ""
        try:
            taggedLine = timex.tag(obj.getLexicalFeatures().getSpellCorrection().lower())
            taggedLine = timex.ground(taggedLine, timex.gmt())
        except Exception:
            taggedLine = ""
        if not Utilities.isEmpty(taggedLine):
            obj.getSyntacticFeatures().setTemporalTag(Utilities.firstMatching(TIMEX_TAG_REGEX, taggedLine))
        taggedLines.append(obj)
    return taggedLines
def isEventPast(obj):
    """Return True when the object's text contains a past-tense verb.

    Tokenizes the lowercased text, stores the raw tokens and POS tags on
    the object's feature containers, and scans the tags for VBD/VBN.
    """
    rawTokens = Utilities.split(obj.getText().lower(), " ")
    obj.getLexicalFeatures().setTokens(rawTokens)
    # drop empty/dummy tokens before POS tagging
    cleanTokens = [t for t in rawTokens if not Utilities.isEmpty(t)]
    taggedWords = nltk.pos_tag(cleanTokens)
    obj.getSyntacticFeatures().setPOSTags(taggedWords)
    return any(tag in PAST_TENSE_TAGS for _, tag in taggedWords)
def parseLocation(obj):
    """Extract named-entity tokens from the spell-corrected text via Stanford NER.

    Strips TIMEX2 markup, tags the remaining tokens, and concatenates every
    token whose NER label is not 'O' (each prefixed with a space). The
    resulting string is stored on the object's semantic features and
    returned; any tagging failure yields "".
    Bug fix: the bare `except:` also swallowed SystemExit and
    KeyboardInterrupt; narrowed to `except Exception` while keeping the
    deliberate best-effort empty result.
    """
    # remove TIMEX markup before NER tagging
    event = re.sub("<TIMEX2>|</TIMEX2>", "", obj.getLexicalFeatures().getSpellCorrection())
    entities = []
    try:
        nerTagger = StanfordNERTagger(
            STANFORD_NER_ROOT + '/classifiers/english.muc.7class.distsim.crf.ser.gz',
            STANFORD_NER_PATH)
        entities = nerTagger.tag(event.split())
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
    result = ""
    for token, label in entities:
        if label != 'O':
            result += " {}".format(token)
    obj.getSemanticFeatures().setLocation(result)
    return result
def setupEvent(obj, eventType):
    """Build an Event of *eventType* from the object's temporal tag and NER output."""
    when = Utilities.parseDate(obj.getSyntacticFeatures().getTemporalTag())
    where = parseLocation(obj)
    return Event(eventType, when, where)
def buildResultRow(obj):
    """Flatten an event-bearing feature object into one output row.

    Column order matches RESULT_HEADER. Extracted here because the original
    script duplicated this 10-field list literal in two branches.
    """
    return [obj.getEvent().type,
            obj.getEvent().date,
            obj.getEvent().location,
            obj.getText(),
            str(obj.getLexicalFeatures().getTokens()),
            obj.getLexicalFeatures().getSpellCorrection(),
            str(obj.getSyntacticFeatures().getPOSTags()),
            obj.getSyntacticFeatures().getTemporalTag(),
            obj.getSemanticFeatures().getSynonym(),
            obj.getSemanticFeatures().getLocation()]

if __name__ == '__main__':
    # one-time setup: keyword synonyms and logging
    initialize()
    # read command line parameters
    inputFileName, outputFileName = getCommandLineArgs()
    # preprocess input data (parse + spell correction)
    featureObjects = preProcessData(inputFileName)
    # perform temporal expression tagging
    taggedLines = performTagging(featureObjects)
    # keep only lines that carry a <TIMEX2> temporal tag
    eventsList = Utilities.filter(taggedLines, TIMEX_TAG)
    # for each candidate event, check whether any word matches a keyword synonym
    for obj in eventsList:
        isRequired, eventType = isRequiredEvent(obj, SYNONYMS_FOR_KEYWORDS)
        if not isRequired:
            Utilities.writeLog("INFO [IMPROVED APPROACH]: Event Detected but event type did not match with required events :" + obj.getText())
            continue
        obj.setEvent(setupEvent(obj, eventType))
        if not isEventPast(obj):
            # NOTE(review): precision is computed before the predict flag is
            # set here, but after it in the future-date branch below — the
            # original ordering is preserved; confirm which is intended.
            Utilities.computePrecision(obj)
            obj.setPredict("yes")
            RESULT.append(buildResultRow(obj))
        elif Utilities.isDateInFuture(obj.getSyntacticFeatures().getTemporalTag()):
            # past-tense wording but a future date: still report it
            obj.setPredict("yes")
            Utilities.computePrecision(obj)
            RESULT.append(buildResultRow(obj))
        else:
            Utilities.writeLog("INFO [IMPROVED APPROACH]: Event Detected but is identified as past event :" + obj.getText())
    # write header, then one row per detected event
    Utilities.writeOutput(outputFileName, RESULT_HEADER)
    for row in RESULT:
        Utilities.writeOutput(outputFileName, row)
    Utilities.computeRecall(featureObjects)
    Utilities.printMetrics()