/
WSD1.py
280 lines (254 loc) · 8.67 KB
/
WSD1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
'''
Dictionary based WSD
Author: Herat Gandhi, Jyoti Pandey, Saikiran, Vinayaka Dattatraya
'''
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
import string
import re
# Read the dictionary file into memory once, at import time, so that every
# gloss lookup reuses the same text instead of loading the file again.
# The context manager closes the handle as soon as the text has been read.
with open('Dictionary.xml') as _dict_handle:
    dict_file = _dict_handle.read()
def find_gloss_from_file(str):
    '''
    Find the glosses recorded for a word in the in-memory dictionary XML.

    The word is matched, ignoring case, against <lexelt item="WORD.x">
    entries where x is a single letter; only the first matching entry is used.
    @param str: word for which we want to find senses (the name shadows the
                builtin but is kept so existing callers are unaffected)
    @return: list with the text of every gloss="..." attribute in the first
             matching entry, or -1 if the word has no entry in the XML
    '''
    # Escape the word and the dot so both are matched literally. Left
    # unescaped, regex metacharacters in the word could raise re.error or
    # make the pattern match an unrelated entry.
    entry_pattern = re.compile(
        '<lexelt item="' + re.escape(str) + r'\.[a-z]">(.*?)</lexelt>',
        re.DOTALL | re.IGNORECASE)
    entries = entry_pattern.findall(dict_file)
    if not entries:
        return -1
    # No DOTALL here: '.' stops at a newline, so each gloss is captured from
    # a single line, up to the last double quote on that line.
    return re.compile('gloss="(.*)"').findall(entries[0])
def remove_junk(string1,lines1):
    '''
    Remove junk content, that is digits, punctuation, stopwords and ignored
    words, from a string.
    @param string1: string from which we want to remove junk
    @param lines1: collection of words that must be dropped in addition to
                   the English stopwords
    @return: the remaining words joined by single spaces
    '''
    # Drop every run of digits.
    string1 = re.sub('[0-9]+', '', string1)
    # Drop every ASCII punctuation character.
    for punct in string.punctuation:
        string1 = string1.replace(punct, '')
    # Build both exclusion sets once: looking the stopword list up again for
    # every token repeats the same corpus read, and set membership is
    # constant time where list membership is linear.
    english_stopwords = set(stopwords.words('english'))
    ignored_words = set(lines1)
    kept_words = [word for word in string1.split()
                  if word not in english_stopwords
                  and word not in ignored_words]
    return ' '.join(kept_words)
def find_word_sequences(s1):
    '''
    Find every contiguous word sequence of a string.

    The string is split on whitespace. Sequences are listed by increasing
    length (1 word, 2 words, ... all words) and, within one length, from left
    to right. Words inside a sequence are joined by a single space.
    @param s1: String for which we want to find the word sequences
    @return List: List of all contiguous word sequences of the input string;
                  empty when the string contains no words
    '''
    words = s1.split()
    word_count = len(words)
    return [' '.join(words[start:start + length])
            for length in range(1, word_count + 1)
            for start in range(word_count - length + 1)]
def calculate_overall_score(s1,s2):
    '''
    Compute the weighted overlap between two strings.

    Every contiguous word sequence of s1 that occurs inside s2 contributes
    the square of its word count, so longer shared phrases weigh more.
    @param s1: String whose word sequences are looked for
    @param s2: String in which the sequences are searched
    @return total: Integer representing weighted overlap score
    '''
    # NOTE(review): "in" is a plain substring test on s2, so a sequence also
    # counts when it only matches part of a longer word - confirm intended.
    return sum((phrase.count(' ') + 1) ** 2
               for phrase in find_word_sequences(s1)
               if phrase in s2)
def get_bag_of_senses(temp_words1):
    '''
    Get bag of senses for a string of context words.

    Each word is POS tagged; words tagged as verbs are lemmatized as verbs
    before their WordNet synsets are looked up.
    @param temp_words1: whitespace separated words for which we want to find
                        the bag of senses
    @return definitions: one space joined string holding, in this order, the
                         definitions of all synsets found, the names of their
                         hypernyms and the names of their hyponyms
    '''
    def synset_text(value):
        # NLTK 3 turned Synset.definition and Synset.name into methods, while
        # older releases expose them as plain strings; accept both.
        return value() if callable(value) else value

    lemmatizer = WordNetLemmatizer()
    synsets = []
    # Find the synsets of every tagged word
    for token, tag in nltk.pos_tag(temp_words1.split()):
        try:
            if 'VB' in tag:
                token = lemmatizer.lemmatize(token, 'v')
            synsets.extend(wordnet.synsets(token))
        except Exception:
            # Best effort: a word that WordNet cannot handle contributes
            # nothing, but KeyboardInterrupt/SystemExit are no longer hidden.
            continue
    texts = [synset_text(synset.definition) for synset in synsets]
    # Hypernym names come next, then hyponym names
    texts.extend(synset_text(related.name)
                 for synset in synsets for related in synset.hypernyms())
    texts.extend(synset_text(related.name)
                 for synset in synsets for related in synset.hyponyms())
    return ' '.join(texts)
def get_minibag_of_senses(temp_words1):
    '''
    Get the names of the WordNet synsets of the words in a string.

    Each word is POS tagged; words tagged as verbs are lemmatized as verbs
    before their synsets are looked up.
    @param temp_words1: whitespace separated words, e.g. a cleaned gloss of
                        the target word
    @return string: the names of all synsets found, joined by single spaces
    '''
    def synset_text(value):
        # NLTK 3 turned Synset.name into a method, while older releases
        # expose it as a plain string; accept both.
        return value() if callable(value) else value

    lemmatizer = WordNetLemmatizer()
    names = []
    # Find the synsets of every tagged word and keep their names
    for token, tag in nltk.pos_tag(temp_words1.split()):
        try:
            if 'VB' in tag:
                token = lemmatizer.lemmatize(token, 'v')
            found = wordnet.synsets(token)
        except Exception:
            # Best effort: a word that WordNet cannot handle contributes
            # nothing, but KeyboardInterrupt/SystemExit are no longer hidden.
            continue
        names.extend(synset_text(synset.name) for synset in found)
    return ' '.join(names)
def stem_funct(str):
    '''
    Stem every word of a string with NLTK's Lancaster stemmer.

    The input is split on single spaces and each stem is prefixed with a
    space, so the result always starts with a space.
    @param str: String which we want to stem
    @return string: String with stemmed words
    '''
    stemmer = LancasterStemmer()
    return ''.join(' ' + stemmer.stem(token) for token in str.split(' '))
def get_sense_index(line,n,lines1,target):
    '''
    Choose the dictionary sense of the target word that overlaps best with
    the context found in a sample.

    The text before the first '@' and the text after the last '@' of the
    sample are used as context. Each gloss of the target is scored by its
    weighted overlap with the context senses, divided by the length of the
    stemmed gloss string.
    @param line: Sample in which '@' characters delimit the target word
    @param n: unused; kept so that existing callers keep working
    @param lines1: words to ignore when cleaning the context and the glosses
    @param target: word whose sense we want to find
    @return: list [best, count] where best is the 1-based index of the best
             scoring gloss (the first one wins ties) or -1 when no gloss is
             available, and count is the number of glosses of the target
    '''
    # Parse the dictionary once and reuse the result.
    glosses = find_gloss_from_file(target)
    if glosses == -1:
        # The target has no dictionary entry: report "no sense" instead of
        # failing while trying to iterate over -1.
        return [-1, 0]
    # Lower-case the sample and use regex to remove numbers
    lowered = re.sub('[0-9]+', '', line.lower())
    # Get the context on both sides of the target word
    left_context = lowered[:lowered.find('@')]
    right_context = lowered[lowered.rfind('@') + 1:]
    # Get senses for the cleaned context words
    definitions = (' ' + get_bag_of_senses(remove_junk(left_context, lines1)) +
                   ' ' + get_bag_of_senses(remove_junk(right_context, lines1)))
    # The stemmed context is the same for every gloss, so stem it only once.
    stemmed_context = stem_funct(definitions)
    best_index = -1
    best_score = -1
    # For all the senses find the best scoring one
    for index, gloss in enumerate(glosses, 1):
        gloss_bag = get_minibag_of_senses(remove_junk(gloss.lower(), lines1))
        stemmed_gloss = stem_funct(remove_junk(gloss_bag, lines1))
        # Normalise by the gloss length so long glosses are not favoured
        score = (calculate_overall_score(stemmed_gloss, stemmed_context)
                 * 1.00 / len(stemmed_gloss))
        if score > best_score:
            best_score = score
            best_index = index
    # Return the chosen sense index and the number of senses
    return [best_index, len(glosses)]
def WSD_Dict(filename):
    '''
    Function to perform WSD based on dictionaries.

    For every non-blank line of the input, the first whitespace separated
    token before the first '@' gives the target word (its part before the
    first '.'), and the text after that '@' is the sample to disambiguate.
    Words listed in words.txt, one per line, are ignored during scoring.
    One block is written per sample: a flag line, '1' when no sense could be
    chosen and '0' otherwise, followed by one line per dictionary sense that
    is '1' for the chosen sense and '0' for the others.
    @param filename: Filename to be used for testing
    Writes output to output.txt
    '''
    # Context managers close every file, which also guarantees that the
    # output is flushed to disk.
    with open(filename, 'r') as samples_file:
        lines = samples_file.readlines()
    with open('words.txt', 'r') as ignore_file:
        lines1 = ignore_file.read().splitlines()
    with open('output.txt', 'w') as output_file:
        for line in lines:
            if not line.strip():
                # A blank line (e.g. at the end of the file) has no target.
                continue
            # The first '@' separates the header from the sample text
            at_p = line.find('@')
            target = line[:at_p].split()[0].split('.')[0]
            best_sense, sense_count = get_sense_index(
                line[at_p + 1:], 10, lines1, target)
            flags = ['1' if best_sense == -1 else '0']
            flags.extend('1' if ind == best_sense else '0'
                         for ind in range(1, sense_count + 1))
            # Write the binary block to the file
            output_file.write('\n'.join(flags) + '\n')
def main():
    '''
    Ask for the name of the test file and run the dictionary based WSD on it.
    '''
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: input() already returns the raw text
    filename = read_line('Enter file name to test: ')
    WSD_Dict(filename)


# Run only when executed as a script, so that importing this module does not
# prompt for input.
if __name__ == '__main__':
    main()