/
dic_preprocessing.py
142 lines (128 loc) · 4.73 KB
/
dic_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
import xml.etree.ElementTree as ET
######################################################
# global variables
######################################################
# Base directory for all assignment #2 data files.
glob_path = "/profession/[14Spring]-Natural Language Processing/assignment2/"
# Sense dictionary XML consumed by parse_dictionary().
glob_dict_path = "/profession/[14Spring]-Natural Language Processing/assignment2/dictionary-modified.xml"
# glob_dict_path = "/profession/[14Spring]-Natural Language Processing/assignment2/small_dictionary.xml"
# '|'-separated validation instances consumed by parse_validation().
glob_valid_path = "/profession/[14Spring]-Natural Language Processing/assignment2/validation_data.data"
# glob_valid_path = "/profession/[14Spring]-Natural Language Processing/assignment2/small_validation_data.data"
# Lucene's default English stop-word list; stemmed tokens found in it are
# dropped during preprocessing.  Kept as a list (not a set) so any external
# code that indexes or slices it keeps working; only membership tests are
# performed on it within this module.
glob_Lucene = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]
######################################################
# parse methods
######################################################
"""
dictionary [
"begin.v":{
num:1
senses:[
{
id:1
wn_ids:[1,2,3,4]
gloss:["hello","world"]
examples:[
["hello","world"],
["how","are","you"]
]
},
...
]
},
...
]
"""
def parse_dictionary(dict_path):
    """Parse the sense-dictionary XML at *dict_path* into a nested dict.

    Returns a mapping from lexeme item (e.g. "begin.v") to a dict with
    keys "num" (declared sense count) and "senses" (list of sense dicts,
    each holding "id", "wn_ids", "gloss" and "examples").  Glosses and
    examples are preprocessed via lemma_stem_sentence().
    """
    root = ET.parse(dict_path).getroot()
    dictionary = {}
    for lexeme in root:
        sense_list = []
        # ATTENTION: in dictionary.xml the key is word.type; the "item"
        # attribute is stored verbatim here.
        dictionary[lexeme.attrib["item"]] = {
            "num": int(lexeme.attrib["num"]),
            "senses": sense_list,
        }
        for sense in lexeme:
            sid = sense.attrib["id"]
            wn_ids = [int(tok) for tok in sense.attrib["wordnet"].split(",") if tok != '']
            entry = {
                "id": sid,
                "wn_ids": wn_ids,
                "gloss": lemma_stem_sentence(sense.attrib["gloss"]),
                "examples": [lemma_stem_sentence(s)
                             for s in sense.attrib["examples"].split("|")],
            }
            if "#" in sid:
                # A compound id like "1#2" does not become a sense of its own:
                # its examples are appended to each previously-seen sense it
                # names (its gloss and wordnet ids are intentionally dropped).
                shared_ids = sid.split("#")
                for prior in sense_list:
                    if prior["id"] in shared_ids:
                        prior["examples"] += entry["examples"]
            else:
                sense_list.append(entry)
    return dictionary
######################################################
"""
validation_list [
{
word:"begin.v"
real_sense:1
target_word_idx:15
sentence:['pressure', 'hello']
},
...
]
"""
def parse_validation(validation_path):
    """Parse a '|'-separated validation file into a list of instance dicts.

    Each line has the form  word|sense|sentence .  The sentence is
    tokenized, WordNet-lemmatized, Lancaster-stemmed, and stop words
    (glob_Lucene) are removed.  The literal token "%%" marks the target
    word; the FIRST marker records "target_word_idx" = word_count + 1,
    where word_count counts every non-"%%" token seen so far (stop words
    included).  NOTE(review): that index therefore may not line up with
    the filtered "sentence" list when stop words precede the target —
    preserved as-is; confirm against the downstream consumer.

    Returns a list of dicts with keys "word", "real_sense", "sentence"
    and (when a marker is present) "target_word_idx".
    """
    # Hoisted out of the per-line loop: constructing the lemmatizer/stemmer
    # and compiling the token regex are loop-invariant.
    lmtzr = WordNetLemmatizer()
    ls = LancasterStemmer()
    token_re = re.compile(r"(\w+|%%)")  # raw string: \w is a regex escape
    validation_list = []
    with open(validation_path) as f:
        for line in f:
            strs = line.split('|')
            word_dict = {}
            validation_list.append(word_dict)
            word_dict["word"] = strs[0].strip()
            word_dict["real_sense"] = int(strs[1])
            sentence_list = []
            word_dict["sentence"] = sentence_list
            double_mod_found = False
            word_count = 0
            for single_word in token_re.findall(strs[2]):
                if single_word == "%%":
                    if not double_mod_found:
                        word_dict["target_word_idx"] = word_count + 1
                        double_mod_found = True
                    continue
                stemmed = ls.stem(lmtzr.lemmatize(single_word))
                if stemmed not in glob_Lucene:
                    sentence_list.append(stemmed)
                word_count += 1
    return validation_list
def lemma_stem_sentence(sentence):
    """Tokenize *sentence*, lemmatize then Lancaster-stem each token, and
    drop stop words.

    Returns the list of stemmed tokens not found in glob_Lucene, in
    original order.
    """
    lmtzr = WordNetLemmatizer()
    ls = LancasterStemmer()
    word_list = []
    # Raw string: "\w" is a regex escape, not a Python string escape
    # (a plain "\w+" raises SyntaxWarning on modern CPython).
    for single_word in re.findall(r"\w+", sentence):
        stemmed = ls.stem(lmtzr.lemmatize(single_word))
        if stemmed not in glob_Lucene:
            word_list.append(stemmed)
    return word_list
if __name__ == '__main__':
    # Smoke-test the preprocessing entry points against the configured
    # data paths (dictionary parsing left commented out).
    # parse_dictionary(glob_dict_path)
    parse_validation(glob_valid_path)