dejean.py
#! /usr/bin/python
"""
Implement a couple of algorithms from the paper by Dejean,
"Morphemes as Necessary Concept for Structures Discovery
from Untagged Corpora".
"""
from trie import Trie #the trie class code
from counts import * #assumed to provide segment_word, used below
from stats import metrics, compute_stats
dash = '-'
STANDARD = {} #the gold standard segmentations
WORDS = {} #the words to be segmented
MORPHEMES = [] #list of all the segments found
TESTFILE = '/data/cs65/morphology/segments.eng'
AFFIXES = []
#counters for the statistics
CUTS = 0
CORRECT_CUTS = 0
TRUE_BOUNDARIES = 0
def get_segments():
    """
    Get all the morphemes as proposed by the successor counts
    algorithm and insert them into a global list of those
    morphemes
    """
    global MORPHEMES #make the global variable accessible
    for word in WORDS:
        segments = segment_word(word).split()
        length = len(segments)
        if length > 2:
            prefix = segments[0] + '-'
            suffix = '-' + segments[-1]
            MORPHEMES += [prefix, suffix]
            MORPHEMES += segments[1:length-1]
        elif length == 2:
            #mark the shorter of the two segments as the affix
            first, secnd = segments
            if len(first) < len(secnd):
                MORPHEMES += [first+'-', secnd]
            else:
                MORPHEMES += [first, '-'+secnd]
        else:
            MORPHEMES += segments
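#Illustration (hypothetical segment_word output): if
#segment_word('unfriendly') returned 'un friend ly', this adds 'un-',
#'-ly' and 'friend'; if it returned 'friend ly', only the shorter
#piece is marked as an affix, adding 'friend' and '-ly'.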
def prune(threshold):
    """
    Prune the global list of all morphemes using threshold as the
    'cutoff' value
    """
    global AFFIXES
    valid = []
    morphemes = set(MORPHEMES)
    for morph in morphemes:
        count = MORPHEMES.count(morph)
        if count > threshold:
            valid.append(morph)
            #print morph, count
    AFFIXES += valid
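#Illustration: with the threshold of 11 used in main below, a proposed
#affix such as '-ly' (hypothetical) is kept only if it was collected
#more than 11 times across all the words.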
def discover_new_morphemes(threshold=2):
    """
    After pruning, use the resulting morphemes to discover new
    morphemes and then extend the affix list with the original
    morphemes together with the 'discovered' ones.
    """
    global AFFIXES
    stems = []
    for word in WORDS:
        counts = 0
        stem = None
        prefix = 0 #did a prefix (rather than a suffix) match?
        for affix in AFFIXES:
            #a prefix ends with the dash, a suffix starts with it
            pre = 1 if (affix.endswith(dash) and len(affix.strip(dash)) > 0) else 0
            suf = 1 if (affix.startswith(dash) and len(affix.strip(dash)) > 0) else 0
            if pre and word.startswith(affix.strip(dash)):
                stem = word[len(affix.strip(dash)):]
                prefix = 1
                break
            elif suf and word.endswith(affix.strip(dash)):
                stem = word[:len(word)-len(affix.strip(dash))]
                break
        if stem is None: #nothing found
            continue
        #retry all the affixes against the candidate stem
        for affix in AFFIXES:
            if prefix and (affix.strip(dash) + stem) in WORDS:
                counts += 1
            elif (stem + affix.strip(dash)) in WORDS:
                counts += 1
        if counts > threshold:
            stems.append(stem) #this is an interesting stem
    #ok we now have all the relevant stems we want. Go through all the
    #words and pull out any affixes that are attached to the said stems
    for stem in stems:
        words = set(x for x in WORDS if stem in x) #unique words with this stem
        for word in words:
            tmp = word.split(stem) #all parts of the word other than the stem
            AFFIXES += tmp
    #a lot of empty strings and repeats need to be weeded out
    AFFIXES = [x for x in set(AFFIXES) if x]
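#Illustration (hypothetical words): if 'friend' survives as a stem
#because enough of 'friends', 'friendly' and 'unfriend' are in WORDS,
#splitting every word containing 'friend' on that stem adds pieces
#like 's', 'ly' and 'un' to AFFIXES.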
def trim_affixes(affixes):
    """
    Given a list of affixes, return the subset containing only the
    longest ones, discarding any affix whose stripped form occurs
    inside another affix
    """
    discard = []
    for affix in affixes:
        for entry in affixes:
            if affix.strip(dash) in entry and affix != entry:
                discard.append(affix)
    #remove the discarded entries
    return set(affixes) - set(discard)
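#Illustration: trim_affixes(['-s', '-es', 'un-']) discards '-s',
#since its stripped form 's' occurs inside '-es', and returns
#set(['-es', 'un-']).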
def set_diff(a, b):
    """
    Recover the stem shared by the two partial stems: return the
    prefix of a up to and including the first occurrence of the
    last character of b.
    """
    end = a.index(b[-1])
    return a[:end+1] #include the boundary character itself
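#Illustration: for 'unfriendly' with prefix 'un-' and suffix '-ly',
#set_diff('friendly', 'unfriend') returns the shared stem 'friend'.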
def splits(word, affixes):
    """
    Take a word and a list of affixes. Break up the word so that
    you have the affixes from the word together with anything
    else that remains unsegmented.
    """
    prefix = ""
    suffix = ""
    lstem = word
    rstem = word
    strlen = len(word)
    for affix in affixes:
        index = word.find(affix.strip(dash))
        if index < 0: #this affix does not occur in the word
            continue
        if index == 0 and affix.endswith(dash):
            prefix = affix.strip(dash)
            lstem = word[len(affix)-1:]
        elif index > strlen-5 and affix.startswith(dash): #near the end
            suffix = affix.strip(dash)
            rstem = word[:index]
    stem = word
    if prefix and suffix:
        stem = set_diff(lstem, rstem)
    elif prefix:
        stem = lstem
    elif suffix:
        stem = rstem
    return [prefix, stem, suffix]
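#Illustration: splits('unfriendly', ['un-', '-ly']) gives
#['un', 'friend', 'ly'], later joined as 'un friend ly'.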
def segment_words():
    """
    Using the final list of affixes, segment the words using a longest
    match approach, where you segment based on the longest occurring
    morpheme in the word.
    """
    global CUTS
    global CORRECT_CUTS
    global TRUE_BOUNDARIES
    #segment every word and tally the boundary counts
    for word in STANDARD:
        affixes = trim_affixes([x for x in AFFIXES if x.strip(dash) in word])
        segments = splits(word, affixes)
        #metrics is taken to return (correct cuts, proposed cuts,
        #true boundaries) for a proposed vs gold segmentation
        a, b, c = metrics(' '.join(segments), STANDARD[word])
        CUTS += b
        CORRECT_CUTS += a
        TRUE_BOUNDARIES += c
def main():
    #build the list of test words
    f = open(TESTFILE, 'r')
    lines = map(lambda line: line.strip('\n'), f.readlines())
    f.close()
    #build dictionary of words vs segments; each line is assumed to be
    #word<TAB>gold segmentation
    for line in lines:
        tmp = line.split('\t')
        STANDARD[tmp[0]] = tmp[1] #put the kv pairing
        WORDS[tmp[0]] = 0
    #print WORDS
    get_segments()
    prune(11)
    discover_new_morphemes()
    segment_words()
    print "Statistics for Dejean algorithm segmentation"
    print "********************************************"
    print compute_stats(CUTS, CORRECT_CUTS, TRUE_BOUNDARIES)
    print
if __name__ == '__main__':
    main()