forked from maslinych/daba
-
Notifications
You must be signed in to change notification settings - Fork 0
/
newmorph.py
275 lines (251 loc) · 10.1 KB
/
newmorph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import operator
from ntgloss import Gloss, CompactGloss, emptyGloss, Pattern, Dictionary
from orthography import detone
def nullgloss(word):
    """str -> Gloss

    Wrap a bare word form in a Gloss whose three remaining positional
    fields are left empty: an empty tuple, an empty string and another
    empty tuple.
    """
    fields = (word, (), '', ())
    return Gloss(*fields)
def parse_composite(form, gdict, numparts):
    """Str, Dictionary, Int -> [[Str]]

    Enumerate the ways of cutting *form* into exactly *numparts*
    consecutive pieces, where every piece is reported by
    ``gdict.iter_prefixes`` as a prefix of the text that is still to be
    consumed.  Candidate prefixes are tried in the reverse of the order
    in which ``iter_prefixes`` yields them.

    Returns a list of segmentations (each a list of strings); the list
    is empty when no complete segmentation exists.
    """
    def split(rest, left, taken):
        # No pieces left to cut: success only if the form is used up.
        if not left:
            return [] if rest else taken

        candidates = list(gdict.iter_prefixes(rest))
        candidates.reverse()

        found = []
        for head in candidates:
            tail = split(rest[len(head):], left - 1, taken + [head])
            if not tail:
                continue
            if isinstance(tail[0], list):
                # deeper levels already returned a list of segmentations
                found.extend(tail)
            else:
                # a single finished segmentation (flat list of strings)
                found.append(tail)
        return found

    return [seg for seg in split(form, numparts, []) if seg]
def unfold(l):
    """[[a]] -> [a]

    Flatten one level of nesting: concatenate the items of every inner
    iterable of *l* into a single new list, preserving order.
    """
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat


def unknown(g):
    """Gloss -> bool

    True when the ``gloss`` attribute of *g* is empty (falsy).
    """
    return not g.gloss
def parsed(g):
    """Gloss -> bool

    Tell whether *g* counts as fully analysed.  When *g* has morphemes,
    every one of them must carry a non-empty ``gloss``; the gloss of *g*
    itself is not consulted in that case.  Without morphemes the answer
    is simply whether ``g.gloss`` is non-empty.
    """
    if not g.morphemes:
        return bool(g.gloss)
    return all(bool(m.gloss) for m in g.morphemes)
def f_add(func, *args):
    """(Gloss -> Maybe([Gloss])) -> ([Gloss] -> [Gloss])

    Build a stage that keeps the incoming parses and appends whatever
    new parses the step function produces for each of them.

    When extra *args* are given, *func* is treated as a factory and the
    step function is ``func(*args)``; otherwise *func* itself is the
    step function.  Empty or None results of the step are skipped.
    """
    step = func(*args) if args else func

    def extend(parses):
        # Run the step on every parse first, then flatten the non-empty
        # results behind the original parses.
        produced = [step(p) for p in parses]
        extra = []
        for outcome in produced:
            if outcome:
                extra.extend(outcome)
        return parses + extra

    return extend
def f_apply(func, *args):
    """(Gloss -> Maybe([Gloss])) -> ([Gloss] -> [Gloss])

    Build a stage that replaces each incoming parse by the results of
    the step function, keeping the parse itself whenever the step
    returns nothing (None or an empty sequence).

    When extra *args* are given, *func* is treated as a factory and the
    step function is ``func(*args)``; otherwise *func* itself is used.
    """
    step = func(*args) if args else func

    def replace(parses):
        outcomes = [step(p) or [p] for p in parses]
        merged = []
        for outcome in outcomes:
            merged.extend(outcome)
        return merged

    return replace
#def f_filter(func, *args):
def parallel(func, patterns):
    """(Pattern, Gloss -> Maybe([Gloss])) -> (Gloss -> [Gloss])

    Build a function that applies *func* to the same gloss once per
    pattern (called as ``func(pattern, gloss)``) and concatenates all
    non-empty results, in pattern order.  The result is an empty list
    when no pattern produced anything.
    """
    def run(gloss):
        outcomes = [func(pattern, gloss) for pattern in patterns]
        merged = []
        for outcome in outcomes:
            if outcome:
                merged.extend(outcome)
        return merged

    return run
def sequential(func, patterns):
    """(Pattern, Gloss -> Maybe([Gloss])) -> (Gloss -> Maybe([Gloss]))

    Build a function that tries the patterns one after another, each
    time calling ``func(pattern, current)`` where *current* is the most
    recent successful result (initially the input gloss).

    The first successful result replaces the input; every later
    successful result is put in front of the ones collected so far.
    Returns the collected list, or None when no pattern matched.
    """
    def run(gloss):
        collected = [gloss]
        matched = False
        for pattern in patterns:
            applied = func(pattern, collected[0])
            if not applied:
                continue
            # FIXME: here we assume func always returns list of len==1
            if matched:
                collected = applied + collected
            else:
                collected = applied
            matched = True
        if matched:
            return collected
        return None

    # TODO: how to process homonymous affixes? (maybe need to return
    # list of results from single form)
    return run
def firstmatch(func, patterns):
    """(Pattern, Gloss -> Maybe([Gloss])) -> (Gloss -> Maybe([Gloss]))

    Build a function that calls ``func(pattern, gloss)`` for each
    pattern in order and returns the first non-empty result.  Later
    patterns are not tried once one has matched.  Returns None when no
    pattern matches.
    """
    def run(gloss):
        for pattern in patterns:
            applied = func(pattern, gloss)
            if applied:
                return applied
        return None

    return run
class Parser(object):
    """Morphological parser driven by a dictionary and an optional grammar.

    The constructor compiles the token-level plan of the grammar into
    ``self.processing``, a list of ``(step, callable, description)``
    triples that ``lemmatize`` runs in order.
    """

    def __init__(self, dictionary, grammar, detone=False):
        """Dictionary, Grammar, bool -> Parser

        dictionary -- lookup table used by ``lookup`` and ``decompose``.
        grammar -- object providing ``plan['token']`` (a sequence of
            steps) and ``patterns``; when None, a single dictionary
            lookup stage is installed instead.
        detone -- when true, ``lookup_gloss`` looks forms up only after
            passing them through ``detone()``.
        """
        self.dictionary = dictionary
        self.funcdict = {
            'add': f_add,
            'apply': f_apply,
            'parallel': parallel,
            'sequential': sequential,
            'firstmatch': firstmatch,
            'parsed': parsed,
            'lookup': self.lookup,
            'parse': self.parse,
            'decompose': self.decompose
        }
        self.processing = []
        self.detone = detone
        # Defined unconditionally so the attribute exists even when the
        # parser is built without a grammar.
        self.grammar = grammar
        if grammar is None:
            self.processing.append((0, f_apply(self.lookup), ('apply', 'lookup')))
            return
        for step in grammar.plan['token']:
            if step[0] == 'return':
                # The filter name is bound per step; a bare lambda closing
                # over the loop variable would make every filter use the
                # last step of the plan.
                self.processing.append((step[0], self._make_filter(step[1]), step[1]))
            else:
                funclist = []
                for f in step[1]:
                    # names not in funcdict are taken to be pattern names
                    try:
                        funclist.append(self.funcdict[f])
                    except KeyError:
                        funclist.append(grammar.patterns[f])
                # first entry is the combinator, the rest are its arguments
                self.processing.append((step[0], funclist[0](*funclist[1:]), step[1]))

    def _make_filter(self, name):
        """Return a function keeping the parses accepted by funcdict[name]."""
        def keep(parses):
            predicate = self.funcdict[name]
            # An explicit list, so the caller's truthiness test works
            # regardless of what the built-in filter() returns.
            return [g for g in parses if predicate(g)]
        return keep

    def lookup_gloss(self, gloss, gdict):
        """Gloss, Dictionary -> tuple(Gloss)

        Return the entries of *gdict* stored under the form of *gloss*
        that match its ``ps`` and ``gloss`` fields.  With ``self.detone``
        set only the detoned form is looked up; otherwise the form is
        tried as is and the detoned form is used as a fallback.  Returns
        an empty tuple when nothing is found or when the lookup raises
        KeyError/AttributeError (a diagnostic line is printed then).
        """
        lookup_form = None
        try:
            if self.detone:
                bare = detone(gloss.form)
                if bare in gdict:
                    lookup_form = bare
            elif gloss.form in gdict:
                lookup_form = gloss.form
            else:
                bare = detone(gloss.form)
                if not gloss.form == bare and bare in gdict:
                    lookup_form = bare
            if not lookup_form:
                return ()
            pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
            return tuple([dgloss for dgloss in gdict[lookup_form] if dgloss.matches(pattern)])
        except (KeyError, AttributeError):
            # Single-argument print produces the same text as the former
            # print statements and is also valid on newer interpreters.
            if gloss.form in gdict:
                print('PP %s %s' % (gloss.form, gdict[gloss.form]))
            else:
                print('PN %s' % (gloss.form,))
            return ()

    def lookup(self, lemma, make_lemma=False):
        """Gloss -> Maybe([Gloss])

        Look *lemma* up in the dictionary.  An already parsed lemma is
        returned as a one-element tuple.  A lemma with morphemes has each
        unparsed morpheme replaced by its dictionary entries, and every
        resulting combination (``glosslist`` of the compact gloss) is
        returned with a recomputed ``gloss`` and ``ps``.  A lemma without
        morphemes is looked up directly with ``lookup_gloss``.

        make_lemma -- when true and all morphemes of a combination are
            glossed, the combination's gloss is the morpheme glosses
            joined with '-'; otherwise it is the empty string.
        """
        if parsed(lemma):
            return (lemma,)
        if not lemma.morphemes:
            return self.lookup_gloss(lemma, self.dictionary)

        new = CompactGloss(*lemma)
        for i, g in enumerate(lemma.morphemes):
            if not parsed(g):
                dictlist = self.lookup_gloss(g, self.dictionary)
                if dictlist:
                    new = new._replace(morphemes=tuple([dictlist if j == i else m for j, m in enumerate(new.morphemes)]))
        result = []
        # annotate base form with gloss derived from morpheme glosses
        for ngloss in new.glosslist:
            glosses = [m.gloss for m in ngloss.morphemes]
            if make_lemma and all(glosses):
                commongloss = u'-'.join(glosses)
            else:
                commongloss = ''
            # morphemes whose ps does not contain 'mrph'
            stemps = [m.ps for m in ngloss.morphemes if 'mrph' not in m.ps]
            commonps = lemma.ps
            if len(stemps) == 1:
                try:
                    commonps = lemma.psunion(stemps[0])
                except ValueError:
                    commonps = lemma.ps
            result.append(ngloss._replace(gloss=commongloss, ps=commonps))
        return result

    def parse(self, pattern, gloss, joinchar='-'):
        """Pattern, Gloss, str="-" -> Maybe([Gloss])

        Formal parsing only: apply *pattern* to *gloss* without any
        dictionary lookup.  Returns a one-element list with the result,
        or None when the pattern does not apply.  *joinchar* is accepted
        for interface compatibility and is not used.
        """
        result = pattern.apply(gloss)
        if result:
            return [result]
        return None

    def decompose(self, pattern, gloss):
        """Pattern, Gloss -> Maybe([Gloss])

        Split the stem of *gloss* into as many dictionary-attested parts
        as ``pattern.select`` has morphemes, look the parts up and apply
        *pattern* to every fully parsed outcome.  Patterns selecting
        fewer than two morphemes (or none at all) are handed to
        ``parse``.  Returns a non-empty list of glosses or None.
        """
        try:
            parts = len(pattern.select.morphemes)
        except TypeError:
            # morphemes=None: nothing to split on, fall back to plain parse
            parts = 0
        if parts < 2:
            return self.parse(pattern, gloss)

        if gloss.morphemes:
            # FIXME: use only first non-glossed morpheme as possible stem
            unglossed = [(m, pos) for pos, m in enumerate(gloss.morphemes) if not m.gloss]
            if not unglossed:
                # every morpheme already has a gloss: no stem to decompose
                return None
            stemgloss, stempos = unglossed[0]
        else:
            stemgloss = gloss
            stempos = -1

        if not stemgloss.psmatch(pattern.select):
            return None

        result = []
        for forms in parse_composite(stemgloss.form, self.dictionary, parts):
            morphlist = tuple(emptyGloss._replace(form=f).union(p)
                              for f, p in zip(forms, pattern.select.morphemes))
            if not all(morphlist):
                continue
            if stempos < 0:
                newgloss = gloss._replace(morphemes=morphlist)
            else:
                # splice the new morphemes in place of the stem morpheme
                mlist = list(gloss.morphemes)
                mlist[stempos:stempos + 1] = list(morphlist)
                newgloss = gloss._replace(morphemes=tuple(mlist))
            for candidate in self.lookup(newgloss):
                if parsed(candidate):
                    applied = pattern.apply(candidate)
                    if applied:
                        result.append(applied)
        return result or None

    def lemmatize(self, word, debug=False):
        """word -> (stage, [Gloss])

        Run *word* through the compiled processing stages.  A 'return'
        stage filters the current parses and, if anything is left, ends
        processing with them.  Any other stage transforms the parses;
        ``stage`` records the step of the last stage that changed them
        (-1 if none did).  With *debug* set, each stage is printed
        together with the parses it produced.
        """
        stage = -1
        parsedword = [nullgloss(word)]
        for step, stageparser, stagestr in self.processing:
            if step == 'return':
                filtered = stageparser(parsedword)
                if filtered:
                    return (stage, filtered)
            else:
                newparsed = stageparser(parsedword)
                if not newparsed == parsedword:
                    stage = step
                    parsedword = newparsed
                # FIXME: debug statement
                if debug:
                    print(stagestr)
                    print('%s %s' % (stage, '\n'.join(unicode(p) for p in newparsed)))
        return (stage, parsedword)

    def disambiguate(self, sent):
        """[(stage, [Gloss])] -> [(stage, [Gloss])]

        Sentence-level disambiguation.
        TODO: STUB -- the steps of ``grammar.plan['sentence']`` are not
        applied yet; the sentence is returned unchanged.
        """
        return sent

    def process(self, tokens):
        """[[word]] -> [[(stage, [Gloss])]]

        Lemmatize every word of every sentence in *tokens* and pass each
        sentence through ``disambiguate``.  Returns one list per
        sentence; an empty input gives an empty list.
        """
        return [self.disambiguate([self.lemmatize(word) for word in sent])
                for sent in tokens]