def counts(wordlist):
    """Yield (stages, length, word) for every word in wordlist.

    Each word is a sequence of forms.  Every form is looked up in the
    tonal wordlist (wl) when it carries tone marks (i.e. differs from
    its detoned version), otherwise in the detoned wordlist
    (wl_detone).  `stages` is the highest lemmatization stage seen
    across the word's forms (-2 when the word yields no lookups);
    `length` is the number of glosses surviving disambiguation.
    """
    for word in wordlist:
        stages = -2
        result = []
        for form in word:
            # A form that changes under detone() carries tone marks,
            # so the tonal dictionary applies; otherwise fall back to
            # the detoned one.
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            if stages < stage:
                stages = stage
            result.extend(gl)
        # NOTE: the original also pre-initialized `length = []` here;
        # that value was dead (always overwritten below) and is removed.
        length = len(dict_disambiguate(result))
        yield (stages, length, word)
# Wordlists keyed by lowercased lemma: tonal (wl) and detoned (wl_detone).
# Values are lists of (normalized_lemma, part_of_speech_set, gloss) tuples.
wl = {}
wl_detone = {}


def normalize_bailleul(word):
    """Strip Bailleul-dictionary morpheme separators ('.' and '-')."""
    return u''.join([c for c in word if c not in u'.-'])


for entry in lexicon.findall('record'):
    # The headword (lx) and all variant forms (va) share the entry data.
    lemmas = [normalize_bailleul(entry.find('lx').text)]
    lemmas.extend([normalize_bailleul(e.text) for e in entry.findall('va')])
    # find() returns None for a missing field; .text then raises
    # AttributeError, which is used as the "field absent" signal.
    try:
        ps = set(entry.find('ps').text.split('/'))
    except AttributeError:
        ps = set([])
    try:
        gloss = entry.find('ge').text
    except AttributeError:
        # Fall back to the Russian gloss, then to an empty string.
        try:
            gloss = entry.find('ru').text
        except AttributeError:
            gloss = ''
    if 'mrph' not in ps:
        norm = lemmas[0]
        addlem = (norm, ps, gloss)
        for lemma in lemmas:
            # Bind the bucket returned by setdefault once and reuse it,
            # instead of performing the dict lookup twice (membership
            # test + append) as the original did.
            bucket = wl.setdefault(lemma.lower(), [])
            if addlem not in bucket:
                bucket.append(addlem)
            bucket_detone = wl_detone.setdefault(detone(lemma.lower()), [])
            if addlem not in bucket_detone:
                bucket_detone.append(addlem)
def shorten(s):
    """Return s with tone marks removed and Bailleul separators stripped."""
    detoned = detone(s)
    return normalize_bailleul(detoned)
from orthograph import convertw, detone from morphology import lemmatize, dict_disambiguate, print_gloss import re ci = ConcordanceIndex(test.words(), key=lambda s: s.lower()) types = list(set([s.lower() for s in set(test.words())])) types.sort() for word in types: if not re.search(r'[0-9.,;:!?]', word): ci.print_concordance(word, lines=15) print nw = convertw(word) nwl = [w for w in nw if w in wl] if nwl: formlist = nwl else: formlist = nw result = [] for form in formlist: if form != detone(form): stage, gl = lemmatize(form, wl) else: stage, gl = lemmatize(form, wl_detone) result.extend(gl) glstr = [print_gloss(g) for g in dict_disambiguate(result)] for gs in glstr: print " ", gs.encode('utf-8') print
import re ci = ConcordanceIndex(test.words(), key=lambda s:s.lower()) types = list(set([s.lower() for s in set(test.words())])) types.sort() for word in types: if not re.search(r'[0-9.,;:!?]', word): ci.print_concordance(word, lines=15) print nw = convertw(word) nwl = [w for w in nw if w in wl] if nwl: formlist = nwl else: formlist = nw result = [] for form in formlist: if form != detone(form): stage, gl = lemmatize(form, wl) else: stage, gl = lemmatize(form,wl_detone) result.extend(gl) glstr = [print_gloss(g) for g in dict_disambiguate(result)] for gs in glstr: print " ", gs.encode('utf-8') print
#!/usr/bin/python # -*- encoding: utf-8 -*- from bamana import wl, test from nltk import FreqDist from orthograph import convertw, detone words = [] for i in test.words(): words.extend(convertw(detone(i))) fd = FreqDist(words) for w, lems in wl.items(): if len(lems) > 1: pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems]) if len(pslist) > 1: #if len(pslist) == 1: # polysemy case print fd[detone(w)], w, for l in lems: print '|', '/'.join(l[1]), u"‘" + l[2] + u"’", print
morphdict = {} for entry in etree.findall('record'): lx = entry.findtext('lx', default='') ge = entry.findtext('ge', default='') ps = entry.findtext('ps') lems = [lx] for va in entry.findall('va'): lems.append(va.text) for v in lems: lemma = v.strip(u'-') morphdict.setdefault(tuple(re.split(r'[.-]', v.strip(u'-').lower())), []).append((lemma, ps, ge)) if v != detone(v): morphdict.setdefault( tuple(re.split(r'[.-]', detone(v.strip(u'-').lower()))), []).append((lemma, ps, ge)) for (lemma, fields) in elist: forms = [(-1, lemma)] forms.extend([(index, value) for index, (tag, value) in enumerate(fields) if tag == 'va']) shift = 1 for i, form in forms: morphs = re.split(r'[.-]+', form) # prepare >=2 morph sequences for lookup mlist = sfec([], morphs) if len(mlist) > 1 and () not in mlist: for morph in mlist:
elist = bailleul.entries('bailleul.txt') etree = e.ElementTree(bailleul.xml('bailleul.txt')) morphdict = {} for entry in etree.findall('record'): lx = entry.findtext('lx', default='') ge = entry.findtext('ge', default='') ps = entry.findtext('ps') lems = [lx] for va in entry.findall('va'): lems.append(va.text) for v in lems: lemma = v.strip(u'-') morphdict.setdefault(tuple(re.split(r'[.-]', v.strip(u'-').lower())), []).append((lemma,ps,ge)) if v != detone(v): morphdict.setdefault(tuple(re.split(r'[.-]', detone(v.strip(u'-').lower()))), []).append((lemma,ps,ge)) for (lemma,fields) in elist: forms = [(-1, lemma)] forms.extend([(index,value) for index, (tag,value) in enumerate(fields) if tag == 'va']) shift = 1 for i, form in forms: morphs = re.split(r'[.-]+', form) # prepare >=2 morph sequences for lookup mlist = sfec([], morphs) if len(mlist) > 1 and () not in mlist: for morph in mlist: try: for gloss in morphdict[morph]: fields.insert(i+shift, (r'mm', ':'.join(gloss)))
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# List wordforms with several dictionary entries whose part-of-speech
# sets differ, each with its corpus frequency and all entry glosses on
# one line.  Fixed from a broken 2-to-3 conversion:
#  * `reduce` is no longer a builtin in Python 3 -> functools.reduce,
#  * `print(x, y, )` printed a newline after each call, breaking the
#    one-record-per-line output -> use end=' ',
#  * the final bare `print` is a no-op expression in Python 3 -> print().
from functools import reduce

from bamana import wl, test
from nltk import FreqDist
from orthograph import convertw, detone

words = []
for i in test.words():
    words.extend(convertw(detone(i)))
fd = FreqDist(words)

for w, lems in wl.items():
    if len(lems) > 1:
        # Union of the POS sets of all entries for this form.
        pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
            # Keep the whole record on one line via end=' ' (the
            # Python 2 original used trailing-comma prints).
            print(fd[detone(w)], w, end=' ')
            for l in lems:
                print('|', '/'.join(l[1]), u"‘" + l[2] + u"’", end=' ')
            print()
#!/usr/bin/python # -*- encoding: utf-8 -*- from bamana import wl,test from nltk import FreqDist from orthograph import convertw, detone words = [] for i in test.words(): words.extend(convertw(detone(i))) fd = FreqDist(words) for w, lems in wl.items(): if len(lems) > 1: pslist = reduce(lambda x,y: x.union(y), [lem[1] for lem in lems]) if len(pslist) > 1: #if len(pslist) == 1: # polysemy case print fd[detone(w)], w, for l in lems: print '|', '/'.join(l[1]), u"‘" + l[2] + u"’", print