Ejemplo n.º 1
0
def counts(wordlist):
    """Yield one ``(stages, length, word)`` triple per entry of *wordlist*.

    Each entry of *wordlist* is itself iterated as a sequence of candidate
    forms.  Every form is passed to ``lemmatize`` together with one of the
    module-level dictionaries: ``wl`` when ``detone`` changes the form,
    ``wl_detone`` when it leaves the form untouched.

    The yielded values are:

    * ``stages`` -- the largest stage value ``lemmatize`` returned for any
      form of the entry (``-2`` when the entry has no forms);
    * ``length`` -- ``len()`` of what ``dict_disambiguate`` returns for the
      glosses pooled over all forms of the entry;
    * ``word``   -- the entry itself, unchanged.
    """
    for word in wordlist:
        stages = -2  # sentinel: stays -2 if no form reports a higher stage
        result = []
        for form in word:
            # detone() leaving the form unchanged selects the detoned table.
            table = wl if form != detone(form) else wl_detone
            stage, gl = lemmatize(form, table)
            if stages < stage:
                stages = stage
            result.extend(gl)
        yield (stages, len(dict_disambiguate(result)), word)
Ejemplo n.º 2
0
# Two separate lookup tables, both starting out empty: `wl` is keyed by
# lower-cased lemma, `wl_detone` by the detoned lower-cased lemma.
wl, wl_detone = {}, {}

def normalize_bailleul(word):
    """Return *word* with every '.' and '-' character removed."""
    kept = []
    for char in word:
        if char in u'.-':
            continue
        kept.append(char)
    return u''.join(kept)

# Fill wl / wl_detone from the <record> elements of the lexicon.
for entry in lexicon.findall('record'):
    # Headword first, then every <va> variant, all with '.' and '-' removed.
    lemmas = [normalize_bailleul(entry.find('lx').text)]
    for e in entry.findall('va'):
        lemmas.append(normalize_bailleul(e.text))

    # A missing or text-less <ps> yields an empty tag set.
    try:
        ps = set(entry.find('ps').text.split('/'))
    except AttributeError:
        ps = set()

    # The gloss is taken from <ge>; if there is no <ge> element, from <ru>;
    # if neither element exists it is the empty string.
    gloss = ''
    for tag in ('ge', 'ru'):
        try:
            gloss = entry.find(tag).text
        except AttributeError:
            continue
        break

    # Records whose tag set contains 'mrph' are left out of both tables.
    if 'mrph' in ps:
        continue

    norm = lemmas[0]
    addlem = (norm, ps, gloss)
    for lemma in lemmas:
        key = lemma.lower()
        # Register the triple once per key in the plain table ...
        known = wl.setdefault(key, [])
        if addlem not in known:
            known.append(addlem)
        # ... and once per key in the detoned table.
        known = wl_detone.setdefault(detone(key), [])
        if addlem not in known:
            known.append(addlem)
Ejemplo n.º 3
0
def shorten(s):
    """Return *s* passed through detone() and then normalize_bailleul()."""
    detoned = detone(s)
    return normalize_bailleul(detoned)
Ejemplo n.º 4
0
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print
        nw = convertw(word)
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)

        glstr = [print_gloss(g) for g in dict_disambiguate(result)]
        for gs in glstr:
            print "    ", gs.encode('utf-8')
        print
Ejemplo n.º 5
0
import re

ci = ConcordanceIndex(test.words(), key=lambda s:s.lower())
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print 
        nw = convertw(word)
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form,wl_detone)
            result.extend(gl)

        glstr = [print_gloss(g) for g in dict_disambiguate(result)]
        for gs in glstr:
            print "    ", gs.encode('utf-8')
        print


Ejemplo n.º 6
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import wl, test
from nltk import FreqDist
from orthograph import convertw, detone

words = []
for i in test.words():
    words.extend(convertw(detone(i)))
fd = FreqDist(words)

for w, lems in wl.items():
    if len(lems) > 1:
        pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
            #if len(pslist) == 1: # polysemy case
            print fd[detone(w)], w,
            for l in lems:
                print '|', '/'.join(l[1]), u"‘" + l[2] + u"’",
            print
Ejemplo n.º 7
0
# Maps a tuple of morphs (a spelling split on '.' and '-') to the list of
# (lemma, ps, ge) triples recorded for that spelling.
morphdict = {}

for entry in etree.findall('record'):
    lx = entry.findtext('lx', default='')
    ge = entry.findtext('ge', default='')
    ps = entry.findtext('ps')

    # The headword followed by the text of every <va> variant.
    lems = [lx]
    lems.extend(va.text for va in entry.findall('va'))

    for v in lems:
        # Outer hyphens are dropped before the spelling is stored or split.
        lemma = v.strip(u'-')
        triple = (lemma, ps, ge)
        lowered = lemma.lower()

        plain_key = tuple(re.split(r'[.-]', lowered))
        morphdict.setdefault(plain_key, []).append(triple)

        # When detone() changes the spelling, also file the triple under
        # the detoned key.
        if v != detone(v):
            detoned_key = tuple(re.split(r'[.-]', detone(lowered)))
            morphdict.setdefault(detoned_key, []).append(triple)

for (lemma, fields) in elist:
    forms = [(-1, lemma)]
    forms.extend([(index, value) for index, (tag, value) in enumerate(fields)
                  if tag == 'va'])
    shift = 1
    for i, form in forms:
        morphs = re.split(r'[.-]+', form)
        # prepare >=2 morph sequences for lookup
        mlist = sfec([], morphs)
        if len(mlist) > 1 and () not in mlist:
            for morph in mlist:
Ejemplo n.º 8
0
# The same source file is read twice: once as a list of entries and once
# as an element tree.
elist = bailleul.entries('bailleul.txt')
etree = e.ElementTree(bailleul.xml('bailleul.txt'))

# Morph tuple -> list of (lemma, ps, ge) triples.
morphdict = {}

for entry in etree.findall('record'):
    lx = entry.findtext('lx', default='')
    ge = entry.findtext('ge', default='')
    ps = entry.findtext('ps')

    # Headword plus the text of each <va> variant.
    lems = [lx]
    for va in entry.findall('va'):
        lems.append(va.text)

    for v in lems:
        lemma = v.strip(u'-')
        # Always index under the lower-cased spelling; add the detoned
        # spelling as a second key when detone() alters the form.
        keys = [lemma.lower()]
        if v != detone(v):
            keys.append(detone(keys[0]))
        for key in keys:
            morphs_key = tuple(re.split(r'[.-]', key))
            morphdict.setdefault(morphs_key, []).append((lemma, ps, ge))
    
for (lemma,fields) in elist:
    forms = [(-1, lemma)]
    forms.extend([(index,value) for index, (tag,value) in enumerate(fields) if tag == 'va'])
    shift = 1
    for i, form in forms:
        morphs = re.split(r'[.-]+', form)
        # prepare >=2 morph sequences for lookup 
        mlist = sfec([], morphs)
        if len(mlist) > 1 and () not in mlist:
            for morph in mlist:
                try:
                    for gloss in morphdict[morph]:
                        fields.insert(i+shift, (r'mm', ':'.join(gloss)))
Ejemplo n.º 9
0
def shorten(s):
    """Apply detone() to *s*, then normalize_bailleul() to the result."""
    without_tones = detone(s)
    shortened = normalize_bailleul(without_tones)
    return shortened
Ejemplo n.º 10
0
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from bamana import wl, test
from nltk import FreqDist
from orthograph import convertw, detone

# Frequency distribution over the converted spellings of every detoned
# corpus word.
words = []
for i in test.words():
    words.extend(convertw(detone(i)))
fd = FreqDist(words)

# Report every key of `wl` that has several entries whose tag sets differ.
# Each report row is a single line:
#     <frequency> <key> | <tags> ‘<gloss>’ | <tags> ‘<gloss>’ ...
for w, lems in wl.items():
    if len(lems) > 1:
        # Union of the second field of every entry (presumably the set of
        # part-of-speech tags -- confirm against bamana.wl).  Built with
        # set().union() because `reduce` is not a builtin under Python 3.
        pslist = set().union(*(lem[1] for lem in lems))
        # Testing `len(pslist) == 1` here instead selects the polysemy case.
        if len(pslist) > 1:
            # Collect the whole row and print it once, so the fields stay
            # space-separated on one line and the row ends with a newline.
            row = [fd[detone(w)], w]
            for lem in lems:
                row.extend(['|', '/'.join(lem[1]), u"‘" + lem[2] + u"’"])
            print(*row)
Ejemplo n.º 11
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import wl,test
from nltk import FreqDist
from orthograph import convertw, detone

words = []
for i in test.words():
    words.extend(convertw(detone(i)))
fd = FreqDist(words)

for w, lems in wl.items():
    if len(lems) > 1:
        pslist = reduce(lambda x,y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
        #if len(pslist) == 1: # polysemy case
            print fd[detone(w)], w,
            for l in lems:
                print '|', '/'.join(l[1]), u"‘" + l[2] + u"’",
            print