Ejemplo n.º 1
0
def __main__():
    """Print n-gram frequencies for the tokens of ``bamana.test``.

    Every token returned by ``test.words()`` that contains no digit or
    punctuation character is expanded with ``convertw``; for each resulting
    form the n-grams of order 1, 2 and 3 (as produced by ``ngrams``) are
    counted.  One ``<ngram> <count>`` line is printed per distinct n-gram.
    """
    from bamana import test
    from orthograph import convertw
    ng = collections.defaultdict(int)

    for token in test.words():
        # Tokens holding digits or punctuation are ignored altogether.
        if re.search(r'[0-9.,;:!?]', token):
            continue
        for variant in convertw(token):
            for order in range(1, 4):
                for gram in ngrams(variant, ngram=order):
                    ng[gram] += 1

    # A single parenthesised argument prints identically under the
    # Python 2 print statement.
    for gram, freq in ng.items():
        print(u'{0} {1}'.format(gram, freq))
Ejemplo n.º 2
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from __future__ import division
from bamana import test,wl,wl_detone
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Sorted list of the distinct lower-cased tokens of the test corpus.
types = sorted(set(s.lower() for s in test.words()))

# convertw() output for every type / every running token, leaving out any
# token that contains a digit or a punctuation character.
wlist = [convertw(w) for w in types if re.search(r'[0-9.,;:!?]', w) is None]
wtest = [convertw(w) for w in test.words() if re.search(r'[0-9.,;:!?]', w) is None]

def counts(wordlist):
    """Yield lemmatization statistics for each entry of *wordlist*.

    Each entry is an iterable of forms.  A form that differs from its
    ``detone``-d version is lemmatized against ``wl``; otherwise
    ``wl_detone`` is used.

    Yields:
        A ``(stages, length, word)`` tuple per entry, where ``stages`` is
        the largest stage value returned by ``lemmatize`` over the entry's
        forms (``-2`` when there are none), ``length`` is the number of
        items ``dict_disambiguate`` returns for the pooled glosses, and
        ``word`` is the entry itself.
    """
    for word in wordlist:
        stages = -2
        result = []
        for form in word:
            lexicon = wl if form != detone(form) else wl_detone
            stage, gl = lemmatize(form, lexicon)
            if stages < stage:
                stages = stage
            result.extend(gl)
        yield (stages, len(dict_disambiguate(result)), word)
Ejemplo n.º 3
0
#!/usr/bin/python
from bamana import test,wordlist,propernames
from orthography import *

# Membership sets built from the word-list and proper-name corpora.
wl = set(wordlist.words())
pn = set(propernames.words())

# For every corpus token whose conversion yields more than one candidate,
# print the candidates on one line, each preceded by a tag telling whether
# its detoned form is in the word list (DICT), in the proper names (PROPER)
# or in neither (NOTDICT).
for w in test.words():
    res = convertw(w)
    if len(res) > 1:
        for r in res:
            if not orth_compliant(r):
                # NOTE(review): `res` is not read again after this point in
                # the visible code, so the removal has no visible effect.
                # The break also skips the for/else below, so no newline is
                # printed after the candidates already emitted for this
                # token -- confirm this is intended.
                res.remove(r)
                break
            # Trailing commas (Python 2 print statement) keep the tag and
            # the candidate on the same output line.
            if detone(r) in wl:
                print "DICT:", 
            elif detone(r) in pn:
                print "PROPER:",
            else:
                print "NOTDICT:", 
            print r.encode('utf-8'),
        else:
            # Reached only when no candidate triggered the break: end the line.
            print 
    
Ejemplo n.º 4
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import test,wl,wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Concordance index over the corpus tokens, keyed case-insensitively.
ci = ConcordanceIndex(test.words(), key=lambda s:s.lower())
# Sorted list of the distinct lower-cased tokens of the test corpus.
types = sorted(set(s.lower() for s in test.words()))

# For each type without digits/punctuation: show up to 15 concordance lines,
# then lemmatize its converted forms and pool the glosses in `result`.
for word in types:
    if re.search(r'[0-9.,;:!?]', word) is None:
        ci.print_concordance(word, lines=15)
        # Bare Python 2 print statement: emits an empty line.
        print
        nw = convertw(word)
        # Prefer the converted forms found in wl; fall back to all of them.
        nwl = [w for w in nw if w in wl]
        formlist = nwl or nw
        result = []
        for form in formlist:
            # Forms that differ from their detoned version are looked up in
            # wl, the others in wl_detone.
            lexicon = wl if form != detone(form) else wl_detone
            stage, gl = lemmatize(form, lexicon)
            result.extend(gl)
Ejemplo n.º 5
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from __future__ import division
from bamana import test,wl,wl_detone
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
from nltk import FreqDist
import re

# Sorted list of the distinct lower-cased tokens of the test corpus.
types = sorted(set(s.lower() for s in test.words()))

# convertw() output per type, and a frequency distribution of the
# tuple-ified convertw() output per running token; tokens containing a digit
# or a punctuation character are left out of both.
wlist = [convertw(w) for w in types if re.search(r'[0-9.,;:!?]', w) is None]
fdist = FreqDist(tuple(convertw(w)) for w in test.words() if re.search(r'[0-9.,;:!?]', w) is None)

def counts(wordlist,fd):
    """Yield lemmatization statistics and frequency for each entry.

    Each entry of *wordlist* is an iterable of forms.  A form that differs
    from its ``detone``-d version is lemmatized against ``wl``; otherwise
    ``wl_detone`` is used.  *fd* is indexed with ``tuple(entry)``.

    Yields:
        A ``(stages, length, joined, freq)`` tuple per entry: the largest
        stage value returned by ``lemmatize`` over the entry's forms
        (``-2`` when there are none), the number of items
        ``dict_disambiguate`` returns for the pooled glosses, the forms
        joined with single spaces, and ``fd[tuple(entry)]``.
    """
    for word in wordlist:
        stages = -2
        result = []
        for form in word:
            lexicon = wl if form != detone(form) else wl_detone
            stage, gl = lemmatize(form, lexicon)
            if stages < stage:
                stages = stage
            result.extend(gl)
        length = len(dict_disambiguate(result))
        yield (stages, length, u' '.join(word), fd[tuple(word)])
Ejemplo n.º 6
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import wl,test
from nltk import FreqDist
from orthograph import convertw, detone

# Flat list of every form convertw() returns for the detoned corpus tokens,
# and the frequency distribution over those forms.
words = [form for i in test.words() for form in convertw(detone(i))]
fd = FreqDist(words)

# Report word-list entries that have more than one lemma and whose lemmas,
# taken together, carry more than one distinct value in field 1.
# Each output line holds: corpus frequency of the detoned entry, the entry,
# then for every lemma "| <field 1 joined with '/'> ‘<field 2>’".
for w, lems in wl.items():
    if len(lems) > 1:
        # Union of field 1 over all lemmas (field 1 must offer .union();
        # presumably a set of part-of-speech tags -- TODO confirm).
        pslist = reduce(lambda x,y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
        #if len(pslist) == 1: # polysemy case
            # Trailing commas (Python 2 print statement) keep the whole
            # report for one entry on a single line.
            print fd[detone(w)], w,
            for l in lems:
                print '|', '/'.join(l[1]), u"‘" + l[2] + u"’",
            # Bare print terminates the line.
            print

Ejemplo n.º 7
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import division
from bamana import test,wordlist,propernames
from orthography import *

# Membership sets built from the word-list and proper-name corpora.
wl = set(wordlist.words())
pn = set(propernames.words())

# Result buckets, filled with original corpus tokens by the loop below.
indict = set()
inproper = set()
notindict = set()
# NOTE: this name shadows the builtin all(); it is kept unchanged because
# code outside this section may refer to it.
all = set(test.words())

for w in all:
    res = convertw(w)
    wordindict = False
    for r in res:
        word = detone(r)
        if not orth_compliant(word):
            # A non-compliant candidate stops the scan; the for/else branch
            # is skipped, so the token is never added to notindict.
            break
        # Record the token in every bucket whose lexicon has the candidate.
        for lexicon, bucket in ((wl, indict), (pn, inproper)):
            if word in lexicon:
                bucket.add(w)
                wordindict = True
    else:
        # All candidates were compliant, yet none matched either lexicon.
        if not wordindict:
            notindict.add(w)