def __main__(): from bamana import test from orthograph import convertw ng = collections.defaultdict(int) for i in test.words(): if not re.search(r'[0-9.,;:!?]', i): for w in convertw(i): for n in map(lambda x:x+1, range(3)): for g in ngrams(w, ngram=n): ng[g] = ng[g]+1 for g, f in ng.iteritems(): print u'{0} {1}'.format(g,f)
def __main__():
    """Count character n-gram (n = 1..3) frequencies over the tokenized test
    corpus and print one "<ngram> <count>" pair per line.

    FIX: this Python 3 variant (it uses the print() function) still called
    ``dict.iteritems()``, which was removed in Python 3 — use ``items()``.
    Also imports ``collections``/``re`` locally, which the original used
    without any visible import.
    """
    import collections
    import re
    from bamana import test
    from orthograph import convertw
    ng = collections.defaultdict(int)
    for token in test.words():
        # Skip tokens containing digits or punctuation.
        if not re.search(r'[0-9.,;:!?]', token):
            for w in convertw(token):
                for n in range(1, 4):  # n-gram sizes 1, 2, 3
                    # `ngrams` takes an `ngram=` keyword, so it is a project
                    # helper, not nltk.util.ngrams — left unresolved here.
                    for g in ngrams(w, ngram=n):
                        ng[g] += 1
    for g, f in ng.items():
        print(u'{0} {1}'.format(g, f))
#!/usr/bin/python # -*- encoding: utf-8 -*- from __future__ import division from bamana import test,wl,wl_detone from orthograph import convertw,detone from morphology import lemmatize, dict_disambiguate, print_gloss import re types = list(set([s.lower() for s in set(test.words())])) types.sort() wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)] wtest = [convertw(w) for w in test.words() if not re.search(r'[0-9.,;:!?]', w)] def counts(wordlist): for word in wordlist: stages = -2 length = [] result = [] for form in word: if form != detone(form): stage, gl = lemmatize(form, wl) else: stage, gl = lemmatize(form,wl_detone) if stages < stage: stages = stage result.extend(gl) length = len(dict_disambiguate(result)) yield (stages, length, word)
#!/usr/bin/env python3
"""Label every normalization variant of ambiguous test-corpus tokens as
DICT / PROPER / NOTDICT, printing all variants of a token on one line."""
from bamana import test, wordlist, propernames
from orthography import *

# Reference sets: dictionary headwords and proper names (detoned lookup).
wl = set(wordlist.words())
pn = set(propernames.words())

for w in test.words():
    res = convertw(w)
    if len(res) > 1:  # only ambiguous tokens (more than one variant)
        for r in res:
            if not orth_compliant(r):
                # Drop the non-compliant variant and give up on this token.
                # NOTE(review): mutating `res` while iterating is only safe
                # because of the immediate break.
                res.remove(r)
                break
            # BUG FIX: the Python 2 original wrote `print "DICT:",` — the
            # trailing comma suppresses the newline.  The 3.x conversion
            # `print("DICT:", )` lost that and put every label on its own
            # line; `end=' '` restores the intended single-line output.
            if detone(r) in wl:
                print("DICT:", end=' ')
            elif detone(r) in pn:
                print("PROPER:", end=' ')
            else:
                print("NOTDICT:", end=' ')
            print(r, end=' ')
        else:
            # for/else: terminate the line only when no variant was rejected.
            print()
#!/usr/bin/python from bamana import test,wordlist,propernames from orthography import * wl = set(wordlist.words()) pn = set(propernames.words()) for w in test.words(): res = convertw(w) if len(res) > 1: for r in res: if not orth_compliant(r): res.remove(r) break if detone(r) in wl: print "DICT:", elif detone(r) in pn: print "PROPER:", else: print "NOTDICT:", print r.encode('utf-8'), else: print
#!/usr/bin/python
# -*- encoding: utf-8 -*-
"""Review aid: print a 15-line concordance for every alphabetic token type
of the test corpus, then collect candidate glosses for its forms."""
from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Case-insensitive concordance over the running text.
ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = sorted(set(s.lower() for s in test.words()))
for word in types:
    # Guard clause: skip tokens containing digits or punctuation.
    if re.search(r'[0-9.,;:!?]', word):
        continue
    ci.print_concordance(word, lines=15)
    print
    nw = convertw(word)
    # Prefer normalization variants already present in the tonal lexicon;
    # fall back to all variants when none is known.
    nwl = [w for w in nw if w in wl]
    formlist = nwl or nw
    result = []
    for form in formlist:
        # Tonal forms are looked up in wl, toneless ones in wl_detone.
        lexicon = wl if form != detone(form) else wl_detone
        stage, gl = lemmatize(form, lexicon)
        result.extend(gl)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Partition the test-corpus vocabulary into tokens found in the dictionary
(`indict`), found among proper names (`inproper`), or in neither
(`notindict`), based on their detoned normalization variants."""
from __future__ import division
from bamana import test, wordlist, propernames
from orthography import *

# Reference sets: dictionary headwords and proper names.
wl = set(wordlist.words())
pn = set(propernames.words())

indict = set()
inproper = set()
notindict = set()
all = set(test.words())  # NOTE: shadows the builtin; name kept for compatibility
for w in all:
    res = convertw(w)
    wordindict = False
    for r in res:
        word = detone(r)
        if not orth_compliant(word):
            break  # a non-compliant variant disqualifies the whole token
        if word in wl:
            indict.add(w)
            wordindict = True
        if word in pn:
            inproper.add(w)
            wordindict = True
    else:
        # BUG FIX: the original added w to notindict inside the loop, as
        # soon as ONE variant missed both sets — so a token whose later
        # variant was in the dictionary ended up in both indict and
        # notindict.  Decide only after every variant has been checked;
        # for/else preserves the "skip tokens with a non-compliant
        # variant" behavior of the original break.
        if not wordindict:
            notindict.add(w)
#!/usr/bin/env python3 # -*- encoding: utf-8 -*- from __future__ import division from bamana import test, wl, wl_detone from orthograph import convertw, detone from morphology import lemmatize, dict_disambiguate, print_gloss from nltk import FreqDist import re types = list(set([s.lower() for s in set(test.words())])) types.sort() wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)] fdist = FreqDist( tuple(convertw(w)) for w in test.words() if not re.search(r'[0-9.,;:!?]', w)) def counts(wordlist, fd): for word in wordlist: stages = -2 length = [] result = [] for form in word: if form != detone(form): stage, gl = lemmatize(form, wl) else: stage, gl = lemmatize(form, wl_detone) if stages < stage: stages = stage
#!/usr/bin/python # -*- encoding: utf-8 -*- from bamana import wl, test from nltk import FreqDist from orthograph import convertw, detone words = [] for i in test.words(): words.extend(convertw(detone(i))) fd = FreqDist(words) for w, lems in wl.items(): if len(lems) > 1: pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems]) if len(pslist) > 1: #if len(pslist) == 1: # polysemy case print fd[detone(w)], w, for l in lems: print '|', '/'.join(l[1]), u"‘" + l[2] + u"’", print
#!/usr/bin/python
# -*- encoding: utf-8 -*-
"""For each alphabetic token type in the test corpus, show a 15-line
concordance and gather the candidate glosses of its normalized forms."""
from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Case-insensitive concordance index over the running text.
ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = sorted(set(s.lower() for s in test.words()))
for word in types:
    # Skip tokens containing digits or punctuation.
    if re.search(r'[0-9.,;:!?]', word):
        continue
    ci.print_concordance(word, lines=15)
    print
    nw = convertw(word)
    # Use only variants known to the tonal lexicon when there are any,
    # otherwise keep every variant.
    nwl = [w for w in nw if w in wl]
    formlist = nwl or nw
    result = []
    for form in formlist:
        # Tonal forms go through wl, toneless ones through wl_detone.
        lexicon = wl if form != detone(form) else wl_detone
        stage, gl = lemmatize(form, lexicon)
        result.extend(gl)
#!/usr/bin/python # -*- encoding: utf-8 -*- from __future__ import division from bamana import test,wl,wl_detone from orthograph import convertw,detone from morphology import lemmatize, dict_disambiguate, print_gloss from nltk import FreqDist import re types = list(set([s.lower() for s in set(test.words())])) types.sort() wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)] fdist = FreqDist(tuple(convertw(w)) for w in test.words() if not re.search(r'[0-9.,;:!?]', w)) def counts(wordlist,fd): for word in wordlist: stages = -2 length = [] result = [] for form in word: if form != detone(form): stage, gl = lemmatize(form, wl) else: stage, gl = lemmatize(form,wl_detone) if stages < stage: stages = stage result.extend(gl) length = len(dict_disambiguate(result)) yield (stages, length, u' '.join(word), fd[tuple(word)])
#!/usr/bin/python # -*- encoding: utf-8 -*- from bamana import wl,test from nltk import FreqDist from orthograph import convertw, detone words = [] for i in test.words(): words.extend(convertw(detone(i))) fd = FreqDist(words) for w, lems in wl.items(): if len(lems) > 1: pslist = reduce(lambda x,y: x.union(y), [lem[1] for lem in lems]) if len(pslist) > 1: #if len(pslist) == 1: # polysemy case print fd[detone(w)], w, for l in lems: print '|', '/'.join(l[1]), u"‘" + l[2] + u"’", print
#!/usr/bin/python
# -*- encoding: utf-8 -*-
"""Partition the test-corpus vocabulary into tokens found in the dictionary
(`indict`), found among proper names (`inproper`), or in neither
(`notindict`), based on their detoned normalization variants."""
from __future__ import division
from bamana import test, wordlist, propernames
from orthography import *

# Reference sets: dictionary headwords and proper names.
wl = set(wordlist.words())
pn = set(propernames.words())

indict = set()
inproper = set()
notindict = set()
all = set(test.words())  # NOTE: shadows the builtin; name kept for compatibility
for w in all:
    res = convertw(w)
    wordindict = False
    for r in res:
        word = detone(r)
        if not orth_compliant(word):
            break  # a non-compliant variant disqualifies the whole token
        if word in wl:
            indict.add(w)
            wordindict = True
        if word in pn:
            inproper.add(w)
            wordindict = True
    else:
        # BUG FIX: the original added w to notindict inside the loop, as
        # soon as ONE variant missed both sets — so a token whose later
        # variant was in the dictionary ended up in both indict and
        # notindict.  Decide only after every variant has been checked;
        # for/else preserves the "skip tokens with a non-compliant
        # variant" behavior of the original break.
        if not wordindict:
            notindict.add(w)