def main(argv): options, remainder = getopt.gnu_getopt(argv[1:], 'ocup:sn:', [ 'original', 'constrained', 'unconstrained', 'pvalue', 'separate', 'name' ]) pv = 0.05 (Goriginal, Gconstrained, Gunconstrained, Gseparate, Gname) = False, False, False, False, 'analyzer.bin' for opt, arg in options: if opt in ('-o', '--original'): Goriginal = True elif opt in ('-c', '--constrained'): Gconstrained = True elif opt in ('-u', '--unconstrained'): Gunconstrained = True elif opt in ('-s', '--separate'): Gseparate = True elif opt in ('-p', '--pvalue'): pv = float(arg) elif opt in ('-n', '--name'): Gname = arg paradigms = paradigm.load_file(remainder[0]) analyzers = [] analyzernames = [] for analyzertype in (('Goriginal', 1.0), ('Gconstrained', pv), ('Gunconstrained', 0.0)): if eval(analyzertype[0]) == True: analyzers.append( paradigms_to_foma(paradigms, analyzertype[0], analyzertype[1])) analyzernames.append(analyzertype[0]) for a in analyzers: print a.encode('utf-8') if len(analyzers) > 0: if Gseparate == True: for a in analyzernames: print 'regex ' + a + ';' else: print 'regex ' + u' .P. '.join(analyzernames) + ';' print 'save stack ' + Gname
def build(inpfile, ngramorder, ngramprior):
    """Load paradigms and train one string n-gram model per variable slot.

    ``inpfile`` is passed to ``paradigm.load_file``.  ``ngramorder`` and
    ``ngramprior`` are forwarded to ``stringngram`` as ``order`` and
    ``ngramprior``.

    Returns a triple ``(paradigms, numexamples, lms)``:
      * ``paradigms``   -- whatever ``paradigm.load_file`` returned,
      * ``numexamples`` -- the sum of ``.count`` over all paradigms,
      * ``lms``         -- one ``(numvars, slotmodels)`` pair per paradigm,
                           in the same order as ``paradigms``.
    """
    paradigms = paradigm.load_file(inpfile)
    alphabet = paradigms_to_alphabet(paradigms)
    numexamples = sum(entry.count for entry in paradigms)

    lms = []
    for entry in paradigms:
        # Variable slots sit at the odd indices 1, 3, 5, ... of entry.slots.
        numvars = (len(entry.slots) - 1) // 2
        slotmodels = [
            stringngram(entry.slots[2 * slot + 1][1],
                        alphabet=alphabet,
                        order=ngramorder,
                        ngramprior=ngramprior)
            for slot in xrange(numvars)
        ]
        lms.append((numvars, slotmodels))

    return paradigms, numexamples, lms
def __init__(self, pfile):
    """Index the paradigms in ``pfile`` by name and build the compat graph.

    ``self.ptable`` maps each paradigm name to a pair
    ``(forms, paradigm)`` where ``forms`` is the list of ``'+'``-joined
    ``form`` sequences of the paradigm's forms.  If two paradigms share a
    name, the later one wins.
    """
    table = {}
    for para in paradigm.load_file(pfile):
        joined_forms = ['+'.join(entry.form) for entry in para.forms]
        table[para.name] = (joined_forms, para)
    self.ptable = table

    # compatibility adjacency graph
    self.compat_graph = self._build_compat_graph()
return [(1+m-len(s))*' '+s for s in ss]
# NOTE(review): the statement above is the tail of a definition whose header
# lies before this chunk (presumably ralign, which the script below calls --
# confirm and restore its indentation against the full file).

def lalign(ss):
    """Pad every string in ``ss`` on the right to a common width.

    The width is one more than the longest string, so each result ends in at
    least one space.  ``max`` raises ValueError when ``ss`` is empty.
    """
    m = max([len(s) for s in ss])
    return [(s+(1+m-len(s))*' ') for s in ss]

def extract_form_information(ps,only_lemma=False):
    """Group paradigm forms by their ``'form'`` value.

    ``ps`` is an iterable of objects providing ``name``, ``count``,
    ``paradigm_forms()`` and ``slots`` (pairs of ``(is_var, members)``).
    With ``only_lemma`` set, only the first entry of ``paradigm_forms()`` is
    considered for each paradigm.

    Returns a list of ``(n, form, entries)`` sorted in descending order, where
    ``entries`` is the set of ``(count, name, shape['w'], slot_summary)``
    tuples that share ``form`` and ``n`` is ``len(entries)``.
    """
    pss = [(p.name, # paradigm name
            p.count, # member count
            [p.paradigm_forms()[0]] if only_lemma else p.paradigm_forms(), # form patterns + form insts
            ' '.join(['[%s]' % ",".join(list(set(s))[:5]) for (is_var,s) in p.slots if is_var])) # variable slots, at most 5 distinct members each
           for p in ps]
    # form -> set of (count, paradigm name, w, slot summary)
    result = defaultdict(set)
    for (n1,c,fs,s) in pss:
        for shape in fs:
            result[shape['form']].add((c,n1, shape['w'], s))
    result = [(len(xs),f,xs) for (f,xs) in result.iteritems()]
    # sort with respect to the ambiguity count
    result.sort(reverse=True)
    return result

# Script body: argv[1] is the paradigm file; passing '-l' anywhere on the
# command line restricts the report to the first form of each paradigm.
for (n,f,xs) in extract_form_information(paradigm.load_file(sys.argv[1]),'-l' in sys.argv):
    if len(xs) > 1: # we only print the ambiguous forms.
        xs = sorted(xs,reverse=True)
        wfs = ralign([x[2] for x in xs])
        lms = lalign(['p_%s %d' % (x[1],x[0]) for x in xs])
        sls = [x[3] for x in xs]
        # Header line, indented so that it lines up with the columns below.
        print ((len(wfs[0])+len(lms[0]) - len(f)-3)*' ' + '=> %s <=' % f).encode('utf-8')
        for t in zip(lms,wfs,sls):
            print ('%s%s %s' % t).encode('utf-8')
        print
def main(argv): options, remainder = getopt.gnu_getopt( argv[1:], 'tk:n:p:dr:', ['tables', 'kbest', 'ngram', 'prior', 'debug', 'pprior']) print_tables, kbest, ngramorder, ngramprior, debug, pprior = False, 1, 3, 0.01, False, 1.0 for opt, arg in options: if opt in ('-t', '--tables'): print_tables = True elif opt in ('-k', '--kbest'): kbest = int(arg) elif opt in ('-n', '--ngram'): ngramorder = int(arg) elif opt in ('-p', '--prior'): ngramprior = float(arg) elif opt in ('-d', '--debug'): debug = True elif opt in ('-d', '--debug'): debug = True elif opt in ('-r', '--pprior'): pprior = float(arg) paradigms = paradigm.load_file( sys.argv[1]) # [(occurrence_count, name, paradigm),...,] alphabet = paradigms_to_alphabet(paradigms) numexamples = sum(map(lambda x: x.count, paradigms)) lms = [] # Learn n-gram LM for each variable for pindex, p in enumerate(paradigms): numvars = (len(p.slots) - 1) / 2 slotmodels = [] for v in xrange(0, numvars): varinsts = p.slots[v * 2 + 1][1] model = stringngram(varinsts, alphabet=alphabet, order=ngramorder, ngramprior=ngramprior) slotmodels.append(model) lms.append((numvars, slotmodels)) for line in iter(lambda: sys.stdin.readline().decode('utf-8'), ''): words = line.strip().split() if len(words) == 0: continue # Quick filter out most paradigms fittingparadigms = [(pindex, p) for pindex, p in enumerate(paradigms) if all( p.fits_paradigm(w, constrained=False) for w in words)] fittingparadigms = filter(lambda p: eval_multiple_entries(p[1], words), fittingparadigms) if debug: print "Plausible paradigms:" for pnum, p in fittingparadigms: print pnum, p.name analyses = [] # Calculate score for each possible variable assignment for pindex, p in fittingparadigms: prior = math.log(p.count / float(numexamples)) vars = eval_multiple_entries(p, words) # All possible instantiations if len(vars) == 0: # Word matches score = prior analyses.append((score, p, ())) else: for v in vars: score = prior * pprior + len(words) * eval_vars( v, lms[pindex]) #score = 
len(words) * eval_vars(v, lms[pindex]) analyses.append((score, p, v)) analyses.sort(reverse=True, key=lambda x: x[0]) # Print all analyses + optionally a table for aindex, (score, p, v) in enumerate(analyses): if aindex >= kbest: break wordformlist = [] varstring = '(' + ','.join([ str(feat) + '=' + val for feat, val in zip(range(1, len(v) + 1), v) ]) + ')' table = p(*v) # Instantiate table with vars from analysis baseform = table[0][0] matchtable = [(form, msd) for form, msd in table if form in words] wordformlist = [ form + ':' + baseform + ',' + ','.join([m[0] + '=' + m[1] for m in msd]) for form, msd in matchtable ] print( unicode(score) + ' ' + p.name + ' ' + varstring + ' ' + '#'.join(wordformlist)).encode("utf-8") if print_tables: for form, msd in table: if form in words: form = "*" + form + "*" msdprint = ','.join([m[0] + '=' + m[1] for m in msd]) print(form + '\t' + msdprint).encode("utf-8") print
def __init__(self, pfile):
    """Index the paradigms in ``pfile`` by name and build the compat graph.

    ``self.ptable`` maps each paradigm name to a pair
    ``(forms, paradigm)`` where ``forms`` is the list of ``'+'``-joined
    ``form`` sequences of the paradigm's forms.  If two paradigms share a
    name, the later one wins.
    """
    table = {}
    for para in paradigm.load_file(pfile):
        joined_forms = ['+'.join(entry.form) for entry in para.forms]
        table[para.name] = (joined_forms, para)
    self.ptable = table

    # compatibility adjacency graph
    self.compat_graph = self._build_compat_graph()
[p.paradigm_forms()[0]] if only_lemma else p.paradigm_forms(), # form patterns + form insts ' '.join([ '[%s]' % ",".join(list(set(s))[:5]) for (is_var, s) in p.slots if is_var ])) # slots (< 5 members) for p in ps ] result = defaultdict(set) for (n1, c, fs, s) in pss: for shape in fs: result[shape['form']].add((c, n1, shape['w'], s)) result = [(len(xs), f, xs) for (f, xs) in result.iteritems() ] # sort with respect to the ambiguity count result.sort(reverse=True) return result for (n, f, xs) in extract_form_information(paradigm.load_file(sys.argv[1]), '-l' in sys.argv): if len(xs) > 1: # we only print the ambiguous forms. xs = sorted(xs, reverse=True) wfs = ralign([x[2] for x in xs]) lms = lalign(['p_%s %d' % (x[1], x[0]) for x in xs]) sls = [x[3] for x in xs] print((len(wfs[0]) + len(lms[0]) - len(f) - 3) * ' ' + '=> %s <=' % f).encode('utf-8') for t in zip(lms, wfs, sls): print('%s%s %s' % t).encode('utf-8') print
ss = [ss]
return ",".join(['%s=%s' % (n,s) for (n,s) in enumerate(ss,1)])
# NOTE(review): the two statements above are the tail of a definition whose
# header lies before this chunk; their original indentation cannot be
# recovered from this view -- restore it against the full file.

def read_data(file):
    """Read blank-line-separated tables from a UTF-8 text file.

    For every non-empty (stripped) line only the text before the first tab
    is kept.  A blank line closes the current table and starts a new one.

    Returns a list of tables, each a list of unicode strings.

    NOTE(review): entries collected after the last blank line are never
    appended to the result, so a file that does not end with a blank line
    appears to lose its final table -- confirm whether that is intended.
    """
    (data,table) = ([],[])
    with codecs.open(file,encoding='utf-8') as f:
        for l in f:
            l = l.strip()
            if len(l) > 0:
                table.append(l.split('\t')[0])
            else:
                data.append(table)
                table = []
    return data

# Script body: argv[1] is the paradigm file, argv[2] the table file.
ps = paradigm.load_file(sys.argv[1])
ds = read_data(sys.argv[2])
exp = []
total_number_of_words = float(sum([p.count for p in ps]))
for t in ds:
    # t[0] is the first entry of the table; results for it collect in w_result[1].
    w_result = (t[0],[])
    for p in ps:
        bf_match = p.match(t[0],[0])[0] # baseform match
        if bf_match == None:
            # Retry the same match with constrained=False.
            bf_match = p.match(t[0],[0],constrained=False)[0]
        if bf_match != None:
            for (sc,bs) in bf_match[::-1]:
                tlen = len(t)
def loop(): " The main loop " parafile = sys.argv[1] paradigms = P.load_file(parafile) parafile = codecs.open(parafile, encoding='utf8').readlines() readline.parse_and_bind('tab: complete') while True: inp = raw_input("skriv ett ord:").decode('utf8') # Show single paradigms if inp.strip() == 'Singlar': singels = find_singles(parafile) print '\n'.join(singels) print continue # Show paradigm statistics if inp.strip() == 'Stats': print '%s paradigm\n' % len(parafile) print 'Ord per paradigm' for words, num in sorted(list(count(parafile).items())): if words >= 10: words = '%s+' % words print words, '\t', num print continue # Show word statistic show = re.search('Show\s(.*\S)', inp) if show is not None: show_word(show.group(1), parafile) continue # Create a table for a new list of words (comma separated) table = re.search(u'Nytt (.*)', inp) if table is not None: words = table.group(1) make_paradigm(words) print continue # Does x inflect as y? show_a = re.search('\?(.*),(.*)', inp) show_b = re.search('(.*),(.*)\?', inp) show_c = re.search(u'Böj\s(.*),(.*)\s*\?', inp) show = show_a or show_b or show_c if show is not None: word1 = show.group(1).strip() word2 = show.group(2).strip() print u'Böjs %s som %s?' 
% (word1, word2) same = False for line in parafile: re1 = make_wordregexp(word1) re2 = make_wordregexp(word2) match = re.search(re1 + '.*' + re2, line) if match is not None: same = True break match = re.search(re2 + '.*' + re1, line) if match is not None: same = True break print 'Ja' if same else 'Nej' print continue # Inflect x as y inflectlike = False inflect = re.search(u'Böj (.*), (.*)', inp) splitted = inp.split(',') if inflect is not None: word1 = inflect.group(1) word2 = inflect.group(2) inflectlike = True elif len(splitted) > 1: word1 = splitted[0] word2 = splitted[1] inflectlike = True if inflectlike: print u'Böj %s som %s' % (word1, word2) inflect_like(word1, word2, paradigms) continue # Inflect a known word inflect = re.search(u'Böj (.*)', inp) if inflect is not None: word = inflect.group(1) inflect_one(word, paradigms) continue # Show the table for a known word table = re.search(u'Tabell (.*)', inp) if table is not None: word = table.group(1) for p in paradigms: match = re.search(',,(%s),,' % make_innerwordregexp(word), ',,%s,,' % ',,'.join(p.members)) if match is not None: print match.group(1), ':' for f in p.forms: print unicode(f).replace('::', '\t').encode('utf-8') print print continue # Show stats for word show_word(inp, parafile) continue
def read_data(file):
    """Read blank-line-separated tables from a UTF-8 text file.

    For every non-empty (stripped) line only the text before the first tab
    is kept.  A blank line closes the current table and starts a new one.
    A table still open at end of file is kept as well, so the file does not
    have to end with a blank line.

    Returns a list of tables, each a list of unicode strings.
    """
    data = []
    table = []
    with codecs.open(file, encoding='utf-8') as f:
        for l in f:
            l = l.strip()
            if len(l) > 0:
                table.append(l.split('\t')[0])
            else:
                data.append(table)
                table = []
    # Without this the final table is silently dropped whenever the file
    # lacks a trailing blank line.
    if table:
        data.append(table)
    return data


# Script body: argv[1] is the paradigm file, argv[2] the table file.
ps = paradigm.load_file(sys.argv[1])
ds = read_data(sys.argv[2])
exp = []
total_number_of_words = float(sum([p.count for p in ps]))
for t in ds:
    # t[0] is the first entry of the table; results for it collect in w_result[1].
    w_result = (t[0], [])
    for p in ps:
        bf_match = p.match(t[0], [0])[0]  # baseform match
        if bf_match == None:
            # Retry the same match with constrained=False.
            bf_match = p.match(t[0], [0], constrained=False)[0]
        if bf_match != None:
            for (sc, bs) in bf_match[::-1]:
                tlen = len(t)
def main(argv): options, remainder = getopt.gnu_getopt(argv[1:], 'tk:n:p:dr:', ['tables','kbest','ngram','prior','debug','pprior']) print_tables, kbest, ngramorder, ngramprior, debug, pprior = False, 1, 3, 0.01, False, 1.0 for opt, arg in options: if opt in ('-t', '--tables'): print_tables = True elif opt in ('-k', '--kbest'): kbest = int(arg) elif opt in ('-n', '--ngram'): ngramorder = int(arg) elif opt in ('-p', '--prior'): ngramprior = float(arg) elif opt in ('-d', '--debug'): debug = True elif opt in ('-d', '--debug'): debug = True elif opt in ('-r', '--pprior'): pprior = float(arg) paradigms = paradigm.load_file(sys.argv[1]) # [(occurrence_count, name, paradigm),...,] alphabet = paradigms_to_alphabet(paradigms) numexamples = sum(map(lambda x: x.count, paradigms)) lms = [] # Learn n-gram LM for each variable for pindex, p in enumerate(paradigms): numvars = (len(p.slots) - 1)/2 slotmodels = [] for v in xrange(0, numvars): varinsts = p.slots[v*2+1][1] model = stringngram(varinsts, alphabet = alphabet, order = ngramorder, ngramprior = ngramprior) slotmodels.append(model) lms.append((numvars, slotmodels)) for line in iter(lambda: sys.stdin.readline().decode('utf-8'), ''): words = line.strip().split() if len(words) == 0: continue # Quick filter out most paradigms fittingparadigms = [(pindex, p) for pindex, p in enumerate(paradigms) if all(p.fits_paradigm(w, constrained = False) for w in words)] fittingparadigms = filter(lambda p: eval_multiple_entries(p[1], words), fittingparadigms) if debug: print "Plausible paradigms:" for pnum, p in fittingparadigms: print pnum, p.name analyses = [] # Calculate score for each possible variable assignment for pindex, p in fittingparadigms: prior = math.log(p.count/float(numexamples)) vars = eval_multiple_entries(p, words) # All possible instantiations if len(vars) == 0: # Word matches score = prior analyses.append((score, p, ())) else: for v in vars: score = prior * pprior + len(words) * eval_vars(v, lms[pindex]) #score = 
len(words) * eval_vars(v, lms[pindex]) analyses.append((score, p, v)) analyses.sort(reverse = True, key = lambda x: x[0]) # Print all analyses + optionally a table for aindex, (score, p, v) in enumerate(analyses): if aindex >= kbest: break wordformlist = [] varstring = '(' + ','.join([str(feat) + '=' + val for feat,val in zip(range(1,len(v)+1), v)]) + ')' table = p(*v) # Instantiate table with vars from analysis baseform = table[0][0] matchtable = [(form, msd) for form, msd in table if form in words] wordformlist = [form +':' + baseform + ',' + ','.join([m[0] + '=' + m[1] for m in msd]) for form, msd in matchtable] print (unicode(score) + ' ' + p.name + ' ' + varstring + ' ' + '#'.join(wordformlist)).encode("utf-8") if print_tables: for form, msd in table: if form in words: form = "*" + form + "*" msdprint = ','.join([m[0] + '=' + m[1] for m in msd]) print (form + '\t' + msdprint).encode("utf-8") print