Beispiel #1
0
def main(argv):
    """Print a foma script that builds the requested analyzers.

    ``argv`` is the full argument vector (program name first).  The first
    non-option argument is the paradigm file.  Options:

      -o, --original        include the 'Goriginal' analyzer (built with 1.0)
      -c, --constrained     include the 'Gconstrained' analyzer (built with
                            the -p value)
      -u, --unconstrained   include the 'Gunconstrained' analyzer (built
                            with 0.0)
      -p, --pvalue VALUE    float passed for the constrained analyzer
                            (default 0.05)
      -s, --separate        emit one ``regex`` line per analyzer instead of
                            a single ``.P.`` combination
      -n, --name FILE       file name used in the final ``save stack`` line
                            (default 'analyzer.bin')

    Raises getopt.GetoptError for unknown options, ValueError when the -p
    value is not a float, and IndexError when no paradigm file is given.
    """
    # getopt only lets a long option consume a value when its name ends in
    # '='; without it '--pvalue 0.01' and '--name x' are parsed as flags.
    options, remainder = getopt.gnu_getopt(argv[1:], 'ocup:sn:', [
        'original', 'constrained', 'unconstrained', 'pvalue=', 'separate',
        'name='
    ])

    pv = 0.05
    # Requested analyzers, keyed by the name that is also emitted in the
    # foma script (replaces the former eval() on local variable names).
    requested = {
        'Goriginal': False,
        'Gconstrained': False,
        'Gunconstrained': False,
    }
    separate = False
    stack_name = 'analyzer.bin'
    for opt, arg in options:
        if opt in ('-o', '--original'):
            requested['Goriginal'] = True
        elif opt in ('-c', '--constrained'):
            requested['Gconstrained'] = True
        elif opt in ('-u', '--unconstrained'):
            requested['Gunconstrained'] = True
        elif opt in ('-s', '--separate'):
            separate = True
        elif opt in ('-p', '--pvalue'):
            pv = float(arg)
        elif opt in ('-n', '--name'):
            stack_name = arg

    paradigms = paradigm.load_file(remainder[0])

    analyzers = []
    analyzernames = []
    # The tuple fixes the output order: original, constrained, unconstrained.
    for name, value in (('Goriginal', 1.0), ('Gconstrained', pv),
                        ('Gunconstrained', 0.0)):
        if requested[name]:
            analyzers.append(paradigms_to_foma(paradigms, name, value))
            analyzernames.append(name)

    for analyzer in analyzers:
        print(analyzer.encode('utf-8'))

    if analyzers:
        if separate:
            for name in analyzernames:
                print('regex ' + name + ';')
        else:
            print('regex ' + u' .P. '.join(analyzernames) + ';')
        print('save stack ' + stack_name)
Beispiel #2
0
def build(inpfile, ngramorder, ngramprior):
    """Load paradigms from *inpfile* and train an n-gram model per variable.

    Returns a tuple ``(paradigms, numexamples, lms)``:

      paradigms    whatever ``paradigm.load_file(inpfile)`` returned
      numexamples  sum of the ``count`` attribute over all paradigms
      lms          one ``(numvars, slotmodels)`` pair per paradigm, in the
                   same order, where ``slotmodels`` holds one ``stringngram``
                   model per variable
    """
    paradigms = paradigm.load_file(inpfile)
    alphabet = paradigms_to_alphabet(paradigms)

    numexamples = sum(p.count for p in paradigms)

    lms = []
    # Learn n-gram LM for each variable
    for p in paradigms:
        # Explicit floor division: the result feeds xrange() and must stay
        # an int even under true-division semantics.
        numvars = (len(p.slots) - 1) // 2
        # The variable instances are read from the odd-indexed slots
        # (1, 3, 5, ...), second element of each slot entry.
        slotmodels = [
            stringngram(p.slots[v * 2 + 1][1],
                        alphabet=alphabet,
                        order=ngramorder,
                        ngramprior=ngramprior)
            for v in xrange(numvars)
        ]
        lms.append((numvars, slotmodels))
    return paradigms, numexamples, lms
Beispiel #3
0
 def __init__(self, pfile):
     """Index the paradigms loaded from *pfile* by name, then build the graph."""
     # paradigm name -> (list of '+'-joined forms, paradigm object)
     table = {}
     for para in paradigm.load_file(pfile):
         joined_forms = ['+'.join(entry.form) for entry in para.forms]
         table[para.name] = (joined_forms, para)
     self.ptable = table
     # compatibility adjacency graph
     self.compat_graph = self._build_compat_graph()
Beispiel #4
0
    return [(1+m-len(s))*' '+s for s in ss]

def lalign(ss):
    """Left-align strings by right-padding each to the longest length plus one.

    Raises ValueError when *ss* is empty.
    """
    width = max(len(s) for s in ss) + 1
    return [s.ljust(width) for s in ss]

def extract_form_information(ps, only_lemma=False):
    """Group paradigm form entries by their 'form' value, most shared first.

    For every paradigm in *ps* the entries of ``paradigm_forms()`` are used
    (only the first entry when *only_lemma* is true).  Each entry contributes
    the tuple ``(count, name, entry['w'], slot summary)`` to the set stored
    under ``entry['form']``.

    Returns a list of ``(set size, form, set)`` tuples sorted in descending
    order.
    """
    by_form = defaultdict(set)
    for para in ps:
        shapes = para.paradigm_forms()
        if only_lemma:
            shapes = [shapes[0]]
        # One bracketed group per variable slot, showing at most five of its
        # distinct members.
        slot_summary = ' '.join(
            ['[%s]' % ",".join(list(set(members))[:5])
             for (is_var, members) in para.slots if is_var])
        for shape in shapes:
            by_form[shape['form']].add(
                (para.count, para.name, shape['w'], slot_summary))
    # sort with respect to the ambiguity count
    return sorted([(len(entries), form, entries)
                   for (form, entries) in by_form.items()],
                  reverse=True)

# Report every form shared by more than one entry; the paradigm file is the
# first command-line argument and '-l' restricts the report to the first form.
only_lemma = '-l' in sys.argv
for (n, f, xs) in extract_form_information(paradigm.load_file(sys.argv[1]),
                                           only_lemma):
    if len(xs) <= 1:
        continue  # we only print the ambiguous forms.
    xs = sorted(xs, reverse=True)
    wfs = ralign([x[2] for x in xs])
    lms = lalign(['p_%s %d' % (x[1], x[0]) for x in xs])
    sls = [x[3] for x in xs]
    # Pad the heading according to the widths of the two aligned columns.
    padding = (len(wfs[0]) + len(lms[0]) - len(f) - 3) * ' '
    print((padding + '=> %s <=' % f).encode('utf-8'))
    for t in zip(lms, wfs, sls):
        print(('%s%s  %s' % t).encode('utf-8'))
    print('')
def main(argv):
    """Rank paradigm analyses for each line of words read from stdin.

    ``argv`` is the full argument vector (program name first).  The first
    non-option argument is the paradigm file.  Options:

      -t, --tables     after each analysis also print its whole table, with
                       the forms found in the input wrapped in '*'
      -k, --kbest N    maximum number of analyses printed per input line
                       (default 1)
      -n, --ngram N    ``order`` passed to ``stringngram`` (default 3)
      -p, --prior X    ``ngramprior`` passed to ``stringngram`` (default 0.01)
      -d, --debug      list the paradigms that pass the filter
      -r, --pprior X   factor applied to the log prior in the score
                       (default 1.0)

    Every non-empty input line produces up to N result lines of the form
    ``score name (var=value,...) form:baseform,msd#...`` followed by an empty
    line.  Output is UTF-8 encoded.
    """
    # getopt only lets a long option consume a value when its name ends in
    # '='; without it '--kbest 3' etc. are parsed as flags with an empty arg.
    options, remainder = getopt.gnu_getopt(
        argv[1:], 'tk:n:p:dr:',
        ['tables', 'kbest=', 'ngram=', 'prior=', 'debug', 'pprior='])

    print_tables, debug = False, False
    kbest, ngramorder, ngramprior, pprior = 1, 3, 0.01, 1.0
    for opt, arg in options:
        if opt in ('-t', '--tables'):
            print_tables = True
        elif opt in ('-k', '--kbest'):
            kbest = int(arg)
        elif opt in ('-n', '--ngram'):
            ngramorder = int(arg)
        elif opt in ('-p', '--prior'):
            ngramprior = float(arg)
        elif opt in ('-d', '--debug'):
            debug = True
        elif opt in ('-r', '--pprior'):
            pprior = float(arg)

    # Use the parsed positional argument: sys.argv[1] is an option string
    # whenever options are given before the file name.
    paradigms = paradigm.load_file(remainder[0])
    alphabet = paradigms_to_alphabet(paradigms)

    numexamples = sum(p.count for p in paradigms)

    lms = []
    # Learn n-gram LM for each variable
    for p in paradigms:
        # Floor division keeps numvars an int for xrange().
        numvars = (len(p.slots) - 1) // 2
        slotmodels = [
            stringngram(p.slots[v * 2 + 1][1],
                        alphabet=alphabet,
                        order=ngramorder,
                        ngramprior=ngramprior)
            for v in xrange(numvars)
        ]
        lms.append((numvars, slotmodels))

    def format_msd(msd):
        # Render a sequence of (feature, value) pairs as 'f1=v1,f2=v2'.
        return ','.join([m[0] + '=' + m[1] for m in msd])

    for line in iter(lambda: sys.stdin.readline().decode('utf-8'), ''):
        words = line.strip().split()
        if not words:
            continue

        # Quick filter out most paradigms
        # NOTE(review): eval_multiple_entries is evaluated here and again
        # below for each surviving paradigm; cache it if it is pure.
        fittingparadigms = [
            (pindex, p) for pindex, p in enumerate(paradigms)
            if all(p.fits_paradigm(w, constrained=False) for w in words)
            and eval_multiple_entries(p, words)
        ]

        if debug:
            print("Plausible paradigms:")
            for pnum, p in fittingparadigms:
                print('%s %s' % (pnum, p.name))

        analyses = []
        # Calculate score for each possible variable assignment
        for pindex, p in fittingparadigms:
            prior = math.log(p.count / float(numexamples))
            assignments = eval_multiple_entries(p, words)
            if len(assignments) == 0:
                # Word matches
                analyses.append((prior, p, ()))
            else:
                for v in assignments:
                    score = prior * pprior + len(words) * eval_vars(
                        v, lms[pindex])
                    analyses.append((score, p, v))

        analyses.sort(reverse=True, key=lambda x: x[0])

        # Print all analyses + optionally a table
        # max() keeps a negative -k from selecting a slice from the end.
        for score, p, v in analyses[:max(kbest, 0)]:
            varstring = '(' + ','.join(
                [str(feat) + '=' + val for feat, val in enumerate(v, 1)]) + ')'
            table = p(*v)  # Instantiate table with vars from analysis
            baseform = table[0][0]
            wordformlist = [
                form + ':' + baseform + ',' + format_msd(msd)
                for form, msd in table if form in words
            ]
            summary = (unicode(score) + ' ' + p.name + ' ' + varstring + ' ' +
                       '#'.join(wordformlist))
            print(summary.encode("utf-8"))
            if print_tables:
                for form, msd in table:
                    if form in words:
                        form = "*" + form + "*"
                    print((form + '\t' + format_msd(msd)).encode("utf-8"))

        print('')
Beispiel #6
0
 def __init__(self, pfile):
     """Index the paradigms loaded from *pfile* by name, then build the graph."""
     # paradigm name -> (list of '+'-joined forms, paradigm object)
     table = {}
     for para in paradigm.load_file(pfile):
         joined_forms = ['+'.join(entry.form) for entry in para.forms]
         table[para.name] = (joined_forms, para)
     self.ptable = table
     # compatibility adjacency graph
     self.compat_graph = self._build_compat_graph()
Beispiel #7
0
            [p.paradigm_forms()[0]] if only_lemma else
            p.paradigm_forms(),  # form patterns + form insts
            ' '.join([
                '[%s]' % ",".join(list(set(s))[:5]) for (is_var, s) in p.slots
                if is_var
            ]))  # slots (< 5 members)
        for p in ps
    ]
    result = defaultdict(set)
    for (n1, c, fs, s) in pss:
        for shape in fs:
            result[shape['form']].add((c, n1, shape['w'], s))
    result = [(len(xs), f, xs) for (f, xs) in result.iteritems()
              ]  # sort with respect to the ambiguity count
    result.sort(reverse=True)
    return result


# Report every form shared by more than one entry; the paradigm file is the
# first command-line argument and '-l' restricts the report to the first form.
only_lemma = '-l' in sys.argv
for (n, f, xs) in extract_form_information(paradigm.load_file(sys.argv[1]),
                                           only_lemma):
    if len(xs) <= 1:
        continue  # we only print the ambiguous forms.
    xs = sorted(xs, reverse=True)
    wfs = ralign([x[2] for x in xs])
    lms = lalign(['p_%s %d' % (x[1], x[0]) for x in xs])
    sls = [x[3] for x in xs]
    # Pad the heading according to the widths of the two aligned columns.
    padding = (len(wfs[0]) + len(lms[0]) - len(f) - 3) * ' '
    print((padding + '=> %s <=' % f).encode('utf-8'))
    for t in zip(lms, wfs, sls):
        print(('%s%s  %s' % t).encode('utf-8'))
    print('')
Beispiel #8
0
        ss = [ss]
    return ",".join(['%s=%s' % (n,s) for (n,s) in enumerate(ss,1)])

def read_data(file):
    """Read blank-line-separated tables from a UTF-8 text file.

    Each non-blank line contributes its first tab-separated column to the
    current table; a blank (or whitespace-only) line ends the table.

    Returns a list of tables, each a non-empty list of unicode strings.
    The final table is included even when the file does not end with a
    blank line, and runs of blank lines do not produce empty tables.
    """
    data, table = [], []
    with codecs.open(file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                table.append(line.split('\t')[0])
            elif table:
                # Only close a table that actually has rows; callers index
                # the first row of every table.
                data.append(table)
                table = []
    # Flush the last table when there is no terminating blank line.
    if table:
        data.append(table)
    return data
     
# Paradigms are loaded from the file named by the first command-line argument.
ps = paradigm.load_file(sys.argv[1])
# Tables are read from the file named by the second command-line argument.
ds = read_data(sys.argv[2])

exp = []

# Sum of the `count` attribute over all loaded paradigms, as a float.
total_number_of_words = float(sum([p.count for p in ps]))

for t in ds:
    # Pair the first entry of the table with an (initially empty) list.
    w_result = (t[0],[])
    for p in ps:
        bf_match = p.match(t[0],[0])[0] # baseform match
        # Nothing found with the default settings: retry with
        # constrained=False.
        if bf_match == None:
            bf_match = p.match(t[0],[0],constrained=False)[0] 
        if bf_match != None:
                # Visit the matches in reverse order; each one unpacks into
                # a pair (sc, bs).
                # NOTE(review): the rest of this loop body is not visible
                # in this fragment.
                for (sc,bs) in bf_match[::-1]:
                    tlen = len(t)
Beispiel #9
0
def loop():
    " The main loop "
    parafile = sys.argv[1]
    paradigms = P.load_file(parafile)
    parafile = codecs.open(parafile, encoding='utf8').readlines()
    readline.parse_and_bind('tab: complete')
    while True:
        inp = raw_input("skriv ett ord:").decode('utf8')
        # Show single paradigms
        if inp.strip() == 'Singlar':
            singels = find_singles(parafile)
            print '\n'.join(singels)
            print
            continue

        # Show paradigm statistics
        if inp.strip() == 'Stats':
            print '%s paradigm\n' % len(parafile)
            print 'Ord per paradigm'
            for words, num in sorted(list(count(parafile).items())):
                if words >= 10:
                    words = '%s+' % words
                print words, '\t', num
            print
            continue

        # Show word statistic
        show = re.search('Show\s(.*\S)', inp)
        if show is not None:
            show_word(show.group(1), parafile)
            continue

        # Create a table for a new list of words (comma separated)
        table = re.search(u'Nytt (.*)', inp)
        if table is not None:
            words = table.group(1)
            make_paradigm(words)
            print
            continue

        # Does x inflect as y?
        show_a = re.search('\?(.*),(.*)', inp)
        show_b = re.search('(.*),(.*)\?', inp)
        show_c = re.search(u'Böj\s(.*),(.*)\s*\?', inp)
        show = show_a or show_b or show_c
        if show is not None:
            word1 = show.group(1).strip()
            word2 = show.group(2).strip()
            print u'Böjs %s som %s?' % (word1, word2)
            same = False
            for line in parafile:
                re1 = make_wordregexp(word1)
                re2 = make_wordregexp(word2)
                match = re.search(re1 + '.*' + re2, line)
                if match is not None:
                    same = True
                    break
                match = re.search(re2 + '.*' + re1, line)
                if match is not None:
                    same = True
                    break
            print 'Ja' if same else 'Nej'
            print
            continue

        # Inflect x as y
        inflectlike = False
        inflect = re.search(u'Böj (.*), (.*)', inp)
        splitted = inp.split(',')
        if inflect is not None:
            word1 = inflect.group(1)
            word2 = inflect.group(2)
            inflectlike = True
        elif len(splitted) > 1:
            word1 = splitted[0]
            word2 = splitted[1]
            inflectlike = True
        if inflectlike:
            print u'Böj %s som %s' % (word1, word2)
            inflect_like(word1, word2, paradigms)
            continue

        # Inflect a known word
        inflect = re.search(u'Böj (.*)', inp)
        if inflect is not None:
            word = inflect.group(1)
            inflect_one(word, paradigms)
            continue

        # Show the table for a known word
        table = re.search(u'Tabell (.*)', inp)
        if table is not None:
            word = table.group(1)
            for p in paradigms:
                match = re.search(',,(%s),,' % make_innerwordregexp(word),
                                  ',,%s,,' % ',,'.join(p.members))
                if match is not None:
                    print match.group(1), ':'
                    for f in p.forms:
                        print unicode(f).replace('::', '\t').encode('utf-8')
                    print
            print
            continue

        # Show stats for word
        show_word(inp, parafile)
        continue
Beispiel #10
0

def read_data(file):
    """Read blank-line-separated tables from a UTF-8 text file.

    Each non-blank line contributes its first tab-separated column to the
    current table; a blank (or whitespace-only) line ends the table.

    Returns a list of tables, each a non-empty list of unicode strings.
    The final table is included even when the file does not end with a
    blank line, and runs of blank lines do not produce empty tables.
    """
    data, table = [], []
    with codecs.open(file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                table.append(line.split('\t')[0])
            elif table:
                # Only close a table that actually has rows; callers index
                # the first row of every table.
                data.append(table)
                table = []
    # Flush the last table when there is no terminating blank line.
    if table:
        data.append(table)
    return data


# Paradigms are loaded from the file named by the first command-line argument.
ps = paradigm.load_file(sys.argv[1])
# Tables are read from the file named by the second command-line argument.
ds = read_data(sys.argv[2])

exp = []

# Sum of the `count` attribute over all loaded paradigms, as a float.
total_number_of_words = float(sum([p.count for p in ps]))

for t in ds:
    # Pair the first entry of the table with an (initially empty) list.
    w_result = (t[0], [])
    for p in ps:
        bf_match = p.match(t[0], [0])[0]  # baseform match
        # Nothing found with the default settings: retry with
        # constrained=False.
        if bf_match == None:
            bf_match = p.match(t[0], [0], constrained=False)[0]
        if bf_match != None:
            # Visit the matches in reverse order; each one unpacks into a
            # pair (sc, bs).
            # NOTE(review): the rest of this loop body is not visible in
            # this fragment.
            for (sc, bs) in bf_match[::-1]:
                tlen = len(t)
Beispiel #11
0
def main(argv):
    """Rank paradigm analyses for each line of words read from stdin.

    ``argv`` is the full argument vector (program name first).  The first
    non-option argument is the paradigm file.  Options:

      -t, --tables     after each analysis also print its whole table, with
                       the forms found in the input wrapped in '*'
      -k, --kbest N    maximum number of analyses printed per input line
                       (default 1)
      -n, --ngram N    ``order`` passed to ``stringngram`` (default 3)
      -p, --prior X    ``ngramprior`` passed to ``stringngram`` (default 0.01)
      -d, --debug      list the paradigms that pass the filter
      -r, --pprior X   factor applied to the log prior in the score
                       (default 1.0)

    Every non-empty input line produces up to N result lines of the form
    ``score name (var=value,...) form:baseform,msd#...`` followed by an empty
    line.  Output is UTF-8 encoded.
    """
    # getopt only lets a long option consume a value when its name ends in
    # '='; without it '--kbest 3' etc. are parsed as flags with an empty arg.
    options, remainder = getopt.gnu_getopt(
        argv[1:], 'tk:n:p:dr:',
        ['tables', 'kbest=', 'ngram=', 'prior=', 'debug', 'pprior='])

    print_tables, debug = False, False
    kbest, ngramorder, ngramprior, pprior = 1, 3, 0.01, 1.0
    for opt, arg in options:
        if opt in ('-t', '--tables'):
            print_tables = True
        elif opt in ('-k', '--kbest'):
            kbest = int(arg)
        elif opt in ('-n', '--ngram'):
            ngramorder = int(arg)
        elif opt in ('-p', '--prior'):
            ngramprior = float(arg)
        elif opt in ('-d', '--debug'):
            debug = True
        elif opt in ('-r', '--pprior'):
            pprior = float(arg)

    # Use the parsed positional argument: sys.argv[1] is an option string
    # whenever options are given before the file name.
    paradigms = paradigm.load_file(remainder[0])
    alphabet = paradigms_to_alphabet(paradigms)

    numexamples = sum(p.count for p in paradigms)

    lms = []
    # Learn n-gram LM for each variable
    for p in paradigms:
        # Floor division keeps numvars an int for xrange().
        numvars = (len(p.slots) - 1) // 2
        slotmodels = [
            stringngram(p.slots[v * 2 + 1][1],
                        alphabet=alphabet,
                        order=ngramorder,
                        ngramprior=ngramprior)
            for v in xrange(numvars)
        ]
        lms.append((numvars, slotmodels))

    def format_msd(msd):
        # Render a sequence of (feature, value) pairs as 'f1=v1,f2=v2'.
        return ','.join([m[0] + '=' + m[1] for m in msd])

    for line in iter(lambda: sys.stdin.readline().decode('utf-8'), ''):
        words = line.strip().split()
        if not words:
            continue

        # Quick filter out most paradigms
        # NOTE(review): eval_multiple_entries is evaluated here and again
        # below for each surviving paradigm; cache it if it is pure.
        fittingparadigms = [
            (pindex, p) for pindex, p in enumerate(paradigms)
            if all(p.fits_paradigm(w, constrained=False) for w in words)
            and eval_multiple_entries(p, words)
        ]

        if debug:
            print("Plausible paradigms:")
            for pnum, p in fittingparadigms:
                print('%s %s' % (pnum, p.name))

        analyses = []
        # Calculate score for each possible variable assignment
        for pindex, p in fittingparadigms:
            prior = math.log(p.count / float(numexamples))
            assignments = eval_multiple_entries(p, words)
            if len(assignments) == 0:
                # Word matches
                analyses.append((prior, p, ()))
            else:
                for v in assignments:
                    score = prior * pprior + len(words) * eval_vars(
                        v, lms[pindex])
                    analyses.append((score, p, v))

        analyses.sort(reverse=True, key=lambda x: x[0])

        # Print all analyses + optionally a table
        # max() keeps a negative -k from selecting a slice from the end.
        for score, p, v in analyses[:max(kbest, 0)]:
            varstring = '(' + ','.join(
                [str(feat) + '=' + val for feat, val in enumerate(v, 1)]) + ')'
            table = p(*v)  # Instantiate table with vars from analysis
            baseform = table[0][0]
            wordformlist = [
                form + ':' + baseform + ',' + format_msd(msd)
                for form, msd in table if form in words
            ]
            summary = (unicode(score) + ' ' + p.name + ' ' + varstring + ' ' +
                       '#'.join(wordformlist))
            print(summary.encode("utf-8"))
            if print_tables:
                for form, msd in table:
                    if form in words:
                        form = "*" + form + "*"
                    print((form + '\t' + format_msd(msd)).encode("utf-8"))

        print('')