def test_scope_concat_scope():
    """Check that adjacent parenthesised scopes concatenate and a lone scope matches its contents."""
    doubled = StemGuesser('(abc)(abc)', '', [(None, 0.0)]).fst
    assert accepts(doubled, 'abcabc')
    assert not accepts(doubled, 'ab')

    single = StemGuesser('(abef)', '', [(None, 0.0)]).fst
    assert accepts(single, 'abef')
def test_union_concat_scope():
    """Check patterns that mix a bracketed union with a parenthesised scope."""
    # Each entry pairs a pattern with the strings it is expected to accept.
    expectations = (
        ('[abc](de)', ('cde',)),
        ('[abc](de)[fgh]', ('cdef', 'adeg')),
        ('[abc](ce)[fgh]', ('acef',)),
    )
    for pattern, words in expectations:
        machine = StemGuesser(pattern, '', [(None, 0.0)]).fst
        for word in words:
            assert accepts(machine, word)
def test_sigma_star_symbol_sigma_star():
    """Check patterns with a mandatory element between two starred parts."""
    around_j = StemGuesser('.*j.*', '', [(None, 0.0)], nahuatl_alphabet).fst
    assert not accepts(around_j, '')
    assert not accepts(around_j, 'a')
    assert accepts(around_j, 'j')

    # Both patterns demand at least one symbol, so the empty string must fail.
    for pattern in ('[CV]*[CV][CV]*', '.*..*'):
        at_least_one = StemGuesser(
            pattern, '', [(None, 0.0)], nahuatl_alphabet
        ).fst
        assert not accepts(at_least_one, '')
def test_sigma_star_following_sigma():
    """Check '..*' (one symbol, then any number more) with two different alphabets."""
    tiny_alphabet = {
        'C': ['b', 'c'],
        'V': ['a']
    }
    small = StemGuesser('..*', '', [(None, 0.0)], tiny_alphabet).fst
    assert not accepts(small, '')

    full = StemGuesser('..*', '', [(None, 0.0)], nahuatl_alphabet).fst
    # (input, should it be accepted?) in the order the checks are run.
    cases = (
        ('a', True),
        ('', False),
        ('at', True),
        ('atp', True),
    )
    for text, expected in cases:
        assert accepts(full, text) == expected
def test_closure_no_alphabet():
    """Check 'CV*' without an alphabet: a C followed by any number of V symbols."""
    machine = StemGuesser('CV*', '', [(None, 0.0)]).fst

    for matching in ('C', 'CV', 'CVV', 'CVVV'):
        assert accepts(machine, matching)

    assert not accepts(machine, 'CVC')
def test_closure_of_scope_no_alphabet():
    """Check '(CV)*' without an alphabet: zero or more repetitions of 'CV'."""
    machine = StemGuesser('(CV)*', '', [(None, 0.0)]).fst

    repetitions = ('', 'CV', 'CVCV')
    malformed = ('CVV', 'CCV')

    assert all(accepts(machine, text) for text in repetitions)
    assert not any(accepts(machine, text) for text in malformed)
def test_sigma_star_following():
    """Check 't.*': input must start with 't', followed by anything."""
    machine = StemGuesser('t.*', '', [(None, 0.0)], nahuatl_alphabet).fst

    # (input, should it be accepted?) in the order the checks are run.
    cases = [
        ('t', True),
        ('', False),
        ('ta', True),
        ('tta', True),
        ('at', False),
    ]
    for text, expected in cases:
        assert accepts(machine, text) == expected
def test_sigma_star_preceding():
    """Check '.*t': input must end with 't', preceded by anything."""
    machine = StemGuesser('.*t', '', [(None, 0.0)], nahuatl_alphabet).fst

    # Maps each input to whether it should be accepted; dict order is check order.
    verdicts = {
        't': True,
        '': False,
        'at': True,
        'att': True,
        'ta': False,
    }
    for text, expected in verdicts.items():
        assert accepts(machine, text) == expected
def test_closure_of_scope_preceding_symbol():
    """Check '(CV)*C' without an alphabet: repeated 'CV' followed by a final 'C'."""
    machine = StemGuesser('(CV)*C', '', [(None, 0.0)]).fst

    assert not accepts(machine, 'CCV')
    for matching in ('CVC', 'CVCVC', 'C'):
        assert accepts(machine, matching)
    # The trailing C is mandatory, so the empty string is rejected.
    assert not accepts(machine, '')
def test_symbol_closure():
    """Check 'a*': zero to four 'a' symbols are accepted, 'ab' is not."""
    machine = StemGuesser('a*', '', [(None, 0.0)]).fst

    # '', 'a', 'aa', 'aaa', 'aaaa'
    for count in range(5):
        assert accepts(machine, 'a' * count)

    assert not accepts(machine, 'ab')
def test_sigma_star_even_number():
    """Check that two consecutive sigma-stars ('.*.*') still accept every sampled string."""
    machine = StemGuesser('.*.*', '', [(None, 0.0)], nahuatl_alphabet).fst

    # 'at' appears twice on purpose: the original sequence of checks is kept as-is.
    samples = ('at', '', 'a', 't', 'at', 'atp')
    for text in samples:
        assert accepts(machine, text)
def parser_from_stem(stem):
    """Compile a parser from a stem pattern plus an 'Absolutive' suffix slot.

    Args:
        stem: Pattern handed to StemGuesser as the 'NounStem' starting slot.

    Returns:
        Whatever ``compile`` returns for the set containing the stem guesser
        and the 'Absolutive' slot.
    """
    # NOTE(review): `compile` is presumably the morphotactics compiler imported
    # elsewhere in this file, not the builtin -- confirm.
    noun_stem = StemGuesser(
        stem,
        'NounStem',
        [('Absolutive', 0.0)],
        alphabet=nawat_alphabet,
        start=True,
    )
    absolutive = Slot('Absolutive', [
        ('-t', 't', [(None, 0.0)], 0.0),
        ('-ti', 'ti', [(None, 0.0)], 0.0),
        ('l-li', 'li', [(None, 0.0)], 0.0),  # This case actually has l in the stem
    ])
    return compile({noun_stem, absolutive})
def test_closure_of_union_no_alphabet():
    """Check the two-V ("bimoraic") pattern over the literal symbols C and V."""
    two_v = StemGuesser('[CV]*V[CV]*V[CV]*', '', [(None, 0.0)]).fst

    # bimoraic
    for shape in ('CVVCV', 'VV', 'VVC', 'CVCV', 'CVCVC'):
        assert accepts(two_v, shape)

    # not bimoraic
    for shape in ('CV', 'CC', 'CCV'):
        assert not accepts(two_v, shape)
def test_concat():
    """Check that plain concatenation 'CVCV' matches only that exact string."""
    machine = StemGuesser('CVCV', '', [(None, 0.0)]).fst

    assert accepts(machine, 'CVCV')
    for mismatch in ('CVC', 'CVV'):
        assert not accepts(machine, mismatch)
def test_sigma_star_alone():
    """Check that a bare '.*' accepts the empty string and the sampled strings."""
    machine = StemGuesser('.*', '', [(None, 0.0)], nahuatl_alphabet).fst

    samples = ['', 'a', 'ann', 'nn']
    assert all(accepts(machine, text) for text in samples)
def test_sigma_in_middle():
    """Check 'p.p': exactly one symbol is required between the two 'p's."""
    machine = StemGuesser('p.p', '', [(None, 0.0)], nahuatl_alphabet).fst

    for middle in 'ois':
        # Yields 'pop', 'pip', 'psp' in that order.
        assert accepts(machine, 'p' + middle + 'p')

    assert not accepts(machine, 'pp')
def test_sigma_concatenated():
    """Check '...': 'tap' is accepted, while '', 'ta' and 'main' are rejected."""
    machine = StemGuesser('...', '', [(None, 0.0)], nahuatl_alphabet).fst

    assert accepts(machine, 'tap')
    for rejected in ('', 'ta', 'main'):
        assert not accepts(machine, rejected)
from morphotactics.stem_guesser import StemGuesser
import pynini

# Alphabet passed to StemGuesser: maps the class names 'C' and 'V' to lists of
# symbols (presumably the consonants and vowels of the language).
nahuatl_alphabet = {
  'C': [
    'm', 'n', 'p', 't', 'k', 'kw', 'h', 'ts', 'tl', 'ch', 's', 'l', 'x', 'j', 'w'
  ],
  'V': ['a', 'e', 'i', 'o']
}

# Two spellings of the same "bimoraic" pattern: one with explicit [CV] unions,
# one using sigma ('.') in their place.
bimoraic_fsa = StemGuesser('[CV]*V[CV]*V[CV]*', '', [(None, 0.0)], nahuatl_alphabet).fst
bimoraic_fsa_sigma_form = StemGuesser('.*V.*V.*', '', [(None, 0.0)], nahuatl_alphabet).fst
# note: StemGuesser('.*V.*V.*', '', [(None, 0.0)], nahuatl_alphabet) != StemGuesser('[CV]*V[CV]*V[CV]*', '', [(None, 0.0)], nahuatl_alphabet)
# because of different state numberings during state optimization but they accept the same language still


def accepts(fst, input_str):
  """Return True when composing input_str with fst leaves at least one state.

  Used throughout these tests as the acceptance check: the composition having
  zero states is treated as rejection.
  """
  return pynini.compose(input_str, fst).num_states() != 0


def is_bimoraic(oov_stem):
  """Return whether bimoraic_fsa (the '[CV]' form) accepts oov_stem."""
  return accepts(bimoraic_fsa, oov_stem)


def is_bimoraic_sigma_form(oov_stem):
  """Return whether bimoraic_fsa_sigma_form (the '.' form) accepts oov_stem."""
  return accepts(bimoraic_fsa_sigma_form, oov_stem)


# NOTE(review): only the header of the following definition falls inside this
# span; its body is not visible here.
def test_sigma_concatenated():
def test_union_concat_union():
    """Check that two adjacent unions match exactly two symbols, one from each."""
    pair_of_unions = StemGuesser('[abc][abc]', '', [(None, 0.0)]).fst

    # Six symbols is too long for two single-symbol unions.
    assert not accepts(pair_of_unions, 'abcabc')
    assert accepts(pair_of_unions, 'ab')
# This structure can happen within a sentence too, so even though it doesn't # occur all too often, the best way to deal with it is to always include the # subject prefix that predicates take. Slot('Subject', [ ('n-', 'n', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0), ('ni-', 'ni', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0), ('t-', 't', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0), ('ti-', 'ti', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0), ('0-', '', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 100.0), # the most common case by far ], start=True), Slot('NounStem', [ ('', '', [('NounStemC', 0.0), ('NounStemV', 0.0)], 0.0), ]), StemGuesser('.*C', 'NounStemC', [ ('C-Absolutive', 100.0), (None, 0.0), # This rarer case mostly occurs when ending in -l or -s with more than one mora ('tsin', 100.0), ('Locative', 0.0) ], alphabet=nawat_alphabet), Slot('C-Absolutive', [('-ti', 'ti', [(None, 0.0)], 0.0)]), StemGuesser('.*V', 'NounStemV', [ ('V-Absolutive', 100.0), ('tsin', 100.0), ('Locative', 0.0) ], alphabet=nawat_alphabet), Slot('V-Absolutive', [ ('-t', 't', [(None, 0.0)], 0.0), ('l-li', 'li', [(None, 0.0)], 0.0) # Here, l is actually part of the stem, but easier to do this way ]), Slot('PossessedNoun', [ ('no-', 'no', [('PossessedNounStem', 0.0)], 0.0), ('n-', 'n', [('oPossessedNounStem', 0.0)], 0.0),