def buildpreProcessFST_delemmatize(self):
    """Build the delemmatizer pre-processing FST from the ``FST_pre_*`` files.

    Every entry of the current working directory whose name starts with
    ``FST_pre_`` is read as an FST text description, compiled over the
    ``fststr.EN_SYMB`` symbol table, and unioned into a seed FST compiled
    from the single line ``1``.

    Returns:
        The FST obtained by unioning the seed with every compiled file.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        # Input and output sides share the same symbol table.
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    seed_compiler = new_compiler()
    print('1\n', file=seed_compiler)
    combined = seed_compiler.compile()
    fststr.expand_other_symbols(combined)

    pre_files = [
        name for name in os.listdir('.') if name.startswith("FST_pre_")
    ]

    # Compile each description file and fold it into the running union.
    for path in pre_files:
        # Read inside a context manager so the handle is closed promptly
        # instead of being left to the garbage collector.
        with open(path) as handle:
            description = handle.read()
        compiler = new_compiler()
        print(description, file=compiler)
        pre_fst = compiler.compile()
        fststr.expand_other_symbols(pre_fst)
        combined = combined.union(pre_fst)

    return combined
def consonant_doubling():
    """Compile the consonant-doubling rule FST.

    The transducer is written arc by arc in FST text format and compiled
    over the ``fststr.EN_SYMB`` symbol table.  Once a vowel has been followed
    by a consonant, a ``<^>`` on the input side is rewritten to a second copy
    of that consonant, provided the text that follows is ``ed`` or ``ing``
    and then ``<#>``.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    consonants = list('bcdfghjklmnpqrstvwxyz')
    vowels = list('aeiou')

    # State layout: 0 is the start (and a final) state, 1 is entered on a
    # consonant, 2 on a vowel.  States 3 .. len(consonants) + 2 each remember
    # one consonant read from state 2; the rest spell out "ed" and "ing".
    first_memory = 3
    ed_e = len(consonants) + 3
    ed_d = len(consonants) + 4
    suffix_done = len(consonants) + 5
    ing_i = len(consonants) + 6
    ing_n = len(consonants) + 7
    ing_g = len(consonants) + 8

    rows = ['0\n', '0 0 <other> <other>\n', '0 0 <#> <#>\n']
    rows.extend('0 2 {0} {0}\n'.format(v) for v in vowels)
    for letter in consonants:
        rows.append('0 1 {0} {0}\n'.format(letter))
        rows.append('1 1 {0} {0}\n'.format(letter))
    rows.extend('1 2 {0} {0}\n'.format(v) for v in vowels)
    rows.append('2 2 i i\n')
    rows.append('2 2 u u\n')

    for offset, doubled in enumerate(consonants):
        memory = first_memory + offset
        rows.append('2 {0} {1} {1}\n'.format(memory, doubled))
        # The morpheme boundary becomes the doubled consonant on either the
        # "ed" path or the "ing" path.
        rows.append('{0} {1} <^> {2}\n'.format(memory, ed_e, doubled))
        rows.append('{0} {1} <^> {2}\n'.format(memory, ing_i, doubled))
        for letter in consonants:
            rows.append('{0} 1 {1} {1}\n'.format(memory, letter))
        for v in vowels:
            rows.append('{0} 2 {1} {1}\n'.format(memory, v))

    rows.append('{0} {1} e e\n'.format(ed_e, ed_d))
    rows.append('{0} {1} d d\n'.format(ed_d, suffix_done))
    rows.append('{0} 0 <#> <#>\n'.format(suffix_done))
    rows.append('{0} {1} i i\n'.format(ing_i, ing_n))
    rows.append('{0} {1} n n\n'.format(ing_n, ing_g))
    rows.append('{0} {1} g g\n'.format(ing_g, suffix_done))

    for row in rows:
        compiler.write(row)

    rule = compiler.compile()
    fststr.expand_other_symbols(rule)
    return rule
def buildpreProcessFST(self, curr_str):
    """Compile a linear FST spelling out *curr_str* up to its first tag.

    Characters are copied one arc at a time until the first ``+`` or ``<``.
    From that position the remainder of the string is emitted as a single
    output symbol on an epsilon-input arc.  An epsilon-input arc that emits
    ``<#>`` is always appended, ending in the final state.

    Args:
        curr_str: The string to encode.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    # Position of the first tag character, or the string length if none.
    cut = next(
        (pos for pos, ch in enumerate(curr_str) if ch in ('+', '<')),
        len(curr_str),
    )

    rows = ['0\n']
    rows.extend(
        '{} {} {} {}\n'.format(pos, pos + 1, ch, ch)
        for pos, ch in enumerate(curr_str[:cut])
    )
    if cut < len(curr_str):
        rows.append('{} {} <epsilon> {}\n'.format(cut, cut + 1,
                                                  curr_str[cut:]))
    # NOTE(review): when a tag was found, this arc leaves the same state as
    # the tag arc above, so the two are parallel alternatives -- confirm that
    # is intended.
    rows.append('{} {} <epsilon> <#>\n{}\n'.format(cut, cut + 1, cut + 1))
    description = ''.join(rows)

    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    print(description, file=compiler)
    pre_fst = compiler.compile()
    fststr.expand_other_symbols(pre_fst)
    return pre_fst
def ch_sh_e_insertion():
    """Compile the FST described in ``ch_sh_e_insertion.txt``.

    The file is read from the current working directory and compiled over
    the ``fststr.EN_SYMB`` symbol table.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Context manager guarantees the description file is closed.
    with open('ch_sh_e_insertion.txt') as handle:
        fst_file = handle.read()
    print(fst_file, file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def y_replacement():
    """Compile the FST described in ``y_replacement.txt``.

    The file is read from the current working directory and compiled over
    the ``fststr.EN_SYMB`` symbol table.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Context manager guarantees the description file is closed.
    with open('y_replacement.txt') as handle:
        fst_file = handle.read()
    print(fst_file, file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def del_sharp():
    """Compile an FST that copies its input and rewrites ``<#>`` to epsilon.

    State 0 loops on ``<other>``; reading ``<#>`` emits epsilon and moves to
    state 1.  Both states are declared final.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    description = (
        '0\n',
        '0 0 <other> <other>\n',
        '0 1 <#> <epsilon>\n',
        '1\n',
    )
    for row in description:
        compiler.write(row)

    stripper = compiler.compile()
    fststr.expand_other_symbols(stripper)
    return stripper
def get_compiler_from_file_name(self, file_name):
    """Compile the FST text description stored in *file_name*.

    Despite the name, the return value is the compiled FST, not the
    compiler object.

    Args:
        file_name: Path of the FST text description to read.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)
    # Read under a context manager so the file is closed even when
    # compilation or symbol expansion raises.
    with open(file_name) as in_file:
        fst_file = in_file.read()
    print(fst_file, file=compiler)
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def buildInVocabFST(self):
    """Build the in-vocabulary FST from ``in_vocab_dictionary_verbs.txt``.

    Each non-blank line of the dictionary is split on commas; field 0 is
    used as the lemma and field 1 as the surface form.  For every line a
    linear FST is compiled that reads the surface form followed by ``<#>``
    and writes the lemma followed by ``+Known``.  All per-word FSTs are
    unioned into a seed FST compiled from the single line ``0``.

    The shorter of the two words is padded with ``<epsilon>`` so that no
    letter of either word is lost.

    Returns:
        The union of the seed FST and every per-word FST.

    Raises:
        IndexError: If a non-blank line has fewer than two comma-separated
            fields.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    seed_compiler = new_compiler()
    print('0\n', file=seed_compiler)
    combined = seed_compiler.compile()
    fststr.expand_other_symbols(combined)

    # Read the whole dictionary up front and close the file immediately.
    with open('in_vocab_dictionary_verbs.txt', 'r') as dict_file:
        dict_lines = dict_file.readlines()

    for raw_line in dict_lines:
        line = raw_line.strip().rstrip(',')
        if not line:
            # Blank lines carry no entry.
            continue
        fields = line.split(',')
        lemma, surface = fields[0], fields[1]

        # Pad on whichever side is shorter; padding only the lemma side
        # would silently truncate lemmas longer than their surface form.
        length = max(len(lemma), len(surface))
        rows = []
        for i in range(length):
            in_sym = surface[i] if i < len(surface) else '<epsilon>'
            out_sym = lemma[i] if i < len(lemma) else '<epsilon>'
            rows.append('{} {} {} {}\n'.format(i, i + 1, in_sym, out_sym))
        rows.append('{} {} <#> <epsilon>\n'.format(length, length + 1))
        rows.append('{} {} <epsilon> +Known\n{}\n'.format(
            length + 1, length + 2, length + 2))

        compiler = new_compiler()
        print(''.join(rows), file=compiler)
        word_fst = compiler.compile()
        fststr.expand_other_symbols(word_fst)
        combined = combined.union(word_fst)

    return combined
def general():
    """Compile an FST that copies its input while deleting ``<^>``.

    State 0 is final and loops on ``<other>`` and ``<#>``.  Reading ``<^>``
    emits epsilon and moves to state 1, from which ``<other>`` or ``<#>``
    is copied on the way back to state 0.

    Returns:
        The compiled FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    description = (
        '0 0 <other> <other>\n',
        '0 0 <#> <#>\n',
        '0\n',
        '0 1 <^> <epsilon>\n',
        '1 0 <other> <other>\n',
        '1 0 <#> <#>\n',
    )
    for row in description:
        compiler.write(row)

    rule = compiler.compile()
    fststr.expand_other_symbols(rule)
    return rule
def get_morphotactics():
    """Compile the union of the suffix-attaching FSTs.

    One FST is built per ending in ``''``, ``s``, ``ed``, ``en``, ``ing``.
    Each copies ``<other>`` symbols, rewrites ``+Guess`` to ``<^>``, then
    emits the ending letter by letter on epsilon-input arcs, followed by
    ``<#>``.  The per-ending FSTs are unioned into an initially empty FST.

    Returns:
        The unioned FST with its ``<other>`` arcs expanded.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    # Start from the FST produced by a compiler that was given no text.
    combined = new_compiler().compile()

    for ending in ('', 's', 'ed', 'en', 'ing'):
        compiler = new_compiler()
        compiler.write('0 0 <other> <other>\n')
        compiler.write('0 1 +Guess <^>\n')
        # State 1 follows the boundary; each letter advances one state.
        for state, letter in enumerate(ending, start=1):
            compiler.write(
                '{} {} <epsilon> {}\n'.format(state, state + 1, letter))
        tail = len(ending) + 1
        compiler.write('{} {} <epsilon> <#>\n'.format(tail, tail + 1))
        compiler.write(str(tail + 1))
        combined = combined.union(compiler.compile())

    fststr.expand_other_symbols(combined)
    return combined
def buildAllomFST(self):
    """Build the allomorph FST from the ``FST_allom_*`` files.

    Every entry of the current working directory whose name starts with
    ``FST_allom_`` is read as an FST text description, compiled over the
    ``fststr.EN_SYMB`` symbol table, and unioned into a seed FST compiled
    from the single line ``0``.

    Returns:
        The FST obtained by unioning the seed with every compiled file.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    seed_compiler = new_compiler()
    print('0\n', file=seed_compiler)
    combined = seed_compiler.compile()
    fststr.expand_other_symbols(combined)

    allom_files = [
        name for name in os.listdir('.') if name.startswith("FST_allom_")
    ]

    # Compile each description file and fold it into the running union.
    for path in allom_files:
        # Read inside a context manager so the handle is closed promptly.
        with open(path) as handle:
            description = handle.read()
        compiler = new_compiler()
        print(description, file=compiler)
        allom_fst = compiler.compile()
        fststr.expand_other_symbols(allom_fst)
        combined = combined.union(allom_fst)

    return combined
def buildMorphFST(self):
    """Build the morphotactics FST from the ``FST_morph_*`` files.

    Every entry of the current working directory whose name starts with
    ``FST_morph_`` is read as an FST text description, compiled over the
    ``fststr.EN_SYMB`` symbol table, and unioned into a seed FST compiled
    from the single line ``0``.

    Returns:
        The FST obtained by unioning the seed with every compiled file.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    seed_compiler = new_compiler()
    print('0\n', file=seed_compiler)
    combined = seed_compiler.compile()
    fststr.expand_other_symbols(combined)

    morph_files = [
        name for name in os.listdir('.') if name.startswith("FST_morph_")
    ]

    # Compile each description file and fold it into the running union.
    for path in morph_files:
        # Read inside a context manager so the handle is closed promptly.
        with open(path) as handle:
            description = handle.read()
        compiler = new_compiler()
        print(description, file=compiler)
        morph_fst = compiler.compile()
        fststr.expand_other_symbols(morph_fst)
        combined = combined.union(morph_fst)

    return combined
def __init__(self):
    """Assemble the overall FST and store it on ``self.fstOverall``.

    The pipeline is:

    1. A preprocessing FST that copies ``<other>`` symbols and can emit
       ``<#>`` on an epsilon input.
    2. The in-vocabulary FST from ``getInVocabFST`` (which reads
       ``self.inVocabFile``).
    3. An out-of-vocabulary FST: ``getMorphoFST`` composed with the union of
       ``getAlloFST`` and a post-processing FST, composed once more with
       that post-processing FST.

    ``self.fstOverall`` is the preprocessing FST composed with the union of
    the in-vocabulary and out-of-vocabulary FSTs.

    The dictionary file is open only while ``getInVocabFST`` runs;
    afterwards ``self.inVocabFile`` refers to the closed file object.
    """
    self.st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    # Preprocessing FST.
    compiler = fst.Compiler(isymbols=self.st, osymbols=self.st,
                            keep_isymbols=True, keep_osymbols=True)
    compiler.write('0 \n 0 0 <other> <other> \n0 0 <epsilon> <#>')
    preprocessFST = compiler.compile()
    fststr.expand_other_symbols(preprocessFST)

    # Sub-FSTs.  The dictionary handle is released as soon as the
    # in-vocabulary FST has consumed it rather than staying open for the
    # lifetime of the object.
    with open("in_vocab_dictionary_verbs.txt") as vocab_file:
        self.inVocabFile = vocab_file
        inVocabFST = self.getInVocabFST()
    morphoFST = self.getMorphoFST()
    alloFST = self.getAlloFST()

    # Out-of-vocabulary post-processing FST.
    compiler = fst.Compiler(isymbols=self.st, osymbols=self.st,
                            keep_isymbols=True, keep_osymbols=True)
    compiler.write(
        '\n0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n0 2 <#> +Guess \n1 1 <other> <epsilon> \n1 2 <#> +Guess \n2'
    )
    oovPostFST = compiler.compile()
    fststr.expand_other_symbols(oovPostFST)

    temp = alloFST.union(oovPostFST)
    oovFST = fst.compose(morphoFST.arcsort(sort_type="olabel"),
                         temp.arcsort(sort_type="ilabel"))
    oovFST = fst.compose(oovFST.arcsort(sort_type="olabel"),
                         oovPostFST.arcsort(sort_type="ilabel"))

    # Overall FST: preprocessing composed with (in-vocab | out-of-vocab).
    self.fstOverall = fst.compose(
        preprocessFST.arcsort(sort_type="olabel"),
        inVocabFST.union(oovFST).arcsort(sort_type="ilabel"))
def getInVocabFST(self):
    """Build the in-vocabulary FST from the lines of ``self.inVocabFile``.

    Each line is split on commas; field 0 is written on the output side and
    field 1 on the input side, character by character, with the shorter one
    padded with ``<epsilon>``.  A closing arc maps ``<#>`` to ``+Known``.
    Every per-line FST is unioned in place into an initially empty FST.

    Returns:
        The unioned FST with its ``<other>`` arcs expanded.
    """
    def new_compiler():
        return fst.Compiler(isymbols=self.st, osymbols=self.st,
                            keep_isymbols=True, keep_osymbols=True)

    seed_compiler = new_compiler()
    seed_compiler.write('')
    vocab_fst = seed_compiler.compile()

    # Consume the whole file before compiling anything.
    entries = [raw.rstrip('\n') for raw in self.inVocabFile]

    for entry in entries:
        fields = entry.split(',')
        lemma, surface = fields[0], fields[1]

        # Pad both sides to a common width with epsilons.
        width = max(len(lemma), len(surface))
        out_syms = list(lemma) + ['<epsilon>'] * (width - len(lemma))
        in_syms = list(surface) + ['<epsilon>'] * (width - len(surface))

        rows = [
            '\n{} {} {} {}'.format(state, state + 1, in_sym, out_sym)
            for state, (in_sym, out_sym) in enumerate(zip(in_syms, out_syms))
        ]
        rows.append('\n{}'.format(width + 1))
        rows.append('\n{} {} <#> +Known'.format(width, width + 1))

        word_compiler = new_compiler()
        word_compiler.write(''.join(rows))
        word_fst = word_compiler.compile()
        fststr.expand_other_symbols(word_fst)
        # The union result is not reassigned; the call updates vocab_fst.
        vocab_fst.union(word_fst)

    fststr.expand_other_symbols(vocab_fst)
    return vocab_fst
def getMorphoFST(self):
    """Build the morphotactics FST as a union of five suffix FSTs.

    The five descriptions (labelled ed, ing, s, en and as-is after the
    original variable names) are compiled over ``self.st`` and unioned,
    innermost first, into an initially empty FST.

    Returns:
        The unioned FST with its ``<other>`` arcs expanded.
    """
    def new_compiler():
        return fst.Compiler(isymbols=self.st, osymbols=self.st,
                            keep_isymbols=True, keep_osymbols=True)

    def build(description):
        # Compile one description and expand its <other> arcs.
        compiler = new_compiler()
        compiler.write(description)
        machine = compiler.compile()
        fststr.expand_other_symbols(machine)
        return machine

    seed_compiler = new_compiler()
    seed_compiler.write('')
    morpho_fst = seed_compiler.compile()

    descriptions = (
        # ed
        '\n0 0 <other> <other> \n0 1 e <epsilon> \n1 2 d <epsilon> \n1 0 <epsilon> e \n2 4 <#> <^> \n2 3 <epsilon> e \n3 0 <epsilon> d \n4 5 <epsilon> e \n5 6 <epsilon> d \n6 7 <epsilon> <#> \n7 \n7 7 <other> <other>',
        # ing
        '\n 0 0 <other> <other> \n0 1 i <epsilon> \n1 2 n <epsilon> \n1 0 <epsilon> i \n2 3 g <epsilon> \n2 9 <epsilon> i \n3 4 <#> <^> \n3 10 <epsilon> i \n4 5 <epsilon> i \n5 6 <epsilon> n \n6 7 <epsilon> g \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> n \n10 11 <epsilon> n \n11 0 <epsilon> g',
        # s
        '\n 0 0 <other> <other> \n0 1 s <epsilon> \n1 2 <#> <^> \n1 0 <epsilon> s \n2 3 <epsilon> s \n3 4 <epsilon> <#> \n4 \n4 4 <other> <other>',
        # en
        '\n0 0 <other> <other> \n0 1 e <epsilon> \n1 2 n <epsilon> \n1 0 <epsilon> e \n2 4 <#> <^> \n2 3 <epsilon> e \n3 0 <epsilon> n \n4 5 <epsilon> e \n5 6 <epsilon> n \n6 7 <epsilon> <#> \n7 \n7 7 <other> <other>',
        # as-is
        '\n0 1 <#> <^> \n0 0 <other> <other> \n1 2 <epsilon> <#> \n2 \n2 2 <other> <other>',
    )
    machines = [build(text) for text in descriptions]

    # Fold from the right so the unions nest as
    # ed | (ing | (s | (en | as-is))).
    merged = machines[-1]
    for machine in reversed(machines[:-1]):
        merged = machine.union(merged)

    # The union result is not reassigned; the call updates morpho_fst.
    morpho_fst.union(merged)
    fststr.expand_other_symbols(morpho_fst)
    return morpho_fst
from fststr import fststr
import pywrapfst as fst

# Compile the FST described in e-insertion.txt over the English symbol table.
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st, osymbols=st,
                        keep_isymbols=True, keep_osymbols=True)
# Read under a context manager so the description file is closed.
with open('e-insertion.txt') as _handle:
    fst_file = _handle.read()
print(fst_file, file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Apply the FST to a sample input and show the result.
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
def get_in_vocab_fst(self):
    """Build the in-vocabulary FST from ``in_vocab_dictionary_verbs.txt``.

    Each non-blank line is split on commas; field 0 is the lemma and field 1
    the inflected form.  For every pair a linear FST reads the form and
    writes the lemma (the shorter one padded with ``<epsilon>``) followed by
    ``+Known``.  It is composed after an FST that copies its input and
    rewrites ``<#>`` to epsilon, and the result is unioned in place into an
    initially empty FST.

    Returns:
        The unioned in-vocabulary FST.

    Raises:
        IndexError: If a non-blank line has fewer than two comma-separated
            fields.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    in_vocab_fst = new_compiler().compile()

    pairs = []
    with open('in_vocab_dictionary_verbs.txt', 'r') as f:
        for line in f:
            # Strip first: on a two-field line the form would otherwise keep
            # its trailing newline and corrupt the arc text written below.
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            pairs.append((fields[0], fields[1]))

    # The boundary-stripping FST is identical for every word, so it is
    # compiled and sorted once instead of once per dictionary entry.
    compiler = new_compiler()
    compiler.write('0 0 <other> <other>\n')
    compiler.write('0 1 <#> <epsilon>\n')
    compiler.write('1\n')
    strip_fst = compiler.compile()
    fststr.expand_other_symbols(strip_fst)
    strip_fst.arcsort(sort_type="olabel")

    for lemma_word, form_word in pairs:
        length = max(len(lemma_word), len(form_word))
        compiler = new_compiler()
        for i in range(length):
            in_sym = form_word[i] if i < len(form_word) else '<epsilon>'
            out_sym = lemma_word[i] if i < len(lemma_word) else '<epsilon>'
            compiler.write('{} {} {} {}\n'.format(i, i + 1, in_sym, out_sym))
        compiler.write('{} {} <epsilon> +Known\n'.format(length, length + 1))
        compiler.write(str(length + 1))
        word_fst = compiler.compile()
        fststr.expand_other_symbols(word_fst)

        composed = fst.compose(strip_fst,
                               word_fst.arcsort(sort_type="ilabel"))
        # The union result is not reassigned; the call updates in_vocab_fst.
        in_vocab_fst.union(composed)

    return in_vocab_fst
def generateFst(self, data, st):
    """Build the union of per-word FSTs described by *data*.

    *data* holds one ``stem,inflected[,...]`` entry per line.  For each
    entry a looping FST is compiled that reads the inflected form followed
    by ``<#>`` and writes the stem followed by ``+Known``, returning to the
    final state 0.  Processing stops at the first line whose stem is empty.
    Entries whose stem is longer than the inflected form are skipped.

    Args:
        data: Newline-separated dictionary text.
        st: Symbol table used for both sides of every compiled FST.

    Returns:
        The union of all compiled per-word FSTs, or ``None`` when no entry
        was compiled (empty data, or every entry skipped).

    Raises:
        IndexError: If a line with a non-empty stem has no second field.
    """
    # Initialised up front so that returning before any word has been
    # compiled yields None instead of raising UnboundLocalError.
    root_fst = None

    for line in data.split("\n"):
        fields = line.split(",")[:2]
        stem = fields[0]
        if stem == "":
            return root_fst
        inf = fields[1]

        if len(stem) > len(inf):
            # Stems longer than their inflected form are not encoded.
            continue

        rows = ["0", "0 0 <other> <other>"]  # state 0 is start and final
        # Aligned prefix: inflected letter in, stem letter out.
        for i, stem_ch in enumerate(stem):
            rows.append("{} {} {} {}".format(i, i + 1, inf[i], stem_ch))
        # Extra inflected letters are consumed and emit nothing.
        for i in range(len(stem), len(inf)):
            rows.append("{} {} {} <epsilon>".format(i, i + 1, inf[i]))
        # The closing arc must leave the state where the word path ends,
        # which is len(inf) in both the equal-length and the longer case.
        rows.append("{} 0 <#> +Known".format(len(inf)))

        compiler = fst.Compiler(isymbols=st, osymbols=st,
                                keep_isymbols=True, keep_osymbols=True)
        compiler.write("\n".join(rows))
        word_fst = compiler.compile()
        fststr.expand_other_symbols(word_fst)

        root_fst = word_fst if root_fst is None else root_fst.union(word_fst)

    return root_fst
def buildpostProcessFST(self, input_str):
    """Build the post-processing FST for *input_str*.

    Three pieces are combined:

    1. The union of a seed FST (compiled from the single line ``0``) with
       every ``FST_post_*`` description found in the current working
       directory.
    2. A linear FST that copies the characters of *input_str* up to its
       first ``+`` and maps ``<#>`` to ``+Guess``, unioned into (1).
    3. The FST described in ``FST_finalclearance.txt``, composed after the
       union of (1) and (2).

    Args:
        input_str: The string whose leading characters are spelled out in
            the linear FST.

    Returns:
        The composition of the unioned FST with the final-clearance FST.
    """
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)

    def new_compiler():
        return fst.Compiler(isymbols=st, osymbols=st,
                            keep_isymbols=True, keep_osymbols=True)

    def read_text(path):
        # Read a description file and close it immediately.
        with open(path) as handle:
            return handle.read()

    seed_compiler = new_compiler()
    print('0\n', file=seed_compiler)
    combined = seed_compiler.compile()
    fststr.expand_other_symbols(combined)

    post_files = [
        name for name in os.listdir('.') if name.startswith("FST_post_")
    ]

    # Compile each description file and fold it into the running union.
    for path in post_files:
        compiler = new_compiler()
        print(read_text(path), file=compiler)
        post_fst = compiler.compile()
        fststr.expand_other_symbols(post_fst)
        combined = combined.union(post_fst)

    # Linear FST for the case where the input is already the original form.
    rows = []
    state = 0
    for ch in input_str:
        if ch == '+':
            rows.append(
                '{0} {1} <#> +Guess\n{1}\n'.format(state, state + 1))
            state += 1
            break
        rows.append('{0} {1} {2} {2}\n'.format(state, state + 1, ch))
        state += 1
    # Map the trailing <#> to +Guess.
    rows.append('{0} {1} <#> +Guess\n{1}\n'.format(state, state + 1))

    compiler = new_compiler()
    print(''.join(rows), file=compiler)
    original_case_fst = compiler.compile()
    fststr.expand_other_symbols(original_case_fst)
    combined = combined.union(original_case_fst)

    # Final FST: drops any word still ending in <#> and keeps words ending
    # in +Guess or +Known (per the original author's note; the actual
    # behaviour is defined by FST_finalclearance.txt).
    compiler = new_compiler()
    print(read_text('FST_finalclearance.txt'), file=compiler)
    clear_fst = compiler.compile()
    fststr.expand_other_symbols(clear_fst)

    return fst.compose(combined.arcsort(sort_type="olabel"),
                       clear_fst.arcsort(sort_type="ilabel"))
def getAlloFST(self):
    """Build the allomorph-rule FST as a union of hand-written rule FSTs.

    The rule descriptions are labelled after the original variable names:
    y-replacement, k-insertion, e-deletion, e-insertion (``ch``, ``s``/``sh``
    and ``x``/``z`` variants) and consonant doubling.  Each is compiled over
    ``self.st``, its ``<other>`` arcs are expanded, and all of them are
    unioned into an initially empty FST.

    Returns:
        The unioned FST with its ``<other>`` arcs expanded.
    """
    def new_compiler():
        return fst.Compiler(isymbols=self.st, osymbols=self.st,
                            keep_isymbols=True, keep_osymbols=True)

    def build(description):
        # Compile one description and expand its <other> arcs.
        compiler = new_compiler()
        compiler.write(description)
        machine = compiler.compile()
        fststr.expand_other_symbols(machine)
        return machine

    seed_compiler = new_compiler()
    seed_compiler.write('')
    allo_fst = seed_compiler.compile()

    y_repl = '0 \n0 0 <other> <other> \n0 1 i <epsilon> \n1 2 e <epsilon> \n1 12 <^> <epsilon> \n1 0 <epsilon> i \n2 3 <^> <epsilon> \n2 9 <epsilon> i \n3 4 s <epsilon> \n3 10 <epsilon> i \n4 5 <#> y \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n12 19 <epsilon> i \n13 14 d <epsilon> \n13 20 <epsilon> i \n14 15 <#> y \n15 16 <epsilon> <^> \n16 17 <epsilon> e \n17 18 <epsilon> d \n18 8 <epsilon> <^> \n19 0 <epsilon> <^> \n20 21 <epsilon> <^> \n21 0 <epsilon> e'
    k_ins = '\n0 \n0 0 <other> <other> \n0 1 c c \n1 0 <other> <other> \n1 2 k <epsilon> \n2 0 <epsilon> k \n2 3 <^> <epsilon> \n3 4 i <epsilon> \n 3 10 e <epsilon> \n3 14 <epsilon> k \n4 5 n <^> \n5 6 g i \n6 7 <#> n \n7 8 <epsilon> g \n8 9 <epsilon> <#> \n9 \n10 11 d <^> \n11 12 <#> e \n12 8 <epsilon> d \n14 0 <epsilon> <^>'
    e_del = '0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n1 2 i <epsilon> \n1 11 e <epsilon> \n2 3 n <epsilon> \n3 4 g <epsilon> \n4 5 <#> e \n5 6 <epsilon> <^> \n6 7 <epsilon> i \n7 8 <epsilon> n \n8 9 <epsilon> g \n9 10 <epsilon> <#> \n10 \n11 12 d <epsilon> \n11 16 <epsilon> e \n12 13 <#> e \n13 14 <epsilon> <^> \n14 15 <epsilon> e \n15 9 <epsilon> d \n16 0 <epsilon> <^>'
    e_ins_ch = (
        '0 \n0 0 <other> <other> \n0 1 c <epsilon> \n1 2 h <epsilon> \n1 0 <epsilon> c \n2 3 e <epsilon> \n2 11 <epsilon> c \n3 4 <^> <epsilon> '
        '\n3 12 <epsilon> c \n4 5 s <epsilon> \n4 12 <epsilon> c \n5 6 <#> c \n6 7 <epsilon> h \n7 8 <epsilon> <^> \n8 9 <epsilon> s \n9 10 <epsilon> <#> \n10 \n10 10 <other> <other> \n11 0 <epsilon> h '
        '\n12 13 <epsilon> h \n13 0 <epsilon> e \n14 15 <epsilon> h \n15 16 <epsilon> e \n16 0 <epsilon> <^>'
    )
    e_ins_s = '\n0 \n0 0 <other> <other> \n0 1 s <epsilon> \n1 2 e <epsilon> \n1 12 h <epsilon> \n1 0 <epsilon> s \n2 3 <^> <epsilon> \n2 9 <epsilon> s \n3 4 s <epsilon> \n3 10 <epsilon> s \n4 5 <#> s \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n 12 20 <epsilon> s \n13 14 <^> <epsilon> \n13 21 <epsilon> s \n14 15 s <epsilon> \n14 23 <epsilon> s \n15 16 <#> s \n16 17 <epsilon> h \n17 18 <epsilon> <^> \n18 19 <epsilon> s \n19 8 <epsilon> <#> \n20 0 <epsilon> h \n21 22 <epsilon> h \n22 0 <epsilon> e \n23 24 <epsilon> h \n24 25 <epsilon> e \n25 0 <epsilon> <^>'

    # e-insertion after x / z: the same sub-machine is stamped out once per
    # letter, each copy shifted by 11 states (s[k] is base + k).
    xz_template = (
        '\n0 {s[1]} {c} <epsilon> '
        '\n{s[1]} {s[2]} e <epsilon> '
        '\n{s[1]} 0 <epsilon> {c}'
        '\n{s[2]} {s[3]} <^> <epsilon> '
        '\n{s[2]} {s[9]} <epsilon> {c} '
        '\n{s[3]} {s[4]} s <epsilon> '
        '\n{s[4]} {s[5]} <#> {c} '
        '\n{s[3]} {s[10]} <epsilon> {c} '
        '\n{s[5]} {s[6]} <epsilon> <^> '
        '\n{s[6]} {s[7]} <epsilon> s '
        '\n{s[7]} {s[8]} <epsilon> <#> '
        '\n{s[8]} '
        '\n{s[8]} {s[8]} <other> <other> '
        '\n{s[9]} 0 <epsilon> e '
        '\n{s[10]} {s[11]} <epsilon> e '
        '\n{s[11]} 0 <epsilon> <^>'
    )
    e_ins_xz = '\n0 \n0 0 <other> <other> '
    for block, letter in enumerate('xz'):
        states = [11 * block + k for k in range(12)]
        e_ins_xz += xz_template.format(s=states, c=letter)

    fst_ins_ch = build(e_ins_ch)
    fst_ins_s = build(e_ins_s)
    fst_ins_xz = build(e_ins_xz)
    fst_e_ins = fst_ins_ch.union(fst_ins_s.union(fst_ins_xz))

    # Consonant doubling: a shared vowel prefix, then one sub-machine per
    # consonant, each shifted by 8 states (s[k] is base + k).
    cons_doub = (
        '\n0 \n0 1 a a \n0 1 e e \n0 1 i i \n0 1 o o \n0 1 u u \n0 1 y y \n0 0 <other> <other>'
        '\n1 0 a a \n1 0 e e \n1 0 i i \n1 0 o o \n1 0 u u \n1 0 y y'
        '\n2 \n2 2 <other> <other>'
    )
    cons_template = (
        '\n1 {s[3]} {c} {c}'
        '\n{s[3]} {s[4]} {c} <epsilon>'
        '\n{s[3]} 0 <other> <other>'
        '\n{s[4]} {s[5]} <^> <^>'
        '\n{s[5]} {s[6]} i i'
        '\n{s[5]} {s[9]} e e'
        '\n{s[5]} 0 <other> <other>'
        '\n{s[6]} {s[7]} n n'
        '\n{s[7]} {s[8]} g g'
        '\n{s[8]} 2 <#> <#>'
        '\n{s[9]} {s[10]} d d'
        '\n{s[9]} 0 <other> <other>'
        '\n{s[10]} 2 <#> <#>'
    )
    for block, letter in enumerate('bcdfghjklmnpqrstvwxz'):
        states = [8 * block + k for k in range(11)]
        cons_doub += cons_template.format(s=states, c=letter)

    y_repl_fst = build(y_repl)
    k_ins_fst = build(k_ins)
    e_del_fst = build(e_del)
    cons_doub_fst = build(cons_doub)

    # Same nesting as before: y | (k | (e-ins | (e-del | doubling))).
    # The outer union result is not reassigned; the call updates allo_fst.
    allo_fst.union(
        y_repl_fst.union(
            k_ins_fst.union(fst_e_ins.union(e_del_fst.union(cons_doub_fst)))))
    fststr.expand_other_symbols(allo_fst)
    return allo_fst
def lemmatize(self, str):
    """Apply the overall FST to *str*.

    ``<other>`` arcs of ``self.fstOverall`` are expanded before the string
    is applied.

    Args:
        str: The input string handed to ``fststr.apply``.

    Returns:
        Whatever ``fststr.apply`` returns for *str* and ``self.fstOverall``.
    """
    overall = self.fstOverall
    fststr.expand_other_symbols(overall)
    results = fststr.apply(str, overall)
    return results
def delemmatize(self, str):
    """Apply the inverse of the overall FST to *str*.

    ``self.fstOverall`` is inverted, applied, and inverted back so that
    later calls see it in its original direction.

    Args:
        str: The input string handed to ``fststr.apply``.

    Returns:
        Whatever ``fststr.apply`` returns for *str* and the inverted FST.
    """
    overall = self.fstOverall
    fststr.expand_other_symbols(overall)
    overall.invert()
    try:
        return fststr.apply(str, overall)
    finally:
        # Restore the original direction even when apply raises; otherwise
        # every later lemmatize/delemmatize call would run backwards.
        overall.invert()