def LoadVocabFromFile(pron_dict, limit=None, group_size=5000, transducer_file_pattern=None):
    """Return groups of transducers that accept and output the vocabulary words.

    If transducer_file_pattern is given and matches at least one file, those
    files are loaded with LoadTransducerFromFile and returned unchanged.
    Otherwise the pronunciations in pron_dict are collected, checked against
    the alphabet and unioned into transducers of group_size words each.

    Args:
      pron_dict: mapping from word to a collection of pronunciations; each
        pronunciation is a tuple of letters such as ('d', 'o̯', 'e̯').
      limit: optional soft cap on the vocabulary size. It is checked after
        each dictionary entry, so the result may slightly exceed it.
      group_size: number of pronunciations unioned into a single transducer.
      transducer_file_pattern: optional glob pattern of saved transducers.
        None (the default) skips the file lookup entirely.

    Returns:
      A pair (transducers, loaded_from_files): the list of transducers and
      True when they were read from files, False when they were built here.

    Raises:
      AssertionError: if a pronunciation contains a letter that is not in
        pt.abc.ALL_LETTERS.
    """
    # Load and return saved groups of ar_vocab transducers (if they exist).
    # glob.iglob(None) raises TypeError, so only glob when a pattern is given.
    if transducer_file_pattern is not None:
        filenames = list(glob.iglob(transducer_file_pattern))
        if filenames:
            return [LoadTransducerFromFile(f) for f in filenames], True

    print("Loading the vocab file")
    vocab = set()
    for ipa_pron_set in pron_dict.values():
        vocab.update(ipa_pron_set)  # each ipa_pron is a tuple of letters
        if limit is not None and len(vocab) > limit:
            break

    print("Checking missing letters in Alphabet")
    seen_letters = set()
    for w in vocab:
        seen_letters.update(w)
    missing_letters = seen_letters - pt.abc.ALL_LETTERS
    assert len(missing_letters) == 0, missing_letters

    print("Building transducer")
    print("Vocab size:", len(vocab), "num groups:", math.ceil(len(vocab) / group_size))
    all_transducers = []
    for i, w in enumerate(vocab):
        if i % group_size == 0:
            # Start a new group; one progress dot is printed per group.
            print(".", sep="", end="")
            t = pt.Transducer()
            all_transducers.append(t)
        t.set_union(pt.linear_chain(w))
    print()
    return all_transducers, False
def no_complex_transducer(add_meta_arc=True):
    """Build the *COMPLEX constraint transducer (no consonant clusters).

    State 0 is reached after a vowel or syllable boundary and state 1 after a
    single non-vowel letter. A further non-vowel letter leads to state 2,
    whose only way out is the arc that fires "<<*COMPLEX>>" and returns to
    state 1.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<*COMPLEX>>"] as its extra argument.

    Returns:
      The transducer, with states 0 and 1 final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()

    # Vowels and syllable boundaries both end any pending consonant.
    for resetting_symbols in (pt.abc.VOWELS, pt.abc.SYLLABLE_BOUNDARIES):
        for sym in resetting_symbols:
            for src in (0, 1):
                fst.add_arc(src, 0, sym, sym)

    # Every non-vowel letter moves one state deeper.
    for sym in pt.abc.ALL_LETTERS - pt.abc.VOWELS:
        for src in (0, 1):
            fst.add_arc(src, src + 1, sym, sym)

    rule_name = "<<*COMPLEX>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(2, 1, eps, *violation)

    for state in (0, 1):
        fst[state].final = True
    return fst
def peak_transducer(add_meta_arc=True):
    """Build the PEAK constraint transducer (sonority is bell shaped in a syllable).

    The exact constraint is replaced by an approximation: the "<<PEAK>>"
    constraint fires for every additional peak symbol inside one syllable,
    where the peak symbols are the vowels that are not semivowels.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<PEAK>>"] as its extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()

    nuclei = pt.abc.VOWELS - pt.abc.SEMIVOWELS

    # Non-peak symbols leave the state unchanged (0: no peak yet, 1: one peak).
    for sym in pt.abc.ALL_SYMS - nuclei:
        for state in (0, 1):
            fst.add_arc(state, state, sym, sym)

    # A peak symbol advances 0 -> 1 (first peak) or 1 -> 2 (one peak too many).
    for sym in nuclei:
        for state in (0, 1):
            fst.add_arc(state, state + 1, sym, sym)

    # A syllable boundary resets the peak count.
    for sym in pt.abc.SYLLABLE_BOUNDARIES:
        fst.add_arc(1, 0, sym, sym)
        fst.add_arc(0, 0, sym, sym)

    # More than one peak: pay the violation and continue from state 1.
    rule_name = "<<PEAK>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(2, 1, eps, *violation)

    fst[0].final = True
    return fst
def no_complex_margin_transducer(add_meta_arc=True):
    """Build the *COMPLEX-margin transducer (no consonants around a boundary, e.g. 'c.b').

    State 1 is entered on a non-vowel letter, state 2 on a syllable boundary
    read in state 1, and state 3 on a non-vowel letter read in state 2.
    State 3 can only be left through the arc that fires
    "<<*COMPLEX-margin>>" and returns to state 1.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<*COMPLEX-margin>>"] as its extra argument.

    Returns:
      The transducer, with states 0, 1 and 2 final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()

    # (source, target) pairs, listed per symbol class.
    vowel_moves = ((0, 0), (1, 0), (2, 0))
    boundary_moves = ((0, 0), (1, 2), (2, 0))
    consonant_moves = ((0, 1), (1, 1), (2, 3))

    for sym in pt.abc.VOWELS:
        for src, dst in vowel_moves:
            fst.add_arc(src, dst, sym, sym)
    for sym in pt.abc.SYLLABLE_BOUNDARIES:
        for src, dst in boundary_moves:
            fst.add_arc(src, dst, sym, sym)
    for sym in pt.abc.ALL_LETTERS - pt.abc.VOWELS:
        for src, dst in consonant_moves:
            fst.add_arc(src, dst, sym, sym)

    rule_name = "<<*COMPLEX-margin>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(3, 1, eps, *violation)

    for state in (0, 1, 2):
        fst[state].final = True
    return fst
def onset_transducer(add_meta_arc=True):
    """Build the ONSET constraint transducer (syllables start with a consonant).

    States 0 and 3 are syllable-initial positions (word start and just after
    a syllable boundary). A consonant there moves to state 1, the syllable
    body. A vowel that is not a semivowel moves to state 2 instead, from
    which the "<<ONSET>>" arc leads to state 1.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<ONSET>>"] as its extra argument.

    Returns:
      The transducer, with states 1 and 3 final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()
    syllable_starts = (0, 3)

    for sym in pt.abc.CONSONANTS:
        for src in syllable_starts:
            fst.add_arc(src, 1, sym, sym)
    for sym in pt.abc.VOWELS - pt.abc.SEMIVOWELS:
        for src in syllable_starts:
            fst.add_arc(src, 2, sym, sym)

    # Inside a syllable any letter is fine; a boundary starts the next one.
    for sym in pt.abc.ALL_LETTERS:
        fst.add_arc(1, 1, sym, sym)
    for sym in pt.abc.SYLLABLE_BOUNDARIES:
        fst.add_arc(1, 3, sym, sym)

    rule_name = "<<ONSET>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(2, 1, eps, *violation)

    for state in (1, 3):
        fst[state].final = True
    return fst
def final_vowel_substitution_transducer(add_meta_arc=True):
    """Optionally substitute final vowels.

    State 0 copies every symbol and may move to the final state 1 on any
    symbol. In addition, each (s_ar, s_sw) pair of
    pt.abc.AR_SW_FINAL_VOWELS gets its own chain out of state 0 that reads
    s_ar and writes s_sw letter by letter (the shorter side is padded with
    EPSILON) and then reaches state 1 through an arc that fires
    "<<RO_MORPH>>". State 1 still accepts the two syllable-dot symbols.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<RO_MORPH>>"] as its extra argument.

    Returns:
      The transducer; only state 1 is final.
    """
    eps = pt.abc.EPSILON
    rule_name = "<<RO_MORPH>>"
    fst = pt.Transducer()

    for sym in pt.abc.ALL_SYMS:
        fst.add_arc(0, 0, sym, sym)
        fst.add_arc(0, 1, sym, sym)

    last_node = 1  # highest state number handed out so far
    for s_ar, s_sw in pt.abc.AR_SW_FINAL_VOWELS:
        tail = 0
        for l_ar, l_sw in itertools.zip_longest(s_ar, s_sw, fillvalue=eps):
            last_node += 1
            fst.add_arc(tail, last_node, l_ar, l_sw)
            tail = last_node

        if add_meta_arc:
            violation = (rule_name,)
        else:
            violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
        fst.add_arc(last_node, 1, eps, *violation)
        # One state number is skipped between consecutive chains.
        last_node += 1

    for dot in (pt.abc.CONSONANT_DOT, pt.abc.VOWEL_DOT):
        fst.add_arc(1, 1, dot, dot)

    fst[1].final = True
    return fst
def ssp_transducer(add_meta_arc=True):
    """Build the SSP constraint transducer.

    Complex onsets rise in sonority toward the nucleus and complex codas fall
    in sonority. The states form a ladder: one step per entry of
    pt.abc.SONORITY_LIST in order, then one step per entry in reverse
    order. Each step can be skipped with an epsilon arc, or taken by reading
    a letter of that entry, which may then repeat. From the top of the ladder
    a syllable dot returns to state 0 for free, and an epsilon arc returns to
    state 0 while firing "<<SSP>>".

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<SSP>>"] as its extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()

    # Going up in sonority, then back down, within one syllable.
    rising = list(pt.abc.SONORITY_LIST)
    ladder = rising + rising[::-1]
    for state, sonority_set in enumerate(ladder):
        fst.add_arc(state, state + 1, eps, eps)
        for letter in sonority_set:
            fst.add_arc(state, state + 1, letter, letter)
            fst.add_arc(state + 1, state + 1, letter, letter)
    top = len(ladder)

    for dot in (pt.abc.CONSONANT_DOT, pt.abc.VOWEL_DOT):
        fst.add_arc(top, 0, dot, dot)

    # If the syllable letters are not in a bell shape -- violation.
    rule_name = "<<SSP>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(top, 0, eps, *violation)

    fst[0].final = True
    return fst
def strip_transducer(morpheme_set, operation_weight, add_meta_arc, rule_name):
    """Build a transducer that removes strings (used for AR suffixes and prefixes).

    Args:
      morpheme_set: the morphemes to strip; each becomes the input side of a
        linear chain.
      operation_weight: passed to pt.linear_chain as its third argument.
      add_meta_arc: if true each chain outputs rule_name, otherwise it
        outputs nothing.
      rule_name: the marker written when add_meta_arc is true.

    Returns:
      The union of all chains; state 0 is final, so the empty string is
      accepted as well.
    """
    fst = pt.Transducer()
    fst[0].final = True
    for morpheme in morpheme_set:
        # A fresh output list is built for every morpheme.
        output = [rule_name] if add_meta_arc else []
        fst.set_union(pt.linear_chain(morpheme, output, operation_weight))
    return fst
def unsyllabification_transducer(add_meta_arc=True):
    """Remove the CONSONANT_DOT and VOWEL_DOT symbols from the output.

    A single-state transducer: syllable boundaries are read and replaced by
    EPSILON, every other symbol is copied unchanged.

    Args:
      add_meta_arc: accepted for a uniform signature; not used here.

    Returns:
      The transducer. The final value of state 0 is set to
      pt.abc.OT_CONSTRAINTS["<<BIAS>>"] rather than True.
    """
    fst = pt.Transducer()
    boundaries = pt.abc.SYLLABLE_BOUNDARIES
    for sym in pt.abc.ALL_SYMS:
        output = pt.abc.EPSILON if sym in boundaries else sym
        fst.add_arc(0, 0, sym, output)
    fst[0].final = pt.abc.OT_CONSTRAINTS["<<BIAS>>"]
    return fst
def min_consonant_count_transducer(min_consonant_count=3, add_meta_arc=True):
    """Accept only strings with at least min_consonant_count consonants.

    State k means "k consonants counted so far". Every state copies any
    symbol onto itself, and a consonant may additionally advance from state
    k - 1 to state k. Only the last state is final.

    Args:
      min_consonant_count: number of consonants a string must contain.
      add_meta_arc: if true, pt.AddPassThroughArcs is applied to the result.

    Returns:
      The transducer.
    """
    fst = pt.Transducer()
    for count in range(min_consonant_count + 1):
        for sym in pt.abc.ALL_SYMS:
            fst.add_arc(count, count, sym, sym)
        if count == 0:
            continue  # no earlier state to advance from
        for consonant in pt.abc.CONSONANTS:
            fst.add_arc(count - 1, count, consonant, consonant)

    fst[min_consonant_count].final = True
    if add_meta_arc:
        pt.AddPassThroughArcs(fst)
    return fst
def append_transducer(morpheme_set, operation_weight, add_meta_arc, rule_name, add_closure=False):
    """Build a transducer that appends strings (used for SW suffixes and prefixes).

    Args:
      morpheme_set: the morphemes to append; each becomes the output side of
        a linear chain whose input side is empty.
      operation_weight: passed to pt.linear_chain as its third argument.
      add_meta_arc: if true rule_name is added after the morpheme's symbols
        on the output side.
      rule_name: the marker written when add_meta_arc is true.
      add_closure: if true, set_closure() is applied to the union.

    Returns:
      The union of all chains; state 0 is final, so appending nothing is
      allowed as well.
    """
    fst = pt.Transducer()
    fst[0].final = True
    for morpheme in morpheme_set:
        output = [*morpheme, rule_name] if add_meta_arc else morpheme
        fst.set_union(pt.linear_chain("", output, operation_weight))
    if add_closure:
        fst.set_closure()
    return fst
def ApplyLoanwords(self, ar_vocab_groups, loanwords_transducer, sw_pre_transducer, add_meta_arc, with_syllabification):
    """Combine the vocabulary and loanword transducers, timing each stage.

    Builds transducers from self.sw_pron_list and self.ar_word_list, chains
    them with the given transducers using `>>` (presumably transducer
    composition -- confirm against pt.Transducer) and stores the results in
    self.t_all and self.t_correct. Progress and per-stage wall-clock times
    are printed to stdout.

    Args:
      ar_vocab_groups: iterable of transducers; each is combined with the
        loanword pipeline and unioned into self.t_all.
      loanwords_transducer: transducer applied in front of the SW vocabulary.
      sw_pre_transducer: transducer applied directly in front of the SW word
        transducer.
      add_meta_arc: if true, pt.AddPassThroughArcs is applied to the SW word
        transducer.
      with_syllabification: if true, pt.AddSyllabificationArcs is applied to
        the SW word transducer.
    """
    time_a = time.time()
    # SW side: union of all SW pronunciations, optionally extended.
    sw_word_transducer = pt.UnionLinearChains(self.sw_pron_list)
    if add_meta_arc:
        pt.AddPassThroughArcs(sw_word_transducer)
    if with_syllabification:
        pt.AddSyllabificationArcs(sw_word_transducer)
    sw_word_transducer.arc_sort_input()
    time_sw = time.time()
    print(" building SW transducer took:", time_sw-time_a, "sec")
    # AR side: union of all AR words.
    ar_transducer = pt.UnionLinearChains(self.ar_word_list)
    ar_transducer.arc_sort_output()
    time_b = time.time()
    print(" building AR transducer took:", time_b-time_sw, "sec")
    print(" sw_pre_transducer")
    sw_vocab = sw_pre_transducer >> sw_word_transducer
    sw_vocab.arc_sort_input()
    time_c = time.time()
    print(" applying sw_pre_transducer took:", time_c-time_b, "sec")
    print(" loanwords")
    combined = loanwords_transducer >> sw_vocab
    combined.arc_sort_input()
    time_d = time.time()
    print(" applying loanwords took:", time_d-time_c, "sec")
    print(" ar_vocab")
    # Each vocabulary group is handled separately and the results are unioned;
    # one progress dot is printed per group.
    self.t_all = pt.Transducer()
    for ar_vocab in ar_vocab_groups:
        print(".", sep="", end="")
        sys.stdout.flush()
        self.t_all.set_union(ar_vocab >> combined)
    print()
    self.t_all.arc_sort_input()
    time_e = time.time()
    print(" ar_vocab >> combined took:", time_e-time_d, "sec")
    print(" t_correct")
    # t_correct restricts t_all to the known AR words on its left side.
    self.t_correct = ar_transducer >> self.t_all
    self.t_correct.arc_sort_output()
    self.t_all.arc_sort_output()
    time_g = time.time()
    print(" building t_correct took:", time_g-time_e, "sec")
    print(" total ApplyLoanwords took:", time_g-time_a, "sec")
def syllabification_transducer(add_meta_arc=True):
    """Insert CONSONANT_DOT and VOWEL_DOT symbols into the output.

    State 1 is "last letter was a consonant" and state 2 is "last letter was
    a vowel". From either one an epsilon arc returns to state 0 while writing
    the matching dot. Only state 0 is final, so every non-empty accepted
    output ends with a dot.

    Args:
      add_meta_arc: accepted for a uniform signature; not used here.

    Returns:
      The transducer.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()
    all_states = (0, 1, 2)

    for consonant in pt.abc.CONSONANTS:
        for src in all_states:
            fst.add_arc(src, 1, consonant, consonant)
    fst.add_arc(1, 0, eps, pt.abc.CONSONANT_DOT)

    for vowel in pt.abc.VOWELS:
        for src in all_states:
            fst.add_arc(src, 2, vowel, vowel)
    fst.add_arc(2, 0, eps, pt.abc.VOWEL_DOT)

    fst[0].final = True
    return fst
def degemination_transducer(add_meta_arc=True):
    """Remove repeated consonants (optionally).

    State 0 copies every symbol unchanged. For each consonant there is also a
    two-arc detour: the first arc copies the consonant, the second one reads
    a second copy of the same consonant without writing it and fires
    "<<MAX-IO>>". The detour states are not final, so the detour can only be
    taken when the consonant really is doubled.

    Previously the second arc read EPSILON instead of the repeated consonant,
    so nothing was ever deleted and the constraint could fire on any single
    consonant.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<MAX-IO>>"] as its extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON
    rule_name = "<<MAX-IO>>"
    t = pt.Transducer()
    for sym in pt.abc.ALL_SYMS:
        t.add_arc(0, 0, sym, sym)

    # One detour state per consonant, numbered from 1.
    for node, consonant in enumerate(pt.abc.CONSONANTS, start=1):
        t.add_arc(0, node, consonant, consonant)
        # Consume the second copy of the same consonant and drop it.
        if add_meta_arc:
            t.add_arc(node, 0, consonant, rule_name)
        else:
            t.add_arc(node, 0, consonant, eps, pt.abc.OT_CONSTRAINTS[rule_name])

    t[0].final = True
    return t
def vowel_deletion_transducer(add_meta_arc=True):
    """Optionally delete vowels.

    State 0 copies every symbol unchanged. For each vowel there is also a
    detour through its own state: the vowel is read and replaced by EPSILON,
    then an epsilon arc returns to state 0 while firing "<<MAX-V>>".

    Args:
      add_meta_arc: if true the constraint name is written to the output and
        pt.AddPassThroughArcs is applied to the result; otherwise the arc
        outputs EPSILON and carries pt.abc.OT_CONSTRAINTS["<<MAX-V>>"] as
        its extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON
    rule_name = "<<MAX-V>>"
    fst = pt.Transducer()
    for sym in pt.abc.ALL_SYMS:
        fst.add_arc(0, 0, sym, sym)

    # One detour state per vowel, numbered from 1.
    for node, vowel in enumerate(pt.abc.VOWELS, start=1):
        fst.add_arc(0, node, vowel, eps)
        if add_meta_arc:
            violation = (rule_name,)
        else:
            violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
        fst.add_arc(node, 0, eps, *violation)

    fst[0].final = True
    if add_meta_arc:
        pt.AddPassThroughArcs(fst)
    return fst
def epenthesis_transducer(add_meta_arc=True):
    """Optionally insert a vowel after a consonant.

    The vowel is inserted between two consonants (states 1 and 2) or at the
    end of the word after a consonant. Otherwise letters are output as-is by
    the self-loops on state 0. Each inserted vowel has its own state,
    entered from state 1 on an epsilon input, from which an epsilon arc
    fires "<<DEP-IO>>" and leads to state 2.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<DEP-IO>>"] as its extra argument.

    Returns:
      The transducer, with states 0 and 2 final.
    """
    eps = pt.abc.EPSILON
    rule_name = "<<DEP-IO>>"
    fst = pt.Transducer()

    for sym in pt.abc.ALL_SYMS:
        fst.add_arc(0, 0, sym, sym)

    # State 1: a consonant was just read. State 2: a vowel was just inserted.
    for consonant in pt.abc.CONSONANTS:
        fst.add_arc(0, 1, consonant, consonant)
        fst.add_arc(2, 0, consonant, consonant)

    # One insertion state per vowel, numbered from 3.
    for node, vowel in enumerate(pt.abc.VOWELS, start=3):
        fst.add_arc(1, node, eps, vowel)
        if add_meta_arc:
            violation = (rule_name,)
        else:
            violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
        fst.add_arc(node, 2, eps, *violation)

    for state in (0, 2):
        fst[state].final = True
    return fst
def nocoda_transducer(add_meta_arc=True):
    """Build the NOCODA constraint transducer (syllables are open).

    State 0 copies any letter. A consonant may also move to state 1, which
    must be followed by a syllable dot (state 2) and then by the arc that
    fires "<<NOCODA>>" on the way back to state 0. A vowel or syllable
    boundary may move to state 3, from which a syllable dot returns to
    state 0 without a violation.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<NOCODA>>"] as its extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON
    dots = (pt.abc.CONSONANT_DOT, pt.abc.VOWEL_DOT)
    fst = pt.Transducer()

    for letter in pt.abc.ALL_LETTERS:
        fst.add_arc(0, 0, letter, letter)

    # Consonant directly before a dot: a closed syllable.
    for consonant in pt.abc.CONSONANTS:
        fst.add_arc(0, 1, consonant, consonant)
    for dot in dots:
        fst.add_arc(1, 2, dot, dot)

    rule_name = "<<NOCODA>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(2, 0, eps, *violation)

    # Vowel (or boundary) directly before a dot: no violation.
    for sym in pt.abc.VOWELS | pt.abc.SYLLABLE_BOUNDARIES:
        fst.add_arc(0, 3, sym, sym)
    for dot in dots:
        fst.add_arc(3, 0, dot, dot)

    fst[0].final = True
    return fst
def length_transducer(add_meta_arc=True):
    """Build the LEN constraint transducer (syllables have at most 3 letters).

    States 0 to 3 count the letters read in the current syllable. A fourth
    letter leads to state 4, whose only way out is the arc that fires
    "<<LEN>>" and returns to state 3, so every letter beyond the third costs
    one violation. A syllable boundary resets the count from states 1 to 3.

    Args:
      add_meta_arc: if true the constraint name is written to the output;
        otherwise the arc outputs EPSILON and carries
        pt.abc.OT_CONSTRAINTS["<<LEN>>"] as its extra argument.

    Returns:
      The transducer, with states 0 and 5 marked final.
    """
    eps = pt.abc.EPSILON
    fst = pt.Transducer()

    for letter in pt.abc.ALL_LETTERS:
        for count in range(4):
            fst.add_arc(count, count + 1, letter, letter)
    for boundary in pt.abc.SYLLABLE_BOUNDARIES:
        for count in (1, 2, 3):
            fst.add_arc(count, 0, boundary, boundary)

    rule_name = "<<LEN>>"
    if add_meta_arc:
        violation = (rule_name,)
    else:
        violation = (eps, pt.abc.OT_CONSTRAINTS[rule_name])
    fst.add_arc(4, 3, eps, *violation)

    fst[0].final = True
    # NOTE(review): no arc in this function touches state 5 -- confirm
    # whether marking it final is still needed.
    fst[5].final = True
    return fst
def phone_substitution_transducer(add_meta_arc=True):
    """Optionally substitute similar phones, marking each kind of mismatch.

    State 0 copies every symbol unchanged. In addition, each (s_ar, s_sw)
    pair of pt.abc.AR_SW_SIMILAR_PHONES gets its own chain out of state 0
    that reads s_ar and writes s_sw letter by letter (the shorter side is
    padded with EPSILON). The chain is followed by one epsilon arc per fired
    constraint -- manner, place, sonority, voicing, then the general
    "<<IDENT-IO-v>>" or "<<IDENT-IO-c>>" -- and finally by a plain epsilon
    arc back to state 0.

    Args:
      add_meta_arc: if true constraint names are written to the output;
        otherwise the constraint arcs output EPSILON and carry
        pt.abc.OT_CONSTRAINTS[name] as their extra argument.

    Returns:
      The transducer; only state 0 is final.
    """
    eps = pt.abc.EPSILON

    def groups_differ(l_ar, l_sw, group):
        """Return True when neither letter is EPSILON and their groups differ."""
        if l_ar == eps or l_sw == eps:
            return False
        return group[l_ar] != group[l_sw]

    def emit_constraint(node, constraint):
        """Add the arc node -> node + 1 firing `constraint`; return node + 1."""
        if add_meta_arc:
            fst.add_arc(node, node + 1, eps, constraint)
        else:
            fst.add_arc(node, node + 1, eps, eps, pt.abc.OT_CONSTRAINTS[constraint])
        return node + 1

    fst = pt.Transducer()
    for sym in pt.abc.ALL_SYMS:
        fst.add_arc(0, 0, sym, sym)

    last_node = 0  # highest state number handed out so far
    for s_ar, s_sw in pt.abc.AR_SW_SIMILAR_PHONES:
        manner = place = sonority = voiced = False

        tail = 0
        for l_ar, l_sw in itertools.zip_longest(s_ar, s_sw, fillvalue=eps):
            last_node += 1
            fst.add_arc(tail, last_node, l_ar, l_sw)
            tail = last_node

            # Manner and place are sticky: once violated they stay violated.
            manner = manner or groups_differ(l_ar, l_sw, pt.abc.MANNER_OF_ARTICULATION)
            place = place or groups_differ(l_ar, l_sw, pt.abc.PLACE_OF_ARTICULATION)
            # Sonority is re-evaluated for every letter pair, so the last pair wins.
            sonority = groups_differ(l_ar, l_sw, pt.abc.SONORITY)
            # Voicing is only looked at while manner and place both still agree.
            if not (manner or place):
                voiced = groups_differ(l_ar, l_sw, pt.abc.STATE_OF_GLOTTIS)

        rule_violated = False
        feature_checks = (
            (manner, "<<IDENT-IO-manner>>"),
            (place, "<<IDENT-IO-place>>"),
            (sonority, "<<IDENT-IO-sonority>>"),
            (voiced, "<<IDENT-IO-voiced>>"),
        )
        for violated, constraint in feature_checks:
            if violated:
                rule_violated = True
                last_node = emit_constraint(last_node, constraint)

        # NOTE(review): the constraint chosen by this chain is overwritten
        # below before any arc is added, so it currently has no effect.
        rule_name = None
        if s_ar in pt.abc.PHARYNGEAL:
            rule_name = "<<IDENT-IO-PHARYNGEAL>>"
        elif s_ar in pt.abc.PHARYNGEALIZED:
            rule_name = "<<IDENT-IO-PHARYNGEALIZED>>"
        elif s_ar in pt.abc.GLOTTAL:
            rule_name = "<<IDENT-IO-GLOTTAL>>"

        # NOTE(review): `and` binds tighter than `or`, so an AR vowel always
        # selects IDENT-IO-v regardless of rule_violated -- confirm intent.
        if s_ar in pt.abc.VOWELS or (s_sw in pt.abc.VOWELS and not rule_violated):
            rule_name = "<<IDENT-IO-v>>"
        elif rule_violated:
            rule_name = "<<IDENT-IO-c>>"
        else:
            rule_name = None
        if rule_name:
            last_node = emit_constraint(last_node, rule_name)

        # Close the chain back to the copy state.
        fst.add_arc(last_node, 0, eps, eps)

    fst[0].final = True
    return fst