def test_bracket_markings(self):
    """Trace the bracket-marking stages of one fixed rule on the word 'zt'.

    The prologued word is composed, in order, with the obligatory
    transducers (only when the rule is obligatory), the right-context
    transducer, the left-context transducer and the replace transducer.
    Up to 10 outputs are printed after the left-context stage and again
    after the replace stage. The test makes no assertions.
    """
    self.initialise_segment_table("plural_english_segment_table.txt")
    # Function-scope imports are deliberately kept after the segment table
    # has been initialised, matching the original statement order.
    from bracket_rule_transducer import BracketRuleTransducer
    from bracket_rule_transducer import (LEFT_IDENTITY_BRACKET, RIGHT_BRACKETS,
                                         LEFT_BRACKETS, RIGHT_IDENTITY_BRACKET)

    rule = Rule([{"cons": "+"}], [{"voice": "-"}], [], [{"voice": "-"}], False)
    staged = get_prologued_word('zt')
    rule.extract_data_from_feature_bundle_lists()
    builder = BracketRuleTransducer(rule)

    if rule.obligatory:
        # Two obligatory filters, applied one after the other.
        obligatory_arguments = (
            ([LEFT_IDENTITY_BRACKET], RIGHT_BRACKETS),
            ([RIGHT_IDENTITY_BRACKET], LEFT_BRACKETS),
        )
        for first_brackets, second_brackets in obligatory_arguments:
            obligatory_dfa = builder._get_obligatory_dfa(first_brackets,
                                                         second_brackets)
            staged = safe_compose(staged, pyfst_from_dfa(obligatory_dfa))

    staged = safe_compose(staged, builder._get_right_context_dfa())
    staged = safe_compose(staged, builder._get_left_context_dfa())
    print(get_transducer_outputs(staged, limit=10))

    staged = safe_compose(staged, builder.get_replace_transducer())
    print(get_transducer_outputs(staged, limit=10))
def get_left_to_right_application(self):
    """Compose the full left-to-right application transducer for this rule.

    Stages, in composition order: prologue; an obligatory filter (when the
    rule is obligatory or ``_should_fix_transducer()`` is true); right
    context; weighted replace; left context; filters selected by
    ``transformation_type``; prologue inverse. The result is additionally
    passed through ``uniform_encoding.get_weighted_rule_transducer`` only
    when the rule is not obligatory and ``_should_fix_transducer()`` is
    false.
    """
    pipeline = get_prologue_transducer()
    if self.obligatory or self._should_fix_transducer():
        obligatory_fst = pyfst_from_dfa(
            self._get_obligatory_dfa([LEFT_IDENTITY_BRACKET],
                                     [RIGHT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, obligatory_fst)

    right_context = self._get_right_context_dfa()
    weighted_replace = uniform_encoding.get_weighted_replace_transducer(
        self.get_replace_transducer(), self)
    left_context = self._get_left_context_dfa()
    prologue_inverse = get_prologue_inverse_transducer()

    pipeline = chain_safe_compose(pipeline, right_context, weighted_replace,
                                  left_context)

    rule_kind = self.transformation_type
    if rule_kind == INSERTION:
        insertion_filter = pyfst_from_dfa(
            self._get_obligatory_dfa([RIGHT_IDENTITY_BRACKET],
                                     [LEFT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, insertion_filter)

    # remove multiple paths
    if rule_kind in (ASSIMILATION, INSERTION):
        ri_filter = pyfst_from_dfa(
            self._get_custom_obligatory_dfa([RIGHT_APPLICATION_BRACKET],
                                            [LEFT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, ri_filter)

    if rule_kind == DELETION:
        jl_filter = pyfst_from_dfa(
            self._get_custom_obligatory_dfa([RIGHT_IDENTITY_BRACKET],
                                            [LEFT_APPLICATION_BRACKET]))
        pipeline = safe_compose(pipeline, jl_filter)

    pipeline = safe_compose(pipeline, prologue_inverse)

    # Obligatory rules are weighted only at the replace level, and only
    # non-right-context rules are weighted at the rule level.
    if self.obligatory or self._should_fix_transducer():
        return pipeline
    return uniform_encoding.get_weighted_rule_transducer(pipeline, self)
def get_sigma_transducer_for_intro(sigma):
    """Return the transducer built from the ``(a+b+...)`` regex over ``sigma``.

    Results are memoised in ``sigma_transducer_dict`` under a key made by
    concatenating the sorted symbols of ``sigma``, so repeated calls with
    the same alphabet return the stored transducer.
    """
    cache_key = "".join(sorted(sigma))
    if cache_key in sigma_transducer_dict:
        return sigma_transducer_dict[cache_key]

    union_regex = "({})".format("+".join(sigma))
    union_dfa = get_dfa_from_regex(union_regex, sigma=sigma)
    sigma_transducer_dict[cache_key] = pyfst_from_dfa(union_dfa)
    return sigma_transducer_dict[cache_key]
def get_ignore_dfa(sigma, language_dfa, ignored_set):
    """Return ``language_dfa`` composed with the intro transducer, as a DFA.

    The language DFA is converted to a transducer, composed with
    ``get_intro_transducer(sigma, ignored_set)``, and converted back to a
    DFA over ``sigma | ignored_set``.
    """
    widened_sigma = sigma | ignored_set
    intro = get_intro_transducer(sigma, ignored_set)
    language_fst = pyfst_from_dfa(language_dfa)
    return pyfst_to_dfa(safe_compose(language_fst, intro), widened_sigma)
def get_replace_transducer(self):
    """Build the replace transducer for this rule.

    An inner transducer maps each ``(segment1, segment2)`` pair from
    ``self.target_change_tuples_list`` on an arc 0 -> 1 and loops on the
    centre brackets in both states. It is wrapped between the module-level
    left and right bracket transducers and passed to ``add_opt``. The
    result is concatenated after a sigma-star acceptor that also goes
    through ``get_ignore_dfa`` with the identity brackets, and the closure
    of that concatenation is returned.
    """
    symbols = SegmentTable().transducer_symbol_table
    center = fst.Transducer(isyms=symbols, osyms=symbols)
    for source_segment, target_segment in self.target_change_tuples_list:
        center.add_arc(0, 1, source_segment, target_segment)
    center[1].final = True

    # Self-loops on the centre brackets in both states of the inner machine.
    for bracket in [LEFT_CENTER_BRACKET, RIGHT_CENTER_BRACKET]:
        for state in (0, 1):
            center.add_arc(state, state, bracket, bracket)

    opt_part = left_bracket_transducer + center + right_bracket_transducer
    add_opt(opt_part)

    any_segments_regex = "({})*".format("+".join(self.alphabet))
    any_segments_dfa = get_dfa_from_regex(any_segments_regex,
                                          sigma=self.alphabet)
    identity_brackets = {LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET}
    any_segments_ignoring_identity = get_ignore_dfa(
        self.alphabet | identity_brackets,
        any_segments_dfa,
        identity_brackets)

    id_sigma_star = pyfst_from_dfa(any_segments_ignoring_identity)
    return (id_sigma_star + opt_part).closure()
def get_left_to_right_application(self):
    """Compose the left-to-right application transducer for this rule.

    Stages, in composition order: prologue; an obligatory filter (only for
    obligatory rules); right context; replace; left context; filters
    selected by ``transformation_type``; prologue inverse.
    """
    pipeline = get_prologue_transducer()
    if self.obligatory:
        obligatory_fst = pyfst_from_dfa(
            self._get_obligatory_dfa([LEFT_IDENTITY_BRACKET],
                                     [RIGHT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, obligatory_fst)

    right_context = self._get_right_context_dfa()
    replace = self.get_replace_transducer()
    left_context = self._get_left_context_dfa()
    prologue_inverse = get_prologue_inverse_transducer()

    pipeline = chain_safe_compose(pipeline, right_context, replace,
                                  left_context)

    rule_kind = self.transformation_type
    if rule_kind == INSERTION:
        insertion_filter = pyfst_from_dfa(
            self._get_obligatory_dfa([RIGHT_IDENTITY_BRACKET],
                                     [LEFT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, insertion_filter)

    # remove multiple paths
    if rule_kind in (ASSIMILATION, INSERTION):
        ri_filter = pyfst_from_dfa(
            self._get_custom_obligatory_dfa([RIGHT_APPLICATION_BRACKET],
                                            [LEFT_IDENTITY_BRACKET]))
        pipeline = safe_compose(pipeline, ri_filter)

    if rule_kind == DELETION:
        jl_filter = pyfst_from_dfa(
            self._get_custom_obligatory_dfa([RIGHT_IDENTITY_BRACKET],
                                            [LEFT_APPLICATION_BRACKET]))
        pipeline = safe_compose(pipeline, jl_filter)

    return safe_compose(pipeline, prologue_inverse)
def _get_left_context_dfa(self):
    """Return the left-context machine for this rule, memoised by context.

    Despite the name, the returned object is the result of
    ``pyfst_from_dfa``. Results are cached in ``left_context_dfas`` under
    ``str(self.left_context_feature_bundle_list)``.

    The machine is built as ``get_p_iff_s_dfa(P, S)`` where P is
    "sigma-star followed by the left context (left brackets ignored), and
    not ending in a left bracket" and S is "a left bracket followed by
    sigma-star"; right brackets are then ignored via ``get_ignore_dfa``.
    """
    cache_key = str(self.left_context_feature_bundle_list)
    if cache_key in left_context_dfas:
        return left_context_dfas[cache_key]

    sigma = self.alphabet
    any_prefix = sigma_star_dfa_for_left_context

    if self.left_context_feature_bundle_list:
        regex = get_context_regex(self.left_context_feature_bundle_list)
        if configurations["LENGTHENING_FLAG"]:
            # With lengthening enabled, the context may be followed by any
            # number of "Y" symbols.
            regex = regex + "(Y)*"
        context_dfa = str2regexp(regex, sigma=sigma).toDFA()
        context_ignoring_left = get_ignore_dfa(sigma | set(LEFT_BRACKETS),
                                               context_dfa,
                                               set(LEFT_BRACKETS))
        prefix_then_context = any_prefix.concat(context_ignoring_left)
    else:
        # No left context: every prefix qualifies.
        prefix_then_context = any_prefix

    left_bracket_regex = "({})".format("+".join(LEFT_BRACKETS))
    left_bracket_dfa = get_dfa_from_regex(left_bracket_regex,
                                          sigma=LEFT_BRACKETS)

    ends_in_left_bracket = any_prefix.concat(left_bracket_dfa)
    # Subtraction expressed as intersection with the complement.
    premise = prefix_then_context & ~ends_in_left_bracket
    starts_with_left_bracket = left_bracket_dfa.concat(any_prefix)

    biconditional = get_p_iff_s_dfa(premise, starts_with_left_bracket)
    biconditional_ignoring_right = get_ignore_dfa(sigma | set(BRACKETS),
                                                  biconditional,
                                                  set(RIGHT_BRACKETS))

    result = pyfst_from_dfa(biconditional_ignoring_right)
    left_context_dfas[cache_key] = result
    return result
def _get_right_context_dfa(self):
    """Return the right-context machine for this rule, memoised by context.

    Despite the name, the returned object is the result of
    ``pyfst_from_dfa``. Results are cached in ``right_context_dfas`` under
    ``str(self.right_context_feature_bundle_list)``.

    The machine is built as ``get_p_iff_s_dfa(P, S)`` where P is
    "sigma-star followed by a right bracket" and S is "the right context
    (right brackets ignored) followed by sigma-star, and not starting with
    a right bracket"; left brackets are then ignored via
    ``get_ignore_dfa``.
    """
    cache_key = str(self.right_context_feature_bundle_list)
    if cache_key in right_context_dfas:
        return right_context_dfas[cache_key]

    sigma = self.alphabet
    any_suffix = sigma_star_dfa_for_right_context

    if self.right_context_feature_bundle_list:
        context_dfa = get_context_dfa(self.right_context_feature_bundle_list)
        context_ignoring_right = get_ignore_dfa(sigma | set(RIGHT_BRACKETS),
                                                context_dfa,
                                                set(RIGHT_BRACKETS))
        context_then_suffix = context_ignoring_right.concat(any_suffix)
    else:
        # No right context: every suffix qualifies.
        context_then_suffix = any_suffix

    right_bracket_regex = "({})".format("+".join(RIGHT_BRACKETS))
    right_bracket_dfa = get_dfa_from_regex(right_bracket_regex,
                                           sigma=RIGHT_BRACKETS)

    ends_in_right_bracket = any_suffix.concat(right_bracket_dfa)
    starts_with_right_bracket = right_bracket_dfa.concat(any_suffix)
    # Subtraction expressed as intersection with the complement.
    conclusion = context_then_suffix & ~starts_with_right_bracket

    biconditional = get_p_iff_s_dfa(ends_in_right_bracket, conclusion)
    biconditional_ignoring_left = get_ignore_dfa(sigma | set(BRACKETS),
                                                 biconditional,
                                                 set(LEFT_BRACKETS))

    result = pyfst_from_dfa(biconditional_ignoring_left)
    right_context_dfas[cache_key] = result
    return result
# Memo for get_sigma_transducer_for_intro, keyed by the sorted, concatenated
# symbols of the alphabet.
sigma_transducer_dict = {}


def get_sigma_transducer_for_intro(sigma):
    """Return the transducer built from the ``(a+b+...)`` regex over ``sigma``.

    Results are memoised in ``sigma_transducer_dict``, so repeated calls
    with the same alphabet return the stored transducer.
    """
    cache_key = "".join(sorted(sigma))
    if cache_key in sigma_transducer_dict:
        return sigma_transducer_dict[cache_key]

    union_regex = "({})".format("+".join(sigma))
    union_dfa = get_dfa_from_regex(union_regex, sigma=sigma)
    sigma_transducer_dict[cache_key] = pyfst_from_dfa(union_dfa)
    return sigma_transducer_dict[cache_key]


# Everything below runs at import time and reads the current SegmentTable.
alphabet = set(SegmentTable().get_segments_symbols())
m_sigma_star_regex = "({})*".format("+".join(alphabet))
m_sigma_star_dfa = get_dfa_from_regex(m_sigma_star_regex, sigma=alphabet)


def _sigma_star_ignoring(brackets):
    """Return the sigma-star DFA passed through get_ignore_dfa for ``brackets``."""
    return get_ignore_dfa(alphabet | set(brackets), m_sigma_star_dfa,
                          set(brackets))


def _single_bracket_transducer(bracket):
    """Return the transducer built from the regex consisting of ``bracket``."""
    return pyfst_from_dfa(get_dfa_from_regex(bracket).toDFA())


# Shared sigma-star acceptors used by the context and obligatory builders.
sigma_star_dfa_for_left_context = _sigma_star_ignoring(LEFT_BRACKETS)
sigma_star_dfa_for_right_context = _sigma_star_ignoring(RIGHT_BRACKETS)
sigma_star_dfa_for_obligatory = _sigma_star_ignoring(BRACKETS)

# Shared single-bracket transducers for the application brackets.
left_bracket_transducer = _single_bracket_transducer(LEFT_APPLICATION_BRACKET)
right_bracket_transducer = _single_bracket_transducer(RIGHT_APPLICATION_BRACKET)
def right_bracket_transducer(self):
    """Return the transducer for RIGHT_APPLICATION_BRACKET, built lazily.

    The transducer is created on the first call, stored in
    ``self.RIGHT_BRACKET_TRANSDUCER``, and returned from there on every
    later call.
    """
    if self.RIGHT_BRACKET_TRANSDUCER is not None:
        return self.RIGHT_BRACKET_TRANSDUCER

    bracket_dfa = get_dfa_from_regex(RIGHT_APPLICATION_BRACKET).toDFA()
    self.RIGHT_BRACKET_TRANSDUCER = pyfst_from_dfa(bracket_dfa)
    return self.RIGHT_BRACKET_TRANSDUCER
def _rule_construction_helper(self, rule, word):
    """Apply ``rule`` to ``word`` stage by stage and return the outputs.

    The prologued word is composed with the obligatory transducers (only
    when ``rule.obligatory``), then the right-context, replace and
    left-context transducers, then the prologue inverse. Intermediate and
    final outputs (at most 10 each) are printed; the final outputs are
    returned after epsilon removal.
    """
    from bracket_rule_transducer import get_prologue_inverse_transducer
    from bracket_rule_transducer import BracketRuleTransducer
    from bracket_rule_transducer import (LEFT_IDENTITY_BRACKET,
                                         RIGHT_IDENTITY_BRACKET,
                                         RIGHT_APPLICATION_BRACKET,
                                         LEFT_APPLICATION_BRACKET)

    # Hard-coded switches for the optional stages.
    remove_multiple_paths = False
    second_obligatory = True

    staged = get_prologued_word(word)
    rule.extract_data_from_feature_bundle_lists()
    builder = BracketRuleTransducer(rule)

    if rule.obligatory:
        first_obligatory_dfa = builder._get_obligatory_dfa(
            [LEFT_IDENTITY_BRACKET], [RIGHT_IDENTITY_BRACKET])
        staged = safe_compose(staged, pyfst_from_dfa(first_obligatory_dfa))
        if second_obligatory:
            second_obligatory_dfa = builder._get_obligatory_dfa(
                [RIGHT_IDENTITY_BRACKET], [LEFT_IDENTITY_BRACKET])
            staged = safe_compose(staged,
                                  pyfst_from_dfa(second_obligatory_dfa))

    staged = safe_compose(staged, builder._get_right_context_dfa())
    staged = safe_compose(staged, builder.get_replace_transducer())
    staged = safe_compose(staged, builder._get_left_context_dfa())
    print(get_transducer_outputs(staged, limit=10))

    # remove_multiple_paths
    # NOTE(review): the flattened source does not show whether the second
    # filter and the print below belong inside this branch; they are placed
    # inside it here -- confirm against the original layout.
    if remove_multiple_paths:
        path_filters = (
            ([RIGHT_APPLICATION_BRACKET], [LEFT_IDENTITY_BRACKET]),
            ([RIGHT_IDENTITY_BRACKET], [LEFT_APPLICATION_BRACKET]),
        )
        for first_brackets, second_brackets in path_filters:
            filter_dfa = builder._get_custom_obligatory_dfa(first_brackets,
                                                            second_brackets)
            staged = safe_compose(staged, pyfst_from_dfa(filter_dfa))
        print(get_transducer_outputs(staged, limit=10))

    prologue_inverse = get_prologue_inverse_transducer()
    finished = safe_compose(staged, prologue_inverse)
    finished.remove_epsilon()

    transducer_outputs = get_transducer_outputs(finished, limit=10)
    print(transducer_outputs)
    return transducer_outputs