def test_kleene_star(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.configurations["CHANGE_KLEENE_VALUE"] = True
    rule = Rule([{"cons": "-"}], [{"low": "+"}], [{"cons": "-"}, {"cons": "+", "kleene": True}], [], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertCountEqual(rule_set.get_outputs_of_word("ato"), ['ata'])
    self.assertCountEqual(rule_set.get_outputs_of_word("attto"), ['attta'])
def test_morpheme_boundary(self):
    self.initialise_segment_table("abnese_lengthening_segment_table.txt")
    rule = Rule([], [{"long": "+"}], [], [{"bound": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertEqual(rule_set.get_outputs_of_word("abB"), [u'abYB'])

    rule = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertEqual(rule_set.get_outputs_of_word("abB"), [u'aYbB'])
def test_kleene_star(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule = Rule([{"cons": "-"}], [{"low": "+"}], [{"cons": "-"}, {"cons": "+", "kleene": True}], [], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("ato"))     # -> ata
    print(rule_set.get_outputs_of_word("atttto"))  # -> atttta
def test_phi_ro_identity(self):
    self.initialise_segment_table("ab_segment_table.txt")
    rule = Rule([{"cons": "-"}], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("bb"))   # should be 'bb', instead []
    print(rule_set.get_outputs_of_word("bab"))  # should be 'bab', instead [u'bab', u'bab']
def setUp(self):
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    configurations["UNDERSPECIFICATION_FLAG"] = True
    self.initialise_segment_table("underspecification_segment_table.txt")
    self.data = ['dat', 'tat', 'da', 'ta']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz', 'kta', 'dgo', 'skoz', 'gdas']),
               'q2': (['qf'], ['zook', 'gos', 'dod', 'sad'])})
    rule = Rule([{"voice": "0"}], [{"voice": "-"}], [], [{"bound": "+"}], True)
    rule.get_transducer()
    print(rule.get_segment_representation())
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word('daTB'))
def test_insertion_with_right_context_only2(self):
    configurations["SINGLE_CONTEXT_TRANSDUCER_FLAG"] = True
    self.initialise_segment_table("abd_segment_table.txt")
    rule = Rule([], [{"cons": "-"}], [], [{"cons": "+", "labial": "+"}, {"cons": "+", "labial": "-"}], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertCountEqual(rule_set.get_outputs_of_word('bdbd'), ['abdabd'])
def test_vicky(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule = Rule([], [{"voice": "-"}], [{"voice": "-"}], [], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("dot"))
def test_rule_application_direction(self):
    # Test whether rules are applied recursively once the environment changes
    self.initialise_segment_table("turkish_segment_table.txt")
    rule = Rule([{"syll": "+"}], [{"back": "-"}], [{"syll": "+", "back": "-"}], [], obligatory=True)
    rule_set = RuleSet([rule])
    # TODO: this should be replaced with:
    # self.assertEqual(rule_set.get_outputs_of_word("i1a"), ['iia'])
    # It is unclear why `iie` is also returned here; this is a bug.
    self.assertIn('iia', rule_set.get_outputs_of_word("i1a"))
def test_assimilation(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule = Rule([{"cons": "+"}], [{"voice": "-"}], [{"voice": "-"}], [], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("tz"))
def test_abnese_insertion(self):
    self.initialise_segment_table("ab_segment_table.txt")
    rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("aabb"))
def test_insertion_with_right_context_only_2(self):
    configurations["SINGLE_CONTEXT_TRANSDUCER_FLAG"] = True
    self.initialise_segment_table("ab_segment_table.txt")
    rule = Rule([], [{"cons": "-"}], [], [{"cons": "+"}, {"cons": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word('bbbb'))
def test_rule_application_direction(self):
    # Test whether rules are applied recursively once the environment changes
    self.initialise_segment_table("turkish_segment_table.txt")
    rule = Rule([{"cons": "-"}], [{"back": "-"}], [{"cons": "-", "back": "-"}], [], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("i1a"))  # -> iia
def test_2(self):
    rule1 = [[], [{"ATR": "+"}], [{"coronal": "+"}], [{"coronal": "+"}], True]
    rule2 = [[{"voice": "+"}], [{"voice": "-"}], [{"voice": "-"}], [], True]
    rule_set = RuleSet([Rule(*rule1), Rule(*rule2)])
    result = rule_set.get_outputs_of_word('daadt')
    print(result)
def test_abnese(self):
    self.initialise_segment_table("ab_segment_table.txt")
    self.configurations["BRACKET_TRANSDUCER"] = True
    data = ['bab', 'aabab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['bb', 'aabb'])})
    rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], False)  # e -> a / b_b
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("bb"))

    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())
    print(hypothesis.get_recent_energy_signature())
def test_abadnese_for_ezer(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['aabad', 'abad', 'badaabad', 'aba', 'aaba', 'badaa']
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['aabd', 'abd', 'bdaabd', 'aba', 'aaba', 'bdaa'])})
    rule = Rule([], [{}], [{"cons": "+", "labial": "+"}], [{"cons": "+", "labial": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("abb"))
    grammar = Grammar(hmm, rule_set)
    hypothesis = Hypothesis(grammar, data)
class Grammar:
    def __init__(self, hmm, rule_set=None):
        if isinstance(hmm, HMM):
            self.hmm = hmm
        else:
            self.hmm = HMM(hmm)

        segment_table = SegmentTable()
        self.segment_symbol_length = uniform_encoding.log2(len(segment_table) + 1)  # + 1 for the delimiter

        if rule_set:
            self.rule_set = rule_set
        else:
            self.rule_set = RuleSet(noise=False)

        noises = configurations.get("NOISE_RULE_SET", [])
        self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noises)

        self._cached_hmm_transducer = None
        self._cached_rule_set_transducer = None
        self._cached_noise_rule_set_transducer = None

    def generate_word(self):
        emission = self.hmm.generate_emission()
        return choice(self.rule_set.get_outputs_of_word(emission))

    def generate_all_words(self):
        # TODO: I think this also generates noised data. This should be fixed.
        words = []
        emissions = self.hmm.generate_all_emissions()
        for emission in emissions:
            words += self.rule_set.get_outputs_of_word(emission)
        return words

    def get_transducer(self, with_noise=True):
        hmm_transducer = self.get_hmm_transducer()
        if "case_name" in configurations.configurations_dict:
            case_name = configurations.configurations_dict["case_name"]
            dot(hmm_transducer, "{}_hmm_transducer".format(case_name))
        rules_set_transducer = self.get_rule_set_transducer()
        if with_noise:
            noise_rules_transducer = self.get_noise_rule_set_transducer()
        else:
            noise_rules_transducer = None
        return self._compose_grammar_transducers(hmm_transducer, rules_set_transducer, noise_rules_transducer)

    def _compose_grammar_transducers(self, first_transducer, *other_transducers):
        composed_transducer = first_transducer
        for transducer in other_transducers:
            if transducer:
                composed_transducer.arc_sort_input()
                transducer.arc_sort_input()
                composed_transducer = composed_transducer >> transducer
        return composed_transducer

    def get_nfa(self):
        grammar_pyfst_transducer = self.get_transducer()
        # dot(grammar_pyfst_transducer, "grammar_pyfst_transducer")
        # grammar_pyfst_transducer.remove_epsilon()
        return ParsingNFA.get_from_pyfst_transducer(grammar_pyfst_transducer)

    def make_mutation(self):
        mutation_successful = False
        if ga_config.MUTATE_BOTH_HMM_AND_RULES:
            hmm_mutation_successful = False
            rule_set_mutation_successful = False
            if configurations["EVOLVE_HMM"]:
                hmm_mutation_successful = self.hmm.make_mutation()
            if configurations["EVOLVE_RULES"]:
                rule_set_mutation_successful = self.rule_set.make_mutation()
            mutation_successful = mutation_successful or rule_set_mutation_successful or hmm_mutation_successful
            if hmm_mutation_successful:
                self.invalidate_cached_hmm_transducer()
            if rule_set_mutation_successful:
                self.invalidate_cached_rule_set_transducer()
        else:
            rule_set_mutation_weight = 0 if not configurations["EVOLVE_RULES"] else configurations["MUTATE_RULE_SET"]
            hmm_mutation_weight = 0 if not configurations["EVOLVE_HMM"] else configurations["MUTATE_HMM"]
            mutation_weights = [('rule_set', rule_set_mutation_weight), ('hmm', hmm_mutation_weight)]
            weighted_mutatable_object_list = get_weighted_list(mutation_weights)
            object_name_to_mutate = choice(weighted_mutatable_object_list)
            if object_name_to_mutate == 'rule_set':
                object_to_mutate = self.rule_set
            elif object_name_to_mutate == 'hmm':
                object_to_mutate = self.hmm
            mutation_successful = object_to_mutate.make_mutation()
            if mutation_successful:
                if object_name_to_mutate == 'hmm':
                    self.invalidate_cached_hmm_transducer()
                elif object_name_to_mutate == 'rule_set':
                    self.invalidate_cached_rule_set_transducer()
        return mutation_successful

    def get_encoding_length(self):
        if not configurations["UNDERSPECIFICATION_FLAG"]:
            hmm_encoding_length = self.hmm.get_encoding_length(
                self.segment_symbol_length,
                restrictions_on_alphabet=configurations["RESTRICTIONS_ON_ALPHABET"])
        else:
            hmm_encoding_length = self.hmm.get_underspecified_encoding_length()
        rules_encoding_length = self.rule_set.get_encoding_length()
        return hmm_encoding_length, rules_encoding_length

    def generate_word_list(self, n):
        result = []
        for _ in range(n):
            result.append(self.generate_word())
        return result

    def get_all_outputs(self, with_noise=True):
        transducer = self.get_transducer(with_noise=with_noise)
        if configurations["MINIMIZE_TRANSDUCER"]:
            transducer = self.minimize_transducer(transducer)
        transducer_symbol_table = SegmentTable().transducer_symbol_table
        outputs = list()
        for path in transducer.paths():
            output = ""
            for i in path:
                symbol = transducer_symbol_table.find(i.olabel)
                if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                    output += symbol
            outputs.append(output)
        return outputs

    def get_hmm_transducer(self):
        if self._cached_hmm_transducer is None:
            self._cached_hmm_transducer = self.hmm.get_transducer()
        return self._cached_hmm_transducer

    def get_rule_set_transducer(self):
        if self._cached_rule_set_transducer is None:
            # rule set transducer may be None
            self._cached_rule_set_transducer = self.rule_set.get_transducer()
        return self._cached_rule_set_transducer

    def get_noise_rule_set_transducer(self):
        if self._cached_noise_rule_set_transducer is None:
            self._cached_noise_rule_set_transducer = self.noise_rule_set.get_transducer()
        return self._cached_noise_rule_set_transducer

    def get_log_lines(self):
        return self.hmm.get_log_lines() + self.rule_set.get_log_lines()

    def invalidate_cached_hmm_transducer(self):
        self._cached_hmm_transducer = None

    def invalidate_cached_rule_set_transducer(self):
        self._cached_rule_set_transducer = None

    @staticmethod
    def minimize_transducer(transducer):
        transducer.project_output()
        transducer = transducer.determinize()
        transducer.minimize()
        return transducer

    def __getstate__(self):
        # Don't pickle cached transducers
        state = self.__dict__.copy()
        state['_cached_hmm_transducer'] = None
        state['_cached_rule_set_transducer'] = None
        state['_cached_noise_rule_set_transducer'] = None
        return state
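# Usage sketch (hedged, not part of the class above): how a Grammar is typically
# assembled in the tests in this repository. It assumes the ab segment table has
# already been loaded (the tests do this via initialise_segment_table("ab_segment_table.txt"))
# and that the relevant configuration flags (e.g. "MINIMIZE_TRANSDUCER") are set by the
# test harness. The helper name `_grammar_usage_sketch` is hypothetical.
def _grammar_usage_sketch():
    # One HMM state emitting the underlying forms 'bb' and 'aabb' (as in test_abnese).
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['bb', 'aabb'])})
    # Epenthesis rule: insert 'a' (cons: -) between two consonants, i.e. e -> a / b_b.
    rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
    grammar = Grammar(hmm, RuleSet([rule]))
    # Surface forms after composing the HMM and rule transducers; expected 'bab' and 'aabab'.
    print(grammar.get_all_outputs(with_noise=False))
    # Description length of the grammar: (hmm_encoding_length, rules_encoding_length).
    print(grammar.get_encoding_length())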
def test_phi_ro_identity(self):
    self.initialise_segment_table("ab_segment_table.txt")
    rule = Rule([{"cons": "-"}], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertCountEqual(rule_set.get_outputs_of_word("bb"), ["bb"])
    self.assertCountEqual(rule_set.get_outputs_of_word("bab"), ["bab"])
def print_rule_word_outputs(rule, word):
    rule_set = RuleSet([rule])
    result = rule_set.get_outputs_of_word(word)
    # print("{} -> {} {}".format(word, rule_set.get_outputs_of_word(word), rule))
    return result
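# Hedged example of calling the helper above, mirroring test_vicky: with the plural
# English segment table already loaded, the insertion rule below (insert a voiceless
# segment after a voiceless segment) should yield 'dots', 'dott' and 'dotk' for the
# input 'dot'. The wrapper name `_example_print_rule_word_outputs` is hypothetical.
def _example_print_rule_word_outputs():
    rule = Rule([], [{"voice": "-"}], [{"voice": "-"}], [], obligatory=True)
    return print_rule_word_outputs(rule, "dot")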
class Grammar:
    def __init__(self, hmm, rule_set=None):
        if isinstance(hmm, HMM):
            self.hmm = hmm
        else:
            self.hmm = HMM(hmm)
        segment_table = SegmentTable()
        self.segment_symbol_length = ceil(log(len(segment_table) + 1, 2))  # + 1 for the delimiter
        if rule_set:
            self.rule_set = rule_set
        else:
            self.rule_set = RuleSet()

    def generate_word(self):
        emission = self.hmm.generate_emission()
        return choice(self.rule_set.get_outputs_of_word(emission))

    def get_transducer(self):
        hmm_transducer = self.hmm.get_transducer()
        if "case_name" in configurations.configurations_dict:
            case_name = configurations.configurations_dict["case_name"]
            dot(hmm_transducer, "{}_hmm_transducer".format(case_name))
        rules_set_transducer = self.rule_set.get_transducer()
        if rules_set_transducer:
            hmm_transducer.arc_sort_input()
            rules_set_transducer.arc_sort_input()
            composed_hmm_rules_transducer = hmm_transducer >> rules_set_transducer
        else:
            composed_hmm_rules_transducer = hmm_transducer
        return composed_hmm_rules_transducer

    def get_nfa(self):
        grammar_pyfst_transducer = self.get_transducer()
        # dot(grammar_pyfst_transducer, "grammar_pyfst_transducer")
        # grammar_pyfst_transducer.remove_epsilon()
        return ParsingNFA.get_from_pyfst_transducer(grammar_pyfst_transducer)

    def make_mutation(self):
        num_mutations = randint(1, ga_config.MAX_MUTATIONS)
        mutation_result = False
        for _ in range(num_mutations):
            if ga_config.MUTATE_BOTH_HMM_AND_RULES:
                rule_set_success = False
                hmm_success = False
                if configurations["EVOLVE_RULES"]:
                    rule_set_success = self.rule_set.make_mutation()
                if configurations["EVOLVE_HMM"]:
                    hmm_success = self.hmm.make_mutation()
                mutation_result = mutation_result or rule_set_success or hmm_success
            else:
                rule_set_mutation_weight = 0 if not configurations["EVOLVE_RULES"] else configurations["MUTATE_RULE_SET"]
                hmm_mutation_weight = 0 if not configurations["EVOLVE_HMM"] else configurations["MUTATE_HMM"]
                mutation_weights = [(self.rule_set, rule_set_mutation_weight), (self.hmm, hmm_mutation_weight)]
                weighted_mutatable_object_list = get_weighted_list(mutation_weights)
                object_to_mutate = choice(weighted_mutatable_object_list)
                mutation_result = object_to_mutate.make_mutation()
        return mutation_result

    def get_encoding_length(self):
        if not configurations["UNDERSPECIFICATION_FLAG"]:
            hmm_encoding_length = self.hmm.get_encoding_length(
                self.segment_symbol_length,
                restrictions_on_alphabet=configurations["RESTRICTIONS_ON_ALPHABET"])
        else:
            hmm_encoding_length = self.hmm.get_underspecified_encoding_length()
        rules_encoding_length = self.rule_set.get_encoding_length()
        return hmm_encoding_length, rules_encoding_length

    def generate_word_list(self, n):
        result = []
        for _ in range(n):
            result.append(self.generate_word())
        return result

    def get_all_outputs(self):
        transducer = self.get_transducer()
        transducer_symbol_table = SegmentTable().transducer_symbol_table
        outputs = list()
        for path in transducer.paths():
            output = ""
            for i in path:
                symbol = transducer_symbol_table.find(i.olabel)
                if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                    output += symbol
            outputs.append(output)
        return outputs
def test_vicky(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule = Rule([], [{"voice": "-"}], [{"voice": "-"}], [], obligatory=True)
    rule_set = RuleSet([rule])
    self.assertCountEqual(rule_set.get_outputs_of_word("dot"), ["dot" + s for s in ('s', 'k', 't')])
def test_degenerate_assimilation(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule = Rule([{"cons": "+", "low": "+"}], [{"voice": "-"}], [{"voice": "-"}], [], obligatory=False)
    rule_set = RuleSet([rule])
    self.assertCountEqual(rule_set.get_outputs_of_word("tz"), ["tz"])