コード例 #1
0
    def test_get_log_lines(self):
        """Build an HMM from a literal state table and print it, then its log lines."""
        state_table = {
            'q0': ['q1'],
            'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
            'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
            'q3': (['qf'], ['as', 'ak', 'at']),
        }
        hmm = HMM(state_table)
        print(hmm)

        # Print every log line on its own line, in the order the HMM reports them.
        log_lines = hmm.get_log_lines()
        for log_line in log_lines:
            print(log_line)
コード例 #2
0
    def test_advance_emission(self):
        """Run ``advance_emission`` on a single-state HMM, dumping dot files before and after."""
        # The emission list is assembled from three groups, in this order.
        long_words = ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']
        short_words = ['zo', 'go', 'do']
        emissions = long_words + short_words + ['at']

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q1', 'qf'], emissions),
        })

        self.write_to_dot_to_file(hmm, "pre_advance_emission_hmm")
        hmm.advance_emission()

        # Print the log lines of the HMM after the operation, one per line.
        log_lines = hmm.get_log_lines()
        for log_line in log_lines:
            print(log_line)

        self.write_to_dot_to_file(hmm, "advance_emission_hmm")
コード例 #3
0
    def test_split_then_merge_state(self):
        """Call ``split_state`` then ``merge_states`` on a small HMM, dumping dot files around them."""
        state_table = {
            'q0': ['q1'],
            'q1': (['qf'], ['koko', 'gogo']),
        }
        hmm = HMM(state_table)

        self.write_to_dot_to_file(hmm, "split_states_before")

        # Split first, then merge; the log lines are printed only after both calls.
        hmm.split_state()
        hmm.merge_states()
        log_lines = hmm.get_log_lines()
        for log_line in log_lines:
            print(log_line)

        self.write_to_dot_to_file(hmm, "split_states_after")
コード例 #4
0
    def test_merge_emissions(self):
        """Call ``merge_emissions`` on a two-branch HMM, dumping dot files before and after."""
        state_table = {
            'q0': ['q1', 'q5'],
            'q1': (['qf'], ['koko']),
            'q5': (['qf'], ['dag', 'kat']),
        }
        hmm = HMM(state_table)

        self.write_to_dot_to_file(hmm, "merge_states_before")
        hmm.merge_emissions()

        # Print the log lines of the HMM after the operation, one per line.
        log_lines = hmm.get_log_lines()
        for log_line in log_lines:
            print(log_line)

        self.write_to_dot_to_file(hmm, "merge_states_after")
コード例 #5
0
class Grammar:
    """An HMM bundled with a rule set and a noise rule set.

    The class builds a transducer for each of its three parts, caches those
    transducers, and composes them on demand. The HMM and rule-set caches are
    dropped whenever the corresponding part is successfully mutated.
    """

    def __init__(self, hmm, rule_set=None):
        """Create a grammar.

        :param hmm: an ``HMM`` instance, or any value accepted by the ``HMM``
            constructor (it is wrapped with ``HMM(hmm)``).
        :param rule_set: the rule set to use. Any falsy value (including the
            default ``None``) is replaced by ``RuleSet(noise=False)``.
        """
        self.hmm = hmm if isinstance(hmm, HMM) else HMM(hmm)

        segment_table = SegmentTable()
        # + 1 for the delimiter
        self.segment_symbol_length = uniform_encoding.log2(len(segment_table) + 1)

        # NOTE(review): this is a truthiness test, not an ``is None`` test, so
        # a caller-supplied rule set that evaluates as falsy is also replaced.
        self.rule_set = rule_set if rule_set else RuleSet(noise=False)

        # The noise rules always come from the configuration, not the caller.
        noises = configurations.get("NOISE_RULE_SET", [])
        self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noises)

        self._cached_hmm_transducer = None
        self._cached_rule_set_transducer = None
        self._cached_noise_rule_set_transducer = None

    def generate_word(self):
        """Return one randomly chosen rule-set output of a generated HMM emission."""
        emission = self.hmm.generate_emission()
        return choice(self.rule_set.get_outputs_of_word(emission))

    def generate_all_words(self):
        """Return the rule-set outputs of every HMM emission as one flat list."""
        # TODO: I think this also generates noised data. This should be fixed.
        words = []
        for emission in self.hmm.generate_all_emissions():
            words.extend(self.rule_set.get_outputs_of_word(emission))
        return words

    def get_transducer(self, with_noise=True):
        """Return the composition of the HMM, rule-set and noise transducers.

        :param with_noise: when false, the noise rule-set transducer is left
            out of the composition.
        """
        hmm_transducer = self.get_hmm_transducer()

        # When a case name is configured, pass the HMM transducer to ``dot``
        # under a name derived from it.
        if "case_name" in configurations.configurations_dict:
            case_name = configurations.configurations_dict["case_name"]
            dot(hmm_transducer, "{}_hmm_transducer".format(case_name))

        rules_set_transducer = self.get_rule_set_transducer()
        noise_rules_transducer = self.get_noise_rule_set_transducer() if with_noise else None

        return self._compose_grammar_transducers(
            hmm_transducer, rules_set_transducer, noise_rules_transducer
        )

    def _compose_grammar_transducers(self, first_transducer, *other_transducers):
        """Compose ``first_transducer`` with each following transducer, left to right.

        Falsy entries in ``other_transducers`` (e.g. ``None``) are skipped.
        Both operands are arc-sorted on input before each composition; that
        call is made on the objects passed in, so they are sorted in place.
        """
        composed_transducer = first_transducer
        for transducer in other_transducers:
            if not transducer:
                continue
            composed_transducer.arc_sort_input()
            transducer.arc_sort_input()
            composed_transducer = composed_transducer >> transducer
        return composed_transducer

    def get_nfa(self):
        """Return a ``ParsingNFA`` built from the full grammar transducer (noise included)."""
        grammar_pyfst_transducer = self.get_transducer()
        # dot(grammar_pyfst_transducer, "grammar_pyfst_transducer")
        # grammar_pyfst_transducer.remove_epsilon()
        return ParsingNFA.get_from_pyfst_transducer(grammar_pyfst_transducer)

    def make_mutation(self):
        """Mutate the HMM and/or the rule set and report whether anything changed.

        With ``ga_config.MUTATE_BOTH_HMM_AND_RULES`` set, every part whose
        ``EVOLVE_*`` flag is on is mutated. Otherwise a single part is picked
        at random, weighted by ``MUTATE_RULE_SET`` / ``MUTATE_HMM`` (a part
        whose ``EVOLVE_*`` flag is off gets weight 0).

        The cached transducer of each part that was successfully mutated is
        invalidated.

        :return: the success value of the mutation(s); ``False`` when no part
            is eligible for mutation.
        """
        if ga_config.MUTATE_BOTH_HMM_AND_RULES:
            hmm_mutation_successful = False
            rule_set_mutation_successful = False

            if configurations["EVOLVE_HMM"]:
                hmm_mutation_successful = self.hmm.make_mutation()
            if configurations["EVOLVE_RULES"]:
                rule_set_mutation_successful = self.rule_set.make_mutation()

            if hmm_mutation_successful:
                self.invalidate_cached_hmm_transducer()
            if rule_set_mutation_successful:
                self.invalidate_cached_rule_set_transducer()

            return rule_set_mutation_successful or hmm_mutation_successful

        rule_set_mutation_weight = configurations["MUTATE_RULE_SET"] if configurations["EVOLVE_RULES"] else 0
        hmm_mutation_weight = configurations["MUTATE_HMM"] if configurations["EVOLVE_HMM"] else 0

        mutation_weights = [('rule_set', rule_set_mutation_weight),
                            ('hmm', hmm_mutation_weight)]
        weighted_mutatable_object_list = get_weighted_list(mutation_weights)

        # Nothing is eligible (e.g. both EVOLVE_* flags are off): report an
        # unsuccessful mutation instead of letting choice() fail on an empty
        # sequence, matching what the mutate-both branch does in that case.
        if not weighted_mutatable_object_list:
            return False

        object_name_to_mutate = choice(weighted_mutatable_object_list)
        if object_name_to_mutate == 'rule_set':
            object_to_mutate = self.rule_set
            invalidate_cache = self.invalidate_cached_rule_set_transducer
        elif object_name_to_mutate == 'hmm':
            object_to_mutate = self.hmm
            invalidate_cache = self.invalidate_cached_hmm_transducer
        else:
            raise ValueError("unknown mutation target: {!r}".format(object_name_to_mutate))

        mutation_successful = object_to_mutate.make_mutation()
        if mutation_successful:
            invalidate_cache()
        return mutation_successful

    def get_encoding_length(self):
        """Return the tuple ``(hmm_encoding_length, rules_encoding_length)``.

        The HMM length comes from ``get_underspecified_encoding_length`` when
        ``UNDERSPECIFICATION_FLAG`` is configured, and from
        ``get_encoding_length`` otherwise.
        """
        if configurations["UNDERSPECIFICATION_FLAG"]:
            hmm_encoding_length = self.hmm.get_underspecified_encoding_length()
        else:
            hmm_encoding_length = self.hmm.get_encoding_length(
                self.segment_symbol_length,
                restrictions_on_alphabet=configurations["RESTRICTIONS_ON_ALPHABET"])
        rules_encoding_length = self.rule_set.get_encoding_length()
        return hmm_encoding_length, rules_encoding_length

    def generate_word_list(self, n):
        """Return a list of ``n`` words, each produced by ``generate_word``."""
        return [self.generate_word() for _ in range(n)]

    def get_all_outputs(self, with_noise=True):
        """Return the output string of every path through the grammar transducer.

        Epsilon, morpheme-boundary and word-boundary symbols are left out of
        the strings. The transducer is minimized first when
        ``MINIMIZE_TRANSDUCER`` is configured.

        :param with_noise: forwarded to ``get_transducer``.
        """
        transducer = self.get_transducer(with_noise=with_noise)
        if configurations["MINIMIZE_TRANSDUCER"]:
            transducer = self.minimize_transducer(transducer)

        transducer_symbol_table = SegmentTable().transducer_symbol_table
        skipped_symbols = (u"\u03b5", MORPHEME_BOUNDARY, WORD_BOUNDARY)

        outputs = []
        for path in transducer.paths():
            symbols = (transducer_symbol_table.find(arc.olabel) for arc in path)
            outputs.append("".join(symbol for symbol in symbols
                                   if symbol not in skipped_symbols))
        return outputs

    def get_hmm_transducer(self):
        """Return the HMM transducer, building and caching it on first use."""
        if self._cached_hmm_transducer is None:
            self._cached_hmm_transducer = self.hmm.get_transducer()
        return self._cached_hmm_transducer

    def get_rule_set_transducer(self):
        """Return the rule-set transducer, building and caching it on first use.

        The rule-set transducer may itself be ``None``; in that case nothing
        is effectively cached and it is requested again on the next call.
        """
        if self._cached_rule_set_transducer is None:  # rule set transducer may be None
            self._cached_rule_set_transducer = self.rule_set.get_transducer()

        return self._cached_rule_set_transducer

    def get_noise_rule_set_transducer(self):
        """Return the noise rule-set transducer, building and caching it on first use."""
        if self._cached_noise_rule_set_transducer is None:
            self._cached_noise_rule_set_transducer = self.noise_rule_set.get_transducer()
        return self._cached_noise_rule_set_transducer

    def get_log_lines(self):
        """Return the HMM's log lines followed by the rule set's log lines."""
        return self.hmm.get_log_lines() + self.rule_set.get_log_lines()

    def invalidate_cached_hmm_transducer(self):
        """Drop the cached HMM transducer so the next request rebuilds it."""
        self._cached_hmm_transducer = None

    def invalidate_cached_rule_set_transducer(self):
        """Drop the cached rule-set transducer so the next request rebuilds it."""
        self._cached_rule_set_transducer = None

    @staticmethod
    def minimize_transducer(transducer):
        """Project ``transducer`` onto its output, determinize it and minimize the result.

        ``project_output()`` is called on the object that was passed in.
        NOTE(review): if that call works in place, a cached transducer handed
        to this method would be modified - confirm against the FST library.

        :return: the determinized, minimized transducer.
        """
        transducer.project_output()
        transducer = transducer.determinize()
        transducer.minimize()
        return transducer

    def __getstate__(self):
        """Return the instance state for pickling, with the cached transducers cleared."""
        # Don't pickle cached transducers
        state = self.__dict__.copy()
        state['_cached_hmm_transducer'] = None
        state['_cached_rule_set_transducer'] = None
        state['_cached_noise_rule_set_transducer'] = None
        return state