    def test_morphology_only(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        data = [u'tozat', u'tozgoat', u'tozgo', u'tozdoat', u'tozdo', u'tozzoat', u'tozzo', u'toz', u'dagat', u'daggoat', u'daggo', u'dagdoat', u'dagdo', u'dagzoat', u'dagzo', u'dag', u'gasat', u'gasgoat', u'gasgo', u'gasdoat', u'gasdo', u'gaszoat', u'gaszo', u'gas', u'kodat', u'kodgoat', u'kodgo', u'koddoat', u'koddo', u'kodzoat', u'kodzo', u'kod', u'katat', u'katgoat', u'katgo', u'katdoat', u'katdo', u'katzoat', u'katzo', u'kat', u'dotat', u'dotgoat', u'dotgo', u'dotdoat', u'dotdo', u'dotzoat', u'dotzo', u'dot']

        # target
        hmm = {'q0': ['q1'],
              'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
              'q2': (['q3','qf'], ['zo', 'go', 'do']),
              'q3': (['qf'], ['at'])}
        self.configurations.simulation_data = data
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 5190)

        # single state
        hmm = HMM({'q0': ['q1'],
              'q1': (['q1', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'] + ['at'])
                })
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6430)


        # two-state
        hmm = {'q0': ['q1'],
              'q1': (['q1', 'q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do']),
              'q2': (['qf'], ['at'])
                }
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6010)

        # from simulation
        hmm = HMM({'q0': ['q1'],
              'q1': (['q1', 'qf'], ['toz', 'do', 'zo', 'gas', 'kod', 'dag',
                                    'at', 'zoat', 'kat', 'go', 'dot'])
                })
    def test_epsilon_emission(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        from fst import EPSILON

        hmm = HMM({'q0': ['q1'],
                   'q1': (['q2'], ['dog', 'kat']),
                   'q2': (['qf'], ['z', EPSILON])
                   })
        self.write_to_dot_to_file(hmm, 'epsilon_hmm')

        hmm_transducer = hmm.get_transducer()
        self.write_to_dot_to_file(hmm_transducer, 'epsilon_hmm_transducer')

        grammar = Grammar(hmm, None)
        word_1 = 'dog'
        word_2 = 'dogz'
        print(hmm)

        hypothesis = Hypothesis(grammar, [word_1, word_2])
        encoding_length = hypothesis.get_data_encoding_length_by_grammar()
        assert encoding_length == 4.0

        print(hmm.add_epsilon_emission_to_state())
        print(hmm.add_epsilon_emission_to_state())
        print(hmm.add_epsilon_emission_to_state())
        print(hmm.remove_epsilon_emission_from_state())
        print(hmm.remove_epsilon_emission_from_state())
        print(hmm.add_epsilon_emission_to_state())

        self.write_to_dot_to_file(hmm, 'epsilon_hmm_after_mutation')
    def test_opacity_two_hypotheses(self):
        from simulations import dag_zook_opacity as simulation
        from fst import EPSILON
        self.initialise_simulation(simulation)
        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2', 'q3'], [
                'daot', 'dkoz', 'dog', 'dok', 'gdaas', 'gkas', 'kaos', 'kat',
                'kood', 'ksoag', 'ogtd', 'oktdo', 'skaz', 'tak', 'tso'
            ]),
            'q2': (['qf'], ['go', 'kazka', 'soka', 'ta', EPSILON]),
            'q3': (['qf'], ['da', 'saat', 'tsk', 'zoka'])
        })

        epenthesis_rule = Rule([], [{'low': '+'}], [{'coronal': '+'}],
                               [{'coronal': '+'}], True)
        assimilation_rule = Rule([{'cons': '+'}], [{'voice': '-'}],
                                 [{'voice': '-'}], [], True)

        rule_set = RuleSet([assimilation_rule, epenthesis_rule])
        grammar = Grammar(hmm, rule_set)
        hypothesis = Hypothesis(grammar)
        print(hypothesis.get_energy())
    def get_energy(self, simulation_case):
        case_name = simulation_case.case_name
        configuration.configurations_dict["case_name"] = case_name
        if isinstance(simulation_case.hmm_dict, HMM):
            hmm = simulation_case.hmm_dict
        else:
            hmm = HMM(simulation_case.hmm_dict)
        if isinstance(simulation_case.flat_rule_set_list, RuleSet):
            rule_set = simulation_case.flat_rule_set_list
        else:
            rule_set_list = []
            for flat_rule in simulation_case.flat_rule_set_list:
                rule_set_list.append(Rule(*flat_rule))
            rule_set = RuleSet(rule_set_list)
        grammar = Grammar(hmm, rule_set)
        self.write_to_dot_to_file(hmm, "hmm_" + case_name)
        self.write_to_dot_to_file(grammar.get_nfa(),
                                  "grammar_nfa_" + case_name)
        hypothesis = Hypothesis(grammar, self.data)
        energy = hypothesis.get_energy()
        if self.target_energy:
            print("{}: {} distance from target: {}".format(
                case_name, hypothesis.get_recent_energy_signature(),
                energy - self.target_energy))
        else:
            print("{}: {}".format(case_name,
                                  hypothesis.get_recent_energy_signature()))
        return energy
    def test_abadnese(self):
        self.initialise_segment_table("abd_segment_table.txt")
        data = [
            'bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
            'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
            'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
            'bbabadab', 'baabadab', 'babbadab'
        ]

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2',
                    'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba',
                            'babba']),
            'q2': (['qf'], ['dba', 'dad', 'dab'])
        })
        rule = Rule.load([[{"cons": "+"}], [{"labial": "+"}],
                          [{"labial": "+"}], [], True])
        rule_set = RuleSet([rule])

        grammar = Grammar(hmm, rule_set)
        hypothesis = Hypothesis(grammar, data)
        self.assertEqual(hypothesis.get_energy(), 245)
    def test_turkish_blah(self):
        self.initialise_simulation(turkish_vowel_harmony_new_weights)
        Q2s = [
            'in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON
        ]
        hmm_dict = {
            'q0': ['q1'],
            'q1': (['q2'], [
                'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
                'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
                'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
            ]),
            'q2': (['qf'], Q2s),
        }
        some_hmm = HMM(deepcopy(hmm_dict))
        some_rules = RuleSet([
            Rule([{"syll": "+"}], [{"back": "+"}],
                 [{"cont": "+", "back": "+"}, {"syll": "-", "kleene": True}],
                 [], True)
        ])

        some_hypo = Hypothesis(Grammar(some_hmm, some_rules))

        self.assert_equal_no_infs(self.get_target_hypo().get_energy(),
                                  some_hypo.get_energy())
    def test_simulated_annealing_runtime(self):
        import simulations.turkish_vowel_harmony as current_simulation
        configurations.load_configurations_from_dict(
            current_simulation.configurations_dict)
        self.initialise_segment_table('turkish_segment_table.txt')

        initial_hmm = None
        initial_rule_set = None
        initial_hypothesis = Hypothesis.create_initial_hypothesis(
            current_simulation.data, initial_hmm, initial_rule_set)
        target_tuple = current_simulation.target_tuple
        data = current_simulation.data
        target_rule_set = RuleSet.load_form_flat_list(target_tuple[1])
        target_hypothesis = Hypothesis.create_hypothesis(
            HMM(target_tuple[0]), target_rule_set, data)
        target_energy = target_hypothesis.get_energy()

        simulated_annealing = SimulatedAnnealing(initial_hypothesis,
                                                 target_energy)
        simulated_annealing.before_loop()

        # mutate hypothesis for some time before measuring steps
        for i in range(500):
            simulated_annealing.make_step()

        @timeit_best_of_N
        def make_step_profiled():
            simulated_annealing.make_step()

        make_step_profiled()
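
timeit_best_of_N is referenced above but never defined in these examples; a minimal sketch of such a decorator (an assumption: run the wrapped function N times and report the best wall-clock time):

import functools
import time

def timeit_best_of_N(func=None, N=5):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            best = float('inf')
            result = None
            for _ in range(N):
                start = time.perf_counter()
                result = f(*args, **kwargs)
                best = min(best, time.perf_counter() - start)
            print("%s: best of %d runs: %.6fs" % (f.__name__, N, best))
            return result
        return wrapper
    # support both @timeit_best_of_N and @timeit_best_of_N(N=...)
    return decorator(func) if func is not None else decorator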
Example #8
    def printTrace(self, cell_type, src_sent):
        '''Prints the trace for top entries (as defined by settings.opts.trace_rules) in the cell (for debugging)'''

        traceFile = settings.opts.outFile + ".trace"
        tF = open(traceFile, 'a')

        nbest_cnt = 0
        hypTraceStack = []
        tgt_key = self.calcCandScore(cell_type)
        for entry in self.table[tgt_key][:]:

            tF.write("TRACE_BEGIN\n")
            hypTraceStack.append(entry)
            tF.write( "#Input  :: %s\n" % (src_sent) )
            tF.write( "#Output :: %s ||| %s\n" % (Hypothesis.getHypothesis(entry), Hypothesis.getFeatVec(entry)) )

            while ( hypTraceStack ):
                trace_entry = hypTraceStack.pop(0)
                for back_pointer in trace_entry.bp:
                    hypTraceStack.insert(0, back_pointer)
                inf_entry = trace_entry.inf_entry
                if inf_entry is not None:   # Non-leaf nodes in derivation
                    tF.write( "%s ||| %s ||| %s ||| %s\n" % ( inf_entry.src, Hypothesis.getHypothesis(inf_entry), Hypothesis.getFeatVec(inf_entry), trace_entry.inf_cell ) )
                else:                       # Leaf nodes in derivation
                    tF.write( "%s ||| %s ||| %s ||| %s\n" % ( trace_entry.src, Hypothesis.getHypothesis(trace_entry), Hypothesis.getFeatVec(trace_entry), trace_entry.inf_cell ) )

            tF.write("TRACE_END\n")
            nbest_cnt += 1
            del hypTraceStack[:]
            if nbest_cnt == settings.opts.trace_rules: break
        tF.close()
Example #9
    def notify_facet(self, facet=None, value=None, groupname=None):
        params = {}
        params[facet] = value
        params['max_results'] = 200
        h = Hypothesis(token=self.token)
        rows = list(h.search_all(params))
        rows.sort(key=itemgetter('updated'))
        cache = self.data()
        for row in rows:
            new = False
            anno = HypothesisAnnotation(row)
            if self.type == 'set':
                if anno.id not in cache:
                    cache.add(anno.id)
                    new = True
            if self.type == 'dict':
                if value not in cache:
                    cache[value] = set()
                if anno.id not in cache[value]:
                    cache[value].add(anno.id)
                    new = True
            if new and anno.id not in self.notified_ids:
                self.notify(anno, groupname=groupname)
                self.notified_ids.append(anno.id)
        self.save(cache)
        return self.notified_ids
Example #10
    def get_energy(self, hmm, rule_set_list, case_name):
        grammar = Grammar(hmm, RuleSet(rule_set_list))
        hypothesis = Hypothesis(grammar, self.data)
        energy = hypothesis.get_energy()
        print("{}: {}".format(case_name,
                              hypothesis.get_recent_energy_signature()))
        return energy
def normal_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.NormalDistribution(characteristics.mean(),
                                               characteristics.variance())
    hypothesis.check_hypothesis(sample, hyp_distribution)
def uniform_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.UniformDistribution(characteristics.min(),
                                                characteristics.max())
    hypothesis.check_hypothesis(sample, hyp_distribution)
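
A usage sketch for the two helpers above (hypothetical: it assumes the dist module exposes a NormalDistribution(mean, variance) class whose instances provide create_sample(size), as the code implies):

true_distribution = dist.NormalDistribution(0.0, 1.0)  # hypothetical parameters
normal_hypothesis(true_distribution, size=100)   # check against a fitted normal
uniform_hypothesis(true_distribution, size=100)  # check against a fitted uniform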
from typing import Tuple

import numpy as np


def add_expanded_hyp(
        ctc_table: np.ndarray,
        weights: dict,
        row: int,
        col: int,
        candidate_hyp: Hypothesis,
        parent: Tuple[int, int]):

    current_hyp = ctc_table[row, col]

    # NOTE: the incoming `weights` argument is shadowed here; scoring below
    # always uses these zeroed weights.
    weights = {
        "lm_score": 0.0,
        "null_trailing": 0.0,
        "null_token_ratio": 0.0
    }

    if current_hyp:
        score_current = score_hypothesis(current_hyp[0], weights, 0)
        score_candidate = score_hypothesis(candidate_hyp, weights, 0)

        if score_candidate <= score_current:
            return

        candidate_hyp.recombine_with(current_hyp[0])

    ctc_table[row, col] = (candidate_hyp, parent)
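
A usage sketch (all names hypothetical) that expands a root hypothesis into an empty cell of a 2-D beam table; since the cell is empty there is no scoring or recombination, and the candidate is simply stored:

table = np.full((4, 4), None, dtype=object)  # empty beam table
root = Hypothesis()  # assumes a no-argument constructor
add_expanded_hyp(table, {}, row=0, col=1, candidate_hyp=root, parent=(0, 0))
assert table[0, 1] == (root, (0, 0))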
Example #14
    def notify_facet(self, facet=None, value=None, groupname=None):
        params = {'_separate_replies': 'true'}
        params[facet] = value
        params['limit'] = 200
        h_url = Hypothesis().query_url.format(query=urlencode(params))
        # print(h_url)
        r = None
        if self.token is not None:
            h = Hypothesis(token=self.token)
            r = h.token_authenticated_query(h_url)
        else:
            r = requests.get(h_url).json()
        rows = r['rows']
        rows += r['replies']
        cache = self.data()
        rows.sort(key=itemgetter('updated'))
        for row in rows:
            new = False
            anno = HypothesisAnnotation(row)
            if self.type == 'set':
                if anno.id not in cache:
                    cache.add(anno.id)
                    new = True
            if self.type == 'dict':
                if value not in cache:
                    cache[value] = set()
                if anno.id not in cache[value]:
                    cache[value].add(anno.id)
                    new = True
            if new and anno.id not in self.notified_ids:
                self.notify(anno, groupname=groupname)
                self.notified_ids.append(anno.id)
        self.save(cache)
        return self.notified_ids
    def test_turkish__only_syll_is_the_correct_context(self):
        self.initialise_simulation(turkish_vowel_harmony_new_weights)

        # +syll --> +back
        hmm_dict = {
            'q0': ['q1'],
            'q1': (['q2'], [
                'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
                'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
                'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
            ]),
            'q2': (['qf'], [
                'in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e',
                EPSILON
            ]),
        }
        rule_change = ([{"syll": "+"}], [{"back": "+"}])

        # +syll --> -back
        hmm_dict2 = {
            'q0': ['q1'],
            'q1': (['q2'], [
                'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
                'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
                'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
            ]),
            'q2': (['qf'], [
                '1n', 'lar', 's1z', '1', 'tan', 'sal', 'l1k', 'l1', 'a',
                EPSILON
            ]),
        }
        rule_change2 = ([{"syll": "+"}], [{"back": "-"}])

        target_energy = self.get_target_hypo().get_energy()
        unexpected_context = []
        for feat in 'syll,back,round,high,voice,cont,lateral,son'.split(','):
            for val in ['+', '-']:
                if (feat, val) == ('syll', '-'):
                    continue
                for r, change in enumerate([rule_change, rule_change2],
                                           start=1):
                    for h, hmm in enumerate([hmm_dict, hmm_dict2], start=1):
                        some_hmm = HMM(deepcopy(hmm))
                        rule = change + ([{
                            "syll": "+",
                            "back": change[1][0]['back']
                        }, {
                            feat: val,
                            "kleene": True
                        }], [], True)
                        some_rules = RuleSet([Rule(*rule)])
                        some_hypo = Hypothesis(Grammar(some_hmm, some_rules))
                        if some_hypo.get_energy() <= target_energy:
                            unexpected_context.append(
                                {f"hmm{h} rule {r}": {feat: val}})

        assert unexpected_context == [], f"Unexpected kleene context for rule: {unexpected_context}"
Example #16
    def emit_group_rss(self, group=None, groupname=None):
        import markdown
        import pytz
        import dateutil.parser
        from operator import itemgetter
        from feedgen.feed import FeedGenerator
        md = markdown.Markdown()
        fg = FeedGenerator()
        fg.id('https://h.jonudell.info')
        fg.title('Hypothesis group %s' % groupname)
        fg.author({'name': 'Jon Udell', 'email': '*****@*****.**'})
        fg.description("Hypothesis notifications for group %s" % groupname)
        fg.link(href='https://h.jonudell.info/group_rss')
        fg.language('en')
        h = Hypothesis(token=self.token, limit=20)
        ids = self.data()
        annos = []
        for id in ids:
            try:
                anno = h.get_annotation(id)
                assert 'id' in anno.keys()
                annos.append(anno)
            except Exception:
                print('cannot get %s, deleted?' % id)
        # sort once, after collecting all annotations, newest first
        annos.sort(key=itemgetter('updated'), reverse=True)
        annos = [HypothesisAnnotation(a) for a in annos]
        for anno in annos:
            ref_user = None
            in_reply_to = None
            root_id = anno.id
            if len(anno.references) > 0:
                try:
                    ref_id = anno.references[-1:][0]
                    root_id = anno.references[0]
                    ref = h.get_annotation(ref_id)
                    ref_user = HypothesisAnnotation(ref).user
                    in_reply_to = '<p>in reply to %s </p>' % ref_user
                except Exception:
                    print("cannot get user for ref_id %s, deleted?" % ref_id)
            fe = fg.add_entry()
            fe.id(anno.id)
            fe.title('%s annotated %s in the group %s at %s ' %
                     (anno.user, anno.doc_title, groupname, anno.updated))
            fe.author({"email": None, "name": anno.user, "uri": None})
            dl = "https://hyp.is/%s" % anno.id
            fe.link({"href": "%s" % dl})
            content = ''
            if ref_user is not None:
                content += in_reply_to
            if anno.exact is not None:
                content += '<p>in reference to: </p> <p> <blockquote><em>%s</em></blockquote></p>' % anno.exact
            content += '<p> %s <a href="https://hyp.is/%s">said</a>: </p> ' % (
                anno.user, root_id)
            content += '%s ' % md.convert(anno.text)
            if len(anno.tags):
                content += '<p>tags: %s' % ', '.join(anno.tags)
            fe.content(content, type='CDATA')
            dt = dateutil.parser.parse(anno.updated)
            dt_tz = dt.replace(tzinfo=pytz.UTC)
            fe.pubdate(dt_tz)

        rssfeed = fg.rss_str(pretty=True)  # Get the RSS feed as string
        fg.rss_file('%s.xml' % group)  # Write the RSS feed to a file
    def get_energy(self, hmm, rule_set_list, case_name):
        grammar = Grammar(hmm, RuleSet(rule_set_list))
        self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
        hypothesis = Hypothesis(grammar, self.data)
        energy = hypothesis.get_energy()
        print("{}: {}".format(case_name,
                              hypothesis.get_recent_energy_signature()))
        return energy
def laplace_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.LaplaceDistribution(
        characteristics.mean(),
        characteristics.variance() / (2**0.5))
    hypothesis.check_hypothesis(sample, hyp_distribution)
Example #19
    def test_crossover(self):
        self.initialise_segment_table("dag_zook_segments_new.txt")
        rule_set_1 = RuleSet([
            Rule(*[[{"cons": "+"}], [{"voice": "-"}], [{"low": "+"}],
                   [{"cont": "-"}], True])
        ])
        rule_set_2 = RuleSet([
            Rule(*[[{"cons": "+"}], [{"low": "-"}], [{"voice": "-"}], [],
                   False])
        ])
        plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
        hmm_1 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
            'q2': ([FINAL_STATE], ['z'])
        })
        hmm_2 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2'], ['dog', 'kat']),
            'q2': (['q3'], ['s']),
            'q3': ([FINAL_STATE], ['z'])
        })

        grammar_1 = Grammar(hmm_1, rule_set_1)
        grammar_2 = Grammar(hmm_2, rule_set_2)

        hypothesis_1 = Hypothesis(grammar_1, plural_english_data)
        hypothesis_2 = Hypothesis(grammar_2, plural_english_data)
        offspring_1, offspring_2 = GeneticAlgorithm.crossover(
            hypothesis_1, hypothesis_2)

        print("*** Parents:\n")
        GeneticAlgorithm.log_hypothesis(hypothesis_1)
        GeneticAlgorithm.log_hypothesis(hypothesis_2)

        print("\n\n*** Offspring:\n")
        GeneticAlgorithm.log_hypothesis(offspring_1)
        GeneticAlgorithm.log_hypothesis(offspring_2)

        offspring_3, offspring_4 = GeneticAlgorithm.crossover(
            offspring_1, offspring_2)

        print("\n\n*** 2nd gen offspring:\n")
        GeneticAlgorithm.log_hypothesis(offspring_3)
        GeneticAlgorithm.log_hypothesis(offspring_4)
    def test_abadnese_no_rule(self):
        self.initialise_segment_table("abd_segment_table.txt")
        data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
                'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
                'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
                'bbabadab', 'baabadab', 'babbadab']

        hmm = HMM({'q0': ['q1'],
              'q1': (['q2'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
              'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])})

        grammar = Grammar(hmm, [])
        self.configurations.simulation_data = data
        hypothesis = Hypothesis(grammar)
        self.assertEqual(int(hypothesis.get_energy()), 243)
    def test_plural_english_hypothesis(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        self.rule_set = self.get_rule_set("plural_english_rule_set.json")
        plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
        hmm = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
                 'q2': ([FINAL_STATE], ['z'])})

        grammar = Grammar(hmm, self.rule_set)
        self.write_to_dot_file(self.rule_set.rules[0].get_transducer(), "plural_english_rule")
        self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
        self.configurations.simulation_data = plural_english_data
        hypothesis = Hypothesis(grammar)
        self.assertEqual(int(hypothesis.get_energy()), 117)
    def test_morphology_only2(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
        data = [
            u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso',
            u'tozzookakt', u'tozzook', u'tozdodata', u'tozdodaso',
            u'tozdodakt', u'tozdod', u'tozgosata', u'tozgosaso', u'tozgosakt',
            u'tozgos', u'toz', u'dagata', u'dagaso', u'dagakt', u'dagzookata',
            u'dagzookaso', u'dagzookakt', u'dagzook', u'dagdodata',
            u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso',
            u'daggosakt', u'daggos', u'dag', u'gasata', u'gasaso', u'gasakt',
            u'gaszookata', u'gaszookaso', u'gaszookakt', u'gaszook',
            u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod', u'gasgosata',
            u'gasgosaso', u'gasgosakt', u'gasgos', u'gas', u'kodata',
            u'kodaso', u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt',
            u'kodzook', u'koddodata', u'koddodaso', u'koddodakt', u'koddod',
            u'kodgosata', u'kodgosaso', u'kodgosakt', u'kodgos', u'kod',
            u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso',
            u'katzookakt', u'katzook', u'katdodata', u'katdodaso',
            u'katdodakt', u'katdod', u'katgosata', u'katgosaso', u'katgosakt',
            u'katgos', u'kat', u'dotata', u'dotaso', u'dotakt', u'dotzookata',
            u'dotzookaso', u'dotzookakt', u'dotzook', u'dotdodata',
            u'dotdodaso', u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso',
            u'dotgosakt', u'dotgos', u'dot'
        ]
        hmm = HMM({
            'q0': [u'q1'],
            'q1': ([u'q2', u'q3',
                    u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
            'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
            'q3': ([u'qf'], ['aso', 'akt', 'ata'])
        })

        hypothesis = Hypothesis(Grammar(hmm, []), data)
Example #23
    def inferPosterior(self, likelihood, MCMCOn=True):
        """
        Uses inference engine to compute posterior probability from the
        likelihood and prior (beta distribution).
        """

        posterior = likelihood * self.prior
        # print posterior.sum()
        # posterior /= posterior.sum()
        # posterior = [np.around(i,4) for i in posterior]
        if MCMCOn:
            samples, h = self.MCMC(posterior, 200000)
            hypMCMC = list(set(h))
            posteriorMCMC = [h.count(i) / float(len(h)) for i in hypMCMC]
            self.hypMCMC = hypMCMC
            self.posteriorsMCMC.append(posteriorMCMC)

            self.evalHypMCMC = list()
            H = Hypothesis(Grid('testGrid'))
            for h in self.hypMCMC:
                h = h.replace('Then', 'H.Then')
                h = h.replace('And', 'H.And')
                h = h.replace('Or', 'H.Or')
                self.evalHypMCMC.append(eval(h))

        else:
            self.hypMCMC = self.hypotheses
            self.evalHypMCMC = list()
            self.evalHypMCMC = self.evalHypotheses
            self.posteriorsMCMC.append(posterior)

        self.posteriors.append(posterior)
    def test_get_parsing_results(self):
        self.initialise_segment_table("abnese_lengthening_segment_table.txt")
        configurations["MORPHEME_BOUNDARY_FLAG"] = True
        configurations["LENGTHENING_FLAG"] = True
        configurations["HMM_ENCODING_LENGTH_MULTIPLIER"] = 100
        configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 20
        hmm = HMM({
            'q0': ['q1'],
            'q1': (['qf'], ['aabb', 'abb', 'bbaabb', 'aba', 'aaba', 'bbaa'])
        })

        rule1 = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}],
                     obligatory=True)
        rule2 = Rule([], [{"syll": "+"}], [{"cons": "+"}], [{"cons": "+"}],
                     obligatory=True)
        rule_set = RuleSet([rule1, rule2])

        grammar = Grammar(hmm, rule_set)
        data = [
            u'baba:a', u'babaab:ab', u'ab:a', u'aab:a', u'aab:ab', u'ab:ab'
        ]

        hypothesis = Hypothesis(grammar, data)
        simulated_annealing = SimulatedAnnealing(hypothesis, 0)
        print(simulated_annealing._get_parsing_results())
    def test_get_random_hypothesis(self):
        self.configurations["EVOLVE_HMM"] = True
        self.configurations["EVOLVE_RULES"] = True
        self.initialise_segment_table("plural_english_segment_table.txt")
        data = ['kats', 'dogz', 'kat', 'dog']
        rand_hypothesis = Hypothesis.get_random_hypothesis(data)
        log_hypothesis(rand_hypothesis)
    def test_assimilation_no_rule(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        data = ['kat', 'dot',     'dag', 'kod'] + \
               ['katso', 'dotso', 'dagzo', 'kodzo'] + \
               ['katko', 'dotko', 'daggo', 'kodgo'] + \
               ['katto', 'dotto', 'dagdo', 'koddo']

        hmm = HMM({'q0': ['q1'],
              'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
              'q2': (['qf'], ['zo', 'go', 'do', 'to', 'so', 'ko'])
               })

        grammar = Grammar(hmm, [])

        hypothesis = Hypothesis(grammar)
        self.configurations.simulation_data = data
        self.assertEqual(int(hypothesis.get_energy()), 230)
    def test_assimilation2(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        self.rule_set = self.get_rule_set("plural_english_rule_set.json")
        data = ['kat', 'dot',     'dag', 'kod'] + \
               ['katso', 'dotso', 'dagzo', 'kodzo'] + \
               ['katko', 'dotko', 'daggo', 'kodgo'] + \
               ['katto', 'dotto', 'dagdo', 'koddo']

        hmm = HMM({'q0': ['q1'],
              'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
              'q2': (['qf'], ['zo', 'go', 'do'])
               })

        grammar = Grammar(hmm, self.rule_set)
        self.configurations.simulation_data = data
        hypothesis = Hypothesis(grammar)
        for _ in range(10):  #1.4
            energy = hypothesis.get_energy()
Example #28
def get_hypothesis_from_log_string(hypothesis_string):
    from grammar import Grammar
    from hypothesis import Hypothesis

    hmm = get_hmm_from_hypothesis_string(hypothesis_string)
    rule_set = get_rule_set_from_hypothesis_string(hypothesis_string)

    grammar = Grammar(hmm, rule_set)
    return Hypothesis(grammar)
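
A usage sketch (the path is hypothetical; the string must be in whatever format get_hmm_from_hypothesis_string and get_rule_set_from_hypothesis_string parse):

with open("simulation.log") as log_file:  # hypothetical path
    hypothesis = get_hypothesis_from_log_string(log_file.read())
print(hypothesis.get_energy())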
Example #29
    def __getRulesFromPT(self, s_rule, span):
        ''' Get the rules from the Phrase table and create new entry object for each rule returned by phrase table '''

        tgtLst = PhraseTable.getRuleEntries(s_rule, self.sent_indx)
        newTgtLst = []
        for r_item in tgtLst:
            new_entry = Hypothesis.createFromRule(r_item, span)
            newTgtLst.append(new_entry)

        return newTgtLst
    def test_abnese(self):
        self.initialise_segment_table("ab_segment_table.txt")
        self.configurations["BRACKET_TRANSDUCER"] = True
        data = ['bab', 'aabab']

        hmm = HMM( {'q0': ['q1'],
              'q1': (['qf'], ['bb', 'aabb'])
              })
        rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], False)  # e->a / b_b
        rule_set = RuleSet([rule])

        print(rule_set.get_outputs_of_word("bb"))

        grammar = Grammar(hmm, rule_set)
        self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
        self.configurations.simulation_data = data
        hypothesis = Hypothesis(grammar)

        print(hypothesis.get_energy())
        print(hypothesis.get_recent_energy_signature())
    def test_abadnese_no_rule(self):
        self.initialise_segment_table("abd_segment_table.txt")
        data = [
            'bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
            'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
            'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
            'bbabadab', 'baabadab', 'babbadab'
        ]

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2',
                    'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba',
                            'babba']),
            'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])
        })

        grammar = Grammar(hmm, [])
        hypothesis = Hypothesis(grammar, data)
        self.assertEqual(hypothesis.get_energy(), 252)
Example #33
    def decode(self):
        '''
        return the best translation for the sentence
        '''
        # initialize hypothesis stack
        for i in xrange(self.number_of_foreign_words + 1):
            self.hypothesis_stack[i] = HypothesisStack(MAX_HISTOGRAMS)

        # create initial hypothesis
        hyp_init = Hypothesis.create_initial()
        self.hypothesis_stack[0].push(hyp_init)
        for i in xrange(self.number_of_foreign_words):
            for hyp in self.hypothesis_stack[i]:
                new_hyps = self._generate_new_hypotheses(hyp)
                for new_hyp in new_hyps:
                    self.hypothesis_stack[len(new_hyp.get_foreign_covered_indexes())].push(new_hyp)

        return self._get_best_hypothesis()
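
decode() relies on a HypothesisStack with histogram pruning (MAX_HISTOGRAMS); the class is not shown in these examples, so the following is only a minimal sketch, assuming each hypothesis exposes a score attribute:

class HypothesisStack(object):
    '''Keep at most max_size hypotheses; when the stack overflows,
    drop the lowest-scoring one (histogram pruning).'''

    def __init__(self, max_size):
        self.max_size = max_size
        self.hyps = []

    def push(self, hyp):
        self.hyps.append(hyp)
        self.hyps.sort(key=lambda h: h.score, reverse=True)
        del self.hyps[self.max_size:]

    def __iter__(self):
        return iter(self.hyps)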
Example #34
    def forceDecodePrune(self, refsLst, last_cell=False):
        '''Prune the top-level cells for force decoding'''

        left_side = 'S'             # forceDecodePrune() can only be used in 'S' cells for 'S' derivations
        for key in self.table.iterkeys():
            if key[0] != left_side: continue
            cand_indx = 0
            for cand in self.table[key][:]:
                matches_ref = False
                cand_tgt = Hypothesis.getHypothesis(cand)
                for ref_i in refsLst:
                    if (not last_cell and ref_i.startswith(cand_tgt)) or (last_cell and ref_i == cand_tgt):
                        matches_ref = True
                        break
                if not matches_ref: del self.table[key][cand_indx]
                else: cand_indx += 1

            # If all the entries are deleted, then there will be no S derivations in the cell; set has_S_tree as False
            if not self.table[key]: self.has_S_tree = False
        return self.has_S_tree
Example #35
    def mergeProducts(self):
        'Calculate the top-N derivations by lazily finding the sum of log probs (equivalent to product of probs)'

        # Initializations
        heap_indx = 0
        cb_pop_count = 0
        mP_candLst = []

        for cube_indx, cube_obj in self.cubeDict.iteritems():
            mP_candTup = Cube.getBestItem(cube_obj, cube_indx)
            heapq.heappush(mP_candLst, (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3]))
            heap_indx += 1

        candLst_size = heap_indx
        while candLst_size > 0:
            # get the best item from the heap
            (h_score, h_indx, mP_entry_obj, cube_indx, mP_r) = heapq.heappop(mP_candLst)
            candLst_size -= 1

            # push the best item into coverageHeap from which the N-best list will be extracted
            if mP_entry_obj is not None:
                # @type mP_entry_obj Entry
                entry_exists = Lazy.indexHypothesis(mP_entry_obj.tgt, Hypothesis.getScoreSansLM(mP_entry_obj), h_indx)
                if (entry_exists is None or not settings.opts.use_unique_nbest):
                    mP_entry_obj.inf_cell = Lazy.cell_span
                    if self.cbp_diversity > 0: self.recordDiversity(cube_indx)
                    heapq.heappush(self.coverageHeap, (h_score, h_indx, mP_entry_obj, cube_indx, mP_r))
                elif entry_exists == -1:
                    pass
                else:
                    curr_h_indx = self.getItemIndxInHeap(entry_exists)
                    if self.cbp_diversity > 0: self.recordDiversity(cube_indx, self.coverageHeap[curr_h_indx][3])
                    self.coverageHeap[curr_h_indx] = (h_score, entry_exists, mP_entry_obj, cube_indx, mP_r)

            if cb_pop_count >= self.bsize and (self.cbp_diversity == 0 or ( self.cbp_diversity > 0 and \
                self.coverageDict.has_key(cube_indx) and self.coverageDict[cube_indx] >= self.cbp_diversity )):
                continue

            # get the neighbours for the best item from the corresponding cube
            cube_obj = self.cubeDict[cube_indx]
            # @type cube_obj Cube
            neighbours = Cube.xploreNeighbours(cube_obj, cube_indx, mP_r)

            # add the neighbouring entries to the heap
            for mP_candTup in neighbours:
                if mP_candTup[1] is not None:
                    entry_exists = Lazy.checkHypothesis(mP_candTup[1].tgt)
                    if entry_exists is None:                        # New hypothesis, increment cbp
                        cb_pop_count += 1

                new_candTup = (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3])
                heapq.heappush(mP_candLst, new_candTup)
                candLst_size += 1
                heap_indx += 1

        # Explore new items for increasing diversity (if required)
        if self.cbp_diversity > 0:
            for c_ind in self.cubeDict.keys():
                diversityItems = []
                cube_obj = self.cubeDict[c_ind]
                if not self.coverageDict.has_key(c_ind):
                    diversityItems = Cube.getkItems4Diversity(cube_obj, c_ind, self.cbp_diversity)
                elif self.coverageDict[c_ind] < self.cbp_diversity:
                    diversityItems = Cube.getkItems4Diversity(cube_obj, c_ind, self.cbp_diversity - self.coverageDict[c_ind])
                else: continue

                for mP_candTup in diversityItems:
                    new_candTup = (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3])
                    heapq.heappush(self.coverageHeap, new_candTup)
                    heap_indx += 1

        Nbest_size = 0
        NbestLst = []
        self.coverageDict = {}
        heapq.heapify(self.coverageHeap)
        while self.coverageHeap:
            (h_score, h_indx, mP_entry_obj, cube_indx, mP_r) = heapq.heappop( self.coverageHeap )
            if ( Nbest_size < self.bsize or (Nbest_size >= self.bsize and self.cbp_diversity > 0 and \
                (not self.coverageDict.has_key(cube_indx) or self.coverageDict[cube_indx] < self.cbp_diversity)) ):
                NbestLst.append( mP_entry_obj )
                Nbest_size += 1

                if self.cbp_diversity > 0:
                    if self.coverageDict.has_key(cube_indx): self.coverageDict[cube_indx] += 1
                    else: self.coverageDict[cube_indx] = 1

        return NbestLst