def test_morphology_only(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = [u'tozat', u'tozgoat', u'tozgo', u'tozdoat', u'tozdo', u'tozzoat', u'tozzo', u'toz',
            u'dagat', u'daggoat', u'daggo', u'dagdoat', u'dagdo', u'dagzoat', u'dagzo', u'dag',
            u'gasat', u'gasgoat', u'gasgo', u'gasdoat', u'gasdo', u'gaszoat', u'gaszo', u'gas',
            u'kodat', u'kodgoat', u'kodgo', u'koddoat', u'koddo', u'kodzoat', u'kodzo', u'kod',
            u'katat', u'katgoat', u'katgo', u'katdoat', u'katdo', u'katzoat', u'katzo', u'kat',
            u'dotat', u'dotgoat', u'dotgo', u'dotdoat', u'dotdo', u'dotzoat', u'dotzo', u'dot']

    # target grammar: stem, optional suffix, optional 'at'
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
               'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
               'q3': (['qf'], ['at'])})
    self.configurations.simulation_data = data
    self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 5190)

    # single state
    hmm = HMM({'q0': ['q1'],
               'q1': (['q1', 'qf'],
                      ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'] + ['at'])})
    self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6430)

    # two states
    hmm = HMM({'q0': ['q1'],
               'q1': (['q1', 'q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do']),
               'q2': (['qf'], ['at'])})
    self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6010)

    # from simulation
    hmm = HMM({'q0': ['q1'],
               'q1': (['q1', 'qf'],
                      ['toz', 'do', 'zo', 'gas', 'kod', 'dag', 'at', 'zoat', 'kat', 'go', 'dot'])})

def test_epsilon_emission(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    from fst import EPSILON

    hmm = HMM({'q0': ['q1'],
               'q1': (['q2'], ['dog', 'kat']),
               'q2': (['qf'], ['z', EPSILON])})
    self.write_to_dot_to_file(hmm, 'epsilon_hmm')
    hmm_transducer = hmm.get_transducer()
    self.write_to_dot_to_file(hmm_transducer, 'epsilon_hmm_transducer')

    grammar = Grammar(hmm, None)
    word_1 = 'dog'
    word_2 = 'dogz'
    print(hmm)
    hypothesis = Hypothesis(grammar, [word_1, word_2])
    encoding_length = hypothesis.get_data_encoding_length_by_grammar()
    assert encoding_length == 4.0

    print(hmm.add_epsilon_emission_to_state())
    print(hmm.add_epsilon_emission_to_state())
    print(hmm.add_epsilon_emission_to_state())
    print(hmm.remove_epsilon_emission_from_state())
    print(hmm.remove_epsilon_emission_from_state())
    print(hmm.add_epsilon_emission_to_state())
    self.write_to_dot_to_file(hmm, 'epsilon_hmm_after_mutation')

def test_opacity_two_hypotheses(self):
    from simulations import dag_zook_opacity as simulation
    self.initialise_simulation(simulation)
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'q3'], ['daot', 'dkoz', 'dog', 'dok', 'gdaas', 'gkas', 'kaos', 'kat',
                                     'kood', 'ksoag', 'ogtd', 'oktdo', 'skaz', 'tak', 'tso']),
               'q2': (['qf'], ['go', 'kazka', 'soka', 'ta', EPSILON]),
               'q3': (['qf'], ['da', 'saat', 'tsk', 'zoka'])})
    epenthesis_rule = Rule([], [{'low': '+'}], [{'coronal': '+'}], [{'coronal': '+'}], True)
    assimilation_rule = Rule([{'cons': '+'}], [{'voice': '-'}], [{'voice': '-'}], [], True)
    rule_set = RuleSet([assimilation_rule, epenthesis_rule])
    grammar = Grammar(hmm, rule_set)
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())

def get_energy(self, simulation_case):
    case_name = simulation_case.case_name
    configuration.configurations_dict["case_name"] = case_name

    if isinstance(simulation_case.hmm_dict, HMM):
        hmm = simulation_case.hmm_dict
    else:
        hmm = HMM(simulation_case.hmm_dict)

    if isinstance(simulation_case.flat_rule_set_list, RuleSet):
        rule_set = simulation_case.flat_rule_set_list
    else:
        rule_set_list = []
        for flat_rule in simulation_case.flat_rule_set_list:
            rule_set_list.append(Rule(*flat_rule))
        rule_set = RuleSet(rule_set_list)

    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_to_file(hmm, "hmm_" + case_name)
    self.write_to_dot_to_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()

    if self.target_energy:
        print("{}: {} distance from target: {}".format(case_name, hypothesis.get_recent_energy_signature(),
                                                       energy - self.target_energy))
    else:
        print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy

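# Usage sketch (not part of the original suite): get_energy only reads case_name, hmm_dict and
# flat_rule_set_list from its argument, so any object exposing those attributes can be passed.
# The namedtuple, the grammar values and the flat rule below are illustrative assumptions, not
# the project's actual simulation-case API; `runner` stands for an instance of the class that
# defines get_energy above.
from collections import namedtuple

SimulationCaseSketch = namedtuple('SimulationCaseSketch',
                                  ['case_name', 'hmm_dict', 'flat_rule_set_list'])

def run_example_case(runner):
    case = SimulationCaseSketch(
        case_name='assimilation_target',
        hmm_dict={'q0': ['q1'],
                  'q1': (['q2', 'qf'], ['dag', 'kat']),
                  'q2': (['qf'], ['zo'])},
        # one flat rule in the order used with Rule(*flat_rule) above:
        # target, change, left context, right context, obligatory
        flat_rule_set_list=[[[{'cons': '+'}], [{'voice': '-'}], [{'voice': '-'}], [], True]])
    return runner.get_energy(case)
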
def test_abadnese(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab'])})
    rule = Rule.load([[{"cons": "+"}], [{"labial": "+"}], [{"labial": "+"}], [], True])
    rule_set = RuleSet([rule])
    grammar = Grammar(hmm, rule_set)
    hypothesis = Hypothesis(grammar, data)
    self.assertEqual(hypothesis.get_energy(), 245)

def test_turkish_vowel_harmony(self):
    self.initialise_simulation(turkish_vowel_harmony_new_weights)
    Q2s = ['in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON]
    hmm_dict = {'q0': ['q1'],
                'q1': (['q2'], ['el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi', 'kent',
                                'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi', 'k1z', 's1rtlan',
                                'g0z', 'kurt', 'aj', 'arp']),
                'q2': (['qf'], Q2s)}
    some_hmm = HMM(deepcopy(hmm_dict))
    # [+syll] --> [+back] / [+cont, +back] [-syll]* _
    some_rules = RuleSet([Rule([{"syll": "+"}], [{"back": "+"}],
                               [{"cont": "+", "back": "+"}, {"syll": "-", "kleene": True}], [], True)])
    some_hypo = Hypothesis(Grammar(some_hmm, some_rules))
    # self.assert_equal_no_infs(self.get_target_hypo().get_energy(), some_hypo.get_energy())

def test_simulated_annealing_runtime(self):
    import simulations.turkish_vowel_harmony as current_simulation
    configurations.load_configurations_from_dict(current_simulation.configurations_dict)
    self.initialise_segment_table('turkish_segment_table.txt')

    initial_hmm = None
    initial_rule_set = None
    initial_hypothesis = Hypothesis.create_initial_hypothesis(current_simulation.data,
                                                              initial_hmm, initial_rule_set)

    target_tuple = current_simulation.target_tuple
    data = current_simulation.data
    target_rule_set = RuleSet.load_form_flat_list(target_tuple[1])
    target_hypothesis = Hypothesis.create_hypothesis(HMM(target_tuple[0]), target_rule_set, data)
    target_energy = target_hypothesis.get_energy()

    simulated_annealing = SimulatedAnnealing(initial_hypothesis, target_energy)
    simulated_annealing.before_loop()

    # mutate hypothesis for some time before measuring steps
    for i in range(500):
        simulated_annealing.make_step()

    @timeit_best_of_N
    def make_step_profiled():
        simulated_annealing.make_step()

    make_step_profiled()

def printTrace(self, cell_type, src_sent):
    '''Prints the trace for top entries (as defined by settings.opts.trace_rules) in the cell (for debugging)'''

    traceFile = settings.opts.outFile + ".trace"
    tF = open(traceFile, 'a')
    nbest_cnt = 0
    hypTraceStack = []
    tgt_key = self.calcCandScore(cell_type)
    for entry in self.table[tgt_key][:]:
        tF.write("TRACE_BEGIN\n")
        hypTraceStack.append(entry)
        tF.write("#Input :: %s\n" % (src_sent))
        tF.write("#Output :: %s ||| %s\n" % (Hypothesis.getHypothesis(entry), Hypothesis.getFeatVec(entry)))
        while hypTraceStack:
            trace_entry = hypTraceStack.pop(0)
            for back_pointer in trace_entry.bp:
                hypTraceStack.insert(0, back_pointer)
            inf_entry = trace_entry.inf_entry
            if inf_entry is not None:
                # Non-leaf nodes in derivation
                tF.write("%s ||| %s ||| %s ||| %s\n" % (inf_entry.src, Hypothesis.getHypothesis(inf_entry),
                                                        Hypothesis.getFeatVec(inf_entry), trace_entry.inf_cell))
            else:
                # Leaf nodes in derivation
                tF.write("%s ||| %s ||| %s ||| %s\n" % (trace_entry.src, Hypothesis.getHypothesis(trace_entry),
                                                        Hypothesis.getFeatVec(trace_entry), trace_entry.inf_cell))
        tF.write("TRACE_END\n")
        nbest_cnt += 1
        del hypTraceStack[:]
        if nbest_cnt == settings.opts.trace_rules:
            break
    tF.close()

def notify_facet(self, facet=None, value=None, groupname=None):
    params = {}
    params[facet] = value
    params['max_results'] = 200
    h = Hypothesis(token=self.token)
    rows = list(h.search_all(params))
    rows.sort(key=itemgetter('updated'))
    cache = self.data()
    for row in rows:
        new = False
        anno = HypothesisAnnotation(row)
        if self.type == 'set':
            if anno.id not in cache:
                cache.add(anno.id)
                new = True
        if self.type == 'dict':
            if value not in cache:
                cache[value] = set()
            if anno.id not in cache[value]:
                cache[value].add(anno.id)
                new = True
        if new and anno.id not in self.notified_ids:
            self.notify(anno, groupname=groupname)
            self.notified_ids.append(anno.id)
    self.save(cache)
    return self.notified_ids

def get_energy(self, hmm, rule_set_list, case_name):
    grammar = Grammar(hmm, RuleSet(rule_set_list))
    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()
    print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy

def normal_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.NormalDistribution(characteristics.mean(), characteristics.variance())
    hypothesis.check_hypothesis(sample, hyp_distribution)

def uniform_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.UniformDistribution(characteristics.min(), characteristics.max())
    hypothesis.check_hypothesis(sample, hyp_distribution)

def add_expanded_hyp(ctc_table: np.ndarray,
                     weights: dict,
                     row: int,
                     col: int,
                     candidate_hyp: Hypothesis,
                     parent: (int, int)):
    current_hyp = ctc_table[row, col]
    # NOTE: the weights parameter is overwritten here, so scoring below always uses zeroed weights.
    weights = {"lm_score": 0.0, "null_trailing": 0.0, "null_token_ratio": 0.0}
    if current_hyp:
        score_current = score_hypothesis(current_hyp[0], weights, 0)
        score_candidate = score_hypothesis(candidate_hyp, weights, 0)
        if score_candidate <= score_current:
            return
        # the candidate wins: absorb the existing hypothesis, then replace it in the table
        candidate_hyp.recombine_with(current_hyp[0])
    ctc_table[row, col] = (candidate_hyp, parent)

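# Usage sketch (illustrative, not from the original decoder): add_expanded_hyp expects a table
# whose cells are either empty or (hypothesis, parent_coordinates) tuples. The helper and the
# example weight values below are assumptions; note that the weights argument is currently
# ignored because add_expanded_hyp overwrites it internally.
import numpy as np

def build_empty_ctc_table(n_rows: int, n_cols: int) -> np.ndarray:
    # object dtype so each cell can hold a (Hypothesis, (row, col)) tuple or stay None
    return np.full((n_rows, n_cols), None, dtype=object)

# ctc_table = build_empty_ctc_table(n_rows=10, n_cols=10)
# weights = {"lm_score": 1.0, "null_trailing": 0.5, "null_token_ratio": 0.5}
# add_expanded_hyp(ctc_table, weights, row=0, col=0, candidate_hyp=initial_hyp, parent=(-1, -1))
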
def notify_facet(self, facet=None, value=None, groupname=None):
    params = {'_separate_replies': 'true'}
    params[facet] = value
    params['limit'] = 200
    h_url = Hypothesis().query_url.format(query=urlencode(params))
    # print h_url
    r = None
    if self.token is not None:
        h = Hypothesis(token=self.token)
        r = h.token_authenticated_query(h_url)
    else:
        r = requests.get(h_url).json()
    rows = r['rows']
    rows += r['replies']
    cache = self.data()
    rows.sort(key=itemgetter('updated'))
    for row in rows:
        new = False
        anno = HypothesisAnnotation(row)
        if self.type == 'set':
            if anno.id not in cache:
                cache.add(anno.id)
                new = True
        if self.type == 'dict':
            if value not in cache:
                cache[value] = set()
            if anno.id not in cache[value]:
                cache[value].add(anno.id)
                new = True
        if new and anno.id not in self.notified_ids:
            self.notify(anno, groupname=groupname)
            self.notified_ids.append(anno.id)
    self.save(cache)
    return self.notified_ids

def test_turkish__only_syll_is_the_correct_context(self):
    self.initialise_simulation(turkish_vowel_harmony_new_weights)

    # +syll --> +back
    hmm_dict = {'q0': ['q1'],
                'q1': (['q2'], ['el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi', 'kent',
                                'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi', 'k1z', 's1rtlan',
                                'g0z', 'kurt', 'aj', 'arp']),
                'q2': (['qf'], ['in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON])}
    rule_change = ([{"syll": "+"}], [{"back": "+"}])

    # +syll --> -back
    hmm_dict2 = {'q0': ['q1'],
                 'q1': (['q2'], ['el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi', 'kent',
                                 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi', 'k1z', 's1rtlan',
                                 'g0z', 'kurt', 'aj', 'arp']),
                 'q2': (['qf'], ['1n', 'lar', 's1z', '1', 'tan', 'sal', 'l1k', 'l1', 'a', EPSILON])}
    rule_change2 = ([{"syll": "+"}], [{"back": "-"}])

    target_energy = self.get_target_hypo().get_energy()
    unexpected_context = []
    for feat in 'syll,back,round,high,voice,cont,lateral,son'.split(','):
        for val in ['+', '-']:
            if (feat, val) == ('syll', '-'):
                continue
            for r, change in enumerate([rule_change, rule_change2], start=1):
                for h, hmm in enumerate([hmm_dict, hmm_dict2], start=1):
                    some_hmm = HMM(deepcopy(hmm))
                    rule = change + ([{"syll": "+", "back": change[1][0]['back']},
                                      {feat: val, "kleene": True}], [], True)
                    some_rules = RuleSet([Rule(*rule)])
                    some_hypo = Hypothesis(Grammar(some_hmm, some_rules))
                    if some_hypo.get_energy() <= target_energy:
                        unexpected_context.append({f"hmm{h} rule {r}": {feat: val}})

    assert unexpected_context == [], f"Unexpected kleene context for rule: {unexpected_context}"

def emit_group_rss(self, group=None, groupname=None):
    md = markdown.Markdown()
    from feedgen.feed import FeedGenerator
    fg = FeedGenerator()
    fg.id('https://h.jonudell.info')
    fg.title('Hypothesis group %s' % groupname)
    fg.author({'name': 'Jon Udell', 'email': '*****@*****.**'})
    fg.description("Hypothesis notifications for group %s" % groupname)
    fg.link(href='https://h.jonudell.info/group_rss')
    fg.language('en')
    h = Hypothesis(token=self.token, limit=20)
    ids = self.data()
    annos = []
    for id in ids:
        try:
            anno = h.get_annotation(id)
            assert 'id' in anno.keys()
            annos.append(anno)
        except:
            print('cannot get %s, deleted?' % id)
    annos.sort(key=itemgetter('updated'), reverse=True)
    annos = [HypothesisAnnotation(a) for a in annos]
    for anno in annos:
        ref_user = None
        in_reply_to = None
        root_id = anno.id
        if len(anno.references) > 0:
            try:
                ref_id = anno.references[-1:][0]
                root_id = anno.references[0]
                ref = h.get_annotation(ref_id)
                ref_user = HypothesisAnnotation(ref).user
                in_reply_to = '<p>in reply to %s </p>' % ref_user
            except:
                print("cannot get user for ref_id %s, deleted?" % ref_id)
        fe = fg.add_entry()
        fe.id(anno.id)
        fe.title('%s annotated %s in the group %s at %s ' % (anno.user, anno.doc_title, groupname, anno.updated))
        fe.author({"email": None, "name": anno.user, "uri": None})
        dl = "https://hyp.is/%s" % anno.id
        fe.link({"href": "%s" % dl})
        content = ''
        if ref_user is not None:
            content += in_reply_to
        if anno.exact is not None:
            content += '<p>in reference to: </p> <p> <blockquote><em>%s</em></blockquote></p>' % anno.exact
        content += '<p> %s <a href="https://hyp.is/%s">said</a>: </p> ' % (anno.user, root_id)
        content += '%s ' % md.convert(anno.text)
        if len(anno.tags):
            content += '<p>tags: %s' % ', '.join(anno.tags)
        fe.content(content, type='CDATA')
        dt = dateutil.parser.parse(anno.updated)
        dt_tz = dt.replace(tzinfo=pytz.UTC)
        fe.pubdate(dt_tz)
    rssfeed = fg.rss_str(pretty=True)  # Get the RSS feed as string
    fg.rss_file('%s.xml' % group)  # Write the RSS feed to a file

def get_energy(self, hmm, rule_set_list, case_name):
    grammar = Grammar(hmm, RuleSet(rule_set_list))
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()
    print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy

def laplace_hypothesis(distribution: dist.Distribution, size: int):
    sample = distribution.create_sample(size)
    characteristics = Characteristics(sample)
    hypothesis = Hypothesis()
    hyp_distribution = dist.LaplaceDistribution(characteristics.mean(),
                                                characteristics.variance() / (2 ** 0.5))
    hypothesis.check_hypothesis(sample, hyp_distribution)

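# Usage sketch (illustrative, not from the original module): each helper above draws a sample of
# the requested size and runs a goodness-of-fit check against a candidate distribution fitted
# from the sample's characteristics. The constructor arguments mirror those already used above
# (mean/variance, min/max, location/scale); the concrete parameter values are assumptions.
def run_all_hypothesis_checks(size: int = 100):
    normal_hypothesis(dist.NormalDistribution(0, 1), size)     # normal sample vs. fitted normal
    uniform_hypothesis(dist.UniformDistribution(-1, 1), size)  # uniform sample vs. fitted uniform
    laplace_hypothesis(dist.NormalDistribution(0, 1), size)    # normal sample vs. fitted Laplace
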
def test_crossover(self):
    self.initialise_segment_table("dag_zook_segments_new.txt")
    rule_set_1 = RuleSet([Rule(*[[{"cons": "+"}], [{"voice": "-"}], [{"low": "+"}], [{"cont": "-"}], True])])
    rule_set_2 = RuleSet([Rule(*[[{"cons": "+"}], [{"low": "-"}], [{"voice": "-"}], [], False])])
    plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']

    hmm_1 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
                 'q2': ([FINAL_STATE], ['z'])})
    hmm_2 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2'], ['dog', 'kat']),
                 'q2': (['q3'], ['s']),
                 'q3': ([FINAL_STATE], ['z'])})

    grammar_1 = Grammar(hmm_1, rule_set_1)
    grammar_2 = Grammar(hmm_2, rule_set_2)
    hypothesis_1 = Hypothesis(grammar_1, plural_english_data)
    hypothesis_2 = Hypothesis(grammar_2, plural_english_data)

    offspring_1, offspring_2 = GeneticAlgorithm.crossover(hypothesis_1, hypothesis_2)

    print("*** Parents:\n")
    GeneticAlgorithm.log_hypothesis(hypothesis_1)
    GeneticAlgorithm.log_hypothesis(hypothesis_2)

    print("\n\n*** Offspring:\n")
    GeneticAlgorithm.log_hypothesis(offspring_1)
    GeneticAlgorithm.log_hypothesis(offspring_2)

    offspring_3, offspring_4 = GeneticAlgorithm.crossover(offspring_1, offspring_2)

    print("\n\n*** 2nd gen offspring:\n")
    GeneticAlgorithm.log_hypothesis(offspring_3)
    GeneticAlgorithm.log_hypothesis(offspring_4)

def test_abadnese_no_rule(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])})
    grammar = Grammar(hmm, [])
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 243)

def test_plural_english_hypothesis(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, self.rule_set)
    self.write_to_dot_file(self.rule_set.rules[0].get_transducer(), "plural_english_rule")
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = plural_english_data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 117)

def test_morphology_only2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    data = [u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt', u'tozzook',
            u'tozdodata', u'tozdodaso', u'tozdodakt', u'tozdod',
            u'tozgosata', u'tozgosaso', u'tozgosakt', u'tozgos', u'toz',
            u'dagata', u'dagaso', u'dagakt', u'dagzookata', u'dagzookaso', u'dagzookakt', u'dagzook',
            u'dagdodata', u'dagdodaso', u'dagdodakt', u'dagdod',
            u'daggosata', u'daggosaso', u'daggosakt', u'daggos', u'dag',
            u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso', u'gaszookakt', u'gaszook',
            u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod',
            u'gasgosata', u'gasgosaso', u'gasgosakt', u'gasgos', u'gas',
            u'kodata', u'kodaso', u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt', u'kodzook',
            u'koddodata', u'koddodaso', u'koddodakt', u'koddod',
            u'kodgosata', u'kodgosaso', u'kodgosakt', u'kodgos', u'kod',
            u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso', u'katzookakt', u'katzook',
            u'katdodata', u'katdodaso', u'katdodakt', u'katdod',
            u'katgosata', u'katgosaso', u'katgosakt', u'katgos', u'kat',
            u'dotata', u'dotaso', u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook',
            u'dotdodata', u'dotdodaso', u'dotdodakt', u'dotdod',
            u'dotgosata', u'dotgosaso', u'dotgosakt', u'dotgos', u'dot']
    hmm = HMM({'q0': [u'q1'],
               'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
               'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
               'q3': ([u'qf'], ['aso', 'akt', 'ata'])})
    hypothesis = Hypothesis(Grammar(hmm, []), data)

def inferPosterior(self, likelihood, MCMCOn=True):
    """
    Uses inference engine to compute posterior probability from the
    likelihood and prior (beta distribution).
    """
    posterior = likelihood * self.prior
    # print posterior.sum()
    # posterior /= posterior.sum()
    # posterior = [np.around(i, 4) for i in posterior]

    if MCMCOn:
        samples, h = self.MCMC(posterior, 200000)
        hypMCMC = list(set(h))
        posteriorMCMC = [h.count(i) / float(len(h)) for i in hypMCMC]
        self.hypMCMC = hypMCMC
        self.posteriorsMCMC.append(posteriorMCMC)

        self.evalHypMCMC = list()
        H = Hypothesis(Grid('testGrid'))  # referenced by the eval'd hypothesis strings below
        for h in self.hypMCMC:
            h = h.replace('Then', 'H.Then')
            h = h.replace('And', 'H.And')
            h = h.replace('Or', 'H.Or')
            self.evalHypMCMC.append(eval(h))
    else:
        self.hypMCMC = self.hypotheses
        self.evalHypMCMC = list()
        self.evalHypMCMC = self.evalHypotheses
        self.posteriorsMCMC.append(posterior)

    self.posteriors.append(posterior)

def test_get_parsing_results(self):
    self.initialise_segment_table("abnese_lengthening_segment_table.txt")
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    configurations["LENGTHENING_FLAG"] = True
    configurations["HMM_ENCODING_LENGTH_MULTIPLIER"] = 100
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 20

    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['aabb', 'abb', 'bbaabb', 'aba', 'aaba', 'bbaa'])})
    rule1 = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}], obligatory=True)
    rule2 = Rule([], [{"syll": "+"}], [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
    rule_set = RuleSet([rule1, rule2])
    grammar = Grammar(hmm, rule_set)
    data = [u'baba:a', u'babaab:ab', u'ab:a', u'aab:a', u'aab:ab', u'ab:ab']
    hypothesis = Hypothesis(grammar, data)
    simulated_annealing = SimulatedAnnealing(hypothesis, 0)
    print(simulated_annealing._get_parsing_results())

def test_get_random_hypothesis(self):
    self.configurations["EVOLVE_HMM"] = True
    self.configurations["EVOLVE_RULES"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = ['kats', 'dogz', 'kat', 'dog']
    rand_hypothesis = Hypothesis.get_random_hypothesis(data)
    log_hypothesis(rand_hypothesis)

def test_assimilation_no_rule(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do', 'to', 'so', 'ko'])})
    grammar = Grammar(hmm, [])
    hypothesis = Hypothesis(grammar)
    self.configurations.simulation_data = data
    self.assertEqual(int(hypothesis.get_energy()), 230)

def test_assimilation2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do'])})
    grammar = Grammar(hmm, self.rule_set)
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    for _ in range(10):  # 1.4
        energy = hypothesis.get_energy()

def get_hypothesis_from_log_string(hypothesis_string):
    from grammar import Grammar
    from hypothesis import Hypothesis
    hmm = get_hmm_from_hypothesis_string(hypothesis_string)
    rule_set = get_rule_set_from_hypothesis_string(hypothesis_string)
    grammar = Grammar(hmm, rule_set)
    return Hypothesis(grammar)

def __getRulesFromPT(self, s_rule, span):
    '''Get the rules from the Phrase table and create new entry object for each rule returned by phrase table'''

    tgtLst = PhraseTable.getRuleEntries(s_rule, self.sent_indx)
    newTgtLst = []
    for r_item in tgtLst:
        new_entry = Hypothesis.createFromRule(r_item, span)
        newTgtLst.append(new_entry)
    return newTgtLst

def test_abnese(self):
    self.initialise_segment_table("ab_segment_table.txt")
    self.configurations["BRACKET_TRANSDUCER"] = True
    data = ['bab', 'aabab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['bb', 'aabb'])})
    rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], False)  # e -> a / b _ b
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("bb"))
    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())
    print(hypothesis.get_recent_energy_signature())

def test_abadnese_no_rule(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])})
    grammar = Grammar(hmm, [])
    hypothesis = Hypothesis(grammar, data)
    self.assertEqual(hypothesis.get_energy(), 252)

def decode(self):
    '''Return the best translation for the sentence'''

    # initialize hypothesis stacks
    for i in xrange(self.number_of_foreign_words + 1):
        self.hypothesis_stack[i] = HypothesisStack(MAX_HISTOGRAMS)

    # create initial hypothesis
    hyp_init = Hypothesis.create_initial()
    self.hypothesis_stack[0].push(hyp_init)

    for i in xrange(self.number_of_foreign_words):
        for hyp in self.hypothesis_stack[i]:
            new_hyps = self._generate_new_hypotheses(hyp)
            for new_hyp in new_hyps:
                self.hypothesis_stack[len(new_hyp.get_foreign_covered_indexes())].push(new_hyp)

    return self._get_best_hypothesis()

def forceDecodePrune(self, refsLst, last_cell=False):
    '''Prune the top-level cells for force decoding'''

    left_side = 'S'  # forceDecodePrune() can only be used in 'S' cells for 'S' derivations
    for key in self.table.iterkeys():
        if key[0] != left_side:
            continue
        cand_indx = 0
        for cand in self.table[key][:]:
            matches_ref = False
            cand_tgt = Hypothesis.getHypothesis(cand)
            for ref_i in refsLst:
                if (not last_cell and ref_i.startswith(cand_tgt)) or (last_cell and ref_i == cand_tgt):
                    matches_ref = True
                    break
            if not matches_ref:
                del self.table[key][cand_indx]
            else:
                cand_indx += 1

        # If all the entries are deleted, then there will be no S derivations in the cell;
        # set has_S_tree as False
        if not self.table[key]:
            self.has_S_tree = False

    return self.has_S_tree

def mergeProducts(self):
    'Calculate the top-N derivations by lazily finding the sum of log probs (equivalent to product of probs)'

    # Initializations
    heap_indx = 0
    cb_pop_count = 0
    mP_candLst = []
    for cube_indx, cube_obj in self.cubeDict.iteritems():
        mP_candTup = Cube.getBestItem(cube_obj, cube_indx)
        heapq.heappush(mP_candLst, (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3]))
        heap_indx += 1

    candLst_size = heap_indx
    while candLst_size > 0:
        # get the best item from the heap
        (h_score, h_indx, mP_entry_obj, cube_indx, mP_r) = heapq.heappop(mP_candLst)
        candLst_size -= 1

        # push the best item into coverageHeap from which the N-best list will be extracted
        if mP_entry_obj is not None:
            # @type mP_entry_obj Entry
            entry_exists = Lazy.indexHypothesis(mP_entry_obj.tgt, Hypothesis.getScoreSansLM(mP_entry_obj), h_indx)
            if entry_exists is None or not settings.opts.use_unique_nbest:
                mP_entry_obj.inf_cell = Lazy.cell_span
                if self.cbp_diversity > 0:
                    self.recordDiversity(cube_indx)
                heapq.heappush(self.coverageHeap, (h_score, h_indx, mP_entry_obj, cube_indx, mP_r))
            elif entry_exists == -1:
                pass
            else:
                curr_h_indx = self.getItemIndxInHeap(entry_exists)
                if self.cbp_diversity > 0:
                    self.recordDiversity(cube_indx, self.coverageHeap[curr_h_indx][3])
                self.coverageHeap[curr_h_indx] = (h_score, entry_exists, mP_entry_obj, cube_indx, mP_r)

        if cb_pop_count >= self.bsize and (self.cbp_diversity == 0 or
                (self.cbp_diversity > 0 and self.coverageDict.has_key(cube_indx) and
                 self.coverageDict[cube_indx] >= self.cbp_diversity)):
            continue

        # get the neighbours for the best item from the corresponding cube
        cube_obj = self.cubeDict[cube_indx]
        # @type cube_obj Cube
        neighbours = Cube.xploreNeighbours(cube_obj, cube_indx, mP_r)

        # add the neighbouring entries to the heap
        for mP_candTup in neighbours:
            if mP_candTup[1] is not None:
                entry_exists = Lazy.checkHypothesis(mP_candTup[1].tgt)
                if entry_exists is None:
                    # New hypothesis, increment cbp
                    cb_pop_count += 1
                new_candTup = (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3])
                heapq.heappush(mP_candLst, new_candTup)
                candLst_size += 1
                heap_indx += 1

    # Explore new items for increasing diversity (if required)
    if self.cbp_diversity > 0:
        for c_ind in self.cubeDict.keys():
            diversityItems = []
            cube_obj = self.cubeDict[c_ind]
            if not self.coverageDict.has_key(c_ind):
                diversityItems = Cube.getkItems4Diversity(cube_obj, c_ind, self.cbp_diversity)
            elif self.coverageDict[c_ind] < self.cbp_diversity:
                diversityItems = Cube.getkItems4Diversity(cube_obj, c_ind,
                                                          self.cbp_diversity - self.coverageDict[c_ind])
            else:
                continue
            for mP_candTup in diversityItems:
                new_candTup = (mP_candTup[0], heap_indx, mP_candTup[1], mP_candTup[2], mP_candTup[3])
                heapq.heappush(self.coverageHeap, new_candTup)
                heap_indx += 1

    # extract the N-best list from the coverage heap
    Nbest_size = 0
    NbestLst = []
    self.coverageDict = {}
    heapq.heapify(self.coverageHeap)
    while self.coverageHeap:
        (h_score, h_indx, mP_entry_obj, cube_indx, mP_r) = heapq.heappop(self.coverageHeap)
        if (Nbest_size < self.bsize or
                (Nbest_size >= self.bsize and self.cbp_diversity > 0 and
                 (not self.coverageDict.has_key(cube_indx) or self.coverageDict[cube_indx] < self.cbp_diversity))):
            NbestLst.append(mP_entry_obj)
            Nbest_size += 1
            if self.cbp_diversity > 0:
                if self.coverageDict.has_key(cube_indx):
                    self.coverageDict[cube_indx] += 1
                else:
                    self.coverageDict[cube_indx] = 1

    return NbestLst