def test_morphology_only(self):
    """Check energy upper bounds for several rule-less morphology grammars.

    The data is every stem combined with every suffix (plus the bare stem).
    Each candidate HMM is paired with an empty rule list and the energy of
    the resulting hypothesis must stay below a fixed bound.
    """
    self.initialise_segment_table("plural_english_segment_table.txt")

    data_stems = [u'toz', u'dag', u'gas', u'kod', u'kat', u'dot']
    data_suffixes = [u'at', u'goat', u'go', u'doat', u'do', u'zoat', u'zo', u'']
    # Stem-major product: all suffixed forms of a stem, then the bare stem.
    data = [stem + suffix for stem in data_stems for suffix in data_suffixes]

    # target
    target_hmm = {
        'q0': ['q1'],
        'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
        'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
        'q3': (['qf'], ['at']),
    }
    self.configurations.simulation_data = data
    target_energy = Hypothesis(Grammar(target_hmm, [])).get_energy()
    self.assertLess(target_energy, 5190)

    # single state
    single_state_hmm = HMM({
        'q0': ['q1'],
        'q1': (
            ['q1', 'qf'],
            ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'] + ['at'],
        ),
    })
    single_state_energy = Hypothesis(Grammar(single_state_hmm, [])).get_energy()
    self.assertLess(single_state_energy, 6430)

    # two state
    two_state_hmm = {
        'q0': ['q1'],
        'q1': (
            ['q1', 'q2', 'qf'],
            ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'],
        ),
        'q2': (['qf'], ['at']),
    }
    two_state_energy = Hypothesis(Grammar(two_state_hmm, [])).get_energy()
    self.assertLess(two_state_energy, 6010)

    # from simulation -- this HMM is only constructed here, nothing is
    # asserted about it in the visible code.
    simulation_hmm = HMM({
        'q0': ['q1'],
        'q1': (
            ['q1', 'qf'],
            ['toz', 'do', 'zo', 'gas', 'kod', 'dag', 'at', 'zoat', 'kat', 'go', 'dot'],
        ),
    })
def generate_samples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

    Loads ``template.html`` and the ``html.txt``, ``css.txt`` and ``js.txt``
    grammars from ``grammar_dir``. If any grammar reports parse errors a
    message is printed and nothing is generated.

    Args:
      grammar_dir: directory to load grammar files from.
      outfiles: A list of output filenames.
    """
    # Context manager so the handle is released even if read() raises.
    with open(os.path.join(grammar_dir, 'template.html')) as template_file:
        template = template_file.read()

    # Parse in the original order (html, css, js) and stop at the first
    # grammar that reports errors.
    grammars = {}
    for kind in ('html', 'css', 'js'):
        grammar = Grammar()
        err = grammar.parse_from_file(os.path.join(grammar_dir, kind + '.txt'))
        if err > 0:
            print('There were errors parsing grammar')
            return
        grammars[kind] = grammar

    htmlgrammar = grammars['html']
    cssgrammar = grammars['css']
    jsgrammar = grammars['js']

    # JS and HTML grammar need access to CSS grammar.
    # Add it as import
    htmlgrammar.add_import('cssgrammar', cssgrammar)
    jsgrammar.add_import('cssgrammar', cssgrammar)

    # ``g`` receives the 1-based index of the sample being generated.
    for index, outfile in enumerate(outfiles, start=1):
        result = generate_new_sample(template, htmlgrammar, cssgrammar,
                                     jsgrammar, g=index, fi=outfile)
        if result is None:
            continue
        print('Writing a sample to ' + outfile)
        try:
            # ``with`` closes the file even when write() fails part-way.
            with open(outfile, 'w') as sample_file:
                sample_file.write(result)
        except IOError:
            print('Error writing to output')
def test_crossover(self):
    """Cross two hypotheses over, then cross their offspring, logging each generation."""
    self.initialise_segment_table("dag_zook_segments_new.txt")

    rule_set_1 = RuleSet([
        Rule([{"cons": "+"}], [{"voice": "-"}], [{"low": "+"}], [{"cont": "-"}], True),
    ])
    rule_set_2 = RuleSet([
        Rule([{"cons": "+"}], [{"low": "-"}], [{"voice": "-"}], [], False),
    ])

    plural_english_data = ['kats', 'dogz', 'kat', 'dog']

    hmm_1 = HMM({
        INITIAL_STATE: ['q1'],
        'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
        'q2': ([FINAL_STATE], ['z']),
    })
    hmm_2 = HMM({
        INITIAL_STATE: ['q1'],
        'q1': (['q2'], ['dog', 'kat']),
        'q2': (['q3'], ['s']),
        'q3': ([FINAL_STATE], ['z']),
    })

    parent_1 = Hypothesis(Grammar(hmm_1, rule_set_1), plural_english_data)
    parent_2 = Hypothesis(Grammar(hmm_2, rule_set_2), plural_english_data)

    # First generation
    child_1, child_2 = GeneticAlgorithm.crossover(parent_1, parent_2)

    print("*** Parents:\n")
    for hypothesis in (parent_1, parent_2):
        GeneticAlgorithm.log_hypothesis(hypothesis)

    print("\n\n*** Offspring:\n")
    for hypothesis in (child_1, child_2):
        GeneticAlgorithm.log_hypothesis(hypothesis)

    # Second generation: cross the offspring with each other
    grandchild_1, grandchild_2 = GeneticAlgorithm.crossover(child_1, child_2)

    print("\n\n*** 2nd gen offspring:\n")
    for hypothesis in (grandchild_1, grandchild_2):
        GeneticAlgorithm.log_hypothesis(hypothesis)
def test_crossover(self):
    """Swap the context/change parts of two rules and print the resulting energies."""
    from copy import deepcopy

    rule_1 = Rule.load([
        [{'cont': '+'}],
        [{'coronal': '-'}],
        [{'coronal': '-'}],
        [],
        True,
    ])
    rule_2 = Rule.load([
        [{'cons': '+', 'low': '-'}],
        [{'voice': '-'}],
        [{'voice': '-'}],
        [],
        True,
    ])

    crossover_rule_1 = deepcopy(rule_1)
    crossover_rule_2 = deepcopy(rule_2)

    # Each copy takes these three parts from the *other* original rule.
    swapped_parts = (
        'left_context_feature_bundle_list',
        'right_context_feature_bundle_list',
        'change_feature_bundle_list',
    )
    for part in swapped_parts:
        setattr(crossover_rule_1, part, getattr(rule_2, part))
    for part in swapped_parts:
        setattr(crossover_rule_2, part, getattr(rule_1, part))

    rule_set_1 = RuleSet([crossover_rule_1])
    rule_set_2 = RuleSet([crossover_rule_2])
    print(rule_set_1)
    print(rule_set_2)

    # Both grammars share the same HMM object.
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
        'q2': (['qf'], ['zo', 'go', 'do']),
    })
    grammar_1 = Grammar(hmm, rule_set_1)
    grammar_2 = Grammar(hmm, rule_set_2)

    data = [
        'kat', 'dot', 'dag', 'kod',
        'katso', 'dotso', 'dagzo', 'kodzo',
        'katko', 'dotko', 'daggo', 'kodgo',
        'katto', 'dotto', 'dagdo', 'koddo',
    ]

    hypothesis_1 = Hypothesis(grammar_1, data)
    hypothesis_2 = Hypothesis(grammar_2, data)
    print(hypothesis_1.get_energy())
    print(hypothesis_2.get_energy())
def crossover(self, h1, h2):
    """Produce two offspring hypotheses by crossing over ``h1`` and ``h2``.

    If the pair is rejected by ``GeneticAlgorithm._is_incest`` the parents
    are returned untouched. Otherwise both parents are deep-copied and the
    rule sets and/or HMMs of the copies are crossed over, depending on
    ``ga_config.CROSSOVER_BOTH_HMM_AND_RULES`` and the ``EVOLVE_RULES`` /
    ``EVOLVE_HMM`` configuration entries.

    Returns:
      A 2-tuple ``(offspring_1, offspring_2)``.
    """
    if GeneticAlgorithm._is_incest(h1, h2):
        return (h1, h2)

    if ga_config.CROSSOVER_BOTH_HMM_AND_RULES:
        # Cross over every component that is allowed to evolve.
        crossover_rules = bool(configurations["EVOLVE_RULES"])
        crossover_hmm = bool(configurations["EVOLVE_HMM"])
    else:
        # Pick exactly one evolvable component at random.
        loci = [
            locus_name
            for locus_name, flag in (('rules', "EVOLVE_RULES"), ('hmm', "EVOLVE_HMM"))
            if configurations[flag]
        ]
        locus = random.choice(loci)
        crossover_rules = locus == 'rules'
        crossover_hmm = locus == 'hmm'

    offspring_1 = deepcopy(h1)
    offspring_2 = deepcopy(h2)

    rule_set_a = offspring_1.grammar.rule_set
    rule_set_b = offspring_2.grammar.rule_set
    if crossover_rules:
        rule_set_a, rule_set_b = RuleSet.crossover(rule_set_a, rule_set_b)

    hmm_a = offspring_1.grammar.hmm
    hmm_b = offspring_2.grammar.hmm
    if crossover_hmm:
        hmm_a, hmm_b = HMM.crossover(hmm_a, hmm_b)

    offspring_1.grammar = Grammar(hmm_a, rule_set_a)
    offspring_2.grammar = Grammar(hmm_b, rule_set_b)

    # Invalidate mutated offspring fitness value (i.e. mark for re-evaluation)
    for offspring in (offspring_1, offspring_2):
        offspring.invalidate_fitness()
        offspring.invalidate_energy()

    return (offspring_1, offspring_2)
def GenerateSamples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

    Loads ``template.html`` and the ``html.txt``, ``css.txt`` and ``js.txt``
    grammars from ``grammar_dir``. If any grammar reports parse errors a
    message is printed and nothing is generated.

    Args:
      grammar_dir: directory to load grammar files from.
      outfiles: A list of output filenames.
    """
    # Context manager so the handle is released even if read() raises.
    with open(os.path.join(grammar_dir, 'template.html')) as template_file:
        template = template_file.read()

    # Parse in the original order (html, css, js) and stop at the first
    # grammar that reports errors.
    grammars = {}
    for kind in ('html', 'css', 'js'):
        grammar = Grammar()
        err = grammar.ParseFromFile(os.path.join(grammar_dir, kind + '.txt'))
        if err > 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('There were errors parsing grammar')
            return
        grammars[kind] = grammar

    htmlgrammar = grammars['html']
    cssgrammar = grammars['css']
    jsgrammar = grammars['js']

    # JS and HTML grammar need access to CSS grammar.
    # Add it as import
    htmlgrammar.AddImport('cssgrammar', cssgrammar)
    jsgrammar.AddImport('cssgrammar', cssgrammar)

    for outfile in outfiles:
        result = GenerateNewSample(template, htmlgrammar, cssgrammar, jsgrammar)
        if result is None:
            continue
        print('Writing a sample to ' + outfile)
        try:
            # ``with`` closes the file even when write() fails part-way.
            with open(outfile, 'w') as sample_file:
                sample_file.write(result)
        except IOError:
            print('Error writing to output')
def test_parsing(self):
    """Check the number of parses the arithmetic grammar yields per example."""
    # Ignoring semantics for now...
    numeral_rules = [Rule('$E', numeral) for numeral in ('one', 'two', 'three', 'four')]

    operator_rules = [Rule('$UnOp', 'minus')]
    operator_rules += [Rule('$BinOp', operator) for operator in ('plus', 'minus', 'times')]

    compositional_rules = [
        Rule(lhs, rhs)
        for lhs, rhs in (
            ('$E', '$UnOp $E'),
            ('$EBO', '$E $BinOp'),
            ('$E', '$EBO $E'),
        )
    ]

    arithmetic_grammar = Grammar(numeral_rules + operator_rules + compositional_rules)

    # Unambiguous inputs must yield exactly one parse.
    for example in self.one_parse_examples:
        parses = arithmetic_grammar.parse(example.input)
        self.assertEqual(1, len(parses), example)

    # Ambiguous inputs must yield exactly two parses.
    for example in self.two_parse_examples:
        parses = arithmetic_grammar.parse(example.input)
        self.assertEqual(2, len(parses), example)
def test_learning_from_many_denotations(self):
    """Train on the large dev set and check denotation accuracy on it.

    ``arithmetic_dev_examples`` is the training set; the examples from
    index 13 onwards of (two-parse + one-parse examples) form the test set.

    before_train / after_train: metrics on the training set before and
    after training.
    denotation accuracy: number of examples where the denotation of the
    parse at position 0 was correct.
    """
    grammar = Grammar(self.arithmetic_rules)
    all_examples = self.two_parse_examples + self.one_parse_examples

    from executor import Executor
    model = Model(
        grammar=grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=defaultdict(float),  # every weight starts at zero
        executor=Executor.execute,
    )

    from metrics import DenotationAccuracyMetric
    from arithmetic import arithmetic_dev_examples
    before_train, before_test, after_train, after_test = model.train_test(
        train_examples=arithmetic_dev_examples,
        test_examples=all_examples[13:],
        training_metric=DenotationAccuracyMetric(),
        seed=1,
    )

    # BEFORE SGD
    self.assertEqual(before_train['denotation accuracy'], 64)

    # AFTER SGD
    self.assertEqual(after_train['denotation accuracy'], 92)  # Improvement
def test_turkish_blah(self):
    """Build a stem+suffix HMM with one rule and wrap it in a hypothesis.

    Nothing is asserted in the visible code; the energy comparison against
    the target hypothesis is disabled.
    """
    self.initialise_simulation(turkish_vowel_harmony_new_weights)

    # Emissions of state q2 (includes the empty emission).
    suffix_emissions = [
        'in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON,
    ]
    # Emissions of state q1.
    stem_emissions = [
        'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
        'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi', 'k1z',
        's1rtlan', 'g0z', 'kurt', 'aj', 'arp',
    ]
    hmm_dict = {
        'q0': ['q1'],
        'q1': (['q2'], stem_emissions),
        'q2': (['qf'], suffix_emissions),
    }
    # The HMM gets its own copy of the dictionary.
    some_hmm = HMM(deepcopy(hmm_dict))

    harmony_rule = Rule(
        [{"syll": "+"}],
        [{"back": "+"}],
        [{"cont": "+", "back": "+"}, {"syll": "-", "kleene": True}],
        [],
        True,
    )
    some_rules = RuleSet([harmony_rule])

    some_hypo = Hypothesis(Grammar(some_hmm, some_rules))
    # self.assert_equal_no_infs(self.get_target_hypo().get_energy(), some_hypo.get_energy())
def generate_samples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

    Loads ``ox_template.html`` and the ``oxjs.txt`` grammar from
    ``grammar_dir``. If the grammar reports parse errors a message is
    printed and nothing is generated.

    Args:
      grammar_dir: directory to load grammar files from.
      outfiles: A list of output filenames.
    """
    # Context manager so the handle is released even if read() raises.
    with open(os.path.join(grammar_dir, 'ox_template.html')) as template_file:
        template = template_file.read()

    jsgrammar = Grammar()
    err = jsgrammar.parse_from_file(os.path.join(grammar_dir, 'oxjs.txt'))
    if err > 0:
        print('There were errors parsing grammar')
        return

    for outfile in outfiles:
        result = generate_new_sample(template, jsgrammar)
        if result is None:
            continue
        print('Writing a sample to ' + outfile)
        try:
            # ``with`` closes the file even when write() fails part-way.
            with open(outfile, 'w') as sample_file:
                sample_file.write(result)
        except IOError:
            print('Error writing to output')
def test_first(self):
    """Grammar: init should compute First sets"""
    for rules, expected_first in self.known_firsts:
        grammar = Grammar(rules)
        # Same set of symbols on both sides ...
        self.assertEqual(expected_first.keys(), grammar.first.keys())
        # ... and the same First set for every symbol.
        for symbol, expected_set in expected_first.items():
            self.assertEqual(expected_set, grammar.first[symbol])
def test_parser_kleene(self):
    """Dump the HMM transducer and grammar NFA for a rule with a kleene-marked bundle."""
    hmm = HMM({
        INITIAL_STATE: ['q1'],
        'q1': (['q2', FINAL_STATE], ['at', 'attstktttt', 'st']),
        'q2': ([FINAL_STATE], ['o']),
    })
    self.write_to_dot_to_file(hmm.get_transducer(), "test_hmm_transducer_kleene")

    # The third argument carries a bundle flagged with "kleene".
    kleene_context = [{"cons": "-"}, {"cons": "+", "kleene": True}]
    assimilation_rule_with_kleene = Rule(
        [{"cons": "-"}],
        [{"low": "+"}],
        kleene_context,
        [],
        obligatory=True,
    )

    grammar = Grammar(hmm, RuleSet([assimilation_rule_with_kleene]))
    self.write_to_dot_to_file(grammar.get_nfa(), "test_parser_nfa_kleene")
def delete_nonderivable_nonterminals(grammar):
    """Return a new grammar restricted to nonterminals reachable from the axiom.

    Starting from ``grammar.axiom``, follows the right-hand sides of the
    rules found by ``find_rules_for_nonterminal`` and collects every
    ``Nonterminal`` encountered. The result keeps the original axiom and
    terminals, the set of reached nonterminals, and only those rules whose
    first left-side symbol was reached.

    Args:
      grammar: the source grammar; it is not modified.

    Returns:
      A new ``Grammar`` instance.
    """
    new_grammar = Grammar()
    new_grammar.axiom = grammar.axiom
    new_grammar.terminals = grammar.terminals

    unwatched = [new_grammar.axiom]  # discovered but not yet expanded
    watched = set()                  # already expanded
    while unwatched:
        # Take the next pending nonterminal and keep the rest of the queue.
        # (The previous ``unwatched.remove(x) or []`` replaced the queue with
        # a fresh empty list on every iteration, because ``list.remove``
        # returns None, so pending nonterminals were silently dropped.)
        nonterminal = unwatched.pop(0)
        watched.add(nonterminal)

        for rule in find_rules_for_nonterminal(grammar.rules, nonterminal):
            for symbol in rule.right_side:
                if (isinstance(symbol, Nonterminal)
                        and symbol not in watched
                        and symbol not in unwatched):
                    unwatched.append(symbol)

    new_grammar.nonterminals = watched
    new_grammar.rules = [
        rule for rule in grammar.rules if rule.left_side[0] in watched
    ]
    return new_grammar
def test_grammar_creation(self):
    """The arithmetic grammar should expose 3 binary and 7 lexical rules."""
    numeral_rules = [Rule('$E', numeral) for numeral in ('one', 'two', 'three', 'four')]

    operator_rules = [Rule('$UnOp', 'minus')]
    operator_rules += [Rule('$BinOp', operator) for operator in ('plus', 'minus', 'times')]

    compositional_rules = [
        Rule(lhs, rhs)
        for lhs, rhs in (
            ('$E', '$UnOp $E'),
            ('$EBO', '$E $BinOp'),
            ('$E', '$EBO $E'),
        )
    ]

    arithmetic_grammar = Grammar(numeral_rules + operator_rules + compositional_rules)

    binary_count = len(arithmetic_grammar.binary_rules)
    lexical_count = len(arithmetic_grammar.lexical_rules)
    self.assertEqual(3, binary_count)
    self.assertEqual(7, lexical_count)
def toGrammar(self):
    """Build a ``Grammar`` with one production string per source state.

    Original author's notes on the construction:

    1. δ(q, a) => Q -> aP
    2. δ(q, a); q in P => Q - a
    3. δ(q0, a) => S -> aP
    4. δ(q0, a) => S -> aP
    5. q0 in F ....

    Every key of ``self.func`` is a ``(source, letter)`` pair mapped to an
    iterable of target states. The initial state is renamed to ``"S"``.
    Alternatives of one source state are joined as ``"x| y| z"``.

    Returns:
      A ``Grammar`` populated via item assignment, ``g[source] = rule``.
    """
    from grammar import Grammar

    def as_symbol(state):
        # Compare by equality: identity (``is``) is unreliable for values
        # such as strings that are equal but distinct objects.
        return "S" if state == self.initial_state else state

    grammar_rules = {}
    for path in self.func:
        source, letter = path
        targets = self.func[path]
        source = as_symbol(source)
        grammar_rules.setdefault(source, "")
        for target in targets:
            target = as_symbol(target)
            # NOTE(review): this membership test uses the already renamed
            # source ("S" for the initial state) -- confirm that is intended.
            if source in self.accepted_sates:
                grammar_rules[source] += f"| {letter}"
            else:
                grammar_rules[source] += f"| {letter}{target}"
        # Drop the leading separator left by the first alternative.
        grammar_rules[source] = grammar_rules[source].strip("|").strip()

    g = Grammar()
    for source, rule in grammar_rules.items():
        g[source] = rule
    return g
def induce_entity_grammar(self, start_grammar):
    """Induce an entity-swapping grammar.

    Entity rules come from the aligned spans of the original dataset.
    Template rules come from ``start_grammar``: every aligned span in a
    rule is spliced out and replaced by a ``<category>_<x start>`` label.

    Returns:
      The newly built Grammar.
    """
    new_grammar = Grammar()

    # Entity rules
    for x, y in self.dataset:
        for cat, x_span, y_span in self.domain.get_entity_alignments(x, y):
            new_grammar.add_rule(
                cat,
                x[x_span[0]:x_span[1]],
                y[y_span[0]:y_span[1]],
            )

    # Root/template rules
    for cat, x_str, y_str in start_grammar.rule_list:
        # Anchor on single mention in x--allow one-to-many x-to-y mapping
        alignments = self.domain.get_entity_alignments(x_str, y_str)

        # Labels are keyed on the x-side start offset for both sides; the
        # x-side swaps are de-duplicated.
        x_pairs = [
            (x_span, '%s_%d' % (inner_cat, x_span[0]))
            for inner_cat, x_span, y_span in alignments
        ]
        x_swaps = list(set(x_pairs))
        x_new = self.splice(x_str, x_swaps)

        y_swaps = [
            (y_span, '%s_%d' % (inner_cat, x_span[0]))
            for inner_cat, x_span, y_span in alignments
        ]
        y_new = self.splice(y_str, y_swaps)

        new_grammar.add_rule(cat, x_new, y_new)

    # new_grammar.print_self()
    return new_grammar
def get_energy(self, simulation_case):
    """Compute, print and return the energy of a simulation case.

    The case's HMM and rule set are used as-is when they are already
    ``HMM`` / ``RuleSet`` instances and are built from the raw description
    otherwise. Dot files for the HMM and the grammar NFA are written as a
    side effect.

    Returns:
      The energy of the hypothesis built from the case and ``self.data``.
    """
    case_name = simulation_case.case_name
    configuration.configurations_dict["case_name"] = case_name

    hmm = simulation_case.hmm_dict
    if not isinstance(hmm, HMM):
        hmm = HMM(hmm)

    rule_set = simulation_case.flat_rule_set_list
    if not isinstance(rule_set, RuleSet):
        # Each flat rule is an argument sequence for Rule(...).
        rule_set = RuleSet([Rule(*flat_rule) for flat_rule in rule_set])

    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_to_file(hmm, "hmm_" + case_name)
    self.write_to_dot_to_file(grammar.get_nfa(), "grammar_nfa_" + case_name)

    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()

    if self.target_energy:
        report = "{}: {} distance from target: {}".format(
            case_name,
            hypothesis.get_recent_energy_signature(),
            energy - self.target_energy)
    else:
        report = "{}: {}".format(
            case_name, hypothesis.get_recent_energy_signature())
    print(report)
    return energy
def test_follow(self):
    """Grammar: init should compute Follow sets"""
    for rules, expected_follow in self.known_follows:
        grammar = Grammar(rules)
        # Same set of symbols on both sides ...
        self.assertEqual(expected_follow.keys(), grammar.follow.keys())
        # ... and the same Follow set for every symbol.
        for symbol, expected_set in expected_follow.items():
            self.assertEqual(expected_set, grammar.follow[symbol])
def grammar_cc() -> Grammar:
    """Return the grammar S -> CC, C -> cC | d over terminals ``c`` and ``d``."""
    productions = [
        Rule('S', 'CC'),
        Rule('C', 'cC'),
        Rule('C', 'd'),
    ]
    return Grammar(
        terminals='cd',
        non_terminals='SC',
        start='S',
        rules=productions,
    )
def test_parse(self):
    """LL1: check parse() against a known result"""
    gram, sentence, ref_leftmost = self.ref_parse
    parser = LL1(Grammar(gram))
    # The sentence is split on whitespace before parsing.
    tokens = sentence.split()
    tree = parser.parse(tokens)
    self.assertEqual(tuple(tree.leftmost()), ref_leftmost)
def untorch(g):
    """Return a grammar whose log-weights are plain numbers.

    If ``g.logVariable`` is already a float (including float subclasses),
    ``g`` is returned unchanged. Otherwise a new ``Grammar`` is built from
    ``g.logVariable.data.tolist()[0]`` and from each production
    ``(l, t, p)`` with ``l`` replaced by ``l.data.tolist()[0]``.
    """
    # isinstance rather than an exact type comparison, so float subclasses
    # are recognised as already-converted values too.
    if isinstance(g.logVariable, float):
        return g
    return Grammar(
        g.logVariable.data.tolist()[0],
        [(l.data.tolist()[0], t, p) for l, t, p in g.productions],
    )
def evaluate_latent_semparse():
    """Run the latent semantic parsing evaluation.

    Side effects visible in this function: extends the entries of the
    module-level ``crude_lexicon`` for the digit words, appends a rule to
    ``rules`` and registers ``lift`` in ``functions`` before building the
    grammar, so repeated calls keep extending those structures.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("======================================================================")
    print('LATENT SEMANTIC PARSING')

    # Only (input, LF root node) pairs for this task; y[0][1] indexes
    # into the semantics of the root node:
    latent_semparse_train = [[x, y[0][1]] for x, y, _ in semdata.sem_train]
    latent_semparse_test = [[x, y[0][1]] for x, y, _ in semdata.sem_test]

    # To make this interesting, we add a rule of type-raising for
    # digits, so that derivations with the predicate neg in them have
    # multiple derivational paths leading to the same output: First,
    # every digit can now be introduced in its lifted form:
    for word in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                 'eight', 'nine'):
        crude_lexicon[word] += [('Q', 'lift(%s)' % i) for i in range(1, 10)]

    # The new rule reversing the order of application between U and
    # its N (qua Q):
    rules.append(['U', 'Q', 'N', (1, 0)])

    # Semantics for lift:
    functions['lift'] = (lambda x: (lambda f: f(x)))

    # New grammar:
    gram = Grammar(crude_lexicon, rules, functions)

    # Now train with LatentSGD, where the output transformation is
    # one that grabs the root node:
    evaluate(phi=phi_sem,
             optimizer=LatentSGD,
             train=latent_semparse_train,
             test=latent_semparse_test,
             classes=gram.gen,
             T=10,
             eta=0.1,
             output_transform=(lambda y: y[0][1]))
def get_energy(self, hmm, rule_set_list, case_name):
    """Build a hypothesis from ``hmm`` and the rules, print and return its energy."""
    rule_set = RuleSet(rule_set_list)
    hypothesis = Hypothesis(Grammar(hmm, rule_set), self.data)
    energy = hypothesis.get_energy()
    # The signature is read after the energy has been computed.
    signature = hypothesis.get_recent_energy_signature()
    print("{}: {}".format(case_name, signature))
    return energy
def test_sequitur(self):
    """Train on "Hello, world!" and compare print_grammar() to the expected line."""
    grammar = Grammar()
    grammar.train_string("Hello, world!")
    expected = "0 --(0)--> H e l l o , _ w o r l d ! \n"
    self.assertEqual(expected, grammar.print_grammar())
def randomize_grammar(cls, nodes_by_type, vocabulary, possible_segments):
    """Return a Grammar with one or two randomly generated rules.

    Each rule gets one randomly chosen feature node, an output affix of
    zero or one randomly chosen segment, and a random sample of root nodes
    as its environment.
    """
    # Randomize rules
    rule_count_bounds = (1, 2)
    feature_nodes_cap = 1
    affix_segments_cap = 1

    rules_num = random.randint(*rule_count_bounds)
    environment_roots_cap = len(nodes_by_type[SyntacticNode.TYPE_ROOT])

    rules = []
    for _ in range(rules_num):
        feature_nodes = random.choices(
            nodes_by_type[SyntacticNode.TYPE_FEATURE],
            k=random.randint(1, feature_nodes_cap))

        # Segments are drawn one at a time, so the same segment can occur
        # more than once in an affix.
        affix_length = random.randint(0, affix_segments_cap)
        output_affix = ''.join(
            [random.choice(possible_segments) for _ in range(affix_length)])

        environment_roots = random.sample(
            nodes_by_type[SyntacticNode.TYPE_ROOT],
            k=random.randint(0, environment_roots_cap))

        rules.append(Rule(feature_nodes, output_affix, environment_roots))

    return Grammar(nodes_by_type, vocabulary, rules)
def test_operator_precedence_features(self):
    """
    See if a count of operator precedence patterns is a good feature for
    ranking parses.
    """
    grammar = Grammar(self.arithmetic_rules)
    parses = grammar.parse("two times two plus three")
    self.assertEqual(2, len(parses))
    first_parse, second_parse = parses[0], parses[1]

    # Look at Parse.operator_precedence_features(). It generates different
    # results for the two parses
    first_features = first_parse.operator_precedence_features()
    second_features = second_parse.operator_precedence_features()

    # In the first parse, + precedes * once
    self.assertEqual(first_features, {('+', '*'): 1.0})
    # In the second parse, * precedes + once
    self.assertEqual(second_features, {('*', '+'): 1.0})

    # Look at Parse.score()
    feature_fn = Parse.operator_precedence_features
    first_score = first_parse.score(feature_fn, self.weights)
    second_score = second_parse.score(feature_fn, self.weights)

    # Parse.operator_precedence_features() is good at distinguishing parses
    self.assertEqual(-1.0, first_score)
    self.assertEqual(1.0, second_score)
def test_epsilon_emission(self):
    """Check the data encoding length with an epsilon emission, then mutate the HMM.

    After the assertion, epsilon emissions are added to and removed from
    the HMM several times; each result is printed and the final HMM is
    written to a dot file.
    """
    self.initialise_segment_table("plural_english_segment_table.txt")
    from fst import EPSILON

    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2'], ['dog', 'kat']),
        'q2': (['qf'], ['z', EPSILON]),
    })
    self.write_to_dot_to_file(hmm, 'epsilon_hmm')
    self.write_to_dot_to_file(hmm.get_transducer(), 'epsilon_hmm_transducer')

    grammar = Grammar(hmm, None)
    words = ['dog', 'dogz']
    print(hmm)

    hypothesis = Hypothesis(grammar, words)
    encoding_length = hypothesis.get_data_encoding_length_by_grammar()
    assert encoding_length == 4.0

    # Mutation sequence: add x3, remove x2, add x1.
    mutations = (
        hmm.add_epsilon_emission_to_state,
        hmm.add_epsilon_emission_to_state,
        hmm.add_epsilon_emission_to_state,
        hmm.remove_epsilon_emission_from_state,
        hmm.remove_epsilon_emission_from_state,
        hmm.add_epsilon_emission_to_state,
    )
    for mutate in mutations:
        print(mutate())

    self.write_to_dot_to_file(hmm, 'epsilon_hmm_after_mutation')
def test_evaluation_with_scoring(self):
    """
    Evaluate the grammar on all examples, collecting metrics:

    semantics oracle accuracy: # of examples where one parse or the other
    was correct.
    semantics accuracy: # of examples where parse at position 0 was correct.
    """
    grammar = Grammar(self.arithmetic_rules)

    from executor import Executor
    model = Model(
        grammar=grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=self.weights,
        executor=Executor.execute,
    )

    from experiment import evaluate_model
    all_examples = self.one_parse_examples + self.two_parse_examples
    metrics = evaluate_model(model=model, examples=all_examples)

    self.assertEqual(metrics['semantics oracle accuracy'], 17)
    self.assertEqual(metrics['semantics accuracy'], 16)  # Improvement
def test_get_parsing_results(self):
    """Print the parsing results of a two-rule lengthening grammar.

    Note: this overwrites four entries of the shared ``configurations``
    mapping and does not restore them.
    """
    self.initialise_segment_table("abnese_lengthening_segment_table.txt")

    overrides = (
        ("MORPHEME_BOUNDARY_FLAG", True),
        ("LENGTHENING_FLAG", True),
        ("HMM_ENCODING_LENGTH_MULTIPLIER", 100),
        ("DATA_ENCODING_LENGTH_MULTIPLIER", 20),
    )
    for key, value in overrides:
        configurations[key] = value

    hmm = HMM({
        'q0': ['q1'],
        'q1': (['qf'], ['aabb', 'abb', 'bbaabb', 'aba', 'aaba', 'bbaa']),
    })

    rule1 = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}], obligatory=True)
    rule2 = Rule([], [{"syll": "+"}], [{"cons": "+"}], [{"cons": "+"}],
                 obligatory=True)
    grammar = Grammar(hmm, RuleSet([rule1, rule2]))

    data = [
        u'baba:a',
        u'babaab:ab',
        u'ab:a',
        u'aab:a',
        u'aab:ab',
        u'ab:ab',
    ]
    annealer = SimulatedAnnealing(Hypothesis(grammar, data), 0)
    print(annealer._get_parsing_results())
def test_learning_from_denotation(self):
    """Train on the first 13 examples using denotation accuracy and check the metrics.

    The remaining examples (index 13 onwards) form the test set.
    """
    grammar = Grammar(self.arithmetic_rules)
    all_examples = self.two_parse_examples + self.one_parse_examples

    from executor import Executor
    model = Model(
        grammar=grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=defaultdict(float),  # every weight starts at zero
        executor=Executor.execute,
    )

    # Train based on correct/incorrect denotation
    from metrics import DenotationAccuracyMetric
    before_train, before_test, after_train, after_test = model.train_test(
        train_examples=all_examples[:13],
        test_examples=all_examples[13:],
        training_metric=DenotationAccuracyMetric(),
        seed=1,
    )

    # BEFORE SGD
    self.assertEqual(before_train['semantics accuracy'], 10)
    self.assertEqual(before_test['denotation accuracy'], 4)

    # AFTER SGD
    self.assertEqual(after_train['semantics accuracy'], 12)  # Improvement
    self.assertEqual(after_train['denotation accuracy'], 13)  # Improvement