Code example #1
    def test_morphology_only(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        data = [u'tozat', u'tozgoat', u'tozgo', u'tozdoat', u'tozdo', u'tozzoat', u'tozzo', u'toz', u'dagat', u'daggoat', u'daggo', u'dagdoat', u'dagdo', u'dagzoat', u'dagzo', u'dag', u'gasat', u'gasgoat', u'gasgo', u'gasdoat', u'gasdo', u'gaszoat', u'gaszo', u'gas', u'kodat', u'kodgoat', u'kodgo', u'koddoat', u'koddo', u'kodzoat', u'kodzo', u'kod', u'katat', u'katgoat', u'katgo', u'katdoat', u'katdo', u'katzoat', u'katzo', u'kat', u'dotat', u'dotgoat', u'dotgo', u'dotdoat', u'dotdo', u'dotzoat', u'dotzo', u'dot']

        # target
        hmm = HMM({'q0': ['q1'],
              'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
              'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
              'q3': (['qf'], ['at'])})
        self.configurations.simulation_data = data
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 5190)

        # single state
        hmm = HMM({'q0': ['q1'],
              'q1': (['q1', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'] + ['at'])
                })
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6430)


        # two state
        hmm = HMM({'q0': ['q1'],
              'q1': (['q1', 'q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do']),
              'q2': (['qf'], ['at'])
                })
        self.assertLess(Hypothesis(Grammar(hmm, [])).get_energy(), 6010)

        # from simulation
        hmm = HMM({'q0': ['q1'],
      'q1': (['q1', 'qf'], ['toz', 'do', 'zo', 'gas', 'kod', 'dag', 'at', 'zoat', 'kat', 'go', 'dot'])
        })
Code example #2
def generate_samples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

    Args:
      grammar_dir: directory to load grammar files from.
      outfiles: A list of output filenames.
    """

    f = open(os.path.join(grammar_dir, 'template.html'))
    template = f.read()
    f.close()

    htmlgrammar = Grammar()
    err = htmlgrammar.parse_from_file(os.path.join(grammar_dir, 'html.txt'))
    # CheckGrammar(htmlgrammar)
    if err > 0:
        print('There were errors parsing grammar')
        return

    cssgrammar = Grammar()
    err = cssgrammar.parse_from_file(os.path.join(grammar_dir, 'css.txt'))
    # CheckGrammar(cssgrammar)
    if err > 0:
        print('There were errors parsing grammar')
        return

    jsgrammar = Grammar()
    err = jsgrammar.parse_from_file(os.path.join(grammar_dir, 'js.txt'))
    # CheckGrammar(jsgrammar)
    if err > 0:
        print('There were errors parsing grammar')
        return

    # JS and HTML grammar need access to CSS grammar.
    # Add it as import
    htmlgrammar.add_import('cssgrammar', cssgrammar)
    jsgrammar.add_import('cssgrammar', cssgrammar)

    ttt = 0

    for outfile in outfiles:

        ttt += 1
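        # The running counter and the output filename are forwarded as the extra
        # g / fi keyword arguments (presumably consumed by this project's
        # generate_new_sample).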

        result = generate_new_sample(template,
                                     htmlgrammar,
                                     cssgrammar,
                                     jsgrammar,
                                     g=ttt,
                                     fi=outfile)  #F:/domato/curpus/

        if result is not None:
            print('Writing a sample to ' + outfile)
            try:
                f = open(outfile, 'w')
                f.write(result)
                f.close()
            except IOError:
                print('Error writing to output')
Code example #3
    def test_crossover(self):
        self.initialise_segment_table("dag_zook_segments_new.txt")
        rule_set_1 = RuleSet([
            Rule(*[[{
                "cons": "+"
            }], [{
                "voice": "-"
            }], [{
                "low": "+"
            }], [{
                "cont": "-"
            }], True])
        ])
        rule_set_2 = RuleSet([
            Rule(*[[{
                "cons": "+"
            }], [{
                "low": "-"
            }], [{
                "voice": "-"
            }], [], False])
        ])
        plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
        hmm_1 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
            'q2': ([FINAL_STATE], ['z'])
        })
        hmm_2 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2'], ['dog', 'kat']),
            'q2': (['q3'], ['s']),
            'q3': ([FINAL_STATE], ['z'])
        })

        grammar_1 = Grammar(hmm_1, rule_set_1)
        grammar_2 = Grammar(hmm_2, rule_set_2)

        hypothesis_1 = Hypothesis(grammar_1, plural_english_data)
        hypothesis_2 = Hypothesis(grammar_2, plural_english_data)
        offspring_1, offspring_2 = GeneticAlgorithm.crossover(
            hypothesis_1, hypothesis_2)

        print("*** Parents:\n")
        GeneticAlgorithm.log_hypothesis(hypothesis_1)
        GeneticAlgorithm.log_hypothesis(hypothesis_2)

        print("\n\n*** Offspring:\n")
        GeneticAlgorithm.log_hypothesis(offspring_1)
        GeneticAlgorithm.log_hypothesis(offspring_2)

        offspring_3, offspring_4 = GeneticAlgorithm.crossover(
            offspring_1, offspring_2)

        print("\n\n*** 2nd gen offspring:\n")
        GeneticAlgorithm.log_hypothesis(offspring_3)
        GeneticAlgorithm.log_hypothesis(offspring_4)
Code example #4
    def test_crossover(self):
        from copy import deepcopy

        rule_1 = Rule.load([[{
            'cont': '+'
        }], [{
            'coronal': '-'
        }], [{
            'coronal': '-'
        }], [], True])
        rule_2 = Rule.load([[{
            'cons': '+',
            'low': '-'
        }], [{
            'voice': '-'
        }], [{
            'voice': '-'
        }], [], True])

        crossover_rule_1 = deepcopy(rule_1)
        crossover_rule_2 = deepcopy(rule_2)
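        # Cross the two rules over by hand: each offspring keeps its own target
        # (and obligatory flag) but takes the other rule's change and contexts.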
        crossover_rule_1.left_context_feature_bundle_list = rule_2.left_context_feature_bundle_list
        crossover_rule_1.right_context_feature_bundle_list = rule_2.right_context_feature_bundle_list
        crossover_rule_1.change_feature_bundle_list = rule_2.change_feature_bundle_list

        crossover_rule_2.left_context_feature_bundle_list = rule_1.left_context_feature_bundle_list
        crossover_rule_2.right_context_feature_bundle_list = rule_1.right_context_feature_bundle_list
        crossover_rule_2.change_feature_bundle_list = rule_1.change_feature_bundle_list

        rule_set_1 = RuleSet([crossover_rule_1])
        rule_set_2 = RuleSet([crossover_rule_2])
        print(rule_set_1)
        print(rule_set_2)

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
            'q2': (['qf'], ['zo', 'go', 'do'])
        })
        grammar_1 = Grammar(hmm, rule_set_1)
        grammar_2 = Grammar(hmm, rule_set_2)

        data = ['kat', 'dot',     'dag', 'kod'] + \
               ['katso', 'dotso', 'dagzo', 'kodzo'] + \
               ['katko', 'dotko', 'daggo', 'kodgo'] + \
               ['katto', 'dotto', 'dagdo', 'koddo']

        hypothesis_1 = Hypothesis(grammar_1, data)
        hypothesis_2 = Hypothesis(grammar_2, data)

        print(hypothesis_1.get_energy())
        print(hypothesis_2.get_energy())
Code example #5
    def crossover(self, h1, h2):
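        # Recombine two hypotheses: unless they are too similar (incest check),
        # swap their rule sets and/or HMMs according to the configuration and
        # return two offspring whose fitness/energy is marked for re-evaluation.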
        if GeneticAlgorithm._is_incest(h1, h2):
            return h1, h2,

        crossover_rules = False
        crossover_hmm = False
        if ga_config.CROSSOVER_BOTH_HMM_AND_RULES:
            if configurations["EVOLVE_RULES"]:
                crossover_rules = True
            if configurations["EVOLVE_HMM"]:
                crossover_hmm = True

        else:
            loci = []
            if configurations["EVOLVE_RULES"]:
                loci.append('rules')
            if configurations["EVOLVE_HMM"]:
                loci.append('hmm')
            locus = random.choice(loci)

            if locus == 'rules':
                crossover_rules = True
            elif locus == 'hmm':
                crossover_hmm = True

        offspring_1 = deepcopy(h1)
        offspring_2 = deepcopy(h2)

        if crossover_rules:
            offspring_1_rule_set, offspring_2_rule_set = RuleSet.crossover(
                offspring_1.grammar.rule_set, offspring_2.grammar.rule_set)
        else:
            offspring_1_rule_set, offspring_2_rule_set = offspring_1.grammar.rule_set, offspring_2.grammar.rule_set

        if crossover_hmm:
            offspring_1_hmm, offspring_2_hmm = HMM.crossover(
                offspring_1.grammar.hmm, offspring_2.grammar.hmm)
        else:
            offspring_1_hmm, offspring_2_hmm = offspring_1.grammar.hmm, offspring_2.grammar.hmm

        offspring_1.grammar = Grammar(offspring_1_hmm, offspring_1_rule_set)
        offspring_2.grammar = Grammar(offspring_2_hmm, offspring_2_rule_set)

        # Invalidate mutated offspring fitness value (i.e. mark for re-evaluation)
        offspring_1.invalidate_fitness()
        offspring_1.invalidate_energy()
        offspring_2.invalidate_fitness()
        offspring_2.invalidate_energy()

        return offspring_1, offspring_2,
Code example #6
File: generator.py Project: mmg1/molybden
def GenerateSamples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

  Args:
    grammar_dir: directory to load grammar files from.
    outfiles: A list of output filenames.
  """

    f = open(os.path.join(grammar_dir, 'template.html'))
    template = f.read()
    f.close()

    htmlgrammar = Grammar()
    err = htmlgrammar.ParseFromFile(os.path.join(grammar_dir, 'html.txt'))
    #CheckGrammar(htmlgrammar)
    if err > 0:
        print 'There were errors parsing grammar'
        return

    cssgrammar = Grammar()
    err = cssgrammar.ParseFromFile(os.path.join(grammar_dir, 'css.txt'))
    #CheckGrammar(cssgrammar)
    if err > 0:
        print 'There were errors parsing grammar'
        return

    jsgrammar = Grammar()
    err = jsgrammar.ParseFromFile(os.path.join(grammar_dir, 'js.txt'))
    #CheckGrammar(jsgrammar)
    if err > 0:
        print 'There were errors parsing grammar'
        return

    # JS and HTML grammar need access to CSS grammar.
    # Add it as import
    htmlgrammar.AddImport('cssgrammar', cssgrammar)
    jsgrammar.AddImport('cssgrammar', cssgrammar)

    for outfile in outfiles:
        result = GenerateNewSample(template, htmlgrammar, cssgrammar,
                                   jsgrammar)

        if result is not None:
            print 'Writing a sample to ' + outfile
            try:
                f = open(outfile, 'w')
                f.write(result)
                f.close()
            except IOError:
                print 'Error writing to output'
Code example #7
    def test_parsing(self):
        # Ignoring semantics for now...
        numeral_rules = [
            Rule('$E', 'one'),
            Rule('$E', 'two'),
            Rule('$E', 'three'),
            Rule('$E', 'four'),
        ]

        operator_rules = [
            Rule('$UnOp', 'minus'),
            Rule('$BinOp', 'plus'),
            Rule('$BinOp', 'minus'),
            Rule('$BinOp', 'times'),
        ]

        compositional_rules = [
            Rule('$E', '$UnOp $E'),
            Rule('$EBO', '$E $BinOp'),
            Rule('$E', '$EBO $E')
        ]

        arithmetic_rules = numeral_rules + operator_rules + compositional_rules

        arithmetic_grammar = Grammar(arithmetic_rules)
        for example in self.one_parse_examples:
            self.assertEqual(1, len(arithmetic_grammar.parse(example.input)),
                             example)
            # print(arithmetic_grammar.parse(example.input)[0])
        for example in self.two_parse_examples:
            self.assertEqual(2, len(arithmetic_grammar.parse(example.input)),
                             example)
Code example #8
    def test_learning_from_many_denotations(self):
        """
        Large number of examples are used for training.
        Last 4 arithmetic_examples are used for testing.
        b_trn: performance metrics on training set before training
        a_trn: performance metrics on training set after training

        denotation accuracy: # of examples where denotation of parse at position 
        0 was correct
        """
        arithmetic_grammar = Grammar(self.arithmetic_rules)
        arithmetic_examples = self.two_parse_examples + self.one_parse_examples

        from executor import Executor

        arithmetic_model = Model(
            grammar=arithmetic_grammar,
            feature_fn=Parse.operator_precedence_features,
            weights=defaultdict(float),  # Initialize with all weights at zero
            executor=Executor.execute)

        from metrics import DenotationAccuracyMetric
        from arithmetic import arithmetic_dev_examples

        b_trn, b_tst, a_trn, a_tst = arithmetic_model.train_test(
            train_examples=arithmetic_dev_examples,
            test_examples=arithmetic_examples[13:],
            training_metric=DenotationAccuracyMetric(),
            seed=1)

        # BEFORE SGD
        self.assertEqual(b_trn['denotation accuracy'], 64)

        # AFTER SGD
        self.assertEqual(a_trn['denotation accuracy'], 92)  # Improvement
Code example #9
    def test_turkish_blah(self):
        self.initialise_simulation(turkish_vowel_harmony_new_weights)
        Q2s = [
            'in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON
        ]
        hmm_dict = {
            'q0': ['q1'],
            'q1': (['q2'], [
                'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
                'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
                'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
            ]),
            'q2': (['qf'], Q2s),
        }
        some_hmm = HMM(deepcopy(hmm_dict))
        some_rules = RuleSet([
            Rule([{
                "syll": "+"
            }], [{
                "back": "+"
            }], [{
                "cont": "+",
                "back": "+"
            }, {
                "syll": "-",
                "kleene": True
            }], [], True)
        ])

        some_hypo = Hypothesis(Grammar(some_hmm, some_rules))

        #
        self.assert_equal_no_infs(self.get_target_hypo().get_energy(),
                                  some_hypo.get_energy())
Code example #10
File: ox.py Project: fuzzamos/OxFuzz
def generate_samples(grammar_dir, outfiles):
    """Generates a set of samples and writes them to the output files.

    Args:
      grammar_dir: directory to load grammar files from.
      outfiles: A list of output filenames.
    """

    f = open(os.path.join(grammar_dir, 'ox_template.html'))
    template = f.read()
    f.close()

    jsgrammar = Grammar()
    err = jsgrammar.parse_from_file(os.path.join(grammar_dir, 'oxjs.txt'))

    if err > 0:
        print('There were errors parsing grammar')
        return

    for outfile in outfiles:
        result = generate_new_sample(template, jsgrammar)

        if result is not None:
            print('Writing a sample to ' + outfile)
            try:
                f = open(outfile, 'w')
                f.write(result)
                f.close()
            except IOError:
                print('Error writing to output')
Code example #11
 def test_first(self):
     """Grammar: init should compute First sets"""
     for rules, first in self.known_firsts:
         g = Grammar(rules)
         self.assertEqual(first.keys(), g.first.keys())
         for s in first:
             self.assertEqual(first[s], g.first[s])
Code example #12
    def test_parser_kleene(self):
        hmm = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2', FINAL_STATE], ['at', 'attstktttt', 'st']),
            'q2': ([FINAL_STATE], ['o'])
        })

        hmm_transducer = hmm.get_transducer()
        self.write_to_dot_to_file(hmm_transducer, "test_hmm_transducer_kleene")

        assimilation_rule_with_kleene = Rule(
            [{"cons": "-"}],
            [{"low": "+"}],
            [{"cons": "-"}, {"cons": "+", "kleene": True}],
            [],
            obligatory=True)

        rule_set_with_kleene = RuleSet([assimilation_rule_with_kleene])
        grammar = Grammar(hmm, rule_set_with_kleene)

        nfa = grammar.get_nfa()
        self.write_to_dot_to_file(nfa, "test_parser_nfa_kleene")
Code example #13
def delete_nonderivable_nonterminals(grammar):
	new_grammar = Grammar()
	new_grammar.axiom = grammar.axiom
	new_grammar.terminals = grammar.terminals

	unwatched = list([new_grammar.axiom])
	watched = set()
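	# Worklist traversal from the axiom: every nonterminal that appears on the
	# right-hand side of a rule of an already-reached nonterminal is derivable.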
	while unwatched:
		nonterminal = unwatched.pop(0)  # take the next queued nonterminal; keep the rest of the worklist
		watched.add(nonterminal)

		rules = find_rules_for_nonterminal(grammar.rules, nonterminal)
		for rule in rules:
			for symbol in rule.right_side:
				if isinstance(symbol, Nonterminal):
					if symbol not in watched and symbol not in unwatched:
						unwatched.append(symbol)

	new_grammar.nonterminals = watched

	new_rules = []
	for rule in grammar.rules:
		if rule.left_side[0] in watched:
			new_rules.append(rule)

	new_grammar.rules = new_rules

	return new_grammar	 
Code example #14
    def test_grammar_creation(self):
        numeral_rules = [
            Rule('$E', 'one'),
            Rule('$E', 'two'),
            Rule('$E', 'three'),
            Rule('$E', 'four'),
        ]

        operator_rules = [
            Rule('$UnOp', 'minus'),
            Rule('$BinOp', 'plus'),
            Rule('$BinOp', 'minus'),
            Rule('$BinOp', 'times'),
        ]

        compositional_rules = [
            Rule('$E', '$UnOp $E'),
            Rule('$EBO', '$E $BinOp'),
            Rule('$E', '$EBO $E')
        ]

        arithmetic_rules = numeral_rules + operator_rules + compositional_rules

        arithmetic_grammar = Grammar(arithmetic_rules)
        self.assertEqual(3, len(arithmetic_grammar.binary_rules))
        self.assertEqual(7, len(arithmetic_grammar.lexical_rules))
Code example #15
    def toGrammar(self):
        """
        1. δ(q, a)          => Q -> aP
        2. δ(q, a); q in P  => Q -> a
        3. δ(q0, a)         => S -> aP
        4. δ(q0, a)         => S -> aP
        5. q0 in F          ....
        """
        from grammar import Grammar
        grammar_rules = {}
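        # Each transition δ(source, letter) -> target adds an alternative to the
        # production for `source`: accepting sources contribute the bare terminal,
        # others contribute letter+target; the initial state is written as S.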

        for path in self.func:
            source, letter = path
            targets = self.func[path]

            source = source if source is not self.initial_state else "S"
            grammar_rules.setdefault(source, "")

            for target in targets:
                target = target if target is not self.initial_state else "S"

                if source in self.accepted_sates:
                    grammar_rules[source] += (f"| {letter}")
                elif source is self.initial_state:
                    grammar_rules[source] += (f"| {letter}{target}")
                else:
                    grammar_rules[source] += (f"| {letter}{target}")

            grammar_rules[source] = grammar_rules[source].strip("|").strip()

        g = Grammar()
        for source, rule in grammar_rules.items():
            g[source] = rule
        return g
Code example #16
    def induce_entity_grammar(self, start_grammar):
        """Induce an entity-swapping grammar.

        Get the entities from the original dataset.
        Get the places to put holes from start_grammar.
        """
        new_grammar = Grammar()

        # Entity rules
        for x, y in self.dataset:
            alignments = self.domain.get_entity_alignments(x, y)
            for cat, x_span, y_span in alignments:
                x_str = x[x_span[0]:x_span[1]]
                y_str = y[y_span[0]:y_span[1]]
                new_grammar.add_rule(cat, x_str, y_str)

        # Root/template rules
        for cat, x_str, y_str in start_grammar.rule_list:
            # Anchor on single mention in x--allow one-to-many x-to-y mapping
            alignments = self.domain.get_entity_alignments(x_str, y_str)
            x_swaps = list(set(
                [(x_span, '%s_%d' % (inner_cat, x_span[0]))
                 for i, (inner_cat, x_span, y_span) in enumerate(alignments)]))
            x_new = self.splice(x_str, x_swaps)
            y_swaps = [(y_span, '%s_%d' % (inner_cat, x_span[0]))
                       for i, (inner_cat, x_span, y_span) in enumerate(alignments)]
            y_new = self.splice(y_str, y_swaps)
            new_grammar.add_rule(cat, x_new, y_new)

        # new_grammar.print_self()
        return new_grammar
Code example #17
 def get_energy(self, simulation_case):
     case_name = simulation_case.case_name
     configuration.configurations_dict["case_name"] = case_name
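     # Accept either ready-made HMM / RuleSet objects or plain dict / flat-rule
     # descriptions, wrap them in a Grammar, and score the data against it.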
     if isinstance(simulation_case.hmm_dict, HMM):
         hmm = simulation_case.hmm_dict
     else:
         hmm = HMM(simulation_case.hmm_dict)
     if isinstance(simulation_case.flat_rule_set_list, RuleSet):
         rule_set = simulation_case.flat_rule_set_list
     else:
         rule_set_list = []
         for flat_rule in simulation_case.flat_rule_set_list:
             rule_set_list.append(Rule(*flat_rule))
         rule_set = RuleSet(rule_set_list)
     grammar = Grammar(hmm, rule_set)
     self.write_to_dot_to_file(hmm, "hmm_" + case_name)
     self.write_to_dot_to_file(grammar.get_nfa(),
                               "grammar_nfa_" + case_name)
     hypothesis = Hypothesis(grammar, self.data)
     energy = hypothesis.get_energy()
     if self.target_energy:
         print("{}: {} distance from target: {}".format(
             case_name, hypothesis.get_recent_energy_signature(),
             energy - self.target_energy))
     else:
         print("{}: {}".format(case_name,
                               hypothesis.get_recent_energy_signature()))
     return energy
Code example #18
 def test_follow(self):
     """Grammar: init should compute Follow sets"""
     for rules, follow in self.known_follows:
         g = Grammar(rules)
         self.assertEqual(follow.keys(), g.follow.keys())
         for s in follow:
             self.assertEqual(follow[s], g.follow[s])
Code example #19
def grammar_cc() -> Grammar:
    return Grammar(terminals='cd',
                   non_terminals='SC',
                   start='S',
                   rules=[Rule('S', 'CC'),
                          Rule('C', 'cC'),
                          Rule('C', 'd')])
Code example #20
 def test_parse(self):
     """LL1: check parse() against know result"""
     gram, sentence, ref_leftmost = self.ref_parse
     ll1 = LL1(Grammar(gram))
     tree = ll1.parse(sentence.split())
     t_leftmost = tuple(tree.leftmost())
     self.assertEqual(t_leftmost, ref_leftmost)
Code example #21
def untorch(g):
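    # Unwrap a Grammar whose log-probabilities are torch tensors into plain
    # Python floats; a grammar that already holds floats is returned unchanged.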
    if type(g.logVariable) == float:
        return g
    else:
        return Grammar(g.logVariable.data.tolist()[0],
                       [(l.data.tolist()[0], t, p)
                        for l, t, p in g.productions])
Code example #22
def evaluate_latent_semparse():
    print "======================================================================"
    print 'LATENT SEMANTIC PARSING'
    # Only (input, LF root node) pairs for this task; y[0][1] indexes
    # into the semantics of the root node:
    latent_semparse_train = [[x, y[0][1]] for x, y, d in semdata.sem_train]
    latent_semparse_test = [[x, y[0][1]] for x, y, d in semdata.sem_test]
    # To make this interesting, we add a rule of type-raising for
    # digits, so that derivations with the predicate neg in them have
    # multiple derivational paths leading to the same output: First,
    # every digit can now be introduced in its lifted form:
    for word in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                 'eight', 'nine'):
        crude_lexicon[word] += [('Q', 'lift(%s)' % i) for i in range(1, 10)]
    # The new rule reversing the order of application between U and
    # its N (qua Q):
    rules.append(['U', 'Q', 'N', (1, 0)])
    # Semantics for lift:
    functions['lift'] = (lambda x: (lambda f: f(x)))
    # New grammar:
    gram = Grammar(crude_lexicon, rules, functions)
    # Now train with LatentSGD, where the output transformation is
    # one that grabs the root node:
    evaluate(phi=phi_sem,
             optimizer=LatentSGD,
             train=latent_semparse_train,
             test=latent_semparse_test,
             classes=gram.gen,
             T=10,
             eta=0.1,
             output_transform=(lambda y: y[0][1]))
Code example #23
 def get_energy(self, hmm, rule_set_list, case_name):
     grammar = Grammar(hmm, RuleSet(rule_set_list))
     hypothesis = Hypothesis(grammar, self.data)
     energy = hypothesis.get_energy()
     print("{}: {}".format(case_name,
                           hypothesis.get_recent_energy_signature()))
     return energy
Code example #24
    def test_sequitur(self):
        """docstring for test_sequitur"""
        g = Grammar()
        g.train_string("Hello, world!")

        self.assertEqual("0 --(0)--> H e l l o , _ w o r l d ! \n",
                         g.print_grammar())
Code example #25
    def randomize_grammar(cls, nodes_by_type, vocabulary, possible_segments):
        # Randomize rules
        min_rules_num = 1
        max_rules_num = 2
        rules_num = random.randint(min_rules_num, max_rules_num)
        max_num_feature_nodes_per_rule = 1
        max_num_affix_segments_per_rule = 1
        max_num_environment_roots = len(nodes_by_type[SyntacticNode.TYPE_ROOT])

        rules = []
        for i in range(rules_num):
            num_feature_nodes = random.randint(1,
                                               max_num_feature_nodes_per_rule)
            feature_nodes = random.choices(
                nodes_by_type[SyntacticNode.TYPE_FEATURE], k=num_feature_nodes)

            num_affix_segments = random.randint(
                0, max_num_affix_segments_per_rule)
            output_affix = ''
            # We don't use choices because segments in affix can repeat
            for _ in range(num_affix_segments):
                output_affix += random.choice(possible_segments)

            num_environment_roots = random.randint(0,
                                                   max_num_environment_roots)
            environment_roots = random.sample(
                nodes_by_type[SyntacticNode.TYPE_ROOT],
                k=num_environment_roots)
            rules.append(Rule(feature_nodes, output_affix, environment_roots))

        return Grammar(nodes_by_type, vocabulary, rules)
Code example #26
    def test_operator_precedence_features(self):
        """
        See if a count of operator precedence patterns is a good feature for 
        ranking parses.
        """
        arithmetic_grammar = Grammar(self.arithmetic_rules)
        parses = arithmetic_grammar.parse("two times two plus three")
        self.assertEqual(2, len(parses))
        # Look at Parse.operator_precedence_features(). It generates different
        # results for the two parses
        parse0_features = parses[0].operator_precedence_features()
        parse1_features = parses[1].operator_precedence_features()
        # In the first parse, + precedes * once
        self.assertEqual(parse0_features, {('+', '*'): 1.0})
        # In the second parse, * precedes + once
        self.assertEqual(parse1_features, {('*', '+'): 1.0})

        # Look at Parse.score()
        parse0_score = parses[0].score(Parse.operator_precedence_features,
                                       self.weights)
        parse1_score = parses[1].score(Parse.operator_precedence_features,
                                       self.weights)
        # Parse.operator_precedence_features() is good at distinguishing parses
        self.assertEqual(-1.0, parse0_score)
        self.assertEqual(1.0, parse1_score)
Code example #27
    def test_epsilon_emission(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        from fst import EPSILON

        hmm = HMM({'q0': ['q1'],
                   'q1': (['q2'], ['dog', 'kat']),
                   'q2': (['qf'], ['z', EPSILON])
                   })
        self.write_to_dot_to_file(hmm, 'epsilon_hmm')

        hmm_transducer = hmm.get_transducer()
        self.write_to_dot_to_file(hmm_transducer, 'epsilon_hmm_transducer')

        grammar = Grammar(hmm, None)
        word_1 = 'dog'
        word_2 = 'dogz'
        print(hmm)

        hypothesis = Hypothesis(grammar, [word_1, word_2])
        encoding_length = hypothesis.get_data_encoding_length_by_grammar()
        assert encoding_length == 4.0

        print(hmm.add_epsilon_emission_to_state())
        print(hmm.add_epsilon_emission_to_state())
        print(hmm.add_epsilon_emission_to_state())
        print(hmm.remove_epsilon_emission_from_state())
        print(hmm.remove_epsilon_emission_from_state())
        print(hmm.add_epsilon_emission_to_state())

        self.write_to_dot_to_file(hmm, 'epsilon_hmm_after_mutation')
Code example #28
    def test_evaluation_with_scoring(self):
        """
        Evaluate the grammar on all examples, collecting metrics:
        
        semantics oracle accuracy: # of examples where one parse or the other was
        correct.

        semantics accuracy: # of examples where parse at position 0 was correct.
        """
        arithmetic_grammar = Grammar(self.arithmetic_rules)

        from executor import Executor

        arithmetic_model = Model(grammar=arithmetic_grammar,
                                 feature_fn=Parse.operator_precedence_features,
                                 weights=self.weights,
                                 executor=Executor.execute)

        from experiment import evaluate_model

        metrics = evaluate_model(model=arithmetic_model,
                                 examples=self.one_parse_examples +
                                 self.two_parse_examples)
        self.assertEqual(metrics['semantics oracle accuracy'], 17)
        self.assertEqual(metrics['semantics accuracy'], 16)  # Improvement
Code example #29
    def test_get_parsing_results(self):
        self.initialise_segment_table("abnese_lengthening_segment_table.txt")
        configurations["MORPHEME_BOUNDARY_FLAG"] = True
        configurations["LENGTHENING_FLAG"] = True
        configurations["HMM_ENCODING_LENGTH_MULTIPLIER"] = 100
        configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 20
        hmm = HMM({
            'q0': ['q1'],
            'q1': (['qf'], ['aabb', 'abb', 'bbaabb', 'aba', 'aaba', 'bbaa'])
        })

        rule1 = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}],
                     obligatory=True)
        rule2 = Rule([], [{"syll": "+"}], [{"cons": "+"}], [{"cons": "+"}],
                     obligatory=True)
        rule_set = RuleSet([rule1, rule2])

        grammar = Grammar(hmm, rule_set)
        data = [
            u'baba:a', u'babaab:ab', u'ab:a', u'aab:a', u'aab:ab', u'ab:ab'
        ]

        hypothesis = Hypothesis(grammar, data)
        simulated_annealing = SimulatedAnnealing(hypothesis, 0)
        print(simulated_annealing._get_parsing_results())
Code example #30
    def test_learning_from_denotation(self):
        arithmetic_grammar = Grammar(self.arithmetic_rules)
        arithmetic_examples = self.two_parse_examples + self.one_parse_examples

        from executor import Executor

        arithmetic_model = Model(
            grammar=arithmetic_grammar,
            feature_fn=Parse.operator_precedence_features,
            weights=defaultdict(float),  # Initialize with all weights at zero
            executor=Executor.execute)

        # Train based on correct/incorrect denotation
        from metrics import DenotationAccuracyMetric

        b_trn, b_tst, a_trn, a_tst = arithmetic_model.train_test(
            train_examples=arithmetic_examples[:13],
            test_examples=arithmetic_examples[13:],
            training_metric=DenotationAccuracyMetric(),
            seed=1)

        # BEFORE SGD
        self.assertEqual(b_trn['semantics accuracy'], 10)
        self.assertEqual(b_tst['denotation accuracy'], 4)

        # AFTER SGD
        self.assertEqual(a_trn['semantics accuracy'], 12)  # Improvement
        self.assertEqual(a_trn['denotation accuracy'], 13)  # Improvement