def _get_rule_penalty(rule, config, examples, verbose=False): """Returns a "penalty" for the rule based on the given set of examples.""" # TODO(petershaw): This could potentially be more effecient by pre-indexing # the examples in a data structure such as a Trie. # TODO(petershaw): Could also consider sub-sampling the dataset for the # purpose of computing these correlations. # Optionally compute over a sample of examples only. sample_size = config.get("sample_size", 0) num_examples_match_source = 0 num_examples_match_target = 0 num_examples_match_source_and_target = 0 for source_str, target_str in examples: source = tuple(source_str.split()) target = tuple(target_str.split()) match_source = rule_utils.rhs_can_maybe_derive(rule.source, source) match_target = rule_utils.rhs_can_maybe_derive(rule.target, target) if match_source: num_examples_match_source += 1 if match_target: num_examples_match_target += 1 if match_source and match_target: num_examples_match_source_and_target += 1 if sample_size and num_examples_match_source_and_target >= sample_size: # Break early if using sample size and found sufficient sample. break # Ensure that at least one example is found. if not num_examples_match_source_and_target: print("Rule did not match any examples.") # TODO(petershaw): Raise instead? 
return 0.0 if not num_examples_match_source: raise ValueError("num_examples_match_source: %s" % num_examples_match_source) if not num_examples_match_target: raise ValueError("num_examples_match_target: %s" % num_examples_match_target) if verbose: print("_get_rule_cost: %s" % rule) print("num_examples_match_source: %s" % num_examples_match_source) print("num_examples_match_target: %s" % num_examples_match_target) print("num_examples_match_source_and_target: %s" % num_examples_match_source_and_target) cost = 0.0 p_source_given_target = (float(num_examples_match_source_and_target) / num_examples_match_target) cost -= (config["source_given_target_coef"] * math.log2(p_source_given_target)) p_target_given_source = (float(num_examples_match_source_and_target) / num_examples_match_source) cost -= (config["target_given_source_coef"] * math.log2(p_target_given_source)) return cost
def test_rhs_can_maybe_derive_1(self):
    """Checks the shorter rule's source and target against the longer rule's."""
    longer_rule = qcfg_rule.rule_from_string(
        "who is NT_1 ' s boss ? ### ( Yield :output ( FindManager :recipient ( NT_1 ) ) )"
    )
    shorter_rule = qcfg_rule.rule_from_string(
        "who is NT_1 ? ### ( Yield :output ( NT_1 ) )")

    # Source side first, then target side; each must be accepted.
    source_result = rule_utils.rhs_can_maybe_derive(
        shorter_rule.source, longer_rule.source)
    self.assertTrue(source_result)

    target_result = rule_utils.rhs_can_maybe_derive(
        shorter_rule.target, longer_rule.target)
    self.assertTrue(target_result)
def _find_relevant_rules(current_rules, candidate_rule): # TODO(petershaw): This can potentially be made more effecient by # pre-indexing rules in a data structure such as a Trie. relevant_rules = [] for rule in current_rules: if (rule_utils.rhs_can_maybe_derive(candidate_rule.source, rule.source) and rule_utils.rhs_can_maybe_derive(candidate_rule.target, rule.target)): relevant_rules.append(rule) return relevant_rules
def _find_possible_candidates(rule_to_split, other_rules, config): """Return possible rule candidates.""" all_candidates = set() for other_rule in other_rules: if other_rule == rule_to_split: continue if (rule_utils.rhs_can_maybe_derive(other_rule.source, rule_to_split.source) and rule_utils.rhs_can_maybe_derive(other_rule.target, rule_to_split.target)): unifiers = unification_utils.get_rule_unifiers( rule_to_split, other_rule, config) candidates = {(unifier, other_rule) for unifier in unifiers} all_candidates |= candidates return all_candidates
def test_rhs_can_maybe_derive_8(self):
    """Expects a falsy result for "foo NT_1 NT_2 bar" vs "foo foo bar buz buz"."""
    pattern = tuple("foo NT_1 NT_2 bar".split())
    goal = tuple("foo foo bar buz buz".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertFalse(result)
def test_rhs_can_maybe_derive_7(self):
    """Expects a falsy result for "turn right NT_1" vs "turn left turn right"."""
    pattern = tuple("turn right NT_1".split())
    goal = tuple("turn left turn right".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertFalse(result)
def test_rhs_can_maybe_derive_4(self):
    """Expects a truthy result for the single token "foo" vs "bar foo bar"."""
    pattern = tuple("foo".split())
    goal = tuple("bar foo bar".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_false_2(self):
    """Expects a falsy result for "NT_1 named NT_2" vs "NT_1 foo"."""
    pattern = tuple("NT_1 named NT_2".split())
    goal = tuple("NT_1 foo".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertFalse(result)
def test_rhs_can_maybe_derive_true_5(self):
    """Expects a truthy result for "I_TURN_RIGHT NT_1" vs a longer action list."""
    pattern = tuple("I_TURN_RIGHT NT_1".split())
    goal_text = "I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_RUN"
    goal = tuple(goal_text.split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_true_4(self):
    """Expects a truthy result for "NT_1 right" vs "run after run right thrice"."""
    pattern = tuple("NT_1 right".split())
    goal = tuple("run after run right thrice".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_true_2(self):
    """Expects a truthy result for "foo foo NT_1" vs "foo foo bar NT_1"."""
    pattern = tuple("foo foo NT_1".split())
    goal = tuple("foo foo bar NT_1".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_true_1(self):
    """Expects a truthy result for "NT_1 named NT_2" vs a longer NT_1 ... NT_2 goal."""
    pattern = tuple("NT_1 named NT_2".split())
    goal = tuple("NT_1 have a major city named NT_2".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_10(self):
    """Expects a truthy result for "NT_1 foo bar" vs "NT_1 foo xyz foo bar"."""
    pattern = tuple("NT_1 foo bar".split())
    goal = tuple("NT_1 foo xyz foo bar".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)
def test_rhs_can_maybe_derive_9(self):
    """Expects a truthy result for "NT_1 ( capital )" vs a nested answer expression."""
    pattern = tuple("NT_1 ( capital )".split())
    goal = tuple("answer ( loc_1 ( smallest ( capital ) ) )".split())
    result = rule_utils.rhs_can_maybe_derive(pattern, goal)
    self.assertTrue(result)