def insert_cvt_if_needed(self, tree):
    """Wrap *tree* in a reverse-CVT node when its main predicate has a CVT.

    Returns the tree unchanged when no CVT is associated with the predicate.
    """
    predicate = get_main_predicate_from_tree(tree)
    cvt = self.get_cvt_cached(predicate)
    if not cvt:
        return tree
    # A bare string leaf: wrap it directly.
    if IsString(tree):
        return tree_or_string('(ID !{0} {1})'.format(cvt, tree))
    # A COUNT node: insert the CVT hop between COUNT and its argument.
    if tree.label() == u'COUNT':
        return tree_or_string('(COUNT (ID !{0} {1}))'.format(cvt, tree[0]))
    # First child is itself a tree: serialize all children before wrapping.
    if not IsString(tree[0]):
        children_repr = ' '.join(map(str, tree))
        return tree_or_string('(ID !{0} {1})'.format(cvt, children_repr))
    return tree_or_string('(ID !{0} {1})'.format(cvt, tree))
def test_TerminalSingleFullMatch2(self):
    """Two partial dictionary matches are produced for the same source leaf."""
    src_pat = TreePattern(tree_or_string(u'食べた'), (), [])
    similarities = self.dict_cost.GetSimilar(src_pat)
    expected = [
        Similarity(score, 'dict_part', src_pat,
                   TreePattern(tree_or_string(trg), (), []))
        for score, trg in ((0.8, u'eat'), (0.9, u'ate'))]
    self.assertItemsEqual(expected, similarities)
def test_TerminalEqualEntity(self):
    """Co-referring entity leaves score 0.0 as 'entity_copy', in both APIs."""
    src_pat = TreePattern(tree_or_string(u'(N @杉田山本)'), (0,), [])
    trg_pat = TreePattern(tree_or_string(u'(N @Sugita_Yamamoto)'), (0,), [])
    # Pairwise scoring of the two entity leaves.
    similarities = self.ent_ind.GetSimilarity(src_pat, trg_pat)
    self.assertEqual(1, len(similarities))
    self.assertEqual(
        [Similarity(0.0, 'entity_copy', src_pat, trg_pat)], similarities)
    # Retrieval of the target entity from the source alone.
    similarities = self.ent_ind.GetSimilar(src_pat)
    result_pat = TreePattern(u'@Sugita_Yamamoto', (), [])
    self.assertEqual(
        [Similarity(0.0, 'entity_copy', src_pat, result_pat)], similarities)
def test_NodeToLeafSimilar(self):
    """A subtree and a leaf with synonymous heads are scored as 'synonym'."""
    pattern_a = TreePattern(
        tree_or_string('(is (italian the) smart)'), (1,), [])
    pattern_b = TreePattern(tree_or_string('(french bright)'), (0,), [])
    similarities = self.similarity_scorer.GetSimilarity(pattern_a, pattern_b)
    self.assertEqual(1, len(similarities))
    self.assertListEqual(
        [Similarity(self.kScore, 'synonym', pattern_a, pattern_b)],
        similarities)
def test_TerminalEqualVarUpper(self):
    """Upper/lower-case variable leaves score 0.0 as 'var_copy', in both APIs."""
    src_pat = TreePattern(tree_or_string('(:index A-1)'), (0,), [])
    trg_pat = TreePattern(tree_or_string('(:tense e-2)'), (0,), [])
    # Pairwise scoring of the two variable leaves.
    similarities = self.var_ind.GetSimilarity(src_pat, trg_pat)
    self.assertEqual(1, len(similarities))
    self.assertEqual(
        [Similarity(0.0, 'var_copy', src_pat, trg_pat)], similarities)
    # Retrieval of the copied variable from the source alone.
    similarities = self.var_ind.GetSimilar(src_pat)
    result_pat = TreePattern('A-1', (), [])
    self.assertEqual(
        [Similarity(0.0, 'var_copy', src_pat, result_pat)], similarities)
def test_TerminalToTerminalSimilar(self):
    """Two bare leaves related by hypernymy receive the scorer's kScore."""
    pattern_a = TreePattern(tree_or_string('italian'), (), [])
    pattern_b = TreePattern(tree_or_string('european'), (), [])
    similarities = self.similarity_scorer.GetSimilarity(pattern_a, pattern_b)
    self.assertEqual(1, len(similarities))
    self.assertListEqual(
        [Similarity(self.kScore, 'hypernym', pattern_a, pattern_b)],
        similarities)
def AddAllPredicatesForEntitiesFromProds(productions, linker):
    """
    Compensates for lack of coverage in the predicate linker: collect all
    entities mentioned by the rules, obtain every predicate connected to
    those entities, and duplicate each 'predicate' production once per
    extra predicate. Returns the de-duplicated, extended production list.
    """
    rules = list({production.rhs.rule for production in productions})
    entities = GetEntitiesFromRules(rules, linker)
    # Predicates (including reverse-operation variants) reachable from entities.
    predicates = set()
    for entity in entities:
        predicates.update(
            GetEntityPredicates(entity, linker, with_reverse_op=True))
    # Index 'predicate'-state productions by (non-terminal, RHS non-terminals),
    # which characterizes a production for de-duplication purposes.
    preds_index = {}
    for prod in productions:
        if prod.rhs.rule.state == 'predicate':
            preds_index[(prod.non_terminal, tuple(prod.rhs.non_terminals))] = prod
    # Duplicate each predicate production with every candidate predicate.
    extended = []
    for prod in preds_index.values():
        if prod.non_terminal[0] != 'predicate':
            continue
        rule = prod.rhs.rule
        for pred in predicates:
            new_rule = XTRule(rule.state, rule.lhs, tree_or_string(pred),
                              rule.newstates, rule.weight)
            extended.append(
                Production(prod.non_terminal,
                           RHS(new_rule, prod.rhs.non_terminals), None))
    extended.extend(productions)
    return list(set(extended))
def BuildTrgTreePatterns(self, src_treep):
    """Propose one '(ID [] <uri>)' target pattern per URI linked to the source leaves."""
    leaves = GetLeaves(src_treep)
    uri_candidates = self.GetURIs(leaves, k=self.kgen)
    return [
        TreePattern(tree_or_string(u'(ID [] {0})'.format(uri)), (), [])
        for uri in uri_candidates]
def BuildTrgTreePatterns(self, src_treep):
    """Propose target patterns for each URI and its reversed ('!') variant.

    When the source pattern carries variables, each candidate is wrapped as
    '(ID <uri> ?x0|)'; otherwise the bare URI is used.
    """
    leaves = GetLeaves(src_treep)
    direct_uris = self.GetURIs(leaves, filterq=self.filterq, k=self.kgen)
    # Interleave every URI with its reverse-operation counterpart.
    candidates = []
    for uri in direct_uris:
        candidates.extend((uri, '!' + uri))
    path, subpaths = (), []
    if src_treep.HasVariables():
        return [
            TreePattern(tree_or_string(u'(ID {0} ?x0|)'.format(uri)),
                        path, subpaths)
            for uri in candidates]
    return [TreePattern(tree_or_string(uri), path, subpaths)
            for uri in candidates]
def BuildTrgTreePatterns(self, src_treep):
    """Propose one bare-URI target pattern per URI linked to the source leaves."""
    uri_candidates = self.GetURIs(GetLeaves(src_treep), k=self.kgen)
    return [TreePattern(tree_or_string(uri), (), [])
            for uri in uri_candidates]
def test_PredRevVarEntPolicyAll(self):
    """A reversed predicate variable places the entity in subject position."""
    ldcsc = tree_or_string('(ID !?p1 ent)')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer, ?p1 WHERE {\n'
                + '\tent\t?p1\t?x0 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc, ['?p'])
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_OpPredVarEnt(self):
    """COUNT over a variable-predicate triple is rendered into the SELECT clause."""
    ldcsc = tree_or_string('(ID COUNT (ID ?p1 ent))')
    expected = (self.prefix
                + '\nSELECT DISTINCT COUNT(?x0) as ?answer WHERE {\n'
                + '\t?x0\t?p1\tent .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc)
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_PredRevEnt(self):
    """A reversed predicate swaps subject and object of the generated triple."""
    ldcsc = tree_or_string('(ID !pred ent)')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer WHERE {\n'
                + '\tent\tpred\t?x0 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc)
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_NumberWrongFormat(self):
    """A malformed NUMBER argument degrades to a numeric placeholder variable."""
    ldcsc = tree_or_string('(ID pred1 (NUMBER (ID pred2 ent2)))')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer WHERE {\n'
                + '\t?x0\tpred1\t?n0 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc)
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def loadrules(fn, fmt='json', num_occur=0):
    """
    Given a filename fn of a file containing transducer rules (XTRule),
    it parses the file in 'json' (default), 'yaml' or 'tiburon',
    and returns a list of XTRules.

    Raises ValueError if fmt is not one of the recognized formats
    (previously an unknown fmt crashed later with an opaque
    "'NoneType' object is not iterable" TypeError).
    """
    out = []
    with codecs.open(fn, 'r', 'utf-8') as infile:
        if fmt == 'json':
            loaded = LoadRulesJson(infile, num_occur)
        elif fmt == 'yaml':
            loaded = LoadRulesYaml(infile, num_occur)
        elif fmt == 'tiburon':
            loaded = LoadRulesTiburon(infile, num_occur)
        else:
            raise ValueError(
                'Unknown rule format: {0}. Expected json, yaml or tiburon.'.format(fmt))
        for d in loaded:
            lhs = tree_or_string(d["lhs"].strip('"'))
            rhs = tree_or_string(d["rhs"].strip('"'))
            state = d["state"]
            weight = float(d.get("weight", 1.0))
            if "newstates" in d:
                # Tiburon stores newstates natively; other formats need
                # their paths converted into dictionaries.
                if fmt == 'tiburon':
                    newstates = d["newstates"]
                else:
                    newstates = paths_as_dicts(d["newstates"])
            else:
                newstates = {}
            newrule = XTRule(state, lhs, rhs, newstates, weight)
            # Parameter tying.
            newrule.tied_to = d.get("tied_to", None)
            # Features.
            newrule.features = d.get("features", None)
            out.append(newrule)
    return list(set(out))
def test_Nonterminal(self):
    """All expected non-terminal alignments are produced for the tree pair."""
    input_tree = immutable(tree_or_string('(A (B D E) (C F G))'))
    output_tree = immutable(tree_or_string('(A (R (T V W) U) (S X))'))
    productions, non_terminals = \
        self.transducer.Produce(input_tree, output_tree, 'q', (), ())
    expected_non_terminals = [
        ('q', (), (), ''),
        ('q', (0,), (0, 1), ''),
        ('q', (1,), (0, 0), ''),
        ('q', (1, 0), (0, 0, 0), ''),
        ('q', (1, 1), (0, 0, 1), ''),
        ('q', (1, 0), (0, 0, 1), ''),
        ('q', (1, 1), (0, 0, 0), ''),
    ]
    for expected_nt in expected_non_terminals:
        self.assertIn(expected_nt, non_terminals)
def test_PreterminalIdentityUnseenTerminalSimilar(self):
    """
    Using the Identity back-off, the state of the parent rule is applied
    to the path of the variable in the RHS. However, the states of the
    path of the variable in the RHS should be more specific: "hypernym".
    """
    intree = tree_or_string('(NN dog)')
    lexical_rule = XTRule('hypernym', tree_or_string('italian'),
                          tree_or_string('european'), {}, 1.0)
    transducer = xT('q', [lexical_rule], [Identity(), LexicalSimilarity()])
    wrtg = transducer.Transduce(intree, None)
    outtrees = [t for t, _ in wrtg.GenerateNBestTrees()]
    self.assertIn(immutable(tree_or_string('(NN canine)')), outtrees)
def BuildTrgTreePatterns(bridges_and_relations):
    """Build one '(ID <bridge> <main>)' tree pattern per (bridge-list, relation) pair."""
    results = []
    for bridge_list, relation in bridges_and_relations:
        predicate_bridge, predicate_main = \
            SetPredicateDirection(bridge_list, relation)
        tree = tree_or_string(
            '(ID {0} {1})'.format(predicate_bridge, predicate_main))
        results.append((TreePattern(tree, (), []), relation))
    return results
def test_Conjunction(self):
    """Two conjoined ID clauses share the same answer variable ?x0."""
    ldcsc = tree_or_string('(ID (ID pred1 ent1) (ID pred2 ent2))')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer WHERE {\n'
                + '\t?x0\tpred1\tent1 .\n'
                + '\t?x0\tpred2\tent2 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc)
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_VarPredVarEnt(self):
    """Chained variable predicates are both exposed in the SELECT clause."""
    ldcsc = tree_or_string('(ID ?p1 (ID ?p2 ent))')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer, ?p1, ?p2 WHERE {\n'
                + '\t?x0\t?p1\t?x1 .\n'
                + '\t?x1\t?p2\tent .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc, ['?p'])
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def QueryLambdaDCSC(ldcsc_str, query_manager=None):
    """Evaluate the lambda-DCS-C expression *ldcsc_str* and return first-column results.

    Falls back to the module-level query manager when none is given; returns
    an empty list when the expression does not yield a valid query.
    """
    assert IsString(ldcsc_str)
    manager = query_manager if query_manager is not None else query_manager_global
    query = Query.fromldcsc(tree_or_string(ldcsc_str))
    if query is None:
        return []
    return [row[0] for row in query.get_results(manager)]
def test_PredPredEntPredEntPolicyAll(self):
    """With the '?x' policy, every intermediate variable appears in the SELECT."""
    ldcsc = tree_or_string('(ID pred1 (ID !pred2 ent2) (ID pred3 ent3))')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer, ?x0, ?x1 WHERE {\n'
                + '\t?x0\tpred1\t?x1 .\n'
                + '\tent2\tpred2\t?x1 .\n'
                + '\t?x1\tpred3\tent3 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc, ['?x'])
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_NonterminalIdentityNoBackoff(self):
    """Without back-off rules, the NP tree cannot be fully transduced."""
    intree = tree_or_string('(NP (DT the) (NN dog))')
    rules = [
        XTRule('q', tree_or_string('(DT ?x0|)'), tree_or_string('(DTT ?x0|)'),
               {(0,): 'copy'}, 1.0),
        XTRule('copy', tree_or_string('the'), tree_or_string('the'), {}, 1.0),
        XTRule('q', tree_or_string('(NN ?x0|)'), tree_or_string('(NNN ?x0|)'),
               {(0,): 'hypernym'}, 1.0),
        XTRule('hypernym', tree_or_string('dog'), tree_or_string('canine'),
               {}, 1.0),
    ]
    transducer = xT('q', rules, [])
    wrtg = transducer.Transduce(intree, None)
    outtrees = [t for t, _ in wrtg.GenerateNBestTrees()]
    unexpected = immutable(tree_or_string('(NP (DTT the) (NNN canine))'))
    self.assertNotIn(unexpected, outtrees)
def test_ComplexNumber(self):
    """A NUMBER argument with a unit is inlined as a literal in the triple."""
    ldcsc = tree_or_string(
        '(ID fb:government.us_president.presidency_number '
        '(NUMBER 23.0 fb:en.unitless))')
    expected = (self.prefix
                + '\nSELECT DISTINCT ?x0 as ?answer WHERE {\n'
                + '\t?x0\tfb:government.us_president.presidency_number\t23.0 .} LIMIT 10 #')
    query = Query.fromldcsc(ldcsc)
    self.assertEqual(expected, str(query),
                     msg='\n{0}\n!=\n{1}'.format(expected, query))
def test_GetLeavesInTreePatternRepeatedLeaves2(self):
    """Leaves under an excluded subpath are omitted, along with their indices."""
    tree_pattern = TreePattern(
        tree_or_string('(A (B b) (C b))'), (), [(1,)])
    self.assertListEqual(['b'], tree_pattern.GetLeaves())
    self.assertListEqual([0], tree_pattern.GetLeavesIndices())
def test_NonterminalUnseenTerminalSimilar(self):
    """With the LexicalSimilarity back-off, the unseen terminal is transduced."""
    intree = tree_or_string('(NP (DT the) (NN dog))')
    rules = [
        XTRule('q', tree_or_string('(NP ?x0|DT ?x1|NN)'),
               tree_or_string('(NPP ?x0|DTT ?x1|NNN)'),
               {(0,): 'q', (1,): 'q'}, 1.0),
        XTRule('q', tree_or_string('(DT ?x0|)'), tree_or_string('(DTT ?x0|)'),
               {(0,): 'copy'}, 1.0),
        XTRule('copy', tree_or_string('the'), tree_or_string('the'), {}, 1.0),
        XTRule('q', tree_or_string('(NN ?x0|)'), tree_or_string('(NNN ?x0|)'),
               {(0,): 'hypernym'}, 1.0),
    ]
    transducer = xT('q', rules, [LexicalSimilarity()])
    wrtg = transducer.Transduce(intree, None)
    outtrees = [t for t, _ in wrtg.GenerateNBestTrees()]
    expected = immutable(tree_or_string('(NPP (DTT the) (NNN canine))'))
    self.assertIn(expected, outtrees)
def test_NonConsumingLHSAvoidsInfiniteRTG(self):
    """Non-consuming LHS rules must not loop forever; the finite derivation survives."""
    intree = tree_or_string('(NN dog)')
    rules = [
        XTRule('q', tree_or_string('?x0|NN'), tree_or_string('(NN ?x0|)'),
               {(0,): 'q'}, 0.9),
        XTRule('q', tree_or_string('?x0|NN'), tree_or_string('(JJ ?x0|)'),
               {(0,): 't'}, 0.9),
        XTRule('t', tree_or_string('(NN dog)'), tree_or_string('canine'),
               {}, 1.0),
    ]
    transducer = xT('q', rules)
    wrtg = transducer.Transduce(intree, None)
    outtrees = [t for t, _ in wrtg.GenerateNBestTrees()]
    self.assertIn(immutable(tree_or_string('(JJ canine)')), outtrees)
def LoadAlignments(alignment_fname):
    """
    Load a filename with the following structure:
    src_tree
    trg_tree
    alignment
    ...
    src_tree
    trg_tree
    alignment
    into a dictionary indexed by a tuple (src_tree_str, trg_tree_str),
    whose values are Alignment objects.
    """
    alignments = {}
    with codecs.open(alignment_fname, 'r', 'utf-8') as fin:
        lines = fin.readlines()
        assert len(lines) % 3 == 0, \
            'Lines in {0} are not a multiple of 3.'.format(alignment_fname)
        # Consume the file in (source, target, alignment) triples.
        for start in range(0, len(lines), 3):
            src_tree_str = lines[start].strip()
            trg_tree_str = lines[start + 1].strip()
            alignment_str = lines[start + 2].strip()
            src_tree = tree_or_string(src_tree_str)
            # A bare string has itself as its only "leaf".
            src_leaves = [src_tree] if IsString(src_tree) else src_tree.leaves()
            trg_tree = tree_or_string(trg_tree_str)
            trg_leaves = [trg_tree] if IsString(trg_tree) else trg_tree.leaves()
            alignments[(src_tree_str, trg_tree_str)] = \
                Alignment(alignment_str, src_leaves, trg_leaves)
    return alignments
def SetContext(self, raw_context):
    """
    Curates the raw_context dictionary into an extended and more structured
    context dictionary. At the moment, raw_context may contain the
    following entries:
    * src_tree : constituent representation of source tree (string).
    * trg_tree : constituent representation of target tree (string).
    """
    self.ClearContext()
    src_tree_str = raw_context.get('src_tree', None)
    if not src_tree_str:
        return
    src_words = tree_or_string(src_tree_str).GetLeaves()
    self.context['src_words'] = src_words
    self.context['answer_type'] = GetAnswerType(src_words)
def main(args=None):
    """Print (source tree, target tree) pairs from a SEMPRE-style JSON file.

    Args:
      args: optional list of command-line arguments. Previously this
        parameter was ignored and sys.argv was always parsed; it is now
        forwarded to argparse (None still means sys.argv), so the function
        can be driven programmatically.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input_fname",
        nargs='?',
        type=str,
        help="Json filename with questions, as provided by SEMPRE.",
        default="")
    parser.add_argument("--binarize",
                        action="store_true",
                        help="Convert source trees into Chomsky normal form.",
                        default=False)
    parser.add_argument(
        "--rem_qmark",
        action="store_true",
        help="Remove question mark from the end of the question.",
        default=False)
    parser.add_argument("--random",
                        action="store_true",
                        help="Randomize the order of the examples.",
                        default=False)
    # Bug fix: forward *args* instead of always reading sys.argv.
    args = parser.parse_args(args)
    if not os.path.exists(args.input_fname):
        print('File does not exist: {0}'.format(args.input_fname))
        sys.exit(1)
    # json data as provided by SEMPRE.
    with open(args.input_fname) as fin:
        data = json.load(fin)
    if args.random:
        random.shuffle(data)
    for d in data:
        if 'targetFormula' in d:
            trg_tree = ConvertDCS2Constituent(d['targetFormula'])
        else:
            trg_tree = '(ID no target formula)'
        src_tree = tree_or_string(d['src_tree'])
        if args.rem_qmark:
            src_tree = remove_qmark(src_tree)
        if args.binarize:
            # In-place conversion to Chomsky normal form.
            src_tree.chomsky_normal_form()
        print(src_tree)
        print(trg_tree)