def test_inner_node_child_categoryWithFeats(self):
    """A rule constrained on child1_category 'NP/NP' must also match the
    feature-decorated category 'NP/NP[mod=xx]' of the right child."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P'),
        SemanticRule(r'NP/NP', r'\P.P'),
        SemanticRule(r'NP', r'\P Q.(Q -> P)', {'child1_category': 'NP/NP'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="NP/NP[mod=xx]" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    result = tree.get('sem', None)
    self.assertEqual(lexpr(r'_base2 -> _base1'), lexpr(result))
def test_match_any2(self):
    """Among the three competing NP rules, the 'child_any_pos' rule fires at
    the lower node and the 'child_any_category' rule at the root."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P'),
        SemanticRule(r'cat2', r'\P.P'),
        SemanticRule(r'cat3', r'\P.P'),
        SemanticRule(r'NP', r'\P Q.(Q & P)', {'rule': 'lex'}),
        SemanticRule(r'NP', r'\P Q.(Q | P)', {'child_any_pos': 'pos1'}),
        SemanticRule(r'NP', r'\P Q.(Q -> P)', {'child_any_category': 'cat3'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
        <token base="base3" pos="pos3" surf="surf3" id="t1_3"/>
      </tokens>
      <ccg root="sp1-5">
        <span terminal="t1_1" category="cat1" pos="pos1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" pos="pos2" end="3" begin="2" id="sp1-2"/>
        <span terminal="t1_3" category="cat3" pos="pos3" end="4" begin="3" id="sp1-3"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-4"/>
        <span child="sp1-4 sp1-3" rule="lex" category="NP" end="4" begin="1" id="sp1-5"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    result = tree.get('sem', None)
    self.assertEqual(lexpr(r'_base3 -> (_base2 | _base1)'), lexpr(result))
def test_lexical_binary_one_type(self):
    """Only the leaf whose rule declares a 'coq_type' contributes a
    Parameter declaration to the extracted Coq types."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P'),
        SemanticRule(r'cat2', r'\Q x.Q(x)', {'coq_type': 'Entity -> Prop'}),
        SemanticRule(r'NP', r'\P Q x.(P -> Q(x))', {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(["Parameter _base2 : Entity -> Prop."],
                     get_coq_types(tree))
def test_lexical_binary_two_types(self):
    """Both leaves carry a 'coq_type'; both Parameter declarations are
    collected, in tree order."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P', {'coq_type': 'Entity -> Prop'}),
        SemanticRule(r'cat2', r'\P.P', {'coq_type': 'Entity -> Prop -> Prop'}),
        SemanticRule(r'NP', r'\P Q.(Q -> P)', {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    expected = ["Parameter _base1 : Entity -> Prop.",
                "Parameter _base2 : Entity -> Prop -> Prop."]
    self.assertEqual(expected, get_coq_types(tree))
def test_match_any2(self):
    """Duplicate of the match-any test: 'child_any_pos' selects the rule at
    the inner NP node, 'child_any_category' selects the rule at the root."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P'),
        SemanticRule(r'cat2', r'\P.P'),
        SemanticRule(r'cat3', r'\P.P'),
        SemanticRule(r'NP', r'\P Q.(Q & P)', {'rule': 'lex'}),
        SemanticRule(r'NP', r'\P Q.(Q | P)', {'child_any_pos': 'pos1'}),
        SemanticRule(r'NP', r'\P Q.(Q -> P)', {'child_any_category': 'cat3'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
        <token base="base3" pos="pos3" surf="surf3" id="t1_3"/>
      </tokens>
      <ccg root="sp1-5">
        <span terminal="t1_1" category="cat1" pos="pos1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" pos="pos2" end="3" begin="2" id="sp1-2"/>
        <span terminal="t1_3" category="cat3" pos="pos3" end="4" begin="3" id="sp1-3"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-4"/>
        <span child="sp1-4 sp1-3" rule="lex" category="NP" end="4" begin="1" id="sp1-5"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    result = tree.get('sem', None)
    self.assertEqual(lexpr(r'_base3 -> (_base2 | _base1)'), lexpr(result))
def main(args=None):
    """CLI entry point: assign semantics to every sentence of a CCG-parsed
    XML file and write the augmented XML to the output path."""
    DESCRIPTION = textwrap.dedent("""\
            categories_template.yaml should contain the semantic templates
              in YAML format.
            parsed_sentence.xml contains the CCG-parsed sentences.
            If --arbi-types is specified, then the arbitrary specification of
              types is enabled, thus using the argument as the field of the semantic
              template that is used. E.g, by specifying "--arbi-types coq_type"
              and a semantic template:
            - semantics: \P x.P(x)
              category: NP
              coq_type: Animal
            The type "Animal" will be used for this expression. Otherwise,
            types of the sem/logic module of NLTK are used.
      """)
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=DESCRIPTION)
    arg_parser.add_argument("ccg")
    arg_parser.add_argument("templates")
    arg_parser.add_argument("sem")
    arg_parser.add_argument("--arbi-types", action="store_true", default=False)
    arg_parser.add_argument("--gold_trees", action="store_true", default=True)
    args = arg_parser.parse_args()

    # Abort early when either input file is missing.
    if not os.path.exists(args.templates):
        print('File does not exist: {0}'.format(args.templates))
        sys.exit(1)
    if not os.path.exists(args.ccg):
        print('File does not exist: {0}'.format(args.ccg))
        sys.exit(1)

    logging.basicConfig(level=logging.WARNING)
    semantic_index = SemanticIndex(args.templates)
    xml_parser = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(args.ccg, xml_parser)

    for sentence in document.findall('.//sentence'):
        sem_node = etree.Element('semantics')
        try:
            sem_node.set('status', 'success')
            # gold_tree annotations are zero-indexed, xpath is 1-indexed.
            tree_index = 1
            if args.gold_trees:
                tree_index = int(sentence.get('gold_tree', '0')) + 1
            sem_tree = assign_semantics_to_ccg(
                sentence, semantic_index, tree_index)
            sem_node.set(
                'root',
                sentence.xpath('./ccg[{0}]/@root'.format(tree_index))[0])
            filter_attributes(sem_tree)
            sem_node.extend(sem_tree.xpath('.//descendant-or-self::span'))
        except LogicalExpressionException:
            sem_node.set('status', 'failed')
        sentence.append(sem_node)

    with codecs.open(args.sem, 'wb') as fout:
        fout.write(serialize_tree(document))
def test_vertical_bar(self):
    """A template category written with '|' must match the backslash
    category 'NP\\NP', while the 'NP/NP' template must not."""
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="N" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="N" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" category="NP\NP" rule=">" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP|NP', r'\F1 F2.(F1 -> F2)', {'rule': '>'}),
        SemanticRule(r'NP/NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(lexpr(r'(_base1 -> _base2)'),
                     lexpr(tree.get('sem', None)))
def test_inner_node_child_categoryWithFeats(self):
    """Duplicate of the feature-matching test: 'child1_category' of 'NP/NP'
    matches a child whose category carries features ('NP/NP[mod=xx]')."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P.P'),
        SemanticRule(r'NP/NP', r'\P.P'),
        SemanticRule(r'NP', r'\P Q.(Q -> P)', {'child1_category': 'NP/NP'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="NP/NP[mod=xx]" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(lexpr(r'_base2 -> _base1'),
                     lexpr(tree.get('sem', None)))
def test_func_combination_backwardComplexTwoArgs(self):
    """Backward composition of degree two ('<B2') over a two-argument
    predicate, combining a verb with a past-tense auxiliary."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'S\NP\NP', r'\P y x e. P(e, x, y)'),
        SemanticRule(r'S\S', r'\P Q e. AND(past(e), Q(e))'),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token id="s1_4" surf="ほめ" pos="動詞" pos1="自立" pos2="*" pos3="*" inflectionType="一段" inflectionForm="連用形" base="ほめる" reading="ホメ"/>
        <token id="s1_5" surf="た" pos="助動詞" pos1="*" pos2="*" pos3="*" inflectionType="特殊・タ" inflectionForm="基本形" base="た" reading="タ"/>
      </tokens>
      <ccg root="s1_sp9">
        <span id="s1_sp9" begin="4" end="6" category="(S[mod=nm,form=base]\NP[mod=nm,case=ga])\NP[mod=nm,case=o]" rule="&lt;B2" child="s1_sp10 s1_sp11"/>
        <span id="s1_sp10" begin="4" end="5" category="(S[mod=nm,form=cont]\NP[mod=nm,case=ga])\NP[mod=nm,case=o]" terminal="s1_4"/>
        <span id="s1_sp11" begin="5" end="6" category="S[mod=nm,form=base]\S[mod=nm,form=cont]" terminal="s1_5"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(lexpr(r'\y x e.AND(past(e), _ほめる(x, y, e))'),
                     lexpr(tree.get('sem', None)))
def test_lexical_binary_two_coq_complex_type(self):
    """Two leaves with complex Coq types: the extracted declarations and the
    dynamic library merged from Coq and NLTK signatures must agree."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P x R.P(x, R)',
                     {'coq_type': 'Entity -> Prop -> Prop'}),
        SemanticRule(r'cat2', r'\Q S T.Q(S, T)',
                     {'coq_type': 'Prop -> Entity -> Prop'}),
        SemanticRule(r'NP', r'\P Q x R S T.(Q(x, R) -> P(S, T))',
                     {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    sentence = etree.fromstring(sentence_xml)
    tree = assign_semantics_to_ccg(sentence, index)
    coq_lib = get_coq_types(tree)
    self.assertEqual(['Parameter _base1 : Entity -> Prop -> Prop.',
                      'Parameter _base2 : Prop -> Entity -> Prop.'],
                     coq_lib)
    # Merge the Coq-declared signatures with the ones NLTK infers from
    # the composed expression; ordering is not significant.
    expressions = [tree.get('sem')]
    coq_sig = convert_coq_signatures_to_nltk(coq_lib)
    nltk_lib = build_dynamic_library(expressions, coq_sig)
    lib = merge_dynamic_libraries(coq_lib, nltk_lib, './coqlib.v', sentence)
    expected_lib = ["Parameter _base2 : Prop -> Entity -> Prop.",
                    "Parameter _base1 : Entity -> Prop -> Prop."]
    self.assertCountEqual(expected_lib, lib)
def test_RTG3Paths2Vars(self):
    """Three var_paths but only two lambda variables: the resulting
    expression is malformed and lexpr must raise."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)',
                     {'var_paths': [[0, 0], [0, 1], [1, 0]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    with self.assertRaises(nltk.sem.logic.LogicalExpressionException):
        lexpr(tree.get('sem', None))
def test_CFG(self):
    """Plain CFG-style composition without var_paths: both NP conjunctions
    are built, then combined by the NPNP implication rule."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)', {'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    self.assertEqual(lexpr(r'(_base1 & _base2) -> (_base3 & _base4)'),
                     lexpr(tree.get('sem', None)))
def test_RTG3Paths3Vars(self):
    """Three var_paths bound to three lambda variables: each path picks a
    leaf semantics and the rule recombines them."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2 F3.((F3 & F2) -> F1)',
                     {'var_paths': [[0, 0], [0, 1], [1, 0]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    self.assertEqual(lexpr(r'((_base3 & _base2) -> _base1)'),
                     lexpr(tree.get('sem', None)))
def test_RTG1Path(self):
    """One var_path [0, 1] binds F1 to a single leaf; F2 remains abstracted
    in the result."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)',
                     {'var_paths': [[0, 1]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    self.assertEqual(lexpr(r'\F2.(_base2 -> F2)'),
                     lexpr(tree.get('sem', None)))
def test_CFG(self):
    """Duplicate of the plain CFG composition test (no var_paths)."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)', {'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    result = lexpr(tree.get('sem', None))
    self.assertEqual(lexpr(r'(_base1 & _base2) -> (_base3 & _base4)'), result)
def test_RTG3Paths2Vars(self):
    """Duplicate: three var_paths with only two lambda variables produce a
    malformed expression, so lexpr raises."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)',
                     {'var_paths': [[0, 0], [0, 1], [1, 0]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    with self.assertRaises(nltk.sem.logic.LogicalExpressionException):
        lexpr(tree.get('sem', None))
def semantic_parse_sentence(sentence_ind):
    """Assign semantics to the CCG tree(s) of one sentence.

    `sentence_ind` indexes into the module-level SENTENCES list (lxml
    sentence elements with tokens and ccg nodes). Returns a list of
    serialized <semantics> nodes, one per attempted CCG tree.
    """
    global lock
    sentence = SENTENCES[sentence_ind]
    sem_nodes = []
    # TODO: try to prevent semantic parsing for fragmented CCG trees.
    # Otherwise, produce fragmented semantics.
    # Fix: default to the first tree so that tree_indices is always bound.
    # Previously, if ARGS.gold_trees was falsy and ARGS.nbest == 1, neither
    # branch below assigned tree_indices and this raised a NameError.
    tree_indices = [1]
    if ARGS.gold_trees:
        # In xpath, elements are 1-indexed.
        # However, gold_tree annotations assumed zero-index.
        # This line fixes it.
        tree_indices = [int(sentence.get('gold_tree', '0')) + 1]
    if ARGS.nbest != 1:
        tree_indices = get_tree_indices(sentence, ARGS.nbest)
    for tree_index in tree_indices:
        sem_node = etree.Element('semantics')
        try:
            sem_tree = assign_semantics_to_ccg(
                sentence, SEMANTIC_INDEX, tree_index)
            filter_attributes(sem_tree)
            sem_node.extend(sem_tree.xpath('.//descendant-or-self::span'))
            sem_node.set('status', 'success')
            sem_node.set(
                'ccg_id',
                sentence.xpath('./ccg[{0}]/@id'.format(tree_index))[0])
            sem_node.set(
                'root',
                sentence.xpath('./ccg[{0}]/@root'.format(tree_index))[0])
            # print('.', end='', file=sys.stdout)
            sys.stdout.flush()
        except Exception as e:
            # Broad catch is deliberate: one bad tree must not abort the
            # whole (possibly multiprocess) run; the failure is logged.
            sem_node.set('status', 'failed')
            sentence_surf = ' '.join(sentence.xpath('tokens/token/@surf'))
            # Serialize logging across workers sharing the same log sink.
            lock.acquire()
            logging.error(
                'An error occurred: {0}\nSentence: {1}\nTree XML:\n{2}'.format(
                    e,
                    sentence_surf,
                    etree.tostring(
                        sentence, encoding='utf-8',
                        pretty_print=True).decode('utf-8')))
            lock.release()
            # print('x', end='', file=sys.stdout)
            sys.stdout.flush()
        sem_nodes.append(sem_node)
    return [etree.tostring(sem_node) for sem_node in sem_nodes]
def test_RTG1Path(self):
    """Duplicate: a single var_path [0, 1] binds F1, leaving F2 abstracted."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2.(F1 -> F2)',
                     {'var_paths': [[0, 1]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    result = lexpr(tree.get('sem', None))
    self.assertEqual(lexpr(r'\F2.(_base2 -> F2)'), result)
def test_RTG3Paths3Vars(self):
    """Duplicate: three var_paths with three lambda variables recombine the
    selected leaf semantics."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P', {}),
        SemanticRule(r'NP', r'\F1 F2.(F1 & F2)', {'rule': '>'}),
        SemanticRule(r'NPNP', r'\F1 F2 F3.((F3 & F2) -> F1)',
                     {'var_paths': [[0, 0], [0, 1], [1, 0]], 'rule': '>'}),
    ]
    tree = assign_semantics_to_ccg(self.sentence, index)
    result = lexpr(tree.get('sem', None))
    self.assertEqual(lexpr(r'((_base3 & _base2) -> _base1)'), result)
def test_token_to_const_latin(self):
    """A Latin-script token with base '*' is rendered as a constant built
    from its surface form."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="*" pos="名詞-固有名詞-組織" surf="Scala" id="t0_0"/>
      </tokens>
      <ccg root="sp0-3">
        <span terminal="t0_0" category="NP[mod=nm,case=nc]" end="1" begin="0" id="sp0-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'_Scala'), lexpr(tree.get('sem', None)))
def test_token_to_const_japanese(self):
    """A Japanese noun token is rendered as a constant built from its base
    form."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="言語" pos="名詞-一般" surf="言語" id="t0_3"/>
      </tokens>
      <ccg root="sp0-9">
        <span terminal="t0_3" category="NP[mod=nm,case=nc]" end="4" begin="3" id="sp0-9"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'_言語'), lexpr(tree.get('sem', None)))
def test_token_to_function_2args(self):
    """A token with a doubly functional category '(S/S)\\NP' becomes a
    two-argument function."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="は" pos="助詞-係助詞" surf="は" id="t0_1"/>
      </tokens>
      <ccg root="sp0-4">
        <span terminal="t0_1" category="(S/S)\NP[mod=nm,case=nc]" end="2" begin="1" id="sp0-4"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\x y._は(y, x)'), lexpr(tree.get('sem', None)))
def test_token_to_function_1arg(self):
    """A token with a singly functional category 'S\\NP' becomes a
    one-argument function."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="です" katsuyou="基本形" pos="助動詞" surf="です" id="t0_4"/>
      </tokens>
      <ccg root="sp0-10">
        <span terminal="t0_4" category="S[mod=nm,form=base]\NP[mod=nm,case=nc]" end="5" begin="4" id="sp0-10"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\x._です(x)'), lexpr(tree.get('sem', None)))
def test_typeraising_for_unary_pred(self):
    """The unary ADN rule type-raises an adnominal predicate into a noun
    modifier that conjoins the predicate with the modified noun."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="良い" katsuyou="基本形" pos="形容詞-自立" surf="良い" id="t0_2"/>
      </tokens>
      <ccg root="sp0-7">
        <span child="sp0-8" rule="ADN" category="NP[case=nc]/NP[case=nc]" end="3" begin="2" id="sp0-7"/>
        <span terminal="t0_2" category="S[mod=adn,form=base]" end="3" begin="2" id="sp0-8"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\P x.(P(x) & _良い(x))'),
                     lexpr(tree.get('sem', None)))
def test_func_combination_backwardSimpleTwoArgs(self):
    """Backward composition '<B2' over a two-argument function: G composes
    over F while both NP arguments stay abstracted."""
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="F" pos="pos1" surf="F" id="t1_3"/>
        <token base="G" katsuyou="katsuyou2" pos="pos2" surf="G" id="t1_4"/>
      </tokens>
      <ccg root="sp1-7">
        <span child="sp1-8 sp1-9" rule="&lt;B2" category="S[mod=nm,form=base]\NP[mod=nm,case=ga]\NP" end="5" begin="3" id="sp1-7"/>
        <span terminal="t1_3" category="S[mod=nm,form=da]\NP[mod=nm,case=ga]\NP" end="4" begin="3" id="sp1-8"/>
        <span terminal="t1_4" category="S[mod=nm,form=base]\S[mod=nm,form=da]" end="5" begin="4" id="sp1-9"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\y x._G(_F(x, y))'),
                     lexpr(tree.get('sem', None)))
def test_func_combination_backward(self):
    """Backward composition '<B': the copula composes over the predicate
    noun, keeping the NP argument abstracted."""
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="簡潔" pos="名詞-形容動詞語幹" surf="簡潔" id="t1_3"/>
        <token base="です" katsuyou="基本形" pos="助動詞" surf="です" id="t1_4"/>
      </tokens>
      <ccg root="sp1-7">
        <span child="sp1-8 sp1-9" rule="&lt;B" category="S[mod=nm,form=base]\NP[mod=nm,case=ga]" end="5" begin="3" id="sp1-7"/>
        <span terminal="t1_3" category="S[mod=nm,form=da]\NP[mod=nm,case=ga]" end="4" begin="3" id="sp1-8"/>
        <span terminal="t1_4" category="S[mod=nm,form=base]\S[mod=nm,form=da]" end="5" begin="4" id="sp1-9"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\x._です(_簡潔(x))'),
                     lexpr(tree.get('sem', None)))
def test_func_application_backward(self):
    """Backward application '<': the particle applies to the preceding NP
    constant."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="*" pos="名詞-固有名詞-組織" surf="Scala" id="t0_0"/>
        <token base="は" pos="助詞-係助詞" surf="は" id="t0_1"/>
      </tokens>
      <ccg root="sp0-2">
        <span child="sp0-3 sp0-4" rule="&lt;" category="S/S" end="2" begin="0" id="sp0-2"/>
        <span terminal="t0_0" category="NP[mod=nm,case=nc]" end="1" begin="0" id="sp0-3"/>
        <span terminal="t0_1" category="(S/S)\NP[mod=nm,case=nc]" end="2" begin="1" id="sp0-4"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\y._は(y, _Scala)'),
                     lexpr(tree.get('sem', None)))
def test_np_feature_no(self):
    """A bare 'NP' template (no features) matches a bare 'NP' span and
    yields the token's base as a constant."""
    index = SemanticIndex(None)
    index.rules = [SemanticRule(r'NP', r'\P.P')]
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="basepred" pos="pos1" surf="surfpred" id="t0_0"/>
      </tokens>
      <ccg root="sp0-3">
        <span terminal="t0_0" category="NP" end="1" begin="0" id="sp0-3"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(lexpr(r'_basepred'), lexpr(tree.get('sem', None)))
def test_lexical_binary_two_coq_complex_type(self):
    """Variant of the complex-type test where build_dynamic_library returns
    a pair and merging is done against the NLTK-converted signature."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'cat1', r'\P x R.P(x, R)',
                     {'coq_type': 'Entity -> Prop -> Prop'}),
        SemanticRule(r'cat2', r'\Q S T.Q(S, T)',
                     {'coq_type': 'Prop -> Entity -> Prop'}),
        SemanticRule(r'NP', r'\P Q x R S T.(Q(x, R) -> P(S, T))',
                     {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        <token base="base2" pos="pos2" surf="surf2" id="t1_2"/>
      </tokens>
      <ccg root="sp1-3">
        <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        <span terminal="t1_2" category="cat2" end="3" begin="2" id="sp1-2"/>
        <span child="sp1-1 sp1-2" rule="lex" category="NP" end="3" begin="1" id="sp1-3"/>
      </ccg>
    </sentence>
    """
    sentence = etree.fromstring(sentence_xml)
    tree = assign_semantics_to_ccg(sentence, index)
    coq_lib = get_coq_types(tree)
    self.assertEqual(['Parameter _base1 : Entity -> Prop -> Prop.',
                      'Parameter _base2 : Prop -> Entity -> Prop.'],
                     coq_lib)
    expressions = [tree.get('sem')]
    coq_sig = convert_coq_signatures_to_nltk(coq_lib)
    nltk_lib, _ = build_dynamic_library(expressions, coq_sig)
    lib = merge_dynamic_libraries(coq_sig, nltk_lib, './coqlib.v', sentence)
    # Arrow types come back right-associated with explicit parentheses.
    expected_lib = ["Parameter _base2 : Prop -> (Entity -> Prop).",
                    "Parameter _base1 : Entity -> (Prop -> Prop)."]
    self.assertCountEqual(expected_lib, lib)
def test_func_application_forward(self):
    """Forward application '>': the type-raised adjective applies to the
    noun, producing a conjunction of both predicates."""
    sentence_xml = r"""
    <sentence id="s0">
      <tokens>
        <token base="良い" katsuyou="基本形" pos="形容詞-自立" surf="良い" id="t0_2"/>
        <token base="言語" pos="名詞-一般" surf="言語" id="t0_3"/>
      </tokens>
      <ccg root="sp0-6">
        <span child="sp0-7 sp0-9" rule="&gt;" category="NP[mod=nm,case=nc]" end="4" begin="2" id="sp0-6"/>
        <span child="sp0-8" rule="ADN" category="NP[case=nc]/NP[case=nc]" end="3" begin="2" id="sp0-7"/>
        <span terminal="t0_2" category="S[mod=adn,form=base]" end="3" begin="2" id="sp0-8"/>
        <span terminal="t0_3" category="NP[mod=nm,case=nc]" end="4" begin="3" id="sp0-9"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(
        etree.fromstring(sentence_xml), self.semantic_index)
    self.assertEqual(lexpr(r'\x.(_言語(x) & _良い(x))'),
                     lexpr(tree.get('sem', None)))
def test_lexical_unary(self):
    """A unary 'lex' rule wraps the single child's semantics into an
    implication."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P'),
        SemanticRule(r'NP', r'\P.(P -> P)', {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
      </tokens>
      <ccg root="sp1-2">
        <span terminal="t1_1" category="N" end="2" begin="1" id="sp1-1"/>
        <span child="sp1-1" rule="lex" category="NP" end="2" begin="1" id="sp1-2"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    self.assertEqual(lexpr(r'_base1 -> _base1'),
                     lexpr(tree.get('sem', None)))
def test_lexical_unary(self):
    """Duplicate of the unary 'lex' rule test."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'N', r'\P.P'),
        SemanticRule(r'NP', r'\P.(P -> P)', {'rule': 'lex'}),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
      </tokens>
      <ccg root="sp1-2">
        <span terminal="t1_1" category="N" end="2" begin="1" id="sp1-1"/>
        <span child="sp1-1" rule="lex" category="NP" end="2" begin="1" id="sp1-2"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    result = tree.get('sem', None)
    self.assertEqual(lexpr(r'_base1 -> _base1'), lexpr(result))
def test_func_application_backward(self): # 'は' has category (S/S)\NP[mod=nm,case=nc] which is not in the # unittest semantic templates. Thus, it is assigned the default # \E O.O and 'Scala' becomes the final meaning representation. sentence_str = r""" <sentence id="s0"> <tokens> <token base="*" pos="名詞-固有名詞-組織" surf="Scala" id="t0_0"/> <token base="は" pos="助詞-係助詞" surf="は" id="t0_1"/> </tokens> <ccg root="sp0-2"> <span child="sp0-3 sp0-4" rule="<" category="S/S" end="2" begin="0" id="sp0-2"/> <span terminal="t0_0" category="NP[mod=nm,case=nc]" end="1" begin="0" id="sp0-3"/> <span terminal="t0_1" category="(S/S)\NP[mod=nm,case=nc]" end="2" begin="1" id="sp0-4"/> </ccg> </sentence> """ sentence = etree.fromstring(sentence_str) ccg_tree = assign_semantics_to_ccg(sentence, self.semantic_index) semantics = ccg_tree.get('sem', None) expected_semantics = lexpr(r'_Scala') self.assertEqual(expected_semantics, lexpr(semantics))
def test_func_combination_backwardComplexTwoArgs(self):
    """Duplicate of the '<B2' composition test over a two-argument
    event predicate with a past-tense auxiliary."""
    index = SemanticIndex(None)
    index.rules = [
        SemanticRule(r'S\NP\NP', r'\P y x e. P(e, x, y)'),
        SemanticRule(r'S\S', r'\P Q e. AND(past(e), Q(e))'),
    ]
    sentence_xml = r"""
    <sentence id="s1">
      <tokens>
        <token id="s1_4" surf="ほめ" pos="動詞" pos1="自立" pos2="*" pos3="*" inflectionType="一段" inflectionForm="連用形" base="ほめる" reading="ホメ"/>
        <token id="s1_5" surf="た" pos="助動詞" pos1="*" pos2="*" pos3="*" inflectionType="特殊・タ" inflectionForm="基本形" base="た" reading="タ"/>
      </tokens>
      <ccg root="s1_sp9">
        <span id="s1_sp9" begin="4" end="6" category="(S[mod=nm,form=base]\NP[mod=nm,case=ga])\NP[mod=nm,case=o]" rule="&lt;B2" child="s1_sp10 s1_sp11"/>
        <span id="s1_sp10" begin="4" end="5" category="(S[mod=nm,form=cont]\NP[mod=nm,case=ga])\NP[mod=nm,case=o]" terminal="s1_4"/>
        <span id="s1_sp11" begin="5" end="6" category="S[mod=nm,form=base]\S[mod=nm,form=cont]" terminal="s1_5"/>
      </ccg>
    </sentence>
    """
    tree = assign_semantics_to_ccg(etree.fromstring(sentence_xml), index)
    result = tree.get('sem', None)
    self.assertEqual(lexpr(r'\y x e.AND(past(e), _ほめる(x, y, e))'),
                     lexpr(result))
def main(args=None):
    """CLI entry point: compose semantics for all sentences, run the Coq
    prover over them (last sentence is the hypothesis), and emit the
    inference result plus an HTML/MathML visualization."""
    DESCRIPTION = textwrap.dedent("""\
            categories_template.yaml should contain the semantic templates
              in YAML format.
            parsed_sentence.xml contains the parsed sentences. All CCG trees correspond
            to the premises, except the last one, which is the hypothesis.
            If --arbi-types flag is specified, then the arbitrary specification of
            coq_types is enabled. Thus, semantic rule assignments should contain a
            field such as:
            - semantics: \P x.P(x)
              category: NP
              coq_type: Animal
            If --auto-types is enabled, or no flag is specified, then the automatic
            inference of types is enabled. This automatic inference relies on the
            naive implementation in the sem/logic module of NLTK.
      """)
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=DESCRIPTION)
    arg_parser.add_argument("expression_templates_filename")
    arg_parser.add_argument("parsed_sentences_filename")
    arg_parser.add_argument("--sem_out_fname", dest="sem_out_fname",
                            nargs='?', type=str, default="")
    arg_parser.add_argument("--theorem_template", dest="theorem_template",
                            nargs='?', type=str, default="")
    arg_parser.add_argument("--arbi-types", action="store_true", default=False)
    arg_parser.add_argument("--abduction", action="store_true", default=False)
    arg_parser.add_argument("--gold_trees", action="store_true", default=False)
    args = arg_parser.parse_args()

    # Fix: abort when an input file is missing (previously only printed a
    # message and continued, crashing later with a less helpful error).
    if not os.path.exists(args.expression_templates_filename):
        print('File does not exist: {0}'.format(
            args.expression_templates_filename))
        sys.exit(1)
    if not os.path.exists(args.parsed_sentences_filename):
        print('File does not exist: {0}'.format(
            args.parsed_sentences_filename))
        sys.exit(1)

    logging.basicConfig(level=logging.WARNING)
    semantic_index = SemanticIndex(args.expression_templates_filename)
    xml_parser = etree.XMLParser(remove_blank_text=True)
    ccg_xml_trees = etree.parse(
        args.parsed_sentences_filename, xml_parser).findall('.//sentence')

    logical_interpretations = []
    ccg_tree_list = []
    ccg_tokens_list = []
    for ccg_xml in ccg_xml_trees:
        ccg_tree = assign_semantics_to_ccg(ccg_xml, semantic_index)
        ccg_tree_list.append(ccg_tree)
        assert 'sem' in ccg_tree.attrib, \
            'The assignment of semantics to CCG tree may have failed. Tree: {0}'\
            .format(etree.tostring(
                ccg_tree, pretty_print=True, encoding='utf-8').decode('utf-8'))
        logical_interpretations.append(ccg_tree.get('sem'))
        ccg_tokens_list.append(ccg_xml.find("tokens"))

    # Fix: this previously tested the undefined name `arbi_types_requested`,
    # which raised a NameError at runtime; the flag lives on `args`.
    if args.arbi_types:
        inference_result, coq_scripts = \
            prove_from_ccg(logical_interpretations,
                           ccg_trees=ccg_tree_list,
                           ccg_xml_trees=ccg_xml_trees)
    else:
        inference_result, coq_scripts = \
            prove_from_ccg(logical_interpretations,
                           ccg_xml_trees=ccg_xml_trees)
    print(inference_result, file=sys.stdout)
    # The HTML visualization goes to stderr so stdout stays machine-readable.
    html_str = convert_trees_to_mathml(
        ccg_tree_list, ccg_tokens_list, coq_scripts)
    print(html_str, file=sys.stderr)
def main(args=None):
    """CLI entry point: assign semantics to one or more (nbest) CCG trees
    per sentence and write the augmented XML to the output path."""
    DESCRIPTION = textwrap.dedent("""\
            categories_template.yaml should contain the semantic templates
              in YAML format.
            parsed_sentence.xml contains the CCG-parsed sentences.
            If --arbi-types is specified, then the arbitrary specification of
              types is enabled, thus using the argument as the field of the semantic
              template that is used. E.g, by specifying "--arbi-types coq_type"
              and a semantic template:
            - semantics: \P x.P(x)
              category: NP
              coq_type: Animal
            The type "Animal" will be used for this expression. Otherwise,
            types of the sem/logic module of NLTK are used.
      """)
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=DESCRIPTION)
    arg_parser.add_argument("ccg")
    arg_parser.add_argument("templates")
    arg_parser.add_argument("sem")
    arg_parser.add_argument("--arbi-types", action="store_true", default=False)
    # NOTE(review): store_true with default=True makes this flag a no-op;
    # there is no way to disable gold_trees from the command line — confirm
    # whether a --no-gold_trees option was intended.
    arg_parser.add_argument("--gold_trees", action="store_true", default=True)
    arg_parser.add_argument("--nbest", nargs='?', type=int, default="0")
    args = arg_parser.parse_args()

    if not os.path.exists(args.templates):
        print('File does not exist: {0}'.format(args.templates))
        sys.exit(1)
    if not os.path.exists(args.ccg):
        print('File does not exist: {0}'.format(args.ccg))
        sys.exit(1)

    logging.basicConfig(level=logging.WARNING)
    semantic_index = SemanticIndex(args.templates)
    xml_parser = etree.XMLParser(remove_blank_text=True)
    root = etree.parse(args.ccg, xml_parser)

    for sentence in root.findall('.//sentence'):
        # Fix: default to the first tree so tree_indices is always bound,
        # even if gold_trees is false and nbest == 1 (previously a NameError).
        tree_indices = [1]
        if args.gold_trees:
            # gold_tree annotations are zero-indexed; xpath is 1-indexed.
            tree_indices = [int(sentence.get('gold_tree', '0')) + 1]
        if args.nbest != 1:
            tree_indices = get_tree_indices(sentence, args.nbest)
        for tree_index in tree_indices:
            sem_node = etree.Element('semantics')
            sem_node.set(
                'ccg_id',
                sentence.xpath('./ccg[{0}]/@id'.format(tree_index))[0])
            try:
                sem_node.set('status', 'success')
                sem_tree = assign_semantics_to_ccg(
                    sentence, semantic_index, tree_index)
                sem_node.set(
                    'root',
                    sentence.xpath('./ccg[{0}]/@root'.format(tree_index))[0])
                filter_attributes(sem_tree)
                sem_node.extend(sem_tree.xpath('.//descendant-or-self::span'))
            except LogicalExpressionException as e:
                sem_node.set('status', 'failed')
                logging.error('An error occurred: {0}'.format(e))
            sentence.append(sem_node)

    root_xml_str = serialize_tree(root)
    with codecs.open(args.sem, 'wb') as fout:
        fout.write(root_xml_str)