Example #1
    def test_induction_from_corpus_tree(self):
        dsg = sentence_names_to_deep_syntax_graphs(["s26954"], "res/tiger/tiger_s26954.xml", hold=False)[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label
        labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=labeling, terminal_labeling=term_labeling)

        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
        parser.parse()
        self.assertTrue(parser.recognized())

        derivation = parser.best_derivation_tree()
        self.assertNotEqual(derivation, None)
Example #2
    def test_lcfrs_sdcp_parsing(self):
        def tree1():
            tree = ConstituentTree("1")
            for i, t in enumerate(["a", "b", "c", "d"]):
                tree.add_leaf(str(i), "P" + t, t)
            tree.set_label('r0', 'C')
            tree.set_label('r1', 'A')
            tree.set_label('r2', 'B')
            tree.add_to_root('r0')
            tree.add_child('r0', 'r1')
            tree.add_child('r0', 'r2')
            tree.add_child('r1', '0')
            tree.add_child('r1', '2')
            tree.add_child('r2', '1')
            tree.add_child('r2', '3')
            print(tree, tree.word_yield())
            return tree

        def tree2():
            tree = ConstituentTree("1")
            for i, t in enumerate(["a", "b", "d", "c"]):
                tree.add_leaf(str(i), "P" + t, t)
            tree.set_label('r0', 'C')
            tree.set_label('r1', 'A')
            tree.set_label('r2', 'B')
            tree.add_to_root('r0')
            tree.add_child('r0', 'r1')
            tree.add_child('r0', 'r2')
            tree.add_child('r1', '0')
            tree.add_child('r1', '3')
            tree.add_child('r2', '1')
            tree.add_child('r2', '2')
            print(tree, tree.word_yield())
            return tree

        t1 = tree1()
        t2 = tree2()

        grammar = direct_extract_lcfrs(t1)
        grammar.add_gram(direct_extract_lcfrs(t2))

        print(grammar)
        # LCFRS_sDCP_Parser.preprocess_grammar(grammar, PosTerminals(), debug=True)

        parser = LCFRS_sDCP_Parser(grammar,
                                   terminal_labelling=PosTerminals(),
                                   debug=True)
        for t in [t1, t2]:
            # parser = LCFRS_sDCP_Parser(grammar, t)
            parser.set_input(t)
            parser.parse()
            self.assertTrue(parser.recognized())
            derivs = list(parser.all_derivation_trees())
            for der in derivs:
                print(der)
            self.assertEqual(1, len(derivs))
            parser.clear()
Example #3
def fringe_extract_lcfrs(tree, fringes, naming='strict', term_labeling=PosTerminals(), isolate_pos=False, feature_logging=None):
    """
    :type tree: ConstituentTree
    :param fringes: recursive partitioning
    :param naming: 'strict' or 'child'
    :type naming: str
    :type term_labeling: ConstituentTerminalLabeling
    :rtype: LCFRS
    Get LCFRS for tree.
    """
    gram = LCFRS(start=START)
    first = None
    if len(tree.id_yield()) == 1 and isolate_pos:
        idx = tree.id_yield()[0]
        if tree.root[0] != idx:
            c_nont, c_spans, c_id_seq, c_nont_feat \
                = fringe_extract_lcfrs_recur(tree, fringes, gram, naming, term_labeling, isolate_pos, feature_logging,
                                             yield_one_check=False)

            fringe = fringes[0]
            spans = join_spans(fringe)
            args = []
            term_to_pos = {}  # maps input position to position in LCFRS rule
            for span in spans:
                args += [span_to_arg(span, [c_spans], tree, term_to_pos, term_labeling)]

            id_seq = make_id_seq(tree, tree.root[0], fringe)

            dcp_rules = []
            for (i, seq) in enumerate(id_seq):
                dcp_rhs = make_fringe_terms(tree, seq, [c_id_seq], term_to_pos, term_labeling)
                dcp_lhs = DCP_var(-1, i)
                dcp_rule = DCP_rule(dcp_lhs, dcp_rhs)
                dcp_rules += [dcp_rule]

            nont = id_nont(id_seq, tree, naming) + '/' + str(len(spans))
            nont_feat = feats(id_seq, tree)
            lhs = LCFRS_lhs(nont)
            for arg in args:
                lhs.add_arg(arg)
            rule = gram.add_rule(lhs, [c_nont], dcp=dcp_rules)
            if feature_logging is not None:
                feature_logging[(nont, nont_feat)] += 1
                feature_logging[(rule.get_idx(), nont_feat, tuple([c_nont_feat]))] += 1

            first = nont

    if first is None:
        (first, _, _, _) = fringe_extract_lcfrs_recur(tree, fringes, gram, naming, term_labeling, isolate_pos, feature_logging)
    lhs = LCFRS_lhs(START)
    lhs.add_arg([LCFRS_var(0, 0)])
    dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
    gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
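A hedged usage sketch, not taken from the original sources: it assumes `tree` is a ConstituentTree and that tree.recursive_partitioning() (method name assumed here, by analogy with the dsg.recursive_partitioning(...) calls in Examples #4 and #12) yields the recursive partitioning expected as fringes:

fringes = tree.recursive_partitioning()  # assumed API, see lead-in above
gram = fringe_extract_lcfrs(tree, fringes, naming='strict',
                            term_labeling=PosTerminals(), isolate_pos=False)
print(gram)  # the LCFRS with start symbol START constructed above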
Example #4
    def test_fanout_marking(self):
        label = 's1813'
        path = "res/tiger/tiger_8000.xml"
        dsgs = sentence_names_to_deep_syntax_graphs([label], path, hold=False, reorder_children=True)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        def rec_part_strategy(direction, subgrouping, fanout):
            if direction == "right-to-left":
                return lambda dsg: fanout_limited_partitioning(dsg.recursive_partitioning(subgrouping), fanout)
            else:
                return lambda dsg: fanout_limited_partitioning_left_to_right(dsg.recursive_partitioning(subgrouping),
                                                                             fanout)

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        def stupid_edge(edge):
            return "X"

        def label_child(edge, j):
            return edge.get_function(j)

        def simple_nonterminal_labeling(nodes, dsg):
            return simple_labeling(nodes, dsg, label_edge)

        def bot_stupid_nonterminal_labeling(nodes, dsg):
            return top_bot_labeling(nodes, dsg, label_edge, stupid_edge)

        def missing_child_nonterminal_labeling(nodes, dsg):
            return missing_child_labeling(nodes, dsg, label_edge, label_child)

        rec_part = rec_part_strategy("left-to-right", True, 2)(dsgs[0])
        print(rec_part)

        dcmp = compute_decomposition(dsgs[0], rec_part)
        print(dcmp)

        grammar = induce_grammar_from(dsgs[0], rec_part, dcmp, labeling=missing_child_nonterminal_labeling,
                                      terminal_labeling=term_labeling)

        print(grammar)

        for rule in grammar.rules():
            print(rule)
Example #5
 def __init__(self):
     self.normalize = False
     self.disconnect_punctuation = True
     self.terminal_labeling = PosTerminals()
     # self.nont_labeling = NonterminalsWithFunctions()
     self.nont_labeling = BasicNonterminalLabeling()
     self.binarize = True
     self.isolate_pos = True
     self.hmarkov = 0
     self.use_discodop_binarization = False
     self.discodop_binarization_params = [
         "--headrules=util/negra.headrules", "--binarize", "-h 1", "-v 1"
     ]
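These defaults are usually adjusted per experiment before induction; a minimal hedged sketch, assuming the class is the InductionSettings shown in Example #9:

settings = InductionSettings()
settings.hmarkov = 1                       # keep one sibling of horizontal context
settings.use_discodop_binarization = True  # delegate binarization to disco-dop
settings.terminal_labeling = PosTerminals()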
Example #6
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    gram = LCFRS(start=START)
    root = tree.root[0]
    if tree.is_leaf(root):
        lhs = LCFRS_lhs(START)
        label = term_labeling.token_label(tree.node_token(root))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge(), pos=tree.node_token(root).pos()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        first = direct_extract_lcfrs_prebinarized_recur(tree, root, gram, term_labeling, nont_labeling, isolate_pos)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
Example #7
                            filter=test_input_filter,
                            type=corpus_type_test)

    return train, dev, test, test_input


def my_feature_filter(elem):
    base_feats = ["number", "person", "tense", "mood", "case", "degree", "category", "pos", "gender"]
    feat_set = {feat: value for feat, value in elem[0]}
    if "pos" in feat_set and feat_set["pos"] in {"APPR", "APPRART"}:
        return extract_feat(elem[0], features=base_feats + ["lemma"])
    return extract_feat(elem[0], features=base_feats)


# FINE_TERMINAL_LABELING = FeatureTerminals(token_to_features, feature_filter=my_feature_filter)
FINE_TERMINAL_LABELING = CompositionalTerminalLabeling(FormTerminals(), PosTerminals())
FALLBACK_TERMINAL_LABELING = PosTerminals()

DEFAULT_RARE_WORD_THRESHOLD = 10


def terminal_labeling(corpus, threshold=DEFAULT_RARE_WORD_THRESHOLD):
    return FrequencyBiasedTerminalLabeling(FINE_TERMINAL_LABELING, FALLBACK_TERMINAL_LABELING, corpus, threshold)
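For context: FrequencyBiasedTerminalLabeling keeps the fine labeling (form plus POS) for word forms that are frequent in the corpus and falls back to plain POS tags for rare ones. A hedged sketch, assuming `corpus` is an iterable of constituent trees exposing the token_yield() interface used elsewhere in these examples:

labeling = terminal_labeling(corpus)  # threshold defaults to DEFAULT_RARE_WORD_THRESHOLD
for tree in corpus:
    # frequent forms keep the compositional form+POS label; rare forms map to POS only
    print([labeling.token_label(token) for token in tree.token_yield()])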


MULTI_OBJECTIVES = "multi-objectives"
MULTI_OBJECTIVES_INDEPENDENT = "multi-objectives-independent"
BASE_GRAMMAR = "base-grammar" # use base grammar for parsing (no annotations LA)
MAX_RULE_PRODUCT_ONLY = "max-rule-product-only"
TEST_SECOND_HALF = False
Example #8
def direct_extract_lcfrs(tree,
                         term_labeling=PosTerminals(),
                         nont_labeling=BasicNonterminalLabeling(),
                         binarize=False,
                         isolate_pos=False,
                         hmarkov=0):
    """
    :type tree: ConstituentTree
    :type term_labeling: ConstituentTerminalLabeling
    :type binarize: bool
    :type isolate_pos: bool
    :type hmarkov: int
    :rtype: LCFRS
    Extract LCFRS directly from hybrid tree.
    """
    assert not binarize or isolate_pos

    # Binarization without POS isolation requires a more sophisticated sDCP handling
    # with more than one variable per nonterminal. This is not implemented.
    # see, e.g., TIGER sentence 328:
    #
    #                                             ROOT
    # ┌──────────────────────┬─────────────────────┼────────────────────────────────────────────────────────────────┐
    # │                      │                     S                                                                │
    # │                      │    ┌────┬───┬───────┴───────────┬──────┬──────────────────────────────────┐          │
    # │                      │    │    NP  │                   │      │                                  │          │
    # │     ┌──────┬────┬─── │ ── │ ───┴── │ ───────────────── │ ──── │ ────────────┐                    │          │
    # │     │      │    │    │    │        NP                  │      │             PP                   PP         │
    # │     │      │    │    │    │    ┌───┴───────┐           │      │      ┌──────┼──────┐       ┌─────┴────┐     │
    # $[   ADV    ADV  CARD  $[ VAFIN ART          NN         ADV    ADV    APPR   CARD    NN   APPRART       NN    $.
    # │     │      │    │    │    │    │           │           │      │      │      │      │       │          │     │
    # `` Deutlich über 5000  ''  hat  die   SPD-Stadtregieru jetzt jeweils binnen zwölf Monaten    im       Visier  .
    #                                              ng
    #
    # [a] S/1(LCFRS_var(mem=0, arg=0) LCFRS_var(mem=1, arg=0) ADV ADV LCFRS_var(mem=0, arg=1) LCFRS_var(mem=1, arg=1))
    #     -> NP/2 BAR/S/2		<0>=S:{--}(<0,0> <1,0> [0:{MO}]() [1:{MO}]())
    # [b] BAR/S/2(VAFIN LCFRS_var(mem=0, arg=0); LCFRS_var(mem=1, arg=0))
    #     -> NP/1 PP/1		<0>=[0:{HD}]() <0,0> <1,0>
    # here the <1,0> variable in rule [a] needs to be split into <1,0> and <1,1>, because the NP occurs before "ADV ADV"
    # in the canonically ordered tree, but PP occurs afterwards
    #
    # TODO: Claim: This only affects BAR/.. nonterminals. Not more than fanout(BAR/...) nonterminals are needed.
    # TODO: Thus, each BAR/.. nonterminal gets a uniform number of sDCP arguments, some may be empty.
    # TODO: The term.args() string can be analyzed to construct appropriate sDCP rules for the binarization artifacts.

    gram = LCFRS(start=START)
    root = tree.root[0]
    if tree.is_leaf(root):
        lhs = LCFRS_lhs(START)
        label = term_labeling.token_label(tree.node_token(root))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge(), pos=tree.node_token(root).pos()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        first = direct_extract_lcfrs_from(tree, root, gram, term_labeling, nont_labeling, binarize, isolate_pos,
                                          hmarkov=hmarkov)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
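The assertion above forbids binarize=True without isolate_pos=True, for the reasons spelled out in the long comment. A hedged illustration, with `t` standing for a ConstituentTree such as the one built by tree1() in Example #2:

g_plain = direct_extract_lcfrs(t)                                 # defaults: no binarization
g_bin = direct_extract_lcfrs(t, binarize=True, isolate_pos=True)  # permitted combination
# direct_extract_lcfrs(t, binarize=True)  # AssertionError: binarize requires isolate_pos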
Example #9
import plac
from constituent.induction import direct_extract_lcfrs, BasicNonterminalLabeling, \
    direct_extract_lcfrs_from_prebinarized_corpus
from grammar.induction.terminal_labeling import PosTerminals, FrequencyBiasedTerminalLabeling, \
    FormTerminals, CompositionalTerminalLabeling, UNK4, Suffix
from experiment.resources import TRAINING, VALIDATION, TESTING, TESTING_INPUT, RESULT, CorpusFile
from experiment.split_merge_experiment import SplitMergeExperiment
from experiment.hg_constituent_experiment import ConstituentExperiment, ScoringExperiment, ScorerAndWriter, \
    setup_corpus_resources, MULTI_OBJECTIVES, MULTI_OBJECTIVES_INDEPENDENT, BASE_GRAMMAR, MAX_RULE_PRODUCT_ONLY

TEST_SECOND_HALF = False  # parse second half of test set

# FINE_TERMINAL_LABELING = FeatureTerminals(token_to_features, feature_filter=my_feature_filter)
# FINE_TERMINAL_LABELING = FormTerminals()
FINE_TERMINAL_LABELING = CompositionalTerminalLabeling(FormTerminals(),
                                                       PosTerminals())
FALLBACK_TERMINAL_LABELING = PosTerminals()
DEFAULT_RARE_WORD_THRESHOLD = 10


def terminal_labeling(corpus, threshold=DEFAULT_RARE_WORD_THRESHOLD):
    #   return FrequencyBiasedTerminalLabeling(FINE_TERMINAL_LABELING, FALLBACK_TERMINAL_LABELING, corpus, threshold)
    #   return UNK4(trees=corpus, threshold=threshold, use_pos=True)
    return Suffix(trees=corpus, threshold=threshold, suffix_length=2)


class InductionSettings:
    def __init__(self):
        self.normalize = False
        self.disconnect_punctuation = True
        self.terminal_labeling = PosTerminals()
Example #10
import unittest
from corpora.negra_parse import sentence_names_to_hybridtrees
from constituent.induction import direct_extract_lcfrs_from_prebinarized_corpus
from grammar.induction.terminal_labeling import FormTerminals, PosTerminals, CompositionalTerminalLabeling, \
    FrequencyBiasedTerminalLabeling
from parser.naive.parsing import LCFRS_parser
from hybridtree.constituent_tree import HybridTree
from hybridtree.monadic_tokens import construct_constituent_token
from copy import deepcopy
from parser.sDCP_parser.sdcp_parser_wrapper import PysDCPParser, LCFRS_sDCP_Parser
from parser.sDCPevaluation.evaluator import DCP_evaluator, dcp_to_hybridtree
from parser.discodop_parser.parser import DiscodopKbestParser

fine_terminal_labeling = CompositionalTerminalLabeling(FormTerminals(),
                                                       PosTerminals())
fallback_terminal_labeling = PosTerminals()

terminal_threshold = 10


def terminal_labeling(corpus, threshold=terminal_threshold):
    return FrequencyBiasedTerminalLabeling(fine_terminal_labeling,
                                           fallback_terminal_labeling, corpus,
                                           threshold)


class TestDiscodopLCFRSInduction(unittest.TestCase):
    def test_something(self):
        normal_corpus = 'res/tiger/tiger_8000.export'
        binarized_corpus = 'res/tiger/tiger_8000_bin.export'
        limit = 55000
Example #11
def construct_terminal_labeling(labeling, corpus, threshold=DEFAULT_RARE_WORD_THRESHOLD):
    if labeling == "form+pos":
        return FrequencyBiasedTerminalLabeling(FINE_TERMINAL_LABELING, FALLBACK_TERMINAL_LABELING, corpus, threshold)
    elif labeling == "suffixes":
        return FrequentSuffixTerminalLabeling(corpus, threshold)
    elif labeling == 'suffixes+pos':
        return CompositionalTerminalLabeling(FrequentSuffixTerminalLabeling(corpus, threshold), PosTerminals())
    elif labeling in ('stanford4', 'stanford4de'):
        unk_model = 'unknownword4%s' % ("de" if labeling == 'stanford4de' else "")
        return StanfordUNKing(corpus, unknown_threshold=threshold, openclass_threshold=50, unk_model=unk_model)
    else:
        raise Exception("Unknown terminal labeling \"%s\"" % labeling)
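A hedged usage sketch, with `train_corpus` standing in for the training trees used elsewhere in these examples:

labeling = construct_terminal_labeling('suffixes+pos', train_corpus,
                                       threshold=DEFAULT_RARE_WORD_THRESHOLD)
# an unrecognized name raises, e.g.:
# construct_terminal_labeling('typo', train_corpus)  # Exception: Unknown terminal labeling "typo"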
Example #12
    def test_primary_tree_violation_workaround(self):
        label = 's150'
        label2 = 's6516'
        path = "res/tiger/tiger_8000.xml"
        train_dsgs = sentence_names_to_deep_syntax_graphs([label, label2], path, hold=False, reorder_children=True)
        binarize = True

        # Grammar induction
        term_labeling_token = PosTerminals()

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        if binarize:
            def modify_token(token):
                if isinstance(token, ConstituentCategory):
                    token_new = deepcopy(token)
                    token_new.set_category(token.category() + '-BAR')
                    return token_new
                elif isinstance(token, str):
                    return token + '-BAR'
                else:
                    assert False

            train_dsgs = [dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs]

            def is_bin(token):
                if isinstance(token, ConstituentCategory):
                    if token.category().endswith('-BAR'):
                        return True
                elif isinstance(token, str):
                    if token.endswith('-BAR'):
                        return True
                return False

            def debinarize(dsg):
                return dsg.debinarize(is_bin=is_bin)

        else:
            def debinarize(dsg):
                return dsg  # identity: nothing to debinarize

        def rec_part_strategy(direction, subgrouping, fanout):
            if direction == "right-to-left":
                return lambda dsg: fanout_limited_partitioning(dsg.recursive_partitioning(subgrouping), fanout)
            else:
                return lambda dsg: fanout_limited_partitioning_left_to_right(
                    dsg.recursive_partitioning(subgrouping, weak=True),
                    fanout)
        the_rec_part_strategy = rec_part_strategy("left-to-right", True, 1)

        def simple_nonterminal_labeling(nodes, dsg):
            return simple_labeling(nodes, dsg, label_edge)
        # render_and_view_dog(train_dsgs[0].dog, 'train_dsg_tmp')
        grammar = induction_on_a_corpus(train_dsgs, the_rec_part_strategy, simple_nonterminal_labeling, term_labeling)
Example #13
    def test_json_corpus_grammar_export(self):
        start = 1
        stop = 50
        # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
            , path
            , hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
        grammar.make_proper()

        terminals = Enumerator()

        data = export_dog_grammar_to_json(grammar, terminals)
        grammar_path = '/tmp/json_grammar.json'
        with open(grammar_path, 'w') as file:
            json.dump(data, file)

        corpus_path = '/tmp/json_corpus.json'
        with open(corpus_path, 'w') as file:
            json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

        with open('/tmp/enumerator.enum', 'w') as file:
            terminals.print_index(file)

        reduct_dir = '/tmp/reduct_grammars'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)
        p = subprocess.Popen([' '.join(
            ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g', grammar_path, '-t',
             corpus_path, "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)

        rtgs = []
        for i in range(1, len(dsgs) + 1):
            rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

        derivation_manager = PyDerivationManager(grammar)
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

        for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
            derivations = [LCFRSDerivationWrapper(der) for der in derivation_manager.enumerate_derivations(i, grammar)]
            self.assertGreaterEqual(len(derivations), 1)
            if len(derivations) > 1:
                print("Sentence", i)
                for der in derivations:
                    print(der)

            for der in derivations:
                dog, sync = dog_evaluation(der)
                dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
                dsg.dog.project_labels(f)
                dsg.sentence = list(map(f, dsg.sentence))
                self.assertEqual(dsg.sentence, dsg2.sentence)
                morphs = dsg.dog.compute_isomorphism(dsg2.dog)
                self.assertFalse(morphs is None)
                self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                      for syncs in dsg.synchronization], dsg2.synchronization)
Example #14
    def test_induction_on_a_corpus(self):
        interactive = False
        start = 1
        stop = 50
        path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        # path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
            , path
            , hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label
        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()
        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling, normalize=True)
        grammar.make_proper()

        parser = CFGParser(grammar)

        scorer = PredicateArgumentScoring()

        for dsg in dsgs:
            parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
            parser.parse()
            self.assertTrue(parser.recognized())
            derivation = parser.best_derivation_tree()
            dog, sync = dog_evaluation(derivation)
            dsg2 = DeepSyntaxGraph(dsg.sentence, dog, sync)

            f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token
            dsg.dog.project_labels(f)
            parser.clear()

            scorer.add_accuracy_frames(
                dsg.labeled_frames(guard=lambda x: len(x[1]) > 0),
                dsg2.labeled_frames(guard=lambda x: len(x[1]) > 0)
            )

            # print('dsg: ', dsg.dog, '\n', [dsg.get_graph_position(i) for i in range(len(dsg.sentence))],
            # '\n\n parsed: ', dsg2.dog, '\n', [dsg2.get_graph_position(i+1) for i in range(len(dsg2.sentence))])
            # print()
            if interactive:
                if dsg.label == 's50':
                    pass
                if dsg.dog != dog:
                    z1 = render_and_view_dog(dsg.dog, "corpus_" + dsg.label)
                    z2 = render_and_view_dog(dog, "parsed_" + dsg.label)
                    z1.communicate()
                    z2.communicate()

        print("Labeled frames:")
        print("P", scorer.labeled_frame_scorer.precision(), "R", scorer.labeled_frame_scorer.recall(),
              "F1", scorer.labeled_frame_scorer.fmeasure())
        print("Labeled dependencies:")
        print("P", scorer.labeled_dependency_scorer.precision(), "R", scorer.labeled_dependency_scorer.recall(),
              "F1", scorer.labeled_dependency_scorer.fmeasure())
Example #15
def main2():
    induction_settings = InductionSettings()

    # terminal labeling
    induction_settings.terminal_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return induction_settings.terminal_labeling_token.token_label(
                token)
        else:
            return token

    induction_settings.terminal_labeling = term_labeling

    # recursive partitioning
    def rec_part_strategy(direction, subgrouping, fanout, binarize):
        if direction == "right-to-left":
            return lambda dsg: fanout_limited_partitioning(
                dsg.recursive_partitioning(subgrouping, weak=binarize), fanout)
        else:
            return lambda dsg: fanout_limited_partitioning_left_to_right(
                dsg.recursive_partitioning(subgrouping, weak=binarize), fanout)

    induction_settings.binarize = True
    induction_settings.direction = "left-to-right"
    induction_settings.subgrouping = False
    induction_settings.fanout = 1
    induction_settings.rec_part_strategy = rec_part_strategy(
        induction_settings.direction, induction_settings.subgrouping,
        induction_settings.fanout, induction_settings.binarize)

    # Nonterminal Labeling
    induction_settings.start = "START"

    def label_edge(edge):
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    def stupid_edge(edge):
        return "X"

    def label_child(edge, j):
        return edge.get_function(j)

    def simple_nonterminal_labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    def bot_stupid_nonterminal_labeling(nodes, dsg):
        return top_bot_labeling(nodes, dsg, label_edge, stupid_edge)

    def missing_child_nonterminal_labeling(nodes, dsg):
        return missing_child_labeling(nodes, dsg, label_edge, label_child)

    induction_settings.nonterminal_labeling = simple_nonterminal_labeling
    induction_settings.normalize = True

    experiment = DOGExperiment(induction_settings)

    # Corpora
    start = 1
    stop = 2000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []

    experiment.resources[TRAINING] = CorpusFile(corpus_path, start, stop)
    experiment.resources[TESTING] = CorpusFile(corpus_path, test_start,
                                               test_stop)
    experiment.oracle_parsing = True
    experiment.purge_rule_freq = None  # 1.0
    experiment.k_best = 100
    experiment.run_experiment()
Example #16
def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        def debinarize(dsg):
            return dsg  # identity: nothing to debinarize

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)
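
    # NOTE: the early return above makes everything below unreachable; it is
    # retained as a reference for the reduct computation and EM/SM training steps.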

    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode('unicode_escape'))
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return
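    # NOTE: unreachable below this early return; retained as a split/merge training reference.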
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception("unknown parsing_method: %s" % parsing_method)
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser