Beispiel #1
0
 def test_from_stream(self):
     """Parse one bracketed tree twice from a single stream and compare.

     The stream holds the tree once with every space replaced by three
     newlines and once unchanged; the two parsed trees must render to
     the same string.
     """
     bracketed = '(ADVP (ADV widely) (CONJ and) (ADV friendly))'
     stretched = bracketed.replace(' ', '\n\n\n')
     first, second = Tree.from_stream(StringIO(stretched + bracketed))
     eq_(str(first), str(second))
Beispiel #2
0
    def print_before_and_after_collapse_WSJ_trees(self):
        """Print every tree in ``wsj-test.psd`` before and after collapsing.

        For each tree read from the file, the pretty-printed original and
        the pretty-printed result of ``kidnap_daughter()`` are written to
        stdout. Nothing is asserted; this is a visual debugging aid.
        """
        # Swap in "wsj-normalized.psd" to dump the full treebank instead.
        filename = "wsj-test.psd"
        separator = (
            '////////////////////////////////////////////////////////////////////'
        )

        # The handle stays open for the whole loop (from_stream may read the
        # file lazily) and is closed by the context manager afterwards. The
        # previous `f.close` lacked parentheses and never closed anything.
        with open(filename, "r", encoding="utf-8") as f:
            for t in Tree.from_stream(f):
                before = Tree.pretty(t)
                print(separator)
                print('BEFORE *************************')
                print(before)

                tree_collapsed = t.kidnap_daughter()
                after = Tree.pretty(tree_collapsed)
                print('AFTER *************************')
                print(after)
Beispiel #3
0
    def test_collapse_unary_WSJ(self):
        """Compare ``Tree.kidnap_daughter`` with the reference unary collapse.

        ``wsj-normalized.psd`` is read twice, once through ``ExpectedTree``
        and once through ``Tree``. For each pair of trees the pretty-printed
        result of ``ExpectedTree.collapse_unary`` is compared with the
        pretty-printed result of ``Tree.kidnap_daughter``. A PASS/FAIL line is
        printed per tree, followed by a full report when ``VERBOSE`` is set,
        and pass/fail totals are printed at the end.

        Raises:
            AssertionError: if at least one tree pair differs.
        """
        filename = "wsj-normalized.psd"
        separator = (
            '////////////////////////////////////////////////////////////////////'
        )
        passed = 0
        failed = 0

        # Both handles stay open while the trees are consumed (from_stream may
        # read lazily) and are closed when the block exits. The previous
        # `f.close` statements lacked parentheses and never closed anything.
        with open(filename, "r", encoding="utf-8") as f_expect, \
                open(filename, "r", encoding="utf-8") as f_actual:
            pairs = zip(ExpectedTree.from_stream(f_expect),
                        Tree.from_stream(f_actual))
            for counter, (t_expect, t_actual) in enumerate(pairs, start=1):
                print('Collapse Unary - TEST ' + str(counter) + ': ', end='')

                # Render the input tree before it is collapsed.
                t_before = Tree.pretty(t_actual)

                # Expected tree after collapse, from the reference solution.
                col_tree_expect = ExpectedTree.collapse_unary(t_expect)
                expect = ExpectedTree.pretty(col_tree_expect)

                # Actual tree after collapse, from the implementation under test.
                col_tree_actual = Tree.kidnap_daughter(t_actual)
                actual = Tree.pretty(col_tree_actual)

                # Record the outcome instead of aborting, so that every tree
                # is checked and reported.
                try:
                    eq_(expect, actual)
                except AssertionError:
                    failed += 1
                    print(" FAIL")
                    banner = ': FAIL ************************************'
                else:
                    passed += 1
                    print(" PASS")
                    banner = ': PASS *********************************'

                if VERBOSE:
                    print(separator)
                    print('Collapse Unary - TEST: ' + str(counter) + banner)
                    print(t_before)
                    print('EXPECTED: *************************')
                    print(expect)
                    print('ACTUAL *************************')
                    print(actual)

        print("Passed: " + str(passed))
        print("Failed: " + str(failed))
        if failed > 0:
            raise AssertionError(
                str(failed) + " tree(s) differ from the expected unary collapse")
Beispiel #4
0
    def test_kidnap_daughter_collapse_small_WSJ_trees(self):
        """Check ``Tree.kidnap_daughter`` against hand-written expected trees.

        Reads the trees in ``wsj-test.psd``, collapses each one with
        ``Tree.kidnap_daughter`` and compares the pretty-printed result with
        the literal text given below for the first and the second tree.
        The input, the result and the expectation are printed along the way.

        Raises:
            AssertionError: if a collapsed tree differs from its expected
                text. A third or later tree has no expected text and is
                compared against ``None``.
        """
        filename = "wsj-test.psd"  # small test file - only 2 trees

        # Materialise the trees while the handle is open so it can be closed
        # right away, even if from_stream reads lazily. The previous `f.close`
        # lacked parentheses and never closed the file.
        with open(filename, "r", encoding="utf-8") as f:
            trees_before = list(Tree.from_stream(f))

        for index, t in enumerate(trees_before, start=1):
            before = Tree.pretty(t)
            print(
                '////////////////////////////////////////////////////////////////////'
            )
            print('TEST: ' + str(index) +
                  '  ************************************')
            print(before)

            tree_collapsed = Tree.kidnap_daughter(t)
            actual = Tree.pretty(tree_collapsed)
            print('AFTER *************************')
            print(actual)

            # inspect.cleandoc strips the shared indentation, so the literals
            # can be indented with the surrounding code.
            expect = None
            if index == 1:
                expect = inspect.cleandoc("""
                    (TOP
                        (NP-SBJ
                            (NP
                                (NNP <NNP>)
                                (NNP <NNP>)
                            )
                            (, ,)
                            (ADJP
                                (NP
                                    (CD <CD>)
                                    (NNS years)
                                )
                                (JJ old)
                            )
                            (, ,)
                        )
                        (VP
                            (MD will)
                            (VP
                                (VB join)
                                (NP
                                    (DT the)
                                    (NN board)
                                )
                                (PP-CLR
                                    (IN as)
                                    (NP
                                        (DT a)
                                        (JJ nonexecutive)
                                        (NN director)
                                    )
                                )
                                (NP-TMP
                                    (NNP <NNP>)
                                    (CD <CD>)
                                )
                            )
                        )
                        (. .)
                    )""")
            elif index == 2:
                expect = inspect.cleandoc("""
                    (TOP
                        (NP-SBJ
                            (NNP <NNP>)
                            (NNP <NNP>)
                        )
                        (VP
                            (VBZ is)
                            (NP-PRD
                                (NP
                                    (NN chairman)
                                )
                                (PP
                                    (IN of)
                                    (NP
                                        (NP
                                            (NNP <NNP>)
                                            (NNP <NNP>)
                                        )
                                        (, ,)
                                        (NP
                                            (DT the)
                                            (NNP <NNP>)
                                            (VBG publishing)
                                            (NN group)
                                        )
                                    )
                                )
                            )
                        )
                        (. .)
                    )""")

            print('EXPECTED: *************************')
            print(expect)
            print('ACTUAL *************************')
            print(actual)

            eq_(expect, actual)
Beispiel #5
0
    def test_set_productions_WSJ(self):
        """Compare the productions of all trees with the reference solution.

        ``wsj-test.psd`` is read twice, once through ``ExpectedTree`` and once
        through ``Tree``. Each tree is passed through ``collapse_unary``,
        ``chomsky_normal_form`` and ``productions`` of its own class, and the
        productions of all trees are accumulated. The two accumulated lists
        are rendered with ``pretty_productions`` and compared once.

        Raises:
            AssertionError: if the rendered production lists differ.
        """
        filename = "wsj-test.psd"
        separator = (
            '////////////////////////////////////////////////////////////////////'
        )

        t_expect_pro = []
        t_actual_pro = []
        counter = 0  # number of trees processed; stays 0 for an empty file

        # Both handles stay open while the trees are consumed (from_stream may
        # read lazily) and are closed when the block exits. The previous
        # `f.close` statements lacked parentheses and never closed anything.
        with open(filename, "r", encoding="utf-8") as f_expect, \
                open(filename, "r", encoding="utf-8") as f_actual:
            pairs = zip(ExpectedTree.from_stream(f_expect),
                        Tree.from_stream(f_actual))
            for counter, (t_expect, t_actual) in enumerate(pairs, start=1):
                # Expected productions, from the reference solution.
                t_expect_col = ExpectedTree.collapse_unary(t_expect)
                t_expect_cnf = ExpectedTree.chomsky_normal_form(t_expect_col)
                t_expect_pro += ExpectedTree.productions(t_expect_cnf)

                # Actual productions, from the implementation under test.
                t_actual_col = Tree.collapse_unary(t_actual)
                t_actual_cnf = Tree.chomsky_normal_form(t_actual_col)
                t_actual_pro += Tree.productions(t_actual_cnf)

        expect = ExpectedTree.pretty_productions(t_expect_pro)
        actual = Tree.pretty_productions(t_actual_pro)

        # Record the outcome first so the report below is always printed.
        try:
            eq_(expect, actual)
        except AssertionError:
            matched = False
        else:
            matched = True

        print(" PASS" if matched else " FAIL")
        if VERBOSE:
            print(separator)
            print('Productions SET - TEST: ' + str(counter) +
                  '  ************************************')
            print('EXPECTED: *************************')
            print(expect)
            print('ACTUAL *************************')
            print(actual)
            if not matched:
                print('----')

        # A single comparison is made, so the totals are always 1/0 or 0/1.
        print("Passed: " + str(1 if matched else 0))
        print("Failed: " + str(0 if matched else 1))
        if not matched:
            raise AssertionError(
                "productions differ from the expected productions")
Beispiel #6
0
    def get_before_and_expected_values_from_WSF(self, tree_index, actual):
        """Print the reference transforms of one WSJ tree, optionally checking ours.

        The tree at position ``tree_index`` of ``wsj-normalized.psd`` is
        printed as read, followed by the reference (``ExpectedTree``) unary
        collapse, CNF conversion and productions. If no tree has that
        position, nothing is printed.

        Args:
            tree_index: 1-based position of the tree in the file.
            actual: when truthy, the ``Tree`` implementation is run on the
                same tree, its three results are printed and each one is
                asserted equal to the reference result.

        Raises:
            AssertionError: if ``actual`` is truthy and a result differs.
        """
        filename = "wsj-normalized.psd"

        # Both handles stay open while the trees are consumed (from_stream may
        # read lazily) and are closed when the block exits. The previous
        # `f.close` statements lacked parentheses and never closed anything.
        with open(filename, "r", encoding="utf-8") as f_expect, \
                open(filename, "r", encoding="utf-8") as f_actual:
            pairs = zip(ExpectedTree.from_stream(f_expect),
                        Tree.from_stream(f_actual))
            for counter, (t_expect, t_actual) in enumerate(pairs, start=1):
                if counter != tree_index:
                    continue

                print('Convert CNF - TEST ' + str(counter) + ': ', end='')

                before = Tree.pretty(t_actual)

                # Expected results, from the reference solution.
                t_expect_unary = ExpectedTree.collapse_unary(t_expect)
                t_expect_cnf = ExpectedTree.chomsky_normal_form(t_expect_unary)
                t_expect_prod = ExpectedTree.productions(t_expect_cnf)

                expect_unary = ExpectedTree.pretty(t_expect_unary)
                expect_cnf = ExpectedTree.pretty(t_expect_cnf)
                expect_prod = ExpectedTree.pretty_productions(t_expect_prod)

                print(
                    '// BEFORE VALUE ////////////////////////////////////////////////////////////////////////'
                )
                print('WSJ TREE (BEFORE) #' + str(counter) +
                      '  ************************************')
                print(before)

                print(
                    '// EXPECTED VALUES ////////////////////////////////////////////////////////////////////////'
                )
                print('EXPECTED UNARY COLLAPSE #' + str(counter) +
                      '  ************************************')
                print(expect_unary)
                print('EXPECTED Convert CNF - TEST: ' + str(counter) +
                      '  ************************************')
                print(expect_cnf)
                print('EXPECTED Generate Productions: ' + str(counter) +
                      '  ************************************')
                print(expect_prod)

                if actual:
                    # NOTE(review): each result is rendered before the next
                    # transform runs, and the original author warned that the
                    # statement order matters here -- presumably the
                    # transforms modify the tree in place. Confirm before
                    # reordering.
                    t_actual_unary = Tree.collapse_unary(t_actual)
                    actual_unary = Tree.pretty(t_actual_unary)

                    t_actual_cnf = Tree.chomsky_normal_form(t_actual_unary)
                    actual_cnf = Tree.pretty(t_actual_cnf)

                    # Productions come from the CNF tree, mirroring the
                    # expected side (previously taken from the unary tree).
                    t_actual_prod = Tree.productions(t_actual_cnf)
                    actual_prod = Tree.pretty_productions(t_actual_prod)

                    print(
                        '// ACTUAL /////////////////////////////////////////////////////////////////////////////////'
                    )
                    print('ACTUAL UNARY COLLAPSE #' + str(counter) +
                          '  ************************************')
                    print(actual_unary)
                    print('ACTUAL Convert CNF - TEST: ' + str(counter) +
                          '  ************************************')
                    print(actual_cnf)
                    print('ACTUAL Generate Productions: ' + str(counter) +
                          '  ************************************')
                    print(actual_prod)

                    eq_(expect_unary, actual_unary)
                    eq_(expect_cnf, actual_cnf)
                    eq_(expect_prod, actual_prod)

                # Only one tree is requested; stop reading the treebank.
                break

class Timeout(object):
    """Context manager that raises ``TimeoutError`` after ``seconds`` seconds.

    On entry a ``SIGALRM`` handler is installed and an alarm is scheduled
    with ``signal.alarm``; on exit the alarm is cancelled and the handler
    that was active before entry is put back.

    Args:
        seconds: delay passed to ``signal.alarm`` before the alarm fires.
        error_message: message carried by the raised ``TimeoutError``.
    """

    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message
        # Handler that was active before __enter__; restored by __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: raise ``TimeoutError`` with the configured message."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        """Install the alarm handler, start the countdown and return self."""
        self._previous_handler = signal.signal(signal.SIGALRM,
                                               self.handle_timeout)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, type, value, traceback):
        """Cancel the pending alarm and restore the previous handler.

        Returns None, so an exception raised inside the block propagates.
        """
        signal.alarm(0)
        # signal.signal() reports None when the earlier handler was not
        # installed from Python; such a handler cannot be re-installed here.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None


if __name__ == "__main__":
    # Load the grammar, then run CYK_chart on the leaves of every tree in
    # TREE_SOURCE under a per-sentence time limit of TIMEOUT seconds.
    pcfg = PCFG.load(PCFG_SOURCE)
    print("PCFG loaded.")
    with open(TREE_SOURCE, "r") as source:
        for tree in Tree.from_stream(source):
            tokens = [leaf.decode("ASCII") for leaf in tree.leaves()]
            try:
                with Timeout(TIMEOUT):
                    _, bw_prob = CYK_chart(pcfg, tokens)
                    print(bw_prob)
            except TimeoutError:
                # The sentence ran out of time: skip it and move on.
                continue
        )
        (VP
            (VB gave)
            (NP
                (DT the)
                (NN lecture)
            )
        )
    )"""
    
    # uncomment to use the above simple trees for debugging:
    #    trees = [TRANSFORM(Tree.from_string(t)) for t in (t0, t1)]
    #    grammar = PCFG.from_trees(trees)

    # let's get some input to build a grammar:
    grammar = PCFG.from_trees(list(TRANSFORM(t) for t in Tree.from_stream(GzipFile('bigger_treebank_2.txt.gz'))))
    print "Read {} rules in grammar.".format(len(grammar))
    trees = list(TRANSFORM(t) for t in Tree.from_stream(open('end_of_wsj.txt')))
    print "Read {} trees.".format(len(trees))
    
    # now try and parse our trees:
    results = []
    
    for idx, tree in enumerate(trees):
        tokens = [(t,) for t in tree.terminals()]        
        # print 'Sentence {}\tTokens: "{}"'.format(idx, ' '.join(tree.terminals()))
        chart = Chart(grammar, tokens)
        chart.pretty_print()
        has_parse = chart.extract_parse()
        if not has_parse:
            print 'Sentence {}\tTokens: "{}" has no parse!'.format(idx, ' '.join(tree.terminals()))            
        Given a `daughter` (a non-terminal) return a corresponding 
        dictionary of (start, BitWeight) pairs.
        """
        return self.starts.get(daughter)

    def nonterminal_rules(self, daughters):
        """
        Look up `daughters` (two daughters) in the non-terminal rule table
        and return the dictionary of (nonterminal, BitWeight) pairs stored
        for them. The lookup uses `.get` without a default, so a miss
        yields whatever the table's `get` returns (None for a dict).
        """
        rules_for_daughters = self.nonterms.get(daughters)
        return rules_for_daughters

    def preterminal_rules(self, terminal):
        """
        Look up `terminal` in the pre-terminal rule table and return the
        dictionary of (preterminal, BitWeight) pairs stored for it. The
        lookup uses `.get` without a default, so a miss yields whatever
        the table's `get` returns (None for a dict).
        """
        rules_for_terminal = self.preterms.get(terminal)
        return rules_for_terminal

    def __len__(self):
        """
        Return the size of the grammar: the number of entries in `starts`
        plus the number of rules held in the inner dictionaries of
        `nonterms` and `preterms`.
        """
        # Iterate the inner rule dictionaries via .values(). Iterating the
        # tables directly yields their keys, so the old code summed the
        # lengths of the keys instead of counting rules.
        # NOTE(review): `starts` is counted by its number of keys, as
        # before -- confirm whether its inner dictionaries should be
        # summed as well.
        return (len(self.starts)
                + sum(len(inner) for inner in self.nonterms.values())
                + sum(len(inner) for inner in self.preterms.values()))


if __name__ == "__main__":
    # Build a PCFG from the trees read from SOURCE, report its size and
    # dump it to SINK.
    with open(SOURCE, "r") as source:
        pcfg = PCFG.from_trees(Tree.from_stream(source))
    print("|G| = {}".format(len(pcfg)))
    pcfg.dump(SINK)