Exemple #1
0
def parse_sentences_from_file(grammar,
                              parser_type,
                              experiment,
                              connection,
                              path,
                              tree_yield,
                              max_length=sys.maxsize,
                              limit=sys.maxsize,
                              quiet=False,
                              ignore_punctuation=True,
                              root_default_deprel=None,
                              disconnected_default_deprel=None):
    """
    Parse sentences from corpus and compare derived dependency structure with
    gold standard information.

    Each sentence whose id_yield() is not longer than max_length is handed to
    a fresh parser_type instance. The outcome (a result tree or a "no_parse"
    entry) is stored through experiment_database. Unless quiet is set,
    averaged scores and the elapsed time are printed at the end.

    :rtype: None
    :param grammar: grammar that is preprocessed and passed to parser_type
    :type grammar: LCFRS
    :param parser_type: parser class; used via preprocess_grammar(grammar),
                        parser_type(grammar, input),
                        dcp_hybrid_tree_best_derivation(...) and best()
    :param experiment: experiment identifier forwarded to experiment_database
    :param connection: database connection forwarded to experiment_database
                       and add_trees_to_db
    :param path: file path for test corpus (dependency grammar in CoNLL format)
    :type path: str
    :param tree_yield: parse on words or POS or ..; it is called with
                       tree.token_yield() and its result is the parser input
    :param max_length: don't parse sentences with yield > max_length
    :type max_length: int
    :param limit:      only parse the limit first sentences of the corpus
                       (forwarded to parse_conll_corpus)
    :type limit: int
    :param quiet:      if True, suppress all status output
    :type quiet: bool
    :param ignore_punctuation: exclude punctuation from parsing
    :type ignore_punctuation: bool
    :param root_default_deprel: forwarded to experiment_database.add_result_tree
    :param disconnected_default_deprel: forwarded to
                       experiment_database.add_result_tree
    """
    if not quiet:
        print("Building lookahead tables for grammar")
    # The preprocessing belongs to the parser set-up, not to the status
    # output, so it has to run no matter whether `quiet` is set.
    parser_type.preprocess_grammar(grammar)

    experiment_database.set_experiment_test_corpus(connection, experiment,
                                                   path)

    if not quiet:
        if max_length != sys.maxsize:
            s = ', ignoring sentences with length > ' + str(max_length)
        else:
            s = ''
        print('Start parsing sentences' + s)

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    # Score sums over all successfully parsed sentences; averaged at the end.
    (UAS, LAS, UEM, LEM) = (0, 0, 0, 0)
    parse = 0
    no_parse = 0
    n_gaps_gold = 0
    n_gaps_test = 0
    skipped = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed intervals.
    start_at = time.perf_counter()
    for tree in trees:
        if len(tree.id_yield()) > max_length:
            skipped += 1
            continue

        # Only the construction of the parser is timed
        # (presumably the constructor performs the actual parsing - confirm).
        time_stamp = time.perf_counter()
        parser = parser_type(grammar, tree_yield(tree.token_yield()))
        time_stamp = time.perf_counter() - time_stamp

        # Work on a copy of the tokens with blanked edge labels, so the gold
        # tree's own tokens stay untouched.
        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')
        h_tree = HybridTree(tree.sent_label())
        h_tree = parser.dcp_hybrid_tree_best_derivation(
            h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)

        if h_tree:
            experiment_database.add_result_tree(connection, h_tree,
                                                path, experiment, 1,
                                                parser.best(), time_stamp,
                                                'parse', root_default_deprel,
                                                disconnected_default_deprel)
            n_gaps_gold += tree.n_gaps()
            n_gaps_test += h_tree.n_gaps()
            parse += 1
            (dUAS, dLAS, dUEM, dLEM) = score_cmp_dep_trees(tree, h_tree)
            UAS += dUAS
            LAS += dLAS
            UEM += dUEM
            LEM += dLEM
        else:
            experiment_database.no_parse_result(connection, tree.sent_label(),
                                                path, experiment, time_stamp,
                                                "no_parse")
            no_parse += 1

    end_at = time.perf_counter()
    total = parse + no_parse
    if not quiet:
        print('Parsed ' + str(parse) + ' out of ' + str(total) + ' (skipped ' +
              str(skipped) + ')')
        print('fail: ', no_parse)
        # Averages are only defined if at least one sentence was parsed.
        if parse > 0:
            print('UAS: ', UAS / parse)
            print('LAS: ', LAS / parse)
            print('UEM: ', UEM / parse)
            print('LEM: ', LEM / parse)
            print('n gaps (gold): ', n_gaps_gold * 1.0 / parse)
            print('n gaps (test): ', n_gaps_test * 1.0 / parse)
        print('parse time: ', end_at - start_at, 's')
        print()
class GeneralHybridTreeTestCase(unittest.TestCase):
    """
    Tests for HybridTree, run against one small fixture tree.

    The fixture consists of the nodes v1 (Piet/NP), v21 (Marie/N),
    v (helpen/VP), v2 (lezen/V) and v3 ('.'/Punc). Node v is added to the
    root and has the children v2 and v1; v21 is the child of v2; v3 is not
    attached to any other node.
    """
    tree = None

    def setUp(self):
        # A fresh fixture tree is built before every single test.
        self.tree = fixture = HybridTree()

        # The call order is kept exactly as is: first the four word nodes,
        # then the edges, then the punctuation node, finally the root.
        word_nodes = (("v1", "Piet", "NP"),
                      ("v21", "Marie", "N"),
                      ("v", "helpen", "VP"),
                      ("v2", "lezen", "V"))
        for node_id, form, pos in word_nodes:
            fixture.add_node(node_id, construct_conll_token(form, pos), True)

        for parent, child in (("v", "v2"), ("v", "v1"), ("v2", "v21")):
            fixture.add_child(parent, child)

        # NOTE(review): the trailing False presumably marks the punctuation
        # node as not connected - confirm against HybridTree.add_node.
        punctuation = construct_conll_token(".", "Punc")
        fixture.add_node("v3", punctuation, True, False)
        fixture.add_to_root("v")

    def _reordered(self):
        # Helper (not a test): reorder the fixture tree and hand it back.
        self.tree.reorder()
        return self.tree

    def test_children(self):
        # Before reorder() the children show up as they were added ...
        before = self.tree.children('v')
        self.assertListEqual(before, ['v2', 'v1'])
        # ... afterwards v1 is expected in front of v2.
        self.tree.reorder()
        after = self.tree.children('v')
        self.assertListEqual(after, ['v1', 'v2'])

    def test_fringe(self):
        tree = self._reordered()
        self.assertListEqual(tree.fringe('v'), [2, 0, 3, 1])
        self.assertListEqual(tree.fringe('v2'), [3, 1])

    def test_n_spans(self):
        tree = self._reordered()
        spans_of_v = tree.n_spans('v')
        self.assertEqual(spans_of_v, 1)
        spans_of_v2 = tree.n_spans('v2')
        self.assertEqual(spans_of_v2, 2)

    def test_n_gaps(self):
        gaps = self._reordered().n_gaps()
        self.assertEqual(gaps, 1)

    def test_node_ids(self):
        # Compared in sorted order, i.e. independent of the order of nodes().
        node_ids = sorted(self._reordered().nodes())
        self.assertListEqual(node_ids, ['v', 'v1', 'v2', 'v21', 'v3'])

    def test_complete(self):
        is_complete = self._reordered().complete()
        self.assertEqual(is_complete, True)

    def test_unlabelled_structure(self):
        expected = ({0, 1, 2, 3},
                    [({0}, []),
                     ({1, 3}, [({1}, [])])])
        structure = self._reordered().unlabelled_structure()
        self.assertTupleEqual(structure, expected)

    def test_max_n_spans(self):
        widest = self._reordered().max_n_spans()
        self.assertEqual(widest, 2)

    def test_labelled_yield(self):
        # token_yield() is expected to leave out the punctuation token.
        forms = [token.form() for token in self._reordered().token_yield()]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen'])

    def test_full_labelled_yield(self):
        # full_token_yield() is expected to contain the punctuation token.
        tokens = self._reordered().full_token_yield()
        forms = [token.form() for token in tokens]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen', '.'])

    def test_full_yield(self):
        node_ids = self._reordered().full_yield()
        self.assertListEqual(node_ids, ['v1', 'v21', 'v', 'v2', 'v3'])

    # Disabled test, kept for reference:
    # def test_labelled_spans(self):
    # self.tree.reorder()
    # self.assertListEqual(self.tree.labelled_spans(), [])

    def test_pos_yield(self):
        tags = [token.pos() for token in self._reordered().token_yield()]
        self.assertListEqual(tags, ['NP', 'N', 'VP', 'V'])

    def test_recursive_partitioning(self):
        expected = ({0, 1, 2, 3},
                    [({0}, []),
                     ({1, 3}, [({1}, []), ({3}, [])]),
                     ({2}, [])])
        partitioning = self._reordered().recursive_partitioning()
        self.assertEqual(partitioning, expected)