Example 1
    def parse(self, doc):
        """Parse
        """
        corenlp_out_name = _guess_corenlp_name(doc.key)
        if corenlp_out_name is None:
            return doc

        fname = os.path.join(self.corenlp_out_dir,
                             corenlp_out_name)
        if not os.path.exists(fname):
            raise ValueError('CoreNLP XML: no file {}'.format(fname))
        # CoreNLP XML output reader
        # FIXME the same reading is done in tokenize(); we should find
        # a way to cache or share the call
        reader = PreprocessingSource()
        reader.read(fname, suffix='')
        corenlp_out = read_corenlp_result(doc, reader)

        # ctrees and lexical heads on their nodes
        ctrees = corenlp_out.trees
        # strip function tags
        # TODO maybe this should be an internal preprocessing step in
        # find_lexical_heads(), so as to keep the function tags that the
        # CoreNLP parser retains by default (because they were found to be
        # useful, e.g. `-retainTMPSubcategories`)
        ctrees_no_gf = [transform_tree(ctree, strip_subcategory)
                        for ctree in ctrees]
        lex_heads = [find_lexical_heads(ctree_no_gf)
                     for ctree_no_gf in ctrees_no_gf]

        # store trees in doc
        doc.set_syn_ctrees(ctrees_no_gf, lex_heads=lex_heads)

        return doc
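
Because both methods take and return the DocumentPlus, the tokenize() and parse() readers chain naturally. A minimal driver sketch, assuming the enclosing class is constructed with the CoreNLP output directory (the class name CoreNlpParser and its constructor are assumptions, not educe's confirmed API):

    # hypothetical driver: run tokenize() then parse() on one document
    parser = CoreNlpParser(corenlp_out_dir='corenlp/out')  # assumed name
    doc = parser.tokenize(doc)  # attach CoreNLP tokens (see Example 2)
    doc = parser.parse(doc)     # attach constituency trees and lexical heads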
Example 2
    def tokenize(self, doc):
        """Tokenize the document text.

        Parameters
        ----------
        doc : educe.rst_dt.DocumentPlus
            Document

        Returns
        -------
        doc : educe.rst_dt.DocumentPlus
            Tokenized document
        """
        corenlp_out_name = _guess_corenlp_name(doc.key)
        if corenlp_out_name is None:
            return doc

        fname = os.path.join(self.corenlp_out_dir,
                             corenlp_out_name)
        if not os.path.exists(fname):
            raise ValueError('CoreNLP XML: no file {}'.format(fname))
        # CoreNLP XML output reader
        reader = PreprocessingSource()
        reader.read(fname, suffix='')
        corenlp_out = read_corenlp_result(doc, reader)

        # modify DocumentPlus doc to add tokens
        doc.set_tokens(corenlp_out.tokens)

        return doc
Example 3
    def tokenize(self, doc):
        """Tokenize the document text.

        Parameters
        ----------
        doc : educe.rst_dt.DocumentPlus
            Document

        Returns
        -------
        doc : educe.rst_dt.DocumentPlus
            Tokenized document
        """
        corenlp_out_name = _guess_corenlp_name(doc.key)
        if corenlp_out_name is None:
            return doc

        fname = os.path.join(self.corenlp_out_dir, corenlp_out_name)
        if not os.path.exists(fname):
            raise ValueError('CoreNLP XML: no file {}'.format(fname))
        # CoreNLP XML output reader
        reader = PreprocessingSource()
        reader.read(fname, suffix='')
        corenlp_out = read_corenlp_result(doc, reader)

        # modify DocumentPlus doc to add tokens
        doc.tkd_tokens.extend(corenlp_out.tokens)

        return doc
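
Note the difference between the two tokenize() variants: Example 2 stores tokens through doc.set_tokens(corenlp_out.tokens), while Example 3 extends doc.tkd_tokens directly. If set_tokens is a thin wrapper around that list (an assumption; check the DocumentPlus source), the two are equivalent, but the setter is the safer entry point because it can keep any derived state on the document consistent.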
Example 4
    def parse(self, doc):
        """Parse
        """
        corenlp_out_name = _guess_corenlp_name(doc.key)
        if corenlp_out_name is None:
            return doc

        fname = os.path.join(self.corenlp_out_dir, corenlp_out_name)
        if not os.path.exists(fname):
            raise ValueError('CoreNLP XML: no file {}'.format(fname))
        # CoreNLP XML output reader
        # FIXME the same reading is done in tokenize(); we should find
        # a way to cache or share the call
        reader = PreprocessingSource()
        reader.read(fname, suffix='')
        corenlp_out = read_corenlp_result(doc, reader)

        # ctrees and lexical heads on their nodes
        ctrees = corenlp_out.trees
        # strip function tags
        # TODO maybe this should be an internal preprocessing step in
        # find_lexical_heads(), so as to keep the function tags that the
        # CoreNLP parser retains by default (because they were found to be
        # useful, e.g. `-retainTMPSubcategories`)
        ctrees_no_gf = [
            transform_tree(ctree, strip_subcategory) for ctree in ctrees
        ]
        lex_heads = [
            find_lexical_heads(ctree_no_gf) for ctree_no_gf in ctrees_no_gf
        ]

        # store trees in doc
        doc.set_syn_ctrees(ctrees_no_gf, lex_heads=lex_heads)

        return doc
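
To see what the strip_subcategory step does, here is a standalone sketch of function-tag stripping on an NLTK tree. strip_gf is an illustrative stand-in for transform_tree(ctree, strip_subcategory), not educe's actual implementation:

    from nltk.tree import Tree

    def strip_gf(tree):
        # drop grammatical-function tags from node labels: 'NP-SBJ' -> 'NP'
        if isinstance(tree, Tree):
            label = tree.label().split('-')[0] or tree.label()  # keep '-NONE-' as-is
            return Tree(label, [strip_gf(child) for child in tree])
        return tree  # leaf token, left unchanged

    t = Tree.fromstring('(S (NP-SBJ (PRP It)) (VP (VBZ works)))')
    print(strip_gf(t))  # (S (NP (PRP It)) (VP (VBZ works)))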
Example 5
def read_results(corpus, dir_name):
    """
    Read stored parser output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping FileIds to sets of tokens.
    """
    results = {}
    for k in corpus:
        reader = PreprocessingSource()
        reader.read(parsed_file_name(k, dir_name), suffix='')
        doc = corpus[k]
        results[k] = read_corenlp_result(doc, reader)
    return results
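
A usage sketch, assuming a corpus mapping of FileIds to documents obtained with the Reader/slurp pattern shown in the example below (the output directory path is a placeholder):

    # hypothetical: load parser output for every document in a corpus
    rst_corpus = Reader(corpus_dir).slurp()
    results = read_results(rst_corpus, 'corenlp/out')
    for file_id, corenlp_doc in results.items():
        print(file_id, len(corenlp_doc.tokens))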
Example 6
                                          encoding='ascii')
    # read the RST corpus
    rst_reader = Reader(corpus_dir)
    rst_corpus = rst_reader.slurp()
    # for each file, compare tokenizations between PTB and CoreNLP
    for key, rst_tree in sorted(rst_corpus.items()):
        doc_name = key.doc.split('.', 1)[0]
        if doc_name.startswith('wsj_'):
            print(doc_name)
            doc_wsj_num = doc_name.split('_')[1]
            section = doc_wsj_num[:2]

            # corenlp stuff
            core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)
            core_toks = corenlp_doc.tokens
            core_toks_beg = [x.span.char_start for x in core_toks]
            core_toks_end = [x.span.char_end for x in core_toks]

            # PTB stuff
            # * create DocumentPlus (adapted from educe.rst_dt.corpus)
            rst_context = rst_tree.label().context
            ptb_docp = DocumentPlus(key, doc_name, rst_context)
            # * attach EDUs (yerk)
            # FIXME we currently get them via an RstDepTree created from
            # the original RSTTree, so as to get the left padding EDU
            rst_dtree = RstDepTree.from_rst_tree(rst_tree)
            ptb_docp.edus = rst_dtree.edus
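
The snippet cuts off before the actual comparison. One way it could conclude is to diff the token start offsets as sets; ptb_toks_beg here is an assumed list built from the PTB tokens the same way core_toks_beg is built above:

    # hypothetical continuation: report boundary mismatches per document
    only_core = set(core_toks_beg) - set(ptb_toks_beg)
    only_ptb = set(ptb_toks_beg) - set(core_toks_beg)
    if only_core or only_ptb:
        print('  starts only in CoreNLP:', sorted(only_core))
        print('  starts only in PTB:', sorted(only_ptb))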