Example #1
def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to tree
        # strip function tags, remove empty nodes
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the document tokens with the leaves of the cleaned tree
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                            tslice)
        trees.append(clean_tree)

        # lexicalize the PTB tree: find the head word of each constituent
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)
    return trees  # , lex_heads
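
A minimal sketch of the alignment trick used above: a single shared iterator is sliced once per sentence, so each tree consumes exactly as many tokens as it has leaves. The token list and leaf counts here are illustrative.

import itertools

doc_tokens = ['Hello', ',', 'world', '!', 'Bye', '.']
leaf_counts = [4, 2]  # number of leaves in each (cleaned) sentence tree

tokens_iter = iter(doc_tokens)
for n_leaves in leaf_counts:
    tslice = itertools.islice(tokens_iter, n_leaves)
    print(list(tslice))
# ['Hello', ',', 'world', '!']
# ['Bye', '.']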
Example #2
    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            # align the document tokens with the leaves of the cleaned tree
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc
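
The cleaning step above relies on educe's prune_tree, is_non_empty, transform_tree and strip_subcategory helpers. A standalone sketch of the same idea on plain NLTK trees (not educe's actual implementation) might look like:

import nltk.tree

def strip_function_tags(label):
    """'NP-SBJ-1' -> 'NP'; leave special tags like -NONE- or -LRB- alone."""
    if label.startswith('-'):
        return label
    return label.split('-')[0].split('=')[0]

def clean(tree):
    """Remove empty (-NONE-) nodes and strip function tags."""
    if isinstance(tree, str):  # a leaf token
        return tree
    if tree.label() == '-NONE-':  # empty node: prune it
        return None
    kids = [c for c in (clean(k) for k in tree) if c is not None]
    if not kids:  # all children were empty nodes
        return None
    return nltk.tree.Tree(strip_function_tags(tree.label()), kids)

raw = nltk.tree.Tree.fromstring("(S (NP-SBJ (-NONE- *T*-1)) (VP (VB go)))")
print(clean(raw))  # (S (VP (VB go)))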
Example #3
File: ptb.py Project: fbuijs/educe
    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            # align the document tokens with the leaves of the cleaned tree
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                                tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = []
        doc.lex_heads.append(None)
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
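
The lexical heads mentioned in the comments are keyed by Gorn address, i.e. NLTK's "tree positions". A quick illustration on a toy tree:

import nltk.tree

t = nltk.tree.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VB sat)))")
print(t.treepositions()[:4])  # [(), (0,), (0, 0), (0, 0, 0)]
print(t[(0, 1)])              # (NN cat) -- the subtree at address (0, 1)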
Example #4
File: ptb.py Project: chloebt/educe
def parse_trees(corpus, k, ptb):
    """
    Given an RST DT corpus, a document key, and an NLTK PTB reader,
    return a list of educified PTB parse trees (one per sentence).
    These are
    almost the same as the trees that would be returned by the
    `parsed_sents` method, except that each leaf/node is
    associated with a span within the RST DT text.

    Note: returns None if there is no associated PTB corpus entry.
    """
    ptb_name = _guess_ptb_name(k)
    if ptb_name is None:
        return None
    tokens_iter = align(corpus, k, ptb)

    results = []
    for tree in ptb.parsed_sents(ptb_name):
        leaves = tree.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        results.append(ConstituencyTree.build(tree, tslice))
    return results
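
The ptb argument is an NLTK PTB reader. One way such a reader might be constructed (the corpus path is a placeholder for a local PTB installation):

from nltk.corpus.reader import BracketParseCorpusReader

# placeholder path to a local copy of the PTB 'combined' section
ptb = BracketParseCorpusReader('/path/to/ptb/combined', r'wsj_\d{4}\.mrg')
# ptb.parsed_sents('wsj_0001.mrg') then yields one nltk Tree per sentence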
Example #5
def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document
        The original (STAC) document.

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    tid: turn id, optional
        If given, restrict processing to the turn with this id.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = 'Uh-oh, mismatch between the number of turns in the corpus '\
              '(%d) and parsed sentences (%d) %s'\
              % (len(turns), len(sentences), doc.origin)
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']

        # the token offsets are global, i.e. for all sentences/turns
        # in the file; so we have to shift them left to zero them,
        # then shift them back right to the turn's position
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])

        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)

        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token ids
        # are global ids, i.e. strings like "1-18" (sentence 1, token 18)
        # which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)

        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))

        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')

        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']

            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)

            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)
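
A minimal sketch of the offset arithmetic above, with illustrative numbers: CoreNLP extents are zeroed against the sentence start, then shifted to the turn's position in the document text.

turn_char_start = 120  # where the turn's text starts in the document
turn_prefix_len = 11   # length of the turn prefix, e.g. '42: gamer1 '
sentence_begin = 57    # smallest CoreNLP extent in this sentence

offset = turn_char_start + turn_prefix_len - sentence_begin
corenlp_extent = (57, 61)  # CoreNLP's character span for some token
doc_span = (corenlp_extent[0] + offset, corenlp_extent[1] + offset)
print(doc_span)  # (131, 135): the token's span in the document text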
Example #6
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be needed to
        determine e.g. token offsets for specific file formats. If it
        never gets used, this function should probably be made the
        generic default and moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
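
A minimal sketch of the sorting fix introduced above: a plain string sort ranks '1-10' before '1-2', while keying on the integer local id restores document order.

sid = '1'
tids = ['1-1', '1-10', '1-2']
tok_local_id = lambda x: int(x[len(sid) + 1:])
print(sorted(tids))                    # ['1-1', '1-10', '1-2']
print(sorted(tids, key=tok_local_id))  # ['1-1', '1-2', '1-10']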
Example #7
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be needed to
        determine e.g. token offsets for specific file formats. If it
        never gets used, this function should probably be made the
        generic default and moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
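
A minimal sketch of the dependency grouping step above, with made-up triples: (label, governor, dependent) triples become an adjacency map that DependencyTree.build can walk from the artificial root '<sid>-0'.

from collections import defaultdict

dependencies = [('root', '1-0', '1-2'),    # illustrative triples
                ('nsubj', '1-2', '1-1'),
                ('punct', '1-2', '1-3')]
deps = defaultdict(list)
for lbl, gov_id, dep_id in dependencies:
    deps[gov_id].append((lbl, dep_id))
print(dict(deps))
# {'1-0': [('root', '1-2')], '1-2': [('nsubj', '1-1'), ('punct', '1-3')]}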
Example #8
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be needed to
        determine e.g. token offsets for specific file formats. If it
        never gets used, this function should probably be made the
        generic default and moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # sort tokens by their (integer) local id
        tok_local_id = tok_lid(sid)
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to map from/to local and global ids
            tok_local_id = tok_lid(sid)
            tok_global_id = tok_gid(sid)
            # retrieve tokens for this mention
            start = tok_local_id(mntn['start'])
            end = tok_local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][tok_global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
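
tok_lid and tok_gid are not shown in this example; judging from the inline lambdas in the previous examples, plausible (assumed) definitions would be:

def tok_lid(sid):
    """Return a function mapping a global token id to its local int id."""
    return lambda x: int(x[len(sid) + 1:])

def tok_gid(sid):
    """Return a function mapping a local int id to a global token id."""
    return lambda x: sid + '-' + str(x)

print(tok_lid('3')('3-14'))  # 14
print(tok_gid('3')(14))      # '3-14'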