Example #1
def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document (?)
        The original document (?)

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    tid: turn id
        Turn id (?)

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = 'Uh-oh, mismatch between number of turns in the corpus (%d) '\
              'and parsed sentences (%d) %s'\
              % (len(turns), len(sentences), doc.origin)
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']

        # the token offsets are global, i.e. for all sentences/turns
        # in the file; so we have to shift them left to zero them,
        # then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])

        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)

        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token ids
        # are global ids, i.e. strings like "1-18" (sentence 1, token 18)
        # which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)

        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))

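        # CoreNLP numbers tokens from 1 and reserves index 0 for the
        # artificial ROOT governor, hence the root id sid + '-0'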
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')

        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']

            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)

            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)
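The FIXME above is easy to reproduce: global token ids are strings such as "1-18", so plain string sorting ranks "1-10" before "1-2". A minimal standalone sketch of the problem, and of the numeric-key fix that the later examples adopt (plain Python, no educe required):

# Global token ids are "<sentence id>-<local id>" strings, so lexicographic
# sorting misorders them once a sentence has ten or more tokens.
token_ids = ['1-1', '1-2', '1-10', '1-3']
print(sorted(token_ids))
# -> ['1-1', '1-10', '1-2', '1-3']

# Sorting on the integer local id restores the intended order.
sid = '1'
tok_local_id = lambda x: int(x[len(sid) + 1:])
print(sorted(token_ids, key=tok_local_id))
# -> ['1-1', '1-2', '1-3', '1-10']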
Example #2
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
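The coreference block relies on a round-trip between global token ids ("<sid>-<n>") and integer local ids. A small standalone sketch of that round-trip, assuming the id format shown above; note that range(start, end) is half-open, so the token whose local id equals end is excluded from the mention, exactly as in the code above:

# Round-trip between global token ids and integer local ids, assuming
# the "<sentence id>-<local id>" format used above.
sid = '3'
local_id = lambda x: int(x[len(sid) + 1:])
global_id = lambda x: sid + '-' + str(x)

start = local_id('3-2')   # -> 2
end = local_id('3-5')     # -> 5
# range(start, end) is half-open: the mention covers tokens 3-2 .. 3-4
print([global_id(i) for i in range(start, end)])
# -> ['3-2', '3-3', '3-4']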
Example #3
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
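The ROOT-stripping step can be exercised directly with nltk. A short sketch (the parse string here is a made-up example, not actual CoreNLP output):

import nltk

# CoreNLP-style parses wrap the informative root (S, SINV, NP, ...) in ROOT.
tree = nltk.tree.Tree.fromstring('(ROOT (SINV (VBZ Is) (NP (DT this))))')
assert tree.label() == 'ROOT' and len(tree) == 1
tree = tree[0]          # descend from ROOT to the real root
print(tree.label())     # -> SINV, the informative label the FIXME wants to keep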
Example #4
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # sort tokens by their (integer) local id
        tok_local_id = tok_lid(sid)
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to map from/to local and global ids
            tok_local_id = tok_lid(sid)
            tok_global_id = tok_gid(sid)
            # retrieve tokens for this mention
            start = tok_local_id(mntn['start'])
            end = tok_local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][tok_global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
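This variant factors the id lambdas out into module-level helpers tok_lid and tok_gid, which are not shown on this page. A plausible reconstruction, inferred from the inline lambdas of the earlier examples (hypothetical sketch, not the actual educe source):

def tok_lid(sid):
    """Return a function mapping a global token id '<sid>-<n>' to its
    integer local id (hypothetical reconstruction of the helper used
    above, mirroring the inline lambdas of the earlier examples)."""
    return lambda gid: int(gid[len(sid) + 1:])


def tok_gid(sid):
    """Return a function mapping an integer local id back to a global
    token id '<sid>-<n>' (hypothetical reconstruction, same caveat)."""
    return lambda lid: sid + '-' + str(lid)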