Beispiel #1
0
def remove_unseen_edus(corpus, predictions):
    """
    Drop from the corpus every EDU that none of the predictions mention.

    Rationale: folds are allocated randomly over dialogues, so EDUs
    that were simply never predicted on could be mistaken for EDUs
    that were marked as unrelated. Removing (hiding) them keeps them
    from confusing later analysis.

    `predictions` is an iterable of `(id_parent, id_child, _)` triples
    of global ids; the third element is ignored and `id_parent` may be
    the literal `'ROOT'`. Documents that no prediction refers to are
    left untouched.

    Note that this mutates the corpus: EDUs are removed from
    `doc.units` in place and nothing is returned.
    """
    # FileId -> local ids of the EDUs no prediction has mentioned so far
    unmentioned = {}
    for id_parent, id_child, _ in predictions:
        child_doc, child_local = split_id(id_child)
        has_parent = id_parent != 'ROOT'
        if has_parent:
            parent_doc, parent_local = split_id(id_parent)
            # both ends of a link must belong to the same (sub)document
            assert parent_doc == child_doc
        key, doc = guess_doc(corpus, child_doc)
        if key not in unmentioned:
            # first prediction for this document: start from all its EDUs
            unmentioned[key] = set(unit.local_id() for unit in doc.units
                                   if educe.stac.is_edu(unit))
        pending = unmentioned[key]
        if has_parent:
            pending.discard(parent_local)
        pending.discard(child_local)

    for key, doc in corpus.items():
        leftover = unmentioned.get(key)
        if leftover is None:
            # no prediction touched this document; leave it alone
            continue
        # decide what to drop before mutating the list we are scanning
        doomed = [unit for unit in doc.units
                  if educe.stac.is_edu(unit) and unit.local_id() in leftover]
        for unit in doomed:
            doc.units.remove(unit)
Beispiel #2
0
def remove_unseen_edus(corpus, predictions):
    """
    Drop from the corpus every EDU that none of the predictions mention.

    Rationale: folds are allocated randomly over dialogues, so EDUs
    that were simply never predicted on could be mistaken for EDUs
    that were marked as unrelated. Removing (hiding) them keeps them
    from confusing later analysis.

    `predictions` is an iterable of `(id_parent, id_child, _)` triples
    of global ids; the third element is ignored and `id_parent` may be
    the literal `'ROOT'`. Documents that no prediction refers to are
    left untouched.

    Note that this mutates the corpus: EDUs are removed from
    `doc.units` in place and nothing is returned.
    """
    # FileId -> local ids of the EDUs no prediction has mentioned so far
    unmentioned = {}
    for id_parent, id_child, _ in predictions:
        child_doc, child_local = split_id(id_child)
        has_parent = id_parent != 'ROOT'
        if has_parent:
            parent_doc, parent_local = split_id(id_parent)
            # both ends of a link must belong to the same (sub)document
            assert parent_doc == child_doc
        key, doc = guess_doc(corpus, child_doc)
        if key not in unmentioned:
            # first prediction for this document: start from all its EDUs
            unmentioned[key] = set(unit.local_id() for unit in doc.units
                                   if educe.stac.is_edu(unit))
        pending = unmentioned[key]
        if has_parent:
            pending.discard(parent_local)
        pending.discard(child_local)

    for key, doc in corpus.items():
        leftover = unmentioned.get(key)
        if leftover is None:
            # no prediction touched this document; leave it alone
            continue
        # decide what to drop before mutating the list we are scanning
        doomed = [unit for unit in doc.units
                  if educe.stac.is_edu(unit) and unit.local_id() in leftover]
        for unit in doomed:
            doc.units.remove(unit)
Beispiel #3
0
def guess_doc(corpus, doc_subdoc):
    """
    Return the file id and document matching a `(doc, subdoc)` pair.

    The corpus is scanned in iteration order and the first entry whose
    key satisfies `(key.doc, key.subdoc) == doc_subdoc` is returned as
    a `(key, document)` tuple. Any further matching entries (keys that
    differ only in other fields) are ignored.

    `doc_subdoc` is compared against a tuple, so it should itself be a
    2-tuple.

    Raises `Exception` if no key in the corpus matches.
    """
    # NOTE(review): the original carried the remark "live input; no
    # subdoc" on the comparison below -- presumably live input has no
    # subdoc component; confirm against the callers.
    for key, doc in corpus.items():
        if (key.doc, key.subdoc) == doc_subdoc:
            # stop at the first hit instead of collecting every match
            return key, doc
    raise Exception(('Found no documents with key {}'
                     '').format(doc_subdoc))
Beispiel #4
0
def guess_doc(corpus, doc_subdoc):
    """
    Return the file id and document matching a `(doc, subdoc)` pair.

    The corpus is scanned in iteration order and the first entry whose
    key satisfies `(key.doc, key.subdoc) == doc_subdoc` is returned as
    a `(key, document)` tuple. Any further matching entries (keys that
    differ only in other fields) are ignored.

    `doc_subdoc` is compared against a tuple, so it should itself be a
    2-tuple.

    Raises `Exception` if no key in the corpus matches.
    """
    # NOTE(review): the original carried the remark "live input; no
    # subdoc" on the comparison below -- presumably live input has no
    # subdoc component; confirm against the callers.
    for key, doc in corpus.items():
        if (key.doc, key.subdoc) == doc_subdoc:
            # stop at the first hit instead of collecting every match
            return key, doc
    raise Exception(('Found no documents with key {}'
                     '').format(doc_subdoc))
Beispiel #5
0
def copy_discourse_corpus(corpus, annotator):
    """
    Build a new corpus dictionary holding shallow copies of the
    documents in `corpus` (presumably an unannotated one), filed under
    copies of their keys whose `stage` is set to `'discourse'` and
    whose `annotator` is set to the given `annotator`.

    The input dictionary and its keys are not modified. Because the
    copies are shallow, the objects the documents refer to (their EDUs,
    for example) are shared with the originals: adding entries to the
    returned corpus is safe, but modifying pre-existing shared objects
    in place also affects the source corpus.
    """
    def as_discourse_key(original_key):
        # relabel a copy so the source corpus keeps its own key intact
        relabelled = copy.copy(original_key)
        relabelled.stage = 'discourse'
        relabelled.annotator = annotator
        return relabelled

    return dict((as_discourse_key(key), copy.copy(doc))
                for key, doc in corpus.items())
Beispiel #6
0
def copy_discourse_corpus(corpus, annotator):
    """
    Build a new corpus dictionary holding shallow copies of the
    documents in `corpus` (presumably an unannotated one), filed under
    copies of their keys whose `stage` is set to `'discourse'` and
    whose `annotator` is set to the given `annotator`.

    The input dictionary and its keys are not modified. Because the
    copies are shallow, the objects the documents refer to (their EDUs,
    for example) are shared with the originals: adding entries to the
    returned corpus is safe, but modifying pre-existing shared objects
    in place also affects the source corpus.
    """
    def as_discourse_key(original_key):
        # relabel a copy so the source corpus keeps its own key intact
        relabelled = copy.copy(original_key)
        relabelled.stage = 'discourse'
        relabelled.annotator = annotator
        return relabelled

    return dict((as_discourse_key(key), copy.copy(doc))
                for key, doc in corpus.items())