def remove_unseen_edus(corpus, predictions):
    """
    Strip from the corpus every EDU that the predictions never mention.

    Folds are allocated randomly over dialogues, so a document that
    appears in the predictions may also contain EDUs that no prediction
    touches. Left in place, those would be indistinguishable from EDUs
    that were deliberately predicted as unrelated, so we hide them
    outright to keep them from confusing later analysis.

    `predictions` is an iterable of 3-tuples whose first two members are
    the global ids of the parent and the child EDU; the third member is
    ignored here. A parent id of 'ROOT' is not looked up.

    Documents that no prediction refers to are left untouched.

    Note that this mutates the corpus: the EDUs are removed in place
    from the `units` of the affected documents. Nothing is returned.
    """
    # corpus key -> local ids of the EDUs that no prediction has
    # mentioned (so far)
    unmentioned = {}
    for parent_id, child_id, _ in predictions:
        child_doc, child_local = split_id(child_id)
        has_parent = parent_id != 'ROOT'
        if has_parent:
            parent_doc, parent_local = split_id(parent_id)
            # both ends of a link must live in the same (sub)document
            assert parent_doc == child_doc
        key, doc = guess_doc(corpus, child_doc)
        if key not in unmentioned:
            # first time we meet this document: start off by
            # assuming that all of its EDUs are unseen
            unmentioned[key] = {unit.local_id() for unit in doc.units
                                if educe.stac.is_edu(unit)}
        remaining = unmentioned[key]
        if has_parent:
            remaining.discard(parent_local)
        remaining.discard(child_local)

    for key, doc in corpus.items():
        hidden = unmentioned.get(key)
        if hidden is None:
            # document never came up in the predictions
            continue
        # collect first, remove afterwards, so that we never shrink
        # the list while walking over it
        doomed = [unit for unit in doc.units
                  if educe.stac.is_edu(unit) and unit.local_id() in hidden]
        for edu in doomed:
            doc.units.remove(edu)
def guess_doc(corpus, doc_subdoc):
    """
    Return the file id and document associated with the given
    global annotation ID

    `corpus` is a dictionary from file ids (objects with `doc` and
    `subdoc` attributes) to documents; `doc_subdoc` is the
    `(doc, subdoc)` pair to look for.

    The result is the `(key, document)` pair of the first corpus entry,
    in the iteration order of the corpus, whose key has that doc and
    subdoc.

    Raises `Exception` if no entry of the corpus matches.
    """
    # stop at the first hit rather than collecting every match: only
    # the first one is ever returned
    for key, doc in corpus.items():
        if (key.doc, key.subdoc) == doc_subdoc:
            return key, doc
    raise Exception(('Found no documents with key {}'
                     '').format(doc_subdoc))
def copy_discourse_corpus(corpus, annotator):
    """
    Build a new corpus dictionary out of a (presumably unannotated)
    one, with every key turned into a discourse stage key for the
    given annotator.

    Both the keys and the documents are shallow copies (`copy.copy`)
    of the originals; the input dictionary and its keys are left as
    they were. Adding things to the copied documents should be safe,
    but anything they still share with the originals (pre-existing
    EDUs, for example) is the very same object on both sides, so
    modifying it would be destructive for the original corpus too.
    """
    def as_discourse_key(key):
        "Shallow copy of `key`, moved to the discourse stage"
        converted = copy.copy(key)
        converted.stage = 'discourse'
        converted.annotator = annotator
        return converted

    return {as_discourse_key(key): copy.copy(doc)
            for key, doc in corpus.items()}