def turns_with_final_emoticons(doc, tags):
    """Split turns ending in an emoticon-only EDU into two lists.

    Return a tuple of lists.  Both lists contain the turns in a document
    that end with the pattern EDU emoticon-only-EDU.  The first (main)
    list contains those that are not pointed to by any relations or
    schema.  The second (warnings only) list contains those that have
    relations or schema pointing to them.

    The reason we distinguish between the two lists is that we don't
    want to touch those in the latter (out of conservatism, the idea of
    removing these from their relations, CDUs seems scary), but we want
    to know about them.
    """
    egraph = EnclosureGraph(doc, tags)
    free_turns = []
    linked_turns = []
    for turn in sorted_turns(doc):
        turn_edus = sorted_first_widest(egraph.inside(turn))
        # a lone EDU has nothing to be absorbed into; skip early
        if len(turn_edus) <= 1:
            continue
        final_edu = turn_edus[-1]
        if not is_just_emoticon(egraph.inside(final_edu)):
            continue
        bucket = linked_turns if has_links(doc, final_edu) else free_turns
        bucket.append(turn)
    return free_turns, linked_turns
def annotate(txt, annotations, inserts=None):
    """Decorate a text with bracket symbols as a visual guide to annotations.

    For example, in a chat corpus, you might use newlines to indicate
    turn boundaries and square brackets for segments.

    Parameters
    ----------
    txt : string
        the text to decorate
    annotations : iterable of Annotation
        annotations whose text spans select the regions to bracket
    inserts : dict, optional
        a dictionary from annotation type to pair of its
        opening/closing bracket; defaults to `DEFAULT_INSERTS`

    Returns
    -------
    string
        the decorated text

    FIXME: this needs to become a standard educe utility, maybe as part
    of the educe.annotation layer?
    """
    inserts = inserts or DEFAULT_INSERTS
    if not annotations:
        return txt

    def is_visible(anno):
        """Is this annotation one we intend to display?"""
        return rough_type(anno) in inserts

    def add_endpoints(endpoints, buf, pos):
        """Insert any pending closing annotations (eg. right bracket)
        into the text; return the still-pending endpoints and the
        extended buffer.
        """
        pending = []
        buf2 = buf
        for pos2, rparen in endpoints:
            if pos == pos2:
                buf2 = buf2 + rparen
            else:
                pending.append((pos2, rparen))
        return pending, buf2

    s_annos = sorted_first_widest(filter(is_visible, annotations))
    endpoints = []
    buf = ""
    # idiomatic enumerate() instead of range(len(txt)) + txt[i]
    for i, char in enumerate(txt):
        endpoints, buf = add_endpoints(endpoints, buf, i)
        # open every annotation that starts exactly here (widest first,
        # so their closers stack lifo and nest correctly)
        while s_annos:
            nxt = s_annos[0]
            span = nxt.text_span()
            if span.char_start != i:
                break
            lparen, rparen = inserts[rough_type(nxt)]
            buf = buf + lparen
            endpoints.insert(0, (span.char_end, rparen))  # lifo
            del s_annos[0]
        buf = buf + char
    # flush closers that land at the very end of the text
    _, buf = add_endpoints(endpoints, buf, len(txt))
    return buf
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, verbose=True)
    for key in sorted(corpus, key=educe.stac.id_to_path):
        doc = corpus[key]
        print("========== %s ============" % key)
        print()
        if not args.edges:
            # whole-document view
            print(annotate_doc(doc).encode('utf-8'))
            print()
            continue
        # edges-only view: show just the first and last dialogues
        dialogues = sorted_first_widest(filter(is_dialogue, doc.units))
        if dialogues:
            print(annotate_doc(doc, span=dialogues[0].text_span()))
            if len(dialogues) > 1:
                tail_txt = annotate_doc(doc, span=dialogues[-1].text_span())
                print("...\n")
                print(tail_txt.encode('utf-8'))
        print()
def _diff_friendly(annos):
    """Return a tweaked copy of annotations in an arbitrary canonical order.

    The background here is that you might want to do a visual diff
    between the human authored annotations, and automated modifications
    you requested, but in order for that to work we have to eliminate
    spurious diffs that would obscure the interesting bits.
    """
    cleaned = (_sans_modified_by(anno) for anno in annos)
    return sorted_first_widest(cleaned)
def search_graph_edus(inputs, k, gra, pred):
    """Return a ReportItem for any EDU within the graph for which
    some predicate is true
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]
    # map each EDU annotation back to its graph-internal name
    name_of = {}
    for name in gra.edus():
        name_of[gra.annotation(name)] = name
    items = []
    for anno in sorted_first_widest(name_of.keys()):
        if pred(gra, contexts, name_of[anno]):
            items.append(UnitItem(doc, contexts, anno))
    return items
def schema_text(doc, anno):
    """(recursive) text preview of a schema and its contents.
    Members are enclosed in square brackets.
    """
    if anno is None:
        return ""
    if not isinstance(anno, Schema):
        return doc.text(anno.text_span())
    members = sorted_first_widest(anno.members)
    return "...".join(u"[{}]".format(schema_text(doc, member))
                      for member in members)
def search_graph_edus(inputs, k, gra, pred):
    """Return a ReportItem for any EDU within the graph for which
    some predicate is true
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]
    # graph-internal name keyed by the EDU annotation it wraps
    by_anno = {gra.annotation(name): name for name in gra.edus()}
    return [UnitItem(doc, contexts, anno)
            for anno in sorted_first_widest(by_anno.keys())
            if pred(gra, contexts, by_anno[anno])]
def schema_text(doc, anno):
    """(recursive) text preview of a schema and its contents.
    Members are enclosed in square brackets.
    """
    if anno is None:
        return ""
    elif isinstance(anno, Schema):
        snippets = []
        for member in sorted_first_widest(anno.members):
            snippets.append(u"[{}]".format(schema_text(doc, member)))
        return "...".join(snippets)
    else:
        return doc.text(anno.text_span())
def cross_check_against(inputs, key1, stage='unannotated'):
    """Compare annotations with their equivalents on a twin document
    in the corpus

    Returns
    -------
    (list, dict)
        sorted missing/excess report items and the unit-id mismatch
        report; empty containers if the twin document is missing
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        missing_excess = []
        for vals in missing.values():
            missing_excess.extend(vals)
        for vals in excess.values():
            missing_excess.extend(vals)
        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # BUG FIX: sys.stderr was previously passed as a *positional*
        # argument to print() (so its repr was printed on stdout);
        # the file= keyword routes the message to stderr as intended
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        # return empty containers matching the success path's types
        # (previously ({}, {}), inconsistent with (list, dict) above)
        return ([], {})
def merge_final_emoticons(tcache, turn_spans, doc, tags):
    """Given a timestamp cache and some text spans identifying turns
    with final emoticons in them, and a document:

    1. find the specified turns in the document
    2. absorb their emoticon EDUs into the one before it

    This modifies the document and does not return anything
    """
    egraph = EnclosureGraph(doc, tags)
    targeted = (turn for turn in sorted_turns(doc)
                if turn.text_span() in turn_spans)
    for turn in targeted:
        edus = sorted_first_widest(egraph.inside(turn))
        assert len(edus) > 1
        stamp = tcache.get(educe.stac.turn_id(turn))
        emoticon_edu = edus[-1]
        previous_edu = edus[-2]
        absorb_emoticon(doc, stamp, previous_edu, emoticon_edu)
        doc.units.remove(emoticon_edu)
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    families = defaultdict(list)
    # 2017-03-15 merge emoticon EDUs if *all* (selected?) 'discourse'
    # annotators agree that they are not part of any schema or relation
    discourse_subcorpus = defaultdict(list)
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families[fam].append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam].append(k)
    for fam in sorted(families):
        print(family_banner(fam[0], fam[1], families[fam]))
        disc_ks = discourse_subcorpus[fam]
        # BUG FIX: this used to start from an empty set and then
        # intersect (`turns &= ...`), which leaves `turns` permanently
        # empty so no emoticon was ever merged; seed the intersection
        # from the first annotator instead
        turns = None
        warn_turns = set()
        for disc_k in disc_ks:
            doc = corpus[disc_k]
            turns_k, warn_turns_k = turns_with_final_emoticons(
                doc, postags[disc_k])
            turns = (set(turns_k) if turns is None
                     else turns & set(turns_k))
            warn_turns |= set(warn_turns_k)
        turns = sorted_first_widest(turns or [])
        warn_turns = sorted_first_widest(warn_turns)
        warnings = []
        if warn_turns:
            warnings.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            warnings.extend(" " + doc.text(x.text_span())
                            for x in warn_turns)
            warnings.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")
        if not turns:
            warnings.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)
        print("\n".join(warnings))
        if not turns:
            continue
        turn_spans = [x.text_span() for x in turns]
        for k in families[fam]:
            doc = copy.deepcopy(corpus[k])
            tags = postags[k]
            merge_final_emoticons(tcache, turn_spans, doc, tags)
            # BUG FIX: `k == discourse_subcorpus[fam]` compared a key
            # against a *list* of keys and was thus always False, so the
            # diff preview never printed; a membership test was intended
            if k in discourse_subcorpus[fam]:
                for turn_span in turn_spans:
                    print(show_diff(corpus[k], doc, span=turn_span))
                    print()
            save_document(output_dir, k, doc)
        tcache.reset()
    announce_output_dir(output_dir)
def sorted_turns(doc):
    """Turn annotations in a document, sorted by text span"""
    turns = [anno for anno in doc.units if educe.stac.is_turn(anno)]
    return sorted_first_widest(turns)