def main(args):
    """
    Subcommand main: rename one annotation id throughout the corpus.

    The corpus selected by `args` is read, the target id is obtained from
    `_get_target`, and every document is renamed in place and saved to the
    output directory. A summary of the rename is written to stderr.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.stage:
        # the two checks are mutually exclusive, so their order is immaterial
        unannotated = args.stage == 'unannotated'
        if unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")
        if not unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    old_id = args.source
    new_id = _get_target(args, old_id, corpus)
    for key in corpus:
        print(key)
        document = corpus[key]
        _rename_in_doc(old_id, new_id, document)
        save_document(output_dir, key, document)

    summary = "Renamed from %s to %s" % (anno_id_from_tuple(old_id),
                                         anno_id_from_tuple(new_id))
    print(summary, file=sys.stderr)
    announce_output_dir(output_dir)
def _rename_in_doc(source, target, doc):
    """
    Give the annotation identified by `source` the id `target`, then
    repoint every relation endpoint and schema member that referred to the
    old id.

    Both ids are (author, creation date) tuples. The program exits if the
    document does not contain exactly one annotation with the source id.

    NB: modifies doc
    """
    hits = [anno for anno in doc.annotations()
            if anno_id_to_tuple(anno.local_id()) == source]
    old_name = anno_id_from_tuple(source)
    new_name = anno_id_from_tuple(target)
    new_author, new_date = target

    def swap(pointer):
        "Return the new id if the pointer is the old id, else the pointer"
        if pointer == old_name:
            return new_name
        return pointer

    if not hits:
        sys.exit("No annotations found with id %s" % old_name)
    if len(hits) > 1:
        sys.exit("Huh?! More than one annotation with id %s" % old_name)
    evil_set_id(hits[0], new_author, new_date)

    for rel in doc.relations:
        if rel.span.t1 == old_name:
            rel.span.t1 = new_name
        if rel.span.t2 == old_name:
            rel.span.t2 = new_name

    for schema in doc.schemas:
        # each member collection is replaced by a list copy
        schema.units = [swap(ptr) for ptr in schema.units]
        schema.relations = [swap(ptr) for ptr in schema.relations]
        schema.schemas = [swap(ptr) for ptr in schema.schemas]
def _tweak_presplit(tcache, doc, spans):
    """
    What to do in case the split was already done manually
    (in the discourse section).

    For each span, the first EDU in the document covering exactly that span
    is restamped with `_AUTHOR` and the timestamp `tcache` yields for the
    span. Relation endpoints and schema units that pointed at the old ids
    are then redirected to the new ones.

    Raises an Exception if a span has no matching EDU.

    NB: modifies doc
    """
    old_to_new = {}
    for span in sorted(spans):
        candidates = [unit for unit in doc.units
                      if unit.text_span() == span and educe.stac.is_edu(unit)]
        if not candidates:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        edu = candidates[0]
        # read the id before touching author/date (presumably the id is
        # derived from them -- TODO confirm)
        previous_id = edu.local_id()
        replacement_id = anno_id_from_tuple((_AUTHOR, tcache.get(span)))
        set_anno_date(edu, tcache.get(span))
        set_anno_author(edu, _AUTHOR)
        old_to_new[previous_id] = replacement_id

    for rel in doc.relations:
        first = rel.span.t1
        if first in old_to_new:
            rel.span.t1 = old_to_new[first]
        second = rel.span.t2
        if second in old_to_new:
            rel.span.t2 = old_to_new[second]

    for schema in doc.schemas:
        members = set(schema.units)
        for unit in schema.units:
            if unit not in old_to_new:
                continue
            members.remove(unit)
            members.add(old_to_new[unit])
        schema.units = members
def _tweak_presplit(tcache, doc, spans):
    """
    What to do in case the split was already done manually
    (in the discourse section).

    For each span, the first EDU in the document covering exactly that span
    is restamped with `_AUTHOR` and the timestamp `tcache` yields for the
    span. Relation endpoints and schema units that pointed at the old ids
    are then redirected to the new ones.

    Raises an Exception if a span has no matching EDU.

    NB: modifies doc
    """
    old_to_new = {}
    for span in sorted(spans):
        candidates = [unit for unit in doc.units
                      if unit.text_span() == span and educe.stac.is_edu(unit)]
        if not candidates:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        edu = candidates[0]
        # read the id before touching author/date (presumably the id is
        # derived from them -- TODO confirm)
        previous_id = edu.local_id()
        replacement_id = anno_id_from_tuple((_AUTHOR, tcache.get(span)))
        set_anno_date(edu, tcache.get(span))
        set_anno_author(edu, _AUTHOR)
        old_to_new[previous_id] = replacement_id

    for rel in doc.relations:
        first = rel.span.t1
        if first in old_to_new:
            rel.span.t1 = old_to_new[first]
        second = rel.span.t2
        if second in old_to_new:
            rel.span.t2 = old_to_new[second]

    for schema in doc.schemas:
        members = set(schema.units)
        for unit in schema.units:
            if unit not in old_to_new:
                continue
            members.remove(unit)
            members.add(old_to_new[unit])
        schema.units = members
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the same new EDU
    across all sections.

    One deep copy of `edu` is made per span, stamped with `_AUTHOR` and the
    timestamp `tcache` yields for that span, and appended to the document.
    In the 'units' stage the copy's type and every feature value are
    prefixed with `_SPLIT_PREFIX`.

    Discourse stage: if the EDU is in any relations or CDUs, replace any
    references to it with a new CDU encompassing the newly created EDUs
    (delegated to `retarget`; the CDU is only added to the document when
    `retarget` returns a true value).

    NB: modifies doc
    """
    pieces = {}
    for span in sorted(spans):
        stamp = tcache.get(span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only values are rewritten, so iterating the keys is safe
            for key in piece.features:
                piece.features[key] = _SPLIT_PREFIX + piece.features[key]
        piece.span = span
        pieces[piece_id] = piece
        doc.units.append(piece)

    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(cdu_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    want_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the same new EDU
    across all sections.

    One deep copy of `edu` is made per span, stamped with `_AUTHOR` and the
    timestamp `tcache` yields for that span, and appended to the document.
    In the 'units' stage the copy's type and every feature value are
    prefixed with `_SPLIT_PREFIX`.

    Discourse stage: if the EDU is in any relations or CDUs, replace any
    references to it with a new CDU encompassing the newly created EDUs
    (delegated to `retarget`; the CDU is only added to the document when
    `retarget` returns a true value).

    NB: modifies doc
    """
    pieces = {}
    for span in sorted(spans):
        stamp = tcache.get(span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only values are rewritten, so iterating the keys is safe
            for key in piece.features:
                piece.features[key] = _SPLIT_PREFIX + piece.features[key]
        piece.span = span
        pieces[piece_id] = piece
        doc.units.append(piece)

    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(cdu_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    want_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
def _get_annotation_with_id(sought_tuple, annotations):
    """
    Return the single annotation whose local id matches the given
    (author, creation_date) tuple.

    Raises an Exception if no annotation matches, or if more than one
    does.
    """
    sought = anno_id_from_tuple(sought_tuple)
    found = [anno for anno in annotations if anno.local_id() == sought]
    if not found:
        raise Exception('No annotations found with id %s' % sought)
    if len(found) > 1:
        raise Exception('More than one annotation found with id %s' % sought)
    return found[0]
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Create a relation instance between two annotations given by their
    local ids.

    The relation is attributed to the 'stacparser' annotator and dated
    with the next value drawn from `tstamp`; `label` becomes its type.
    """
    endpoints = RelSpan(local_id_parent, local_id_child)
    annotator = 'stacparser'
    date = tstamp.next()
    return Relation(rel_id=stac_glozz.anno_id_from_tuple((annotator, date)),
                    span=endpoints,
                    rtype=label,
                    features={},
                    metadata={'author': annotator,
                              'creation-date': str(date)})
def commit_msg(args, corpus, k, sought):
    """
    Generate a commit message describing the dialogue merging operation
    we are about to do (has to be run before merging happens).

    The message is built from the document stored under key `k`: a title
    line (with a turn-range hint when `args.turns` is set), the ids of the
    dialogues in `sought`, and the annotated text over their merged span.
    A fixed placeholder string is returned when there are no dialogues.
    """
    doc = corpus[k]
    dstr = ", ".join(anno_id_from_tuple(x) for x in sought)
    dialogues = [_get_annotation_with_id(d, doc.units) for d in sought]
    if not dialogues:
        return "(no commit message; nothing to merge)"

    if args.turns:
        hint = " (turns %d-%d)" % tuple(args.turns)
    else:
        hint = ""
    merged_span = _merge_spans(dialogues)
    title = u"{doc}_{subdoc}: merge dialogues{hint}".format(doc=k.doc,
                                                            subdoc=k.subdoc,
                                                            hint=hint)
    was_line = "Dialogues ({}), was:".format(dstr)
    snapshot = annotate_doc(doc, span=merged_span)
    return "\n".join([title, "", was_line, "", snapshot])
def commit_msg(args, corpus, k, sought):
    """
    Generate a commit message describing the dialogue merging operation
    we are about to do (has to be run before merging happens).

    The message is built from the document stored under key `k`: a title
    line (with a turn-range hint when `args.turns` is set), the ids of the
    dialogues in `sought`, and the annotated text over their merged span.
    A fixed placeholder string is returned when there are no dialogues.
    """
    doc = corpus[k]
    dstr = ", ".join(anno_id_from_tuple(x) for x in sought)
    dialogues = [_get_annotation_with_id(d, doc.units) for d in sought]
    if not dialogues:
        return "(no commit message; nothing to merge)"

    if args.turns:
        hint = " (turns %d-%d)" % tuple(args.turns)
    else:
        hint = ""
    merged_span = _merge_spans(dialogues)
    title = u"{doc}_{subdoc}: merge dialogues{hint}".format(doc=k.doc,
                                                            subdoc=k.subdoc,
                                                            hint=hint)
    was_line = "Dialogues ({}), was:".format(dstr)
    snapshot = annotate_doc(doc, span=merged_span)
    return "\n".join([title, "", was_line, "", snapshot])
def main(args):
    """
    Subcommand main: delete one annotation id throughout the corpus.

    The corpus selected by `args` is read, the annotation identified by
    `args.anno_id` is deleted from each document, and each document is
    saved to the output directory. The deleted id is reported on stderr.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.stage:
        # the two checks are mutually exclusive, so their order is immaterial
        unannotated = args.stage == 'unannotated'
        if unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")
        if not unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for key in corpus:
        print(key)
        document = corpus[key]
        _delete_in_doc(args.anno_id, document)
        save_document(output_dir, key, document)

    print("Deleted %s" % anno_id_from_tuple(args.anno_id), file=sys.stderr)
    announce_output_dir(output_dir)
def _delete_in_doc(del_id, doc):
    """
    Delete the annotation with the given id in the given document.

    `del_id` is an annotation id in tuple form (it is compared against
    `anno_id_to_tuple(x.local_id())`).

    If the document has no such annotation, a note is printed to stderr
    and the document is left alone. The program exits if more than one
    annotation carries the id, or if a remaining relation or schema still
    refers to the deleted annotation.

    NB: modifies doc
    """
    pretty_id = anno_id_from_tuple(del_id)

    def is_ok(anno):
        "True if the annotation is not the one being deleted"
        return anno_id_to_tuple(anno.local_id()) != del_id

    def oops(reason):
        "quit because of illegal delete"
        # Both values must be passed to % as one tuple; writing
        # `"..." % pretty_id, reason` formats with pretty_id alone and
        # raises TypeError instead of exiting with the message.
        sys.exit("Can't delete %s because %s " % (pretty_id, reason))

    matches = [x for x in doc.annotations() if not is_ok(x)]
    if not matches:
        print("Skipping... no annotations found with id %s" % pretty_id,
              file=sys.stderr)
        return
    elif len(matches) > 1:
        sys.exit("Huh?! More than one annotation with id %s" % pretty_id)

    doc.units = [x for x in doc.units if is_ok(x)]
    doc.relations = [x for x in doc.relations if is_ok(x)]
    doc.schemas = [x for x in doc.schemas if is_ok(x)]

    # Anything still pointing at the deleted id would be left dangling.
    for anno in doc.relations:
        if anno.span.t1 == pretty_id:
            oops("it is the source for a relation: %s" % anno)
        if anno.span.t2 == pretty_id:
            oops("it is the target for a relation: %s" % anno)
    for anno in doc.schemas:
        if pretty_id in anno.units:
            oops("it is a unit member of %s" % anno)
        if pretty_id in anno.relations:
            oops("it is a relation member of %s" % anno)
        if pretty_id in anno.schemas:
            oops("it is a schema member of %s" % anno)
def main(args):
    """
    Subcommand main: merge dialogues in every document of the corpus.

    The dialogues to merge are either those found by `_dialogues_in_turns`
    for the turn range in `args.turns`, or the ones listed in
    `args.dialogues`; at least two are required. Unless
    `args.no_commit_msg` is set, a commit message is computed from the
    first document before merging and printed to stdout at the end.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if not args.turns and len(args.dialogues) < 2:
        sys.exit("Must specify at least two dialogues")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus,
                                         args.turns[0],
                                         args.turns[1])
            if len(sought) < 2:
                sys.exit("Must specify at least two dialogues")
            pretty_ids = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + pretty_ids, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    want_commit_msg = bool(corpus and not args.no_commit_msg)
    if want_commit_msg:
        first_key = list(corpus)[0]
        # compute this before we change things
        cmsg = commit_msg(args, corpus, first_key, sought)

    for key in corpus:
        document = corpus[key]
        _merge_dialogues_in_document(sought, document)
        save_document(output_dir, key, document)
    announce_output_dir(output_dir)

    if want_commit_msg:
        print("-----8<------")
        print(cmsg)