Exemple #1
0
def main(args):
    """
    Entry point for the rename subcommand.

    Checks that --stage and --annotator are used together sensibly,
    reads the corpus, renames the annotation `args.source` in every
    document and saves each document to the output directory.

    Normally reached through `config_argparser` rather than called
    directly.
    """
    if args.stage:
        unannotated = args.stage == 'unannotated'
        if not unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    out_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    old_id = args.source
    new_id = _get_target(args, old_id, corpus)

    for key in corpus:
        print(key)
        document = corpus[key]
        _rename_in_doc(old_id, new_id, document)
        save_document(out_dir, key, document)

    # report on stderr so the summary does not mix with the key listing
    summary = "Renamed from %s to %s" % (anno_id_from_tuple(old_id),
                                         anno_id_from_tuple(new_id))
    print(summary, file=sys.stderr)
    announce_output_dir(out_dir)
Exemple #2
0
def _rename_in_doc(source, target, doc):
    """
    Give the annotation identified by `source` the id `target`, and
    repoint the document's relations and schemas at the new id.

    Both ids are (author, date) tuples.  The program exits if the
    document does not hold exactly one annotation with the source id.

    NB: modifies doc
    """
    hits = [anno for anno in doc.annotations()
            if anno_id_to_tuple(anno.local_id()) == source]
    old_ref = anno_id_from_tuple(source)
    new_ref = anno_id_from_tuple(target)
    new_author, new_date = target

    if not hits:
        sys.exit("No annotations found with id %s" % old_ref)
    if len(hits) > 1:
        sys.exit("Huh?! More than one annotation with id %s" % old_ref)
    evil_set_id(hits[0], new_author, new_date)

    def swap(ref):
        "Map the old id to the new one; leave any other id alone"
        return new_ref if ref == old_ref else ref

    # relation endpoints are stored as id strings on the span
    for rel in doc.relations:
        if rel.span.t1 == old_ref:
            rel.span.t1 = new_ref
        if rel.span.t2 == old_ref:
            rel.span.t2 = new_ref

    # each schema keeps three collections of member ids
    for schema in doc.schemas:
        for field in ('units', 'relations', 'schemas'):
            members = getattr(schema, field)
            setattr(schema, field, [swap(ref) for ref in members])
Exemple #3
0
def _tweak_presplit(tcache, doc, spans):
    """
    Handle the case where the split already exists in the document
    (done by hand in the discourse section).

    For each span, the first EDU covering exactly that span is
    re-stamped with the shared author and the timestamp `tcache`
    yields for the span; relations and schema units that pointed at
    the old id are then redirected to the new one.

    Raises an Exception if a span has no matching EDU.
    """
    id_map = {}
    for span in sorted(spans):
        candidates = [unit for unit in doc.units
                      if unit.text_span() == span and educe.stac.is_edu(unit)]
        if not candidates:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        chosen = candidates[0]
        # read the id before re-stamping: the setters below change
        # the author and date on the annotation
        previous_id = chosen.local_id()
        replacement_id = anno_id_from_tuple((_AUTHOR, tcache.get(span)))
        set_anno_date(chosen, tcache.get(span))
        set_anno_author(chosen, _AUTHOR)
        id_map[previous_id] = replacement_id

    for rel in doc.relations:
        for end in ('t1', 't2'):
            current = getattr(rel.span, end)
            if current in id_map:
                setattr(rel.span, end, id_map[current])

    for schema in doc.schemas:
        members = set(schema.units)
        for member in schema.units:
            if member not in id_map:
                continue
            members.remove(member)
            members.add(id_map[member])
        schema.units = members
Exemple #4
0
def _tweak_presplit(tcache, doc, spans):
    """
    Deal with a split that was already made manually (in the
    discourse section).

    Every span is matched to the first EDU with exactly that text
    span; that EDU gets the shared author and the timestamp that
    `tcache` returns for the span.  Old ids found in relation
    endpoints and schema units are replaced by the new ids.

    Raises an Exception when no EDU matches a span.
    """
    renamed = {}
    for span in sorted(spans):
        found = [unit for unit in doc.units
                 if unit.text_span() == span and educe.stac.is_edu(unit)]
        if not found:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        target_edu = found[0]
        # the old id has to be captured before author/date are rewritten
        before = target_edu.local_id()
        after = anno_id_from_tuple((_AUTHOR, tcache.get(span)))
        set_anno_date(target_edu, tcache.get(span))
        set_anno_author(target_edu, _AUTHOR)
        renamed[before] = after

    for relation in doc.relations:
        for endpoint in ('t1', 't2'):
            ref = getattr(relation.span, endpoint)
            if ref in renamed:
                setattr(relation.span, endpoint, renamed[ref])

    for schema in doc.schemas:
        updated = set(schema.units)
        for ref in schema.units:
            if ref not in renamed:
                continue
            updated.remove(ref)
            updated.add(renamed[ref])
        schema.units = updated
Exemple #5
0
def _actually_split(tcache, doc, spans, edu):
    """
    Replace `edu` by one copy per span in `spans`.

    Timestamps come from `tcache`, keyed by span, with the aim of
    giving the same new EDU the same id across all sections.

    Discourse stage: if the EDU is in any relations or CDUs, references
    to it are replaced (via `retarget`) with a new CDU grouping the
    newly created EDUs.
    """
    pieces = {}
    for piece_span in sorted(spans):
        when = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, when))
        set_anno_date(piece, when)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            # units stage: prefix the type and every feature value
            piece.type = _SPLIT_PREFIX + piece.type
            for name, value in list(piece.features.items()):
                piece.features[name] = _SPLIT_PREFIX + value
        pieces[piece_id] = piece
        piece.span = piece_span
        doc.units.append(piece)

    # the CDU is stamped using the span that merges all the pieces
    merged_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, merged_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(merged_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    needs_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if needs_cdu:
        doc.schemas.append(cdu)
Exemple #6
0
def _actually_split(tcache, doc, spans, edu):
    """
    Split `edu` into one new EDU per span in `spans`, removing the
    original from the document.

    The timestamp for each new EDU is looked up in `tcache` by span,
    so that the same new EDU can get the same id in every section.

    Discourse stage: if the EDU is in any relations or CDUs, `retarget`
    replaces references to it with a new CDU encompassing the newly
    created EDUs; the CDU is only added when `retarget` asks for it.
    """
    fragments = {}
    for fragment_span in sorted(spans):
        stamp = tcache.get(fragment_span)
        fragment = copy.deepcopy(edu)
        fragment_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(fragment, stamp)
        set_anno_author(fragment, _AUTHOR)
        if doc.origin.stage == 'units':
            # mark the type and all feature values with the split prefix
            fragment.type = _SPLIT_PREFIX + fragment.type
            for feat, val in list(fragment.features.items()):
                fragment.features[feat] = _SPLIT_PREFIX + val
        fragments[fragment_id] = fragment
        fragment.span = fragment_span
        doc.units.append(fragment)

    # stamp the CDU from the union of all the fragment spans
    whole_stamp = tcache.get(Span.merge_all(spans))
    schema_id = anno_id_from_tuple((_AUTHOR, whole_stamp))
    schema_metadata = {'author': _AUTHOR,
                       'creation-date': str(whole_stamp)}
    cdu = educe.annotation.Schema(schema_id,
                                  frozenset(fragments),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=schema_metadata)
    cdu.fleshout(fragments)

    keep_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if keep_cdu:
        doc.schemas.append(cdu)
Exemple #7
0
def _get_annotation_with_id(sought_tuple, annotations):
    """
    Return the single annotation whose local id corresponds to the
    (author, creation_date) tuple `sought_tuple`.

    Raises an Exception if no annotation matches, or if several do.
    """
    wanted_id = anno_id_from_tuple(sought_tuple)
    found = [anno for anno in annotations if anno.local_id() == wanted_id]
    if not found:
        raise Exception('No annotations found with id %s' % wanted_id)
    if len(found) > 1:
        raise Exception('More than one annotation found with id %s'
                        % wanted_id)
    return found[0]
Exemple #8
0
def _get_annotation_with_id(sought_tuple, annotations):
    """
    Look up an annotation by its (author, creation_date) tuple.

    Exactly one annotation must have the matching local id; an
    Exception is raised when there are none or more than one.
    """
    target_id = anno_id_from_tuple(sought_tuple)
    hits = [anno for anno in annotations if anno.local_id() == target_id]
    count = len(hits)
    if count == 0:
        raise Exception('No annotations found with id %s' % target_id)
    if count > 1:
        raise Exception('More than one annotation found with id %s'
                        % target_id)
    return hits[0]
Exemple #9
0
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Build a relation of type `label` between the parent id and the
    child id.

    The relation is attributed to the 'stacparser' author; its
    creation date is taken from `tstamp.next()` and its id is built
    from that author/date pair.  Features are left empty.
    """
    author = 'stacparser'
    rel_span = RelSpan(local_id_parent, local_id_child)
    created = tstamp.next()
    return Relation(rel_id=stac_glozz.anno_id_from_tuple((author, created)),
                    span=rel_span,
                    rtype=label,
                    features={},
                    metadata={'author': author,
                              'creation-date': str(created)})
Exemple #10
0
def commit_msg(args, corpus, k, sought):
    """
    Build the commit message for the dialogue merge we are about to
    perform on document `corpus[k]`.

    Has to run before the merge, since it shows the dialogues as they
    currently are.  Returns a placeholder string when `sought` yields
    no dialogues.
    """
    doc = corpus[k]
    id_list = ", ".join(anno_id_from_tuple(x) for x in sought)
    dialogues = [_get_annotation_with_id(d, doc.units) for d in sought]
    if not dialogues:
        return "(no commit message; nothing to merge)"

    if args.turns:
        hint = " (turns %d-%d)" % tuple(args.turns)
    else:
        hint = ""
    merged_span = _merge_spans(dialogues)
    title = u"{doc}_{subdoc}: merge dialogues{hint}".format(doc=k.doc,
                                                            subdoc=k.subdoc,
                                                            hint=hint)
    listing = "Dialogues ({}), was:".format(id_list)
    snapshot = annotate_doc(doc, span=merged_span)
    # title, blank line, id listing, blank line, annotated text
    return "\n".join([title, "", listing, "", snapshot])
Exemple #11
0
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Create a relation instance with type `label` whose span goes from
    `local_id_parent` to `local_id_child`.

    The author is always 'stacparser'.  The creation date is the next
    value from `tstamp`, and the relation id is built from the
    (author, date) pair.  No features are set.
    """
    who = 'stacparser'
    endpoints = RelSpan(local_id_parent, local_id_child)
    when = tstamp.next()
    new_id = stac_glozz.anno_id_from_tuple((who, when))
    provenance = {'author': who, 'creation-date': str(when)}
    return Relation(rel_id=new_id,
                    span=endpoints,
                    rtype=label,
                    features={},
                    metadata=provenance)
Exemple #12
0
def commit_msg(args, corpus, k, sought):
    """
    Describe, as a commit message, the dialogue merging operation that
    is about to happen on `corpus[k]`.

    Must be called before the merge itself, because the message
    includes the pre-merge state of the dialogues.  When there are no
    dialogues to merge, a fixed placeholder message is returned.
    """
    doc = corpus[k]
    pretty_ids = ", ".join(anno_id_from_tuple(x) for x in sought)
    dialogues = [_get_annotation_with_id(d, doc.units) for d in sought]
    if not dialogues:
        return "(no commit message; nothing to merge)"

    turns_note = ""
    if args.turns:
        turns_note = " (turns %d-%d)" % tuple(args.turns)
    covered = _merge_spans(dialogues)
    heading = u"{doc}_{subdoc}: merge dialogues{hint}".format(
        doc=k.doc, subdoc=k.subdoc, hint=turns_note)
    parts = [heading, ""]
    parts.append("Dialogues ({}), was:".format(pretty_ids))
    parts.append("")
    parts.append(annotate_doc(doc, span=covered))
    return "\n".join(parts)
Exemple #13
0
def main(args):
    """
    Entry point for the delete subcommand.

    Checks the --stage/--annotator combination, reads the corpus,
    deletes the annotation `args.anno_id` from every document and
    saves each document to the output directory.

    Normally reached through `config_argparser` rather than called
    directly.
    """
    if args.stage:
        unannotated = args.stage == 'unannotated'
        if not unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    out_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    doomed = args.anno_id
    for key in corpus:
        print(key)
        document = corpus[key]
        _delete_in_doc(doomed, document)
        save_document(out_dir, key, document)

    # summary goes to stderr, apart from the key listing on stdout
    print("Deleted %s" % anno_id_from_tuple(doomed), file=sys.stderr)
    announce_output_dir(out_dir)
Exemple #14
0
def main(args):
    """
    Run the delete subcommand.

    After validating that --annotator is given exactly when --stage is
    something other than 'unannotated', every document of the corpus
    has the annotation `args.anno_id` deleted and is written to the
    output directory.

    You normally get here through `config_argparser`.
    """
    stage = args.stage
    if stage:
        is_unannotated = stage == 'unannotated'
        if is_unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")
        elif not is_unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    destination = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for doc_key in corpus:
        print(doc_key)
        current = corpus[doc_key]
        _delete_in_doc(args.anno_id, current)
        save_document(destination, doc_key, current)

    deleted_id = anno_id_from_tuple(args.anno_id)
    print("Deleted %s" % deleted_id, file=sys.stderr)
    announce_output_dir(destination)
Exemple #15
0
def _delete_in_doc(del_id, doc):
    """Delete the annotation with the given id in the given document

    If the document has no annotation with that id, a notice is printed
    to stderr and the document is left untouched.  The program exits if
    more than one annotation carries the id, or if a remaining relation
    or schema still refers to the deleted annotation.

    NB: modifies doc (`units`, `relations` and `schemas` are replaced
    by filtered lists)

    :param del_id: id of the annotation to delete, in the tuple form
                   produced by `anno_id_to_tuple`
    :param doc: document to delete from
    """
    pretty_id = anno_id_from_tuple(del_id)

    def is_ok(anno):
        "True if the annotation is not the one being deleted"
        return anno_id_to_tuple(anno.local_id()) != del_id

    matches = [x for x in doc.annotations() if not is_ok(x)]

    if not matches:
        print("Skipping... no annotations found with id %s" % pretty_id,
              file=sys.stderr)
        return
    elif len(matches) > 1:
        sys.exit("Huh?! More than one annotation with id %s" % pretty_id)

    doc.units = [x for x in doc.units if is_ok(x)]
    doc.relations = [x for x in doc.relations if is_ok(x)]
    doc.schemas = [x for x in doc.schemas if is_ok(x)]

    def oops(reason):
        "quit because of illegal delete"
        # both values go to `%` as a single tuple; passing `reason` as
        # a second argument to sys.exit raised a TypeError instead of
        # showing the message
        sys.exit("Can't delete %s because %s " % (pretty_id, reason))

    # the checks run on what is left after the removal, so they only
    # catch references held by other annotations
    for anno in doc.relations:
        if anno.span.t1 == pretty_id:
            oops("it is the source for a relation: %s" % anno)
        if anno.span.t2 == pretty_id:
            oops("it is the target for a relation: %s" % anno)
    for anno in doc.schemas:
        if pretty_id in anno.units:
            oops("it is a unit member of %s" % anno)
        if pretty_id in anno.relations:
            oops("it is a relation member of %s" % anno)
        if pretty_id in anno.schemas:
            oops("it is a schema member of %s" % anno)
Exemple #16
0
def _delete_in_doc(del_id, doc):
    """Delete the annotation with the given id in the given document

    When no annotation in the document has that id, a notice is printed
    to stderr and nothing is changed.  The program exits if the id is
    shared by more than one annotation, or if the deleted annotation is
    still referenced by a relation or schema that remains.

    NB: modifies doc (its `units`, `relations` and `schemas` are
    replaced by filtered lists)

    :param del_id: id of the annotation to delete, in the tuple form
                   produced by `anno_id_to_tuple`
    :param doc: document to delete from
    """
    pretty_id = anno_id_from_tuple(del_id)

    def is_ok(anno):
        "True if the annotation should be kept"
        return anno_id_to_tuple(anno.local_id()) != del_id

    matches = [x for x in doc.annotations() if not is_ok(x)]

    if not matches:
        print("Skipping... no annotations found with id %s" % pretty_id,
              file=sys.stderr)
        return
    elif len(matches) > 1:
        sys.exit("Huh?! More than one annotation with id %s" % pretty_id)

    doc.units = [x for x in doc.units if is_ok(x)]
    doc.relations = [x for x in doc.relations if is_ok(x)]
    doc.schemas = [x for x in doc.schemas if is_ok(x)]

    def oops(reason):
        "quit because of illegal delete"
        # the format string has two placeholders, so it needs a
        # two-element tuple; the earlier form fed it `pretty_id` alone
        # and failed with a TypeError before sys.exit was reached
        sys.exit("Can't delete %s because %s " % (pretty_id, reason))

    # only the surviving annotations are inspected here, i.e. anything
    # that still points at the annotation we just removed
    for anno in doc.relations:
        if anno.span.t1 == pretty_id:
            oops("it is the source for a relation: %s" % anno)
        if anno.span.t2 == pretty_id:
            oops("it is the target for a relation: %s" % anno)
    for anno in doc.schemas:
        if pretty_id in anno.units:
            oops("it is a unit member of %s" % anno)
        if pretty_id in anno.relations:
            oops("it is a relation member of %s" % anno)
        if pretty_id in anno.schemas:
            oops("it is a schema member of %s" % anno)
Exemple #17
0
def main(args):
    """
    Entry point for the merge-dialogues subcommand.

    The dialogues to merge are either listed in `args.dialogues` or
    derived from the turn range `args.turns`; at least two are needed.
    Every document in the corpus is merged and saved, and unless
    `args.no_commit_msg` is set a commit message is printed at the end.

    Normally reached through `config_argparser` rather than called
    directly.
    """
    too_few = "Must specify at least two dialogues"

    if not args.turns and len(args.dialogues) < 2:
        sys.exit(too_few)
    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus, args.turns[0], args.turns[1])
            if len(sought) < 2:
                sys.exit(too_few)
            pretty_ids = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + pretty_ids, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    want_msg = bool(corpus) and not args.no_commit_msg
    if want_msg:
        # the message describes the pre-merge state, so build it now
        first_key = list(corpus)[0]
        cmsg = commit_msg(args, corpus, first_key, sought)

    for key in corpus:
        document = corpus[key]
        _merge_dialogues_in_document(sought, document)
        save_document(output_dir, key, document)
    announce_output_dir(output_dir)

    if want_msg:
        print("-----8<------")
        print(cmsg)
Exemple #18
0
def main(args):
    """
    Run the merge-dialogues subcommand.

    Works out which dialogues to merge (from `args.turns` when given,
    otherwise from `args.dialogues`), insists on at least two, merges
    them in each document of the corpus and saves the result.  A commit
    message computed before the merge is printed last, unless
    `args.no_commit_msg` is set or the corpus is empty.

    You normally get here through `config_argparser`.
    """
    not_enough = "Must specify at least two dialogues"

    if not args.turns and len(args.dialogues) < 2:
        sys.exit(not_enough)
    destination = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus, args.turns[0], args.turns[1])
            if len(sought) < 2:
                sys.exit(not_enough)
            listing = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + listing, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    emit_message = bool(corpus) and not args.no_commit_msg
    if emit_message:
        # must be computed while the documents are still unmerged
        cmsg = commit_msg(args, corpus, list(corpus)[0], sought)

    for doc_key in corpus:
        current = corpus[doc_key]
        _merge_dialogues_in_document(sought, current)
        save_document(destination, doc_key, current)
    announce_output_dir(destination)

    if emit_message:
        print("-----8<------")
        print(cmsg)