Ejemplo n.º 1
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    :param args: parsed command-line arguments; must provide
        `spans`, `annotator` and `no_commit_msg`
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    # the overall span depends only on args, so compute it once here
    # instead of recomputing it on every loop iteration
    span = Span.merge_all(args.spans)
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        # work on a copy so the diff below can compare old vs new
        new_doc = copy.deepcopy(old_doc)
        _split_edu(tcache, k, new_doc, args.spans)
        diffs = _mini_diff(k, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation (only the last key's info is kept)
        commit_info = CommitInfo(key=k,
                                 annotator=args.annotator,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Ejemplo n.º 2
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    # NOTE(review): args is expected to carry `spans`, `annotator` and
    # `no_commit_msg` -- confirm against the argparser configuration
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        # work on a copy so the diff below can compare old vs new
        new_doc = copy.deepcopy(old_doc)
        # loop-invariant: args.spans does not change between iterations
        span = Span.merge_all(args.spans)
        _split_edu(tcache, k, new_doc, args.spans)
        diffs = _mini_diff(k, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation (only the last key's info is kept)
        commit_info = CommitInfo(key=k,
                                 annotator=args.annotator,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Ejemplo n.º 3
0
def merge_turn_stars(doc):
    """Return a copy of the document in which consecutive turns
    by the same speaker have been merged.

    Merging is done by taking the first turn in grouping of
    consecutive speaker turns, and stretching its span over all
    the subsequent turns.

    Additionally turn prefix text (containing turn numbers and
    speakers) from the removed turns are stripped out.
    """
    def prefix_extent(turn):
        "span (start, end) of the given turn's prefix text"
        pfx, _ = split_turn_text(doc.text(turn.text_span()))
        begin = turn.text_span().char_start
        return begin, begin + len(pfx)

    # operate on a copy; the caller's document is left untouched
    doc = copy.deepcopy(doc)
    dropped = []  # turns removed by merging (their prefixes get blanked)
    dialogues = sorted((unit for unit in doc.units if is_dialogue(unit)),
                       key=lambda unit: unit.text_span())
    for dlg in dialogues:
        turns_here = sorted(turns_in_span(doc, dlg.text_span()),
                            key=lambda unit: unit.text_span())
        # groupby yields runs of *consecutive* same-speaker turns,
        # which is exactly the turn-star grouping we want
        for _, run in itr.groupby(turns_here, anno_speaker):
            run = list(run)
            leader = run[0]
            # stretch the first turn of the run over the whole run
            leader.span = Span.merge_all(turn.text_span() for turn in run)
            for extra in run[1:]:
                dropped.append(extra)
                doc.units.remove(extra)
    # pylint: disable=protected-access
    doc._text = _blank_out(doc._text, [prefix_extent(t) for t in dropped])
    # pylint: enable=protected-access
    return doc
Ejemplo n.º 4
0
def merge_turn_stars(doc):
    """Return a copy of the document in which consecutive turns
    by the same speaker have been merged.

    Merging is done by taking the first turn in grouping of
    consecutive speaker turns, and stretching its span over all
    the subsequent turns.

    Additionally turn prefix text (containing turn numbers and
    speakers) from the removed turns are stripped out.
    """
    def prefix_span(turn):
        "given a turn annotation, return the span of its prefix"
        prefix, _ = split_turn_text(doc.text(turn.text_span()))
        start = turn.text_span().char_start
        return start, start + len(prefix)

    # operate on a copy; the caller's document is left untouched
    doc = copy.deepcopy(doc)
    dialogues = sorted([x for x in doc.units if is_dialogue(x)],
                       key=lambda x: x.text_span())
    rejects = []  # spans for the "deleted" turns' prefixes
    for dia in dialogues:
        dia_turns = sorted(turns_in_span(doc, dia.text_span()),
                           key=lambda x: x.text_span())
        # groupby yields runs of *consecutive* same-speaker turns,
        # which is exactly the turn-star grouping we want
        for _, turns in itr.groupby(dia_turns, anno_speaker):
            turns = list(turns)
            tstar = turns[0]
            # stretch the first turn of the run over the whole run
            tstar.span = Span.merge_all(x.text_span() for x in turns)
            rejects.extend(turns[1:])
            for anno in turns[1:]:
                doc.units.remove(anno)
    # blank out the prefix text (turn number/speaker) of removed turns
    # pylint: disable=protected-access
    doc._text = _blank_out(doc._text, [prefix_span(x) for x in rejects])
    # pylint: enable=protected-access
    return doc
Ejemplo n.º 5
0
def _enclosing_turn_span(doc, span):
    """
    Return the span for any turn annotations that enclose this span.
    If none are found, return the span itself
    """
    enclosing = [unit.text_span() for unit in doc.units
                 if educe.stac.is_turn(unit)
                 and unit.text_span().encloses(span)]
    # seed with the input span so an empty match list yields it unchanged
    return Span.merge_all([span] + enclosing)
Ejemplo n.º 6
0
def _enclosing_turn_span(doc, span):
    """
    Return the span for any turn annotations that enclose this span.
    If none are found, return the span itself
    """
    def is_match(anno):
        "enclosing turn"
        return educe.stac.is_turn(anno) and anno.text_span().encloses(span)

    # seed with the input span so an empty match list yields it unchanged
    spans = [span] + [u.text_span() for u in doc.units if is_match(u)]
    return Span.merge_all(spans)
Ejemplo n.º 7
0
def _recompute_spans(tree, context):
    """
    Recalculate tree node spans from the bottom up
    (helper for _align_with_context)
    """
    # leaves are left alone; only interior Tree nodes carry a span
    if not isinstance(tree, Tree):
        return
    child_spans = []
    for kid in tree:
        _recompute_spans(kid, context)
        child_spans.append(_tree_span(kid))
    node = treenode(tree)
    node.span = Span.merge_all(child_spans)
    node.context = context
Ejemplo n.º 8
0
def _recompute_spans(tree, context):
    """
    Recalculate tree node spans from the bottom up
    (helper for _align_with_context)
    """
    # leaves are left alone; only interior Tree nodes carry a span
    if isinstance(tree, Tree):
        spans = []
        for child in tree:
            # recurse first so children's spans are up to date
            _recompute_spans(child, context)
            spans.append(_tree_span(child))
        treenode(tree).span = Span.merge_all(spans)
        treenode(tree).context = context
Ejemplo n.º 9
0
def _split_edu(tcache, k, doc, spans):
    """
    Find the edu covered by these spans and do the split
    """
    # the target EDU must cover exactly the union of the given spans
    target = Span.merge_all(spans)
    hits = [unit for unit in doc.units
            if educe.stac.is_edu(unit) and unit.text_span() == target]
    if hits:
        _actually_split(tcache, doc, spans, hits[0])
    elif k.stage == 'discourse':
        # discourse stage: the EDU may already be split; adjust instead
        _tweak_presplit(tcache, doc, spans)
    else:
        print("No matches found in %s" % k, file=sys.stderr)
Ejemplo n.º 10
0
def _split_edu(tcache, k, doc, spans):
    """
    Find the edu covered by these spans and do the split
    """
    # seek edu: it must cover exactly the union of the given spans
    big_span = Span.merge_all(spans)
    matches = [
        x for x in doc.units
        if x.text_span() == big_span and educe.stac.is_edu(x)
    ]
    if not matches and k.stage != 'discourse':
        print("No matches found in %s" % k, file=sys.stderr)
    elif not matches:
        # discourse stage: the EDU may already be split; adjust instead
        _tweak_presplit(tcache, doc, spans)
    else:
        _actually_split(tcache, doc, spans, matches[0])
Ejemplo n.º 11
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a document and a collection of edus,
    replace the edus with a single merged edu in the document

    Anything that points to one of the EDUs should point
    instead to the new edu.

    Anything which points exclusively to EDUs in the span
    should be deleted (or signaled?)

    Annotations and features should be merged
    """

    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        # sort so the joined result is deterministic: the callers pass
        # frozensets, whose iteration order is arbitrary, which previously
        # made the merged type/feature strings unstable across runs
        strs = sorted(x for x in strs if x is not None)
        return strs[0] if len(strs) == 1\
            else _MERGE_PREFIX + "/".join(strs)

    if not edus:
        return
    # the merged EDU inherits everything from the first one, then is
    # stretched over the union of all the input EDUs' spans
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement so the merged EDU takes the position of the
    # first matching original in doc.units
    for i, _ in enumerate(doc.units):
        if doc.units[i] in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        # redirect any relations/schemas pointing at the old EDU
        retarget(doc, edu.local_id(), new_edu)
Ejemplo n.º 12
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a document and a collection of edus,
    replace the edus with a single merged edu in the document

    Anything that points to one of the EDUs should point
    instead to the new edu.

    Anything which points exclusively to EDUs in the span
    should be deleted (or signaled?)

    Annotations and features should be merged
    """
    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        # NOTE(review): callers pass frozensets, whose iteration order is
        # arbitrary, so the "/".join result is nondeterministic -- consider
        # sorting before joining
        strs = [x for x in strs if x is not None]
        return list(strs)[0] if len(strs) == 1\
            else _MERGE_PREFIX + "/".join(strs)

    if not edus:
        return
    # the merged EDU inherits everything from the first one, then is
    # stretched over the union of all the input EDUs' spans
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement so the merged EDU takes the position of the
    # first matching original in doc.units
    for i, _ in enumerate(doc.units):
        if doc.units[i] in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        # redirect any relations/schemas pointing at the old EDU
        retarget(doc, edu.local_id(), new_edu)
Ejemplo n.º 13
0
def _merge_edus(tcache, span, doc):
    """
    Find any EDUs within the given span in the document
    and merge them into a single one.

    The EDUs should stretch from the beginning to the end of
    the span (gaps OK).

    The output EDU should have the same ID in all documents
    """
    edus = edus_in_span(doc, span)
    if not edus:
        sys.exit("No EDUs in span %s" % span)

    espan = Span.merge_all(x.text_span() for x in edus)
    if espan != span:
        # fixed garbled message (was "EDUs in do not cover full span")
        sys.exit("EDUs do not cover full span %s [only %s]" % (span, espan))
    _actually_merge(tcache, edus, doc)
Ejemplo n.º 14
0
def _merge_edus(tcache, span, doc):
    """
    Find any EDUs within the given span in the document
    and merge them into a single one.

    The EDUs should stretch from the beginning to the end of
    the span (gaps OK).

    The output EDU should have the same ID in all documents
    """
    edus = edus_in_span(doc, span)
    if not edus:
        sys.exit("No EDUs in span %s" % span)

    # the merged extent of the found EDUs must match the requested span
    espan = Span.merge_all(x.text_span() for x in edus)
    if espan != span:
        # NOTE(review): message reads "EDUs in do not" -- looks garbled,
        # probably meant "EDUs do not cover full span"
        sys.exit("EDUs in do not cover full span %s [only %s]" %
                 (span, espan))
    _actually_merge(tcache, edus, doc)
Ejemplo n.º 15
0
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the
    same new EDU across all sections

    Discourse stage: If the EDU is in any relations or CDUs,
    replace any references to it with a new CDU encompassing
    the newly created EDUs
    """
    new_edus = {}
    for span in sorted(spans):
        # the timestamp cache keeps IDs consistent across sections
        stamp = tcache.get(span)
        fresh = copy.deepcopy(edu)
        set_anno_date(fresh, stamp)
        set_anno_author(fresh, _AUTHOR)
        if doc.origin.stage == 'units':
            # flag split annotations so a human can review them
            fresh.type = _SPLIT_PREFIX + fresh.type
            for key in fresh.features:
                fresh.features[key] = _SPLIT_PREFIX + fresh.features[key]
        fresh.span = span
        new_edus[anno_id_from_tuple((_AUTHOR, stamp))] = fresh
        doc.units.append(fresh)

    # build a CDU gathering the new EDUs, in case anything pointed
    # at the original EDU
    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(new_edus),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit', {},
                                  metadata={'author': _AUTHOR,
                                            'creation-date': str(cdu_stamp)})
    cdu.fleshout(new_edus)

    # redirect references from the old EDU; only keep the CDU if
    # anything actually pointed at it
    want_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
Ejemplo n.º 16
0
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the
    same new EDU across all sections

    Discourse stage: If the EDU is in any relations or CDUs,
    replace any references to it with a new CDU encompassing
    the newly created EDUs
    """

    new_edus = {}
    for span in sorted(spans):
        # the timestamp cache keeps IDs consistent across sections
        stamp = tcache.get(span)
        edu2 = copy.deepcopy(edu)
        new_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(edu2, stamp)
        set_anno_author(edu2, _AUTHOR)
        if doc.origin.stage == 'units':
            # flag split annotations so a human can review them
            edu2.type = _SPLIT_PREFIX + edu2.type
            for key in edu2.features:
                edu2.features[key] = _SPLIT_PREFIX + edu2.features[key]
        new_edus[new_id] = edu2
        edu2.span = span
        doc.units.append(edu2)

    # build a CDU gathering the new EDUs, in case anything pointed
    # at the original EDU
    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu = educe.annotation.Schema(anno_id_from_tuple((_AUTHOR, cdu_stamp)),
                                  frozenset(new_edus),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata={'author': _AUTHOR,
                                            'creation-date': str(cdu_stamp)})
    cdu.fleshout(new_edus)

    # redirect references from the old EDU; only keep the CDU if
    # anything actually pointed at it
    want_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
Ejemplo n.º 17
0
def _nudge_down(turn, dialogue, prev_turn, next_dialogue):
    """
    Move last turn to next dialogue. (ie. shorten the right
    boundary of this dialogue and extend the left boundary of
    this dialogue)

    Return encompassing span to show what we've changed
    """
    if not prev_turn:
        sys.exit("Can't move very first turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not next_dialogue:
        # fixed missing space after the full stop (message previously read
        # "dialogue.Try"); matches the style of the message above
        sys.exit("Can't move from last dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_end != dialogue.span.char_end:
        sys.exit("Turn %d %s is not at the end of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    # offset is negative: distance back to the previous turn's end
    offset = prev_turn.span.char_end - turn.span.char_end
    # take both dialogue boundaries down a bit (to next turn end)
    next_dialogue.span.char_start += offset
    dialogue.span.char_end += offset
    return Span.merge_all([dialogue.span, next_dialogue.span])
Ejemplo n.º 18
0
def _nudge_down(turn, dialogue, prev_turn, next_dialogue):
    """
    Move last turn to next dialogue. (ie. shorten the right
    boundary of this dialogue and extend the left boundary of
    this dialogue)

    Return encompassing span to show what we've changed
    """
    if not prev_turn:
        sys.exit("Can't move very first turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not next_dialogue:
        # fixed missing space after the full stop (message previously read
        # "dialogue.Try"); matches the style of the message above
        sys.exit("Can't move from last dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_end != dialogue.span.char_end:
        sys.exit("Turn %d %s is not at the end of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    # offset is negative: distance back to the previous turn's end
    offset = prev_turn.span.char_end - turn.span.char_end
    # take both dialogue boundaries down a bit (to next turn end)
    next_dialogue.span.char_start += offset
    dialogue.span.char_end += offset
    return Span.merge_all([dialogue.span, next_dialogue.span])
Ejemplo n.º 19
0
def _nudge_up(turn, dialogue, next_turn, prev_dialogue):
    """
    Move first turn to previous dialogue (ie. extend the
    previous dialogue to incorporate this turn, and push
    this dialogue to exclude it)

    Return encompassing span to show what we've changed
    """
    # guard clauses: each sys.exit terminates, so no elif chain needed
    if not next_turn:
        sys.exit("Can't move very last turn. "
                 "Try `stac-util merge-dialogue` instead")
    if not prev_dialogue:
        sys.exit("Can't move from first dialogue. "
                 "Try `stac-util move` instead")
    if turn.span.char_start - 1 != dialogue.span.char_start:
        sys.exit("Turn %d %s is not at the start of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    shift = next_turn.span.char_start - turn.span.char_start
    # nudge both dialogue boundaries up a bit (to the previous turn end)
    prev_dialogue.span.char_end += shift
    dialogue.span.char_start += shift
    return Span.merge_all([prev_dialogue.span, dialogue.span])
Ejemplo n.º 20
0
def _nudge_up(turn, dialogue, next_turn, prev_dialogue):
    """
    Move first turn to previous dialogue (ie. extend the
    previous dialogue to incorporate this turn, and push
    this dialogue to exclude it)

    Return encompassing span to show what we've changed
    """
    if not next_turn:
        sys.exit("Can't move very last turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not prev_dialogue:
        # fixed missing space after the full stop (message previously read
        # "dialogue.Try"); matches the style of the message above
        sys.exit("Can't move from first dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_start - 1 != dialogue.span.char_start:
        sys.exit("Turn %d %s is not at the start of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    offset = next_turn.span.char_start - turn.span.char_start
    # take both dialogue boundaries up a bit (to prev turn end)
    prev_dialogue.span.char_end += offset
    dialogue.span.char_start += offset
    return Span.merge_all([prev_dialogue.span, dialogue.span])