Beispiel #1
0
def main(args):
    """
    Subcommand main: move a span of text from each source document
    to the subdocument named by `args.target`.

    Only spans that start at offset 0 or that end at the end of the
    source text are supported; anything else raises `ValueError`.
    A diff of both documents is printed on stderr and the modified
    documents are saved to the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # the target key is the source key redirected to the target subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]

        # reject spans that touch neither end of the source text
        if span_start != 0 and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if span_start == 0:
            # move up: the span is a prefix of the source document
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc, span_end, tgt_split=-1)
        else:
            # move down: move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the tail
            # src_doc[start:] is first detached into a temporary document
            # which is then moved as a whole
            new_src_doc, detached_doc = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached_doc, tgt_doc, -1, tgt_split=0)
            # the whitespace separating the two halves went with the
            # detached one, so give the remaining half a new trailing space
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')

        # diff output, meant as material for a suggested commit message
        report = "\n".join([
            "======= TO %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_k,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)

        # write out both modified documents
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Beispiel #2
0
def main(args):
    """
    Subcommand main: move a span of text from each source document
    to the subdocument named by `args.target`.

    The span must either start at offset 0 or end where the source
    document's text span ends; otherwise the program exits with an
    error message.  It also exits if the target subdocument is not in
    the target corpus.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k in src_corpus:
        # same key as the source, but pointing at the target subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_k)

        src_doc = src_corpus[src_k]
        tgt_doc = tgt_corpus[tgt_k]
        if span_start == 0:
            # the span is a prefix of the source document
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                src_split=span_end, tgt_split=-1)
        elif span_end == src_doc.text_span().char_end:
            # the span is a suffix: detach it, then move it as a whole
            new_src_doc, detached_doc = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached_doc, tgt_doc,
                src_split=-1, tgt_split=0)
        else:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        report = "\n".join([
            "======= TO %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_k,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Beispiel #3
0
def main(args):
    """
    Subcommand main: merge turn-final emoticons, one (doc, subdoc)
    family of corpus keys at a time.

    The turns to fix are computed from the discourse-stage document of
    each family; the merge is then applied to a copy of every document
    in that family and the copies are saved to the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # group keys by (doc, subdoc); remember each group's discourse key
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        families[family].append(key)
        if key.stage == 'discourse':
            discourse_subcorpus[family] = key

    for family in sorted(families):
        members = families[family]
        doc_name, subdoc_name = family
        print(family_banner(doc_name, subdoc_name, members))

        disc_k = discourse_subcorpus[family]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])

        messages = []
        if warn_turns:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            messages += [" " + disc_doc.text(turn.text_span())
                         for turn in warn_turns]
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        nothing_to_merge = not turns
        if nothing_to_merge:
            messages.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)

        print("\n".join(messages))
        if nothing_to_merge:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for key in members:
            # work on a copy so the original stays available for the diff
            merged_doc = copy.deepcopy(corpus[key])
            merge_final_emoticons(tcache, turn_spans, merged_doc,
                                  postags[key])
            if key == disc_k:
                # only the discourse-stage document gets its diff shown
                for turn_span in turn_spans:
                    print(show_diff(corpus[key], merged_doc, span=turn_span))
                    print()
            save_document(output_dir, key, merged_doc)
        tcache.reset()
    announce_output_dir(output_dir)
Beispiel #4
0
def main(args):
    """
    Subcommand main: merge turn-final emoticons in every family of
    documents sharing the same (doc, subdoc) pair.

    Offending turns are found in the discourse-stage document of the
    family; every document of the family is then copied, merged and
    saved to the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # index the corpus keys by family, noting the discourse-stage key
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for file_id in corpus:
        fam_id = (file_id.doc, file_id.subdoc)
        families[fam_id].append(file_id)
        if file_id.stage == 'discourse':
            discourse_subcorpus[fam_id] = file_id

    for fam_id in sorted(families):
        fam_keys = families[fam_id]
        print(family_banner(fam_id[0], fam_id[1], fam_keys))

        disc_k = discourse_subcorpus[fam_id]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(
            disc_doc, postags[disc_k])
        skip_family = not turns

        report = []
        if warn_turns:
            report.append("Note: These turns have emoticon-only EDUs that "
                          "I dare not touch because they either "
                          "participate in relations or CDUs: ")
            for warn_turn in warn_turns:
                report.append(" " + disc_doc.text(warn_turn.text_span()))
            report.append("If the "
                          "relations can be removed, or the CDUs reduced, "
                          "please do this by hand and re-run the script:")
        if skip_family:
            report.append(
                "Skipping %s (and related); no offending emoticons" % disc_k)
        print("\n".join(report))

        if skip_family:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for file_id in fam_keys:
            # merge on a deep copy; the original is needed for the diff
            new_doc = copy.deepcopy(corpus[file_id])
            tags = postags[file_id]
            merge_final_emoticons(tcache, turn_spans, new_doc, tags)
            if file_id == disc_k:
                for turn_span in turn_spans:
                    print(show_diff(corpus[file_id], new_doc, span=turn_span))
                    print()
            save_document(output_dir, file_id, new_doc)
        tcache.reset()
    announce_output_dir(output_dir)
Beispiel #5
0
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the split at
    turn `args.turn` affected the text within `span`
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner = "======= SPLIT AT TURN {} in {} ========".format(args.turn, k)
    diff_text = show_diff(before, after)
    return [banner, "...", diff_text, "...", ""]
Beispiel #6
0
def _mini_diff(k, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the EDU
    merge affected the text within `span`
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner = "======= MERGE EDUS %s ========" % k
    diff_text = show_diff(before, after)
    return [banner, "...", diff_text, "...", ""]
Beispiel #7
0
def _mini_diff(k, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the EDU
    split affected the text within `span`
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    lines = ["======= SPLIT EDU %s ========" % k, "..."]
    lines.append(show_diff(before, after))
    lines.extend(["...", ""])
    return lines
Beispiel #8
0
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how splitting at
    turn `args.turn` affected the text within `span`
    """
    narrowed_old = narrow_to_span(old_doc, span)
    narrowed_new = narrow_to_span(new_doc, span)
    lines = ["======= SPLIT AT TURN {} in {} ========".format(args.turn, k),
             "..."]
    lines.append(show_diff(narrowed_old, narrowed_new))
    lines.extend(["...", ""])
    return lines
Beispiel #9
0
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how nudging turn
    `args.turn` in direction `args.direction` affected the text
    within `span`
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner_args = (args.turn, args.direction, k)
    banner = "======= NUDGE TURN %d %s in %s ========" % banner_args
    diff_text = show_diff(before, after)
    return [banner, "...", diff_text, "...", ""]
Beispiel #10
0
def _mini_diff(k, old_doc_span, new_doc_span):
    """
    Return lines of text to be printed out, showing how the nudge
    affected the text.

    Both arguments are (document, span) pairs.  The diff is restricted
    to the result of `_enclosing_turn_span` for the old span in the
    old document.
    """
    doc_before, span_before = old_doc_span
    doc_after, span_after = new_doc_span
    # both documents are narrowed to the same window, computed from the
    # old document, so that the two sides of the diff line up
    window = _enclosing_turn_span(doc_before, span_before)
    narrowed_before = narrow_to_span(doc_before, window)
    narrowed_after = narrow_to_span(doc_after, window)
    banner = ("======= NUDGE %s to %s in %s ========"
              % (span_before, span_after, k))
    diff_text = show_diff(narrowed_before, narrowed_after)
    return [banner, "...", diff_text, "...", ""]
Beispiel #11
0
def _mini_diff(k, old_doc_span, new_doc_span):
    """
    Return lines of text to be printed out, showing how the nudge
    affected the text.

    `old_doc_span` and `new_doc_span` are (document, span) pairs; the
    diff only covers the span returned by `_enclosing_turn_span` for
    the old pair.
    """
    src_doc, src_span = old_doc_span
    dst_doc, dst_span = new_doc_span
    # one shared window, derived from the old document, for both sides
    turn_window = _enclosing_turn_span(src_doc, src_span)
    src_excerpt = narrow_to_span(src_doc, turn_window)
    dst_excerpt = narrow_to_span(dst_doc, turn_window)
    lines = ["======= NUDGE %s to %s in %s ========"
             % (src_span, dst_span, k),
             "..."]
    lines.append(show_diff(src_excerpt, dst_excerpt))
    lines.extend(["...", ""])
    return lines
Beispiel #12
0
def main(args):
    """Subcommand main.

    Replace the text at `args.span` with `args.sub_text` in every
    requested document of the corpus, print a diff of each change on
    stderr and save the modified documents to the output directory.
    Unless `args.no_commit_msg` is set, a suggested commit message
    built from the first processed document is printed on stdout.

    If no document matches the request, nothing is saved and no commit
    message is printed.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor

    # WIP new_span, depends on the offset: the end of the span shifts by
    # the difference in length between the new and the old text; this is
    # the same for every document, so compute it once
    offset = len(sub_text) - (span.char_end - span.char_start)
    new_span = Span(span.char_start, span.char_end + offset)
    # end WIP

    # (key, annotations before, annotations after) of the first document,
    # kept for the commit message; stays None if the corpus is empty
    first_change = None
    for tgt_k, tgt_doc in tgt_corpus.items():
        anno_before = annotate_doc(tgt_doc, span=span)
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc, span, sub_text,
                                           minor=minor)
        anno_after = annotate_doc(new_tgt_doc, span=new_span)
        if first_change is None:
            first_change = (tgt_k, anno_before, anno_after)

        # show diff and save doc
        report = "\n".join([
            "======= REPLACE TEXT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
        ])
        if sys.version_info[0] < 3:
            # only Python 2 needs explicit encoding; on Python 3, printing
            # bytes would emit their b'...' repr instead of the text
            report = report.encode('utf-8')
        print(report, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message
    if first_change is None:
        # empty corpus: there is no change to describe
        return
    tgt_k, anno_str_before, anno_str_after = first_change
    if tgt_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(tgt_k, anno_str_before, anno_str_after))
Beispiel #13
0
def main(args):
    """Subcommand main.

    For every requested document of the corpus, replace the text at
    `args.span` with `args.sub_text`, print a diff on stderr and save
    the modified document to the output directory.  Unless
    `args.no_commit_msg` is set, a suggested commit message for the
    first processed document is then printed on stdout.

    When the request matches no document, the function returns after
    announcing the output directory, without a commit message.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor

    # WIP new_span, depends on the offset (identical for all documents,
    # hence computed outside the loop)
    offset = len(sub_text) - (span.char_end - span.char_start)
    new_span = Span(span.char_start, span.char_end + offset)
    # end WIP

    # store before/after
    annos_before = []
    annos_after = []
    processed_keys = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        processed_keys.append(tgt_k)
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        diff_text = "\n".join(diffs)
        if sys.version_info[0] < 3:
            # Python 2 wants encoded bytes here; Python 3 must be given
            # text, as printing bytes would show their b'...' repr
            diff_text = diff_text.encode('utf-8')
        print(diff_text, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message, based on the first processed document
    if not processed_keys:
        # nothing was processed: indexing [0] below would raise IndexError
        return
    tgt_k = processed_keys[0]
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if tgt_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(tgt_k, anno_str_before, anno_str_after))
Beispiel #14
0
def main(args):
    """
    Subcommand main.

    Read the single document found in the `args.insert` directory and
    pass it to `move_portion` together with each requested document of
    the corpus, using `args.start` as the split point in the target.
    A diff of each target is printed on stderr and the modified targets
    are saved to the output directory.

    Exits with an error message unless the insert directory yields
    exactly one document.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # exactly one document at this point; `values()[0]` would fail on
    # Python 3, where dict views cannot be indexed
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # -1 is the source split; the leftover source half is not needed
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        report = "\n".join([
            "======= INSERT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
        ])
        if sys.version_info[0] < 3:
            # only Python 2 needs explicit encoding; on Python 3, printing
            # bytes would emit their b'...' repr instead of the text
            report = report.encode('utf-8')
        print(report, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Beispiel #15
0
def main(args):
    """
    Subcommand main.

    Take the one document found in the `args.insert` directory and
    hand it to `move_portion` for every requested document of the
    corpus, with `args.start` as the target split point.  Each
    resulting target document is diffed on stderr and saved to the
    output directory.

    Exits with an error message if the insert directory contains no
    document or more than one.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    num_src_docs = len(src_corpus)
    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif num_src_docs > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 num_src_docs)

    # the corpus holds exactly one document here; go through an iterator
    # because dict views are not subscriptable on Python 3
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # only the updated target document is of interest
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,  # src_split
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        diff_text = "\n".join(diffs)
        if sys.version_info[0] < 3:
            # Python 2 wants encoded bytes here; Python 3 must be given
            # text, as printing bytes would show their b'...' repr
            diff_text = diff_text.encode('utf-8')
        print(diff_text, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Beispiel #16
0
def main(args):
    """
    Subcommand main: for each source document, move the text span
    `args.span` into the subdocument named by `args.target`.

    The span has to start at offset 0 or to end at the end of the
    source text, otherwise `ValueError` is raised.  `ValueError` is
    also raised when the target subdocument is missing from the target
    corpus.  Diffs are printed on stderr and both modified documents
    are saved to the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    first_char = args.span.char_start
    last_char = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # retrieve target subdoc: same key, different subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]

        # the span must touch one end of the source text
        if first_char != 0 and last_char != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if first_char == 0:
            # move up
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                last_char,  # src_split
                tgt_split=-1)
        else:
            # move down
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so
            # src_doc[start:] is detached into a temporary document
            # on which move_portion is then called
            new_src_doc, tail_doc = split_doc(src_doc, first_char)
            _, new_tgt_doc = move_portion(
                renames, tail_doc, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between the two halves went to the detached
            # one, so a new whitespace is appended to new_src_doc
            padded_text = new_src_doc.text() + ' '
            evil_set_text(new_src_doc, padded_text)

        # print diff for suggested commit message
        tgt_diff = show_diff(tgt_doc, new_tgt_doc)
        src_diff = show_diff(src_doc, new_src_doc)
        report_lines = ["======= TO %s   ========" % tgt_k,
                        tgt_diff,
                        "^------ FROM %s" % src_k,
                        src_diff,
                        ""]
        print("\n".join(report_lines), file=sys.stderr)

        # dump the modified documents
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)