Example #1
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document of the annotated corpus (visited in sorted key
    order), look up its counterpart in the augmented corpus, weave the
    two with `_weave_docs` and hand the result to `save_document`.

    If the augmented corpus has no entry for a document, a message is
    written to stderr and the original `KeyError` is re-raised.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    # iterate on annotated versions
    for key, tgt_doc in sorted(corpus.items()):
        print('<== weaving {} ==>'.format(key), file=sys.stderr)  # DEBUG
        # locate augmented version
        ukey = unannotated_key(key)
        try:
            src_doc = augmented[ukey]
        except KeyError:
            # diagnostics go to stderr, like the progress messages above,
            # so that stdout is not polluted by error reports
            print('Cannot find augmented version of {}'.format(str(ukey)),
                  file=sys.stderr)
            raise
        # weave
        new_tgt_doc = _weave_docs(renames, src_doc, tgt_doc)
        save_document(output_dir, key, new_tgt_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG
    announce_output_dir(output_dir)
Example #2
0
def main(args):
    """
    Entry point of the subcommand.

    It is normally invoked for you when the parser has been set up
    with `config_argparser`.

    Moves the span `args.span` of each source document into the
    subdocument named by `args.target`; only spans that touch the
    start or the end of the source document are supported.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    sources = read_source_corpus(args)
    targets = read_target_corpus(args)

    renames = compute_renames(targets, sources)

    for src_key in sources:
        old_src = sources[src_key]
        # the target key is the source key pointed at another subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in targets:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        old_tgt = targets[tgt_key]

        # reject unsupported spans before doing any work
        moving_up = span_start == 0
        if not moving_up and span_end != len(old_src.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # span sits at the start of the source document
            moved = move_portion(renames, old_src, old_tgt,
                                 span_end,  # src_split
                                 tgt_split=-1)
            new_src, new_tgt = moved
        else:
            # span sits at the end of the source document:
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the tail
            # src_doc[start:] is first detached into a temporary doc
            # which is then given to move_portion
            new_src, detached = split_doc(old_src, span_start)
            moved = move_portion(renames, detached, old_tgt,
                                 -1,  # src_split
                                 tgt_split=0)
            new_tgt = moved[1]
            # the separating whitespace left with the detached part,
            # so put one back at the end of what remains
            evil_set_text(new_src, new_src.text() + ' ')

        # show the diff, handy as a suggested commit message
        report = []
        report.append("======= TO %s   ========" % tgt_key)
        report.append(show_diff(old_tgt, new_tgt))
        report.append("^------ FROM %s" % src_key)
        report.append(show_diff(old_src, new_src))
        report.append("")
        print("\n".join(report), file=sys.stderr)

        # write out both modified documents
        save_document(output_dir, src_key, new_src)
        save_document(output_dir, tgt_key, new_tgt)

    announce_output_dir(output_dir)
Example #3
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document of the annotated corpus (visited in sorted key
    order), look up its counterpart in the augmented corpus, weave the
    two with `_weave_docs` (forwarding `args.gen`) and hand the result
    to `save_document`.

    If the augmented corpus has no entry for a document, a message is
    written to stderr and the original `KeyError` is re-raised.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    # iterate on annotated versions
    for key, tgt_doc in sorted(corpus.items()):
        print('<== weaving {} ==>'.format(key), file=sys.stderr)  # DEBUG
        # locate augmented version
        ukey = unannotated_key(key)
        try:
            src_doc = augmented[ukey]
        except KeyError:
            # diagnostics go to stderr, like the progress messages above,
            # so that stdout is not polluted by error reports
            print('Cannot find augmented version of {}'.format(str(ukey)),
                  file=sys.stderr)
            raise
        # weave
        new_tgt_doc = _weave_docs(renames, src_doc, tgt_doc, args.gen)
        save_document(output_dir, key, new_tgt_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG
    announce_output_dir(output_dir)
Example #4
0
def main(args):
    """
    Entry point of the subcommand.

    It is normally invoked for you when the parser has been set up
    with `config_argparser`.

    Moves the span `args.span` of each source document into the
    subdocument named by `args.target`; exits with a message when the
    target is missing or the span touches neither end of the source.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    sources = read_source_corpus(args)
    targets = read_target_corpus(args)

    renames = compute_renames(targets, sources)
    for src_key in sources:
        # the target key is the source key pointed at another subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in targets:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_key)

        old_src = sources[src_key]
        old_tgt = targets[tgt_key]

        moving_up = span_start == 0
        if not moving_up and span_end != old_src.text_span().char_end:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        if moving_up:
            # span sits at the start of the source document
            new_src, new_tgt = move_portion(renames, old_src, old_tgt,
                                            src_split=span_end,
                                            tgt_split=-1)
        else:
            # span sits at the end: detach it, then move the detached doc
            new_src, detached = split_doc(old_src, span_start)
            moved = move_portion(renames, detached, old_tgt,
                                 src_split=-1,
                                 tgt_split=0)
            new_tgt = moved[1]

        report = []
        report.append("======= TO %s   ========" % tgt_key)
        report.append(show_diff(old_tgt, new_tgt))
        report.append("^------ FROM %s" % src_key)
        report.append(show_diff(old_src, new_src))
        report.append("")
        print("\n".join(report), file=sys.stderr)

        save_document(output_dir, src_key, new_src)
        save_document(output_dir, tgt_key, new_tgt)

    announce_output_dir(output_dir)
Example #5
0
File: weave.py Project: kowey/educe
def main(args):
    """
    Entry point of the subcommand.

    It is normally invoked for you when the parser has been set up
    with `config_argparser`.

    Weaves each document of the annotated corpus with its augmented
    counterpart and saves the result under the output directory.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    for key in corpus:
        # the augmented corpus is indexed by the unannotated key
        src_doc = augmented[unannotated_key(key)]
        tgt_doc = corpus[key]
        woven = _weave_docs(renames, src_doc, tgt_doc)
        save_document(output_dir, key, woven)
    announce_output_dir(output_dir)
Example #6
0
File: weave.py Project: tjane/educe
def main(args):
    """
    Run this subcommand.

    Callers that configure their parser through `config_argparser`
    get this invoked automatically.

    Each annotated document is woven with the augmented document found
    under its unannotated key, and the woven document is saved.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    for key in corpus:
        augmented_key = unannotated_key(key)
        save_document(
            output_dir,
            key,
            _weave_docs(renames, augmented[augmented_key], corpus[key]))
    announce_output_dir(output_dir)
Example #7
0
def _get_target(args, source, corpus):
    """
    Return either the explicitly specified target from the command line
    or a new one that we computed by looking at the current corpus.
    Check for collisions while we're at it

    :param args: parsed command line arguments; only `args.target` is read
    :param source: pair `(author, date)` identifying the annotation to
        rename; only used when no explicit target was given
    :param corpus: mapping whose values are the documents to inspect

    :return: `args.target` if it is set and unused, otherwise the pair
        `(author, new_date)` taken from `compute_renames`

    Exits the program (`sys.exit`) if the explicit target is already
    used by an annotation in one of the documents.
    """
    if not args.target:
        # generate a new name
        renames = compute_renames(corpus, corpus)
        source_author, source_date = source
        return (source_author, renames[source_author][source_date])

    has_collision = any(_has_named_annotation(args.target, doc)
                        for doc in corpus.values())
    if has_collision:
        # wrap the target in a 1-tuple: a bare tuple on the right of `%`
        # would be unpacked as several arguments and raise TypeError
        # NOTE(review): the generated target above is a pair, so an
        # explicit target is presumably a pair as well -- confirm
        sys.exit("Can't rename! "
                 "We already have annotation(s) with ID %s" % (args.target,))
    return args.target
Example #8
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Read the single document found in the `args.insert` directory and
    run `move_portion` against every requested document of
    `args.corpus`, with `args.start` as the target split point.  A diff
    of each target document is printed on stderr and the new target
    document is saved.

    Exits the program (`sys.exit`) unless the insert directory yields
    exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # exactly one document is left at this point; `values()[0]` would
    # fail on Python 3, where dict views cannot be indexed
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = [
            "======= INSERT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        report = "\n".join(diffs)
        if sys.version_info[0] < 3:
            # Python 2 only: encode explicitly before printing; on
            # Python 3 this would print the repr of a bytes object
            report = report.encode('utf-8')
        print(report, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Example #9
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Load the one document contained in the `args.insert` directory,
    then call `move_portion` on it and on each requested document of
    `args.corpus`, passing `args.start` as the target split point.  The
    diff of every target document goes to stderr and the new target
    document is saved.

    Exits the program (`sys.exit`) when the insert directory does not
    contain exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # the corpus holds a single document here; take it through an
    # iterator because dict views are not subscriptable on Python 3
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        report = "\n".join(diffs)
        if sys.version_info[0] < 3:
            # only Python 2 needs the explicit encoding; printing bytes
            # on Python 3 would show their repr instead of the text
            report = report.encode('utf-8')
        print(report, file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Example #10
0
def main(args):
    """
    Run this subcommand.

    Callers that configure their parser through `config_argparser`
    get this invoked automatically.

    The span `args.span` is taken out of every source document and put
    into the subdocument `args.target`.  The span has to start at the
    beginning of the source document or finish at its end.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    first_char = args.span.char_start
    last_char = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)

    renames = compute_renames(tgt_corpus, src_corpus)

    for src_key, before_src in src_corpus.items():
        # same key as the source, but for the requested subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        before_tgt = tgt_corpus[tgt_key]

        if first_char == 0:
            # move up: the head of the source goes to the target
            after_src, after_tgt = move_portion(
                renames, before_src, before_tgt, last_char, tgt_split=-1)
        elif last_char != len(before_src.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")
        else:
            # move down: move_portion inserts src_doc[0:src_split]
            # between tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so
            # src_doc[start:] is detached into a temporary document
            # and move_portion is applied to that temporary document
            after_src, tail_doc = split_doc(before_src, first_char)
            after_tgt = move_portion(
                renames, tail_doc, before_tgt, -1, tgt_split=0)[1]
            # the whitespace separating both parts went with the tail,
            # so a new one is appended to the remaining source text
            padded_text = after_src.text() + ' '
            evil_set_text(after_src, padded_text)

        # diff printed as a suggested commit message
        tgt_header = "======= TO %s   ========" % tgt_key
        tgt_diff = show_diff(before_tgt, after_tgt)
        src_header = "^------ FROM %s" % src_key
        src_diff = show_diff(before_src, after_src)
        print("\n".join([tgt_header, tgt_diff, src_header, src_diff, ""]),
              file=sys.stderr)

        # save both modified documents
        save_document(output_dir, src_key, after_src)
        save_document(output_dir, tgt_key, after_tgt)

    announce_output_dir(output_dir)