Ejemplo n.º 1
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Reads the corpus selected by `args` and, for every document, calls
    `_split_dialogue` on it with `args.turn`, prints a mini diff of the
    change to stderr and saves the document to the output directory.
    Unless `args.no_commit_msg` is set, a commit message built from the
    last processed document is printed at the end.
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # Stays None when the corpus is empty, so that the check after the
    # loop does not fail on an unbound local variable.
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # keep an untouched copy: new_doc is handed to _split_dialogue and
        # then saved, old_doc is the "before" side of the diff
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is used)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
Ejemplo n.º 2
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Checks that `--stage` and `--annotator` are used consistently, then
    calls `_rename_in_doc` on every document of the corpus and saves it
    to the output directory.
    """
    if args.stage:
        unannotated = args.stage == 'unannotated'
        if not unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    old_id = args.source
    new_id = _get_target(args, old_id, corpus)

    for doc_key in corpus:
        print(doc_key)
        document = corpus[doc_key]
        _rename_in_doc(old_id, new_id, document)
        save_document(output_dir, doc_key, document)

    # human-readable summary of what was renamed
    summary = "Renamed from %s to %s" % (anno_id_from_tuple(old_id),
                                         anno_id_from_tuple(new_id))
    print(summary, file=sys.stderr)
    announce_output_dir(output_dir)
Ejemplo n.º 3
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Goes through the annotated documents in sorted key order, looks up the
    augmented document stored under the corresponding unannotated key,
    passes both to `_weave_docs` (with `args.gen`) and saves the result.
    A missing augmented document is reported and the KeyError re-raised.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for doc_key, annotated_doc in sorted(corpus.items()):
        banner = '<== weaving {} ==>'.format(doc_key)
        print(banner, file=sys.stderr)  # DEBUG

        # the augmented corpus is indexed by unannotated keys
        bare_key = unannotated_key(doc_key)
        try:
            augmented_doc = augmented[bare_key]
        except KeyError:
            print('Cannot find augmented version of {}'.format(str(bare_key)))
            raise

        save_document(
            output_dir, doc_key,
            _weave_docs(renames, augmented_doc, annotated_doc, args.gen))
        print('<== done ==>', file=sys.stderr)  # DEBUG

    announce_output_dir(output_dir)
Ejemplo n.º 4
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document of the source corpus, looks up the document that
    has the same key except for `subdoc`, which is set to `args.target`,
    in the target corpus, and moves text between the two with
    `move_portion`. Diffs of both documents are printed to stderr and both
    modified documents are saved to the output directory.

    Only two cases are handled: `args.span` starts at offset 0, or it ends
    at the end of the source document text.

    Raises
    ------
    ValueError
        If the target key is not in the target corpus, or if `args.span`
        neither starts at 0 nor ends at the end of the source text.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    start = args.span.char_start
    end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)

    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # retrieve target subdoc
        # (work on a copy of the key so the source key keeps its subdoc)
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]
        # move portion from source to target subdoc
        if start == 0:
            # move up
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                end,  # src_split
                tgt_split=-1)
        elif end == len(src_doc.text()):  # src_doc.text_span().char_end:
            # move down
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:],
            # so we detach src_doc[start:] into a temporary doc,
            # then call move_portion on this temporary doc
            new_src_doc, src_doc2 = split_doc(src_doc, start)
            _, new_tgt_doc = move_portion(
                renames, src_doc2, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between new_src_doc and src_doc2 went to
            # src_doc2, so we need to append a new whitespace to new_src_doc
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')
        else:
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")
        # print diff for suggested commit message
        diffs = ["======= TO %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc),
                 "^------ FROM %s" % src_k,
                 show_diff(src_doc, new_src_doc),
                 ""]
        print("\n".join(diffs), file=sys.stderr)
        # dump the modified documents
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Ejemplo n.º 5
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    Each annotated document (visited in sorted key order) is combined with
    the augmented document found under its unannotated key by
    `_weave_docs`, and the outcome is saved to the output directory. If no
    augmented document exists, a message is printed and the KeyError
    propagates.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    annotated_items = sorted(corpus.items())
    for anno_key, anno_doc in annotated_items:
        print('<== weaving {} ==>'.format(anno_key), file=sys.stderr)  # DEBUG

        # find the augmented counterpart of this annotated document
        aug_key = unannotated_key(anno_key)
        try:
            aug_doc = augmented[aug_key]
        except KeyError:
            not_found = 'Cannot find augmented version of {}'.format(
                str(aug_key))
            print(not_found)
            raise

        woven_doc = _weave_docs(renames, aug_doc, anno_doc)
        save_document(output_dir, anno_key, woven_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG

    announce_output_dir(output_dir)
Ejemplo n.º 6
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    For every document, a deep copy is passed to `_split_edu` together
    with `args.spans`; a mini diff against the original is printed to
    stderr and the copy is saved. A commit message for the last document
    is printed unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    commit_info = None
    for doc_key in corpus:
        before = corpus[doc_key]
        after = copy.deepcopy(before)

        # single span covering all requested spans, used for reporting
        merged_span = Span.merge_all(args.spans)
        _split_edu(tcache, doc_key, after, args.spans)

        report = "\n".join(_mini_diff(doc_key, before, after, merged_span))
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, doc_key, after)

        # remembered for the commit message; the last document wins
        commit_info = CommitInfo(
            key=doc_key,
            annotator=args.annotator,
            before=before,
            after=after,
            span=merged_span)

    if commit_info:
        if not args.no_commit_msg:
            print("-----8<------")
            print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Ejemplo n.º 7
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Every document of the corpus selected by `args` is passed to
    `_split_dialogue` along with `args.turn`; a mini diff against a copy
    taken beforehand is printed to stderr and the document is saved to
    the output directory. A commit message for the last processed
    document is printed unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # Must be bound before the loop: with an empty corpus the loop body
    # never runs and the final check would otherwise raise
    # UnboundLocalError.
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # snapshot taken before _split_dialogue gets the document
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (the last document wins)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
Ejemplo n.º 8
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    For every document, a deep copy is passed to `_merge_edus` together
    with `args.span`; a mini diff against the original is printed to
    stderr and the copy is saved. A commit message for the last document
    is printed unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    commit_info = None
    for doc_key in corpus:
        before = corpus[doc_key]
        after = copy.deepcopy(before)
        _merge_edus(tcache, args.span, after)

        diff_lines = _mini_diff(doc_key, before, after, args.span)
        report = "\n".join(diff_lines)
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, doc_key, after)

        # remembered for the commit message; the last document wins
        commit_info = CommitInfo(
            key=doc_key,
            annotator=args.annotator,
            before=before,
            after=after,
            span=args.span)

    announce_output_dir(output_dir)
    if not commit_info or args.no_commit_msg:
        return
    print("-----8<------")
    print(commit_msg(commit_info))
Ejemplo n.º 9
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    Corpus keys are grouped into families sharing the same (doc, subdoc).
    For each family, the 'discourse' stage document is handed to
    `turns_with_final_emoticons`; if it returns turns, every member of
    the family is deep-copied, passed to `merge_final_emoticons` and
    saved. Warnings and diffs are printed to stdout.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # group the keys by (doc, subdoc) and remember each discourse key
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        families[family].append(key)
        if key.stage == 'discourse':
            discourse_subcorpus[family] = key

    for family in sorted(families):
        doc_name, subdoc_name = family
        members = families[family]
        print(family_banner(doc_name, subdoc_name, members))

        disc_key = discourse_subcorpus[family]
        disc_doc = corpus[disc_key]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_key])

        messages = []
        if warn_turns:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            messages.extend(" " + disc_doc.text(turn.text_span())
                            for turn in warn_turns)
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")
        if not turns:
            messages.append(
                "Skipping %s (and related); no offending emoticons" % disc_key)
        print("\n".join(messages))

        # nothing to merge for this family
        if not turns:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for member_key in members:
            merged_doc = copy.deepcopy(corpus[member_key])
            merge_final_emoticons(tcache, turn_spans, merged_doc,
                                  postags[member_key])
            # only the discourse stage document gets its diff displayed
            if member_key == disc_key:
                for turn_span in turn_spans:
                    print(show_diff(corpus[member_key], merged_doc,
                                    span=turn_span))
                    print()
            save_document(output_dir, member_key, merged_doc)
        tcache.reset()

    announce_output_dir(output_dir)
Ejemplo n.º 10
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Keys of the corpus are bucketed by (doc, subdoc). Per bucket, the
    document at stage 'discourse' is given to
    `turns_with_final_emoticons`. When that yields turns, a deep copy of
    every document in the bucket goes through `merge_final_emoticons` and
    is saved; otherwise the bucket is skipped after printing a notice.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    families = {}
    discourse_subcorpus = {}
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families.setdefault(fam, []).append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam] = k

    for fam in sorted(families):
        fam_keys = families[fam]
        print(family_banner(fam[0], fam[1], fam_keys))

        disc_k = discourse_subcorpus[fam]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])
        nothing_to_do = not turns

        notes = []
        if warn_turns:
            notes.append("Note: These turns have emoticon-only EDUs that "
                         "I dare not touch because they either "
                         "participate in relations or CDUs: ")
            for warn_turn in warn_turns:
                notes.append(" " + disc_doc.text(warn_turn.text_span()))
            notes.append("If the "
                         "relations can be removed, or the CDUs reduced, "
                         "please do this by hand and re-run the script:")
        if nothing_to_do:
            notes.append("Skipping %s (and related); no offending emoticons"
                         % disc_k)
        print("\n".join(notes))

        if nothing_to_do:
            continue

        turn_spans = []
        for turn in turns:
            turn_spans.append(turn.text_span())

        for fam_k in fam_keys:
            original = corpus[fam_k]
            updated = copy.deepcopy(original)
            merge_final_emoticons(tcache, turn_spans, updated, postags[fam_k])
            # show what changed, but only once per family
            if fam_k == disc_k:
                for turn_span in turn_spans:
                    print(show_diff(original, updated, span=turn_span))
                    print()
            save_document(output_dir, fam_k, updated)
        tcache.reset()
    announce_output_dir(output_dir)
Ejemplo n.º 11
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Runs `stac-util count` on each corpus in the list (currently only
    `TRAINING_CORPUS`) and writes its standard output to
    `<basename>.txt` in the output directory.
    """
    corpora = [TRAINING_CORPUS]
    odir = get_output_dir(args)
    for corpus_path in corpora:
        report_name = fp.basename(corpus_path) + ".txt"
        report_path = fp.join(odir, report_name)
        with open(report_path, 'w') as report:
            command = ["stac-util", "count", corpus_path,
                       "--annotator", ANNOTATORS]
            call(command, stdout=report)
    announce_output_dir(odir)
Ejemplo n.º 12
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    For each source document, the counterpart whose key has `subdoc` set
    to `args.target` is fetched from the target corpus and text is moved
    between the two with `move_portion`. Exits with an error message if
    the counterpart is missing, or if `args.span` neither starts at 0 nor
    ends at the end of the source document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    start = args.span.char_start
    end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_key in src_corpus:
        # same key as the source, but pointing at the target subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in tgt_corpus:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_key)

        src_doc = src_corpus[src_key]
        tgt_doc = tgt_corpus[tgt_key]
        if start == 0:
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                src_split=end, tgt_split=-1)
        elif end == src_doc.text_span().char_end:
            # detach the tail of the source first, then move all of it
            new_src_doc, detached = split_doc(src_doc, start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                src_split=-1, tgt_split=0)
        else:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        report = "\n".join([
            "======= TO %s   ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)
        save_document(output_dir, src_key, new_src_doc)
        save_document(output_dir, tgt_key, new_tgt_doc)

    announce_output_dir(output_dir)
Ejemplo n.º 13
0
Archivo: weave.py Proyecto: kowey/educe
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Passes each annotated document, together with the augmented document
    stored under its unannotated key, to `_weave_docs` and saves the
    result to the output directory.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for doc_key in corpus:
        src_doc = augmented[unannotated_key(doc_key)]
        tgt_doc = corpus[doc_key]
        woven = _weave_docs(renames, src_doc, tgt_doc)
        save_document(output_dir, doc_key, woven)

    announce_output_dir(output_dir)
Ejemplo n.º 14
0
Archivo: weave.py Proyecto: tjane/educe
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    For every key of the annotated corpus, the augmented document found
    under the matching unannotated key and the annotated document are
    given to `_weave_docs`; what it returns is saved under the original
    key.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    for anno_key in corpus:
        aug_key = unannotated_key(anno_key)
        save_document(
            output_dir,
            anno_key,
            _weave_docs(renames, augmented[aug_key], corpus[anno_key]))
    announce_output_dir(output_dir)
Ejemplo n.º 15
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    For each listed corpus (only `TRAINING_CORPUS` at present), invokes
    `stac-util count` with its standard output redirected to a `.txt`
    file, named after the corpus basename, inside the output directory.
    """
    corpora = [TRAINING_CORPUS]
    odir = get_output_dir(args)
    for corpus_dir in corpora:
        out_path = fp.join(odir, fp.basename(corpus_dir) + ".txt")
        with open(out_path, 'w') as out_stream:
            call(
                ["stac-util", "count", corpus_dir, "--annotator", ANNOTATORS],
                stdout=out_stream,
            )
    announce_output_dir(odir)
Ejemplo n.º 16
0
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    For every requested document of the corpus at `args.corpus`, calls
    `replace_text_at_span` with `args.span`, `args.sub_text` and
    `args.minor`, prints a diff to stderr and saves the new document to
    the output directory. Unless `args.no_commit_msg` is set, a commit
    message built from the first document is printed at the end; nothing
    is printed if no document was processed.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc,
                                           span,
                                           sub_text,
                                           minor=minor)
        # WIP new_span, depends on the offset
        # (offset is the difference in length between the substitute text
        # and the span it stands in for)
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = [
            "======= REPLACE TEXT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
    # commit message
    if not annos_before:
        # No document matched the request: indexing the first element
        # below would raise IndexError, and there is nothing to report.
        return
    first_k = next(iter(tgt_corpus))
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, annos_before[0], annos_after[0]))
Ejemplo n.º 17
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Saves every document of the corpus to the output directory. When
    `args.diff_friendly` is set, the `units`, `relations` and `schemas`
    of each document are first replaced by the result of `_diff_friendly`.
    """
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)
    for doc_key in corpus:
        document = corpus[doc_key]
        if args.diff_friendly:
            for field in ('units', 'relations', 'schemas'):
                current = getattr(document, field)
                setattr(document, field, _diff_friendly(current))
        save_document(output_dir, doc_key, document)
    announce_output_dir(output_dir)
Ejemplo n.º 18
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    Re-saves each document of the corpus into the output directory. If
    `args.diff_friendly` is true, `_diff_friendly` is applied to the
    document's `units`, `relations` and `schemas` beforehand.
    """
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    for file_id in corpus:
        this_doc = corpus[file_id]
        if args.diff_friendly:
            friendly_units = _diff_friendly(this_doc.units)
            this_doc.units = friendly_units
            friendly_relations = _diff_friendly(this_doc.relations)
            this_doc.relations = friendly_relations
            friendly_schemas = _diff_friendly(this_doc.schemas)
            this_doc.schemas = friendly_schemas
        save_document(output_dir, file_id, this_doc)

    announce_output_dir(output_dir)
Ejemplo n.º 19
0
def main():
    """Run `process_document` on every document of the corpus.

    Arguments are parsed from the command line. With `--pipeline`, the
    individual emit options are overridden with a fixed combination
    before the `Config` is built.
    """
    args = mk_argparser().parse_args()
    corpus = read_corpus(args)
    output_dir = get_output_dir(args)

    if args.pipeline:
        # fixed settings for pipeline mode
        pipeline_overrides = (
            ('resources', True),
            ('resource_status', False),
            ('dialogue_acts', False),
            ('dialogue_boundaries', False),
            ('fake_turn_ids', True),
        )
        for option, value in pipeline_overrides:
            setattr(args, option, value)

    config = Config(
        emit_resources=args.resources,
        emit_resource_status=args.resource_status,
        emit_dialogue_acts=args.dialogue_acts,
        emit_dialogue_boundaries=args.dialogue_boundaries,
        fake_turn_ids=args.fake_turn_ids,
    )
    for doc_key in corpus:
        process_document(config, corpus, doc_key, output_dir)
    announce_output_dir(output_dir)
Ejemplo n.º 20
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Reads the 'units' stage of the corpus and, for every EDU, maps each
    component of its type through `RENAMES`. When the resulting set of
    types differs from the original, the EDU type is rewritten as the
    sorted components joined by "/". Every document is then saved.
    """
    corpus = read_corpus(args, preselected={"stage": ["units"]})
    output_dir = get_output_dir(args, default_overwrite=True)

    for doc_key in corpus:
        document = corpus[doc_key]
        edus = [unit for unit in document.units if educe.stac.is_edu(unit)]
        for edu in edus:
            old_types = frozenset(educe.stac.split_type(edu))
            new_types = frozenset(RENAMES.get(name, name)
                                  for name in old_types)
            if old_types == new_types:
                # none of the components is affected by RENAMES
                continue
            edu.type = "/".join(sorted(new_types))
        save_document(output_dir, doc_key, document)

    announce_output_dir(output_dir)
Ejemplo n.º 21
0
def main():
    """Parse the command line and call `process_document` per document.

    When `--pipeline` is given, the resource/dialogue options on `args`
    are forced to preset values before they are copied into the `Config`.
    """
    args = mk_argparser().parse_args()
    corpus = read_corpus(args)
    output_dir = get_output_dir(args)

    if args.pipeline:
        # pipeline mode: switch on what is needed, switch off the rest
        for enabled in ('resources',):
            setattr(args, enabled, True)
        for disabled in ('resource_status',
                         'dialogue_acts',
                         'dialogue_boundaries'):
            setattr(args, disabled, False)
        args.fake_turn_ids = True

    settings = dict(emit_resources=args.resources,
                    emit_resource_status=args.resource_status,
                    emit_dialogue_acts=args.dialogue_acts,
                    emit_dialogue_boundaries=args.dialogue_boundaries,
                    fake_turn_ids=args.fake_turn_ids)
    config = Config(**settings)

    for file_id in corpus:
        process_document(config, corpus, file_id, output_dir)
    announce_output_dir(output_dir)
Ejemplo n.º 22
0
def main(args):
    """
    Subcommand main.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`.

    Reads the 'discourse' and 'units' stages of the corpus, removes from
    each document the schemas whose `members` is empty, and saves the
    document to the output directory.
    """
    corpus = read_corpus(args, preselected={'stage': ['discourse', 'units']})
    output_dir = get_output_dir(args, default_overwrite=True)

    for doc_key in corpus:
        document = corpus[doc_key]
        # collect first, remove afterwards: the collection must not be
        # modified while it is being iterated
        memberless = [schema for schema in document.schemas
                      if not schema.members]
        for schema in memberless:
            document.schemas.remove(schema)
        save_document(output_dir, doc_key, document)

    announce_output_dir(output_dir)
Ejemplo n.º 23
0
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    Each requested document of the corpus at `args.corpus` is passed to
    `replace_text_at_span` with `args.span`, `args.sub_text` and
    `args.minor`. A diff is printed to stderr and the new document is
    saved to the output directory. A commit message based on the first
    document follows unless `args.no_commit_msg` is set or no document
    was processed.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        # WIP new_span, depends on the offset
        # (the end of the span moves by the change in text length)
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
    # commit message
    if not annos_before:
        # Empty corpus: taking element [0] below would raise IndexError.
        return
    first_k = next(iter(tgt_corpus))
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, annos_before[0], annos_after[0]))
Ejemplo n.º 24
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    Works on the 'units' stage of the corpus. The type of each EDU is
    split into components, every component is looked up in `RENAMES`
    (unknown ones are kept), and if the set of components changed, the
    type is replaced by the sorted new components joined with "/".
    Documents are saved whether or not anything changed.
    """
    stage_filter = {"stage": ["units"]}
    corpus = read_corpus(args,
                         preselected=stage_filter)
    output_dir = get_output_dir(args, default_overwrite=True)
    for file_id in corpus:
        this_doc = corpus[file_id]
        # materialise the EDUs before touching any of them
        edu_list = []
        for unit in this_doc.units:
            if educe.stac.is_edu(unit):
                edu_list.append(unit)
        for edu in edu_list:
            before = frozenset(educe.stac.split_type(edu))
            after = frozenset(RENAMES.get(part, part) for part in before)
            if before != after:
                joined = "/".join(sorted(after))
                edu.type = joined
        save_document(output_dir, file_id, this_doc)
    announce_output_dir(output_dir)
Ejemplo n.º 25
0
def main(args):
    """
    Subcommand main.

    Called on your behalf when the subcommand is registered through
    `config_argparser`.

    For each document of the 'discourse' and 'units' stages, every schema
    with no members is removed from `doc.schemas`; the document is then
    saved to the output directory.
    """
    wanted_stages = {'stage': ['discourse', 'units']}
    corpus = read_corpus(args,
                         preselected=wanted_stages)
    output_dir = get_output_dir(args, default_overwrite=True)
    for file_id in corpus:
        this_doc = corpus[file_id]
        schemas = this_doc.schemas
        # snapshot of the empty schemas, taken before any removal
        for empty in [sch for sch in schemas if not sch.members]:
            this_doc.schemas.remove(empty)
        save_document(output_dir, file_id, this_doc)
    announce_output_dir(output_dir)
Ejemplo n.º 26
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Reads the single document found in `args.insert` and, for every
    requested document of the corpus at `args.corpus`, calls
    `move_portion` with that document as source and
    `tgt_split=args.start`. A diff is printed to stderr and the new
    target document is saved to the output directory.

    Exits with an error message unless the insert directory yields
    exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # Exactly one document at this point. On Python 3, `values()` returns
    # a view that cannot be indexed, so take the element via an iterator.
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # the source side of the result is not needed
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = [
            "======= INSERT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Ejemplo n.º 27
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    In a deep copy of every document, each unit whose span equals
    `args.span` gets a new span shifted by `args.nudge_start` and
    `args.nudge_end`. A mini diff (or a warning when no unit matched) is
    printed to stderr, and the copy is saved to the output directory.
    Unless `args.no_commit_msg` is set, a commit message for the last
    processed document is printed.
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    # Stays None when the corpus is empty, so that the check after the
    # loop does not fail on an unbound local variable.
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                # each annotation gets its own Span object
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Ejemplo n.º 28
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Every document is deep-copied; in the copy, units whose span equals
    `args.span` are given a span offset by `args.nudge_start` at the
    start and `args.nudge_end` at the end. Either a mini diff or a
    "no annotations found" warning goes to stderr, and the copy is saved.
    A commit message for the last processed document is printed unless
    `args.no_commit_msg` is set.
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    # Must be bound before the loop: with an empty corpus the loop body
    # never runs and the final check would otherwise raise
    # UnboundLocalError.
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                # copy so that annotations do not share one Span object
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
Ejemplo n.º 29
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    The single document read from `args.insert` is used as the source of
    a `move_portion` call (with `tgt_split=args.start`) against every
    requested document of the corpus at `args.corpus`. Each resulting
    target document is diffed on stderr and saved to the output
    directory.

    Exits with an error message if the insert directory does not yield
    exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # The corpus holds exactly one document here. `values()[0]` only works
    # on Python 2 lists; going through an iterator works with dict views
    # as well.
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # only the target side of the result is kept
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
Ejemplo n.º 30
0
def main(args):
    """
    Subcommand main: run `_delete_in_doc` for `args.anno_id` on every
    document of the selected corpus and save the results.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Exits early with an error message when `--stage` and `--annotator`
    are combined inconsistently.
    """
    stage = args.stage
    if stage:
        # the unannotated stage has no annotator; every other stage needs one
        if stage == 'unannotated':
            if args.annotator:
                sys.exit("--annotator is forbidden if --stage is unannotated")
        elif not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for doc_key in corpus:
        print(doc_key)
        document = corpus[doc_key]
        _delete_in_doc(args.anno_id, document)
        save_document(output_dir, doc_key, document)

    print("Deleted %s" % anno_id_from_tuple(args.anno_id), file=sys.stderr)
    announce_output_dir(output_dir)
Ejemplo n.º 31
0
def main(args):
    """
    Subcommand main: apply `_delete_in_doc` with `args.anno_id` to each
    document read from the corpus, saving every document afterwards.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    When `--stage` is given, `--annotator` must be present for every
    stage except 'unannotated', where it must be absent; otherwise the
    process exits with an error message.
    """
    requested_stage = args.stage
    if requested_stage:
        if requested_stage == 'unannotated':
            if args.annotator:
                sys.exit("--annotator is forbidden if --stage is unannotated")
        elif not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for k in corpus:
        print(k)
        current = corpus[k]
        _delete_in_doc(args.anno_id, current)
        save_document(output_dir, k, current)

    deleted_label = anno_id_from_tuple(args.anno_id)
    print("Deleted %s" % deleted_label, file=sys.stderr)
    announce_output_dir(output_dir)
Ejemplo n.º 32
0
def main(args):
    """
    Subcommand main: merge the sought dialogues in every document of the
    corpus and save the results.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    The dialogues come either from `args.dialogues` or, when `args.turns`
    is given, from `_dialogues_in_turns`. Fewer than two dialogues is an
    error. Unless `args.no_commit_msg` is set, a commit message computed
    before any document is modified is printed at the end.
    """
    too_few = "Must specify at least two dialogues"

    if not (args.turns or len(args.dialogues) >= 2):
        sys.exit(too_few)
    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus, args.turns[0], args.turns[1])
            if len(sought) < 2:
                sys.exit(too_few)
            pretty_ids = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + pretty_ids, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    want_commit_msg = bool(corpus) and not args.no_commit_msg
    if want_commit_msg:
        # must be computed while the documents are still untouched
        first_key = next(iter(corpus))
        cmsg = commit_msg(args, corpus, first_key, sought)

    for doc_key in corpus:
        document = corpus[doc_key]
        _merge_dialogues_in_document(sought, document)
        save_document(output_dir, doc_key, document)
    announce_output_dir(output_dir)

    if want_commit_msg:
        print("-----8<------")
        print(cmsg)
Ejemplo n.º 33
0
def main(args):
    """
    Subcommand main: run `_merge_dialogues_in_document` on each document
    of the corpus for the requested dialogues, then save it.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Dialogues are taken from `args.dialogues`, or looked up with
    `_dialogues_in_turns` when `args.turns` is supplied; at least two are
    required. A commit message (built before the documents change) is
    printed last unless `args.no_commit_msg` is set or the corpus is
    empty.
    """
    error_msg = "Must specify at least two dialogues"

    if not (args.turns or len(args.dialogues) >= 2):
        sys.exit(error_msg)
    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus, args.turns[0], args.turns[1])
            if len(sought) < 2:
                sys.exit(error_msg)
            names = [anno_id_from_tuple(x) for x in sought]
            print("Merging dialogues: " + ", ".join(names), file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    emit_commit_msg = bool(corpus) and not args.no_commit_msg
    if emit_commit_msg:
        # snapshot the message before the merge alters the documents
        key0 = next(iter(corpus))
        cmsg = commit_msg(args, corpus, key0, sought)

    for key in corpus:
        merged = corpus[key]
        _merge_dialogues_in_document(sought, merged)
        save_document(output_dir, key, merged)
    announce_output_dir(output_dir)

    if emit_commit_msg:
        print("-----8<------")
        print(cmsg)
Ejemplo n.º 34
0
def main(args):
    """
    Subcommand main: move a portion of each source document to the
    matching target subdocument and save both modified documents.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    The portion is `args.span`; it must either start at offset 0 or end
    at the end of the source text, otherwise a `ValueError` is raised.
    A `ValueError` is also raised when the target key (the source key
    with its `subdoc` set to `args.target`) is not in the target corpus.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_key, src_doc in src_corpus.items():
        # the target key differs from the source key only by its subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        tgt_doc = tgt_corpus[tgt_key]

        from_start = span_start == 0
        if not from_start and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if from_start:
            # move up
            new_src_doc, new_tgt_doc = move_portion(renames, src_doc, tgt_doc,
                                                    span_end,  # src_split
                                                    tgt_split=-1)
        else:
            # move down: move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the tail
            # src_doc[span_start:] is first detached into a temporary doc
            # which is then moved as a whole
            new_src_doc, tail_doc = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(renames, tail_doc, tgt_doc,
                                          -1,  # src_split
                                          tgt_split=0)
            # the whitespace between the two halves went with the tail,
            # so the remaining source text gets a fresh trailing space
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')

        # diff for the suggested commit message
        report = "\n".join([
            "======= TO %s   ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)

        # write out both modified documents
        save_document(output_dir, src_key, new_src_doc)
        save_document(output_dir, tgt_key, new_tgt_doc)

    announce_output_dir(output_dir)