def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Applies `_split_dialogue` at `args.turn` to every document returned
    by `read_corpus`, prints a mini diff to stderr and saves each
    document to the output directory.  Unless `args.no_commit_msg` is
    set, a commit message built from the last processed document is
    printed at the end.
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    # Stays None when the corpus is empty, so the check after the loop
    # no longer fails on an unbound local.
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # snapshot taken before `_split_dialogue` is applied to new_doc
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is used)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Exits early if `--stage` and `--annotator` are combined
    inconsistently; otherwise runs `_rename_in_doc` from `args.source`
    to the computed target on every document, saves it, and reports the
    rename on stderr.
    """
    stage = args.stage
    if stage:
        is_unannotated = stage == 'unannotated'
        if not is_unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if is_unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    out_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)
    old_id = args.source
    new_id = _get_target(args, old_id, corpus)

    for key in corpus:
        print(key)
        document = corpus[key]
        _rename_in_doc(old_id, new_id, document)
        save_document(out_dir, key, document)

    summary = "Renamed from %s to %s" % (anno_id_from_tuple(old_id),
                                         anno_id_from_tuple(new_id))
    print(summary, file=sys.stderr)
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document of the corpus (in sorted key order), looks up
    its counterpart in the augmented corpus via `unannotated_key`,
    combines the two with `_weave_docs` (passing `args.gen`) and saves
    the result.
    """
    out_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for key, annotated_doc in sorted(corpus.items()):
        # progress trace (debug)
        print('<== weaving {} ==>'.format(key), file=sys.stderr)
        aug_key = unannotated_key(key)
        try:
            augmented_doc = augmented[aug_key]
        except KeyError:
            # say which document is missing, then let the error propagate
            print('Cannot find augmented version of {}'.format(str(aug_key)))
            raise
        woven = _weave_docs(renames, augmented_doc, annotated_doc, args.gen)
        save_document(out_dir, key, woven)
        # progress trace (debug)
        print('<== done ==>', file=sys.stderr)

    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Moves the text span `args.span` of every source document into the
    subdocument named by `args.target`.  Only spans touching the start
    or the end of the source document are supported; anything else
    raises `ValueError`, as does a missing target document.
    """
    out_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end
    sources = read_source_corpus(args)
    targets = read_target_corpus(args)
    renames = compute_renames(targets, sources)

    for src_key, src_doc in sources.items():
        # the target key is the source key with the subdoc swapped
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in targets:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        tgt_doc = targets[tgt_key]

        moving_up = span_start == 0
        if not moving_up and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # head of the source goes to the end of the target
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                span_end,  # src_split
                tgt_split=-1)
        else:
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the tail
            # src_doc[start:] is first detached into a temporary doc
            # which is then moved as a whole
            new_src_doc, detached = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the separating whitespace went with the detached part, so
            # a new one is appended to what remains of the source
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')

        # diff for the suggested commit message
        report = "\n".join([
            "======= TO %s ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)

        # write out both modified documents
        save_document(out_dir, src_key, new_src_doc)
        save_document(out_dir, tgt_key, new_tgt_doc)

    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document of the corpus (in sorted key order), looks up
    its counterpart in the augmented corpus via `unannotated_key`,
    combines the two with `_weave_docs` and saves the result.
    """
    out_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for key, annotated_doc in sorted(corpus.items()):
        # progress trace (debug)
        print('<== weaving {} ==>'.format(key), file=sys.stderr)
        aug_key = unannotated_key(key)
        try:
            augmented_doc = augmented[aug_key]
        except KeyError:
            # say which document is missing, then let the error propagate
            print('Cannot find augmented version of {}'.format(str(aug_key)))
            raise
        woven = _weave_docs(renames, augmented_doc, annotated_doc)
        save_document(out_dir, key, woven)
        # progress trace (debug)
        print('<== done ==>', file=sys.stderr)

    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Runs `_split_edu` with `args.spans` on a copy of every document,
    prints a mini diff to stderr and saves the copy.  A commit message
    for the last processed document is printed unless
    `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    out_dir = get_output_dir(args, default_overwrite=True)

    commit_info = None
    for key in corpus:
        original = corpus[key]
        edited = copy.deepcopy(original)
        covering_span = Span.merge_all(args.spans)
        _split_edu(tcache, key, edited, args.spans)

        report = "\n".join(_mini_diff(key, original, edited, covering_span))
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(out_dir, key, edited)

        # kept for the commit message; only the last one survives
        commit_info = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=original,
                                 after=edited,
                                 span=covering_span)

    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Runs `_merge_edus` over `args.span` on a copy of every document,
    prints a mini diff to stderr and saves the copy.  A commit message
    for the last processed document is printed unless
    `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    out_dir = get_output_dir(args, default_overwrite=True)

    commit_info = None
    for key in corpus:
        original = corpus[key]
        edited = copy.deepcopy(original)
        _merge_edus(tcache, args.span, edited)

        report = "\n".join(_mini_diff(key, original, edited, args.span))
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(out_dir, key, edited)

        # kept for the commit message; only the last one survives
        commit_info = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=original,
                                 after=edited,
                                 span=args.span)

    announce_output_dir(out_dir)
    if not commit_info or args.no_commit_msg:
        return
    print("-----8<------")
    print(commit_msg(commit_info))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Groups the corpus keys into families sharing the same
    `(doc, subdoc)`, uses each family's discourse-stage document to
    find the turns returned by `turns_with_final_emoticons`, then runs
    `merge_final_emoticons` on a copy of every document of the family
    and saves it.  Families without such turns are skipped.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # family = all keys with the same (doc, subdoc); also remember which
    # key of each family is the discourse-stage one
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families[fam].append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam] = k

    for fam in sorted(families):
        print(family_banner(fam[0], fam[1], families[fam]))
        # NOTE(review): raises KeyError if a family has no
        # discourse-stage key -- confirm this cannot happen
        disc_k = discourse_subcorpus[fam]
        doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(doc, postags[disc_k])

        # messages shown to the user before any modification
        warnings = []
        if warn_turns:
            warnings.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            warnings.extend(" " + doc.text(x.text_span()) for x in warn_turns)
            warnings.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        if not turns:
            warnings.append(
                "Skipping %s (and related); no offending emoticons" % disc_k)

        print("\n".join(warnings))

        # nothing to merge in this family
        if not turns:
            continue

        turn_spans = [x.text_span() for x in turns]
        for k in families[fam]:
            # work on a copy so corpus[k] stays available for the diff
            doc = copy.deepcopy(corpus[k])
            tags = postags[k]
            merge_final_emoticons(tcache, turn_spans, doc, tags)
            # diffs are only displayed for the discourse-stage document
            if k == discourse_subcorpus[fam]:
                for turn_span in turn_spans:
                    print(show_diff(corpus[k], doc, span=turn_span))
                    print()
            save_document(output_dir, k, doc)
        # NOTE(review): nesting reconstructed from a collapsed source;
        # the reset is assumed to happen once per family -- confirm
        tcache.reset()
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Groups the corpus keys into families sharing the same
    `(doc, subdoc)`, uses each family's discourse-stage document to
    find the turns returned by `turns_with_final_emoticons`, then runs
    `merge_final_emoticons` on a copy of every document of the family
    and saves it.  Families without such turns are skipped.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # family = all keys with the same (doc, subdoc); also remember which
    # key of each family is the discourse-stage one
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families[fam].append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam] = k

    for fam in sorted(families):
        print(family_banner(fam[0], fam[1], families[fam]))
        # NOTE(review): raises KeyError if a family has no
        # discourse-stage key -- confirm this cannot happen
        disc_k = discourse_subcorpus[fam]
        doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(doc, postags[disc_k])

        # messages shown to the user before any modification
        warnings = []
        if warn_turns:
            warnings.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            warnings.extend(" " + doc.text(x.text_span()) for x in warn_turns)
            warnings.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        if not turns:
            warnings.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)

        print("\n".join(warnings))

        # nothing to merge in this family
        if not turns:
            continue

        turn_spans = [x.text_span() for x in turns]
        for k in families[fam]:
            # work on a copy so corpus[k] stays available for the diff
            doc = copy.deepcopy(corpus[k])
            tags = postags[k]
            merge_final_emoticons(tcache, turn_spans, doc, tags)
            # diffs are only displayed for the discourse-stage document
            if k == discourse_subcorpus[fam]:
                for turn_span in turn_spans:
                    print(show_diff(corpus[k], doc, span=turn_span))
                    print()
            save_document(output_dir, k, doc)
        # NOTE(review): nesting reconstructed from a collapsed source;
        # the reset is assumed to happen once per family -- confirm
        tcache.reset()
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Runs `stac-util count` on the training corpus for the configured
    annotators and writes its standard output to
    `<output dir>/<corpus basename>.txt`.
    """
    out_dir = get_output_dir(args)
    for corpus_path in [TRAINING_CORPUS]:
        report_path = fp.join(out_dir, fp.basename(corpus_path) + ".txt")
        command = ["stac-util", "count", corpus_path,
                   "--annotator", ANNOTATORS]
        # the command's stdout is redirected straight into the report
        with open(report_path, 'w') as report:
            call(command, stdout=report)
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Moves the text span `args.span` of every source document into the
    subdocument named by `args.target`.  Only spans touching the start
    or the end of the source document are supported; the program exits
    otherwise, and also when the target document is missing.
    """
    out_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end
    sources = read_source_corpus(args)
    targets = read_target_corpus(args)
    renames = compute_renames(targets, sources)

    for src_key in sources:
        # the target key is the source key with the subdoc swapped
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in targets:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_key)

        src_doc = sources[src_key]
        tgt_doc = targets[tgt_key]
        if span_start == 0:
            # span sits at the start of the source document
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                src_split=span_end, tgt_split=-1)
        elif span_end == src_doc.text_span().char_end:
            # span sits at the end: detach it, then move the detached
            # document as a whole to the front of the target
            new_src_doc, detached = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                src_split=-1, tgt_split=0)
        else:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        report = "\n".join([
            "======= TO %s ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)
        save_document(out_dir, src_key, new_src_doc)
        save_document(out_dir, tgt_key, new_tgt_doc)

    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Combines every document of the corpus with its counterpart in the
    augmented corpus (found via `unannotated_key`) using `_weave_docs`
    and saves the result.
    """
    out_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for key in corpus:
        augmented_doc = augmented[unannotated_key(key)]
        annotated_doc = corpus[key]
        woven = _weave_docs(renames, augmented_doc, annotated_doc)
        save_document(out_dir, key, woven)

    announce_output_dir(out_dir)
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    Replaces the text at `args.span` with `args.sub_text` in every
    requested document, prints a diff to stderr and saves the result.
    Unless `args.no_commit_msg` is set, a commit message based on the
    first document is printed; when no document was selected, the
    commit message is skipped instead of failing with an IndexError.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc, span, sub_text,
                                           minor=minor)
        # WIP new_span, depends on the offset: the end moves by the
        # difference in length between the new and the replaced text
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = [
            "======= REPLACE TEXT IN %s ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message, built from the first document only
    if not annos_before:
        # no document was selected: nothing to describe
        return
    first_k, _ = list(tgt_corpus.items())[0]
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, anno_str_before, anno_str_after))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Re-saves every document of the corpus; when `args.diff_friendly` is
    set, the units, relations and schemas of each document are first
    passed through `_diff_friendly`.
    """
    corpus = read_corpus(args, verbose=True)
    out_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        if args.diff_friendly:
            for attr in ('units', 'relations', 'schemas'):
                setattr(document, attr,
                        _diff_friendly(getattr(document, attr)))
        save_document(out_dir, key, document)
    announce_output_dir(out_dir)
def main():
    "create a .seg file for every file in the corpus"
    args = mk_argparser().parse_args()
    corpus = read_corpus(args)
    out_dir = get_output_dir(args)

    if args.pipeline:
        # pipeline mode overrides the individual emit/turn-id switches
        forced = (
            ('resources', True),
            ('resource_status', False),
            ('dialogue_acts', False),
            ('dialogue_boundaries', False),
            ('fake_turn_ids', True),
        )
        for option, value in forced:
            setattr(args, option, value)

    config = Config(emit_resources=args.resources,
                    emit_resource_status=args.resource_status,
                    emit_dialogue_acts=args.dialogue_acts,
                    emit_dialogue_boundaries=args.dialogue_boundaries,
                    fake_turn_ids=args.fake_turn_ids)

    for key in corpus:
        process_document(config, corpus, key, out_dir)
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    In every units-stage document, maps each component of an EDU's type
    through `RENAMES` and rewrites the type (components sorted and
    joined with "/") when the mapping changed anything; every document
    is then saved.
    """
    corpus = read_corpus(args, preselected={"stage": ["units"]})
    out_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        edus = [unit for unit in document.units if educe.stac.is_edu(unit)]
        for edu in edus:
            current = frozenset(educe.stac.split_type(edu))
            renamed = frozenset(RENAMES.get(name, name) for name in current)
            if current == renamed:
                # nothing to rename for this EDU
                continue
            edu.type = "/".join(sorted(renamed))
        save_document(out_dir, key, document)
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Removes every schema without members from the discourse- and
    units-stage documents, then saves each document.
    """
    corpus = read_corpus(args,
                         preselected={'stage': ['discourse', 'units']})
    out_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        # collect first, remove afterwards: the schema collection must
        # not shrink while it is being iterated
        empty_schemas = [schema for schema in document.schemas
                         if not schema.members]
        for schema in empty_schemas:
            document.schemas.remove(schema)
        save_document(out_dir, key, document)
    announce_output_dir(out_dir)
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    Replaces the text at `args.span` with `args.sub_text` in every
    requested document, prints a diff to stderr and saves the result.
    Unless `args.no_commit_msg` is set, a commit message based on the
    first document is printed; when no document was selected, the
    commit message is skipped instead of failing with an IndexError.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        # WIP new_span, depends on the offset: the end moves by the
        # difference in length between the new and the replaced text
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message, built from the first document only
    if not annos_before:
        # no document was selected: nothing to describe
        return
    first_k, _ = list(tgt_corpus.items())[0]
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, anno_str_before, anno_str_after))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Reads the single document found in the `args.insert` directory and
    passes it, with each requested target document, to `move_portion`
    using `tgt_split=args.start`; the resulting target document is
    diffed on stderr and saved.  Exits if the insert directory does not
    contain exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))
    # exactly one document at this point; materialise the values first,
    # since dict views cannot be indexed on Python 3
    src_doc = list(src_corpus.values())[0]

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # only the updated target is of interest here
        _, new_tgt_doc = move_portion(renames, src_doc, tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = [
            "======= INSERT IN %s ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    In a copy of every document, each unit whose span equals
    `args.span` gets a new span shifted by `args.nudge_start` /
    `args.nudge_end`.  A mini diff (or a warning when no unit matched)
    goes to stderr and the copy is saved.  Unless `args.no_commit_msg`
    is set, a commit message for the last processed document is printed.
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    # Stays None when the corpus is empty, so the check after the loop
    # no longer fails on an unbound local.
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                # each unit gets its own copy of the new span
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Reads the single document found in the `args.insert` directory and
    passes it, with each requested target document, to `move_portion`
    using `tgt_split=args.start`; the resulting target document is
    diffed on stderr and saved.  Exits if the insert directory does not
    contain exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))
    # exactly one document at this point; materialise the values first,
    # since dict views cannot be indexed on Python 3
    src_doc = list(src_corpus.values())[0]

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # only the updated target is of interest here
        _, new_tgt_doc = move_portion(renames, src_doc, tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Exits early if `--stage` and `--annotator` are combined
    inconsistently; otherwise runs `_delete_in_doc` for `args.anno_id`
    on every document, saves it, and reports the deletion on stderr.
    """
    stage = args.stage
    if stage:
        is_unannotated = stage == 'unannotated'
        if not is_unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if is_unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    out_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for key in corpus:
        print(key)
        document = corpus[key]
        _delete_in_doc(args.anno_id, document)
        save_document(out_dir, key, document)

    print("Deleted %s" % anno_id_from_tuple(args.anno_id), file=sys.stderr)
    announce_output_dir(out_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Merges the sought dialogues in every document of the corpus.  The
    dialogues are either given directly (`args.dialogues`) or looked up
    from a pair of turns (`args.turns`); fewer than two dialogues is an
    error.  The commit message is computed before the documents are
    modified and printed at the very end.
    """
    if not args.turns and len(args.dialogues) < 2:
        sys.exit("Must specify at least two dialogues")

    out_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not args.turns:
        sought = args.dialogues
    else:
        try:
            first_turn, last_turn = args.turns[0], args.turns[1]
            sought = _dialogues_in_turns(corpus, first_turn, last_turn)
            if len(sought) < 2:
                sys.exit("Must specify at least two dialogues")
            listing = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + listing, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    want_msg = bool(corpus) and not args.no_commit_msg
    if want_msg:
        # must be computed before the documents are changed
        cmsg = commit_msg(args, corpus, list(corpus)[0], sought)

    for key in corpus:
        document = corpus[key]
        _merge_dialogues_in_document(sought, document)
        save_document(out_dir, key, document)

    announce_output_dir(out_dir)
    if want_msg:
        print("-----8<------")
        print(cmsg)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Moves the text span `args.span` of every source document into the
    subdocument named by `args.target`.  Only spans touching the start
    or the end of the source document are supported; anything else
    raises `ValueError`, as does a missing target document.
    """
    out_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end
    sources = read_source_corpus(args)
    targets = read_target_corpus(args)
    renames = compute_renames(targets, sources)

    for src_key, src_doc in sources.items():
        # the target key is the source key with the subdoc swapped
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in targets:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        tgt_doc = targets[tgt_key]

        moving_up = span_start == 0
        if not moving_up and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # head of the source goes to the end of the target
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                span_end,  # src_split
                tgt_split=-1)
        else:
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the tail
            # src_doc[start:] is first detached into a temporary doc
            # which is then moved as a whole
            new_src_doc, detached = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the separating whitespace went with the detached part, so
            # a new one is appended to what remains of the source
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')

        # diff for the suggested commit message
        report = "\n".join([
            "======= TO %s ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)

        # write out both modified documents
        save_document(out_dir, src_key, new_src_doc)
        save_document(out_dir, tgt_key, new_tgt_doc)

    announce_output_dir(out_dir)