def get_all_book_paths(testament_dir: Path) -> Iterable[Tuple[str, Path]]:
    for book in ALL_BOOK_IDS:
        book_num = book_id_to_number(book)
        if not is_ot_nt(book_num):
            continue
        book_exp_dir = testament_dir / (str(book_num).zfill(3) + "-" + book)
        yield book, book_exp_dir


def main() -> None:
    parser = argparse.ArgumentParser(description="Translates text using Google Cloud")
    parser.add_argument("experiment", help="Experiment name")
    parser.add_argument("--src-project", type=str, help="The source project to translate")
    parser.add_argument("--book", type=str, help="The book to translate")
    parser.add_argument(
        "--trg-lang", default=None, type=str, help="ISO-639-1 code for target language (e.g., 'en')"
    )
    args = parser.parse_args()

    get_git_revision_hash()

    root_dir = get_mt_exp_dir(args.experiment)
    src_project: str = args.src_project
    book: str = args.book
    trg_iso: Optional[str] = args.trg_lang

    default_output_dir = root_dir / src_project
    book_num = book_id_to_number(book)
    output_path = default_output_dir / f"{book_file_name_digits(book_num)}{book}.SFM"
    output_dir = output_path.parent
    output_dir.mkdir(exist_ok=True)

    translator = GoogleTranslator()
    translator.translate_book(src_project, book, output_path, trg_iso=trg_iso)

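
# Standard script entry point; assumed here because main() parses command-line
# arguments but the guard itself is not shown in the snippet above.
if __name__ == "__main__":
    main()
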

def filter_corpus(text: Text) -> bool:
    book_num = book_id_to_number(text.id)
    if exclude_books_set is not None and book_num in exclude_books_set:
        return False
    if include_books_set is not None and book_num in include_books_set:
        return True
    return include_books_set is None

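
# filter_corpus reads include_books_set and exclude_books_set from the enclosing
# scope (None means "no restriction" for that set). A minimal usage sketch with
# hypothetical values; corpus_texts and the literal book choices are illustrative
# only and not part of this module:
#
#     include_books_set = {book_id_to_number(b) for b in ("MAT", "MRK")}
#     exclude_books_set = None
#     selected = [text for text in corpus_texts if filter_corpus(text)]
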

def aggregate_testament_results() -> None:
    for testament in TESTAMENTS:
        data: Dict[str, pd.DataFrame] = {}
        available_books: Set[str] = set()
        available_aligners: Set[str] = set()
        for translation in TRANSLATIONS:
            translation_dir = EXP_DIR / translation
            exp_dir = translation_dir if testament == "nt+ot" else translation_dir / testament
            scores_path = exp_dir / "scores.csv"
            if scores_path.is_file():
                df = pd.read_csv(scores_path, index_col=[0, 1])
                data[translation] = df
                available_books.update(df.index.get_level_values("Book"))
                available_aligners.update(df.index.get_level_values("Model"))
        available_books.remove("ALL")

        for metric in METRICS:
            output_path = EXP_DIR / f"{testament}.all.{metric}.csv"
            with output_path.open("w") as output_file:
                output_file.write("Model," + ",".join(filter(lambda t: t in data, TRANSLATIONS)) + "\n")
                for aligner in ALIGNERS:
                    output_file.write(aligner.replace("Giza-", ""))
                    for translation in TRANSLATIONS:
                        df = data.get(translation)
                        if df is None:
                            continue
                        output_file.write(",")
                        if ("ALL", aligner) in df.index:
                            output_file.write(str(df.at[("ALL", aligner), metric]))
                    output_file.write("\n")

            if len(available_books) > 0:
                for aligner in available_aligners:
                    output_path = EXP_DIR / f"{testament}.all.{aligner}.{metric}.csv"
                    with output_path.open("w") as output_file:
                        output_file.write("Book," + ",".join(filter(lambda t: t in data, TRANSLATIONS)) + "\n")
                        for book_id in sorted(available_books, key=lambda b: book_id_to_number(b)):
                            output_file.write(book_id)
                            for translation in TRANSLATIONS:
                                df = data.get(translation)
                                if df is None:
                                    continue
                                output_file.write(",")
                                if (book_id, aligner) in df.index:
                                    output_file.write(str(df.at[(book_id, aligner), metric]))
                            output_file.write("\n")

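
# Illustrative output shape (placeholder names and values, not real scores): each
# "<testament>.all.<metric>.csv" has one row per aligner and one column per
# translation that produced a scores.csv, e.g.
#
#     Model,<translation 1>,<translation 2>
#     <aligner A>,<ALL-row score>,<ALL-row score>
#     <aligner B>,<ALL-row score>,<ALL-row score>
#
# and each "<testament>.all.<aligner>.<metric>.csv" has one row per book instead
# of one row per aligner.
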

def get_book_path(project: str, book: str) -> Path:
    project_dir = get_project_dir(project)
    settings_tree = parse_project_settings(project_dir)
    naming_elem = settings_tree.find("Naming")
    assert naming_elem is not None
    pre_part = naming_elem.get("PrePart", "")
    post_part = naming_elem.get("PostPart", "")
    book_name_form = naming_elem.get("BookNameForm")
    assert book_name_form is not None

    book_num = book_id_to_number(book)
    if book_name_form == "MAT":
        book_name = book
    elif book_name_form == "40" or book_name_form == "41":
        book_name = book_file_name_digits(book_num)
    else:
        book_name = f"{book_file_name_digits(book_num)}{book}"
    book_file_name = f"{pre_part}{book_name}{post_part}"
    return SIL_NLP_ENV.pt_projects_dir / project / book_file_name

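
# For context (attribute values vary by project): a Paratext Settings.xml Naming
# element looks roughly like <Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />.
# The branches above cover the three BookNameForm families: book code only ("MAT"),
# book number only ("40"/"41"), and number plus code (the default), with PrePart and
# PostPart wrapped around the resulting name to build the book file name.
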

def test(exp_dirs: List[Path], by_book: bool, books: Set[int], test_size: Optional[int], output_dir: Path) -> None:
    vrefs: List[VerseRef] = []
    all_alignments: Dict[str, List[Alignment]] = {}
    all_lexicons: Dict[str, Lexicon] = {}
    for exp_dir in exp_dirs:
        vref_file_path = exp_dir / "refs.txt"
        if not vref_file_path.is_file():
            continue
        vrefs += load_vrefs(vref_file_path)
        add_alignments(all_alignments, load_all_alignments(exp_dir))
        add_lexicons(all_lexicons, load_all_lexicons(exp_dir))

    df = compute_alignment_metrics(vrefs, all_alignments, "ALL", books, test_size)
    df = df.join(compute_lexicon_metrics(all_lexicons))

    if by_book:
        for book_id in ALL_BOOK_IDS:
            book_num = book_id_to_number(book_id)
            if not is_ot_nt(book_num) or (len(books) > 0 and book_num not in books):
                continue
            book_df = compute_alignment_metrics(vrefs, all_alignments, book_id, {book_num})
            df = pd.concat([df, book_df])

    for book in sorted(
        set(df.index.get_level_values("Book")), key=lambda b: 0 if b == "ALL" else book_id_to_number(b)
    ):
        if by_book:
            print(f"--- {book} ---")
        for index, row in df.loc[[book]].iterrows():
            aer: float = row["AER"]
            f_score: float = row["F-Score"]
            precision: float = row["Precision"]
            recall: float = row["Recall"]
            print(f"--- {index[1]} ---")
            print("Alignments")
            print(f"- AER: {aer:.4f}")
            print(f"- F-Score: {f_score:.4f}")
            print(f"- Precision: {precision:.4f}")
            print(f"- Recall: {recall:.4f}")
            if book == "ALL":
                f_score_at_1: float = row["F-Score@1"]
                precision_at_1: float = row["Precision@1"]
                recall_at_1: float = row["Recall@1"]
                f_score_at_3: float = row["F-Score@3"]
                precision_at_3: float = row["Precision@3"]
                recall_at_3: float = row["Recall@3"]
                mean_avg_precision: float = row["MAP"]
                ao_at_1: float = row["AO@1"]
                rbo: float = row["RBO"]
                print("Lexicon")
                print(f"- F-Score@1: {f_score_at_1:.4f}")
                print(f"- Precision@1: {precision_at_1:.4f}")
                print(f"- Recall@1: {recall_at_1:.4f}")
                print(f"- F-Score@3: {f_score_at_3:.4f}")
                print(f"- Precision@3: {precision_at_3:.4f}")
                print(f"- Recall@3: {recall_at_3:.4f}")
                print(f"- MAP: {mean_avg_precision:.4f}")
                print(f"- AO@1: {ao_at_1:.4f}")
                print(f"- RBO: {rbo:.4f}")

    scores_file_name = "scores.csv"
    if test_size is not None:
        scores_file_name = f"scores-{test_size}.csv"
    scores_file_path = output_dir / scores_file_name
    df.to_csv(scores_file_path, float_format="%.4f")

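
# Usage sketch (hypothetical paths and book selection, illustrative only):
# aggregate alignment and lexicon metrics across two experiment directories,
# report per-book results for MAT and MRK, and write the combined scores.csv
# into a summary directory.
#
#     test(
#         exp_dirs=[Path("experiments/run1"), Path("experiments/run2")],
#         by_book=True,
#         books={book_id_to_number(b) for b in ("MAT", "MRK")},
#         test_size=None,
#         output_dir=Path("experiments/summary"),
#     )
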