Example #1
0
def get_all_book_paths(testament_dir: Path) -> Iterable[Tuple[str, Path]]:
    """Yield ``(book_id, experiment_dir)`` pairs for every OT/NT book.

    Non-OT/NT book ids are skipped.  The per-book directory name joins the
    zero-padded book number and the book id, e.g. ``001-GEN``.
    """
    for book_id in ALL_BOOK_IDS:
        number = book_id_to_number(book_id)
        if is_ot_nt(number):
            yield book_id, testament_dir / (str(number).zfill(3) + "-" + book_id)
Example #2
0
def main() -> None:
    """CLI entry point: translate one book of a project via Google Cloud.

    Parses the experiment name, source project, book id, and optional target
    ISO-639-1 code, then writes the translated SFM file under
    ``<experiment dir>/<src project>/``.
    """
    parser = argparse.ArgumentParser(description="Translates text using Google Cloud")
    parser.add_argument("experiment", help="Experiment name")
    parser.add_argument("--src-project", type=str, help="The source project to translate")
    parser.add_argument("--book", type=str, help="The book to translate")
    parser.add_argument(
        "--trg-lang",
        default=None,
        type=str,
        help="ISO-639-1 code for target language (e.g., 'en')",
    )
    args = parser.parse_args()

    # Called for experiment provenance; the returned hash is not used here.
    get_git_revision_hash()

    experiment_dir = get_mt_exp_dir(args.experiment)
    source_project: str = args.src_project
    book_id: str = args.book
    target_iso: Optional[str] = args.trg_lang

    number = book_id_to_number(book_id)
    output_path = experiment_dir / source_project / f"{book_file_name_digits(number)}{book_id}.SFM"
    output_path.parent.mkdir(exist_ok=True)

    GoogleTranslator().translate_book(source_project, book_id, output_path, trg_iso=target_iso)
Example #3
0
        def filter_corpus(text: Text) -> bool:
            """Decide whether *text*'s book belongs in the corpus.

            Exclusion takes precedence over inclusion; when no include set is
            given, every non-excluded book is kept.
            """
            number = book_id_to_number(text.id)
            if exclude_books_set is not None and number in exclude_books_set:
                return False
            if include_books_set is None:
                return True
            return number in include_books_set
Example #4
0
def aggregate_testament_results() -> None:
    """Aggregate per-translation alignment scores into per-testament CSVs.

    For each testament, reads each translation's ``scores.csv`` (indexed by
    (Book, Model)) when present, then writes:

    * ``<testament>.all.<metric>.csv`` — one row per aligner with its "ALL"
      score for every available translation.
    * ``<testament>.all.<aligner>.<metric>.csv`` — one row per book with that
      aligner's per-book score for every available translation.

    Translations without a scores file are skipped; missing (Book, Model)
    cells are left empty in the output.
    """
    for testament in TESTAMENTS:
        data: Dict[str, pd.DataFrame] = {}
        available_books: Set[str] = set()
        available_aligners: Set[str] = set()
        for translation in TRANSLATIONS:
            translation_dir = EXP_DIR / translation
            # "nt+ot" scores live at the translation root; others in a subdir.
            exp_dir = translation_dir if testament == "nt+ot" else translation_dir / testament
            scores_path = exp_dir / "scores.csv"
            if scores_path.is_file():
                df = pd.read_csv(scores_path, index_col=[0, 1])
                data[translation] = df
                available_books.update(df.index.get_level_values("Book"))
                available_aligners.update(df.index.get_level_values("Model"))
        # "ALL" is a summary row, not a book. Use discard() instead of
        # remove(): remove() raises KeyError when no scores file was found
        # (or none contained an "ALL" row), aborting the whole aggregation.
        available_books.discard("ALL")

        for metric in METRICS:
            output_path = EXP_DIR / f"{testament}.all.{metric}.csv"
            with output_path.open("w") as output_file:
                # Header: only translations that actually have data.
                output_file.write(
                    "Model," +
                    ",".join(filter(lambda t: t in data, TRANSLATIONS)) + "\n")
                for aligner in ALIGNERS:
                    # Drop the "Giza-" prefix for display.
                    output_file.write(aligner.replace("Giza-", ""))
                    for translation in TRANSLATIONS:
                        df = data.get(translation)
                        if df is None:
                            continue
                        output_file.write(",")
                        if ("ALL", aligner) in df.index:
                            output_file.write(
                                str(df.at[("ALL", aligner), metric]))
                    output_file.write("\n")

            if len(available_books) > 0:
                for aligner in available_aligners:
                    output_path = EXP_DIR / f"{testament}.all.{aligner}.{metric}.csv"
                    with output_path.open("w") as output_file:
                        output_file.write("Book," + ",".join(
                            filter(lambda t: t in data, TRANSLATIONS)) + "\n")
                        # Emit books in canonical order.
                        for book_id in sorted(
                                available_books,
                                key=lambda b: book_id_to_number(b)):
                            output_file.write(book_id)
                            for translation in TRANSLATIONS:
                                df = data.get(translation)
                                if df is None:
                                    continue
                                output_file.write(",")
                                if (book_id, aligner) in df.index:
                                    output_file.write(
                                        str(df.at[(book_id, aligner), metric]))
                            output_file.write("\n")
Example #5
0
def get_book_path(project: str, book: str) -> Path:
    """Resolve the SFM file path for *book* in a Paratext *project*.

    The file name is assembled from the project's ``<Naming>`` settings: an
    optional prefix/suffix around a name part whose shape is controlled by
    ``BookNameForm`` ("MAT" = book id only, "40"/"41" = digits only,
    otherwise digits followed by the book id).
    """
    settings_tree = parse_project_settings(get_project_dir(project))
    naming = settings_tree.find("Naming")
    assert naming is not None

    prefix = naming.get("PrePart", "")
    suffix = naming.get("PostPart", "")
    name_form = naming.get("BookNameForm")
    assert name_form is not None

    number = book_id_to_number(book)
    if name_form == "MAT":
        stem = book
    elif name_form in ("40", "41"):
        stem = book_file_name_digits(number)
    else:
        stem = f"{book_file_name_digits(number)}{book}"

    return SIL_NLP_ENV.pt_projects_dir / project / f"{prefix}{stem}{suffix}"
Example #6
0
def test(exp_dirs: List[Path], by_book: bool, books: Set[int], test_size: Optional[int], output_dir: Path) -> None:
    """Evaluate alignments and lexicons from *exp_dirs*; print and save scores.

    Loads verse refs, alignments, and lexicons from each experiment directory
    that contains a ``refs.txt``, computes overall ("ALL") metrics (optionally
    restricted to *books* and limited by *test_size*), adds per-book rows when
    *by_book* is True, prints a human-readable report, and writes the combined
    DataFrame to ``scores.csv`` (or ``scores-<test_size>.csv``) in *output_dir*.
    """
    vrefs: List[VerseRef] = []
    all_alignments: Dict[str, List[Alignment]] = {}
    all_lexicons: Dict[str, Lexicon] = {}
    for exp_dir in exp_dirs:
        vref_file_path = exp_dir / "refs.txt"
        # Directories without a refs file contribute nothing.
        if not vref_file_path.is_file():
            continue
        vrefs += load_vrefs(vref_file_path)
        add_alignments(all_alignments, load_all_alignments(exp_dir))
        add_lexicons(all_lexicons, load_all_lexicons(exp_dir))

    # Overall scores for the "ALL" pseudo-book, joined with lexicon metrics;
    # per-book rows concatenated below carry no lexicon columns.
    df = compute_alignment_metrics(vrefs, all_alignments, "ALL", books, test_size)
    df = df.join(compute_lexicon_metrics(all_lexicons))

    if by_book:
        for book_id in ALL_BOOK_IDS:
            book_num = book_id_to_number(book_id)
            # Canonical OT/NT books only, further filtered by the requested set.
            if not is_ot_nt(book_num) or (len(books) > 0 and book_num not in books):
                continue

            # NOTE(review): test_size is not passed here — presumably the size
            # cap applies only to the overall "ALL" computation; confirm.
            book_df = compute_alignment_metrics(vrefs, all_alignments, book_id, {book_num})
            df = pd.concat([df, book_df])

    # Report in canonical book order with the "ALL" summary first.
    for book in sorted(set(df.index.get_level_values("Book")), key=lambda b: 0 if b == "ALL" else book_id_to_number(b)):
        if by_book:
            print(f"--- {book} ---")
        # One row per model for this book; index[1] is the second index level
        # (presumably the model/aligner name — verify against the producer).
        for index, row in df.loc[[book]].iterrows():
            aer: float = row["AER"]
            f_score: float = row["F-Score"]
            precision: float = row["Precision"]
            recall: float = row["Recall"]
            print(f"--- {index[1]} ---")
            print("Alignments")
            print(f"- AER: {aer:.4f}")
            print(f"- F-Score: {f_score:.4f}")
            print(f"- Precision: {precision:.4f}")
            print(f"- Recall: {recall:.4f}")

            # Lexicon metrics exist only on the "ALL" rows (see join above).
            if book == "ALL":
                f_score_at_1: float = row["F-Score@1"]
                precision_at_1: float = row["Precision@1"]
                recall_at_1: float = row["Recall@1"]
                f_score_at_3: float = row["F-Score@3"]
                precision_at_3: float = row["Precision@3"]
                recall_at_3: float = row["Recall@3"]
                mean_avg_precision: float = row["MAP"]
                ao_at_1: float = row["AO@1"]
                rbo: float = row["RBO"]
                print("Lexicon")
                print(f"- F-Score@1: {f_score_at_1:.4f}")
                print(f"- Precision@1: {precision_at_1:.4f}")
                print(f"- Recall@1: {recall_at_1:.4f}")
                print(f"- F-Score@3: {f_score_at_3:.4f}")
                print(f"- Precision@3: {precision_at_3:.4f}")
                print(f"- Recall@3: {recall_at_3:.4f}")
                print(f"- MAP: {mean_avg_precision:.4f}")
                print(f"- AO@1: {ao_at_1:.4f}")
                print(f"- RBO: {rbo:.4f}")

    # File name encodes the test-size cap when one was used.
    scores_file_name = "scores.csv"
    if test_size is not None:
        scores_file_name = f"scores-{test_size}.csv"
    scores_file_path = output_dir / scores_file_name
    df.to_csv(scores_file_path, float_format="%.4f")