Example #1
def collect_segments(book: str, doc: List[sfm.Element]) -> List[Segment]:
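    """Gather the scripture text segments for a book from its parsed SFM document."""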
    segments: List[Segment] = []
    for root in doc:
        cur_segment = Segment(VerseRef(book, 0, 0, ORIGINAL_VERSIFICATION))
        collect_segments_from_paragraph(segments, root, cur_segment)
        if not cur_segment.is_empty:
            segments.append(cur_segment)
    return segments
Example #2
def extract_terms_list(
        list_type: str,
        output_dir: Path,
        project_dir: Optional[Path] = None) -> Dict[str, List[VerseRef]]:
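    """Extract a Paratext biblical terms list to tab-separated metadata, gloss, and verse
    reference files in output_dir, returning each term's verse references keyed by term id.
    """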
    list_file_name = _TERMS_LISTS.get(list_type)
    if list_file_name is None:
        return {}

    list_name = list_type
    if project_dir is not None:
        list_name = project_dir.name

    terms_dir = SIL_NLP_ENV.pt_terms_dir if project_dir is None else project_dir
    terms_xml_path = terms_dir / list_file_name

    terms_metadata_path = get_terms_metadata_path(list_name,
                                                  mt_terms_dir=output_dir)
    terms_glosses_path = get_terms_glosses_path(list_name,
                                                mt_terms_dir=output_dir)
    terms_vrefs_path = get_terms_vrefs_path(list_name, mt_terms_dir=output_dir)

    references: Dict[str, List[VerseRef]] = {}
    with terms_metadata_path.open("w", encoding="utf-8", newline="\n") as terms_metadata_file, \
            terms_glosses_path.open("w", encoding="utf-8", newline="\n") as terms_glosses_file, \
            terms_vrefs_path.open("w", encoding="utf-8", newline="\n") as terms_vrefs_file:
        if terms_xml_path.exists():
            with terms_xml_path.open("rb") as terms_file:
                terms_tree = etree.parse(terms_file)
            for term_elem in terms_tree.getroot().findall("Term"):
                id = term_elem.get("Id")
                if id is None:
                    continue
                id = escape_id(id)
                cat = term_elem.findtext("Category", "?")
                if cat == "":
                    cat = "?"
                domain = term_elem.findtext("Domain", "?")
                if domain == "":
                    domain = "?"
                gloss_str = term_elem.findtext("Gloss", "")
                refs_elem = term_elem.find("References")
                refs_list: List[VerseRef] = []
                if refs_elem is not None:
                    for verse_elem in refs_elem.findall("Verse"):
                        bbbcccvvv = int(verse_elem.text[:9])
                        vref = VerseRef.from_bbbcccvvv(bbbcccvvv)
                        vref.change_versification(ORIGINAL_VERSIFICATION)
                        refs_list.append(vref)
                    references[id] = refs_list
                glosses = _process_gloss_string(gloss_str)
                terms_metadata_file.write(f"{id}\t{cat}\t{domain}\n")
                terms_glosses_file.write("\t".join(glosses) + "\n")
                terms_vrefs_file.write("\t".join(
                    str(vref) for vref in refs_list) + "\n")
    return references
Example #3
    def filter_lines(verse_ref_str: str) -> bool:
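        """Decide whether to keep a corpus line based on its verse reference and the
        include/exclude book sets.
        """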
        if include_books_set is None and exclude_books_set is None:
            return True

        vref = VerseRef.from_string(verse_ref_str.strip(),
                                    ORIGINAL_VERSIFICATION)
        if exclude_books_set is not None and vref.book_num in exclude_books_set:
            return False

        if include_books_set is not None and vref.book_num in include_books_set:
            return True

        return include_books_set is None
Example #4
def get_test_indices(config: dict) -> Optional[Set[int]]:
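    """Return the corpus line indices of the test set verses taken from the experiment named
    by "use_test_set_from" in the config, or None if no such test set is available.
    """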
    exp_name = config.get("use_test_set_from")
    if exp_name is None:
        return None

    exp_dir = get_mt_exp_dir(exp_name)
    vref_path = exp_dir / "test.vref.txt"
    if not vref_path.is_file():
        return None

    vrefs: Dict[str, int] = {}
    for i, vref_str in enumerate(
            load_corpus(SIL_NLP_ENV.assets_dir / "vref.txt")):
        vrefs[vref_str] = i

    test_indices: Set[int] = set()
    for vref_str in load_corpus(vref_path):
        vref = VerseRef.from_string(vref_str, ORIGINAL_VERSIFICATION)
        if vref.has_multiple:
            vref.simplify()
        test_indices.add(vrefs[str(vref)])
    return test_indices
Example #5
def get_terms(terms_renderings_path: Path, iso: str = "en") -> Dict[str, Term]:
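    """Load the aligned terms metadata, glosses, renderings, and verse reference files into a
    dictionary of Term objects keyed by term id.
    """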
    list_name = get_terms_list(terms_renderings_path)
    terms_metadata_path = get_terms_metadata_path(list_name)
    terms_glosses_path = get_terms_glosses_path(list_name, iso=iso)
    terms_vrefs_path = get_terms_vrefs_path(list_name)
    terms: Dict[str, Term] = {}
    terms_metadata = load_corpus(terms_metadata_path)
    terms_glosses = load_corpus(terms_glosses_path) if terms_glosses_path.is_file() else iter([])
    terms_renderings = load_corpus(terms_renderings_path)
    terms_vrefs = load_corpus(terms_vrefs_path) if terms_vrefs_path.is_file() else iter([])
    for metadata_line, glosses_line, renderings_line, vrefs_line in itertools.zip_longest(
            terms_metadata, terms_glosses, terms_renderings, terms_vrefs):
        id, cat, domain = metadata_line.split("\t", maxsplit=3)
        glosses = [] if glosses_line is None or len(glosses_line) == 0 else glosses_line.split("\t")
        renderings = [] if len(renderings_line) == 0 else renderings_line.split("\t")
        vrefs = (set() if vrefs_line is None or len(vrefs_line) == 0 else
                 {VerseRef.from_string(vref, ORIGINAL_VERSIFICATION) for vref in vrefs_line.split("\t")})
        terms[id] = Term(id, cat, domain, glosses, renderings, vrefs)
    return terms
Example #6
def get_scripture_parallel_corpus(
        src_file_path: Path,
        trg_file_path: Path,
        remove_empty_sentences: bool = True) -> pd.DataFrame:
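    """Align the extracted source and target verse files against the canonical verse list,
    merging "<range>" lines into the preceding verse, and return a DataFrame with vref,
    source, and target columns. Pairs with an empty side are dropped unless
    remove_empty_sentences is False, in which case both sides are blanked.
    """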
    vrefs: List[VerseRef] = []
    src_sentences: List[str] = []
    trg_sentences: List[str] = []
    indices: List[int] = []
    with (SIL_NLP_ENV.assets_dir / "vref.txt").open("r", encoding="utf-8") as vref_file, \
            src_file_path.open("r", encoding="utf-8") as src_file, \
            trg_file_path.open("r", encoding="utf-8") as trg_file:
        index = 0
        for vref_line, src_line, trg_line in zip(vref_file, src_file,
                                                 trg_file):
            vref_line = vref_line.strip()
            src_line = src_line.strip()
            trg_line = trg_line.strip()
            vref = VerseRef.from_string(vref_line, ORIGINAL_VERSIFICATION)
            if src_line == "<range>" and trg_line == "<range>":
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
            elif src_line == "<range>":
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
                if len(trg_line) > 0:
                    if len(trg_sentences[-1]) > 0:
                        trg_sentences[-1] += " "
                    trg_sentences[-1] += trg_line
            elif trg_line == "<range>":
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
                if len(src_line) > 0:
                    if len(src_sentences[-1]) > 0:
                        src_sentences[-1] += " "
                    src_sentences[-1] += src_line
            else:
                vrefs.append(vref)
                src_sentences.append(src_line)
                trg_sentences.append(trg_line)
                indices.append(index)
            index += 1

    if remove_empty_sentences:
        for i in range(len(vrefs) - 1, -1, -1):
            if len(src_sentences[i]) == 0 or len(trg_sentences[i]) == 0:
                vrefs.pop(i)
                src_sentences.pop(i)
                trg_sentences.pop(i)
                indices.pop(i)
    else:
        for i in range(len(vrefs) - 1, -1, -1):
            if len(src_sentences[i]) == 0 or len(trg_sentences[i]) == 0:
                src_sentences[i] = ""
                trg_sentences[i] = ""

    data = {"vref": vrefs, "source": src_sentences, "target": trg_sentences}
    return pd.DataFrame(data, index=indices)
Example #7
def extract_project(
    project_dir: Path,
    output_dir: Path,
    include_books: List[str] = [],
    exclude_books: List[str] = [],
    include_markers: bool = False,
    extract_lemmas: bool = False,
    output_project_vrefs: bool = False,
) -> Tuple[Path, int]:
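    """Extract a Paratext project to a verse-per-line text file aligned with the reference
    corpus, optionally filtering by book, including markers, or extracting lemmas.
    Returns the output file path and the number of segments written.
    """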
    settings_tree = parse_project_settings(project_dir)
    iso = get_iso(settings_tree)

    ref_dir = SIL_NLP_ENV.assets_dir / "Ref"

    ref_corpus = ParatextTextCorpus(ref_dir)

    ltg_dir = project_dir / "LTG"
    if extract_lemmas and ltg_dir.is_dir():
        project_corpus = get_lemma_text_corpus(project_dir)
    else:
        project_corpus = ParatextTextCorpus(project_dir,
                                            include_markers=include_markers)

    output_basename = f"{iso}-{project_dir.name}"
    if len(include_books) > 0 or len(exclude_books) > 0:
        output_basename += "_"
        include_books_set: Optional[Set[int]] = None
        if len(include_books) > 0:
            include_books_set = get_books(include_books)
            for text in include_books:
                output_basename += f"+{text}"
        exclude_books_set: Optional[Set[int]] = None
        if len(exclude_books) > 0:
            exclude_books_set = get_books(exclude_books)
            for text in exclude_books:
                output_basename += f"-{text}"

        def filter_corpus(text: Text) -> bool:
            book_num = book_id_to_number(text.id)
            if exclude_books_set is not None and book_num in exclude_books_set:
                return False

            if include_books_set is not None and book_num in include_books_set:
                return True

            return include_books_set is None

        ref_corpus = ref_corpus.filter_texts(filter_corpus)
        project_corpus = project_corpus.filter_texts(filter_corpus)

    if include_markers:
        output_basename += "-m"
    elif extract_lemmas and ltg_dir.is_dir():
        output_basename += "-lemmas"
    output_filename = output_dir / f"{output_basename}.txt"
    output_vref_filename = output_dir / f"{output_basename}.vref.txt"

    try:
        parallel_corpus = ref_corpus.align_rows(project_corpus,
                                                all_source_rows=True)
        segment_count = 0
        with ExitStack() as stack:
            output_stream = stack.enter_context(
                output_filename.open("w", encoding="utf-8", newline="\n"))
            rows = stack.enter_context(parallel_corpus.get_rows())
            output_vref_stream: Optional[TextIO] = None
            if output_project_vrefs:
                output_vref_stream = stack.enter_context(
                    output_vref_filename.open("w",
                                              encoding="utf-8",
                                              newline="\n"))

            cur_ref: Optional[VerseRef] = None
            cur_trg_ref: Optional[VerseRef] = None
            cur_target_line = ""
            cur_target_line_range = True
            for row in rows:
                ref: VerseRef = row.ref
                if cur_ref is not None and ref.compare_to(
                        cur_ref, compare_segments=False) != 0:
                    output_stream.write(("<range>" if cur_target_line_range
                                         else cur_target_line) + "\n")
                    if output_vref_stream is not None:
                        output_vref_stream.write(
                            ("" if cur_trg_ref is None else str(cur_trg_ref)) +
                            "\n")
                    segment_count += 1
                    cur_target_line = ""
                    cur_target_line_range = True
                    cur_trg_ref = None

                cur_ref = ref
                if cur_trg_ref is None and len(row.target_refs) > 0:
                    cur_trg_ref = row.target_refs[0]
                elif (cur_trg_ref is not None and len(row.target_refs) > 0
                      and cur_trg_ref != row.target_refs[0]):
                    cur_trg_ref.simplify()
                    if cur_trg_ref < row.target_refs[0]:
                        start_ref = cur_trg_ref
                        end_ref = row.target_refs[0]
                    else:
                        start_ref = row.target_refs[0]
                        end_ref = cur_trg_ref
                    cur_trg_ref = VerseRef.from_range(start_ref, end_ref)
                if (not row.is_target_in_range or row.is_target_range_start
                        or len(row.target_text) > 0):
                    if len(row.target_text) > 0:
                        if len(cur_target_line) > 0:
                            cur_target_line += " "
                        cur_target_line += row.target_text
                    cur_target_line_range = False
            output_stream.write(
                ("<range>" if cur_target_line_range else cur_target_line) +
                "\n")
            if output_vref_stream is not None:
                output_vref_stream.write(
                    ("" if cur_trg_ref is None else str(cur_trg_ref)) + "\n")
            segment_count += 1
        return output_filename, segment_count
    except BaseException:
        if output_filename.is_file():
            output_filename.unlink()
        if output_vref_filename.is_file():
            output_vref_filename.unlink()
        raise
Example #8
def extract_term_renderings(project_dir: Path, corpus_filename: Path,
                            output_dir: Path) -> int:
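    """Extract the term renderings of a Paratext project, writing one tab-separated line of
    renderings per term in the terms metadata list and expanding wildcard renderings against
    the extracted corpus. Returns the number of terms with at least one rendering.
    """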
    renderings_path = project_dir / "TermRenderings.xml"
    if not renderings_path.is_file():
        return 0

    try:
        with renderings_path.open("rb") as renderings_file:
            renderings_tree = etree.parse(renderings_file)
    except etree.XMLSyntaxError:
        # Try forcing the encoding to UTF-8 during parsing
        with renderings_path.open("rb") as renderings_file:
            renderings_tree = etree.parse(
                renderings_file, parser=etree.XMLParser(encoding="utf-8"))
    rendering_elems: Dict[str, etree.Element] = {}
    for elem in renderings_tree.getroot().findall("TermRendering"):
        id = elem.get("Id")
        if id is None:
            continue
        id = escape_id(id)
        rendering_elems[id] = elem

    settings_tree = parse_project_settings(project_dir)
    iso = get_iso(settings_tree)
    project_name = settings_tree.getroot().findtext("Name", project_dir.name)
    terms_setting = settings_tree.getroot().findtext(
        "BiblicalTermsListSetting", "Major::BiblicalTerms.xml")

    list_type, terms_project, _ = terms_setting.split(":", maxsplit=3)
    list_name = list_type
    references: Dict[str, List[VerseRef]] = {}
    if list_type == "Project":
        if terms_project == project_name:
            references = extract_terms_list(list_type, output_dir, project_dir)
        else:
            extract_terms_list_from_renderings(project_dir.name,
                                               renderings_tree, output_dir)
        list_name = project_dir.name

    corpus: Dict[VerseRef, str] = {}
    if len(references) > 0:
        prev_verse_str = ""
        for ref_str, verse_str in zip(
                load_corpus(SIL_NLP_ENV.assets_dir / "vref.txt"),
                load_corpus(corpus_filename)):
            if verse_str == "<range>":
                verse_str = prev_verse_str
            corpus[VerseRef.from_string(ref_str, ORIGINAL_VERSIFICATION)] = verse_str
            prev_verse_str = verse_str

    terms_metadata_path = get_terms_metadata_path(list_name,
                                                  mt_terms_dir=output_dir)
    terms_renderings_path = output_dir / f"{iso}-{project_dir.name}-{list_type}-renderings.txt"
    count = 0
    with terms_renderings_path.open("w", encoding="utf-8",
                                    newline="\n") as terms_renderings_file:
        for line in load_corpus(terms_metadata_path):
            id, _, _ = line.split("\t", maxsplit=3)
            rendering_elem = rendering_elems.get(id)
            refs_list = references.get(id, [])

            renderings: Set[str] = set()
            if rendering_elem is not None and rendering_elem.get(
                    "Guess", "false") == "false":
                renderings_str = rendering_elem.findtext("Renderings", "")
                if renderings_str != "":
                    for rendering in renderings_str.strip().split("||"):
                        rendering = clean_term(rendering).strip()
                        if len(refs_list) > 0 and "*" in rendering:
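                            # Turn the wildcard rendering into a regex: " ** " matches any
                            # run of intervening words and "*" matches remaining word characters.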
                            regex = (re.escape(rendering)
                                     .replace("\\ \\*\\*\\ ", "(?:\\ \\w+)*\\ ")
                                     .replace("\\*", "\\w*"))
                            for ref in refs_list:
                                verse_str = corpus.get(ref, "")
                                for match in re.finditer(regex, verse_str):
                                    surface_form = match.group()
                                    renderings.add(surface_form)

                        else:
                            rendering = rendering.replace("*", "").strip()
                            if rendering != "":
                                renderings.add(rendering)
            terms_renderings_file.write("\t".join(renderings) + "\n")
            if len(renderings) > 0:
                count += 1
    if count == 0:
        terms_renderings_path.unlink()
        if list_type == "Project":
            terms_metadata_path.unlink()
            terms_glosses_path = get_terms_glosses_path(
                list_name, mt_terms_dir=output_dir)
            if terms_glosses_path.is_file():
                terms_glosses_path.unlink()
    return count
Example #9
def load_test_data(
    vref_file_name: str,
    src_file_name: str,
    pred_file_name: str,
    ref_pattern: str,
    output_file_name: str,
    ref_projects: Set[str],
    config: Config,
    books: Set[int],
    by_book: bool,
) -> Tuple[Dict[str, Tuple[List[str], List[List[str]]]], Dict[str, dict]]:
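    """Load the source, prediction, and reference files for a test set, grouping detokenized
    predictions and references by target ISO code (and by book when requested), and write the
    detokenized predictions to the output file.
    """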
    dataset: Dict[str, Tuple[List[str], List[List[str]]]] = {}
    src_file_path = config.exp_dir / src_file_name
    pred_file_path = config.exp_dir / pred_file_name
    with src_file_path.open("r",
                            encoding="utf-8") as src_file, pred_file_path.open(
                                "r", encoding="utf-8") as pred_file, (
                                    config.exp_dir / output_file_name).open(
                                        "w", encoding="utf-8") as out_file:
        ref_file_paths = list(config.exp_dir.glob(ref_pattern))
        select_rand_ref_line = False
        if len(ref_file_paths) > 1:
            if len(ref_projects) == 0:
                # no refs specified, so randomly select verses from all available train refs to build one ref
                select_rand_ref_line = True
                ref_file_paths = [
                    p for p in ref_file_paths if config.is_train_project(p)
                ]
            else:
                # use specified refs only
                ref_file_paths = [
                    p for p in ref_file_paths
                    if config.is_ref_project(ref_projects, p)
                ]
        ref_files: List[IO] = []
        vref_file: Optional[IO] = None
        vref_file_path = config.exp_dir / vref_file_name
        if len(books) > 0 and vref_file_path.is_file():
            vref_file = vref_file_path.open("r", encoding="utf-8")
        try:
            for ref_file_path in ref_file_paths:
                ref_files.append(ref_file_path.open("r", encoding="utf-8"))
            default_trg_iso = config.default_trg_iso
            for lines in zip(src_file, pred_file, *ref_files):
                if vref_file is not None:
                    vref_line = vref_file.readline().strip()
                    if vref_line != "":
                        vref = VerseRef.from_string(vref_line,
                                                    ORIGINAL_VERSIFICATION)
                        if vref.book_num not in books:
                            continue
                src_line = lines[0].strip()
                pred_line = lines[1].strip()
                detok_pred_line = decode_sp(pred_line)
                iso = default_trg_iso
                if src_line.startswith("<2"):
                    index = src_line.index(">")
                    val = src_line[2:index]
                    if val != "qaa":
                        iso = val
                if iso not in dataset:
                    dataset[iso] = ([], [])
                sys, refs = dataset[iso]
                sys.append(detok_pred_line)
                if select_rand_ref_line:
                    ref_lines: List[str] = [
                        l for l in map(lambda l: l.strip(), lines[2:])
                        if len(l) > 0
                    ]
                    ref_index = random.randint(0, len(ref_lines) - 1)
                    ref_line = ref_lines[ref_index]
                    if len(refs) == 0:
                        refs.append([])
                    refs[0].append(ref_line)
                else:
                    for ref_index in range(len(ref_files)):
                        ref_line = lines[ref_index + 2].strip()
                        if len(refs) == ref_index:
                            refs.append([])
                        refs[ref_index].append(ref_line)
                out_file.write(detok_pred_line + "\n")
            book_dict: Dict[str, dict] = {}
            if by_book:
                book_dict = process_individual_books(
                    src_file_path,
                    pred_file_path,
                    ref_file_paths,
                    vref_file_path,
                    default_trg_iso,
                    select_rand_ref_line,
                    books,
                )
        finally:
            if vref_file is not None:
                vref_file.close()
            for ref_file in ref_files:
                ref_file.close()
    return dataset, book_dict
Example #10
def process_individual_books(
    src_file_path: Path,
    pred_file_path: Path,
    ref_file_paths: List[Path],
    vref_file_path: Path,
    default_trg_iso: str,
    select_rand_ref_line: bool,
    books: Set[int],
) -> Dict[str, dict]:
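    """Group detokenized predictions and reference lines by book and target ISO code for the
    requested books.
    """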
    # Output data structure
    book_dict: Dict[str, dict] = {}
    ref_files = []

    try:
        # Get all references
        for ref_file_path in ref_file_paths:
            file = ref_file_path.open("r", encoding="utf-8")
            ref_files.append(file)

        with vref_file_path.open("r", encoding="utf-8") as vref_file, \
                pred_file_path.open("r", encoding="utf-8") as pred_file, \
                src_file_path.open("r", encoding="utf-8") as src_file:
            for lines in zip(pred_file, vref_file, src_file, *ref_files):
                # Get file lines
                pred_line = lines[0].strip()
                detok_pred = decode_sp(pred_line)
                vref = lines[1].strip()
                src_line = lines[2].strip()
                # Get book
                if vref != "":
                    vref = VerseRef.from_string(vref.strip(),
                                                ORIGINAL_VERSIFICATION)
                    # Check if book in books
                    if vref.book_num in books:
                        # Get iso
                        book_iso = default_trg_iso
                        if src_line.startswith("<2"):
                            index = src_line.index(">")
                            val = src_line[2:index]
                            if val != "qaa":
                                book_iso = val
                        # If book not in dictionary add the book
                        if vref.book not in book_dict:
                            book_dict[vref.book] = {}
                        if book_iso not in book_dict[vref.book]:
                            book_dict[vref.book][book_iso] = ([], [])
                        book_pred, book_refs = book_dict[vref.book][book_iso]

                        # Add detokenized prediction to nested dictionary
                        book_pred.append(detok_pred)

                        # Check if random ref line selected or not
                        if select_rand_ref_line:
                            ref_index = random.randint(0, len(ref_files) - 1)
                            ref_line = lines[ref_index + 3].strip()
                            if len(book_refs) == 0:
                                book_refs.append([])
                            book_refs[0].append(ref_line)
                        else:
                            # For each reference text, add to book_refs
                            for ref_index in range(len(ref_files)):
                                ref_line = lines[ref_index + 3].strip()
                                if len(book_refs) == ref_index:
                                    book_refs.append([])
                                book_refs[ref_index].append(ref_line)
    finally:
        for ref_file in ref_files:
            ref_file.close()
    return book_dict
Example #11
def load_vrefs(vref_file_path: Path) -> List[VerseRef]:
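    """Load a file of BBBCCCVVV verse reference integers into a list of VerseRef objects."""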
    vrefs: List[VerseRef] = []
    for line in load_corpus(vref_file_path):
        vrefs.append(VerseRef.from_bbbcccvvv(int(line)))
    return vrefs