def collect_segments(book: str, doc: List[sfm.Element]) -> List[Segment]:
    segments: List[Segment] = []
    for root in doc:
        cur_segment = Segment(VerseRef(book, 0, 0, ORIGINAL_VERSIFICATION))
        collect_segments_from_paragraph(segments, root, cur_segment)
        if not cur_segment.is_empty:
            segments.append(cur_segment)
    return segments

def extract_terms_list(
    list_type: str, output_dir: Path, project_dir: Optional[Path] = None
) -> Dict[str, List[VerseRef]]:
    list_file_name = _TERMS_LISTS.get(list_type)
    if list_file_name is None:
        return {}

    list_name = list_type
    if project_dir is not None:
        list_name = project_dir.name
    terms_dir = SIL_NLP_ENV.pt_terms_dir if project_dir is None else project_dir
    terms_xml_path = terms_dir / list_file_name
    terms_metadata_path = get_terms_metadata_path(list_name, mt_terms_dir=output_dir)
    terms_glosses_path = get_terms_glosses_path(list_name, mt_terms_dir=output_dir)
    terms_vrefs_path = get_terms_vrefs_path(list_name, mt_terms_dir=output_dir)

    references: Dict[str, List[VerseRef]] = {}
    with terms_metadata_path.open("w", encoding="utf-8", newline="\n") as terms_metadata_file, terms_glosses_path.open(
        "w", encoding="utf-8", newline="\n"
    ) as terms_glosses_file, terms_vrefs_path.open("w", encoding="utf-8", newline="\n") as terms_vrefs_file:
        if terms_xml_path.exists():
            with terms_xml_path.open("rb") as terms_file:
                terms_tree = etree.parse(terms_file)
            for term_elem in terms_tree.getroot().findall("Term"):
                id = term_elem.get("Id")
                if id is None:
                    continue
                id = escape_id(id)
                cat = term_elem.findtext("Category", "?")
                if cat == "":
                    cat = "?"
                domain = term_elem.findtext("Domain", "?")
                if domain == "":
                    domain = "?"
                gloss_str = term_elem.findtext("Gloss", "")
                refs_elem = term_elem.find("References")
                refs_list: List[VerseRef] = []
                if refs_elem is not None:
                    for verse_elem in refs_elem.findall("Verse"):
                        # The first nine digits of the verse id encode a BBBCCCVVV reference.
                        bbbcccvvv = int(verse_elem.text[:9])
                        vref = VerseRef.from_bbbcccvvv(bbbcccvvv)
                        vref.change_versification(ORIGINAL_VERSIFICATION)
                        refs_list.append(vref)
                references[id] = refs_list
                glosses = _process_gloss_string(gloss_str)
                terms_metadata_file.write(f"{id}\t{cat}\t{domain}\n")
                terms_glosses_file.write("\t".join(glosses) + "\n")
                terms_vrefs_file.write("\t".join(str(vref) for vref in refs_list) + "\n")
    return references

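# A minimal sketch (not part of the original module) of the XML shape that
# extract_terms_list expects. The element names ("Term", "Category", "Domain",
# "Gloss", "References", "Verse") come from the findall/findtext calls above;
# the root tag and sample ids here are placeholders, since the function never
# inspects the root element itself.
def _example_terms_xml_shape() -> None:
    from lxml import etree as _etree

    xml = b"""
    <TermsList>
      <Term Id="aaron-1">
        <Category>PN</Category>
        <Domain>person</Domain>
        <Gloss>Aaron</Gloss>
        <References>
          <Verse>002004014</Verse>
        </References>
      </Term>
    </TermsList>
    """
    root = _etree.fromstring(xml)
    for term_elem in root.findall("Term"):
        # "002004014" encodes book 2 (EXO), chapter 4, verse 14 as BBBCCCVVV.
        print(term_elem.get("Id"), term_elem.findtext("Gloss"))
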
def filter_lines(verse_ref_str: str) -> bool:
    if include_books_set is None and exclude_books_set is None:
        return True
    vref = VerseRef.from_string(verse_ref_str.strip(), ORIGINAL_VERSIFICATION)
    if exclude_books_set is not None and vref.book_num in exclude_books_set:
        return False
    if include_books_set is not None and vref.book_num in include_books_set:
        return True
    return include_books_set is None

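# A standalone sketch (hypothetical helper, not from the original code) of the
# include/exclude precedence that filter_lines applies: an exclude set always
# wins, an include set then admits its books, and everything else is kept only
# when no include set was given. Plain ints stand in for VerseRef.book_num.
def _example_book_filter_demo() -> None:
    def keep(book_num: int, include_set: Optional[Set[int]], exclude_set: Optional[Set[int]]) -> bool:
        if exclude_set is not None and book_num in exclude_set:
            return False
        if include_set is not None and book_num in include_set:
            return True
        return include_set is None

    assert keep(1, None, {2}) is True    # not excluded, no include list
    assert keep(2, None, {2}) is False   # explicitly excluded
    assert keep(1, {1}, None) is True    # explicitly included
    assert keep(3, {1}, None) is False   # include list given, book absent
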
def get_test_indices(config: dict) -> Optional[Set[int]]:
    exp_name = config.get("use_test_set_from")
    if exp_name is None:
        return None
    exp_dir = get_mt_exp_dir(exp_name)
    vref_path = exp_dir / "test.vref.txt"
    if not vref_path.is_file():
        return None

    vrefs: Dict[str, int] = {}
    for i, vref_str in enumerate(load_corpus(SIL_NLP_ENV.assets_dir / "vref.txt")):
        vrefs[vref_str] = i

    test_indices: Set[int] = set()
    for vref_str in load_corpus(vref_path):
        vref = VerseRef.from_string(vref_str, ORIGINAL_VERSIFICATION)
        if vref.has_multiple:
            vref.simplify()
        test_indices.add(vrefs[str(vref)])
    return test_indices

def get_terms(terms_renderings_path: Path, iso: str = "en") -> Dict[str, Term]:
    list_name = get_terms_list(terms_renderings_path)
    terms_metadata_path = get_terms_metadata_path(list_name)
    terms_glosses_path = get_terms_glosses_path(list_name, iso=iso)
    terms_vrefs_path = get_terms_vrefs_path(list_name)
    terms: Dict[str, Term] = {}
    terms_metadata = load_corpus(terms_metadata_path)
    terms_glosses = load_corpus(terms_glosses_path) if terms_glosses_path.is_file() else iter([])
    terms_renderings = load_corpus(terms_renderings_path)
    terms_vrefs = load_corpus(terms_vrefs_path) if terms_vrefs_path.is_file() else iter([])
    # The glosses and vrefs files are optional, so zip_longest pads their
    # missing columns with None.
    for metadata_line, glosses_line, renderings_line, vrefs_line in itertools.zip_longest(
        terms_metadata, terms_glosses, terms_renderings, terms_vrefs
    ):
        id, cat, domain = metadata_line.split("\t", maxsplit=2)
        glosses = [] if glosses_line is None or len(glosses_line) == 0 else glosses_line.split("\t")
        renderings = [] if len(renderings_line) == 0 else renderings_line.split("\t")
        vrefs = (
            set()
            if vrefs_line is None or len(vrefs_line) == 0
            else set(VerseRef.from_string(vref, ORIGINAL_VERSIFICATION) for vref in vrefs_line.split("\t"))
        )
        terms[id] = Term(id, cat, domain, glosses, renderings, vrefs)
    return terms

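# A small sketch (illustrative data) of why get_terms uses itertools.zip_longest:
# when the glosses or vrefs file is absent, its column is padded with None
# instead of silently truncating the iteration, as plain zip would.
def _example_zip_longest_padding() -> None:
    metadata = ["t1\tPN\tperson", "t2\tFL\tflora"]
    glosses: List[str] = []  # e.g. the glosses file does not exist
    for metadata_line, glosses_line in itertools.zip_longest(metadata, glosses):
        id, cat, domain = metadata_line.split("\t", maxsplit=2)
        gloss_list = [] if glosses_line is None else glosses_line.split("\t")
        print(id, cat, domain, gloss_list)  # gloss_list is [] for every term
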
def get_scripture_parallel_corpus(
    src_file_path: Path, trg_file_path: Path, remove_empty_sentences: bool = True
) -> pd.DataFrame:
    vrefs: List[VerseRef] = []
    src_sentences: List[str] = []
    trg_sentences: List[str] = []
    indices: List[int] = []
    with (SIL_NLP_ENV.assets_dir / "vref.txt").open("r", encoding="utf-8") as vref_file, src_file_path.open(
        "r", encoding="utf-8"
    ) as src_file, trg_file_path.open("r", encoding="utf-8") as trg_file:
        index = 0
        for vref_line, src_line, trg_line in zip(vref_file, src_file, trg_file):
            vref_line = vref_line.strip()
            src_line = src_line.strip()
            trg_line = trg_line.strip()
            vref = VerseRef.from_string(vref_line, ORIGINAL_VERSIFICATION)
            if src_line == "<range>" and trg_line == "<range>":
                # Both sides continue the previous verse range; extend the last reference.
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
            elif src_line == "<range>":
                # Only the source continues a range; merge the target text into the previous segment.
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
                if len(trg_line) > 0:
                    if len(trg_sentences[-1]) > 0:
                        trg_sentences[-1] += " "
                    trg_sentences[-1] += trg_line
            elif trg_line == "<range>":
                # Only the target continues a range; merge the source text into the previous segment.
                if vref.chapter_num == vrefs[-1].chapter_num:
                    vrefs[-1].simplify()
                    vrefs[-1] = VerseRef.from_range(vrefs[-1], vref)
                if len(src_line) > 0:
                    if len(src_sentences[-1]) > 0:
                        src_sentences[-1] += " "
                    src_sentences[-1] += src_line
            else:
                vrefs.append(vref)
                src_sentences.append(src_line)
                trg_sentences.append(trg_line)
                indices.append(index)
            index += 1

    if remove_empty_sentences:
        # Iterate in reverse so that pop() does not shift unvisited indices.
        for i in range(len(vrefs) - 1, -1, -1):
            if len(src_sentences[i]) == 0 or len(trg_sentences[i]) == 0:
                vrefs.pop(i)
                src_sentences.pop(i)
                trg_sentences.pop(i)
                indices.pop(i)
    else:
        for i in range(len(vrefs) - 1, -1, -1):
            if len(src_sentences[i]) == 0 or len(trg_sentences[i]) == 0:
                src_sentences[i] = ""
                trg_sentences[i] = ""

    data = {"vref": vrefs, "source": src_sentences, "target": trg_sentences}
    return pd.DataFrame(data, index=indices)

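# A toy sketch (hypothetical data, simplified string refs instead of VerseRef)
# of the "<range>" handling in get_scripture_parallel_corpus: a "<range>"
# marker means the verse's text was combined with an earlier verse, so the
# previous row absorbs the reference and any text on the other side.
def _example_range_merging() -> None:
    rows = [
        ("MAT 1:1", "src a", "trg a"),
        ("MAT 1:2", "src b", "trg b1"),
        ("MAT 1:3", "<range>", "trg b2"),  # source verses 2-3 were combined
    ]
    refs: List[str] = []
    src: List[str] = []
    trg: List[str] = []
    for ref, src_line, trg_line in rows:
        if src_line == "<range>":
            refs[-1] = refs[-1] + "-" + ref.rsplit(":", 1)[1]  # e.g. "MAT 1:2-3"
            if trg_line:
                trg[-1] += " " + trg_line
        else:
            refs.append(ref)
            src.append(src_line)
            trg.append(trg_line)
    print(refs)  # ['MAT 1:1', 'MAT 1:2-3']
    print(trg)   # ['trg a', 'trg b1 trg b2']
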
def extract_project(
    project_dir: Path,
    output_dir: Path,
    include_books: List[str] = [],
    exclude_books: List[str] = [],
    include_markers: bool = False,
    extract_lemmas: bool = False,
    output_project_vrefs: bool = False,
) -> Tuple[Path, int]:
    settings_tree = parse_project_settings(project_dir)
    iso = get_iso(settings_tree)
    ref_dir = SIL_NLP_ENV.assets_dir / "Ref"
    ref_corpus = ParatextTextCorpus(ref_dir)
    ltg_dir = project_dir / "LTG"
    if extract_lemmas and ltg_dir.is_dir():
        project_corpus = get_lemma_text_corpus(project_dir)
    else:
        project_corpus = ParatextTextCorpus(project_dir, include_markers=include_markers)

    output_basename = f"{iso}-{project_dir.name}"
    if len(include_books) > 0 or len(exclude_books) > 0:
        output_basename += "_"
    include_books_set: Optional[Set[int]] = None
    if len(include_books) > 0:
        include_books_set = get_books(include_books)
        for text in include_books:
            output_basename += f"+{text}"
    exclude_books_set: Optional[Set[int]] = None
    if len(exclude_books) > 0:
        exclude_books_set = get_books(exclude_books)
        for text in exclude_books:
            output_basename += f"-{text}"

    def filter_corpus(text: Text) -> bool:
        book_num = book_id_to_number(text.id)
        if exclude_books_set is not None and book_num in exclude_books_set:
            return False
        if include_books_set is not None and book_num in include_books_set:
            return True
        return include_books_set is None

    ref_corpus = ref_corpus.filter_texts(filter_corpus)
    project_corpus = project_corpus.filter_texts(filter_corpus)

    if include_markers:
        output_basename += "-m"
    elif extract_lemmas and ltg_dir.is_dir():
        output_basename += "-lemmas"
    output_filename = output_dir / f"{output_basename}.txt"
    output_vref_filename = output_dir / f"{output_basename}.vref.txt"
    try:
        parallel_corpus = ref_corpus.align_rows(project_corpus, all_source_rows=True)
        segment_count = 0
        with ExitStack() as stack:
            output_stream = stack.enter_context(output_filename.open("w", encoding="utf-8", newline="\n"))
            rows = stack.enter_context(parallel_corpus.get_rows())
            output_vref_stream: Optional[TextIO] = None
            if output_project_vrefs:
                output_vref_stream = stack.enter_context(
                    output_vref_filename.open("w", encoding="utf-8", newline="\n")
                )
            cur_ref: Optional[VerseRef] = None
            cur_trg_ref: Optional[VerseRef] = None
            cur_target_line = ""
            cur_target_line_range = True
            for row in rows:
                ref: VerseRef = row.ref
                if cur_ref is not None and ref.compare_to(cur_ref, compare_segments=False) != 0:
                    # A new reference begins, so flush the segment accumulated for
                    # the previous reference.
                    output_stream.write(("<range>" if cur_target_line_range else cur_target_line) + "\n")
                    if output_vref_stream is not None:
                        output_vref_stream.write(("" if cur_trg_ref is None else str(cur_trg_ref)) + "\n")
                    segment_count += 1
                    cur_target_line = ""
                    cur_target_line_range = True
                    cur_trg_ref = None
                cur_ref = ref
                if cur_trg_ref is None and len(row.target_refs) > 0:
                    cur_trg_ref = row.target_refs[0]
                elif cur_trg_ref is not None and len(row.target_refs) > 0 and cur_trg_ref != row.target_refs[0]:
                    # The target reference changed mid-segment; widen the current
                    # target reference into a range that covers both.
                    cur_trg_ref.simplify()
                    if cur_trg_ref < row.target_refs[0]:
                        start_ref = cur_trg_ref
                        end_ref = row.target_refs[0]
                    else:
                        start_ref = row.target_refs[0]
                        end_ref = cur_trg_ref
                    cur_trg_ref = VerseRef.from_range(start_ref, end_ref)
                if not row.is_target_in_range or row.is_target_range_start or len(row.target_text) > 0:
                    if len(row.target_text) > 0:
                        if len(cur_target_line) > 0:
                            cur_target_line += " "
                        cur_target_line += row.target_text
                    cur_target_line_range = False
            # Flush the final segment.
            output_stream.write(("<range>" if cur_target_line_range else cur_target_line) + "\n")
            if output_vref_stream is not None:
                output_vref_stream.write(("" if cur_trg_ref is None else str(cur_trg_ref)) + "\n")
            segment_count += 1
        return output_filename, segment_count
    except:
        # Clean up partial output files before propagating the error.
        if output_filename.is_file():
            output_filename.unlink()
        if output_vref_filename.is_file():
            output_vref_filename.unlink()
        raise

def extract_term_renderings(project_dir: Path, corpus_filename: Path, output_dir: Path) -> int:
    renderings_path = project_dir / "TermRenderings.xml"
    if not renderings_path.is_file():
        return 0
    try:
        with renderings_path.open("rb") as renderings_file:
            renderings_tree = etree.parse(renderings_file)
    except etree.XMLSyntaxError:
        # Try forcing the encoding to UTF-8 during parsing
        with renderings_path.open("rb") as renderings_file:
            renderings_tree = etree.parse(renderings_file, parser=etree.XMLParser(encoding="utf-8"))

    rendering_elems: Dict[str, etree.Element] = {}
    for elem in renderings_tree.getroot().findall("TermRendering"):
        id = elem.get("Id")
        if id is None:
            continue
        id = escape_id(id)
        rendering_elems[id] = elem

    settings_tree = parse_project_settings(project_dir)
    iso = get_iso(settings_tree)
    project_name = settings_tree.getroot().findtext("Name", project_dir.name)
    terms_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting", "Major::BiblicalTerms.xml")
    # The setting has the form "<list type>:<project>:<file name>".
    list_type, terms_project, _ = terms_setting.split(":", maxsplit=2)
    list_name = list_type
    references: Dict[str, List[VerseRef]] = {}
    if list_type == "Project":
        if terms_project == project_name:
            references = extract_terms_list(list_type, output_dir, project_dir)
        else:
            extract_terms_list_from_renderings(project_dir.name, renderings_tree, output_dir)
        list_name = project_dir.name

    corpus: Dict[VerseRef, str] = {}
    if len(references) > 0:
        # Build a verse-reference -> text lookup, expanding "<range>" lines to
        # the text of the previous verse.
        prev_verse_str = ""
        for ref_str, verse_str in zip(load_corpus(SIL_NLP_ENV.assets_dir / "vref.txt"), load_corpus(corpus_filename)):
            if verse_str == "<range>":
                verse_str = prev_verse_str
            corpus[VerseRef.from_string(ref_str, ORIGINAL_VERSIFICATION)] = verse_str
            prev_verse_str = verse_str

    terms_metadata_path = get_terms_metadata_path(list_name, mt_terms_dir=output_dir)
    terms_renderings_path = output_dir / f"{iso}-{project_dir.name}-{list_type}-renderings.txt"
    count = 0
    with terms_renderings_path.open("w", encoding="utf-8", newline="\n") as terms_renderings_file:
        for line in load_corpus(terms_metadata_path):
            id, _, _ = line.split("\t", maxsplit=2)
            rendering_elem = rendering_elems.get(id)
            refs_list = references.get(id, [])
            renderings: Set[str] = set()
            if rendering_elem is not None and rendering_elem.get("Guess", "false") == "false":
                renderings_str = rendering_elem.findtext("Renderings", "")
                if renderings_str != "":
                    for rendering in renderings_str.strip().split("||"):
                        rendering = clean_term(rendering).strip()
                        if len(refs_list) > 0 and "*" in rendering:
                            # Convert the wildcard rendering to a regex ("**" matches
                            # zero or more whole words, "*" a partial word) and collect
                            # the surface forms found in the verses the term occurs in.
                            regex = (
                                re.escape(rendering)
                                .replace("\\ \\*\\*\\ ", "(?:\\ \\w+)*\\ ")
                                .replace("\\*", "\\w*")
                            )
                            for ref in refs_list:
                                verse_str = corpus.get(ref, "")
                                for match in re.finditer(regex, verse_str):
                                    surface_form = match.group()
                                    renderings.add(surface_form)
                        else:
                            rendering = rendering.replace("*", "").strip()
                            if rendering != "":
                                renderings.add(rendering)
            terms_renderings_file.write("\t".join(renderings) + "\n")
            if len(renderings) > 0:
                count += 1
    if count == 0:
        # No terms had any renderings, so remove the empty output files.
        terms_renderings_path.unlink()
        if list_type == "Project":
            terms_metadata_path.unlink()
            terms_glosses_path = get_terms_glosses_path(list_name, mt_terms_dir=output_dir)
            if terms_glosses_path.is_file():
                terms_glosses_path.unlink()
    return count

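# An illustrative sketch (sample renderings and verse text invented) of the
# wildcard conversion used for term renderings above: after re.escape, a
# free-standing "**" becomes "(?:\ \w+)*" (zero or more whole words) and "*"
# becomes "\w*" (the rest of a partial word).
def _example_wildcard_rendering_regex() -> None:
    rendering = "son ** God"
    regex = re.escape(rendering).replace("\\ \\*\\*\\ ", "(?:\\ \\w+)*\\ ").replace("\\*", "\\w*")
    match = re.search(regex, "the son of the living God spoke")
    print(match.group() if match else None)  # "son of the living God"

    rendering = "walk*"
    regex = re.escape(rendering).replace("\\*", "\\w*")
    print([m.group() for m in re.finditer(regex, "they were walking and walked")])
    # ['walking', 'walked']
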
def load_test_data(
    vref_file_name: str,
    src_file_name: str,
    pred_file_name: str,
    ref_pattern: str,
    output_file_name: str,
    ref_projects: Set[str],
    config: Config,
    books: Set[int],
    by_book: bool,
) -> Tuple[Dict[str, Tuple[List[str], List[List[str]]]], Dict[str, dict]]:
    dataset: Dict[str, Tuple[List[str], List[List[str]]]] = {}
    src_file_path = config.exp_dir / src_file_name
    pred_file_path = config.exp_dir / pred_file_name
    with src_file_path.open("r", encoding="utf-8") as src_file, pred_file_path.open(
        "r", encoding="utf-8"
    ) as pred_file, (config.exp_dir / output_file_name).open("w", encoding="utf-8") as out_file:
        ref_file_paths = list(config.exp_dir.glob(ref_pattern))
        select_rand_ref_line = False
        if len(ref_file_paths) > 1:
            if len(ref_projects) == 0:
                # no refs specified, so randomly select verses from all available train refs to build one ref
                select_rand_ref_line = True
                ref_file_paths = [p for p in ref_file_paths if config.is_train_project(p)]
            else:
                # use specified refs only
                ref_file_paths = [p for p in ref_file_paths if config.is_ref_project(ref_projects, p)]
        ref_files: List[IO] = []
        vref_file: Optional[IO] = None
        vref_file_path = config.exp_dir / vref_file_name
        if len(books) > 0 and vref_file_path.is_file():
            vref_file = vref_file_path.open("r", encoding="utf-8")
        try:
            for ref_file_path in ref_file_paths:
                ref_files.append(ref_file_path.open("r", encoding="utf-8"))
            default_trg_iso = config.default_trg_iso
            for lines in zip(src_file, pred_file, *ref_files):
                if vref_file is not None:
                    vref_line = vref_file.readline().strip()
                    if vref_line != "":
                        vref = VerseRef.from_string(vref_line, ORIGINAL_VERSIFICATION)
                        # Skip verses from books that are not being scored.
                        if vref.book_num not in books:
                            continue
                src_line = lines[0].strip()
                pred_line = lines[1].strip()
                detok_pred_line = decode_sp(pred_line)
                iso = default_trg_iso
                if src_line.startswith("<2"):
                    # The target language is encoded in a "<2<iso>>" prefix token.
                    index = src_line.index(">")
                    val = src_line[2:index]
                    if val != "qaa":
                        iso = val
                if iso not in dataset:
                    dataset[iso] = ([], [])
                sys, refs = dataset[iso]
                sys.append(detok_pred_line)
                if select_rand_ref_line:
                    ref_lines: List[str] = [line.strip() for line in lines[2:] if len(line.strip()) > 0]
                    ref_index = random.randint(0, len(ref_lines) - 1)
                    ref_line = ref_lines[ref_index]
                    if len(refs) == 0:
                        refs.append([])
                    refs[0].append(ref_line)
                else:
                    for ref_index in range(len(ref_files)):
                        ref_line = lines[ref_index + 2].strip()
                        if len(refs) == ref_index:
                            refs.append([])
                        refs[ref_index].append(ref_line)
                out_file.write(detok_pred_line + "\n")
            book_dict: Dict[str, dict] = {}
            if by_book:
                book_dict = process_individual_books(
                    src_file_path,
                    pred_file_path,
                    ref_file_paths,
                    vref_file_path,
                    default_trg_iso,
                    select_rand_ref_line,
                    books,
                )
        finally:
            if vref_file is not None:
                vref_file.close()
            for ref_file in ref_files:
                ref_file.close()
    return dataset, book_dict

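# A short sketch (invented inputs) of the target-language prefix convention
# that load_test_data and process_individual_books both parse: multilingual
# source lines start with a "<2<iso>>" token, and "qaa" (a private-use code)
# falls back to the default target iso.
def _example_parse_trg_iso(src_line: str, default_trg_iso: str) -> str:
    iso = default_trg_iso
    if src_line.startswith("<2"):
        index = src_line.index(">")
        val = src_line[2:index]
        if val != "qaa":
            iso = val
    return iso

# _example_parse_trg_iso("<2fr> bonjour", "en") -> "fr"
# _example_parse_trg_iso("<2qaa> text", "en")   -> "en"
# _example_parse_trg_iso("plain text", "en")    -> "en"
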
def process_individual_books(
    src_file_path: Path,
    pred_file_path: Path,
    ref_file_paths: List[Path],
    vref_file_path: Path,
    default_trg_iso: str,
    select_rand_ref_line: bool,
    books: Set[int],
) -> Dict[str, dict]:
    # Output data structure
    book_dict: Dict[str, dict] = {}
    ref_files = []
    try:
        # Get all references
        for ref_file_path in ref_file_paths:
            file = ref_file_path.open("r", encoding="utf-8")
            ref_files.append(file)
        with vref_file_path.open("r", encoding="utf-8") as vref_file, pred_file_path.open(
            "r", encoding="utf-8"
        ) as pred_file, src_file_path.open("r", encoding="utf-8") as src_file:
            for lines in zip(pred_file, vref_file, src_file, *ref_files):
                # Get file lines
                pred_line = lines[0].strip()
                detok_pred = decode_sp(pred_line)
                vref = lines[1].strip()
                src_line = lines[2].strip()
                # Get book
                if vref != "":
                    vref = VerseRef.from_string(vref.strip(), ORIGINAL_VERSIFICATION)
                    # Check if book in books
                    if vref.book_num in books:
                        # Get iso
                        book_iso = default_trg_iso
                        if src_line.startswith("<2"):
                            index = src_line.index(">")
                            val = src_line[2:index]
                            if val != "qaa":
                                book_iso = val
                        # If book not in dictionary, add the book
                        if vref.book not in book_dict:
                            book_dict[vref.book] = {}
                        if book_iso not in book_dict[vref.book]:
                            book_dict[vref.book][book_iso] = ([], [])
                        book_pred, book_refs = book_dict[vref.book][book_iso]
                        # Add detokenized prediction to nested dictionary
                        book_pred.append(detok_pred)
                        # Check if random ref line selected or not
                        if select_rand_ref_line:
                            ref_index = random.randint(0, len(ref_files) - 1)
                            ref_line = lines[ref_index + 3].strip()
                            if len(book_refs) == 0:
                                book_refs.append([])
                            book_refs[0].append(ref_line)
                        else:
                            # For each reference text, add to book_refs
                            for ref_index in range(len(ref_files)):
                                ref_line = lines[ref_index + 3].strip()
                                if len(book_refs) == ref_index:
                                    book_refs.append([])
                                book_refs[ref_index].append(ref_line)
    finally:
        for ref_file in ref_files:
            ref_file.close()
    return book_dict

def load_vrefs(vref_file_path: Path) -> List[VerseRef]:
    vrefs: List[VerseRef] = []
    for line in load_corpus(vref_file_path):
        vrefs.append(VerseRef.from_bbbcccvvv(int(line)))
    return vrefs

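# A tiny sketch (plain arithmetic, no library calls) of the BBBCCCVVV integer
# encoding that load_vrefs reads: three digits each for book, chapter, and
# verse, so "001002003" (stored as the integer 1002003) decodes to book 1
# (GEN), chapter 2, verse 3.
def _example_decode_bbbcccvvv(bbbcccvvv: int) -> Tuple[int, int, int]:
    book = bbbcccvvv // 1_000_000
    chapter = bbbcccvvv % 1_000_000 // 1_000
    verse = bbbcccvvv % 1_000
    return book, chapter, verse

# _example_decode_bbbcccvvv(1002003) -> (1, 2, 3)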