Ejemplo n.º 1
0
def get_preview_page(g_body_page, n_body_page, g_durchen_page, n_durchen_page):
    """Build the preview text of one page by merging its body and footnotes.

    The ``g_*`` and ``n_*`` pages are two versions of the same body page and
    of the same durchen (footnote) page; each page object must expose
    ``.content``, and ``g_body_page`` must also expose ``.vol``.

    Args:
        g_body_page: body page whose ``#`` markers are transferred onto the
            ``n_body_page`` content.
        n_body_page: body page that receives the ``#`` markers.
        g_durchen_page: durchen page passed as the second text to
            ``reconstruct_footnote``.
        n_durchen_page: durchen page passed as the first text to
            ``reconstruct_footnote``.

    Returns:
        The merged text returned by ``merge_footnotes_per_page``, or ``""``
        when the page has no footnotes.

    Raises:
        PageNumMissing: if the page number computed from the reconstructed
            body is not a key of the reconstructed footnotes.
    """
    g_body = g_body_page.content
    n_body = n_body_page.content
    g_durchen = g_durchen_page.content
    n_durchen = n_durchen_page.content
    vol_num = g_body_page.vol

    # Carry the "#" markers over to the other body text, then drop them from
    # the text they came from so only one side holds them.
    n_body = transfer(g_body, [["pedurma", "(#)"]], n_body, output="txt")
    g_body = g_body.replace("#", "")

    body_result = reconstruct_body(n_body, g_body, vol_num)
    footnotes = reconstruct_footnote(n_durchen, g_durchen, vol_num)

    pg_num = get_page_num(body_result, vol_num)
    if pg_num not in footnotes:
        raise PageNumMissing

    cur_pg_footnotes = footnotes[pg_num]
    if not cur_pg_footnotes:
        return ""

    # Only the merged text is needed; the first element of the pair is unused.
    _, merge = merge_footnotes_per_page(body_result, cur_pg_footnotes)
    return merge
def reformat_text_with_note(original_text, text_with_note):
    """Re-apply the line breaks and pagination of one text onto another.

    Line breaks and ``[<digits><letter>]`` pagination marks are first removed
    from ``text_with_note`` and then transferred onto it from
    ``original_text``.

    Args:
        original_text: text that provides the line breaks and pagination.
        text_with_note: text that receives them.

    Returns:
        The value returned by ``transfer`` with ``output='txt'``.
    """
    # Raw strings are used for the patterns containing ``\[`` / ``\]``: in a
    # normal string literal those are invalid escape sequences. The resulting
    # string values are unchanged.
    annotations = [['line_break', '(\n)'],
                   ['pagination', r'(\[[0-9]+[a-z]{1}\])']]
    text_with_note = rm_annotations(text_with_note,
                                    ['\n', r'\[[0-9]+[a-z]{1}\]'])
    original_text_with_note = transfer(original_text,
                                       annotations,
                                       text_with_note,
                                       output='txt')
    return original_text_with_note
Ejemplo n.º 3
0
def test_transfer_hfml_tags():
    """Transfer the tags of three annotated layers onto one base, in order.

    Each layer carries a tag on a different line; after all three transfers
    the base is expected to carry every tag around its own (shorter or
    longer) line content.
    """
    annotated_layers = (
        "<񉏠k1ཀཀཀཀ>\n ཁཁཁཁ",
        "ཀཀཀཀ\n <񉏠auཁཁཁཁ>",
        "ཀཀཀཀ\n ཁཁཁཁ\n <񉏠gགགགg>",
    )
    merged = "ཀཀཀ\n ཁཁཁ\n གགགག"

    # The output of each transfer becomes the target of the next one.
    for annotated in annotated_layers:
        merged = transfer(annotated, HFML_ANN_PATTERN, merged, "txt")

    assert merged == "<񉏠k1ཀཀཀ>\n <񉏠auཁཁཁ>\n <񉏠gགགགགg>"
def text_with_google_line_break(text, g_text):
    """Replace the line breaks and pagination of ``text`` with those of ``g_text``.

    ``text`` is first stripped of the patterns listed in ``g_annotations``
    (line breaks, pagination marks, ``[word.digits]`` marks and ``{...}``
    marks); line breaks and pagination are then transferred from ``g_text``.

    Args:
        text: text to be cleaned and re-broken.
        g_text: text that provides the line breaks and pagination.

    Returns:
        The value returned by ``transfer`` with ``output='txt'``.
    """
    # Raw strings are used wherever the pattern contains a backslash escape
    # that Python itself does not define (``\[``, ``\w``, ``\.``, ``\d``,
    # ``\{``); the string values are identical to the non-raw originals.
    annotations = [['line_break', '(\n)'],
                   ['pagination', r'(\[[𰵀-󴉱]?[0-9]+[a-z]{1}\])']]
    g_annotations = [
        '\n', r'\[[𰵀-󴉱]?[0-9]+[a-z]{1}\]', r'\[\w+\.\d+\]', r'\{([𰵀-󴉱])?\w+\}'
    ]
    clean_text = rm_annotations(text, g_annotations)
    text_with_google_linebreak = transfer(g_text,
                                          annotations,
                                          clean_text,
                                          output='txt')
    return text_with_google_linebreak
Ejemplo n.º 5
0
    def _merge_layers_for_vol(self, base_vol_fn):
        """Merge all the layers of a volume.

        Starting from the text of ``base_vol_fn``, the annotations of every
        layer in ``self.layers[1:]`` that has a file with the same name are
        transferred onto it in order. The result is written to
        ``self.merged_layers_path`` under that same file name.

        Args:
            base_vol_fn: path object of the base layer's volume file.
        """
        vol_fn = base_vol_fn.name
        # Encoding is given explicitly so the result does not depend on the
        # platform's locale default.
        base_layer = base_vol_fn.read_text(encoding="utf-8")
        for ann_layer_name in self.layers[1:]:
            ann_layer_vol_fn = self.layers_path / ann_layer_name / vol_fn
            if not ann_layer_vol_fn.is_file():
                # This layer has no file for the volume; nothing to merge.
                continue
            ann_layer = ann_layer_vol_fn.read_text(encoding="utf-8")
            base_layer = transfer(ann_layer, HFML_ANN_PATTERN, base_layer, "txt")

        merged_layers_fn = self.merged_layers_path / vol_fn
        # Doubled closing brackets left in the merged text are collapsed.
        merged_layers_fn.write_text(base_layer.replace(">>", ">"),
                                    encoding="utf-8")
Ejemplo n.º 6
0
    def _update_pars(source, target):
        """Apply the paragraph breaks of ``source`` to ``target`` and tidy up.

        Existing blank-line breaks in ``target`` are flattened to spaces, the
        ``\\n\\n`` breaks of ``source`` are transferred, and a fixed sequence
        of substitutions then repairs spacing around punctuation, slashes,
        closing quotes and dashes. The order of the substitutions matters.
        """
        flattened = target.replace('\n\n', ' ')
        text = transfer(source, [["pars", "(\n\n)"]], flattened, "txt")

        # reinserting spaces where needed: punctuation directly followed by a
        # character that is neither whitespace nor punctuation
        text = re.sub(r'([!?”:;…,.»"]+?)([^ \f\v\u202f\u00a0\n!?”:;…,.»"])', r'\1 \2', text)
        # a slash opening a paragraph goes back to the end of the previous one
        text = re.sub(r'\n\n/ +', '/\n\n', text)
        # "/ /" before a break: one slash closes, the other opens the next paragraph
        text = re.sub(r'/ /\n\n([^\n])', r'/\n\n/\1', text)
        # (disabled) text = text.replace(' /', '/')
        # a closing quote opening a paragraph goes back before the break
        text = re.sub(r'\n\n” ', '”\n\n', text)
        # drop a single space on either side of a newline
        text = text.replace('\n ', '\n')
        text = text.replace(' \n', '\n')
        # a dash following punctuation starts on its own line
        text = re.sub(r'([!?”:;…,.»"]+?) —', r'\1\n—', text)
        # collapse a triple newline into a paragraph break (single pass)
        text = text.replace('\n\n\n', '\n\n')

        return text
Ejemplo n.º 7
0
def put_derge_line_break(preview_text, derge_text):
    """Transfer derge line breaks and page marks onto the collation text.

    Args:
        preview_text: mapping whose values are texts; ``<p...>`` tags are
            stripped from each value and the results are concatenated in
            iteration order.
        derge_text: mapping whose values are texts; concatenated in iteration
            order to form the source of the line breaks and page marks.

    Returns:
        The value returned by ``transfer`` with ``output="txt"``.
    """
    # Only the values are used, and join avoids repeated string concatenation.
    collation_text = ''.join(
        re.sub('<p.+?>', '', text) for text in preview_text.values())
    full_derge_text = ''.join(derge_text.values())

    # Remove the collation text's own line breaks before applying derge's.
    collation_text = rm_ann(collation_text, [r"\n"])
    collation_text_with_derge_linebr = transfer(
        full_derge_text,
        [["linebreak", r"(\n)"], ["pg_ann", r"(\[[𰵀-󴉱]?[0-9]+[a-z]{1}\])"]],
        collation_text,
        output="txt",
    )
    return collation_text_with_derge_linebr
Ejemplo n.º 8
0
def get_derge_google_text(derge_hfml, google_hfml):
    """Combine derge text with google layout, page by page.

    The derge text is stripped of line breaks, ``[word.digits]`` marks and
    page marks, then receives the line breaks and page marks of
    ``google_hfml``. Both texts are split into pages and paired up; for each
    pair the google page is kept when ``is_note_page`` says so, otherwise the
    re-broken derge page is kept.

    Returns:
        The concatenation of the selected pages.
    """
    stripped_derge = rm_ann(
        derge_hfml,
        [r"\n", r"\[\w+\.\d+\]", r"\[[𰵀-󴉱]?[0-9]+[a-z]{1}\]"],
    )
    layout_patterns = [
        ["linebreak", r"(\n)"],
        ["pg_ann", r"(\[[𰵀-󴉱]?[0-9]+[a-z]{1}\])"],
    ]
    dg_body = transfer(google_hfml, layout_patterns, stripped_derge,
                       output="txt")

    dg_pages = get_pages(dg_body)
    g_pages = get_pages(google_hfml)
    # zip stops at the shorter of the two page lists.
    return "".join(
        g_page if is_note_page(g_page, dg_page) else dg_page
        for g_page, dg_page in zip(g_pages, dg_pages)
    )
Ejemplo n.º 9
0
def reconstruct_footnote(namsel_footnote, google_footnote, vol_num):
    """Reconstruct footnotes from the namsel and google footnote texts.

    Diffs are computed by transferring the markers, page references and
    pedurma page tags of ``namsel_footnote`` onto ``google_footnote``; the
    diffs are then filtered, formatted, reformatted and post-processed.

    Returns:
        The value returned by ``postprocess_footnotes``.
    """
    marker_patterns = [
        ["marker", "(<u.+?>)"],
        ["marker", "([①-⑩])"],
        ["pg_ref", "(<r.+?>)"],
        ["pedurma-page", "(<p.+?>)"],
    ]
    print("Calculating diffs..")
    raw_diffs = transfer(namsel_footnote,
                         marker_patterns,
                         google_footnote,
                         output="diff")
    # Each diff is turned into a list before filtering.
    diff_rows = [list(diff) for diff in raw_diffs]
    kept_diffs = filter_footnotes_diffs(diff_rows, vol_num)
    footnote_text = format_diff(kept_diffs, vol_num, type_="footnotes")
    return postprocess_footnotes(reformat_footnotes(footnote_text), vol_num)
 def _update_pars(source, target):
     """Transfer the triple-newline breaks of ``source`` onto ``target``."""
     with_breaks = transfer(source, [["pars", "(\n\n\n)"]], target, "txt")
     # hack for a strange behaviour: a run of four newlines is cut back to three
     return with_breaks.replace('\n\n\n\n', '\n\n\n')
Ejemplo n.º 11
0
def test_ann_transfer_optimized():
    """Run ``transfer`` once on the helper-provided inputs; nothing is asserted."""
    source = get_source()
    pattern = get_pattern()
    target = get_target()
    transfer(source, pattern, target, "txt")
Ejemplo n.º 12
0
def test_ann_transfer_optimized(source_text, target_text, annotation_patterns,
                                expected):
    """Check that transferring annotations yields the expected text."""
    assert transfer(source_text, annotation_patterns, target_text,
                    "txt") == expected
Ejemplo n.º 13
0
def flow(vol_path, source_path, target_path, text_type, image_info):
    """Reconstruct the body or the footnotes of a volume and write the result.

        - diff is computed between the namsel (source) and google (target) text
        - footnotes and footnotes markers are filtered from diffs
        - they are applied to the namsel text with markers
        - image links are computed and added at the end of each page

    Diffs, filtered diffs and (for footnotes) the final footnotes are dumped
    through ``to_yaml`` with ``vol_path / text_type`` as the directory, and
    the resulting text is written to ``result.txt`` in that directory.

    Args:
        vol_path (path): directory of the volume; outputs go to
            ``vol_path / text_type``
        source_path (path): path of the namsel text
        target_path (path): path of the google text
        text_type (str): type of text, either "body" or "footnotes"; any
            other value only prints "Type not found"
        image_info (list): Contains work_id, volume number and source image offset
    """
    volume_no = image_info[1]
    namsel_text = source_path.read_text(encoding="utf-8")
    google_text = target_path.read_text(encoding="utf-8")

    # to_yaml specialised for each kind of payload that gets dumped.
    diffs_to_yaml = partial(to_yaml, type_="diffs")
    filtered_diffs_to_yaml = partial(to_yaml, type_="filtered_diffs")
    footnotes_to_yaml = partial(to_yaml, type_="footnotes")

    dir_path = vol_path / text_type

    # NOTE(review): the filter/format steps below read these files, so
    # to_yaml is presumed to write them under dir_path - confirm.
    diffs_yaml_path = dir_path / "diffs.yaml"
    filtered_diffs_yaml_path = dir_path / "filtered_diffs.yaml"

    if text_type == "body":
        # NOTE: transferring the google "#" markers onto the namsel text used
        # to happen here and is currently disabled.
        print("Calculating diffs...")
        diffs = get_diffs(namsel_text, google_text)
        diffs_list = list(map(list, diffs))
        diffs_to_yaml(diffs_list, dir_path)
        print("Filtering diffs...")
        filtered_diffs = filter_diffs(diffs_yaml_path, "body", image_info)
        filtered_diffs_to_yaml(filtered_diffs, dir_path)
        new_text = format_diff(filtered_diffs_yaml_path,
                               image_info,
                               type_="body")
        new_text = reformatting_body(new_text)
        (dir_path / "result.txt").write_text(new_text, encoding="utf-8")

    elif text_type == "footnotes":
        annotations = [
            ["marker", "(<m.+?>)"],
            ["marker", "([①-⑩])"],
            ["pg_ref", "(<r.+?>)"],
            ["pedurma_page", "(<p.+?>)"],
        ]
        google_text = rm_google_ocr_header(google_text)
        clean_google_text = preprocess_google_notes(google_text)
        clean_namsel_text = preprocess_namsel_notes(namsel_text)
        print("Calculating diffs..")
        diffs = transfer(clean_namsel_text, annotations, clean_google_text)
        diffs_list = list(map(list, diffs))
        diffs_to_yaml(diffs_list, dir_path)
        filtered_diffs = filter_footnotes_diffs(diffs_yaml_path, volume_no)
        filtered_diffs_to_yaml(filtered_diffs, dir_path)
        new_text = format_diff(filtered_diffs_yaml_path,
                               image_info,
                               type_="footnotes")
        reformatted_footnotes = reformat_footnotes(new_text)
        formatted_yaml = postprocess_footnotes(reformatted_footnotes)
        footnotes_to_yaml(formatted_yaml, dir_path)
        (dir_path / "result.txt").write_text(reformatted_footnotes,
                                             encoding="utf-8")
    else:
        print("Type not found")
    print("Done")