def test_tofu_id(self):
    """Smoke-test local-id bookkeeping over the tofu-id fixture layers."""
    hfml_formatter = HFMLFormatter()
    fixture_dir = Path("tests/data/formatter/hfml/tofu-id")
    hfml_formatter.dirs = {"layers_path": fixture_dir}
    layer_names = [layer_fn.stem for layer_fn in fixture_dir.iterdir()]
    previous_layers = hfml_formatter.get_old_layers(layer_names)
    id_manager = LocalIdManager(previous_layers)
    id_manager.add("tsawa", "v001", 1231232)
Esempio n. 2
0
def create_pecha():
    """Create a new pecha from the submitted form and redirect to its editor.

    Builds source metadata from the current request, saves the submitted
    text, registers the pecha with the catalog, and derives the pecha id
    from the catalog's last assigned id.
    """
    # NOTE(review): helper name contains a typo ("soruce"); it must match
    # the helper's actual definition elsewhere, so it is left unchanged.
    source_metadata = create_soruce_metadata(request)
    text_path = save_text(f"{source_metadata['id']}/text",
                          request.form.get("content-text"))
    # Annotation layers the editor supports; "all" selects every layer.
    layers = [
        "book-title",
        "chapter-title",
        "author",
        "citation",  # bug fix: was misspelled "cition" (see AnnType.citation)
        "sabche",
        "root-verse",
        "foot-note",
        "all",
    ]
    catalog = CatalogManager(
        formatter=HFMLFormatter(metadata={
            "source_metadata": source_metadata,
            "layers": layers
        }),
        layers=layers,
        token=current_app.config["GITHUB_TOKEN"],
    )
    catalog.add_hfml_item(text_path)
    catalog.update()
    # Pecha ids are "P" + zero-padded six-digit catalog id, e.g. P000042.
    pecha_id = f"P{catalog.last_id:06}"
    return redirect(url_for("main.editor", pecha_id=pecha_id))
Esempio n. 3
0
def format(**kwargs):
    '''
    Command to format pecha into opf.

    Expects kwargs['name'] ('ocr' or 'hfml') selecting the formatter, and
    kwargs['input_path'] pointing at the input pecha. Unknown names are a
    no-op, matching the original if/elif chain.
    '''
    # NOTE: shadows the builtin `format`; the name is kept because it is
    # part of the CLI-facing interface.
    formatters = {
        'ocr': GoogleOCRFormatter,
        'hfml': HFMLFormatter,
    }
    formatter_cls = formatters.get(kwargs['name'])
    if formatter_cls is not None:
        formatter = formatter_cls()
        formatter.new_poti(kwargs['input_path'])
Esempio n. 4
0
    def test_build_layers(self):
        """Feeding three kangyur volumes through build_layers must reproduce
        the known span layout for every extracted layer."""
        formatter = HFMLFormatter()
        volume_files = [
            'tests/data/formatter/hfml/kangyur_01.txt',
            'tests/data/formatter/hfml/kangyur_02.txt',
            'tests/data/formatter/hfml/kangyur_03.txt',
        ]
        preprocessed = [
            formatter.text_preprocess(Path(fn).read_text())
            for fn in volume_files
        ]
        for volume_text in preprocessed:
            formatter.build_layers(volume_text, len(preprocessed))

        result = formatter.get_result()

        # Known-good spans per layer, one sub-list per volume.
        expected_result = {
            'page': [
                [(0, 24, 'kk', '1a'), (27, 676, 'kl', '1b'), (679, 2173, 'lm', '2a')],
                [(0, 0, 'kk', '1a'), (0, 266, '', '1b')],
                [(0, 266, 'ko', '1a')],
            ],
            'topic': [
                [(0, 2046, 1, 'T1')],
                [(2046, 2173, 1, 't2')],
                [(0, 266, 2, 'T2'), (0, 26, 3, 'T2')],
                [(26, 243, 3, 'T3')],
                [(243, 266, 3, 't4')],
            ],
            'sub_topic': [
                [[(0, 1352, 1, 'T1-1')], [(1353, 1496, 1, 'T1-2')], [(1497, 2046, 1, 'T1-6')]],
                [[]],
                [[(0, 140, 2, 'T1-8')], [(141, 266, 2, 'T1-9'), (0, 26, 3, 'T1-9')]],
                [[]],
                [[]],
            ],
            'correction': [[(1838, 1843, 'མཆིའོ་')], [], []],
            'error_candidate': [[(2040, 2042), (2044, 2045)], [], []],
            'peydurma': [[1518, 1624, 1938], [], []],
        }

        for layer in result:
            assert result[layer] == expected_result[layer]
    def __init__(self, pecha_id, oauth_token, layers="publication", format_=".epub"):
        """Set up auth headers, layer selection, export paths and parser.

        Args:
            pecha_id: id of the pecha to export.
            oauth_token: GitHub token used in API request headers.
            layers: extra annotation layer name(s) on top of the defaults;
                a single name or a list of names.
            format_: export format extension (e.g. ".epub").
        """
        self.oauth_token = oauth_token
        self.headers = {"Authorization": f"token {self.oauth_token}"}

        self.pecha_id = pecha_id
        self.base_layer_name = "BaseText"
        default_layers = [
            self.base_layer_name,
            AnnType.book_title,
            AnnType.poti_title,
            AnnType.author,
            AnnType.chapter,
        ]
        # Bug fix: the default `layers` is the plain string "publication";
        # `list + str` raised TypeError. Accept a single name or a list.
        if isinstance(layers, str):
            layers = [layers]
        self.layers = default_layers + layers
        self.format_ = format_

        self._prepare_paths()

        self.parser = HFMLFormatter(output_path=self.base_path)
        self.serializer = None

        # repo, path-in-repo, git ref (branch or layer name)
        self.content_url_template = (
            "https://api.github.com/repos/OpenPecha/{}/contents/{}?ref={}"
        )
Esempio n. 6
0
    def test_get_base_text(self):
        """Base text extracted from kangyur_01 must match the stored baseline."""
        formatter = HFMLFormatter()
        raw_text = Path('tests/data/formatter/hfml/kangyur_01.txt').read_text()
        cleaned = formatter.text_preprocess(raw_text)
        formatter.build_layers(cleaned, 1)

        actual = formatter.get_base_text()
        baseline = Path('tests/data/formatter/hfml/kangyur_base.txt').read_text()

        assert actual == baseline
class PechaExporter:
    """This class exports pecha into specified format with selected layers."""

    def __init__(self, pecha_id, oauth_token, layers="publication", format_=".epub"):
        """Set up auth headers, layer selection, export paths and parser.

        Args:
            pecha_id: id of the pecha to export.
            oauth_token: GitHub token used for API requests and releases.
            layers: extra annotation layer name(s) on top of the defaults;
                a single name or a list of names.
            format_: export format extension; ".epub" → EpubSerializer,
                anything else → HFMLSerializer.
        """
        self.oauth_token = oauth_token
        self.headers = {"Authorization": f"token {self.oauth_token}"}

        self.pecha_id = pecha_id
        self.base_layer_name = "BaseText"
        default_layers = [
            self.base_layer_name,
            AnnType.book_title,
            AnnType.poti_title,
            AnnType.author,
            AnnType.chapter,
        ]
        # Bug fix: the default `layers` is the plain string "publication";
        # `list + str` raised TypeError. Accept a single name or a list.
        if isinstance(layers, str):
            layers = [layers]
        self.layers = default_layers + layers
        self.format_ = format_

        self._prepare_paths()

        self.parser = HFMLFormatter(output_path=self.base_path)
        self.serializer = None

        # repo, path-in-repo, git ref (branch or layer name)
        self.content_url_template = (
            "https://api.github.com/repos/OpenPecha/{}/contents/{}?ref={}"
        )

    def _prepare_paths(self):
        """Create a clean working tree under /tmp/openpecha/<pecha_id>."""
        self.base_path = Path("/tmp") / "openpecha"
        self.pecha_path = self.base_path / self.pecha_id
        # Start from scratch if a previous export left files behind.
        if self.pecha_path.is_dir():
            self.clean()
        self.pecha_path.mkdir(exist_ok=True, parents=True)

        self.layers_path = self.pecha_path / "layers"
        self.layers_path.mkdir(exist_ok=True, parents=True)
        self.merged_layers_path = self.pecha_path / "merged_layers"
        self.merged_layers_path.mkdir(exist_ok=True, parents=True)
        self.exports_path = self.pecha_path / "exports"
        self.exports_path.mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _get_serializer(format_, **kwargs):
        """Return the serializer matching `format_` (epub or HFML fallback)."""
        if format_ == ".epub":
            return EpubSerializer(**kwargs)
        else:
            return HFMLSerializer(**kwargs)

    def get_response_json(self, url, headers=None):
        """GET `url` and return decoded JSON, or [] on any non-200 reply.

        Bug fix: `headers` previously used a mutable default (`{}`) and
        was silently ignored; it now defaults to None and, when provided,
        overrides the instance auth headers. Existing call sites never
        pass it, so their behavior is unchanged.
        """
        r = requests.get(url, headers=headers or self.headers)
        if r.status_code != 200:
            return []
        return r.json()

    def _get_layers_git_urls(self):
        """Yield (layer, filename, git_url) for each file of each layer branch."""
        for layer in self.layers:
            files = self.get_response_json(
                self.content_url_template.format(self.pecha_id, "", layer)
            )
            for file in files:
                yield layer, file["name"], file["git_url"]

    def _get_base64_content(self, git_url):
        """Fetch a git blob and return its base64-decoded text content."""
        data = self.get_response_json(git_url)
        return base64.b64decode(data["content"]).decode("utf-8")

    def download_layers(self):
        """Download layers."""
        for layer, fn, git_url in self._get_layers_git_urls():
            layer_path = self.layers_path / layer
            layer_path.mkdir(exist_ok=True)
            out_fn = layer_path / fn
            content = self._get_base64_content(git_url)
            out_fn.write_text(content)

    def _download_github_dir(self, items):
        """Recursively download a GitHub contents listing into pecha_path."""
        for item in items:
            if item["type"] == "file":
                out_fn = self.pecha_path / item["path"]
                download_file(item["download_url"], out_fn)
            else:
                # Clarity fix: the recursion used to rebind `items`,
                # shadowing the sequence being iterated (harmless, since the
                # loop iterator holds the original list, but confusing).
                sub_items = self.get_response_json(item["url"])
                self._download_github_dir(sub_items)

    def download_assets(self):
        """Download all assets of pecha."""
        asset_path = f"{self.pecha_id}.opf/asset"
        asset_url = self.content_url_template.format(
            self.pecha_id, asset_path, "master"
        )
        items = self.get_response_json(asset_url)
        self._download_github_dir(items)

    def download_metadata(self):
        """Download the pecha's meta.yml next to the rest of the export tree."""
        meta_path = f"{self.pecha_id}.opf/meta.yml"
        meta_url = self.content_url_template.format(self.pecha_id, meta_path, "master")
        # NOTE(review): get_response_json returns [] on failure, which would
        # raise here on indexing — confirm whether a guard is wanted.
        meta = self.get_response_json(meta_url)
        out_fn = self.pecha_path / meta["path"]
        download_file(meta["download_url"], out_fn)

    def _merge_layers_for_vol(self, base_vol_fn):
        """Merge all the layers of a volume."""
        base_layer = base_vol_fn.read_text()
        vol_fn = base_vol_fn.name
        # self.layers[0] is the base text itself; transfer each annotation
        # layer's markup onto the base.
        for ann_layer_name in self.layers[1:]:
            ann_layer_vol_fn = self.layers_path / ann_layer_name / vol_fn
            if not ann_layer_vol_fn.is_file():
                continue
            ann_layer = ann_layer_vol_fn.read_text()
            base_layer = transfer(ann_layer, HFML_ANN_PATTERN, base_layer, "txt")

        merged_layers_fn = self.merged_layers_path / vol_fn
        # Collapse doubled closers produced by stacked annotations.
        merged_layers_fn.write_text(base_layer.replace(">>", ">"))

    def merge_layers(self):
        """Merge annotations into the base text, volume by volume."""
        for base_vol_fn in (self.layers_path / self.base_layer_name).iterdir():
            self._merge_layers_for_vol(base_vol_fn)

    def parse(self):
        """Parser layers into opf."""
        self.parser.create_opf(self.merged_layers_path, id_=self.pecha_id)

    def serialize(self):
        """Serialize the opf into given format."""
        serializer = self._get_serializer(
            self.format_, opf_path=self.parser.dirs["opf_path"]
        )
        serializer.apply_layers()
        exported_fn = serializer.serialize(output_path=self.exports_path)
        return exported_fn

    def create_pre_release(self):
        """Create pre-release and return the asset link."""
        download_url = create_release(
            self.pecha_id,
            prerelease=True,
            assets_path=list(self.exports_path.iterdir()),
            token=self.oauth_token,
        )
        return download_url

    def clean(self):
        """Remove downloaded layers, hfml file, opf and exported file."""
        shutil.rmtree(str(self.pecha_path))

    def export(self):
        """Run the full pipeline: download, merge, parse, serialize, release."""
        self.download_layers()
        self.merge_layers()
        self.parse()
        self.download_assets()
        self.download_metadata()
        self.serialize()
        asset_download_url = self.create_pre_release()
        return asset_download_url
Esempio n. 8
0
import os
from pathlib import Path

from openpecha.formatters import HFMLFormatter
from openpecha.serializers import HFMLSerializer

if __name__ == "__main__":
    pecha_id = "P000002"
    hfml_input = "./tests/data/serialize/hfml/P000002/"
    opfs_output = Path("./output/opfs")
    opf_input = "./tests/data/serialize/hfml/P000002.opf/"
    hfml_output = "./output/"

    # HFML -> OPF
    hfml_formatter = HFMLFormatter(output_path=opfs_output)
    hfml_formatter.create_opf(hfml_input, pecha_id)

    # OPF -> HFML (single opf; a text_list loop could batch this)
    hfml_serializer = HFMLSerializer(opf_input)
    hfml_serializer.serialize(output_path=hfml_output)
    def test_build_layers(self):
        """Feed three kangyur volumes through build_layers and compare the
        accumulated result against known-good annotations for every layer.

        Each expected entry is a list with one sub-list per volume; each
        annotation is a (local_id, payload) pair where payload carries a
        "span" (and, per layer type, extra fields such as page_index or
        work_id).
        """
        m_text1 = Path("tests/data/formatter/hfml/kangyur_01.txt").read_text()
        m_text2 = Path("tests/data/formatter/hfml/kangyur_02.txt").read_text()
        m_text3 = Path("tests/data/formatter/hfml/kangyur_03.txt").read_text()
        formatter = HFMLFormatter()

        text1 = formatter.text_preprocess(m_text1)
        text2 = formatter.text_preprocess(m_text2)
        text3 = formatter.text_preprocess(m_text3)
        texts = [text1, text2, text3]
        # build_layers accumulates state inside the formatter; the final
        # result is read back via get_result() below.
        for text in texts:
            result = formatter.build_layers(text, len(texts))

        result = formatter.get_result()
        # Known-good annotations keyed by layer type, one sub-list per volume.
        expected_result = {
            AnnType.book_title: [[], [], []],
            AnnType.book_number: [[], [], []],
            AnnType.author: [[], [], []],
            AnnType.poti_title: [
                [(None, {
                    "span": {
                        "start": 0,
                        "end": 24
                    }
                })],
                [(None, {
                    "span": {
                        "start": 0,
                        "end": 24
                    }
                })],
                [(None, {
                    "span": {
                        "start": 0,
                        "end": 24
                    }
                })],
            ],
            AnnType.chapter: [[(None, {
                "span": {
                    "start": 98,
                    "end": 125
                }
            })], [], []],
            AnnType.citation: [
                [],
                [
                    (1000020, {
                        "span": {
                            "start": 164,
                            "end": 202
                        }
                    }),
                    (1000021, {
                        "span": {
                            "start": 204,
                            "end": 241
                        }
                    }),
                ],
                [(1000024, {
                    "span": {
                        "start": 97,
                        "end": 162
                    }
                })],
            ],
            AnnType.pagination: [
                [
                    (
                        1000000,
                        {
                            "page_index": "1a",
                            "page_info": "kk",
                            "reference": None,
                            "span": {
                                "start": 0,
                                "end": 24
                            },
                        },
                    ),
                    (
                        1000001,
                        {
                            "page_index": "1b",
                            "page_info": "kl",
                            "reference": None,
                            "span": {
                                "start": 27,
                                "end": 676
                            },
                        },
                    ),
                    (
                        1000027,
                        {
                            "page_index": "2a",
                            "page_info": "lm",
                            "reference": None,
                            "span": {
                                "start": 679,
                                "end": 2173
                            },
                        },
                    ),
                ],
                [
                    (
                        1000015,
                        {
                            "page_index": "1a",
                            "page_info": "kk",
                            "reference": None,
                            "span": {
                                "start": 0,
                                "end": 0
                            },
                        },
                    ),
                    (
                        1000016,
                        {
                            "page_index": "1b",
                            "page_info": "",
                            "reference": None,
                            "span": {
                                "start": 0,
                                "end": 266
                            },
                        },
                    ),
                ],
                [(
                    1000022,
                    {
                        "page_index": "1a",
                        "page_info": "ko",
                        "reference": None,
                        "span": {
                            "start": 0,
                            "end": 266
                        },
                    },
                )],
            ],
            AnnType.topic: [
                [(
                    1000002,
                    {
                        "work_id": "T1",
                        "span": {
                            "vol": 1,
                            "start": 27,
                            "end": 2046
                        }
                    },
                )],
                [(
                    1000014,
                    {
                        "work_id": "t2",
                        "span": {
                            "vol": 1,
                            "start": 2046,
                            "end": 2173
                        },
                    },
                )],
                [(
                    1000017,
                    {
                        "work_id": "T2",
                        "span": {
                            "vol": 2,
                            "start": 26,
                            "end": 266
                        }
                    },
                )],
                [(
                    1000023,
                    {
                        "work_id": "T3",
                        "span": {
                            "vol": 3,
                            "start": 26,
                            "end": 243
                        }
                    },
                )],
                [(
                    1000026,
                    {
                        "work_id": "t4",
                        "span": {
                            "vol": 3,
                            "start": 243,
                            "end": 266
                        }
                    },
                )],
            ],
            AnnType.sub_topic: [
                [
                    [(
                        1000003,
                        {
                            "work_id": "T1-1",
                            "span": {
                                "vol": 1,
                                "start": 27,
                                "end": 1352
                            },
                        },
                    )],
                    [(
                        1000005,
                        {
                            "work_id": "T1-2",
                            "span": {
                                "vol": 1,
                                "start": 1352,
                                "end": 1496
                            },
                        },
                    )],
                    [(
                        1000006,
                        {
                            "work_id": "T1-6",
                            "span": {
                                "vol": 1,
                                "start": 1496,
                                "end": 2046
                            },
                        },
                    )],
                ],
                [[]],
                [
                    [(
                        1000018,
                        {
                            "work_id": "T1-8",
                            "span": {
                                "vol": 2,
                                "start": 26,
                                "end": 140
                            },
                        },
                    )],
                    [(
                        1000019,
                        {
                            "work_id": "T1-9",
                            "span": {
                                "vol": 2,
                                "start": 140,
                                "end": 266
                            },
                        },
                    )],
                ],
                [[]],
                [[]],
            ],
            AnnType.sabche: [
                [(1000008, {
                    "span": {
                        "start": 1548,
                        "end": 1936
                    }
                })],
                [],
                [],
            ],
            AnnType.tsawa: [[(1000004, {
                "span": {
                    "start": 420,
                    "end": 739
                }
            })], [], []],
            AnnType.yigchung: [
                [],
                [],
                [(1000025, {
                    "span": {
                        "start": 164,
                        "end": 241
                    }
                })],
            ],
            AnnType.correction: [
                [(
                    1000010,
                    {
                        "correction": "མཆིའོ་",
                        "span": {
                            "start": 1838,
                            "end": 1843
                        }
                    },
                )],
                [],
                [],
            ],
            AnnType.error_candidate: [
                [
                    (1000012, {
                        "span": {
                            "start": 2040,
                            "end": 2042
                        }
                    }),
                    (1000013, {
                        "span": {
                            "start": 2044,
                            "end": 2045
                        }
                    }),
                ],
                [],
                [],
            ],
            AnnType.peydurma: [
                [
                    (1000007, {
                        "span": {
                            "start": 1518,
                            "end": 1518
                        }
                    }),
                    (1000009, {
                        "span": {
                            "start": 1624,
                            "end": 1624
                        }
                    }),
                    (1000011, {
                        "span": {
                            "start": 1938,
                            "end": 1938
                        }
                    }),
                ],
                [],
                [],
            ],
            AnnType.archaic: [[], [], []],
            AnnType.durchen: [[], [], []],
        }

        # Compare every layer the formatter produced against the baseline.
        for layer in result:
            assert result[layer] == expected_result[layer]
from openpecha.formatters import HFMLFormatter

# Convert the user's HFML publication into an OPF pecha.
hfml_source = "../openpecha-user/publication/P000001-test"
opf_destination = "../openpecha-user/opfs"

opf_formatter = HFMLFormatter(output_path=opf_destination)
opf_formatter.create_opf(hfml_source)