def test_tofu_id(self):
    """Smoke-test LocalIdManager construction from existing HFML layers."""
    formatter = HFMLFormatter()
    formatter.dirs = {}
    formatter.dirs["layers_path"] = Path("tests/data/formatter/hfml/tofu-id")
    # One layer name per file in the fixture directory.
    layer_names = [fn.stem for fn in formatter.dirs["layers_path"].iterdir()]
    existing_layers = formatter.get_old_layers(layer_names)
    id_manager = LocalIdManager(existing_layers)
    id_manager.add("tsawa", "v001", 1231232)
def create_pecha():
    """Create a new pecha from the submitted form text and open it in the editor."""
    source_metadata = create_soruce_metadata(request)  # NOTE(review): helper name looks misspelled ("soruce") — defined elsewhere
    content = request.form.get("content-text")
    text_path = save_text(f"{source_metadata['id']}/text", content)
    # Annotation layers to extract; "cition" is kept as-is.
    # NOTE(review): "cition" looks like a typo for "citation" — confirm against the formatter's layer names.
    layers = [
        "book-title",
        "chapter-title",
        "author",
        "cition",
        "sabche",
        "root-verse",
        "foot-note",
        "all",
    ]
    hfml_formatter = HFMLFormatter(
        metadata={"source_metadata": source_metadata, "layers": layers}
    )
    catalog = CatalogManager(
        formatter=hfml_formatter,
        layers=layers,
        token=current_app.config["GITHUB_TOKEN"],
    )
    catalog.add_hfml_item(text_path)
    catalog.update()
    # Pecha ids are "P" plus the catalog's last id, zero-padded to six digits.
    pecha_id = f"P{catalog.last_id:06}"
    return redirect(url_for("main.editor", pecha_id=pecha_id))
def format(**kwargs):
    """Command to format pecha into opf.

    Expects kwargs['name'] ('ocr' or 'hfml' — anything else is a no-op,
    matching the original behavior) and kwargs['input_path'].
    NOTE: the name `format` shadows the builtin, but it is kept because it is
    this command's public name.
    """
    # Dispatch table replaces the duplicated if/elif branches; fixes the
    # docstring typo ("Cammand").
    formatter_classes = {
        "ocr": GoogleOCRFormatter,
        "hfml": HFMLFormatter,
    }
    formatter_cls = formatter_classes.get(kwargs["name"])
    if formatter_cls is not None:
        formatter = formatter_cls()
        formatter.new_poti(kwargs["input_path"])
def test_build_layers(self):
    """Build layers over three kangyur volumes and compare against a literal fixture.

    The expected values use the formatter's old tuple-based layer format
    (start, end, extra-fields…) keyed by plain layer-name strings.
    """
    m_text1 = Path('tests/data/formatter/hfml/kangyur_01.txt').read_text()
    m_text2 = Path('tests/data/formatter/hfml/kangyur_02.txt').read_text()
    m_text3 = Path('tests/data/formatter/hfml/kangyur_03.txt').read_text()
    formatter = HFMLFormatter()
    text1 = formatter.text_preprocess(m_text1)
    text2 = formatter.text_preprocess(m_text2)
    text3 = formatter.text_preprocess(m_text3)
    texts = [text1, text2, text3]
    # build_layers accumulates state inside the formatter across volumes;
    # get_result() below returns the combined result.
    for text in texts:
        result = formatter.build_layers(text, len(texts))
    result = formatter.get_result()
    expected_result = {
        'page': [
            [(0, 24, 'kk', '1a'), (27, 676, 'kl', '1b'), (679, 2173, 'lm', '2a')],
            [(0, 0, 'kk', '1a'), (0, 266, '', '1b')],
            [(0, 266, 'ko', '1a')],
        ],
        'topic': [
            [(0, 2046, 1, 'T1')],
            [(2046, 2173, 1, 't2')],
            [(0, 266, 2, 'T2'), (0, 26, 3, 'T2')],
            [(26, 243, 3, 'T3')],
            [(243, 266, 3, 't4')],
        ],
        'sub_topic': [
            [[(0, 1352, 1, 'T1-1')], [(1353, 1496, 1, 'T1-2')], [(1497, 2046, 1, 'T1-6')]],
            [[]],
            [[(0, 140, 2, 'T1-8')], [(141, 266, 2, 'T1-9'), (0, 26, 3, 'T1-9')]],
            [[]],
            [[]],
        ],
        'correction': [[(1838, 1843, 'མཆིའོ་')], [], []],
        'error_candidate': [[(2040, 2042), (2044, 2045)], [], []],
        'peydurma': [[1518, 1624, 1938], [], []],
    }
    # Compare layer by layer so a failure pinpoints the mismatching layer.
    for layer in result:
        assert result[layer] == expected_result[layer]
def __init__(self, pecha_id, oauth_token, layers="publication", format_=".epub"):
    """Set up exporter state, working paths and the HFML parser.

    Args:
        pecha_id: id of the pecha to export (e.g. "P000002").
        oauth_token: GitHub token used for API requests.
        layers: extra annotation layer name(s) to export, a list or a single
            string; always combined with the default layers below.
        format_: export format extension, e.g. ".epub".
    """
    self.oauth_token = oauth_token
    self.headers = {"Authorization": f"token {self.oauth_token}"}
    self.pecha_id = pecha_id
    self.base_layer_name = "BaseText"
    default_layers = [
        self.base_layer_name,
        AnnType.book_title,
        AnnType.poti_title,
        AnnType.author,
        AnnType.chapter,
    ]
    # BUG FIX: with the default value (a str), `default_layers + layers`
    # raised TypeError (list + str). A bare string is now wrapped in a list.
    # NOTE(review): "publication" reads like a preset name rather than a layer
    # name — confirm intended semantics with callers.
    if isinstance(layers, str):
        layers = [layers]
    self.layers = default_layers + layers
    self.format_ = format_
    self._prepare_paths()
    self.parser = HFMLFormatter(output_path=self.base_path)
    self.serializer = None
    self.content_url_template = (
        "https://api.github.com/repos/OpenPecha/{}/contents/{}?ref={}"
    )
def test_get_base_text(self):
    """The base text extracted from kangyur_01 must match the stored fixture."""
    raw_text = Path('tests/data/formatter/hfml/kangyur_01.txt').read_text()
    formatter = HFMLFormatter()
    preprocessed = formatter.text_preprocess(raw_text)
    # A single volume, so the volume count is 1.
    formatter.build_layers(preprocessed, 1)
    base_text = formatter.get_base_text()
    expected = Path('tests/data/formatter/hfml/kangyur_base.txt').read_text()
    assert base_text == expected
class PechaExporter:
    """This class exports pecha into specified format with selected layers.

    Pipeline (see export()): download layer files from GitHub, merge the
    annotation layers onto the base text, parse the merged HFML into an OPF,
    serialize to the requested format, and publish a pre-release whose asset
    URL is returned.
    """

    def __init__(self, pecha_id, oauth_token, layers="publication", format_=".epub"):
        """Set up exporter state, working paths and the HFML parser.

        Args:
            pecha_id: id of the pecha to export (e.g. "P000002").
            oauth_token: GitHub token used for API requests.
            layers: extra annotation layer name(s), a list or a single string;
                always combined with the default layers below.
            format_: export format extension, e.g. ".epub".
        """
        self.oauth_token = oauth_token
        self.headers = {"Authorization": f"token {self.oauth_token}"}
        self.pecha_id = pecha_id
        self.base_layer_name = "BaseText"
        default_layers = [
            self.base_layer_name,
            AnnType.book_title,
            AnnType.poti_title,
            AnnType.author,
            AnnType.chapter,
        ]
        # BUG FIX: with the default value (a str), `default_layers + layers`
        # raised TypeError (list + str). A bare string is now wrapped in a list.
        # NOTE(review): "publication" reads like a preset name rather than a
        # layer name — confirm intended semantics with callers.
        if isinstance(layers, str):
            layers = [layers]
        self.layers = default_layers + layers
        self.format_ = format_
        self._prepare_paths()
        self.parser = HFMLFormatter(output_path=self.base_path)
        self.serializer = None
        self.content_url_template = (
            "https://api.github.com/repos/OpenPecha/{}/contents/{}?ref={}"
        )

    def _prepare_paths(self):
        # Work under /tmp/openpecha/<pecha_id>; wipe any leftover state first.
        self.base_path = Path("/tmp") / "openpecha"
        self.pecha_path = self.base_path / self.pecha_id
        if self.pecha_path.is_dir():
            self.clean()
        self.pecha_path.mkdir(exist_ok=True, parents=True)
        self.layers_path = self.pecha_path / "layers"
        self.layers_path.mkdir(exist_ok=True, parents=True)
        self.merged_layers_path = self.pecha_path / "merged_layers"
        self.merged_layers_path.mkdir(exist_ok=True, parents=True)
        self.exports_path = self.pecha_path / "exports"
        self.exports_path.mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _get_serializer(format_, **kwargs):
        # ".epub" gets the epub serializer; any other format falls back to HFML.
        if format_ == ".epub":
            return EpubSerializer(**kwargs)
        else:
            return HFMLSerializer(**kwargs)

    def get_response_json(self, url, headers=None):
        """GET `url` and return its JSON body, or [] on any non-200 response.

        BUG FIX: `headers` used to be a mutable default dict that was also
        silently ignored; it now defaults to None and, when provided,
        overrides self.headers (internal callers never pass it, so their
        behavior is unchanged).
        """
        r = requests.get(url, headers=headers if headers is not None else self.headers)
        if r.status_code != 200:
            return []
        return r.json()

    def _get_layers_git_urls(self):
        """Yield (layer, file name, git_url) for each file of each selected layer."""
        for layer in self.layers:
            # The layer name is passed as the git ref (?ref=<layer>).
            files = self.get_response_json(
                self.content_url_template.format(self.pecha_id, "", layer)
            )
            for file in files:
                yield layer, file["name"], file["git_url"]

    def _get_base64_content(self, git_url):
        # Git blob API returns base64-encoded content.
        data = self.get_response_json(git_url)
        return base64.b64decode(data["content"]).decode("utf-8")

    def download_layers(self):
        """Download layers."""
        for layer, fn, git_url in self._get_layers_git_urls():
            layer_path = self.layers_path / layer
            layer_path.mkdir(exist_ok=True)
            out_fn = layer_path / fn
            content = self._get_base64_content(git_url)
            out_fn.write_text(content)

    def _download_github_dir(self, items):
        # Recursively mirror a GitHub contents listing under self.pecha_path.
        for item in items:
            if item["type"] == "file":
                out_fn = self.pecha_path / item["path"]
                download_file(item["download_url"], out_fn)
            else:
                dir_url = item["url"]
                # FIX: no longer rebinds (shadows) the `items` parameter that
                # is being iterated.
                sub_items = self.get_response_json(dir_url)
                self._download_github_dir(sub_items)

    def download_assets(self):
        """Download all assets of pecha."""
        asset_path = f"{self.pecha_id}.opf/asset"
        asset_url = self.content_url_template.format(
            self.pecha_id, asset_path, "master"
        )
        items = self.get_response_json(asset_url)
        self._download_github_dir(items)

    def download_metadata(self):
        """Download the pecha's meta.yml from the master branch."""
        meta_path = f"{self.pecha_id}.opf/meta.yml"
        meta_url = self.content_url_template.format(self.pecha_id, meta_path, "master")
        meta = self.get_response_json(meta_url)
        # NOTE(review): if the request fails, get_response_json returns []
        # and meta["path"] raises TypeError — consider handling upstream.
        out_fn = self.pecha_path / meta["path"]
        download_file(meta["download_url"], out_fn)

    def _merge_layers_for_vol(self, base_vol_fn):
        """Merge all the layers of a volume."""
        base_layer = base_vol_fn.read_text()
        vol_fn = base_vol_fn.name
        # self.layers[0] is the base text itself; only annotation layers merge.
        for ann_layer_name in self.layers[1:]:
            ann_layer_vol_fn = self.layers_path / ann_layer_name / vol_fn
            if not ann_layer_vol_fn.is_file():
                continue
            ann_layer = ann_layer_vol_fn.read_text()
            base_layer = transfer(ann_layer, HFML_ANN_PATTERN, base_layer, "txt")
        merged_layers_fn = self.merged_layers_path / vol_fn
        # Collapse doubled closing markers left behind by the transfer step.
        merged_layers_fn.write_text(base_layer.replace(">>", ">"))

    def merge_layers(self):
        """Merge annotation layers into every base-text volume."""
        for base_vol_fn in (self.layers_path / self.base_layer_name).iterdir():
            self._merge_layers_for_vol(base_vol_fn)

    def parse(self):
        """Parse merged layers into opf."""
        self.parser.create_opf(self.merged_layers_path, id_=self.pecha_id)

    def serialize(self):
        """Serialize the opf into given format."""
        serializer = self._get_serializer(
            self.format_, opf_path=self.parser.dirs["opf_path"]
        )
        serializer.apply_layers()
        exported_fn = serializer.serialize(output_path=self.exports_path)
        return exported_fn

    def create_pre_release(self):
        """Create pre-release and return the asset link."""
        download_url = create_release(
            self.pecha_id,
            prerelease=True,
            assets_path=list(self.exports_path.iterdir()),
            token=self.oauth_token,
        )
        return download_url

    def clean(self):
        """Remove downloaded layers, hfml file, opf and exported file."""
        shutil.rmtree(str(self.pecha_path))

    def export(self):
        """Run the full export pipeline; return the pre-release asset URL."""
        self.download_layers()
        self.merge_layers()
        self.parse()
        self.download_assets()
        self.download_metadata()
        self.serialize()
        asset_download_url = self.create_pre_release()
        return asset_download_url
import os
from pathlib import Path

from openpecha.formatters import HFMLFormatter
from openpecha.serializers import HFMLSerializer

if __name__ == "__main__":
    # Fixture/output locations for the P000002 round-trip demo.
    pecha_id = "P000002"
    hfml_text = "./tests/data/serialize/hfml/P000002/"
    opf_path = "./tests/data/serialize/hfml/P000002.opf/"
    opfs_path = Path("./output/opfs")
    hfml_path = "./output/"

    # HFML -> OPF
    hfml_to_opf = HFMLFormatter(output_path=opfs_path)
    hfml_to_opf.create_opf(hfml_text, pecha_id)

    # OPF -> HFML
    # (A per-text loop driven by ./output/tengyur/text_list.txt was sketched
    # here previously; the serializer currently runs on the whole opf.)
    opf_to_hfml = HFMLSerializer(opf_path)
    opf_to_hfml.serialize(output_path=hfml_path)
def test_build_layers(self):
    """Build layers over three kangyur volumes and compare against a literal fixture.

    The expected values use the AnnType-keyed annotation format: per-volume
    lists of (local_id, payload) pairs, where payload carries a "span" dict
    and type-specific fields (page_index, work_id, correction, ...).
    """
    m_text1 = Path("tests/data/formatter/hfml/kangyur_01.txt").read_text()
    m_text2 = Path("tests/data/formatter/hfml/kangyur_02.txt").read_text()
    m_text3 = Path("tests/data/formatter/hfml/kangyur_03.txt").read_text()
    formatter = HFMLFormatter()
    text1 = formatter.text_preprocess(m_text1)
    text2 = formatter.text_preprocess(m_text2)
    text3 = formatter.text_preprocess(m_text3)
    texts = [text1, text2, text3]
    # build_layers accumulates state inside the formatter across volumes;
    # get_result() below returns the combined result.
    for text in texts:
        result = formatter.build_layers(text, len(texts))
    result = formatter.get_result()
    expected_result = {
        AnnType.book_title: [[], [], []],
        AnnType.book_number: [[], [], []],
        AnnType.author: [[], [], []],
        AnnType.poti_title: [
            [(None, {"span": {"start": 0, "end": 24}})],
            [(None, {"span": {"start": 0, "end": 24}})],
            [(None, {"span": {"start": 0, "end": 24}})],
        ],
        AnnType.chapter: [[(None, {"span": {"start": 98, "end": 125}})], [], []],
        AnnType.citation: [
            [],
            [
                (1000020, {"span": {"start": 164, "end": 202}}),
                (1000021, {"span": {"start": 204, "end": 241}}),
            ],
            [(1000024, {"span": {"start": 97, "end": 162}})],
        ],
        AnnType.pagination: [
            [
                (
                    1000000,
                    {
                        "page_index": "1a",
                        "page_info": "kk",
                        "reference": None,
                        "span": {"start": 0, "end": 24},
                    },
                ),
                (
                    1000001,
                    {
                        "page_index": "1b",
                        "page_info": "kl",
                        "reference": None,
                        "span": {"start": 27, "end": 676},
                    },
                ),
                (
                    1000027,
                    {
                        "page_index": "2a",
                        "page_info": "lm",
                        "reference": None,
                        "span": {"start": 679, "end": 2173},
                    },
                ),
            ],
            [
                (
                    1000015,
                    {
                        "page_index": "1a",
                        "page_info": "kk",
                        "reference": None,
                        "span": {"start": 0, "end": 0},
                    },
                ),
                (
                    1000016,
                    {
                        "page_index": "1b",
                        "page_info": "",
                        "reference": None,
                        "span": {"start": 0, "end": 266},
                    },
                ),
            ],
            [
                (
                    1000022,
                    {
                        "page_index": "1a",
                        "page_info": "ko",
                        "reference": None,
                        "span": {"start": 0, "end": 266},
                    },
                ),
            ],
        ],
        AnnType.topic: [
            [
                (
                    1000002,
                    {"work_id": "T1", "span": {"vol": 1, "start": 27, "end": 2046}},
                ),
            ],
            [
                (
                    1000014,
                    {"work_id": "t2", "span": {"vol": 1, "start": 2046, "end": 2173}},
                ),
            ],
            [
                (
                    1000017,
                    {"work_id": "T2", "span": {"vol": 2, "start": 26, "end": 266}},
                ),
            ],
            [
                (
                    1000023,
                    {"work_id": "T3", "span": {"vol": 3, "start": 26, "end": 243}},
                ),
            ],
            [
                (
                    1000026,
                    {"work_id": "t4", "span": {"vol": 3, "start": 243, "end": 266}},
                ),
            ],
        ],
        AnnType.sub_topic: [
            [
                [
                    (
                        1000003,
                        {"work_id": "T1-1", "span": {"vol": 1, "start": 27, "end": 1352}},
                    ),
                ],
                [
                    (
                        1000005,
                        {"work_id": "T1-2", "span": {"vol": 1, "start": 1352, "end": 1496}},
                    ),
                ],
                [
                    (
                        1000006,
                        {"work_id": "T1-6", "span": {"vol": 1, "start": 1496, "end": 2046}},
                    ),
                ],
            ],
            [[]],
            [
                [
                    (
                        1000018,
                        {"work_id": "T1-8", "span": {"vol": 2, "start": 26, "end": 140}},
                    ),
                ],
                [
                    (
                        1000019,
                        {"work_id": "T1-9", "span": {"vol": 2, "start": 140, "end": 266}},
                    ),
                ],
            ],
            [[]],
            [[]],
        ],
        AnnType.sabche: [
            [(1000008, {"span": {"start": 1548, "end": 1936}})],
            [],
            [],
        ],
        AnnType.tsawa: [[(1000004, {"span": {"start": 420, "end": 739}})], [], []],
        AnnType.yigchung: [
            [],
            [],
            [(1000025, {"span": {"start": 164, "end": 241}})],
        ],
        AnnType.correction: [
            [
                (
                    1000010,
                    {"correction": "མཆིའོ་", "span": {"start": 1838, "end": 1843}},
                ),
            ],
            [],
            [],
        ],
        AnnType.error_candidate: [
            [
                (1000012, {"span": {"start": 2040, "end": 2042}}),
                (1000013, {"span": {"start": 2044, "end": 2045}}),
            ],
            [],
            [],
        ],
        AnnType.peydurma: [
            [
                (1000007, {"span": {"start": 1518, "end": 1518}}),
                (1000009, {"span": {"start": 1624, "end": 1624}}),
                (1000011, {"span": {"start": 1938, "end": 1938}}),
            ],
            [],
            [],
        ],
        AnnType.archaic: [[], [], []],
        AnnType.durchen: [[], [], []],
    }
    # Compare layer by layer so a failure pinpoints the mismatching layer.
    for layer in result:
        assert result[layer] == expected_result[layer]
from openpecha.formatters import HFMLFormatter

# Convert the P000001-test HFML publication into an OPF pecha.
publication_path = "../openpecha-user/publication/P000001-test"
opfs_output_path = "../openpecha-user/opfs"

hfml_formatter = HFMLFormatter(output_path=opfs_output_path)
hfml_formatter.create_opf(publication_path)