def test_update_index_layer():
    """Check that a base update carries annotation layers and index over.

    The v1 layers and index are copied next to the v2 base text,
    ``PechaBaseUpdate`` is given both paths and run, and the rewritten
    layers and index are compared with the v2 fixtures.
    """
    src_opf_path = data_path / "v1" / "v1.opf"
    dst_opf_path = data_path / "v1_base_edited" / "v1_base_edited.opf"
    expected_opf_path = data_path / "v2" / "v2.opf"

    # edit v1 base
    dst_opf_path.mkdir(exist_ok=True, parents=True)
    try:
        shutil.copytree(
            str(src_opf_path / "layers"),
            str(dst_opf_path / "layers"),
            dirs_exist_ok=True,
        )
        shutil.copy(str(src_opf_path / "index.yml"), str(dst_opf_path / "index.yml"))
        shutil.copytree(
            str(expected_opf_path / "base"),
            str(dst_opf_path / "base"),
            dirs_exist_ok=True,
        )

        pecha = PechaBaseUpdate(src_opf_path, dst_opf_path)
        pecha.update()

        # test annotations layers
        # NOTE(review): the helper return values are not asserted here;
        # presumably is_layer_same / is_index_same assert internally — confirm.
        for layer in ["title", "yigchung", "quotes", "tsawa", "sapche"]:
            result_layer, expected_layer = get_layer(layer, "v1_base_edited", "v2")
            is_layer_same(result_layer, expected_layer)

        # test index layer
        result_index_layer = load_yaml(dst_opf_path / "index.yml")
        expected_index_layer = load_yaml(expected_opf_path / "index.yml")
        is_index_same(result_index_layer, expected_index_layer)
    finally:
        # Always drop the scratch copy, even when a check above fails, so a
        # failing run does not leave stale files for the next one.
        shutil.rmtree(str(dst_opf_path.parent))
def update_layers(self, vol_id, updater):
    """Run ``updater`` over every layer file of one volume and save it back.

    Args:
        vol_id: name of the volume directory under ``self.layer_path``.
        updater: object handed to ``update_ann_layer`` together with each
            loaded layer.
    """
    vol_layers_dir = self.layer_path / vol_id
    for layer_path in vol_layers_dir.iterdir():
        ann_layer = load_yaml(layer_path)
        # The result of update_ann_layer is not used, so the layer is
        # expected to be modified in place before being written back.
        update_ann_layer(ann_layer, updater)
        dump_yaml(ann_layer, layer_path)
def get_meta_data(self):
    """Load ``meta.yml`` from the opf directory.

    Returns:
        The loaded metadata, or an empty dict when loading fails for any
        reason (a notice is printed in that case).
    """
    pecha_root = self.opf_path
    try:
        return load_yaml(pecha_root / "meta.yml")
    except Exception:
        # Best effort: a missing or unreadable meta file is not fatal.
        print("Meta data not Found!!!")
        return {}
def get_old_layers(self, new_layers):
    """Load the existing layer files matching the given layer names.

    Args:
        new_layers: iterable of layer names; each is looked up as
            ``<name>.yml`` inside every entry of ``self.dirs["layers_path"]``.

    Returns:
        defaultdict: ``{layer name: {volume directory name: loaded layer}}``,
        holding only the layer/volume pairs whose file exists.
    """
    old_layers = defaultdict(dict)
    for layer_name in new_layers:
        for vol_dir in self.dirs["layers_path"].iterdir():
            layer_file = vol_dir / f"{layer_name}.yml"
            if layer_file.is_file():
                old_layers[layer_name][vol_dir.name] = load_yaml(layer_file)
    return old_layers
def update_index_layer(self):
    """Update every span stored in the index file and write the file back.

    For each annotation, both its own span and the span of each of its
    parts are passed to ``self.update_text_span``.
    """
    index = load_yaml(self.index_path)
    for text_ann in index["annotations"]:
        # The return value of update_text_span is ignored, so it is expected
        # to adjust the span it receives in place.
        # update text span
        self.update_text_span(text_ann["span"])
        # update sub-text span
        for part in text_ann["parts"]:
            self.update_text_span(part["span"])
    dump_yaml(index, self.index_path)
def get_text_spans(self, text_id, index_layer):
    """Collect the spans of the text identified by ``text_id``, per volume.

    Args:
        text_id: work id compared with the ``work_id`` of every annotation
            and of every part of an annotation.
        index_layer (dict): index layer to search; when falsy it is loaded
            from ``index.yml`` in the opf directory.

    Returns:
        dict: span keyed by volume name (``v`` plus the zero-padded volume
        number, e.g. ``v001``). A later match for the same volume replaces
        an earlier one.
    """
    if not index_layer:
        index_layer = load_yaml(self.opf_path / "index.yml")

    text_span = {}
    for anno in index_layer["annotations"].values():
        # Sub-texts: each part carries a single span.
        for sub_topic in anno["parts"] or []:
            if sub_topic["work_id"] != text_id:
                continue
            text_span[f'v{sub_topic["span"]["vol"]:03}'] = sub_topic["span"]
        # Whole text: the annotation carries a list of spans.
        if anno["work_id"] == text_id:
            text_span.update({f'v{span["vol"]:03}': span for span in anno["span"]})
    return text_span
def serialize(self):
    """Serialize the opf to html so it can be presented in the editor workspace.

    All layers are applied first; ``Pagination`` is then dropped from
    ``self.layers``. When a ``Footnote`` layer is listed, the footnote
    references of each base file are appended after its body.

    Yields:
        str, str: base file name, serialized html of that base file
    """
    self.apply_layers()
    self.layers = [name for name in self.layers if name != "Pagination"]

    for base_name, result in self.get_result().items():
        if "Footnote" in self.layers:
            footnote_fn = self.opf_path / "layers" / base_name / "Footnote.yml"
            footnotes = load_yaml(footnote_fn)["annotations"]
            footnote_ref_tag = self.get_footnote_references(footnotes)
        else:
            footnote_ref_tag = ""
        result = self.p_tag_adder(result)
        result = f"<html>\n<head>\n<title></title>\n</head>\n<body>\n{result}{footnote_ref_tag}</body>\n</html>"
        yield base_name, result
def apply_layers(self):
    """Apply the index and then every layer in ``self.layers`` to each volume.

    The index is applied when ``self.index_layer`` is already set, or when it
    can be loaded from ``index.yml`` in the opf directory. If ``self.layers``
    is empty, it is filled from ``self.get_all_layer`` for the volume being
    processed. ``Pagination`` is always moved to the end of the list so it is
    applied last.
    """
    if self.index_layer:
        self.apply_index()
    else:
        index_fn = self.opf_path / "index.yml"
        if index_fn.is_file():
            self.index_layer = load_yaml(index_fn)
            self.apply_index()

    for vol_id in self.base_layers:
        if not self.layers:
            self.layers = self.get_all_layer(vol_id)
        if "Pagination" in self.layers:
            # Move the first "Pagination" entry to the back, in place.
            self.layers.remove("Pagination")
            self.layers.append("Pagination")
        for layer_id in self.layers:
            self.apply_layer(vol_id, layer_id)
def apply_layer(self, vol_id, layer_id):
    """Apply the annotations of one layer file to one volume.

    Reads ``<opf_path>/layers/<vol_id>/<layer_id>.yml`` and hands every
    annotation that overlaps ``self.text_spans[vol_id]`` to
    ``self.apply_annotation``, in file order. Nothing happens when the layer
    file does not exist.

    Args:
        vol_id: name of the volume directory.
        layer_id: layer file name without the ``.yml`` extension.
    """
    layer_fn = self.opf_path / "layers" / vol_id / f"{layer_id}.yml"
    if not layer_fn.is_file():
        return

    layer = load_yaml(layer_fn)
    for ann_id, ann in layer["annotations"].items():
        ann_span = ann["span"]
        text_span = self.text_spans[vol_id]
        # text begins in middle of the page: keep only annotations whose span
        # overlaps the span of the text in this volume
        overlaps_text = (
            ann_span["end"] >= text_span["start"]
            and ann_span["start"] <= text_span["end"]
        )
        if not overlaps_text:
            continue

        ann["type"] = layer["annotation_type"]
        ann["id"] = ann_id
        try:
            uuid2localid = layer["local_ids"]
        except Exception:
            # Layers without local ids fall back to an empty string.
            uuid2localid = ""
        self.apply_annotation(vol_id, ann, uuid2localid)
def get_index_layer(self, text_id, index_layer):
    """Build an index layer restricted to the text identified by ``text_id``.

    Args:
        text_id: work id to keep.
        index_layer (dict): full index layer; when falsy it is loaded from
            ``index.yml`` in the opf directory.

    Returns:
        defaultdict: copy of the ``id``, ``annotation_type`` and ``revision``
        fields plus an ``annotations`` mapping. An annotation whose own
        ``work_id`` matches is kept as is; otherwise, if some of its parts
        match, a new annotation is stored holding the matching parts' spans
        and an empty ``parts`` list.
    """
    if not index_layer:
        index_layer = load_yaml(self.opf_path / "index.yml")

    text_index_layer = defaultdict(str)
    for field in ("id", "annotation_type", "revision"):
        text_index_layer[field] = index_layer[field]

    annotations = defaultdict(str)
    for ann_id, anno in index_layer["annotations"].items():
        if anno["work_id"] == text_id:
            annotations[ann_id] = anno
            continue

        matching_parts = [
            sub_topic
            for sub_topic in anno["parts"] or []
            if sub_topic["work_id"] == text_id
        ]
        if not matching_parts:
            continue
        annotations[ann_id] = {
            "work_id": matching_parts[-1]["work_id"],
            "parts": [],
            "span": [sub_topic["span"] for sub_topic in matching_parts],
        }

    text_index_layer["annotations"] = annotations
    return text_index_layer
def serialize(self, toc_levels={}, output_path="./output/epub_output"):
    """Serialize the .opf to an .epub file with calibre's ``ebook-convert``.

    The layers are applied and the result is wrapped in an html page that is
    written to ``{pecha_id}.html`` in the current working directory. The
    Tsadra css template is downloaded to ``template.css`` (also in the
    current working directory) and ``ebook-convert`` is run with that
    template, an embedded font and the table-of-contents XPath expressions.
    The intermediate html and css files are removed afterwards.

    Args:
        toc_levels (dict): table-of-contents configuration handed to
            ``self.set_toc_level``; when empty,
            ``Tsadra_template.toc_levels`` is used instead.
        output_path (str): directory in which ``{pecha_id}.epub`` is placed.

    Returns:
        Path: location of the epub file. It is returned whether or not the
        conversion actually produced the file.
    """
    # NOTE(review): `toc_levels={}` is a mutable default; it is only rebound
    # in this function, but `None` would be the safer idiom.
    output_path = Path(output_path)
    out_html_fn = f"{self.meta['id']}.html"
    pecha_title = self.meta["source_metadata"].get("title", "")
    cover_image = self.meta["source_metadata"].get("cover", "")
    self.apply_layers()
    # Pagination is dropped from the layer list once the layers are applied.
    self.layers = [layer for layer in self.layers if layer != "Pagination"]
    results = self.get_result()
    for vol_id, result in results.items():
        result = f"{self.get_front_page()}{result}"
        footnote_ref_tag = ""
        if "Footnote" in self.layers:
            footnote_fn = self.opf_path / "layers" / vol_id / "Footnote.yml"
            footnote_layer = load_yaml(footnote_fn)
            footnote_ref_tag = self.get_footnote_references(
                footnote_layer["annotations"])
        result = self.p_tag_adder(result)
        result = self.indentation_adjustment(result)
        serialized_html = (
            f"<html>\n<head>\n\t<title>{pecha_title}</title>\n</head>\n<body>\n"
        )
        serialized_html += f"{result}{footnote_ref_tag}</body>\n</html>"
        Path(out_html_fn).write_text(serialized_html)
        # Downloading css template file from ebook template repo and saving it
        # NOTE(review): no timeout and no status check on this request.
        template = requests.get(
            "https://raw.githubusercontent.com/OpenPecha/ebook-template/master/tsadra_template.css"
        )
        Path("template.css").write_bytes(template.content)
        # Running ebook-convert command to convert html file to .epub (From calibre)
        # XPath expression to detect chapter titles.
        if not toc_levels:
            toc_levels = Tsadra_template.toc_levels
        toc_levels = self.set_toc_level(toc_levels, serialized_html)
        level1_toc_Xpath = toc_levels.get(1, "")
        level2_toc_Xpath = toc_levels.get(2, "")
        level3_toc_Xpath = toc_levels.get(3, "")
        cover_path = self.opf_path / f"assets/image/{cover_image}"
        out_epub_fn = output_path / f"{self.meta['id']}.epub"
        font_family = "Monlam Uni Ouchan2"
        # The two commands differ only in the --cover option, which is passed
        # when the cover image exists in the opf assets.
        # NOTE(review): the command is built as a shell string, so paths with
        # spaces or shell metacharacters would break it; an argument list via
        # subprocess would avoid that.
        if cover_path.is_file():
            os.system(
                f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{Tsadra_template.book_title_Xpath}" --cover={cover_path} --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
            )
        else:
            os.system(
                f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{Tsadra_template.book_title_Xpath}" --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
            )
        # Removing html file and template file
        os.system(f"rm {out_html_fn}")
        os.system("rm template.css")
        # The font step only runs when the conversion produced a file.
        if out_epub_fn.is_file():
            self.embed_ibook_specific_font(out_epub_fn)
        # NOTE(review): returning here ends the loop after the first volume —
        # confirm this is intended for multi-volume pechas.
        return out_epub_fn
def read_index_file(self) -> Dict:
    """Load the index file of the pecha.

    Returns:
        The content of ``self.index_fn`` as loaded by ``load_yaml``.

    Raises:
        FileNotFoundError: if ``self.index_fn`` is not an existing file. The
            message names the missing path so the failure can be traced.
    """
    if not self.index_fn.is_file():
        raise FileNotFoundError(f"Index file not found: {self.index_fn}")
    return load_yaml(self.index_fn)
def read_meta_file(self) -> Dict:
    """Load the meta file of the pecha.

    Returns:
        The content of ``self.meta_fn`` as loaded by ``load_yaml``.
    """
    meta = load_yaml(self.meta_fn)
    return meta
def read_layers_file(self, base_name: str, layer_name: LayerEnum) -> Union[Dict, None]:
    """Load one layer file of one base text.

    Args:
        base_name: name of the directory under ``self.layers_path``.
        layer_name: layer whose ``<layer_name>.yml`` file is read.

    Returns:
        The loaded layer, or ``None`` when the layer file does not exist.
    """
    layer_fn = self.layers_path / base_name / f"{layer_name}.yml"
    if not layer_fn.is_file():
        return None
    return load_yaml(layer_fn)