def pack_article_xml(file_xml_path):
    """Pack an article XML together with its digital assets and renditions.

    Builds an SPS package directory under SPS_PKG_PATH (or under
    INCOMPLETE_SPS_PKG_PATH when some asset could not be collected),
    writes a ``manifest.json`` with the renditions metadata, and finally
    writes the (asset-renamed) XML into the package directory.

    :param file_xml_path: path of the source article XML file.
    """
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)
    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")
    # Complete packages go to pkg_path; packages missing assets go to bad_pkg_path.
    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)
    # replace_assets_names() renames asset hrefs in the XML and returns
    # (old_name, new_name) pairs; set() removes duplicated pairs.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(asset_replacements))
    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))
    # packing_assets copies assets + renditions and returns the directory
    # actually used (complete or incomplete package path).
    package_path = packing_assets(asset_replacements + renditions, pkg_path,
                                  bad_pkg_path, sps_package.package_name)
    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml)
class Test_SPS_Package_No_Metadata(unittest.TestCase):
    """SPS_Package behaviour for a document that carries no journal/article metadata.

    The fixture root element is not an ``<article>``, so metadata lookups
    are expected to return empty lists, while asset-name generation still
    works from the package name ("a01").
    """

    def setUp(self):
        article_xml = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
            <inline-graphic xlink:href="a01tab01.gif"/>
            <graphic xlink:href="a01f01.gif"/>
            <ext-link xlink:href="a01tab02.gif"/>
            <ext-link xlink:href="mailto:a01f02.gif"/>
            <inline-supplementary-material xlink:href="a01tab03.gif"/>
            <supplementary-material xlink:href="a01tab04.gif"/>
            <media xlink:href="a01tab04.gif"/>
            </root>
            """
        self.sps_package = SPS_Package(etree.fromstring(article_xml), "a01")

    def test_parse_article(self):
        # No <article-meta> present -> empty metadata.
        self.assertEqual(self.sps_package.parse_article_meta, [])

    def test_package_name(self):
        self.assertEqual(self.sps_package.package_name, "a01")

    def test_asset_package_name_f01(self):
        # "f01" suffix is normalised into the "-gf01" asset naming scheme.
        self.assertEqual(self.sps_package.asset_name("a01f01.jpg"), "a01-gf01.jpg")

    def test_asset_package_name_any_img(self):
        # A name without the package prefix is prefixed with "<pkg>-g".
        self.assertEqual(self.sps_package.asset_name("img.jpg"), "a01-gimg.jpg")

    def test_journal_meta(self):
        self.assertEqual(self.sps_package.journal_meta, [])

    def test_parse_article_meta(self):
        # NOTE(review): duplicates test_parse_article above — likely one of
        # the two was meant to cover a different property; verify intent.
        self.assertEqual(self.sps_package.parse_article_meta, [])
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Ensure the XML carries a scielo PID v3, syncing with the XC PID database.

    :param file_xml_path: path of the XML to update.
    :param dest_path: output directory (ignored when ``in_place`` is True).
    :param pid_database_engine: DB engine used by ``pid_manager``.
    :param in_place: when True, overwrite the source file.
    """
    logger.debug("file: %s", file_xml_path)
    parsed_xml = xml.loadToXML(file_xml_path)
    xml_sps = SPS_Package(parsed_xml)
    pid_v2 = xml_sps.scielo_pid_v2
    # Check whether a PID v3 already exists in the XC database for this PID v2
    if not pid_manager.check_pid_v3_by_v2(pid_database_engine, pid_v2):
        # Build the scielo-id in the converted XML
        xml_sps.create_scielo_id()
        # Record the v2/v3 PID pair in the XC database
        pid_manager.create_pid(pid_database_engine, pid_v2, xml_sps.scielo_pid_v3)
    else:
        # A PID v3 already exists in the XC database; copy it into the XML
        pid_v3 = pid_manager.get_pid_v3_by_v2(pid_database_engine, pid_v2)
        xml_sps.scielo_pid_v3 = pid_v3
    if in_place:
        new_file_xml_path = file_xml_path
    else:
        new_file_xml_path = os.path.join(dest_path, os.path.basename(file_xml_path))
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
def pack_article_xml(file_xml_path):
    """Pack an article XML and its digital assets into an SPS package directory.

    Unlike the renditions-aware variant, this version packs only the
    assets referenced from the XML itself.

    :param file_xml_path: path of the source article XML file.
    """
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)
    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")
    # Complete packages land in pkg_path; incomplete ones in bad_pkg_path.
    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)
    # Start from a clean target directory.
    files.make_empty_dir(pkg_path)
    # Rename asset hrefs in the XML; set() drops duplicated pairs.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(asset_replacements))
    package_path = packing_assets(asset_replacements, pkg_path, bad_pkg_path,
                                  sps_package.package_name)
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml)
def convert_article_xml(file_xml_path):
    """Convert an article XML to SPS 1.9 and write it under CONVERSION_PATH.

    The output filename embeds the document languages:
    ``<original-name>.<lang1-lang2...>.<ext>``.

    :param file_xml_path: path of the XML to convert.
    """
    obj_xmltree = xml.loadToXML(file_xml_path)
    obj_xml = obj_xmltree.getroot()
    # Stamp the target SPS / DTD versions on the root element.
    obj_xml.set("specific-use", "sps-1.9")
    obj_xml.set("dtd-version", "1.1")
    xml_sps = SPS_Package(obj_xmltree)
    # Convert the AM body to SPS
    xml_sps.transform_body()
    # Convert pub-date to SPS 1.9
    xml_sps.transform_pubdate()
    # Build the scielo-id in the converted XML
    xml_sps.create_scielo_id()
    # Remove the <counts> tag from the XML
    xml_sps.transform_article_meta_count()
    languages = "-".join(xml_sps.languages)
    _, fname = os.path.split(file_xml_path)
    fname, fext = fname.rsplit(".", 1)
    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     "%s.%s.%s" % (fname, languages, fext))
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
def setUp(self):
    """Build an SPS_Package over an article with epub and epub-ppub pub-dates."""
    fixture = """<article><article-meta>
        <pub-date pub-type="epub">
        <year>2010</year><month>9</month><day>10</day></pub-date>
        <pub-date pub-type="epub-ppub">
        <year>2011</year></pub-date>
        </article-meta></article>"""
    self.sps_package = SPS_Package(etree.fromstring(fixture), None)
def setUp(self):
    """Build an SPS_Package over an article with pub and collection pub-dates.

    The raw XML is kept on ``self.xml`` for tests that inspect it directly.
    """
    self.xml = """<article><article-meta>
        <pub-date date-type="pub">
        <year>2010</year><month>5</month><day>13</day></pub-date>
        <pub-date date-type="collection">
        <year>2012</year><month>2</month><day>3</day></pub-date>
        </article-meta></article>"""
    tree = etree.fromstring(self.xml)
    self.sps_package = SPS_Package(tree, None)
def setUp(self):
    """Build an SPS_Package ("a01") over a root holding every xlink:href element kind."""
    fixture = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
        <inline-graphic xlink:href="a01tab01.gif"/>
        <graphic xlink:href="a01f01.gif"/>
        <ext-link xlink:href="a01tab02.gif"/>
        <ext-link xlink:href="mailto:a01f02.gif"/>
        <inline-supplementary-material xlink:href="a01tab03.gif"/>
        <supplementary-material xlink:href="a01tab04.gif"/>
        <media xlink:href="a01tab04.gif"/>
        </root>
        """
    self.sps_package = SPS_Package(etree.fromstring(fixture), "a01")
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Inject a scielo-id into the XML file and write the result to *dest_path*."""
    logger.debug("file: %s", file_xml_path)
    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    # Build the scielo-id in the converted XML
    xml_sps.create_scielo_id()
    target = os.path.join(dest_path, os.path.basename(file_xml_path))
    xml.objXML2file(target, xml_sps.xmltree, pretty=True)
def setUp(self):
    """Build an SPS_Package over an article-meta that contains a <counts> block."""
    fixture = """<article xmlns:xlink="http://www.w3.org/1999/xlink"><article-meta>
        <counts>
        <fig-count count="0"/>
        <table-count count="0"/>
        <equation-count count="0"/>
        </counts>
        <body>
        <fig id="i01"><graphic xlink:href="/img/fbpe/rm/v30n1/0002i01.gif"/></fig>
        <table-wrap id="tab01"><label>Tabela 1</label><table><tr><td>TEXTO</td></tr></table></table-wrap>
        </body>
        </article-meta></article>"""
    tree = etree.fromstring(fixture)
    self.sps_package = SPS_Package(tree, None)
class Test_MatchPubDate4(unittest.TestCase):
    """Pub-date resolution when pub, epub-ppub and collection dates all coexist."""

    def setUp(self):
        xml = """<article><article-meta>
            <pub-date date-type="pub">
            <year>2010</year><month>9</month><day>1</day></pub-date>
            <pub-date date-type="epub-ppub">
            <year>2011</year></pub-date>
            <pub-date date-type="collection">
            <year>2012</year><month>2</month></pub-date>
            </article-meta></article>"""
        xmltree = etree.fromstring(xml)
        self.sps_package = SPS_Package(xmltree, None)

    def test__match_pubdate(self):
        # The first xpath that matches wins: "pub" before "collection".
        result = self.sps_package._match_pubdate(
            ('pub-date[@date-type="pub"]', 'pub-date[@date-type="collection"]'))
        self.assertEqual(result.findtext("year"), "2010")

    def test_document_pubdate(self):
        # Document date comes from the "pub" entry, zero-padded.
        self.assertEqual(self.sps_package.document_pubdate, ("2010", "09", "01"))

    def test_documents_bundle_pubdate(self):
        # Bundle date comes from "collection"; missing day is an empty string.
        self.assertEqual(self.sps_package.documents_bundle_pubdate,
                         ("2012", "02", ""))
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list) -> dict:
    """Create a Kernel-format manifest from an XML document.

    :param document: parsed article XML tree.
    :param document_url: public URL of the registered XML.
    :param assets: dicts with ``asset_id`` and ``asset_url`` keys.
    :raises ValueError: when the document lacks a scielo-id or a pub date.
    """
    obj_sps = SPS_Package(document)
    _id = obj_sps.scielo_id
    date = obj_sps.document_pubdate
    if not _id:
        raise ValueError("Document requires an scielo-id") from None
    if not date:
        raise ValueError("A creation date is required") from None
    # Join only the non-empty date parts (year[-month[-day]]).
    _creation_date = parse_date("-".join(part for part in date if part))
    version = {
        "data": document_url,
        "assets": {
            asset.get("asset_id"): [[_creation_date, asset.get("asset_url")]]
            for asset in assets
        },
        "timestamp": _creation_date,
    }
    return {"id": _id, "versions": [version]}
def register_document(folder: str, session_db, storage) -> None:
    """Register an SPS package folder: push XML, assets and renditions to
    storage and record the document manifest plus a change event in the DB.

    :param folder: path of the SPS package directory.
    :param session_db: document-store session (``documents``/``changes``).
    :param storage: object storage used to register files.
    :returns: ``(obj_xml, manifest_id)`` tuple.
    :raises exceptions.XMLError: when the package has zero or several XMLs.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files))
    if len(xml_files) > 1:
        # FIX: the message must be %-interpolated here; passing extra args to
        # the exception constructor (logger-style) never formats the string.
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    else:
        try:
            x_file = xml_files[0]
        except IndexError as ex:
            # FIX: same lazy-formatting bug as above.
            raise exceptions.XMLError("Não existe XML no pacote SPS: %s" % ex)
        xml_path = os.path.join(folder, x_file)
        obj_xml = xml.loadToXML(xml_path)
        xml_sps = SPS_Package(obj_xml)
        # TODO: é possível que alguns artigos não possuam o self.acron
        prefix = xml_sps.media_prefix
        url_xml = storage.register(xml_path, prefix)
        static_assets, static_additionals = get_document_assets_path(
            obj_xml, list_files, folder)
        registered_assets = put_static_assets_into_storage(static_assets,
                                                           prefix, storage)
        # Additional files are stored but not tracked in the manifest.
        for additional_path in static_additionals.values():
            storage.register(os.path.join(additional_path), prefix)
    if obj_xml:
        renditions = get_document_renditions(folder, _renditions, prefix, storage)
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(
                obj_xml, url_xml, registered_assets, renditions))
        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # Already registered: log and fall through to return the ids.
            logger.exception(exc)
    return obj_xml, manifest_data.id()
class Test_MatchPubDate4(unittest.TestCase):
    """Pub-date resolution and transform_pubdate() with three date types present.

    NOTE(review): this class has the same name as an earlier Test_MatchPubDate4;
    if both live in one module the later definition shadows the earlier — confirm.
    """

    def setUp(self):
        xml = """<article><article-meta>
            <pub-date date-type="pub">
            <year>2010</year><month>9</month><day>1</day></pub-date>
            <pub-date date-type="epub-ppub">
            <year>2011</year></pub-date>
            <pub-date date-type="collection">
            <year>2012</year><month>2</month></pub-date>
            </article-meta></article>"""
        xmltree = etree.fromstring(xml)
        self.sps_package = SPS_Package(xmltree, None)

    def test__match_pubdate(self):
        # First matching xpath wins: "pub" is tried before "collection".
        result = self.sps_package._match_pubdate(
            ('pub-date[@date-type="pub"]', 'pub-date[@date-type="collection"]'))
        self.assertEqual(result.findtext("year"), "2010")

    def test_document_pubdate(self):
        self.assertEqual(self.sps_package.document_pubdate, ("2010", "09", "01"))

    def test_documents_bundle_pubdate(self):
        self.assertEqual(self.sps_package.documents_bundle_pubdate,
                         ("2012", "02", ""))

    def test_transform_pubdate(self):
        self.sps_package.transform_pubdate()
        # After the transform, both dates must carry publication-format and
        # keep their original (non-padded) year/month/day texts.
        xpaths_results = (
            ('pub-date[@date-type="pub"]', ("2010", "9", "1")),
            ('pub-date[@date-type="collection"]', ("2012", "2", None)),
        )
        for xpath, result in xpaths_results:
            with self.subTest(xpath=xpath, result=result):
                pubdate = self.sps_package.article_meta.find(xpath)
                self.assertIsNotNone(pubdate)
                self.assertEqual(pubdate.get("publication-format"), "electronic")
                self.assertEqual(pubdate.findtext("year"), result[0])
                self.assertEqual(pubdate.findtext("month"), result[1])
                # A missing <day> makes findtext return None (result[2] is None).
                self.assertEqual(pubdate.findtext("day"), result[2])
def document_store_by_request(folder: str, storage) -> None:
    """Register an SPS package's files in storage and PUT the document to the
    Document-Store HTTP API (instead of writing to a local session DB).

    :param folder: path of the SPS package directory.
    :param storage: object storage used to register files.
    :raises exceptions.XMLError: when the package has zero or several XMLs.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files))
    # Media assets are everything that is neither XML nor a rendition.
    medias_files = set(list_files) - set(xml_files) - set(_renditions)
    if len(xml_files) > 1:
        # NOTE(review): the extra arg is never interpolated into the message
        # (exception constructors don't format logger-style) — confirm intent.
        raise exceptions.XMLError("Existe %s xmls no pacote SPS", len(xml_files))
    else:
        try:
            x_file = xml_files[0]
        except IndexError as ex:
            raise exceptions.XMLError("Não existe XML no pacote SPS: %s", ex)
        xml_path = os.path.join(folder, x_file)
        obj_xml = xml.loadToXML(xml_path)
        xml_sps = SPS_Package(obj_xml)
        prefix = xml_sps.media_prefix
        url_xml = storage.register(xml_path, prefix)
        assets = []
        for m_file in medias_files:
            assets.append({
                "asset_id": m_file,
                "asset_url": storage.register(os.path.join(folder, m_file), prefix),
            })
    # Renditions are intentionally left empty in this flow.
    renditions = []
    if obj_xml:
        documentstore_data = {
            "data": url_xml,
            "assets": assets,
            "renditions": renditions,
        }
        scielo_id = xml_sps.scielo_id
        if scielo_id:
            # PUT the package data keyed by the document's scielo-id.
            result = request.put(
                request.join(settings.DOCUMENT_STORE_URL,
                             "/documents/%s" % scielo_id),
                data=json.dumps(documentstore_data),
            )
            logger.info("Retorno Documents-Store: %s", result.status_code)
class Test_MatchPubDate1_Season(unittest.TestCase):
    """Pub-date handling when the collection date uses <season> instead of <month>."""

    def setUp(self):
        xml = """<article><article-meta>
            <pub-date date-type="pub">
            <year>2010</year><month>5</month><day>13</day></pub-date>
            <pub-date date-type="collection">
            <year>2012</year><season>Jan-Feb</season></pub-date>
            </article-meta></article>"""
        xmltree = etree.fromstring(xml)
        self.sps_package = SPS_Package(xmltree, None)

    def test__match_pubdate(self):
        result = self.sps_package._match_pubdate(
            ('pub-date[@date-type="pub"]', 'pub-date[@date-type="collection"]'))
        self.assertEqual(result.findtext("year"), "2010")

    def test_document_pubdate(self):
        self.assertEqual(self.sps_package.document_pubdate, ("2010", "05", "13"))

    def test_documents_bundle_pubdate(self):
        # A season carries no numeric month/day -> empty strings.
        self.assertEqual(self.sps_package.documents_bundle_pubdate,
                         ("2012", "", ""))

    def test_transform_pubdate(self):
        self.sps_package.transform_pubdate()
        # "pub" date keeps its numeric parts and gains publication-format.
        pubdate = self.sps_package.article_meta.find(
            'pub-date[@date-type="pub"]')
        self.assertIsNotNone(pubdate)
        self.assertEqual(pubdate.get("publication-format"), "electronic")
        self.assertEqual(pubdate.findtext("year"), "2010")
        self.assertEqual(pubdate.findtext("month"), "5")
        self.assertEqual(pubdate.findtext("day"), "13")
        # "collection" date keeps the <season> element untouched.
        pubdate = self.sps_package.article_meta.find(
            'pub-date[@date-type="collection"]')
        self.assertIsNotNone(pubdate)
        self.assertEqual(pubdate.get("publication-format"), "electronic")
        self.assertEqual(pubdate.findtext("year"), "2012")
        self.assertEqual(pubdate.findtext("season"), "Jan-Feb")
def register_document(folder: str, session_db, storage) -> None:
    """Register an SPS package: push XML and media assets to storage, then
    record the document manifest and a change event in the session DB.

    This variant treats every non-XML file as an asset (no renditions).

    :param folder: path of the SPS package directory.
    :param session_db: document-store session (``documents``/``changes``).
    :param storage: object storage used to register files.
    :returns: ``(obj_xml, manifest_id)`` tuple.
    :raises exceptions.XMLError: when the package has zero or several XMLs.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    # Everything that is not an XML counts as a media asset here.
    medias_files = set(list_files) - set(xml_files)
    if len(xml_files) > 1:
        # NOTE(review): the extra arg is never interpolated into the message
        # (exception constructors don't format logger-style) — confirm intent.
        raise exceptions.XMLError("Existe %s xmls no pacote SPS", len(xml_files))
    else:
        try:
            x_file = xml_files[0]
        except IndexError as ex:
            raise exceptions.XMLError("Não existe XML no pacote SPS: %s", ex)
        xml_path = os.path.join(folder, x_file)
        obj_xml = xml.loadToXML(xml_path)
        xml_sps = SPS_Package(obj_xml)
        prefix = xml_sps.media_prefix
        url_xml = storage.register(xml_path, prefix)
        assets = []
        for m_file in medias_files:
            assets.append({
                "asset_id": m_file,
                "asset_url": storage.register(os.path.join(folder, m_file), prefix),
            })
    if obj_xml:
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))
        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # Already registered: log and fall through to return the ids.
            logger.exception(exc)
    return obj_xml, manifest_data.id()
def convert_article_xml(file_xml_path: str, spy=False, poison_pill=PoisonPill()):
    """Convert an article XML to SPS 1.9, completing missing dates from the
    ``artigo`` base, and write the result under CONVERSION_PATH.

    :param file_xml_path: path of the XML to convert.
    :param spy: forwarded to ``transform_body`` (presumably enables tracing
        of the transformation — TODO confirm).
    :param poison_pill: cooperative-cancellation flag; when poisoned the
        function returns immediately.
    """
    if poison_pill.poisoned:
        return
    logger.info(os.path.basename(file_xml_path))
    obj_xmltree = xml.loadToXML(file_xml_path)
    obj_xml = obj_xmltree.getroot()
    # Stamp the target SPS / DTD versions on the root element.
    obj_xml.set("specific-use", "sps-1.9")
    obj_xml.set("dtd-version", "1.1")
    xml_sps = SPS_Package(obj_xmltree)
    # Convert the AM body to SPS
    xml_sps.transform_body(spy)
    # Transform the XML into SPS 1.9
    xml_sps.transform_content()
    # Complete dates present in the "artigo" base but missing from the XML
    json_file_path = Path(config.get("SOURCE_PATH")).joinpath(
        Path(xml_sps.scielo_pid_v2 + ".json"))
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    xml_sps.complete_pub_date(document_pubdate, issue_pubdate)
    # Remove the <counts> tag from the XML
    xml_sps.transform_article_meta_count()
    languages = "-".join(xml_sps.languages)
    _, fname = os.path.split(file_xml_path)
    fname, fext = fname.rsplit(".", 1)
    # Output name embeds the document languages: <name>.<lang1-lang2>.<ext>
    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     "%s.%s.%s" % (fname, languages, fext))
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
def update_xml_file(self, xml_target_path, row, pack_name):
    """Read the package XML, update it with the article data carried by the
    CSV ``row``, persist it in place, and return the updated SPS_Package.
    """
    tree = xml.loadToXML(xml_target_path)
    logger.debug('Updating XML "%s" with CSV info', xml_target_path)
    updated = self._update_sps_package_obj(
        SPS_Package(tree), pack_name, row, xml_target_path)
    # Write the modified tree back to the same file.
    xml.objXML2file(xml_target_path, updated.xmltree, pretty=True)
    return updated
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list, renditions: List[dict]) -> dict:
    """Create a Kernel-format manifest from an XML document.

    :param document: parsed article XML tree.
    :param document_url: public URL of the registered XML.
    :param assets: dicts with ``asset_id`` and ``asset_url`` keys.
    :param renditions: dicts with ``filename``/``url``/``size_bytes``/
        ``mimetype`` and optional ``lang`` keys.
    :raises ValueError: when the document lacks a scielo-id or a pub date.
    """
    obj_sps = SPS_Package(document)
    _id = obj_sps.scielo_id
    date = obj_sps.document_pubdate
    if not _id:
        raise ValueError("Document requires an scielo-id") from None
    if not date:
        raise ValueError("A creation date is required") from None
    # Join only the non-empty date parts (year[-month[-day]]).
    _creation_date = parse_date("-".join(
        [date_part for date_part in date if date_part]))
    _renditions = []
    _version = {
        "data": document_url,
        "assets": {},
        "timestamp": _creation_date,
        "renditions": _renditions,
    }
    _document = {"id": _id, "versions": [_version]}
    for asset in assets:
        _version["assets"][asset.get("asset_id")] = [[
            _creation_date, asset.get("asset_url")
        ]]
    for rendition in renditions:
        _renditions.append({
            "filename": rendition.get("filename"),
            "data": [{
                "timestamp": _creation_date,
                "url": rendition.get("url"),
                "size_bytes": rendition.get("size_bytes"),
            }],
            "mimetype": rendition.get("mimetype"),
            # Missing lang falls back to the document's first language.
            "lang": rendition.get("lang", obj_sps.languages[0]),
        })
    return _document
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Ensure the XML's PID v3 is registered, then persist the XML.

    When ``in_place`` is True the source file is overwritten; otherwise the
    result is written under ``dest_path`` with the same basename.
    """
    logger.debug("file: %s", file_xml_path)
    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, xml_sps)
    target_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
class Test_SPS_Package(unittest.TestCase):
    """Asset discovery and renaming over every xlink:href-bearing element kind."""

    def setUp(self):
        article_xml = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
            <inline-graphic xlink:href="a01tab01.gif"/>
            <graphic xlink:href="a01f01.gif"/>
            <ext-link xlink:href="a01tab02.gif"/>
            <ext-link xlink:href="mailto:a01f02.gif"/>
            <inline-supplementary-material xlink:href="a01tab03.gif"/>
            <supplementary-material xlink:href="a01tab04.gif"/>
            <media xlink:href="a01tab04.gif"/>
            </root>
            """
        self.sps_package = SPS_Package(etree.fromstring(article_xml), "a01")

    def test_elements_which_has_xlink_href(self):
        items = list(self.sps_package.elements_which_has_xlink_href)
        self.assertEqual(len(items), 7)
        # The property is expected to yield elements in tag-sorted order.
        self.assertEqual(
            [node.tag for node in items],
            sorted([
                "inline-graphic",
                "graphic",
                "ext-link",
                "ext-link",
                "inline-supplementary-material",
                "supplementary-material",
                "media",
            ]),
        )

    def test_replace_assets(self):
        # mailto: href is skipped, so 6 of the 7 elements are renamed.
        expected = [
            ("a01tab02.gif", "a01-gtab02"),
            ("a01f01.gif", "a01-gf01"),
            ("a01tab01.gif", "a01-gtab01"),
            ("a01tab03.gif", "a01-gtab03"),
            ("a01tab04.gif", "a01-gtab04"),
            ("a01tab04.gif", "a01-gtab04"),
        ]
        items = self.sps_package.replace_assets_names()
        self.assertEqual(len(items), 6)
        for i, item in enumerate(items):
            with self.subTest(i):
                self.assertEqual(expected[i][0], item[0])
                self.assertEqual(expected[i][1], item[1])
class Test_ArticleMetaCount(unittest.TestCase):
    """transform_article_meta_count() must strip the <counts> element."""

    def setUp(self):
        xml = """<article xmlns:xlink="http://www.w3.org/1999/xlink"><article-meta>
            <counts>
            <fig-count count="0"/>
            <table-count count="0"/>
            <equation-count count="0"/>
            </counts>
            <body>
            <fig id="i01"><graphic xlink:href="/img/fbpe/rm/v30n1/0002i01.gif"/></fig>
            <table-wrap id="tab01"><label>Tabela 1</label><table><tr><td>TEXTO</td></tr></table></table-wrap>
            </body>
            </article-meta></article>"""
        xmltree = etree.fromstring(xml)
        self.sps_package = SPS_Package(xmltree, None)

    def test__transform_article_meta_count(self):
        result = self.sps_package.transform_article_meta_count()
        # The <counts> subtree must be gone from the returned tree.
        self.assertIsNone(result.find(".//counts"))
def sps_package(article_meta_xml, doi="10.1590/S0074-02761962000200006"):
    """Factory: wrap *article_meta_xml* in a full document via build_xml and
    return an SPS_Package named "a01"."""
    document = build_xml(article_meta_xml, doi)
    return SPS_Package(etree.fromstring(document), "a01")
class Test_SPS_Package(unittest.TestCase):
    """Asset renaming and renditions-metadata retrieval for SPS_Package."""

    def setUp(self):
        article_xml = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
            <inline-graphic xlink:href="a01tab01.gif"/>
            <graphic xlink:href="a01f01.gif"/>
            <ext-link xlink:href="a01tab02.gif"/>
            <ext-link xlink:href="mailto:a01f02.gif"/>
            <inline-supplementary-material xlink:href="a01tab03.gif"/>
            <supplementary-material xlink:href="a01tab04.gif"/>
            <media xlink:href="a01tab04.gif"/>
            </root>
            """
        self.sps_package = SPS_Package(etree.fromstring(article_xml), "a01")

    def test_elements_which_has_xlink_href(self):
        items = list(self.sps_package.elements_which_has_xlink_href)
        self.assertEqual(len(items), 7)
        # The property is expected to yield elements in tag-sorted order.
        self.assertEqual(
            [node.tag for node in items],
            sorted([
                "inline-graphic",
                "graphic",
                "ext-link",
                "ext-link",
                "inline-supplementary-material",
                "supplementary-material",
                "media",
            ]),
        )

    def test_replace_assets(self):
        # mailto: href is skipped, so 6 of the 7 elements are renamed.
        expected = [
            ("a01tab02.gif", "a01-gtab02"),
            ("a01f01.gif", "a01-gf01"),
            ("a01tab01.gif", "a01-gtab01"),
            ("a01tab03.gif", "a01-gtab03"),
            ("a01tab04.gif", "a01-gtab04"),
            ("a01tab04.gif", "a01-gtab04"),
        ]
        items = self.sps_package.replace_assets_names()
        self.assertEqual(len(items), 6)
        for i, item in enumerate(items):
            # FIX: pass the index as a keyword so it labels the subtest
            # instead of being swallowed as the msg positional.
            with self.subTest(i=i):
                self.assertEqual(expected[i][0], item[0])
                self.assertEqual(expected[i][1], item[1])

    @mock.patch("documentstore_migracao.export.sps_package.article.get_article")
    def test_get_renditions_metadata_no_renditions(self, mk_get_article):
        mk_article = mock.Mock()
        mk_article.fulltexts.return_value = {}
        mk_get_article.return_value = mk_article
        renditions, renditions_metadata = self.sps_package.get_renditions_metadata()
        self.assertEqual(renditions, [])
        self.assertEqual(renditions_metadata, {})

    @mock.patch("documentstore_migracao.export.sps_package.article.get_article")
    def test_get_renditions_metadata(self, mk_get_article):
        fulltexts = {
            "pdf": {
                "en": "http://www.scielo.br/pdf/aa/v1n1/a01.pdf",
                "pt": "http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf",
            },
            "html": {
                "en": "http://www.scielo.br/scielo.php?script=sci_arttext&tlng=en",
                "pt": "http://www.scielo.br/scielo.php?script=sci_arttext&tlng=pt",
            },
        }
        mk_article = mock.Mock()
        mk_article.fulltexts.return_value = fulltexts
        mk_get_article.return_value = mk_article
        renditions, renditions_metadata = self.sps_package.get_renditions_metadata()
        # FIX: the original iterated the dict directly, unpacking each KEY
        # ("en"/"pt") into (lang, link) — it only worked because language
        # codes happen to be two characters long. Iterate .items() instead.
        for lang, link in fulltexts.get("pdf").items():
            self.assertEqual(renditions, [
                ('http://www.scielo.br/pdf/aa/v1n1/a01.pdf', 'a01'),
                ('http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf', 'pt_a01'),
            ])
            self.assertEqual(
                renditions_metadata, {
                    'en': 'http://www.scielo.br/pdf/aa/v1n1/a01.pdf',
                    'pt': 'http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf',
                })
def register_document(folder: str, session, storage, pid_database_engine,
                      poison_pill=PoisonPill()) -> None:
    """Register an SPS package into a Kernel instance and its digital assets
    into an object storage.

    Steps: fix the XML's PIDs, skip documents already present in the Kernel,
    push XML/assets/renditions to storage, then add the Document (and its
    renditions) to the Kernel session.

    :param folder: path of the SPS package directory.
    :param session: Kernel session (``documents``).
    :param storage: object storage used to register files.
    :param pid_database_engine: engine forwarded to the XML constructor step.
    :param poison_pill: cooperative-cancellation flag.
    :returns: article result dict, or None when cancelled.
    :raises exceptions.XMLError: when no XML exists or parsing fails.
    """
    if poison_pill.poisoned:
        return
    logger.debug("Starting the import step for '%s' package.", folder)
    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)
    if xmls is None or len(xmls) == 0:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        ) from None
    xml_path = os.path.join(folder, xmls[0])
    # Rewrite the XML in place (in_place=False but dest == folder) so it
    # carries consistent v2/v3 PIDs before import.
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)
    try:
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError as exc:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None
    xml_sps = SPS_Package(obj_xml)
    pid_v3 = xml_sps.scielo_pid_v3
    # Idempotency: skip documents that already exist in the Kernel.
    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        pass
    else:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)
    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)
    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)
    # Additional files are stored but not tracked in the manifest.
    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)
    renditions = get_document_renditions(folder, prefix, storage)
    document = Document(
        manifest=manifest.get_document_manifest(
            xml_sps, url_xml, registered_assets, renditions
        )
    )
    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())
    return get_article_result_dict(xml_sps)
def update_articles_mixed_citations(
    source: str,
    output_folder: str = None,
    override: bool = False,
    disable_bar: bool = False,
):
    """Update the ``mixed-citation`` elements of one or more XML files.

    The updated XML can be saved over the original file or, when
    ``output_folder`` is given, written to that directory instead.
    Set ``override`` to True to overwrite every reference's mixed citation;
    otherwise only references without a mixed citation are updated (default).

    :param source: XML file or directory containing XML files.
    :param output_folder: optional output directory.
    :param override: overwrite existing mixed citations when True.
    :param disable_bar: disable the tqdm progress bar.
    :raises FileNotFoundError: when ``source`` or ``output_folder`` is missing.
    """
    CACHE_DIR = config.get("PARAGRAPH_CACHE_PATH")
    if not os.path.exists(source):
        raise FileNotFoundError("Source path '%s' does not exist" % source)
    elif output_folder is not None and not os.path.exists(output_folder):
        raise FileNotFoundError("Output folder '%s' does not exist" % output_folder)

    def get_references_text_from_paragraphs(paragraphs: list, pid: str) -> dict:
        """Filter the references out of the paragraph records.

        References share the paragraph structure in the MST base except for
        the index field (v888). A record is considered a reference when it
        has the index/order field (v888) and the article `PID` key (v880).

        Params:
            paragraphs (List[dict]): paragraph records extracted from the MST base
            pid (str): document identifier in `scielo-v2` format

        Returns:
            references (Dict[str, str]): filtered references, e.g.
            {"order": "text"}
        """
        references = {}
        for paragraph in paragraphs:
            article_pid = get_nested(paragraph, "v880", 0, "_", default=None)
            index = get_nested(paragraph, "v888", 0, "_", default=-1)
            # Only records with an order (v888) for this article (v880) count.
            if index != -1 and article_pid == pid:
                references[index] = XMLUtils.cleanup_mixed_citation_text(
                    get_nested(paragraph, "v704", 0, "_"))
        return references

    def get_output_file_path(original_file, output_folder=None):
        """Return the full output path for a file (in place when no folder given)."""
        if output_folder is None:
            return original_file
        return os.path.join(output_folder, os.path.basename(original_file))

    def get_paragraphs_from_cache(file) -> list:
        """Return a list of paragraph records from a JSON-lines cache file."""
        paragraphs = []
        with open(file, "r") as f:
            for line in f.readlines():
                paragraphs.append(json.loads(line))
        return paragraphs

    xmls = get_files_in_path(source, extension=".xml")
    with tqdm(total=len(xmls), disable=disable_bar) as pbar:
        for xml in xmls:
            try:
                package = SPS_Package(etree.parse(xml))
                if package.scielo_pid_v2 is None:
                    logger.error(
                        "Could not update file '%s' because its PID is unknown.",
                        xml)
                    continue
                # Paragraph cache files are keyed by the document's PID v2.
                paragraph_file = f"{CACHE_DIR}/{package.scielo_pid_v2}.json"
                paragraphs = get_paragraphs_from_cache(paragraph_file)
                references = get_references_text_from_paragraphs(
                    paragraphs, pid=package.scielo_pid_v2)
                updated = package.update_mixed_citations(references,
                                                         override=override)
                output_file = get_output_file_path(xml, output_folder)
                XMLUtils.objXML2file(output_file, package.xmltree, pretty=True)
                if len(updated) > 0:
                    logger.debug("Updated %0.3d references from '%s' file.",
                                 len(updated), xml)
            except etree.XMLSyntaxError as e:
                logger.error(e)
            except FileNotFoundError as e:
                # Missing paragraph cache: skip this document, keep going.
                logger.error(
                    "Could not update file '%s' "
                    "the exception '%s' occurred.", xml, e)
            # Progress advances even for skipped/failed files.
            pbar.update(1)
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML and its digital assets into an SPS package.

    Args:
        file_xml_path: path of the XML
        poison_pill: injected PoisonPill() for cooperative cancellation

    Returns:
        Nothing. Persists the XML under ``package_path``.

    Example:
        packing.pack_article_xml(
            os.path.join("S0044-59672003000300002.xml")
        )

    Raises:
        Does not raise exceptions.
    """
    if poison_pill.poisoned:
        return
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)
    # Use the last 5 chars of the PID v2 as the "other" article-id (order).
    sps_package.fix(
        "article_id_which_id_type_is_other",
        sps_package.scielo_pid_v2 and sps_package.scielo_pid_v2[-5:],
        silently=True)
    # ISSNs lookup is keyed by chars [1:10] of the PID v2.
    # NOTE(review): unlike the fix() call above, this is not guarded against
    # scielo_pid_v2 being None — confirm it can never be None here.
    new_issns = ISSNs and ISSNs.get(sps_package.scielo_pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)
    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")
    # Complete packages go to pkg_path; incomplete ones to incomplete_pkg_path.
    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)
    # Rename asset hrefs in the XML; set() drops duplicated pairs.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))
    # Renditions metadata comes from the source JSON, keyed by PID v2.
    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))
    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )
    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml)