def convert_article_xml(file_xml_path):
    """Convert an article XML to SPS 1.9 and save it under ``CONVERSION_PATH``.

    The root element is stamped with ``specific-use="sps-1.9"`` and
    ``dtd-version="1.1"``, the package transformations are applied, and the
    result is written as ``<name>.<languages>.<ext>``, where ``languages`` is
    the ``-``-joined list reported by the package.

    Args:
        file_xml_path: Path of the XML file to convert. The file name must
            contain at least one ``.`` (it is split on the last one).
    """
    tree = xml.loadToXML(file_xml_path)

    # Stamp the root with the target SPS / DTD versions.
    root = tree.getroot()
    root.set("specific-use", "sps-1.9")
    root.set("dtd-version", "1.1")

    xml_sps = SPS_Package(tree)
    xml_sps.transform_body()  # convert the AM body to SPS
    xml_sps.transform_pubdate()  # convert pub-date to SPS 1.9
    xml_sps.create_scielo_id()  # build the scielo-id in the converted XML
    xml_sps.transform_article_meta_count()  # remove the <counts> tag

    languages = "-".join(xml_sps.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)

    target_path = os.path.join(
        config.get("CONVERSION_PATH"),
        "%s.%s.%s" % (stem, languages, extension),
    )
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list) -> dict:
    """Build a Kernel-format manifest from an XML document.

    Args:
        document: Parsed XML document, wrapped in ``SPS_Package`` to read its
            scielo-id and publication date.
        document_url: URL stored as the version's ``data``.
        assets: Iterable of mappings read through ``asset_id`` and
            ``asset_url``.

    Returns:
        ``{"id": ..., "versions": [version]}`` where ``version`` holds the
        document URL, the assets keyed by id and the creation timestamp.

    Raises:
        ValueError: If the document has no scielo-id or no publication date.
    """
    package = SPS_Package(document)
    document_id = package.scielo_id
    pubdate = package.document_pubdate

    if not document_id:
        raise ValueError("Document requires an scielo-id") from None
    if not pubdate:
        raise ValueError("A creation date is required") from None

    # Only the date parts that are actually present are joined.
    timestamp = parse_date("-".join(filter(None, pubdate)))

    # Each asset maps to a single [timestamp, url] entry.
    versioned_assets = {
        asset.get("asset_id"): [[timestamp, asset.get("asset_url")]]
        for asset in assets
    }

    version = {
        "data": document_url,
        "assets": versioned_assets,
        "timestamp": timestamp,
    }
    return {"id": document_id, "versions": [version]}
def pack_article_xml(file_xml_path):
    """Pack an article XML with its digital assets and renditions.

    Writes the assets and renditions through ``packing_assets``, then stores a
    ``manifest.json`` with the renditions metadata and the XML itself (named
    after the package) in the resulting package directory.

    Args:
        file_xml_path: Path of the XML file to pack.
    """
    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    complete_root = config.get("SPS_PKG_PATH")
    incomplete_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    pkg_path = os.path.join(complete_root, original_filename)
    bad_pkg_path = os.path.join(incomplete_root, original_filename)

    # Duplicated replacements are collapsed before packing.
    assets = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        assets + renditions,
        pkg_path,
        bad_pkg_path,
        sps_package.package_name,
    )

    manifest_path = os.path.join(package_path, "manifest.json")
    files.write_file(manifest_path, json.dumps(renditions_metadata))

    packed_xml_path = os.path.join(
        package_path, "%s.xml" % (sps_package.package_name)
    )
    xml.objXML2file(packed_xml_path, obj_xml)
def pack_article_xml(file_xml_path):
    """Pack an article XML with its digital assets.

    The package directory is emptied/created first, the assets are written
    through ``packing_assets`` and the XML (named after the package) is saved
    in the resulting package directory.

    Args:
        file_xml_path: Path of the XML file to pack.
    """
    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    complete_root = config.get("SPS_PKG_PATH")
    incomplete_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    pkg_path = os.path.join(complete_root, original_filename)
    bad_pkg_path = os.path.join(incomplete_root, original_filename)

    files.make_empty_dir(pkg_path)

    # Duplicated replacements are collapsed before packing.
    assets = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    package_path = packing_assets(
        assets, pkg_path, bad_pkg_path, sps_package.package_name
    )

    packed_xml_path = os.path.join(
        package_path, "%s.xml" % (sps_package.package_name)
    )
    xml.objXML2file(packed_xml_path, obj_xml)
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Ensure the XML carries a PID v3 and write it back.

    The PID v3 is looked up in the PID database through the document's PID v2.
    When it is already registered it is copied into the XML; otherwise a new
    scielo-id is generated and the v2/v3 pair is registered.

    Args:
        file_xml_path: Path of the XML file to process.
        dest_path: Destination directory, used only when ``in_place`` is false.
        pid_database_engine: Handle passed through to ``pid_manager``.
        in_place: When true, overwrite ``file_xml_path`` instead of writing a
            copy into ``dest_path``.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    pid_v2 = xml_sps.scielo_pid_v2

    if pid_manager.check_pid_v3_by_v2(pid_database_engine, pid_v2):
        # The PID v3 already exists in the database: add it to the XML.
        xml_sps.scielo_pid_v3 = pid_manager.get_pid_v3_by_v2(
            pid_database_engine, pid_v2
        )
    else:
        # Build the scielo-id in the XML, then register the v2/v3 pair.
        xml_sps.create_scielo_id()
        pid_manager.create_pid(
            pid_database_engine, pid_v2, xml_sps.scielo_pid_v3
        )

    target_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
def setUp(self):
    """Build an ``SPS_Package`` with ``epub`` and ``epub-ppub`` pub-dates."""
    # Adjacent literals concatenate to exactly the original fixture text.
    article_xml = (
        "<article><article-meta> "
        '<pub-date pub-type="epub"> '
        "<year>2010</year><month>9</month><day>10</day></pub-date> "
        '<pub-date pub-type="epub-ppub"> '
        "<year>2011</year></pub-date> "
        "</article-meta></article>"
    )
    self.sps_package = SPS_Package(etree.fromstring(article_xml), None)
def setUp(self):
    """Build an ``SPS_Package`` with ``pub`` and ``collection`` pub-dates.

    The raw XML text is kept on ``self.xml``.
    """
    # Adjacent literals concatenate to exactly the original fixture text.
    self.xml = (
        "<article><article-meta> "
        '<pub-date date-type="pub"> '
        "<year>2010</year><month>5</month><day>13</day></pub-date> "
        '<pub-date date-type="collection"> '
        "<year>2012</year><month>2</month><day>3</day></pub-date> "
        "</article-meta></article>"
    )
    parsed = etree.fromstring(self.xml)
    self.sps_package = SPS_Package(parsed, None)
def register_document(folder: str, session_db, storage) -> None:
    """Register an SPS package folder: XML, assets, renditions and manifest.

    The single XML of the folder and its static assets are sent to
    ``storage``; a manifest is then built and added to ``session_db`` together
    with a change record.

    Args:
        folder: Path of the SPS package folder.
        session_db: Session exposing ``documents.add`` and ``changes.add``.
        storage: Object storage exposing ``register(path, prefix)``.

    Returns:
        Despite the ``-> None`` annotation, a ``(obj_xml, document_id)`` tuple
        is returned once the manifest has been built.

    Raises:
        exceptions.XMLError: If the folder holds no XML or more than one.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)

    # Renditions are the files whose name contains ".pdf" or ".html".
    _renditions = [
        name for name in list_files if ".pdf" in name or ".html" in name
    ]

    # The message is interpolated here: exceptions do not format extra
    # positional arguments the way logging calls do.
    if len(xml_files) > 1:
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    # TODO: some articles may not have self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder)
    registered_assets = put_static_assets_into_storage(
        static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    # NOTE(review): truthiness of the parsed tree depends on what
    # xml.loadToXML returns - confirm this guard is still needed.
    if obj_xml:
        renditions = get_document_renditions(
            folder, _renditions, prefix, storage)
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(
                obj_xml, url_xml, registered_assets, renditions))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # An already registered document is logged, not treated as fatal.
            logger.exception(exc)

        return obj_xml, manifest_data.id()
def document_store_by_request(folder: str, storage) -> None:
    """Register an SPS package folder and PUT it to the Document Store.

    The single XML of the folder and every media file (anything that is
    neither an XML nor a rendition) are sent to ``storage``. When the XML
    carries a scielo-id, the resulting payload is sent with ``request.put`` to
    ``settings.DOCUMENT_STORE_URL``.

    Args:
        folder: Path of the SPS package folder.
        storage: Object storage exposing ``register(path, prefix)``.

    Raises:
        exceptions.XMLError: If the folder holds no XML or more than one.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)

    # Renditions are the files whose name contains ".pdf" or ".html".
    _renditions = [
        name for name in list_files if ".pdf" in name or ".html" in name
    ]
    medias_files = set(list_files) - set(xml_files) - set(_renditions)

    # The message is interpolated here: exceptions do not format extra
    # positional arguments the way logging calls do.
    if len(xml_files) > 1:
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = [
        {
            "asset_id": m_file,
            "asset_url": storage.register(
                os.path.join(folder, m_file), prefix),
        }
        for m_file in medias_files
    ]

    # Renditions are not registered by this function; the list stays empty.
    renditions = []

    # NOTE(review): truthiness of the parsed tree depends on what
    # xml.loadToXML returns - confirm this guard is still needed.
    if obj_xml:
        documentstore_data = {
            "data": url_xml,
            "assets": assets,
            "renditions": renditions,
        }

        scielo_id = xml_sps.scielo_id
        if scielo_id:
            result = request.put(
                request.join(settings.DOCUMENT_STORE_URL,
                             "/documents/%s" % scielo_id),
                data=json.dumps(documentstore_data),
            )
            logger.info("Retorno Documents-Store: %s", result.status_code)
def setUp(self):
    """Build an ``SPS_Package`` named ``a01`` whose XML references assets.

    The fixture covers graphic, ext-link (including a ``mailto:`` one),
    supplementary-material and media elements carrying ``xlink:href``.
    """
    # Adjacent literals concatenate to exactly the original fixture text,
    # including the trailing space after </root>.
    fixture = (
        '<root xmlns:xlink="http://www.w3.org/1999/xlink"> '
        '<inline-graphic xlink:href="a01tab01.gif"/> '
        '<graphic xlink:href="a01f01.gif"/> '
        '<ext-link xlink:href="a01tab02.gif"/> '
        '<ext-link xlink:href="mailto:a01f02.gif"/> '
        '<inline-supplementary-material xlink:href="a01tab03.gif"/> '
        '<supplementary-material xlink:href="a01tab04.gif"/> '
        '<media xlink:href="a01tab04.gif"/> '
        "</root> "
    )
    parsed = etree.fromstring(fixture)
    self.sps_package = SPS_Package(parsed, "a01")
def register_document(folder: str, session_db, storage) -> None:
    """Register an SPS package folder: XML, media files and manifest.

    The single XML of the folder and every non-XML file are sent to
    ``storage``; a manifest is then built and added to ``session_db`` together
    with a change record.

    Args:
        folder: Path of the SPS package folder.
        session_db: Session exposing ``documents.add`` and ``changes.add``.
        storage: Object storage exposing ``register(path, prefix)``.

    Returns:
        Despite the ``-> None`` annotation, a ``(obj_xml, document_id)`` tuple
        is returned once the manifest has been built.

    Raises:
        exceptions.XMLError: If the folder holds no XML or more than one.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)
    medias_files = set(list_files) - set(xml_files)

    # The message is interpolated here: exceptions do not format extra
    # positional arguments the way logging calls do.
    if len(xml_files) > 1:
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = [
        {
            "asset_id": m_file,
            "asset_url": storage.register(
                os.path.join(folder, m_file), prefix),
        }
        for m_file in medias_files
    ]

    # NOTE(review): truthiness of the parsed tree depends on what
    # xml.loadToXML returns - confirm this guard is still needed.
    if obj_xml:
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # An already registered document is logged, not treated as fatal.
            logger.exception(exc)

        return obj_xml, manifest_data.id()
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Add a scielo-id to the XML and save a copy into ``dest_path``.

    Args:
        file_xml_path: Path of the XML file to process.
        dest_path: Directory that receives the file, under the same base name.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    xml_sps.create_scielo_id()  # build the scielo-id in the converted XML

    file_name = os.path.basename(file_xml_path)
    target_path = os.path.join(dest_path, file_name)
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
def setUp(self):
    """Build an ``SPS_Package`` whose XML has ``<counts>``, a fig and a table."""
    # Adjacent literals concatenate to exactly the original fixture text.
    article_xml = (
        '<article xmlns:xlink="http://www.w3.org/1999/xlink"><article-meta> '
        "<counts> "
        '<fig-count count="0"/> '
        '<table-count count="0"/> '
        '<equation-count count="0"/> '
        "</counts> "
        "<body> "
        '<fig id="i01"><graphic xlink:href="/img/fbpe/rm/v30n1/0002i01.gif"/></fig> '
        '<table-wrap id="tab01"><label>Tabela 1</label><table><tr><td>TEXTO</td></tr></table></table-wrap> '
        "</body> "
        "</article-meta></article>"
    )
    self.sps_package = SPS_Package(etree.fromstring(article_xml), None)
def update_xml_file(self, xml_target_path, row, pack_name):
    """Read the package XML, update it with the given row and save it back.

    Args:
        xml_target_path: Path of the XML file, read and then overwritten.
        row: Article data forwarded to ``_update_sps_package_obj``
            (presumably one record of the articles CSV - confirm with caller).
        pack_name: Package name forwarded to ``_update_sps_package_obj``.

    Returns:
        The package object returned by ``_update_sps_package_obj``.
    """
    tree = xml.loadToXML(xml_target_path)
    logger.debug('Updating XML "%s" with CSV info', xml_target_path)

    package = SPS_Package(tree)
    sps_package = self._update_sps_package_obj(
        package, pack_name, row, xml_target_path
    )

    # Save the XML with the changes applied.
    xml.objXML2file(xml_target_path, sps_package.xmltree, pretty=True)
    return sps_package
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list, renditions: List[dict]) -> dict:
    """Build a Kernel-format manifest from an XML document.

    Args:
        document: Parsed XML document, wrapped in ``SPS_Package`` to read its
            scielo-id, publication date and languages.
        document_url: URL stored as the version's ``data``.
        assets: Iterable of mappings read through ``asset_id`` and
            ``asset_url``.
        renditions: Mappings read through ``filename``, ``url``,
            ``size_bytes``, ``mimetype`` and, optionally, ``lang``.

    Returns:
        ``{"id": ..., "versions": [version]}`` where ``version`` holds the
        document URL, the assets keyed by id, the creation timestamp and the
        renditions list.

    Raises:
        ValueError: If the document has no scielo-id or no publication date.
        IndexError: If a rendition has no ``lang`` key and the document
            reports no language to fall back on.
    """
    obj_sps = SPS_Package(document)
    _id = obj_sps.scielo_id
    date = obj_sps.document_pubdate

    if not _id:
        raise ValueError("Document requires an scielo-id") from None

    if not date:
        raise ValueError("A creation date is required") from None

    # Only the date parts that are actually present are joined.
    _creation_date = parse_date("-".join(
        [date_part for date_part in date if date_part]))

    _renditions = []
    _version = {
        "data": document_url,
        "assets": {},
        "timestamp": _creation_date,
        "renditions": _renditions,
    }
    _document = {"id": _id, "versions": [_version]}

    for asset in assets:
        # Each asset maps to a single [timestamp, url] entry.
        _version["assets"][asset.get("asset_id")] = [[
            _creation_date, asset.get("asset_url")
        ]]

    for rendition in renditions:
        # The document's first language is only a fallback, so it is looked up
        # lazily: a rendition that carries its own "lang" must not fail when
        # the document reports no language at all.
        if "lang" in rendition:
            lang = rendition.get("lang")
        else:
            lang = obj_sps.languages[0]

        _renditions.append({
            "filename": rendition.get("filename"),
            "data": [{
                "timestamp": _creation_date,
                "url": rendition.get("url"),
                "size_bytes": rendition.get("size_bytes"),
            }],
            "mimetype": rendition.get("mimetype"),
            "lang": lang,
        })

    return _document
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Register the document's PID v3 and write the XML back.

    Args:
        file_xml_path: Path of the XML file to process.
        dest_path: Destination directory, used only when ``in_place`` is false.
        pid_database_engine: Handle passed through to ``register_pid_v3``.
        in_place: When true, overwrite ``file_xml_path`` instead of writing a
            copy into ``dest_path``.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, xml_sps)

    target_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
def convert_article_xml(file_xml_path: str, spy=False, poison_pill=PoisonPill()):
    """Convert an article XML to SPS 1.9 and save it under ``CONVERSION_PATH``.

    Publication dates missing from the XML are completed from the article
    JSON found at ``SOURCE_PATH/<pid v2>.json``. The result is written as
    ``<name>.<languages>.<ext>``, where ``languages`` is the ``-``-joined list
    reported by the package.

    Args:
        file_xml_path: Path of the XML file to convert. The file name must
            contain at least one ``.`` (it is split on the last one).
        spy: Flag forwarded to ``transform_body``.
        poison_pill: When its ``poisoned`` attribute is true the function
            returns immediately without doing anything.
    """
    if poison_pill.poisoned:
        return

    logger.info(os.path.basename(file_xml_path))
    tree = xml.loadToXML(file_xml_path)

    # Stamp the root with the target SPS / DTD versions.
    root = tree.getroot()
    root.set("specific-use", "sps-1.9")
    root.set("dtd-version", "1.1")

    xml_sps = SPS_Package(tree)
    xml_sps.transform_body(spy)  # convert the AM body to SPS
    xml_sps.transform_content()  # transform the XML into SPS 1.9

    # Complete dates present in the article database but absent from the XML.
    json_file_path = Path(config.get("SOURCE_PATH")) / Path(
        xml_sps.scielo_pid_v2 + ".json"
    )
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    xml_sps.complete_pub_date(document_pubdate, issue_pubdate)

    xml_sps.transform_article_meta_count()  # remove the <counts> tag

    languages = "-".join(xml_sps.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)

    target_path = os.path.join(
        config.get("CONVERSION_PATH"),
        "%s.%s.%s" % (stem, languages, extension),
    )
    xml.objXML2file(target_path, xml_sps.xmltree, pretty=True)
def update_articles_mixed_citations(
    source: str,
    output_folder: str = None,
    override: bool = False,
    disable_bar: bool = False,
):
    """Update the ``mixed-citation`` elements of one or more XML files.

    The result can be saved over the XML file itself or, through
    ``output_folder``, as a file with the same name in another directory.
    Set ``override`` to ``True`` to overwrite every mixed citation of the
    references; otherwise only references without a mixed citation are
    updated (default).

    Args:
        source: Existing path searched for ``.xml`` files.
        output_folder: Existing directory for the updated files, or ``None``
            to overwrite each source file.
        override: Forwarded to ``SPS_Package.update_mixed_citations``.
        disable_bar: Disables the progress bar when true.

    Raises:
        FileNotFoundError: If ``source`` or ``output_folder`` does not exist.
    """
    CACHE_DIR = config.get("PARAGRAPH_CACHE_PATH")

    if not os.path.exists(source):
        raise FileNotFoundError("Source path '%s' does not exist" % source)
    elif output_folder is not None and not os.path.exists(output_folder):
        raise FileNotFoundError("Output folder '%s' does not exist" % output_folder)

    def get_references_text_from_paragraphs(paragraphs: list, pid: str) -> dict:
        """Filter the references out of the paragraphs.

        References share the paragraph structure of the MST database except
        for the index (v888). A record is taken as a reference when it has the
        index/order (v888) and its article PID (v880) matches ``pid``.

        Params:
            paragraphs (List[dict]): Paragraphs extracted from the MST database
            pid (str): Document identifier in the `scielo-v2` format

        Returns:
            references (Dict[str, str]): Filtered references,
            e.g: {"order": "text"}
        """
        references = {}

        for paragraph in paragraphs:
            article_pid = get_nested(paragraph, "v880", 0, "_", default=None)
            index = get_nested(paragraph, "v888", 0, "_", default=-1)

            if index != -1 and article_pid == pid:
                references[index] = XMLUtils.cleanup_mixed_citation_text(
                    get_nested(paragraph, "v704", 0, "_"))

        return references

    def get_output_file_path(original_file, output_folder=None):
        """Return the full path of an output file."""
        if output_folder is None:
            return original_file

        return os.path.join(output_folder, os.path.basename(original_file))

    def get_paragraphs_from_cache(file) -> list:
        """Return the paragraphs stored one JSON document per line in ``file``."""
        with open(file, "r") as f:
            # Iterate the file lazily instead of materialising every line.
            return [json.loads(line) for line in f]

    xmls = get_files_in_path(source, extension=".xml")

    with tqdm(total=len(xmls), disable=disable_bar) as pbar:
        for xml_path in xmls:
            try:
                package = SPS_Package(etree.parse(xml_path))

                if package.scielo_pid_v2 is None:
                    logger.error(
                        "Could not update file '%s' because its PID is unknown.",
                        xml_path)
                    continue

                paragraph_file = f"{CACHE_DIR}/{package.scielo_pid_v2}.json"
                paragraphs = get_paragraphs_from_cache(paragraph_file)
                references = get_references_text_from_paragraphs(
                    paragraphs, pid=package.scielo_pid_v2)
                updated = package.update_mixed_citations(references,
                                                         override=override)
                output_file = get_output_file_path(xml_path, output_folder)
                XMLUtils.objXML2file(output_file, package.xmltree, pretty=True)

                if len(updated) > 0:
                    logger.debug("Updated %0.3d references from '%s' file.",
                                 len(updated), xml_path)
            except etree.XMLSyntaxError as e:
                logger.error(e)
            except FileNotFoundError as e:
                logger.error(
                    "Could not update file '%s' "
                    "the exception '%s' occurred.", xml_path, e)
            finally:
                # Advance in ``finally`` so files skipped through ``continue``
                # are still counted and the bar reaches its total.
                pbar.update(1)
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML file together with its digital assets and renditions.

    Args:
        file_xml_path: Path of the XML file.
        poison_pill: When its ``poisoned`` attribute is true the function
            returns immediately without doing anything.

    Returns:
        Nothing. The XML (named after the package) and a ``manifest.json``
        with the renditions metadata are persisted in the package path
        returned by ``packing_assets``.

    Example:
        packing.pack_article_xml(
            os.path.join("S0044-59672003000300002.xml")
        )

    Raises:
        Nothing is raised explicitly here; errors from the helpers propagate.
    """
    if poison_pill.poisoned:
        return

    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix(
        "article_id_which_id_type_is_other",
        sps_package.scielo_pid_v2 and sps_package.scielo_pid_v2[-5:],
        silently=True,
    )

    # The ISSN lookup slices the PID v2, so it needs the same missing-PID
    # guard as the fix above; without it a document with no PID raised
    # TypeError here.
    pid_v2 = sps_package.scielo_pid_v2
    new_issns = ISSNs and pid_v2 and ISSNs.get(pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH,
                                       original_filename)

    # Duplicated replacements are collapsed before packing.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
def register_document(folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()) -> None:
    """Register an SPS package in a Kernel instance and its assets in storage.

    The first XML of the folder gets its PID v3 through
    ``constructor.article_xml_constructor`` and is then parsed. If the Kernel
    already holds a document with that PID v3 nothing else is registered.
    Otherwise the XML, static assets, additional files and renditions are sent
    to ``storage`` and the document is added to the Kernel session.

    Args:
        folder: Path of the SPS package folder.
        session: Kernel session exposing ``documents.fetch``.
        storage: Object storage exposing ``register(path, prefix)``.
        pid_database_engine: Handle forwarded to the XML constructor.
        poison_pill: When its ``poisoned`` attribute is true the function
            returns immediately without doing anything.

    Returns:
        ``None`` when poisoned; otherwise the value of
        ``get_article_result_dict`` for the package (despite the ``-> None``
        annotation).

    Raises:
        exceptions.XMLError: If the folder has no XML file or the XML cannot
            be parsed.
    """
    if poison_pill.poisoned:
        return

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    if not xmls:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        ) from None

    xml_path = os.path.join(folder, xmls[0])
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)

    try:
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)
    pid_v3 = xml_sps.scielo_pid_v3

    # Probe the Kernel: a successful fetch means the document is already there.
    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        already_in_kernel = False
    else:
        already_in_kernel = True

    if already_in_kernel:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document_manifest = manifest.get_document_manifest(
        xml_sps, url_xml, registered_assets, renditions
    )
    document = Document(manifest=document_manifest)

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        # A concurrent/previous registration is logged, not treated as fatal.
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)
def sps_package(article_meta_xml, doi="10.1590/S0074-02761962000200006"):
    """Return an ``SPS_Package`` named ``a01`` built from ``build_xml`` output.

    Args:
        article_meta_xml: Article-meta content forwarded to ``build_xml``.
        doi: DOI forwarded to ``build_xml``.
    """
    document = etree.fromstring(build_xml(article_meta_xml, doi))
    return SPS_Package(document, "a01")