def list_converted_xml_view(request):
    """View listing the converted XML files, 20 per page.

    Combines the files found in the conversion folder with those already
    validated, and wraps them in a ``Page`` for template pagination.
    """
    converted = files.xml_files_list(config.get("CONVERSION_PATH"))
    validated = files.xml_files_list(config.get("VALID_XML_PATH"))
    all_xmls = converted + validated
    current_page = int(request.params.get("page", 1))
    xmls = Page(
        all_xmls,
        page=current_page,
        items_per_page=20,
        item_count=len(all_xmls),
    )
    return {"xmls": xmls, "page_title": "Lista de XMLS Convertidos"}
def convert_article_ALLxml(spy=False):
    """Convert every HTML/XML file found in the source folder.

    Files are converted concurrently in a process pool; per-file failures
    are logged and do not abort the batch.
    """
    logger.debug("Starting XML conversion, it may take sometime.")
    logger.warning(
        "If you are facing problems with Python crashing during "
        "conversion try to export this environment "
        "variable: `OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`"
    )
    source_path = config.get("SOURCE_PATH")
    jobs = [
        {"file_xml_path": os.path.join(source_path, name), "spy": spy}
        for name in files.xml_files_list(source_path)
    ]
    with tqdm(total=len(jobs)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not convert file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            convert_article_xml,
            jobs=jobs,
            executor=concurrent.futures.ProcessPoolExecutor,
            max_workers=int(config.get("PROCESSPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
def register_document(folder: str, session_db, storage):
    """Register an SPS package (XML, static assets, renditions) in the
    document store and in the object storage.

    Args:
        folder: path of the SPS package folder.
        session_db: document-store session exposing ``documents``/``changes``.
        storage: object storage used to register the package files.

    Returns:
        Tuple ``(obj_xml, document_id)`` for the registered document.
        (The original ``-> None`` annotation was wrong and was removed.)

    Raises:
        exceptions.XMLError: when the package has zero or multiple XMLs.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    # PDF/HTML files in the package are treated as renditions.
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files)
    )
    if len(xml_files) > 1:
        # BUG FIX: the count was passed as a separate Exception argument,
        # which is never %-interpolated; format the message explicitly.
        raise exceptions.XMLError("Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        # BUG FIX: same broken format-string usage as above.
        raise exceptions.XMLError("Não existe XML no pacote SPS: %s" % ex)
    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)
    # TODO: some articles may not have self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)
    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)
    for additional_path in static_additionals.values():
        # FIX: single-argument os.path.join was a no-op; register directly.
        storage.register(additional_path, prefix)
    if obj_xml:
        renditions = get_document_renditions(folder, _renditions, prefix, storage)
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(
                obj_xml, url_xml, registered_assets, renditions
            )
        )
        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add(
                {
                    "timestamp": utcnow(),
                    "entity": "Document",
                    "id": manifest_data.id(),
                }
            )
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            logger.exception(exc)
    # NOTE(review): if ``obj_xml`` were falsy, ``manifest_data`` would be
    # unbound here — kept as-is; loadToXML is assumed to return a document.
    return obj_xml, manifest_data.id()
def article_ALL_html_generator(source_path: str, dest_path: str) -> None:
    """Generate an HTML file for every XML found in ``source_path``.

    Args:
        source_path: folder containing the source XML files.
        dest_path: destination folder for the generated HTML files.

    Per-file failures are logged and do not abort the remaining files.
    """
    logger.info("Iniciando Geração dos HTMLs")
    list_files_xmls = files.xml_files_list(source_path)
    for file_xml in tqdm(list_files_xmls):
        try:
            article_html_generator(os.path.join(source_path, file_xml), dest_path)
        except Exception as ex:
            # BUG FIX: failures were logged at INFO level, hiding them; use
            # ERROR to match the other batch loops in this module.
            logger.error(
                "não foi possível gerar o html do Arquivo %s: %s", file_xml, ex
            )
def convert_article_ALLxml():
    """Convert every XML found in the configured source folder.

    Files are processed sequentially; a failure is logged and the loop
    moves on to the next file.
    """
    logger.info("Iniciando Conversão do xmls")
    source_path = config.get("SOURCE_PATH")
    for xml_name in tqdm(files.xml_files_list(source_path)):
        try:
            convert_article_xml(os.path.join(source_path, xml_name))
        except Exception as ex:
            logger.error(xml_name)
            logger.exception(ex)
def pack_article_ALLxml():
    """Build an SPS package for every validated XML file.

    Filesystem and XML-parsing errors are logged per file and do not
    stop the batch.
    """
    logger.info("Empacotando os documentos XML")
    valid_path = config.get("VALID_XML_PATH")
    list_files_xmls = files.xml_files_list(valid_path)
    for file_xml in tqdm(list_files_xmls):
        try:
            pack_article_xml(os.path.join(valid_path, file_xml))
        except (PermissionError, OSError, etree.Error) as ex:
            # IDIOM FIX: pass the argument lazily to the logger instead of
            # eager %-formatting, matching the rest of this module.
            logger.error("Falha no empacotamento de %s", file_xml)
            logger.exception(ex)
def register_document(folder: str, session_db, storage):
    """Register an SPS package and its media files in the document store
    and in the object storage.

    Args:
        folder: path of the SPS package folder.
        session_db: document-store session exposing ``documents``/``changes``.
        storage: object storage used to register the package files.

    Returns:
        Tuple ``(obj_xml, document_id)`` for the registered document.
        (The original ``-> None`` annotation was wrong and was removed.)

    Raises:
        exceptions.XMLError: when the package has zero or multiple XMLs.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    # Everything that is not an XML is treated as a media asset.
    medias_files = set(list_files) - set(xml_files)
    if len(xml_files) > 1:
        # BUG FIX: the count was passed as a separate Exception argument,
        # which is never %-interpolated; format the message explicitly.
        raise exceptions.XMLError("Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        # BUG FIX: same broken format-string usage as above.
        raise exceptions.XMLError("Não existe XML no pacote SPS: %s" % ex)
    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)
    assets = [
        {
            "asset_id": m_file,
            "asset_url": storage.register(os.path.join(folder, m_file), prefix),
        }
        for m_file in medias_files
    ]
    if obj_xml:
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(obj_xml, url_xml, assets)
        )
        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add(
                {
                    "timestamp": utcnow(),
                    "entity": "Document",
                    "id": manifest_data.id(),
                }
            )
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            logger.exception(exc)
    # NOTE(review): if ``obj_xml`` were falsy, ``manifest_data`` would be
    # unbound here — kept as-is; loadToXML is assumed to return a document.
    return obj_xml, manifest_data.id()
def validate_article_ALLxml(move_to_processed_source=False, move_to_valid_xml=False):
    """Validate every converted XML and report aggregated error counts.

    Error files are written next to ``XML_ERRORS_PATH`` when configured.
    Valid files are copied (or moved, when ``move_to_valid_xml`` is True)
    to the valid-XML folder; when ``move_to_processed_source`` is True the
    original source XML is moved to the processed-source folder. At the
    end, error kinds are logged sorted by descending occurrence count.
    """
    logger.info("Iniciando Validação dos xmls")
    conversion_path = config.get("CONVERSION_PATH")
    success_path = config.get("VALID_XML_PATH")
    errors_path = config.get("XML_ERRORS_PATH")
    transfer = shutil.move if move_to_valid_xml else shutil.copyfile
    summary = {}
    for xml_name in tqdm(files.xml_files_list(conversion_path)):
        base_name, _ = files.extract_filename_ext_by_path(xml_name)
        converted_file = os.path.join(conversion_path, xml_name)
        try:
            errors = validate_article_xml(converted_file, False)
            for error_key, error_value in errors.items():
                dicts.merge(summary, error_key, error_value)
            if errors_path:
                manage_error_file(
                    errors,
                    os.path.join(errors_path, "%s.err" % base_name),
                    converted_file,
                )
            if errors:
                continue
            if success_path:
                transfer(converted_file, os.path.join(success_path, xml_name))
            if move_to_processed_source:
                files.move_xml_to(
                    "%s.xml" % base_name,
                    config.get("SOURCE_PATH"),
                    config.get("PROCESSED_SOURCE_PATH"),
                )
        except Exception as ex:
            logger.exception(ex)
            raise
    ranked = sorted(summary.items(), key=lambda item: item[1]["count"], reverse=True)
    for error_key, error_info in ranked:
        logger.error("%s - %s", error_key, error_info["count"])
def pack_article_ALLxml():
    """Build SPS packages from the list of valid XMLs.

    Takes no arguments and returns nothing; each package is persisted
    under the configured package path. Per-file failures are reported by
    the exception callback instead of raising.
    """
    valid_path = config.get("VALID_XML_PATH")
    jobs = [
        {"file_xml_path": os.path.join(valid_path, name)}
        for name in files.xml_files_list(valid_path)
    ]
    with tqdm(total=len(jobs), initial=0) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not pack file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            pack_article_xml,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
def pack_article_ALLxml():
    """Build SPS packages from the list of valid XMLs.

    Takes no arguments and returns nothing; each package is persisted
    under the configured package path. Per-file failures are logged and
    do not abort the batch.
    """
    xmls = [
        os.path.join(config.get("VALID_XML_PATH"), xml)
        for xml in files.xml_files_list(config.get("VALID_XML_PATH"))
    ]
    jobs = [{"file_xml_path": xml} for xml in xmls]
    with tqdm(total=len(xmls), initial=0) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        # CONSISTENCY FIX: the other concurrent batch loops in this module
        # pass an exception callback; without it, packing failures went
        # unreported.
        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not pack file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            pack_article_xml,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
def test_xml_files_list(self):
    """The samples folder must contain the expected number of XML files."""
    found = files.xml_files_list(SAMPLES_PATH)
    self.assertEqual(len(found), COUNT_SAMPLES_FILES)
def register_document(
    folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()
) -> None:
    """Register an SPS package in a Kernel (document-store) instance and
    its digital assets in an object storage.

    NOTE(review): despite the ``-> None`` annotation, this function
    returns ``get_article_result_dict(xml_sps)`` both when the document
    already exists and after a successful import.
    """
    # Cooperative cancellation: abort before doing any work.
    # NOTE(review): ``poison_pill=PoisonPill()`` is a mutable default shared
    # by every call that omits the argument — confirm this is intentional.
    if poison_pill.poisoned:
        return

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    # The package must contain at least one XML; only the first is imported.
    if xmls is None or len(xmls) == 0:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        ) from None

    xml_path = os.path.join(folder, xmls[0])
    # Rewrites/completes the XML file on disk (PID handling) before parsing.
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)

    try:
        obj_xml = xml.loadToXML(xml_path)
    # NOTE(review): lxml reports parse failures as ``XMLSyntaxError``;
    # confirm ``lxml.etree.ParseError`` is the exception actually raised
    # by ``xml.loadToXML`` — otherwise this handler never matches.
    except lxml.etree.ParseError as exc:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)
    pid_v3 = xml_sps.scielo_pid_v3

    # Idempotency check: skip the import if the document is already in Kernel.
    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        pass
    else:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)
    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    # Register any additional (non-asset) files found in the package.
    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document = Document(
        manifest=manifest.get_document_manifest(
            xml_sps, url_xml, registered_assets, renditions
        )
    )

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        # Best-effort: an already-registered document is logged, not fatal.
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)