Esempio n. 1
0
def list_converted_xml_view(request):
    """Render the paginated listing of converted/validated XML files."""
    xml_names = files.xml_files_list(config.get("CONVERSION_PATH"))
    xml_names += files.xml_files_list(config.get("VALID_XML_PATH"))

    current_page = int(request.params.get("page", 1))
    paginated = Page(
        xml_names,
        page=current_page,
        items_per_page=20,
        item_count=len(xml_names),
    )
    return {"xmls": paginated, "page_title": "Lista de XMLS Convertidos"}
Esempio n. 2
0
def convert_article_ALLxml(spy=False):
    """Convert every HTML/XML file found in the source directory.

    Conversion jobs run concurrently in a process pool; a progress bar is
    advanced per job and failures are logged with the offending path.
    """
    logger.debug("Starting XML conversion, it may take sometime.")
    logger.warning("If you are facing problems with Python crashing during "
                   "conversion try to export this environment "
                   "variable: `OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`")

    xml_paths = [
        os.path.join(config.get("SOURCE_PATH"), name)
        for name in files.xml_files_list(config.get("SOURCE_PATH"))
    ]

    jobs = [{"file_xml_path": path, "spy": spy} for path in xml_paths]

    with tqdm(total=len(xml_paths)) as progress:

        def advance(progress=progress):
            # Called once per finished job.
            progress.update(1)

        def report_failure(exception, job, logger=logger):
            logger.error(
                "Could not convert file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            convert_article_xml,
            jobs=jobs,
            executor=concurrent.futures.ProcessPoolExecutor,
            max_workers=int(config.get("PROCESSPOOL_MAX_WORKERS")),
            exception_callback=report_failure,
            update_bar=advance,
        )
def register_document(folder: str, session_db, storage) -> None:
    """Register the SPS package found in *folder*.

    Uploads the package's XML, its static assets, any additional files and
    its PDF/HTML renditions to *storage*, builds a document manifest from
    them and stores it in *session_db*, recording a "Document" change entry.

    Raises ``exceptions.XMLError`` when the folder contains zero or more
    than one XML file.

    NOTE(review): despite the ``-> None`` annotation, the function returns
    ``(obj_xml, manifest_data.id())``. If ``obj_xml`` were falsy, the final
    line would raise ``NameError`` because ``manifest_data`` is only bound
    inside the ``if obj_xml`` block — presumably ``xml.loadToXML`` always
    returns a truthy tree or raises; TODO confirm.
    """

    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)

    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    # Renditions are the PDF/HTML files shipped alongside the XML.
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files))

    # Exactly one XML file per SPS package is allowed.
    if len(xml_files) > 1:
        raise exceptions.XMLError("Existe %s xmls no pacote SPS",
                                  len(xml_files))
    else:
        try:
            x_file = xml_files[0]
        except IndexError as ex:
            raise exceptions.XMLError("Não existe XML no pacote SPS: %s", ex)

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    # TODO: some articles may not have self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder)
    registered_assets = put_static_assets_into_storage(static_assets, prefix,
                                                       storage)

    # Additional files are uploaded but not referenced by the manifest.
    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    if obj_xml:
        renditions = get_document_renditions(folder, _renditions, prefix,
                                             storage)
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(
                obj_xml, url_xml, registered_assets, renditions))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # Duplicate registration is logged, not fatal.
            logger.exception(exc)

    return obj_xml, manifest_data.id()
Esempio n. 4
0
def article_ALL_html_generator(source_path: str, dest_path: str) -> None:
    """Generate an HTML file in *dest_path* for every XML in *source_path*.

    Failures are logged per file and do not stop the run.
    """
    logger.info("Iniciando Geração dos HTMLs")
    for xml_name in tqdm(files.xml_files_list(source_path)):
        try:
            article_html_generator(os.path.join(source_path, xml_name),
                                   dest_path)
        except Exception as ex:
            logger.info("não foi possível gerar o html do Arquivo %s: %s",
                        xml_name, ex)
Esempio n. 5
0
def convert_article_ALLxml():
    """Convert every XML file found in the configured source directory.

    Conversion failures are logged per file and do not stop the loop.
    """
    logger.info("Iniciando Conversão do xmls")
    for xml_name in tqdm(files.xml_files_list(config.get("SOURCE_PATH"))):
        try:
            convert_article_xml(
                os.path.join(config.get("SOURCE_PATH"), xml_name))
        except Exception as ex:
            logger.error(xml_name)
            logger.exception(ex)
def pack_article_ALLxml():
    """Build an SPS package for every XML in the valid-XML directory.

    Packaging failures (filesystem or XML parsing errors) are logged per
    file and do not abort the loop.
    """
    logger.info("Empacotando os documentos XML")
    list_files_xmls = files.xml_files_list(config.get("VALID_XML_PATH"))
    for file_xml in tqdm(list_files_xmls):

        try:
            pack_article_xml(os.path.join(config.get("VALID_XML_PATH"), file_xml))

        except (PermissionError, OSError, etree.Error) as ex:
            # Lazy %-args instead of eager "%" formatting: the message is
            # only rendered if the record is actually emitted.
            logger.error("Falha no empacotamento de %s", file_xml)
            logger.exception(ex)
def register_document(folder: str, session_db, storage) -> None:
    """Register the SPS package found in *folder*.

    Uploads the package's XML and every non-XML media file to *storage*,
    builds a document manifest from them and stores it in *session_db*,
    recording a "Document" change entry.

    Raises ``exceptions.XMLError`` when the folder contains zero or more
    than one XML file.

    NOTE(review): despite the ``-> None`` annotation, the function returns
    ``(obj_xml, manifest_data.id())``. If ``obj_xml`` were falsy, the last
    line would raise ``NameError`` since ``manifest_data`` is only bound
    inside the ``if obj_xml`` block — presumably ``xml.loadToXML`` always
    returns a truthy tree or raises; TODO confirm.
    """

    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)

    obj_xml = None
    prefix = ""
    xml_files = files.xml_files_list(folder)
    # Everything that is not an XML file is treated as a media asset.
    medias_files = set(list_files) - set(xml_files)

    # Exactly one XML file per SPS package is allowed.
    if len(xml_files) > 1:
        raise exceptions.XMLError("Existe %s xmls no pacote SPS",
                                  len(xml_files))
    else:
        try:
            x_file = xml_files[0]
        except IndexError as ex:
            raise exceptions.XMLError("Não existe XML no pacote SPS: %s", ex)

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    # Upload each media file and collect its id/url pair for the manifest.
    assets = []
    for m_file in medias_files:
        assets.append({
            "asset_id":
            m_file,
            "asset_url":
            storage.register(os.path.join(folder, m_file), prefix),
        })

    if obj_xml:
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # Duplicate registration is logged, not fatal.
            logger.exception(exc)

    return obj_xml, manifest_data.id()
def validate_article_ALLxml(move_to_processed_source=False,
                            move_to_valid_xml=False):
    """Validate every converted XML, collecting and ranking the errors found.

    Error-free files are copied (or moved, when ``move_to_valid_xml`` is
    true) to the valid-XML directory; their originals may additionally be
    moved to the processed-source directory when
    ``move_to_processed_source`` is true. Per-file error reports are
    written to the errors directory when one is configured. At the end,
    error kinds are logged sorted by descending occurrence count.
    """
    logger.info("Iniciando Validação dos xmls")
    xml_names = files.xml_files_list(config.get("CONVERSION_PATH"))

    success_path = config.get("VALID_XML_PATH")
    errors_path = config.get("XML_ERRORS_PATH")
    # Move or copy valid files, depending on the flag.
    transfer = shutil.move if move_to_valid_xml else shutil.copyfile

    error_summary = {}
    for xml_name in tqdm(xml_names):

        filename, _ = files.extract_filename_ext_by_path(xml_name)
        converted_file = os.path.join(config.get("CONVERSION_PATH"), xml_name)

        try:
            errors = validate_article_xml(converted_file, False)

            # Accumulate per-kind error counts across all files.
            for error_key, error_value in errors.items():
                dicts.merge(error_summary, error_key, error_value)

            if errors_path:
                manage_error_file(
                    errors,
                    os.path.join(errors_path, "%s.err" % filename),
                    converted_file,
                )

            if not errors:
                if success_path:
                    transfer(converted_file,
                             os.path.join(success_path, xml_name))

                if move_to_processed_source:
                    files.move_xml_to(
                        "%s.xml" % filename,
                        config.get("SOURCE_PATH"),
                        config.get("PROCESSED_SOURCE_PATH"),
                    )

        except Exception as ex:
            # Unexpected failures abort the whole validation run.
            logger.exception(ex)
            raise

    ranked = sorted(error_summary.items(),
                    key=lambda item: item[1]["count"], reverse=True)
    for error_key, error_info in ranked:
        logger.error("%s - %s", error_key, error_info["count"])
Esempio n. 9
0
def pack_article_ALLxml():
    """Build SPS packages from the list of valid XML files.

    Args:
        None.

    Returns:
        None. Persists each XML package under ``package_path``.

    Example:
        pack_article_ALLxml()

    Raises:
        Nothing — packaging failures are handled by the exception callback,
        which logs the offending file.
    """

    xml_paths = [
        os.path.join(config.get("VALID_XML_PATH"), name)
        for name in files.xml_files_list(config.get("VALID_XML_PATH"))
    ]

    jobs = [{"file_xml_path": path} for path in xml_paths]

    with tqdm(total=len(xml_paths), initial=0) as progress:

        def advance(progress=progress):
            # Called once per finished job.
            progress.update(1)

        def report_failure(exception, job, logger=logger):
            logger.error(
                "Could not pack file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            pack_article_xml,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            exception_callback=report_failure,
            update_bar=advance,
        )
Esempio n. 10
0
def pack_article_ALLxml():
    """Build SPS packages from the list of valid XML files.

    Args:
        None.

    Returns:
        None. Persists each XML package under ``package_path``.

    Example:
        pack_article_ALLxml()

    Raises:
        Nothing.
    """

    xml_paths = [
        os.path.join(config.get("VALID_XML_PATH"), name)
        for name in files.xml_files_list(config.get("VALID_XML_PATH"))
    ]

    jobs = [{"file_xml_path": path} for path in xml_paths]

    with tqdm(total=len(xml_paths), initial=0) as progress:

        def advance(progress=progress):
            # Called once per finished job.
            progress.update(1)

        DoJobsConcurrently(
            pack_article_xml,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            update_bar=advance,
        )
Esempio n. 11
0
 def test_xml_files_list(self):
     """xml_files_list should count every sample file under SAMPLES_PATH."""
     self.assertEqual(len(files.xml_files_list(SAMPLES_PATH)),
                      COUNT_SAMPLES_FILES)
Esempio n. 12
0
def register_document(folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()) -> None:
    """Register an SPS package in a Kernel instance and its digital
    assets in an object storage.

    NOTE(review): despite ``-> None``, the function returns
    ``get_article_result_dict(xml_sps)`` on success (and ``None`` when
    poisoned).
    NOTE(review): the ``poison_pill=PoisonPill()`` default is evaluated
    once at definition time, so every call shares the same pill —
    presumably intentional, acting as a shared cancellation flag across
    concurrent jobs; confirm.
    """

    # Cooperative cancellation: skip all work once the shared pill is set.
    if poison_pill.poisoned:
        return

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    if xmls is None or len(xmls) == 0:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        ) from None

    # Only the first XML is considered; rewrite it with PID metadata first.
    xml_path = os.path.join(folder, xmls[0])
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)

    try:
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError as exc:
        # ``from None`` suppresses the lxml traceback; ``exc`` is unused.
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)

    pid_v3 = xml_sps.scielo_pid_v3

    # EAFP: DoesNotExist means the document is new and must be imported;
    # otherwise short-circuit with the existing article's result info.
    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        pass
    else:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)
    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    # Additional files are uploaded but not referenced by the manifest.
    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document = Document(
        manifest=manifest.get_document_manifest(
            xml_sps, url_xml, registered_assets, renditions
        )
    )

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        # Duplicate registration is logged and tolerated.
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)