Example #1
def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path,
                len(asset_replacements))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(asset_replacements + renditions, pkg_path,
                                  bad_pkg_path, sps_package.package_name)

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))

    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
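
A minimal driver sketch for this packer, assuming (not shown in the example) that the same `config` module has a SOURCE_PATH entry pointing at the extracted XML files, as the extraction examples further down do:

import os

# Hypothetical loop: pack every XML found under SOURCE_PATH.
source_path = config.get("SOURCE_PATH")
for entry in os.listdir(source_path):
    if entry.endswith(".xml"):
        pack_article_xml(os.path.join(source_path, entry))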
Example #2
def packing_assets(asset_replacements, pkg_path, bad_pkg_path, pkg_name):
    """
    Return the package path (pkg_path or bad_pkg_path).
    """
    errors = []
    if not os.path.isdir(pkg_path):
        files.make_empty_dir(pkg_path)

    for old_path, new_fname in asset_replacements:
        error = download_asset(old_path, new_fname, pkg_path)
        if error:
            errors.append((old_path, new_fname, error))

    if errors:
        # ensure that complete and incomplete packages
        # end up in different folders
        if pkg_path == bad_pkg_path:
            bad_pkg_path += "_INCOMPLETE"
        # move the incomplete package to the incomplete-packages folder
        files.make_empty_dir(bad_pkg_path)
        for item in os.listdir(pkg_path):
            shutil.move(os.path.join(pkg_path, item), bad_pkg_path)
        shutil.rmtree(pkg_path)
        # write the error report
        errors_filename = os.path.join(bad_pkg_path, "%s.err" % pkg_name)
        error_messages = "\n".join(["%s %s %s" % _err for _err in errors])
        files.write_file(errors_filename, error_messages)
        return bad_pkg_path
    return pkg_path
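
packing_assets consumes (old_path, new_fname) pairs; a hedged call sketch, with all paths and names invented for illustration:

# Illustrative only: the asset locations and package paths are made up.
asset_replacements = [
    ("/img/revistas/aa/v33n3/a02fig01.jpg", "1234-5678-aa-33-03-0301-gf01.jpg"),
    ("/img/revistas/aa/v33n3/a02tab01.gif", "1234-5678-aa-33-03-0301-gt01.gif"),
]
final_path = packing_assets(
    asset_replacements, "/tmp/sps_pkgs/a02", "/tmp/sps_pkgs_incomplete/a02", "a02"
)
# final_path is pkg_path when every asset was downloaded, otherwise the
# incomplete-packages folder, which then also contains an "a02.err" report.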
Example #3
def register_documents(session_db, storage, documents_sorter, folder) -> None:
    """Runs the import process for the SPS packages in the given directory.
    The import goes through these phases: registering assets/renditions in the
    given object storage, registering the manifest in the given Kernel
    database, and sorting the documents in a `documents_sorter` for later
    association with their issues."""

    err_filename = os.path.join(config.get("ERRORS_PATH"),
                                "insert_documents.err")

    for path, _, sps_files in os.walk(folder):
        if not sps_files:
            continue

        try:
            xml = list(filter(lambda f: f.endswith(".xml"), sps_files))[0]
            xml_path = os.path.join(path, xml)
            constructor.article_xml_constructor(xml_path, path, False)
            registration_result = register_document(path, session_db, storage)

            if registration_result:
                document_xml, document_id = registration_result
                documents_sorter.insert_document(document_id, document_xml)

        except (IndexError, ValueError, TypeError, exceptions.XMLError) as ex:
            msg = "Falha ao registrar documento %s: %s" % (path, ex)
            logger.error(msg)
            files.write_file(err_filename, msg, "a")
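
A hedged wiring sketch; the session, storage and sorter factories below are assumptions about the surrounding project, not its actual API:

# All three collaborators are placeholders for whatever the project provides.
session_db = get_session()            # assumed Kernel database session factory
storage = get_storage()               # assumed object-storage client factory
documents_sorter = DocumentsSorter()  # assumed sorter implementation
register_documents(session_db, storage, documents_sorter,
                   config.get("SPS_PKG_PATH"))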
Example #4
def article_html_generator(file_xml_path: str, dest_path: str) -> None:

    logger.debug("file: %s", file_xml_path)

    parsed_xml = XML(file_xml_path, no_network=False)
    html_generator = HTMLGenerator.parse(
        parsed_xml,
        valid_only=False,
        css="https://new.scielo.br/static/css/scielo-article.css",
        print_css="https://new.scielo.br/static/css/scielo-bundle-print.css",
        js="https://new.scielo.br/static/js/scielo-article-min.js",
    )

    for lang, trans_result in html_generator:
        fpath, fname = os.path.split(file_xml_path)
        fname, fext = fname.rsplit(".", 1)
        out_fname = ".".join([fname, lang, "html"])

        new_file_html_path = os.path.join(dest_path, out_fname)

        files.write_file(
            new_file_html_path,
            etree.tostring(
                trans_result,
                doctype=u"<!DOCTYPE html>",
                pretty_print=True,
                encoding="utf-8",
                method="html",
            ).decode("utf-8"),
        )
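
The output name inserts the translation language before the .html extension; a small, self-contained illustration of that naming step (the path is invented):

import os

# "S0044-59672003000300002.xml" rendered in Portuguese becomes
# "S0044-59672003000300002.pt.html".
fname, fext = os.path.split("/tmp/S0044-59672003000300002.xml")[1].rsplit(".", 1)
print(".".join([fname, "pt", "html"]))  # S0044-59672003000300002.pt.html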
Example #5
def extract_all_data(list_documents_pids: List[str]):
    """Extrai documentos XML a partir de uma lista de PIDS
    de entrada"""

    pids_to_extract, pids_extracteds, stage_path = files.fetch_stages_info(
        list_documents_pids, __name__)

    logger.info("Iniciando extração dos Documentos")
    count = 0

    try:
        for documents_pid in tqdm(
                iterable=pids_to_extract,
                initial=len(pids_extracteds),
                total=len(list_documents_pids),
        ):
            documents_pid = documents_pid.strip()

            logger.debug("\t coletando dados do Documento '%s'", documents_pid)
            xml_article = article.ext_article_txt(documents_pid)
            if xml_article:
                count += 1

                file_path = os.path.join(config.get("SOURCE_PATH"),
                                         "%s.xml" % documents_pid)
                logger.debug("\t Salvando arquivo '%s'", file_path)
                files.write_file(file_path, xml_article)
                files.register_latest_stage(stage_path, documents_pid)
    except KeyboardInterrupt:
        ...

    logger.info("\t Total de %s artigos", count)
Example #6
def packing_assets(asset_replacements, pkg_path, incomplete_pkg_path, pkg_name,
                   scielo_pid_v2):
    """Tem a responsabilidade de ``empacotar`` os ativos digitais e retorna o
    path do pacote.

    Args:
        asset_replacements: lista com os ativos
        pkg_path: caminho do pacote
        incomplete_pkg_path: caminho para os pacotes incompletos
        pkg_name: nome do pacote
        scielo_pid_v2: PID v2

    Retornos:
        retorna o caminho ``pkg_path`` ou incomplete_pkg_path

    Exceções:
        Não lança exceções.
    """
    errors = []
    if not os.path.isdir(pkg_path):
        files.make_empty_dir(pkg_path)

    for old_path, new_fname in asset_replacements:
        try:
            get_asset(old_path, new_fname, pkg_path)
        except AssetNotFoundError as e:
            logger.error(
                "%s", {
                    "pid": scielo_pid_v2,
                    "pkg_name": pkg_name,
                    "old_path": old_path,
                    "new_fname": new_fname,
                    "msg": str(e),
                })
            errors.append((old_path, new_fname, str(e)))

    if errors:
        # ensure that complete and incomplete packages
        # end up in different folders
        if pkg_path == incomplete_pkg_path:
            incomplete_pkg_path += "_INCOMPLETE"
        # move the incomplete package to the incomplete-packages folder
        files.make_empty_dir(incomplete_pkg_path)
        for item in os.listdir(pkg_path):
            shutil.move(os.path.join(pkg_path, item), incomplete_pkg_path)
        shutil.rmtree(pkg_path)
        # write the error report
        errors_filename = os.path.join(incomplete_pkg_path,
                                       "%s.err" % pkg_name)
        error_messages = "\n".join(["%s %s %s" % _err for _err in errors])
        files.write_file(errors_filename, error_messages)
        return incomplete_pkg_path
    return pkg_path
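
Each failed asset becomes one `old_path new_fname message` line in the `<pkg_name>.err` report; an illustrative rendering of that formatting (values invented):

errors = [(
    "/img/revistas/aa/v33n3/a02fig01.jpg",
    "1234-5678-aa-33-03-0301-gf01.jpg",
    "asset not found",
)]
print("\n".join(["%s %s %s" % _err for _err in errors]))
# /img/revistas/aa/v33n3/a02fig01.jpg 1234-5678-aa-33-03-0301-gf01.jpg asset not found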
Example #7
def extrated_journal_data(obj_journal):

    logger.info("\t coletando dados do periodico '%s'", obj_journal.title)
    list_articles = article.get_all_articles_notXML(obj_journal.scielo_issn)
    for name_article, xml_article in list_articles:

        logger.info("\t Salvando arquivo '%s'", name_article)
        files.write_file(
            os.path.join(config.get("SOURCE_PATH"), "%s.xml" % name_article),
            xml_article,
        )
    logger.info("\t Total de %s artigos", len(list_articles))
Example #8
    def test_write_file(self):
        expected_text = "<a><b>bar</b></a>"
        filename = "foo_test.txt"

        try:
            files.write_file(filename, expected_text)

            with open(filename, "r") as f:
                text = f.read()
        finally:
            os.remove(filename)

        self.assertEqual(expected_text, text)
Example #9
def get_and_write(pid, stage_path, poison_pill):
    if poison_pill.poisoned:
        return

    documents_pid = pid.strip()

    logger.debug("\t coletando dados do Documento '%s'", documents_pid)
    xml_article = article.ext_article_txt(documents_pid)

    if xml_article:
        file_path = os.path.join(config.get("SOURCE_PATH"), "%s.xml" % documents_pid)
        logger.debug("\t Salvando arquivo '%s'", file_path)
        files.write_file(file_path, xml_article)
        files.register_latest_stage(stage_path, documents_pid)
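
This worker is meant to be fanned out over many PIDs, with the shared poison_pill cutting the run short on Ctrl-C; a minimal driver sketch, assuming PoisonPill is just an object with a writable `poisoned` flag and that pids_to_extract and stage_path were obtained as in Example #5:

import concurrent.futures

poison_pill = PoisonPill()  # assumption: exposes a boolean `poisoned` attribute
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    try:
        futures = [
            executor.submit(get_and_write, pid, stage_path, poison_pill)
            for pid in pids_to_extract
        ]
        for future in concurrent.futures.as_completed(futures):
            future.result()
    except KeyboardInterrupt:
        # stop workers that have not started yet
        poison_pill.poisoned = True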
Example #10
def conversion_article_xml(file_xml_path):
    article = files.read_file(file_xml_path)

    obj_xml = etree.fromstring(article)
    obj_html_body = xml.parser_body_xml(obj_xml)

    # overwrite the previously escaped HTML with the newly processed XML
    remove = obj_xml.find("body/p")
    remove.getparent().replace(remove, obj_html_body)

    new_file_xml_path = os.path.join(
        config.get("CONVERSION_PATH"), os.path.split(file_xml_path)[1]
    )
    files.write_file(new_file_xml_path, etree.tostring(obj_xml).decode("utf-8"))
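
The replace call above swaps the single escaped-HTML `<p>` under `<body>` for the converted tree; a self-contained lxml illustration of that step (the markup is invented):

from lxml import etree

obj_xml = etree.fromstring(
    "<article><body><p>&lt;div&gt;escaped html&lt;/div&gt;</p></body></article>"
)
obj_html_body = etree.fromstring("<div><p>converted paragraph</p></div>")

# Locate the escaped placeholder and replace it in place, as above.
remove = obj_xml.find("body/p")
remove.getparent().replace(remove, obj_html_body)
print(etree.tostring(obj_xml).decode("utf-8"))
# <article><body><div><p>converted paragraph</p></div></body></article>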
Example #11
def extract_all_data(list_documents_pids: List[str]):

    logger.info("Iniciando extração dos Documentos")
    count = 0
    for documents_pid in tqdm(list_documents_pids):
        documents_pid = documents_pid.strip()

        logger.debug("\t coletando dados do Documento '%s'", documents_pid)
        xml_article = article.ext_article_txt(documents_pid)
        if xml_article:
            count += 1

            file_path = os.path.join(config.get("SOURCE_PATH"),
                                     "%s.xml" % documents_pid)
            logger.debug("\t Salvando arquivo '%s'", file_path)
            files.write_file(file_path, xml_article)

    logger.info("\t Total de %s artigos", count)
Example #12
def register_documents_in_documents_bundle(
        session_db, documents_sorted_in_bundles) -> None:

    err_filename = os.path.join(config.get("ERRORS_PATH"),
                                "insert_documents_in_bundle.err")

    not_registered = []

    for key, documents_bundle in documents_sorted_in_bundles.items():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(session_db, data)
        except ValueError as error:
            files.write_file(err_filename, key + "\n", "a")
            not_registered.append(key)
        else:
            link_documents_bundles_with_documents(documents_bundle, items,
                                                  session_db)
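
documents_sorted_in_bundles is expected to map each bundle ID to its data and items, the same shape the later examples build; an illustrative entry (the IDs are invented, and the exact format comes from scielo_ids_generator):

documents_sorted_in_bundles = {
    "1234-5678-2020-v10-n2": {
        "data": {
            "is_issue": True,
            "bundle_id": "1234-5678-2020-v10-n2",
            "issn": "1234-5678",
        },
        "items": [{"id": "JHVKpRBtgd4d9d9qMJZpyXk", "order": "00302"}],
    },
}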
Example #13
def register_documents(session_db, storage, documents_sorter) -> None:
    logger.info("Iniciando Envio dos do xmls")
    list_folders = files.list_files(config.get("SPS_PKG_PATH"))

    err_filename = os.path.join(config.get("ERRORS_PATH"),
                                "insert_documents.err")

    for folder in list_folders:
        try:
            document_path = os.path.join(config.get("SPS_PKG_PATH"), folder)
            registration_result = register_document(document_path, session_db,
                                                    storage)
            if registration_result:
                document_xml, document_id = registration_result
                documents_sorter.insert_document(document_id, document_xml)

        except Exception as ex:
            msg = "Falha ao registrar documento %s: %s" % (document_path, ex)
            logger.error(msg)
            files.write_file(err_filename, msg, "a")
Example #14
def manage_error_file(errors, err_file, converted_file):
    if os.path.isfile(err_file):
        try:
            os.unlink(err_file)
        except OSError:
            pass

    if errors:
        msg = []
        for err, data in errors.items():
            msg.append(err)
            msg.extend([
                "{}:{}".format(ln, text)
                for ln, text in zip(data["lineno"], data["message"])
            ])

        files.write_file(
            err_file,
            "%s %s\n%s" %
            (files.read_file(converted_file), "=" * 80, "\n".join(msg)),
        )
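
The errors argument maps an error label to parallel lists of line numbers and messages; an illustrative call (labels, messages and file names are invented):

errors = {
    "style-error": {
        "lineno": [12, 40],
        "message": ["unexpected element 'font'", "missing attribute 'xlink:href'"],
    }
}
manage_error_file(errors, "a02.err", "a02.xml")
# a02.err now holds the converted file's text, an "=" separator line and one
# "lineno:text" entry per reported problem.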
Example #15
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Empacoda um xml e seus ativos digitais.

    Args:
        file_xml_path: Caminho para o XML
        poison_pill: Injeta um PosionPill()

    Retornos:
        Sem retornos.

        Persiste o XML no ``package_path``

    Exemplo:
        packing.pack_article_xml(
                os.path.join("S0044-59672003000300002.xml")
            )

    Exceções:
        Não lança exceções.
    """
    if poison_pill.poisoned:
        return

    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix("article_id_which_id_type_is_other",
                    sps_package.scielo_pid_v2
                    and sps_package.scielo_pid_v2[-5:],
                    silently=True)
    new_issns = ISSNs and ISSNs.get(sps_package.scielo_pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH,
                                       original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
Example #16
def register_documents_in_documents_bundle(session_db, file_documents: str,
                                           file_journals: str) -> None:

    err_filename = os.path.join(config.get("ERRORS_PATH"),
                                "insert_documents_in_bundle.err")

    not_registered = []
    journals = reading.read_json_file(file_journals)
    documents = reading.read_json_file(file_documents)

    data_journal = {}
    for journal in journals:
        o_journal = Journal(journal)
        if o_journal.print_issn:
            data_journal[o_journal.print_issn] = o_journal.scielo_issn
        if o_journal.electronic_issn:
            data_journal[o_journal.electronic_issn] = o_journal.scielo_issn
        if o_journal.scielo_issn:
            data_journal[o_journal.scielo_issn] = o_journal.scielo_issn

    documents_bundles = {}
    for scielo_id, document in documents.items():
        is_issue = bool(document.get("volume") or document.get("number"))

        issn = ""
        for issn_type in ("eissn", "pissn", "issn"):
            issn = document.get(issn_type)
            if issn:
                break

        if is_issue:
            bundle_id = scielo_ids_generator.issue_id(
                data_journal[issn],
                document.get("year"),
                document.get("volume"),
                document.get("number"),
                document.get("supplement"),
            )
        else:
            bundle_id = scielo_ids_generator.aops_bundle_id(data_journal[issn])

        documents_bundles.setdefault(bundle_id, {})
        documents_bundles[bundle_id].setdefault("items", [])

        documents_bundles[bundle_id]["items"].append({
            "id":
            scielo_id,
            "order":
            document.get("order", ""),
        })
        documents_bundles[bundle_id]["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            "issn": data_journal[document.get("issn")],
        }

    for documents_bundle in documents_bundles.values():

        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(session_db,
                                                    data["bundle_id"],
                                                    data["is_issue"],
                                                    data["issn"])
        except ValueError as error:
            files.write_file(err_filename, data["bundle_id"] + "\n", "a")
            not_registered.append(data["bundle_id"])
        else:
            link_documents_bundles_with_documents(documents_bundle, items,
                                                  session_db)
Example #17
def register_documents_in_documents_bundle(
    session_db, file_documents: str, file_journals: str
) -> None:
    journals = reading.read_json_file(file_journals)
    data_journal = {}
    for journal in journals:
        o_journal = Journal(journal)
        for _issn in (o_journal.print_issn, o_journal.electronic_issn,
                      o_journal.scielo_issn):
            if _issn:
                data_journal[_issn] = o_journal.scielo_issn

    def get_issn(document, data_journal=data_journal):
        """Recupera o ISSN ID do Periódico ao qual documento pertence"""
        for issn_type in ("eissn", "pissn", "issn"):
            if document.get(issn_type) is not None:
                issn_value = document[issn_type].strip()
                if data_journal.get(issn_value) is not None:
                    return data_journal[issn_value]

    def get_bundle_info(issn, document):
        """
        Obtém e retorna os dados do `bundle`: ID e se é um fascículo

        Args:
            issn (str): ISSN
            document (dict): Dados do documento

        Returns:
            tuple (bool, str):
                True para é fascículoID do `bundle` de fascículo ou aop
        """
        bundle_id = scielo_ids_generator.any_bundle_id(
            issn,
            document.get("year"),
            document.get("volume"),
            document.get("number"),
            document.get("supplement"),
        )
        aops_bundle_id = scielo_ids_generator.aops_bundle_id(issn)
        is_issue = bundle_id != aops_bundle_id
        return is_issue, bundle_id

    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )

    with open(file_documents) as f:
        documents = f.readlines()

    documents_bundles = {}
    for document in documents:
        document = json.loads(document)
        issn_id = get_issn(document)
        if issn_id is None:
            logger.error("No ISSN in document '%s'", document["pid_v3"])
            files.write_file(err_filename, document["pid_v3"] + "\n", "a")
            continue
        is_issue, bundle_id = get_bundle_info(issn_id, document)
        documents_bundles.setdefault(bundle_id, {})
        documents_bundles[bundle_id].setdefault("items", [])
        documents_bundles[bundle_id]["items"].append(
            {"id": document.pop("pid_v3"), "order": document.get("order", "")}
        )
        documents_bundles[bundle_id]["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            "issn": issn_id,
        }

    for documents_bundle in documents_bundles.values():

        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(
                session_db, data["bundle_id"], data["is_issue"], data["issn"]
            )
        except ValueError as exc:
            logger.error(
                "The bundle '%s' was not updated. During executions "
                "this following exception was raised '%s'.",
                data["bundle_id"],
                exc,
            )
            content = json.dumps({"issue": data["bundle_id"], "items": items})
            files.write_file(err_filename, content + "\n", "a")
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)
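
Here file_documents is read as JSON lines: each line is one document object carrying pid_v3, one of eissn/pissn/issn, and the issue metadata used to build the bundle ID; an invented sample line:

import json

sample_line = (
    '{"pid_v3": "JHVKpRBtgd4d9d9qMJZpyXk", "eissn": "1234-5678", '
    '"year": "2020", "volume": "10", "number": "2", "order": "00302"}'
)
document = json.loads(sample_line)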
Example #18
def save_file(stage_path, file_path, documents_pid, article_content):
    logger.debug("\t Salvando arquivo '%s'", file_path)
    files.write_file(file_path, article_content)
    files.register_latest_stage(stage_path, documents_pid)