Ejemplo n.º 1
0
def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path,
                len(asset_replacements))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(asset_replacements + renditions, pkg_path,
                                  bad_pkg_path, sps_package.package_name)

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))

    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
Ejemplo n.º 2
0
def convert_article_xml(file_xml_path):

    obj_xmltree = xml.loadToXML(file_xml_path)
    obj_xml = obj_xmltree.getroot()

    obj_xml.set("specific-use", "sps-1.9")
    obj_xml.set("dtd-version", "1.1")

    xml_sps = SPS_Package(obj_xmltree)
    # CONVERTE O BODY DO AM PARA SPS
    xml_sps.transform_body()
    # CONVERTE PUB-DATE PARA SPS 1.9
    xml_sps.transform_pubdate()

    # CONSTROI O SCIELO-id NO XML CONVERTIDO
    xml_sps.create_scielo_id()

    # Remove a TAG <counts> do XML
    xml_sps.transform_article_meta_count()

    languages = "-".join(xml_sps.languages)
    _, fname = os.path.split(file_xml_path)
    fname, fext = fname.rsplit(".", 1)

    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     "%s.%s.%s" % (fname, languages, fext))

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Ejemplo n.º 3
0
    def update_xml_with_alternatives(self, assets_alternatives, sps_package,
                                     xml_target_path):
        def add_alternative_to_alternatives_tag(image_element, image_filename):
            image_parent = image_element.getparent()
            new_alternative = etree.Element(image_element.tag)
            new_alternative.set("{http://www.w3.org/1999/xlink}href",
                                image_filename)
            if image_parent.tag == "alternatives":
                image_parent.append(new_alternative)
            else:
                alternative_node = etree.Element("alternatives")
                alternative_node.tail = image_element.tail
                image_element.tail = None
                alternative_node.append(image_element)
                alternative_node.append(new_alternative)
                image_parent.append(alternative_node)

        _xmltree = deepcopy(sps_package.xmltree)
        for asset_filename, alternatives in assets_alternatives.items():
            for new_name in alternatives:
                logger.debug('New alternative name for asset "%s": "%s"',
                             asset_filename, new_name)
                asset_elems = _xmltree.findall(
                    f'.//*[@xlink:href="{asset_filename}"]',
                    namespaces={"xlink": "http://www.w3.org/1999/xlink"},
                )
                for elem in asset_elems:
                    add_alternative_to_alternatives_tag(elem, new_name)

        # Salva XML com alterações
        xml.objXML2file(xml_target_path, _xmltree, pretty=True)
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:

    logger.debug("file: %s", file_xml_path)

    parsed_xml = xml.loadToXML(file_xml_path)
    xml_sps = SPS_Package(parsed_xml)

    pid_v2 = xml_sps.scielo_pid_v2

    # VERIFICA A EXISTÊNCIA DO PID V3 NO XC ATRAVES DO PID V2
    if not pid_manager.check_pid_v3_by_v2(pid_database_engine, pid_v2):

        # CONSTROI O SCIELO-id NO XML CONVERTIDO
        xml_sps.create_scielo_id()

        # CRIA O PID V2 E V3 NA BASE DE DADOS DO XC
        pid_manager.create_pid(pid_database_engine, pid_v2,
                               xml_sps.scielo_pid_v3)

    else:

        # SE CASO EXISTA O PID NO VERSÃO 3 NA BASE DO XC É PRECISO ADICIONAR NO XML
        pid_v3 = pid_manager.get_pid_v3_by_v2(pid_database_engine, pid_v2)

        xml_sps.scielo_pid_v3 = pid_v3

    if in_place:
        new_file_xml_path = file_xml_path
    else:
        new_file_xml_path = os.path.join(dest_path,
                                         os.path.basename(file_xml_path))

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Ejemplo n.º 5
0
def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    files.make_empty_dir(pkg_path)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path,
                len(asset_replacements))

    package_path = packing_assets(asset_replacements, pkg_path, bad_pkg_path,
                                  sps_package.package_name)

    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
Ejemplo n.º 6
0
 def test_objXML2file(self):
     xml_obj = etree.fromstring("""<root>
             <p>TEXTO é ç á à è</p>
         </root>""")
     test_dir = tempfile.mkdtemp()
     file_name = os.path.join(test_dir, "test.xml")
     xml.objXML2file(file_name, xml_obj)
     with open(file_name) as f:
         text = f.read()
         self.assertIn("<?xml version='1.0' encoding='utf-8'?>", text)
         self.assertIn("é ç á à è", text)
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:

    logger.debug("file: %s", file_xml_path)

    parsed_xml = xml.loadToXML(file_xml_path)
    xml_sps = SPS_Package(parsed_xml)

    # CONSTROI O SCIELO-id NO XML CONVERTIDO
    xml_sps.create_scielo_id()

    new_file_xml_path = os.path.join(dest_path,
                                     os.path.basename(file_xml_path))
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Ejemplo n.º 8
0
    def update_xml_file(self, xml_target_path, row, pack_name):
        """
        Lê e atualiza o XML do pacote informado com os dados de artigos do arquivo
        articles_data_reader.
        """
        obj_xmltree = xml.loadToXML(xml_target_path)

        logger.debug('Updating XML "%s" with CSV info', xml_target_path)
        sps_package = self._update_sps_package_obj(SPS_Package(obj_xmltree),
                                                   pack_name, row,
                                                   xml_target_path)

        # Salva XML com alterações
        xml.objXML2file(xml_target_path, sps_package.xmltree, pretty=True)
        return sps_package
Ejemplo n.º 9
0
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:

    logger.debug("file: %s", file_xml_path)

    parsed_xml = xml.loadToXML(file_xml_path)
    xml_sps = SPS_Package(parsed_xml)

    register_pid_v3(pid_database_engine, xml_sps)

    if in_place:
        new_file_xml_path = file_xml_path
    else:
        new_file_xml_path = os.path.join(dest_path,
                                         os.path.basename(file_xml_path))

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Ejemplo n.º 10
0
def convert_article_xml(file_xml_path: str,
                        spy=False,
                        poison_pill=PoisonPill()):

    if poison_pill.poisoned:
        return
    logger.info(os.path.basename(file_xml_path))

    obj_xmltree = xml.loadToXML(file_xml_path)
    obj_xml = obj_xmltree.getroot()

    obj_xml.set("specific-use", "sps-1.9")
    obj_xml.set("dtd-version", "1.1")

    xml_sps = SPS_Package(obj_xmltree)
    # CONVERTE O BODY DO AM PARA SPS
    xml_sps.transform_body(spy)
    # Transforma XML em SPS 1.9
    xml_sps.transform_content()
    # Completa datas presentes na base artigo e ausente no XML
    json_file_path = Path(config.get("SOURCE_PATH")).joinpath(
        Path(xml_sps.scielo_pid_v2 + ".json"))
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    xml_sps.complete_pub_date(document_pubdate, issue_pubdate)

    # Remove a TAG <counts> do XML
    xml_sps.transform_article_meta_count()

    languages = "-".join(xml_sps.languages)
    _, fname = os.path.split(file_xml_path)
    fname, fext = fname.rsplit(".", 1)

    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     "%s.%s.%s" % (fname, languages, fext))

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Ejemplo n.º 11
0
def update_articles_mixed_citations(
    source: str,
    output_folder: str = None,
    override: bool = False,
    disable_bar: bool = False,
):
    """Atualiza os elementos de ``mixed-citations`` em um ou mais XMLs.

    O resultado da atualização pode ser salvo no próprio arquivo XML ou em
    outro arquivo XML em um diretório diferente utilizando o parâmetro
    ``output_folder``.
    
    Marque o `override` como `True` para sobrescrever todas as mixed citations
    das referências, caso contrário, apenas as referências sem mixed citations
    serão atualizadas (padrão)."""

    CACHE_DIR = config.get("PARAGRAPH_CACHE_PATH")

    if not os.path.exists(source):
        raise FileNotFoundError("Source path '%s' does not exist" % source)
    elif output_folder is not None and not os.path.exists(output_folder):
        raise FileNotFoundError("Output folder '%s' does not exist" %
                                output_folder)

    def get_references_text_from_paragraphs(paragraphs: list,
                                            pid: str) -> dict:
        """Filtra as referências a partir dos paragráfos.

        As referências possuem a mesma estrutura dos parágrafos na base MST
        exceto pelo índice (v888). Considera-se uma referência os registros que
        possuem o índice/order (v888) e a chave de `PID` para o artigo (v880).

        Params:
            paragraphs (List[dict]): Lista de parágrafos extraídos da base MST
            pid (str): Identificador do documento no formato `scielo-v2`

        Returns:
            references (Dict[str, str]): Dicionário com referências filtradas,
            e.g: {"order": "text"}
        """
        references = {}

        for paragraph in paragraphs:
            article_pid = get_nested(paragraph, "v880", 0, "_", default=None)
            index = get_nested(paragraph, "v888", 0, "_", default=-1)

            if index != -1 and article_pid == pid:
                references[index] = XMLUtils.cleanup_mixed_citation_text(
                    get_nested(paragraph, "v704", 0, "_"))

        return references

    def get_output_file_path(original_file, output_folder=None):
        """Retorna o path completo para um arquivo de saída"""
        if output_folder is None:
            return original_file

        return os.path.join(output_folder, os.path.basename(original_file))

    def get_paragraphs_from_cache(file) -> list:
        """Retorna uma lista de paragráfos a partir de um arquivo JSON"""
        paragraphs = []

        with open(file, "r") as f:
            for line in f.readlines():
                paragraphs.append(json.loads(line))

        return paragraphs

    xmls = get_files_in_path(source, extension=".xml")

    with tqdm(total=len(xmls), disable=disable_bar) as pbar:
        for xml in xmls:
            try:
                package = SPS_Package(etree.parse(xml))

                if package.scielo_pid_v2 is None:
                    logger.error(
                        "Could not update file '%s' because its PID is unknown.",
                        xml)
                    continue

                paragraph_file = f"{CACHE_DIR}/{package.scielo_pid_v2}.json"
                paragraphs = get_paragraphs_from_cache(paragraph_file)
                references = get_references_text_from_paragraphs(
                    paragraphs, pid=package.scielo_pid_v2)
                updated = package.update_mixed_citations(references,
                                                         override=override)
                output_file = get_output_file_path(xml, output_folder)
                XMLUtils.objXML2file(output_file, package.xmltree, pretty=True)

                if len(updated) > 0:
                    logger.debug("Updated %0.3d references from '%s' file.",
                                 len(updated), xml)

            except etree.XMLSyntaxError as e:
                logger.error(e)
            except FileNotFoundError as e:
                logger.error(
                    "Could not update file '%s' "
                    "the exception '%s' occurred.", xml, e)
            pbar.update(1)
Ejemplo n.º 12
0
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Empacoda um xml e seus ativos digitais.

    Args:
        file_xml_path: Caminho para o XML
        poison_pill: Injeta um PosionPill()

    Retornos:
        Sem retornos.

        Persiste o XML no ``package_path``

    Exemplo:
        packing.pack_article_xml(
                os.path.join("S0044-59672003000300002.xml")
            )

    Exceções:
        Não lança exceções.
    """
    if poison_pill.poisoned:
        return

    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix("article_id_which_id_type_is_other",
                    sps_package.scielo_pid_v2
                    and sps_package.scielo_pid_v2[-5:],
                    silently=True)
    new_issns = ISSNs and ISSNs.get(sps_package.scielo_pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH,
                                       original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
Ejemplo n.º 13
0
    def optimise_xml_to_web(self, target_path, xml_target_path, pid):
        xml_filename = os.path.basename(xml_target_path)

        def read_file(filename):
            file_source_path = os.path.join(target_path, filename)
            try:
                with open(file_source_path, "rb") as file_obj:
                    file_bytes = file_obj.read()
            except OSError as exc:
                raise packtools.exceptions.SPPackageError(
                    "[%s] -  Error reading file {} during {} optimization: {}".
                    format(pid, filename, xml_filename, str(exc)))
            else:
                logger.debug('File "%s" reading %s bytes', file_source_path,
                             len(file_bytes))
                return file_bytes

        logger.debug("Optimizing XML file %s", xml_filename)
        try:
            xml_web_optimiser = packtools.XMLWebOptimiser(
                xml_filename, os.listdir(target_path), read_file, target_path)
        except (etree.XMLSyntaxError, etree.SerialisationError) as exc:
            logger.error(
                '[%s] - Error creating XMLWebOptimiser for "%s": %s',
                pid,
                xml_target_path,
                str(exc),
            )
        else:
            optimised_xml = xml_web_optimiser.get_xml_file()
            logger.debug("Saving optimised XML file %s", xml_filename)
            xml.objXML2file(xml_target_path,
                            etree.fromstring(optimised_xml),
                            pretty=True)

            # Salva ativos digitais otimizados
            for asset_filename, asset_bytes in xml_web_optimiser.get_optimised_assets(
            ):
                if asset_bytes is None:
                    logger.error(
                        '[%s] - Error saving image file "%s" referenced in "%s": '
                        "no file bytes",
                        pid,
                        asset_filename,
                        xml_filename,
                    )
                else:
                    image_target_path = os.path.join(target_path,
                                                     asset_filename)
                    logger.debug('Saving image file "%s"', image_target_path)
                    files.write_file_binary(image_target_path, asset_bytes)
            for asset_filename, asset_bytes in xml_web_optimiser.get_assets_thumbnails(
            ):
                if asset_bytes is None:
                    logger.error(
                        '[%s] - Error saving image file "%s" referenced in "%s": '
                        "no file bytes",
                        pid,
                        asset_filename,
                        xml_filename,
                    )
                else:
                    image_target_path = os.path.join(target_path,
                                                     asset_filename)
                    logger.debug('Saving image file "%s"', image_target_path)
                    files.write_file_binary(image_target_path, asset_bytes)