Ejemplo n.º 1
0
    def setUp(self):
        """Build the fixtures used by the tests.

        Two packages located under ``SAMPLES_PATH`` are described: for each
        one the package directory, the path of its XML file, the XML loaded
        through ``loadToXML`` and the list of file names expected in the
        package. A new ``Session()`` instance is stored last.
        """
        # First package.
        first_dir = os.path.join(SAMPLES_PATH, "0034-8910-rsp-47-02-0231")
        first_xml = os.path.join(first_dir, "0034-8910-rsp-47-02-0231.xml")
        self.package_path = first_dir
        self.xml_path = first_xml
        self.xml_etree = loadToXML(first_xml)
        self.package_files = [
            "0034-8910-rsp-47-02-0231-en.pdf",
            "0034-8910-rsp-47-02-0231-gf01-en.jpg",
            "0034-8910-rsp-47-02-0231-gf01-en.tif",
            "0034-8910-rsp-47-02-0231-gf01.jpg",
            "0034-8910-rsp-47-02-0231-gf01.tif",
            "0034-8910-rsp-47-02-0231.pdf",
            "0034-8910-rsp-47-02-0231.xml",
        ]

        # Second package.
        second_dir = os.path.join(SAMPLES_PATH, "0034-8910-rsp-47-02-0403")
        second_xml = os.path.join(second_dir, "0034-8910-rsp-47-02-0403.xml")
        self.second_package_path = second_dir
        self.second_xml_path = second_xml
        self.second_xml_etree = loadToXML(second_xml)
        self.second_package_files = [
            "0034-8910-rsp-47-02-0403-gf01.jpg",
            "0034-8910-rsp-47-02-0403-gf01.tif",
            "0034-8910-rsp-47-02-0403.pdf",
            "0034-8910-rsp-47-02-0403.xml",
        ]

        self.session = Session()
Ejemplo n.º 2
0
def convert_article_xml(file_xml_path):
    """Convert the article XML at ``file_xml_path`` and save the result.

    The root element receives ``specific-use="sps-1.9"`` and
    ``dtd-version="1.1"``; the tree is then wrapped in ``SPS_Package`` and
    its body, pub-date, scielo-id and article-meta count transformations are
    applied in that order.

    The output is written into the directory configured as
    ``CONVERSION_PATH`` with the name ``<name>.<languages>.<extension>``,
    where ``<languages>`` is ``xml_sps.languages`` joined with ``-``.
    """
    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"),
                             ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the body from AM to SPS.
    package.transform_body()
    # Convert pub-date to SPS 1.9.
    package.transform_pubdate()
    # Build the scielo-id in the converted XML.
    package.create_scielo_id()
    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    converted_name = "%s.%s.%s" % (stem, joined_languages, extension)
    target_path = os.path.join(config.get("CONVERSION_PATH"), converted_name)

    xml.objXML2file(target_path, package.xmltree, pretty=True)
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Set the PID v3 of an article XML and write the XML back to disk.

    The PID v2 read from the XML is looked up through ``pid_manager``:

    * when a PID v3 is already recorded for it, that value is assigned to
      the package;
    * otherwise a new scielo-id is created in the XML and the (v2, v3) pair
      is stored through ``pid_manager.create_pid``.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the file when ``in_place`` is
            false (the original base name is kept).
        pid_database_engine: handle forwarded to every ``pid_manager`` call.
        in_place: when true, ``file_xml_path`` itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    v2 = package.scielo_pid_v2

    # Check through the PID v2 whether a PID v3 already exists.
    if pid_manager.check_pid_v3_by_v2(pid_database_engine, v2):
        # The PID v3 already exists, so it only has to be added to the XML.
        package.scielo_pid_v3 = pid_manager.get_pid_v3_by_v2(
            pid_database_engine, v2)
    else:
        # Build the scielo-id in the XML, then store the PID v2/v3 pair.
        package.create_scielo_id()
        pid_manager.create_pid(pid_database_engine, v2,
                               package.scielo_pid_v3)

    output_path = (file_xml_path if in_place else os.path.join(
        dest_path, os.path.basename(file_xml_path)))

    xml.objXML2file(output_path, package.xmltree, pretty=True)
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the SPS package stored in ``folder``.

    The single XML file of the package, its static assets, its additional
    files and its renditions are sent to ``storage``; a document manifest is
    then built and added to ``session_db`` together with a change record.

    Args:
        folder: directory that holds the package files.
        session_db: object exposing ``documents.add`` and ``changes.add``.
        storage: object exposing ``register(path, prefix)``.

    Returns:
        A ``(obj_xml, document_id)`` tuple: the loaded XML and the id
        reported by the manifest adapter.

    Raises:
        exceptions.XMLError: when the folder has more than one XML file or
            none at all.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)

    # Any file whose name contains ".pdf" or ".html" is a rendition candidate.
    rendition_files = [
        file for file in list_files if ".pdf" in file or ".html" in file
    ]

    if len(xml_files) > 1:
        # Exceptions do not interpolate logging-style arguments, so the
        # message is formatted before it is raised.
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    # TODO: some articles may not have self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder)
    registered_assets = put_static_assets_into_storage(static_assets, prefix,
                                                       storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    # The manifest is built unconditionally: the return value below needs it.
    renditions = get_document_renditions(folder, rendition_files, prefix,
                                         storage)
    manifest_data = ManifestDomainAdapter(
        manifest=manifest.get_document_manifest(
            obj_xml, url_xml, registered_assets, renditions))

    try:
        session_db.documents.add(data=manifest_data)
        session_db.changes.add({
            "timestamp": utcnow(),
            "entity": "Document",
            "id": manifest_data.id()
        })
        logger.info("Document-store save: %s", manifest_data.id())
    except AlreadyExists as exc:
        # An already registered document is logged, not treated as fatal.
        logger.exception(exc)

    return obj_xml, manifest_data.id()
Ejemplo n.º 5
0
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the SPS package stored in ``folder``.

    The single XML file of the package and every other file listed for the
    folder are sent to ``storage``; a document manifest is then built and
    added to ``session_db`` together with a change record.

    Args:
        folder: directory that holds the package files.
        session_db: object exposing ``documents.add`` and ``changes.add``.
        storage: object exposing ``register(path, prefix)``.

    Returns:
        A ``(obj_xml, document_id)`` tuple: the loaded XML and the id
        reported by the manifest adapter.

    Raises:
        exceptions.XMLError: when the folder has more than one XML file or
            none at all.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)

    # Every listed file that is not an XML file is handled as a media asset.
    medias_files = set(list_files) - set(xml_files)

    if len(xml_files) > 1:
        # Exceptions do not interpolate logging-style arguments, so the
        # message is formatted before it is raised.
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = []
    for m_file in medias_files:
        asset_url = storage.register(os.path.join(folder, m_file), prefix)
        assets.append({"asset_id": m_file, "asset_url": asset_url})

    # The manifest is built unconditionally: the return value below needs it.
    manifest_data = ManifestDomainAdapter(
        manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))

    try:
        session_db.documents.add(data=manifest_data)
        session_db.changes.add({
            "timestamp": utcnow(),
            "entity": "Document",
            "id": manifest_data.id()
        })
        logger.info("Document-store save: %s", manifest_data.id())
    except AlreadyExists as exc:
        # An already registered document is logged, not treated as fatal.
        logger.exception(exc)

    return obj_xml, manifest_data.id()
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Create the scielo-id of an article XML and save a copy of the XML.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the resulting file, which keeps
            the base name of ``file_xml_path``.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))

    # Build the scielo-id in the XML.
    package.create_scielo_id()

    file_name = os.path.basename(file_xml_path)
    output_path = os.path.join(dest_path, file_name)
    xml.objXML2file(output_path, package.xmltree, pretty=True)
Ejemplo n.º 7
0
    def update_xml_file(self, xml_target_path, row, pack_name):
        """
        Read the XML of the given package, update it with the article data
        from the articles_data_reader file and write it back to
        ``xml_target_path``.

        Returns the updated SPS package object.
        """
        tree = xml.loadToXML(xml_target_path)

        logger.debug('Updating XML "%s" with CSV info', xml_target_path)
        original_package = SPS_Package(tree)
        updated_package = self._update_sps_package_obj(
            original_package, pack_name, row, xml_target_path)

        # Save the XML with the changes applied.
        xml.objXML2file(xml_target_path, updated_package.xmltree, pretty=True)
        return updated_package
Ejemplo n.º 8
0
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Run ``register_pid_v3`` for an article XML and write the XML to disk.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the file when ``in_place`` is
            false (the original base name is kept).
        pid_database_engine: handle forwarded to ``register_pid_v3``.
        in_place: when true, ``file_xml_path`` itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, package)

    output_path = (file_xml_path if in_place else os.path.join(
        dest_path, os.path.basename(file_xml_path)))

    xml.objXML2file(output_path, package.xmltree, pretty=True)
Ejemplo n.º 9
0
def convert_article_xml(file_xml_path: str, spy=False,
                        poison_pill=PoisonPill()):
    """Convert the article XML at ``file_xml_path`` and save the result.

    Nothing is done when ``poison_pill.poisoned`` is true. Otherwise the
    root element receives ``specific-use="sps-1.9"`` and
    ``dtd-version="1.1"``, the body and content transformations are applied,
    the publication dates are completed from the article JSON found in
    ``SOURCE_PATH`` (file named ``<PID v2>.json``) and the article-meta
    count transformation is run.

    The output is written into the directory configured as
    ``CONVERSION_PATH`` with the name ``<name>.<languages>.<extension>``,
    where ``<languages>`` is ``xml_sps.languages`` joined with ``-``.

    Args:
        file_xml_path: path of the XML file to convert.
        spy: value forwarded to ``transform_body``.
        poison_pill: object whose ``poisoned`` attribute cancels the call.
    """
    if poison_pill.poisoned:
        return
    logger.info(os.path.basename(file_xml_path))

    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"),
                             ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the body from AM to SPS.
    package.transform_body(spy)
    # Turn the XML into SPS 1.9.
    package.transform_content()

    # Complete the dates present in the article database but missing in
    # the XML.
    source_dir = Path(config.get("SOURCE_PATH"))
    json_file_path = source_dir / (package.scielo_pid_v2 + ".json")
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    package.complete_pub_date(document_pubdate, issue_pubdate)

    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    converted_name = "%s.%s.%s" % (stem, joined_languages, extension)
    target_path = os.path.join(config.get("CONVERSION_PATH"), converted_name)

    xml.objXML2file(target_path, package.xmltree, pretty=True)
Ejemplo n.º 10
0
def register_document(folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()):
    """Register an SPS package in a Kernel instance and its digital assets
    in an object storage.

    Args:
        folder: directory that holds the package files.
        session: object exposing ``documents.fetch``; it is also forwarded
            to ``add_document`` and ``add_renditions``.
        storage: object exposing ``register(path, prefix)``.
        pid_database_engine: handle forwarded to
            ``constructor.article_xml_constructor``.
        poison_pill: object whose ``poisoned`` attribute cancels the call.

    Returns:
        ``None`` when the poison pill is set; otherwise the value of
        ``get_article_result_dict`` for the package, including when the
        document was already registered.

    Raises:
        exceptions.XMLError: when the folder has no XML file or the XML
            file cannot be parsed.
    """
    if poison_pill.poisoned:
        return None

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    if not xmls:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        )

    xml_path = os.path.join(folder, xmls[0])

    try:
        # NOTE(review): the constructor presumably parses the same file
        # before this function does, so it is kept inside the guarded
        # section to report a malformed XML as XMLError too - confirm
        # against constructor.article_xml_constructor.
        constructor.article_xml_constructor(
            xml_path, folder, pid_database_engine, False
        )
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)

    pid_v3 = xml_sps.scielo_pid_v3

    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        # Not registered yet: carry on with the import.
        pass
    else:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)
    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document = Document(
        manifest=manifest.get_document_manifest(
            xml_sps, url_xml, registered_assets, renditions
        )
    )

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        # An already registered document is logged, not treated as fatal.
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)