Beispiel #1
0
def pack_article_xml(file_xml_path):
    """Assemble the SPS package for the article XML at ``file_xml_path``.

    The asset replacements reported by ``SPS_Package.replace_assets_names()``
    (deduplicated through a set) and the renditions reported by
    ``get_renditions_metadata()`` are handed to ``packing_assets``. A
    ``manifest.json`` with the renditions metadata and the XML itself, named
    after the package, are then written into the path it returns.
    """
    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    # Both base directories come from the configuration; the second one is
    # forwarded to packing_assets as the "bad" package path.
    sps_pkg_root = config.get("SPS_PKG_PATH")
    incomplete_pkg_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    pkg_path = os.path.join(sps_pkg_root, original_filename)
    bad_pkg_path = os.path.join(incomplete_pkg_root, original_filename)

    unique_assets = set(sps_package.replace_assets_names())
    asset_replacements = list(unique_assets)
    logger.info(
        "%s possui %s ativos digitais", file_xml_path, len(asset_replacements)
    )

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        bad_pkg_path,
        sps_package.package_name,
    )

    manifest_path = os.path.join(package_path, "manifest.json")
    files.write_file(manifest_path, json.dumps(renditions_metadata))

    xml_name = "%s.xml" % (sps_package.package_name)
    xml.objXML2file(os.path.join(package_path, xml_name), obj_xml)
Beispiel #2
0
class Test_SPS_Package_No_Metadata(unittest.TestCase):
    """Exercise SPS_Package with a document that has no article metadata."""

    def setUp(self):
        """Create the package under test from an XML holding only xlink nodes."""
        fixture = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
                <inline-graphic xlink:href="a01tab01.gif"/>
                <graphic xlink:href="a01f01.gif"/>
                <ext-link xlink:href="a01tab02.gif"/>
                <ext-link xlink:href="mailto:a01f02.gif"/>
                <inline-supplementary-material xlink:href="a01tab03.gif"/>
                <supplementary-material xlink:href="a01tab04.gif"/>
                <media xlink:href="a01tab04.gif"/>
            </root>
            """
        root = etree.fromstring(fixture)
        self.sps_package = SPS_Package(root, "a01")

    def test_parse_article(self):
        """``parse_article_meta`` is expected to be an empty list."""
        parsed = self.sps_package.parse_article_meta
        self.assertEqual(parsed, [])

    def test_package_name(self):
        """``package_name`` is expected to be "a01"."""
        name = self.sps_package.package_name
        self.assertEqual(name, "a01")

    def test_asset_package_name_f01(self):
        """An asset name starting with the package name gets the ``-g`` infix."""
        renamed = self.sps_package.asset_name("a01f01.jpg")
        self.assertEqual(renamed, "a01-gf01.jpg")

    def test_asset_package_name_any_img(self):
        """An unrelated asset name is prefixed with ``a01-g``."""
        renamed = self.sps_package.asset_name("img.jpg")
        self.assertEqual(renamed, "a01-gimg.jpg")

    def test_journal_meta(self):
        """``journal_meta`` is expected to be an empty list."""
        journal_meta = self.sps_package.journal_meta
        self.assertEqual(journal_meta, [])

    def test_parse_article_meta(self):
        """``parse_article_meta`` is expected to be an empty list."""
        parsed = self.sps_package.parse_article_meta
        self.assertEqual(parsed, [])
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Ensure the XML carries a PID v3 and write it out.

    When the PID database already maps the document's PID v2 to a PID v3, that
    PID v3 is set on the XML. Otherwise a new scielo-id is created in the XML
    and the PID v2 / PID v3 pair is stored in the database.

    The result overwrites ``file_xml_path`` when ``in_place`` is true;
    otherwise it is written under ``dest_path`` with the same base name.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    pid_v2 = xml_sps.scielo_pid_v2

    # Look up whether a PID v3 is already registered for this PID v2.
    if pid_manager.check_pid_v3_by_v2(pid_database_engine, pid_v2):
        # Already registered: reuse the stored PID v3 in the XML.
        xml_sps.scielo_pid_v3 = pid_manager.get_pid_v3_by_v2(
            pid_database_engine, pid_v2
        )
    else:
        # Not registered yet: build the scielo-id in the XML, then record
        # the PID v2 / PID v3 pair in the database.
        xml_sps.create_scielo_id()
        pid_manager.create_pid(
            pid_database_engine, pid_v2, xml_sps.scielo_pid_v3
        )

    new_file_xml_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Beispiel #4
0
def pack_article_xml(file_xml_path):
    """Assemble the SPS package for the article XML at ``file_xml_path``.

    An empty package directory is prepared, the asset replacements reported by
    ``SPS_Package.replace_assets_names()`` (deduplicated through a set) are
    handed to ``packing_assets``, and the XML, named after the package, is
    written into the path it returns.
    """
    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    # Both base directories come from the configuration; the second one is
    # forwarded to packing_assets as the "bad" package path.
    sps_pkg_root = config.get("SPS_PKG_PATH")
    incomplete_pkg_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    pkg_path = os.path.join(sps_pkg_root, original_filename)
    bad_pkg_path = os.path.join(incomplete_pkg_root, original_filename)

    files.make_empty_dir(pkg_path)

    unique_assets = set(sps_package.replace_assets_names())
    asset_replacements = list(unique_assets)
    logger.info(
        "%s possui %s ativos digitais", file_xml_path, len(asset_replacements)
    )

    package_path = packing_assets(
        asset_replacements, pkg_path, bad_pkg_path, sps_package.package_name
    )

    xml_name = "%s.xml" % (sps_package.package_name)
    xml.objXML2file(os.path.join(package_path, xml_name), obj_xml)
Beispiel #5
0
def convert_article_xml(file_xml_path):
    """Convert the XML at ``file_xml_path`` and save it under CONVERSION_PATH.

    The root element is tagged as SPS 1.9 / DTD 1.1, the SPS_Package
    transformations are applied in sequence, and the result is written as
    ``<name>.<languages>.<ext>`` where ``languages`` joins the package
    languages with "-".
    """
    obj_xmltree = xml.loadToXML(file_xml_path)
    root = obj_xmltree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"),
                             ("dtd-version", "1.1")):
        root.set(attribute, value)

    xml_sps = SPS_Package(obj_xmltree)
    xml_sps.transform_body()                # convert the AM body to SPS
    xml_sps.transform_pubdate()             # convert pub-date to SPS 1.9
    xml_sps.create_scielo_id()              # build the scielo-id in the XML
    xml_sps.transform_article_meta_count()  # remove the <counts> tag

    languages = "-".join(xml_sps.languages)
    # The file name must contain a "." so it can be split into name/extension.
    fname, fext = os.path.basename(file_xml_path).rsplit(".", 1)

    target_name = "%s.%s.%s" % (fname, languages, fext)
    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     target_name)

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Beispiel #6
0
 def setUp(self):
     """Build the package from an article with ``pub-type`` pub-dates."""
     document = """<article><article-meta>
         <pub-date pub-type="epub">
             <year>2010</year><month>9</month><day>10</day></pub-date>
         <pub-date pub-type="epub-ppub">
             <year>2011</year></pub-date>
     </article-meta></article>"""
     self.sps_package = SPS_Package(etree.fromstring(document), None)
Beispiel #7
0
 def setUp(self):
     """Keep the raw XML on ``self.xml`` and build the package from it."""
     self.xml = """<article><article-meta>
         <pub-date date-type="pub">
             <year>2010</year><month>5</month><day>13</day></pub-date>
         <pub-date date-type="collection">
             <year>2012</year><month>2</month><day>3</day></pub-date>
     </article-meta></article>"""
     parsed = etree.fromstring(self.xml)
     self.sps_package = SPS_Package(parsed, None)
Beispiel #8
0
 def setUp(self):
     """Build the "a01" package from an XML holding only xlink nodes."""
     fixture = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
             <inline-graphic xlink:href="a01tab01.gif"/>
             <graphic xlink:href="a01f01.gif"/>
             <ext-link xlink:href="a01tab02.gif"/>
             <ext-link xlink:href="mailto:a01f02.gif"/>
             <inline-supplementary-material xlink:href="a01tab03.gif"/>
             <supplementary-material xlink:href="a01tab04.gif"/>
             <media xlink:href="a01tab04.gif"/>
         </root>
         """
     root = etree.fromstring(fixture)
     self.sps_package = SPS_Package(root, "a01")
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Create the scielo-id in the XML and save it under ``dest_path``.

    The output keeps the base name of ``file_xml_path``.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))

    # Build the scielo-id in the converted XML.
    xml_sps.create_scielo_id()

    output_name = os.path.basename(file_xml_path)
    xml.objXML2file(
        os.path.join(dest_path, output_name), xml_sps.xmltree, pretty=True
    )
Beispiel #10
0
 def setUp(self):
     """Build the package from an article whose meta holds ``<counts>``."""
     document = """<article xmlns:xlink="http://www.w3.org/1999/xlink"><article-meta>
         <counts>
             <fig-count count="0"/>
             <table-count count="0"/>
             <equation-count count="0"/>
         </counts>
         <body>
             <fig id="i01"><graphic xlink:href="/img/fbpe/rm/v30n1/0002i01.gif"/></fig>
             <table-wrap id="tab01"><label>Tabela 1</label><table><tr><td>TEXTO</td></tr></table></table-wrap>
         </body>
     </article-meta></article>"""
     self.sps_package = SPS_Package(etree.fromstring(document), None)
class Test_MatchPubDate4(unittest.TestCase):
    """Pub-date lookups on an article with pub, epub-ppub and collection dates."""

    def setUp(self):
        """Build the package from the three-pub-date fixture."""
        document = """<article><article-meta>
            <pub-date date-type="pub">
                <year>2010</year><month>9</month><day>1</day></pub-date>
            <pub-date date-type="epub-ppub">
                <year>2011</year></pub-date>
            <pub-date date-type="collection">
                <year>2012</year><month>2</month></pub-date>
        </article-meta></article>"""
        self.sps_package = SPS_Package(etree.fromstring(document), None)

    def test__match_pubdate(self):
        """Given both xpaths, the node whose year is 2010 is returned."""
        xpaths = (
            'pub-date[@date-type="pub"]',
            'pub-date[@date-type="collection"]',
        )
        matched = self.sps_package._match_pubdate(xpaths)
        self.assertEqual(matched.findtext("year"), "2010")

    def test_document_pubdate(self):
        """``document_pubdate`` is the zero-padded ("2010", "09", "01")."""
        pubdate = self.sps_package.document_pubdate
        self.assertEqual(pubdate, ("2010", "09", "01"))

    def test_documents_bundle_pubdate(self):
        """``documents_bundle_pubdate`` has an empty string for the day."""
        pubdate = self.sps_package.documents_bundle_pubdate
        self.assertEqual(pubdate, ("2012", "02", ""))
Beispiel #12
0
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list) -> dict:
    """Create a Kernel-format manifest from an XML document.

    The manifest id is the document's scielo-id and it carries a single
    version. That version's timestamp, and the timestamp of every asset entry,
    is parsed from the non-empty parts of the document publication date.

    Raises:
        ValueError: If the document has no scielo-id or no publication date.
    """
    obj_sps = SPS_Package(document)
    _id = obj_sps.scielo_id
    date = obj_sps.document_pubdate

    if not _id:
        raise ValueError("Document requires an scielo-id") from None

    if not date:
        raise ValueError("A creation date is required") from None

    # Empty date parts are dropped before joining with "-".
    filled_parts = [part for part in date if part]
    _creation_date = parse_date("-".join(filled_parts))

    # One single-entry history per asset id; a repeated id keeps the last one.
    assets_by_id = {
        asset.get("asset_id"): [[_creation_date, asset.get("asset_url")]]
        for asset in assets
    }

    return {
        "id": _id,
        "versions": [{
            "data": document_url,
            "assets": assets_by_id,
            "timestamp": _creation_date,
        }],
    }
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the XML, assets and renditions of one SPS package folder.

    The single XML file of ``folder`` is sent to ``storage``, followed by the
    static assets and additional files resolved by
    ``get_document_assets_path``. A manifest built from the resulting URLs is
    added to ``session_db.documents`` and a matching entry is added to
    ``session_db.changes``.

    Args:
        folder: Path of the SPS package directory.
        session_db: Session exposing ``documents.add`` and ``changes.add``.
        storage: Object exposing ``register(path, prefix)``.

    Returns:
        The ``(obj_xml, document_id)`` pair, where ``document_id`` comes from
        ``manifest_data.id()``.

    Raises:
        exceptions.XMLError: If the folder holds more than one XML file, or
            none at all.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)

    xml_files = files.xml_files_list(folder)
    # Any file whose name contains ".pdf" or ".html" is treated as a rendition.
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files))

    # Exceptions do not interpolate logger-style arguments, so the messages
    # are formatted explicitly before being raised.
    if len(xml_files) > 1:
        raise exceptions.XMLError("Existe %s xmls no pacote SPS" %
                                  len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    # TODO: some articles may not have the self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder)
    registered_assets = put_static_assets_into_storage(static_assets, prefix,
                                                       storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    # NOTE(review): manifest_data is only bound inside this branch, so the
    # final return relies on obj_xml being truthy — confirm for loadToXML.
    if obj_xml:
        renditions = get_document_renditions(folder, _renditions, prefix,
                                             storage)
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(
                obj_xml, url_xml, registered_assets, renditions))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # An already stored document is logged and otherwise ignored.
            logger.exception(exc)

    return obj_xml, manifest_data.id()
Beispiel #14
0
class Test_MatchPubDate4(unittest.TestCase):
    """Pub-date lookups on an article with pub, epub-ppub and collection dates."""

    def setUp(self):
        """Build the package from the three-pub-date fixture."""
        document = """<article><article-meta>
            <pub-date date-type="pub">
                <year>2010</year><month>9</month><day>1</day></pub-date>
            <pub-date date-type="epub-ppub">
                <year>2011</year></pub-date>
            <pub-date date-type="collection">
                <year>2012</year><month>2</month></pub-date>
        </article-meta></article>"""
        self.sps_package = SPS_Package(etree.fromstring(document), None)

    def test__match_pubdate(self):
        """Given both xpaths, the node whose year is 2010 is returned."""
        xpaths = (
            'pub-date[@date-type="pub"]',
            'pub-date[@date-type="collection"]',
        )
        matched = self.sps_package._match_pubdate(xpaths)
        self.assertEqual(matched.findtext("year"), "2010")

    def test_document_pubdate(self):
        """``document_pubdate`` is the zero-padded ("2010", "09", "01")."""
        pubdate = self.sps_package.document_pubdate
        self.assertEqual(pubdate, ("2010", "09", "01"))

    def test_documents_bundle_pubdate(self):
        """``documents_bundle_pubdate`` has an empty string for the day."""
        pubdate = self.sps_package.documents_bundle_pubdate
        self.assertEqual(pubdate, ("2012", "02", ""))

    def test_transform_pubdate(self):
        """After the transform both dates are electronic and keep their parts."""
        self.sps_package.transform_pubdate()
        expectations = {
            'pub-date[@date-type="pub"]': ("2010", "9", "1"),
            'pub-date[@date-type="collection"]': ("2012", "2", None),
        }
        for xpath, result in expectations.items():
            with self.subTest(xpath=xpath, result=result):
                year, month, day = result
                node = self.sps_package.article_meta.find(xpath)
                self.assertIsNotNone(node)
                self.assertEqual(node.get("publication-format"),
                                 "electronic")
                self.assertEqual(node.findtext("year"), year)
                self.assertEqual(node.findtext("month"), month)
                self.assertEqual(node.findtext("day"), day)
Beispiel #15
0
def document_store_by_request(folder: str, storage) -> None:
    """Upload an SPS package folder and PUT its description to the store.

    The single XML file of ``folder`` and every media file (anything that is
    neither an XML nor a rendition) are sent to ``storage``. When the document
    has a scielo-id, the resulting URLs are sent as JSON to
    ``<DOCUMENT_STORE_URL>/documents/<scielo_id>``.

    Args:
        folder: Path of the SPS package directory.
        storage: Object exposing ``register(path, prefix)``.

    Raises:
        exceptions.XMLError: If the folder holds more than one XML file, or
            none at all.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)

    xml_files = files.xml_files_list(folder)
    # Any file whose name contains ".pdf" or ".html" is treated as a rendition.
    _renditions = list(
        filter(lambda file: ".pdf" in file or ".html" in file, list_files))
    medias_files = set(list_files) - set(xml_files) - set(_renditions)

    # Exceptions do not interpolate logger-style arguments, so the messages
    # are formatted explicitly before being raised.
    if len(xml_files) > 1:
        raise exceptions.XMLError("Existe %s xmls no pacote SPS" %
                                  len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = []
    for m_file in medias_files:
        assets.append({
            "asset_id":
            m_file,
            "asset_url":
            storage.register(os.path.join(folder, m_file), prefix),
        })

    # Renditions are not uploaded here; an empty list is always sent.
    renditions = []

    if obj_xml:
        documentstore_data = {
            "data": url_xml,
            "assets": assets,
            "renditions": renditions,
        }
        scielo_id = xml_sps.scielo_id
        # Without a scielo-id nothing is sent to the document store.
        if scielo_id:
            result = request.put(
                request.join(settings.DOCUMENT_STORE_URL,
                             "/documents/%s" % scielo_id),
                data=json.dumps(documentstore_data),
            )
            logger.info("Retorno Documents-Store: %s", result.status_code)
Beispiel #16
0
class Test_MatchPubDate1_Season(unittest.TestCase):
    """Pub-date lookups when the collection date carries a ``<season>``."""

    def setUp(self):
        """Build the package from a fixture with pub and collection dates."""
        document = """<article><article-meta>
            <pub-date date-type="pub">
                <year>2010</year><month>5</month><day>13</day></pub-date>
            <pub-date date-type="collection">
                <year>2012</year><season>Jan-Feb</season></pub-date>
        </article-meta></article>"""
        self.sps_package = SPS_Package(etree.fromstring(document), None)

    def test__match_pubdate(self):
        """Given both xpaths, the node whose year is 2010 is returned."""
        xpaths = (
            'pub-date[@date-type="pub"]',
            'pub-date[@date-type="collection"]',
        )
        matched = self.sps_package._match_pubdate(xpaths)
        self.assertEqual(matched.findtext("year"), "2010")

    def test_document_pubdate(self):
        """``document_pubdate`` is the zero-padded ("2010", "05", "13")."""
        pubdate = self.sps_package.document_pubdate
        self.assertEqual(pubdate, ("2010", "05", "13"))

    def test_documents_bundle_pubdate(self):
        """``documents_bundle_pubdate`` has empty month and day."""
        pubdate = self.sps_package.documents_bundle_pubdate
        self.assertEqual(pubdate, ("2012", "", ""))

    def test_transform_pubdate(self):
        """After the transform both dates are electronic and keep their parts."""
        self.sps_package.transform_pubdate()
        article_meta = self.sps_package.article_meta

        published = article_meta.find('pub-date[@date-type="pub"]')
        self.assertIsNotNone(published)
        self.assertEqual(published.get("publication-format"), "electronic")
        self.assertEqual(published.findtext("year"), "2010")
        self.assertEqual(published.findtext("month"), "5")
        self.assertEqual(published.findtext("day"), "13")

        collection = self.sps_package.article_meta.find(
            'pub-date[@date-type="collection"]')
        self.assertIsNotNone(collection)
        self.assertEqual(collection.get("publication-format"), "electronic")
        self.assertEqual(collection.findtext("year"), "2012")
        self.assertEqual(collection.findtext("season"), "Jan-Feb")
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the XML and media files of one SPS package folder.

    The single XML file of ``folder`` and every other file in it are sent to
    ``storage``. A manifest built from the resulting URLs is added to
    ``session_db.documents`` and a matching entry is added to
    ``session_db.changes``.

    Args:
        folder: Path of the SPS package directory.
        session_db: Session exposing ``documents.add`` and ``changes.add``.
        storage: Object exposing ``register(path, prefix)``.

    Returns:
        The ``(obj_xml, document_id)`` pair, where ``document_id`` comes from
        ``manifest_data.id()``.

    Raises:
        exceptions.XMLError: If the folder holds more than one XML file, or
            none at all.
    """
    logger.info("Processando a Pasta %s", folder)
    list_files = files.list_files(folder)

    xml_files = files.xml_files_list(folder)
    # Everything that is not an XML file is registered as a media asset.
    medias_files = set(list_files) - set(xml_files)

    # Exceptions do not interpolate logger-style arguments, so the messages
    # are formatted explicitly before being raised.
    if len(xml_files) > 1:
        raise exceptions.XMLError("Existe %s xmls no pacote SPS" %
                                  len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)

    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = []
    for m_file in medias_files:
        assets.append({
            "asset_id":
            m_file,
            "asset_url":
            storage.register(os.path.join(folder, m_file), prefix),
        })

    # NOTE(review): manifest_data is only bound inside this branch, so the
    # final return relies on obj_xml being truthy — confirm for loadToXML.
    if obj_xml:
        manifest_data = ManifestDomainAdapter(
            manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))

        try:
            session_db.documents.add(data=manifest_data)
            session_db.changes.add({
                "timestamp": utcnow(),
                "entity": "Document",
                "id": manifest_data.id()
            })
            logger.info("Document-store save: %s", manifest_data.id())
        except AlreadyExists as exc:
            # An already stored document is logged and otherwise ignored.
            logger.exception(exc)

    return obj_xml, manifest_data.id()
Beispiel #18
0
def convert_article_xml(file_xml_path: str,
                        spy=False,
                        poison_pill=PoisonPill()):
    """Convert the XML at ``file_xml_path`` and save it under CONVERSION_PATH.

    Nothing is done when ``poison_pill.poisoned`` is set. Otherwise the root
    element is tagged as SPS 1.9 / DTD 1.1, the SPS_Package transformations
    are applied, publication dates are completed from the article JSON found
    under SOURCE_PATH, and the result is written as
    ``<name>.<languages>.<ext>``.
    """
    if poison_pill.poisoned:
        return
    logger.info(os.path.basename(file_xml_path))

    obj_xmltree = xml.loadToXML(file_xml_path)
    root = obj_xmltree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"),
                             ("dtd-version", "1.1")):
        root.set(attribute, value)

    xml_sps = SPS_Package(obj_xmltree)
    xml_sps.transform_body(spy)      # convert the AM body to SPS
    xml_sps.transform_content()      # turn the XML into SPS 1.9

    # Fill in dates that exist in the article database but not in the XML.
    source_dir = Path(config.get("SOURCE_PATH"))
    json_file_path = source_dir.joinpath(
        Path(xml_sps.scielo_pid_v2 + ".json"))
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    xml_sps.complete_pub_date(document_pubdate, issue_pubdate)

    # Remove the <counts> tag from the XML.
    xml_sps.transform_article_meta_count()

    languages = "-".join(xml_sps.languages)
    # The file name must contain a "." so it can be split into name/extension.
    fname, fext = os.path.basename(file_xml_path).rsplit(".", 1)

    target_name = "%s.%s.%s" % (fname, languages, fext)
    new_file_xml_path = os.path.join(config.get("CONVERSION_PATH"),
                                     target_name)

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Beispiel #19
0
    def update_xml_file(self, xml_target_path, row, pack_name):
        """
        Read the package XML at ``xml_target_path``, update it with the
        article data in ``row`` and write it back to the same path.

        Returns the SPS_Package produced by ``_update_sps_package_obj``.
        """
        tree = xml.loadToXML(xml_target_path)

        logger.debug('Updating XML "%s" with CSV info', xml_target_path)
        package = SPS_Package(tree)
        updated = self._update_sps_package_obj(
            package, pack_name, row, xml_target_path
        )

        # Save the XML with the changes applied.
        xml.objXML2file(xml_target_path, updated.xmltree, pretty=True)
        return updated
Beispiel #20
0
def get_document_manifest(document: etree.ElementTree, document_url: str,
                          assets: list, renditions: List[dict]) -> dict:
    """Create a Kernel-format manifest from an XML document.

    The manifest id is the document's scielo-id and it carries a single
    version. That version's timestamp, and the timestamp of every asset and
    rendition entry, is parsed from the non-empty parts of the document
    publication date.

    Raises:
        ValueError: If the document has no scielo-id or no publication date.
    """
    obj_sps = SPS_Package(document)
    _id = obj_sps.scielo_id
    date = obj_sps.document_pubdate

    if not _id:
        raise ValueError("Document requires an scielo-id") from None

    if not date:
        raise ValueError("A creation date is required") from None

    # Empty date parts are dropped before joining with "-".
    filled_parts = [part for part in date if part]
    _creation_date = parse_date("-".join(filled_parts))

    # One single-entry history per asset id; a repeated id keeps the last one.
    assets_by_id = {
        asset.get("asset_id"): [[_creation_date, asset.get("asset_url")]]
        for asset in assets
    }

    _renditions = []
    for rendition in renditions:
        entry = {
            "filename": rendition.get("filename"),
            "data": [{
                "timestamp": _creation_date,
                "url": rendition.get("url"),
                "size_bytes": rendition.get("size_bytes"),
            }],
            "mimetype": rendition.get("mimetype"),
            # Falls back to the first document language when "lang" is absent.
            "lang": rendition.get("lang", obj_sps.languages[0]),
        }
        _renditions.append(entry)

    _version = {
        "data": document_url,
        "assets": assets_by_id,
        "timestamp": _creation_date,
        "renditions": _renditions,
    }
    return {"id": _id, "versions": [_version]}
Beispiel #21
0
def article_xml_constructor(file_xml_path: str, dest_path: str,
                            pid_database_engine, in_place: bool) -> None:
    """Run ``register_pid_v3`` on the XML and write it out.

    The result overwrites ``file_xml_path`` when ``in_place`` is true;
    otherwise it is written under ``dest_path`` with the same base name.
    """
    logger.debug("file: %s", file_xml_path)

    xml_sps = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, xml_sps)

    new_file_xml_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
Beispiel #22
0
class Test_SPS_Package(unittest.TestCase):
    """Exercise the xlink/asset handling of SPS_Package."""

    def setUp(self):
        """Build the "a01" package from an XML holding only xlink nodes."""
        fixture = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
                <inline-graphic xlink:href="a01tab01.gif"/>
                <graphic xlink:href="a01f01.gif"/>
                <ext-link xlink:href="a01tab02.gif"/>
                <ext-link xlink:href="mailto:a01f02.gif"/>
                <inline-supplementary-material xlink:href="a01tab03.gif"/>
                <supplementary-material xlink:href="a01tab04.gif"/>
                <media xlink:href="a01tab04.gif"/>
            </root>
            """
        root = etree.fromstring(fixture)
        self.sps_package = SPS_Package(root, "a01")

    def test_elements_which_has_xlink_href(self):
        """All seven xlink nodes are found, with tags in sorted order."""
        found = list(self.sps_package.elements_which_has_xlink_href)
        self.assertEqual(len(found), 7)

        expected_tags = sorted([
            "inline-graphic",
            "graphic",
            "ext-link",
            "ext-link",
            "inline-supplementary-material",
            "supplementary-material",
            "media",
        ])
        found_tags = [node.tag for node in found]
        self.assertEqual(found_tags, expected_tags)

    def test_replace_assets(self):
        """Six (old name, new name) pairs are returned in the expected order."""
        expected = [
            ("a01tab02.gif", "a01-gtab02"),
            ("a01f01.gif", "a01-gf01"),
            ("a01tab01.gif", "a01-gtab01"),
            ("a01tab03.gif", "a01-gtab03"),
            ("a01tab04.gif", "a01-gtab04"),
            ("a01tab04.gif", "a01-gtab04"),
        ]
        replacements = self.sps_package.replace_assets_names()
        self.assertEqual(len(replacements), 6)
        for i, (wanted, got) in enumerate(zip(expected, replacements)):
            with self.subTest(i):
                self.assertEqual(wanted[0], got[0])
                self.assertEqual(wanted[1], got[1])
Beispiel #23
0
class Test_ArticleMetaCount(unittest.TestCase):
    """Check that ``transform_article_meta_count`` drops ``<counts>``."""

    def setUp(self):
        """Build the package from an article whose meta holds ``<counts>``."""
        document = """<article xmlns:xlink="http://www.w3.org/1999/xlink"><article-meta>
            <counts>
                <fig-count count="0"/>
                <table-count count="0"/>
                <equation-count count="0"/>
            </counts>
            <body>
                <fig id="i01"><graphic xlink:href="/img/fbpe/rm/v30n1/0002i01.gif"/></fig>
                <table-wrap id="tab01"><label>Tabela 1</label><table><tr><td>TEXTO</td></tr></table></table-wrap>
            </body>
        </article-meta></article>"""
        self.sps_package = SPS_Package(etree.fromstring(document), None)

    def test__transform_article_meta_count(self):
        """No ``counts`` descendant remains in the returned node."""
        transformed = self.sps_package.transform_article_meta_count()
        remaining = transformed.find(".//counts")
        self.assertIsNone(remaining)
Beispiel #24
0
def sps_package(article_meta_xml, doi="10.1590/S0074-02761962000200006"):
    """Return an SPS_Package named "a01" parsed from ``build_xml`` output."""
    document = etree.fromstring(build_xml(article_meta_xml, doi))
    return SPS_Package(document, "a01")
Beispiel #25
0
class Test_SPS_Package(unittest.TestCase):
    """Exercise the xlink/asset and renditions handling of SPS_Package."""

    def setUp(self):
        """Build the "a01" package from an XML holding only xlink nodes."""
        article_xml = """<root xmlns:xlink="http://www.w3.org/1999/xlink">
                <inline-graphic xlink:href="a01tab01.gif"/>
                <graphic xlink:href="a01f01.gif"/>
                <ext-link xlink:href="a01tab02.gif"/>
                <ext-link xlink:href="mailto:a01f02.gif"/>
                <inline-supplementary-material xlink:href="a01tab03.gif"/>
                <supplementary-material xlink:href="a01tab04.gif"/>
                <media xlink:href="a01tab04.gif"/>
            </root>
            """
        self.sps_package = SPS_Package(etree.fromstring(article_xml), "a01")

    def test_elements_which_has_xlink_href(self):
        """All seven xlink nodes are found, with tags in sorted order."""
        items = list(self.sps_package.elements_which_has_xlink_href)
        self.assertEqual(len(items), 7)
        self.assertEqual(
            [node.tag for node in items],
            sorted([
                "inline-graphic",
                "graphic",
                "ext-link",
                "ext-link",
                "inline-supplementary-material",
                "supplementary-material",
                "media",
            ]),
        )

    def test_replace_assets(self):
        """Six (old name, new name) pairs are returned in the expected order."""
        expected = [
            ("a01tab02.gif", "a01-gtab02"),
            ("a01f01.gif", "a01-gf01"),
            ("a01tab01.gif", "a01-gtab01"),
            ("a01tab03.gif", "a01-gtab03"),
            ("a01tab04.gif", "a01-gtab04"),
            ("a01tab04.gif", "a01-gtab04"),
        ]
        items = self.sps_package.replace_assets_names()
        self.assertEqual(len(items), 6)
        for i, item in enumerate(items):
            with self.subTest(i):
                self.assertEqual(expected[i][0], item[0])
                self.assertEqual(expected[i][1], item[1])

    @mock.patch("documentstore_migracao.export.sps_package.article.get_article"
                )
    def test_get_renditions_metadata_no_renditions(self, mk_get_article):
        """With no fulltexts, both returned values are empty."""
        mk_article = mock.Mock()
        mk_article.fulltexts.return_value = {}
        mk_get_article.return_value = mk_article
        renditions, renditions_metadata = self.sps_package.get_renditions_metadata(
        )
        self.assertEqual(renditions, [])
        self.assertEqual(renditions_metadata, {})

    @mock.patch("documentstore_migracao.export.sps_package.article.get_article"
                )
    def test_get_renditions_metadata(self, mk_get_article):
        """With PDF and HTML fulltexts, the PDF links are reported."""
        fulltexts = {
            "pdf": {
                "en": "http://www.scielo.br/pdf/aa/v1n1/a01.pdf",
                "pt": "http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf",
            },
            "html": {
                "en":
                "http://www.scielo.br/scielo.php?script=sci_arttext&tlng=en",
                "pt":
                "http://www.scielo.br/scielo.php?script=sci_arttext&tlng=pt",
            },
        }
        mk_article = mock.Mock()
        mk_article.fulltexts.return_value = fulltexts
        mk_get_article.return_value = mk_article
        renditions, renditions_metadata = self.sps_package.get_renditions_metadata(
        )
        # The assertions cover the whole result, so they run once. The former
        # loop over the "pdf" dict unpacked its two-letter keys by accident.
        self.assertEqual(renditions, [
            ('http://www.scielo.br/pdf/aa/v1n1/a01.pdf', 'a01'),
            ('http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf', 'pt_a01'),
        ])
        self.assertEqual(
            renditions_metadata, {
                'en': 'http://www.scielo.br/pdf/aa/v1n1/a01.pdf',
                'pt': 'http://www.scielo.br/pdf/aa/v1n1/pt_a01.pdf',
            })
Beispiel #26
0
def register_document(folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()) -> None:
    """Register an SPS package in a Kernel instance and its digital assets
    in an object storage.

    Nothing is done when ``poison_pill.poisoned`` is set. When the document's
    PID v3 can already be fetched from ``session``, no upload happens and the
    article result dict is returned straight away.

    Raises:
        exceptions.XMLError: If the package has no XML file or the XML cannot
            be parsed.
    """
    if poison_pill.poisoned:
        return

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    if xmls is None or len(xmls) == 0:
        missing_xml_message = (
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        )
        raise exceptions.XMLError(missing_xml_message) from None

    # Only the first XML of the package is considered.
    xml_path = os.path.join(folder, xmls[0])
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)

    try:
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)
    pid_v3 = xml_sps.scielo_pid_v3

    # A successful fetch means the document is already registered.
    try:
        session.documents.fetch(id=pid_v3)
        already_registered = True
    except DoesNotExist:
        already_registered = False

    if already_registered:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document_manifest = manifest.get_document_manifest(
        xml_sps, url_xml, registered_assets, renditions
    )
    document = Document(manifest=document_manifest)

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as exc:
        # An already stored document is logged and otherwise ignored.
        logger.error(exc)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)
Beispiel #27
0
def update_articles_mixed_citations(
    source: str,
    output_folder: str = None,
    override: bool = False,
    disable_bar: bool = False,
):
    """Update the ``mixed-citation`` elements of one or more XML files.

    Every file returned by ``get_files_in_path(source, extension=".xml")`` is
    loaded as an ``SPS_Package``. Its references are refreshed with the texts
    read from the paragraph cache file
    ``<PARAGRAPH_CACHE_PATH>/<scielo_pid_v2>.json`` and the resulting tree is
    written either over the original file or, when ``output_folder`` is given,
    into that folder under the same base name.

    Files that cannot be parsed, have no PID v2, or have no cache file are
    logged and skipped; processing continues with the next file.

    Args:
        source: Existing path that is searched for ``.xml`` files.
        output_folder: Existing directory that receives the updated files.
            ``None`` (default) overwrites each original file.
        override: Forwarded to ``SPS_Package.update_mixed_citations``. Set it
            to ``True`` to overwrite every mixed citation; otherwise only the
            references without a mixed citation are updated (default).
        disable_bar: Forwarded to ``tqdm`` as ``disable``.

    Raises:
        FileNotFoundError: If ``source`` does not exist, or ``output_folder``
            is given and does not exist.
    """

    # Validate the arguments first so that bad input fails before anything
    # else (including the configuration lookup) happens.
    if not os.path.exists(source):
        raise FileNotFoundError("Source path '%s' does not exist" % source)
    if output_folder is not None and not os.path.exists(output_folder):
        raise FileNotFoundError("Output folder '%s' does not exist" %
                                output_folder)

    CACHE_DIR = config.get("PARAGRAPH_CACHE_PATH")

    def get_references_text_from_paragraphs(paragraphs: list,
                                            pid: str) -> dict:
        """Filter the reference records out of a list of paragraphs.

        References share the paragraph structure of the MST database except
        for the index (v888). A record is treated as a reference when it has
        an index/order (v888) and its article PID key (v880) equals ``pid``.

        Args:
            paragraphs (List[dict]): Paragraph records extracted from the MST
                database.
            pid (str): Document identifier in the ``scielo-v2`` format.

        Returns:
            Dict[str, str]: The filtered references, e.g. ``{"order": "text"}``.
        """
        references = {}

        for paragraph in paragraphs:
            article_pid = get_nested(paragraph, "v880", 0, "_", default=None)
            index = get_nested(paragraph, "v888", 0, "_", default=-1)

            # -1 is the sentinel for "record has no v888 index".
            if index != -1 and article_pid == pid:
                references[index] = XMLUtils.cleanup_mixed_citation_text(
                    get_nested(paragraph, "v704", 0, "_"))

        return references

    def get_output_file_path(original_file, output_folder=None):
        """Return the full path of the output file for ``original_file``."""
        if output_folder is None:
            return original_file

        return os.path.join(output_folder, os.path.basename(original_file))

    def get_paragraphs_from_cache(file) -> list:
        """Return the paragraphs stored in ``file``, one JSON document per line."""
        with open(file, "r") as f:
            return [json.loads(line) for line in f]

    xml_files = get_files_in_path(source, extension=".xml")

    with tqdm(total=len(xml_files), disable=disable_bar) as pbar:
        for xml_file in xml_files:
            try:
                package = SPS_Package(etree.parse(xml_file))
                pid_v2 = package.scielo_pid_v2

                if pid_v2 is None:
                    logger.error(
                        "Could not update file '%s' because its PID is unknown.",
                        xml_file)
                    continue

                paragraph_file = f"{CACHE_DIR}/{pid_v2}.json"
                paragraphs = get_paragraphs_from_cache(paragraph_file)
                references = get_references_text_from_paragraphs(
                    paragraphs, pid=pid_v2)
                updated = package.update_mixed_citations(references,
                                                         override=override)
                output_file = get_output_file_path(xml_file, output_folder)
                XMLUtils.objXML2file(output_file, package.xmltree, pretty=True)

                if len(updated) > 0:
                    logger.debug("Updated %0.3d references from '%s' file.",
                                 len(updated), xml_file)

            except etree.XMLSyntaxError as e:
                logger.error(e)
            except FileNotFoundError as e:
                logger.error(
                    "Could not update file '%s' "
                    "the exception '%s' occurred.", xml_file, e)
            finally:
                # In a ``finally`` so the bar also advances for files skipped
                # through ``continue``; otherwise it never reaches its total.
                pbar.update(1)
Beispiel #28
0
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML file together with its digital assets.

    The XML is loaded into an ``SPS_Package``, its "other" article id and
    (when a replacement is known in ``ISSNs``) its ISSNs are fixed, asset
    names are replaced, and the assets plus renditions are handed to
    ``packing_assets``. A ``manifest.json`` with the renditions metadata and
    the resulting XML are then written into the package path returned by
    ``packing_assets``.

    Args:
        file_xml_path: Path to the XML file.
        poison_pill: ``PoisonPill`` instance; when its ``poisoned`` attribute
            is truthy the function returns immediately without doing any work.

    Returns:
        None. The XML is persisted as ``<package_name>.xml`` inside the
        package path.

    Example:
        packing.pack_article_xml(
                os.path.join("S0044-59672003000300002.xml")
            )

    Raises:
        Nothing is raised explicitly here and nothing is caught either, so
        any error raised by the helpers that are called propagates to the
        caller.
    """
    if poison_pill.poisoned:
        return

    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix("article_id_which_id_type_is_other",
                    sps_package.scielo_pid_v2
                    and sps_package.scielo_pid_v2[-5:],
                    silently=True)

    # The PID may be missing (the call above already allows for that), so
    # guard the slice here too instead of failing with a TypeError on None.
    pid_v2 = sps_package.scielo_pid_v2
    new_issns = ISSNs and pid_v2 and ISSNs.get(pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH,
                                       original_filename)

    # set() drops duplicated replacements before they are packed.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)