Beispiel #1
0
 def test_init_xml_is_ok(self):
     """A minimal well-formed document loads with no error and no prolog."""
     loaded = xml_utils.SuitableXML("<doc/>")
     self.assertEqual(loaded.content, "<doc/>")
     self.assertIsNone(loaded.xml_error)
     # The input carries neither a DOCTYPE nor an XML declaration.
     self.assertEqual(loaded.doctype, '')
     self.assertIsNone(loaded.xml_declaration)
Beispiel #2
0
    def test_write_should_write_corrected_xml_in_dest_file(self):
        """``write`` saves a parseable document that keeps the prolog.

        The input file holds the double-escaped entity ``&amp;lt;``. The
        file produced by ``SuitableXML.write`` is parsed again and must
        report XML version 1.0, UTF-8 encoding and the same DOCTYPE, and its
        root must serialize with ``&lt;`` in place of ``&amp;lt;``.
        """
        doctype = (
            '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
            'Publishing DTD v1.1 20151215//EN" '
            '"https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">'
        )
        text = (
            '<?xml version="1.0" encoding="utf-8"?>'
            + doctype
            + '<article><p>&amp;lt;</p></article>'
        )

        with tempfile.TemporaryDirectory() as xml_dir_path:
            in_xml_path = os.path.join(xml_dir_path, "in_xml_doc.xml")
            # Write with the encoding the document declares instead of the
            # platform default, so the fixture is the same on every system.
            with open(in_xml_path, 'w', encoding="utf-8") as xml_file:
                xml_file.write(text)

            out_xml_path = os.path.join(xml_dir_path, "out_xml_doc.xml")
            suitable_xml = xml_utils.SuitableXML(in_xml_path)
            suitable_xml.write(out_xml_path)

            # Parse the written file back to inspect what was really saved.
            out_xml = xml_utils.etree.parse(out_xml_path)
            self.assertIsNotNone(out_xml.docinfo)
            self.assertEqual(out_xml.docinfo.xml_version, "1.0")
            self.assertEqual(out_xml.docinfo.encoding, "UTF-8")
            self.assertEqual(out_xml.docinfo.doctype, doctype)
            self.assertEqual(xml_utils.etree.tostring(out_xml.getroot()),
                             b'<article>\n  <p>&lt;</p>\n</article>')
            self.assertIsNone(suitable_xml.xml_error)
Beispiel #3
0
    def make_xml(self, scielo_dtd_files, pmc_dtd_files):
        """Build the PMC XML file from the article tree and validate it.

        Steps, in order:

        1. transform ``self.article.tree`` with
           ``scielo_dtd_files.xsl_output`` and save the result to
           ``self.pmc_xml_filepath``;
        2. reload that file, collect file renames and insert math ids,
           rewriting the file if either step reported changes;
        3. copy each renamed asset from ``self.scielo_pkg_files.path`` to
           the directory of the PMC XML file (missing ones are logged);
        4. transform again with ``pmc_dtd_files.xsl_output`` and overwrite
           the file;
        5. run ``PMCXMLValidator`` on the final file.

        Args:
            scielo_dtd_files: provides ``xsl_output`` for the first pass.
            pmc_dtd_files: provides ``xsl_output`` for the second pass and
                configures the validator.
        """
        source_tree = self.article.tree

        # First pass: j1.1/xsl/sgml2xml/xml2pmc.xsl
        first_pass = xml_utils.transform(
            source_tree, scielo_dtd_files.xsl_output)
        xml_utils.write(self.pmc_xml_filepath, first_pass)

        # Reload the file that was just written.
        reloaded = xml_utils.SuitableXML(self.pmc_xml_filepath)

        renamed_files, names_changed = self._get_filenames(reloaded.xml)
        math_ids = self._insert_math_id(reloaded.xml)

        if numbers_or_names_changed(math_ids, names_changed):
            reloaded.write(self.pmc_xml_filepath)

        dest_dir = os.path.dirname(self.pmc_xml_filepath)
        for src_name, dest_name in renamed_files:
            src_path = os.path.join(self.scielo_pkg_files.path, src_name)
            if not os.path.isfile(src_path):
                logging.info("File not found %s to compose PMC Package %s",
                             src_path, self.pmc_xml_filepath)
                continue
            shutil.copyfile(src_path, os.path.join(dest_dir, dest_name))

        # Second pass: j1.1/xsl/sgml2xml/pmc.xsl
        final_xml = xml_utils.transform(
            reloaded.xml, pmc_dtd_files.xsl_output)
        xml_utils.write(self.pmc_xml_filepath, final_xml)

        # Validate the final file.
        validator = sps_xml_validators.PMCXMLValidator(pmc_dtd_files)
        validator.validate(
            self.pmc_xml_filepath,
            self.outputs.pmc_dtd_report_filename,
            self.outputs.pmc_style_report_filename,
        )
Beispiel #4
0
 def test_write_should_write_original_content_if_input_is_not_xml(self):
     """Non-XML input is written back unchanged and an error is recorded."""
     original = "Qualquer texto nao XML."
     loaded = xml_utils.SuitableXML(original)
     with tempfile.TemporaryDirectory() as tmp_dir:
         dest_path = os.path.join(tmp_dir, "xml_doc.xml")
         loaded.write(dest_path)
         with open(dest_path) as written_file:
             written = written_file.read()
         self.assertEqual(written, original)
     error = loaded.xml_error
     self.assertIsNotNone(error)
     self.assertIn("it must be an XML content or XML file path", error)
Beispiel #5
0
    def __init__(self, src_pkgfiles, acron, dest_path):
        """Store the inputs, load the source XML and reset bookkeeping lists.

        Args:
            src_pkgfiles: source package files; its ``filename`` attribute
                is handed to ``xml_utils.SuitableXML``.
            acron: stored as ``self.acron``.
            dest_path: stored as ``self.dest_path``.
        """
        self.src_pkgfiles = src_pkgfiles
        self.acron = acron
        self.dest_path = dest_path

        # Bookkeeping lists, all starting empty.
        self.missing_href_files = []
        self.href_names = []
        self.href_files_copy = []
        self.href_replacements = []
        self.related_files_copy = []

        self.xml = xml_utils.SuitableXML(self.src_pkgfiles.filename)
        # NOTE(review): ``new_name`` receives the loaded ``xml`` attribute,
        # not a name string -- confirm this is intended.
        self.new_name = self.xml.xml
Beispiel #6
0
 def test_well_formed_xml_content_removes_extra_spaces(self):
     """Line breaks, tabs and space runs in text become single spaces."""
     # The literal mixes real line breaks and indentation with the
     # ``\t`` and ``\n`` escapes on purpose.
     messy = """<doc><p><title>is nunc. Scelerisque in dictum non
     consectetur
     a erat nam. Ipsum dolor sit amet consectetur\t
     adipiscing elit duis tristique sollicitudin. \n
     Eu scelerisque felis imperdiet proin fermen</title></p></doc>"""
     normalized = (
         '<doc><p><title>is nunc. Scelerisque in dictum non '
         'consectetur a erat nam. Ipsum dolor sit amet consectetur '
         'adipiscing elit duis tristique sollicitudin. Eu scelerisque '
         'felis imperdiet proin fermen</title></p></doc>')
     loaded = xml_utils.SuitableXML(messy)
     loaded.well_formed_xml_content()
     self.assertEqual(normalized, loaded.content)
Beispiel #7
0
    def _insert_xhtml_tables_in_document(self):
        """Replace each ``xhtml`` element with the table its file contains.

        For every ``xhtml`` element found in ``self.xml`` whose ``href``
        names an existing file under ``self.src_pkgfiles.path``, the file is
        loaded with ``xml_utils.SuitableXML``. If the loaded document has a
        ``table`` element, a deep copy of the first one replaces the
        ``xhtml`` element in its parent. Elements with no ``href``, with a
        missing file, with a document whose ``xml`` is ``None`` or with no
        ``table`` are left untouched.
        """
        for xhtml in self.xml.findall(".//xhtml"):
            href = xhtml.get("href")
            if not href:
                continue
            table_file_path = os.path.join(self.src_pkgfiles.path, href)
            if not os.path.isfile(table_file_path):
                continue

            table_tree = xml_utils.SuitableXML(table_file_path).xml
            # Compare with None explicitly: truth-testing a parsed lxml
            # element reflects whether it has children, not whether the
            # document was loaded, and is deprecated.
            if table_tree is None:
                continue
            table = table_tree.find(".//table")
            if table is not None:
                # Copy the table so the source tree is not modified.
                parent = xhtml.getparent()
                parent.replace(xhtml, deepcopy(table))
Beispiel #8
0
    def test_write_should_write_original_content_if_input_is_invalid_xml(self):
        """Unparseable XML is written back unchanged and an error is kept.

        The attribute value ``<link-invalido>`` contains a raw ``<``, so the
        document cannot be loaded.
        """
        broken = (
            '<?xml version="1.0" encoding="utf-8"?>'
            '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
            'Publishing DTD v1.1 20151215//EN" '
            '"https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">'
            '\n<article>'
            '<p><ext-link ext-link-type="uri" xlink:href="<link-invalido>">bla</ext-link></p>'
            '</article>')
        loaded = xml_utils.SuitableXML(broken)
        with tempfile.TemporaryDirectory() as tmp_dir:
            dest_path = os.path.join(tmp_dir, "xml_doc.xml")
            loaded.write(dest_path)

            with open(dest_path) as written_file:
                written = written_file.read()
            self.assertEqual(written, broken)
        error = loaded.xml_error
        self.assertIsNotNone(error)
        self.assertIn("Loading XML from 'str': ", error)
Beispiel #9
0
 def test_init_xml_with_no_xml_declaration(self):
     """A DOCTYPE without an XML declaration is kept; trailing junk is cut.

     The input ends with the stray text `` lixo`` after the root element;
     the expected content stops at ``<article/>``.
     """
     doctype = (
         '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
         'Publishing DTD v1.1 20151215//EN" '
         '"https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">')
     loaded = xml_utils.SuitableXML(doctype + '\n<article/> lixo')
     self.assertIsNone(loaded.xml_declaration)
     self.assertEqual(loaded.doctype, doctype)
     self.assertEqual(loaded.content, doctype + '\n<article/>')
Beispiel #10
0
 def test_well_formed_xml_content_converts_quot_ent_to_chars(self):
     """``&quot;`` around an attribute value becomes a real double quote."""
     loaded = xml_utils.SuitableXML(
         '<doc><p><a href=&quot;bla&quot;>teste</a></p></doc>')
     loaded.well_formed_xml_content()
     self.assertEqual(
         '<doc><p><a href="bla">teste</a></p></doc>', loaded.content)
Beispiel #11
0
 def test_well_formed_xml_content_converts_entities_to_chars(self):
     """Numeric and named entities are replaced by their characters."""
     loaded = xml_utils.SuitableXML('<doc><p>&#91;&ccedil;&#93;</p></doc>')
     loaded.well_formed_xml_content()
     self.assertEqual('<doc><p>[ç]</p></doc>', loaded.content)
Beispiel #12
0
 def test_well_formed_xml_content_removes_junk_after_last_close_tag(self):
     """Text after the root's closing tag is dropped from the content."""
     loaded = xml_utils.SuitableXML('<doc><p></p></doc> lixo')
     loaded.well_formed_xml_content()
     # The empty ``<p></p>`` is expected in its self-closed form.
     self.assertEqual('<doc><p/></doc>', loaded.content)
Beispiel #13
0
 def test_content_returns_characteres_instead_their_entities(self):
     """``content`` exposes characters, not entities, right after loading.

     The text `` lixo`` sits inside the root element here, so it is
     expected to remain in the content.
     """
     loaded = xml_utils.SuitableXML('<doc><p>&#91;&ccedil;&#93;</p> lixo</doc>')
     self.assertEqual('<doc><p>[ç]</p> lixo</doc>', loaded.content)
Beispiel #14
0
 def test_init_xml_with_no_doctype(self):
     """The XML declaration is captured and a missing DOCTYPE yields ''."""
     declaration = '<?xml version="1.0" encoding="utf-8"?>'
     loaded = xml_utils.SuitableXML(declaration + '<doc/>')
     self.assertEqual(loaded.xml_declaration, declaration)
     self.assertEqual(loaded.doctype, '')
Beispiel #15
0
 def test_init_xml_with_junk_is_loaded_without_errors(self):
     """Junk after the root element is discarded without a load error."""
     loaded = xml_utils.SuitableXML("<doc/> lixo")
     self.assertEqual(loaded.content, "<doc/>")
     self.assertIsNone(loaded.xml_error)