def clean_dte_xml_file(input_file_path: str, output_file_path: str) -> Iterable[bytes]:
    """
    Clean the DTE XML stored in a file, validate it and save the result.

    The input file is read as bytes and parsed, the document is cleaned
    (with ``set_missing_xmlns`` and ``remove_doc_personalizado`` enabled)
    and then validated. Only after validation succeeds is the cleaned
    document written to ``output_file_path``.

    :param input_file_path: path of the XML file to read (binary mode)
    :param output_file_path: path of the file to which the cleaned XML
        document is written (opened with mode ``'w+b'``)
    :return: the iterable produced by :func:`difflib.diff_bytes` for the
        unified diff between the input bytes and the bytes read back from
        the output file, both split into lines
    :raises: no exception is caught here; errors from parsing, cleaning,
        validation or file I/O propagate to the caller

    """
    with open(input_file_path, mode='rb') as input_file:
        original_bytes = input_file.read()

    parsed_doc = xml_utils.parse_untrusted_xml(original_bytes)
    # The "was it modified?" flag is not needed here: the caller gets a diff.
    cleaned_doc, _ = cl_sii.dte.parse.clean_dte_xml(
        parsed_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )

    # TODO: add exception with a nice message for the caller.
    cl_sii.dte.parse.validate_dte_xml(cleaned_doc)

    with open(output_file_path, 'w+b') as output_file:
        xml_utils.write_xml_doc(cleaned_doc, output_file)

    # Read the output back so that the diff reflects exactly what is on disk.
    with open(output_file_path, mode='rb') as output_file:
        rewritten_bytes = output_file.read()

    # note: another way to compute the difference in a similar format is
    #   `diff -Naur $input_file_path $output_file_path`
    return difflib.diff_bytes(
        dfunc=difflib.unified_diff,
        a=original_bytes.splitlines(),
        b=rewritten_bytes.splitlines(),
    )
def _set_dte_xml_missing_xmlns(xml_doc: XmlElement) -> Tuple[XmlElement, bool]:
    """
    Set the DTE XML namespace on the root element if it is missing.

    :param xml_doc: XML document whose root element tag is inspected
    :return: a 2-tuple ``(xml_doc, modified)``; when the root tag is already
        the namespaced one, the same document is returned with ``False``;
        when it is the simple (un-namespaced) one, a newly parsed document
        is returned with ``True``
    :raises Exception: if the root element tag is neither the simple nor
        the namespaced 'DTE' tag

    """
    # source: name of the XML element without namespace.
    #   cl_sii/data/ref/factura_electronica/schemas-xml/DTE_v10.xsd#L22 (f57a326)
    #   cl_sii/data/ref/factura_electronica/schemas-xml/EnvioDTE_v10.xsd#L92 (f57a326)
    simple_tag = 'DTE'
    namespace = DTE_XMLNS
    namespaced_tag = '{%s}%s' % (namespace, simple_tag)

    # Tag of 'DTE' should be ...
    assert namespaced_tag == '{http://www.sii.cl/SiiDte}DTE'

    root = xml_doc.getroottree().getroot()
    current_tag = root.tag

    # Nothing to do: the root element is already namespaced.
    if current_tag == namespaced_tag:
        return xml_doc, False

    if current_tag != simple_tag:
        exc_msg = "XML root element tag does not match the expected simple or namespaced name."
        raise Exception(exc_msg, simple_tag, namespaced_tag, current_tag)

    root.set('xmlns', namespace)

    # Serialize and parse again to obtain the document that is returned.
    # NOTE(review): presumably needed because setting the 'xmlns' attribute
    #   does not by itself re-tag the elements of the in-memory tree; confirm.
    buffer = io.BytesIO()
    xml_utils.write_xml_doc(xml_doc, buffer)
    reparsed_doc = xml_utils.parse_untrusted_xml(buffer.getvalue())

    return reparsed_doc, True
def test_clean_dte_xml_ok_3(self) -> None:
    """
    Clean the 'bad XML 3' fixture and check the resulting byte-level diff.

    The fixture has an un-namespaced 'DTE' root element and fails schema
    validation; after cleaning it must validate, both in memory and after
    being serialized and parsed again.
    """
    original_bytes = self.dte_bad_xml_3_xml_bytes
    parsed_doc = xml_utils.parse_untrusted_xml(original_bytes)

    root_tag = parsed_doc.getroottree().getroot().tag
    self.assertEqual(root_tag, 'DTE')

    # Before cleaning, schema validation is expected to fail.
    expected_error_args = (
        "Element 'DTE': No matching global declaration available for the validation root., "
        "line 2",
    )
    with self.assertRaises(xml_utils.XmlSchemaDocValidationError) as cm:
        validate_dte_xml(parsed_doc)
    self.assertSequenceEqual(cm.exception.args, expected_error_args)

    cleaned_doc, was_modified = clean_dte_xml(
        parsed_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )
    self.assertTrue(was_modified)

    # This will not raise.
    validate_dte_xml(cleaned_doc)

    with io.BytesIO() as out_buffer:
        xml_utils.write_xml_doc(cleaned_doc, out_buffer)
        rewritten_bytes = out_buffer.getvalue()

    # The serialized output must validate too once parsed again.
    reparsed_doc = xml_utils.parse_untrusted_xml(rewritten_bytes)
    validate_dte_xml(reparsed_doc)

    expected_file_bytes_diff = (
        b'--- \n',
        b'+++ \n',
        b'@@ -1,5 +1,5 @@\n',
        b'-<?xml version="1.0" encoding="windows-1252"?>',
        b'-<DTE version="1.0">',
        b"+<?xml version='1.0' encoding='WINDOWS-1252'?>",
        b'+<DTE xmlns="http://www.sii.cl/SiiDte" version="1.0">',
        b' <Documento ID="DTE-33-2336600">',
        b' <Encabezado>',
        b' <IdDoc>',
    )

    actual_diff_lines = list(
        difflib.diff_bytes(
            dfunc=difflib.unified_diff,
            a=original_bytes.splitlines(),
            b=rewritten_bytes.splitlines(),
        )
    )
    self.assertSequenceEqual(actual_diff_lines, expected_file_bytes_diff)
def test_clean_dte_xml_ok_2(self) -> None:
    """
    Clean the 'bad XML 2' fixture and check the resulting byte-level diff.

    The fixture has an un-namespaced 'DTE' root element and fails schema
    validation; after cleaning it must validate, both in memory and after
    being serialized and parsed again. Besides the XML declaration and the
    root element, the expected diff shows empty-element tags rewritten
    from ``" />"`` to ``"/>"``.
    """
    original_bytes = self.dte_bad_xml_2_xml_bytes
    parsed_doc = xml_utils.parse_untrusted_xml(original_bytes)

    root_tag = parsed_doc.getroottree().getroot().tag
    self.assertEqual(root_tag, 'DTE')

    # Before cleaning, schema validation is expected to fail.
    expected_error_args = (
        "Element 'DTE': No matching global declaration available for the validation root., "
        "line 2",
    )
    with self.assertRaises(xml_utils.XmlSchemaDocValidationError) as cm:
        validate_dte_xml(parsed_doc)
    self.assertSequenceEqual(cm.exception.args, expected_error_args)

    cleaned_doc, was_modified = clean_dte_xml(
        parsed_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )
    self.assertTrue(was_modified)

    # This will not raise.
    validate_dte_xml(cleaned_doc)

    with io.BytesIO() as out_buffer:
        xml_utils.write_xml_doc(cleaned_doc, out_buffer)
        rewritten_bytes = out_buffer.getvalue()

    # The serialized output must validate too once parsed again.
    reparsed_doc = xml_utils.parse_untrusted_xml(rewritten_bytes)
    validate_dte_xml(reparsed_doc)

    # NOTE(review): confirm the leading whitespace of the context lines below
    #   against the fixture file; it must match the fixture byte for byte.
    expected_file_bytes_diff = (
        b'--- \n',
        b'+++ \n',
        b'@@ -1,5 +1,5 @@\n',
        b'-<?xml version="1.0" encoding="ISO-8859-1"?>',
        b'-<DTE version="1.0">',
        b"+<?xml version='1.0' encoding='ISO-8859-1'?>",
        b'+<DTE xmlns="http://www.sii.cl/SiiDte" version="1.0">',
        b' <!-- O Win32 Chrome 73 central VERSION: v20190227 -->',
        b' <Documento ID="MiPE76399752-6048">',
        b' <Encabezado>',
        b'@@ -64,13 +64,13 @@\n',
        b' </Documento>',
        b' <Signature xmlns="http://www.w3.org/2000/09/xmldsig#">',
        b' <SignedInfo>',
        b'-<CanonicalizationMethod Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315" />',  # noqa: E501
        b'-<SignatureMethod Algorithm="http://www.w3.org/2000/09/xmldsig#rsa-sha1" />',
        b'+<CanonicalizationMethod Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315"/>',  # noqa: E501
        b'+<SignatureMethod Algorithm="http://www.w3.org/2000/09/xmldsig#rsa-sha1"/>',
        b' <Reference URI="#MiPE76399752-6048">',
        b' <Transforms>',
        b'-<Transform Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315" />',
        b'+<Transform Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315"/>',
        b' </Transforms>',
        b'-<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1" />',
        b'+<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1"/>',
        b' <DigestValue>tk/D3mfO/KtdWyFXYZHe7dtYijg=</DigestValue>',
        b' </Reference>',
        b' </SignedInfo>',
    )

    actual_diff_lines = list(
        difflib.diff_bytes(
            dfunc=difflib.unified_diff,
            a=original_bytes.splitlines(),
            b=rewritten_bytes.splitlines(),
        )
    )
    self.assertSequenceEqual(actual_diff_lines, expected_file_bytes_diff)