def clean_dte_xml_file(input_file_path: str, output_file_path: str) -> Iterable[bytes]:
    with open(input_file_path, mode='rb') as f:
        file_bytes = f.read()

    xml_doc = xml_utils.parse_untrusted_xml(file_bytes)
    xml_doc_cleaned, modified = cl_sii.dte.parse.clean_dte_xml(
        xml_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )

    # TODO: add exception with a nice message for the caller.
    cl_sii.dte.parse.validate_dte_xml(xml_doc_cleaned)

    with open(output_file_path, 'w+b') as f:
        xml_utils.write_xml_doc(xml_doc_cleaned, f)

    with open(output_file_path, mode='rb') as f:
        file_bytes_rewritten = f.read()

    # note: another way to compute the difference in a similar format is
    #   `diff -Naur $input_file_path $output_file_path`
    file_bytes_diff_gen = difflib.diff_bytes(
        dfunc=difflib.unified_diff,
        a=file_bytes.splitlines(),
        b=file_bytes_rewritten.splitlines())

    return file_bytes_diff_gen
def get_file_diff(self, test_name, file1, file2, is_bytes=False):
    msg = ''
    diffOut = ''
    if is_bytes:
        with open(file1, "rb") as f1, open(file2, "rb") as f2:
            expectedLines = f1.readlines()
            outputLines = f2.readlines()
        # diff_bytes yields bytes lines; decode them so they can be
        # accumulated in the (str) diffOut buffer.
        for line in difflib.diff_bytes(difflib.unified_diff,
                                       expectedLines, outputLines):
            diffOut += line.decode(errors='backslashreplace')
        expectedText = b''.join(expectedLines).decode(errors='backslashreplace')
        outputText = b''.join(outputLines).decode(errors='backslashreplace')
    else:
        with open(file1) as f1, open(file2) as f2:
            expectedLines = f1.readlines()
            outputLines = f2.readlines()
        for line in difflib.unified_diff(expectedLines, outputLines):
            diffOut += line
        expectedText = ''.join(expectedLines)
        outputText = ''.join(outputLines)
    if len(diffOut) > 0:
        msg += ('FAIL -- Test %s has different output than expected' % (test_name))
        msg += (wrap_message(expectedText, test_name + ' Expected Lines'))
        msg += (wrap_message(outputText, test_name + ' Output Lines'))
    return msg
def compareTwoBinaryFiles(flags, filepaths, filelines):
    exitCode = 0
    if hasattr(difflib, 'diff_bytes'):
        # python 3.5 or newer
        diffs = difflib.diff_bytes(difflib.unified_diff,
                                   filelines[0], filelines[1],
                                   filepaths[0].encode(), filepaths[1].encode(),
                                   n=flags.num_context_lines)
        diffs = [diff.decode(errors="backslashreplace") for diff in diffs]
    else:
        # python 2.7
        if flags.unified_diff:
            func = difflib.unified_diff
        else:
            func = difflib.context_diff
        diffs = func(filelines[0], filelines[1], filepaths[0], filepaths[1],
                     n=flags.num_context_lines)

    for diff in diffs:
        sys.stdout.write(to_string(diff))
        exitCode = 1

    return exitCode
def diffBinaryFiles(file1, file2):
    with open(file1, "rb") as f:
        content1 = f.read()
    with open(file2, "rb") as f:
        content2 = f.read()
    gen = difflib.diff_bytes(difflib.unified_diff, [content1], [content2])
    return not list(gen)
def diffsize(lA, lB):
    if not lA:
        return len(strip_to_diff_parts(lB))
    if not lB:
        return len(strip_to_diff_parts(lA))
    lA = strip_to_diff_parts(lA)
    lB = strip_to_diff_parts(lB)
    diff = difflib.diff_bytes(difflib.unified_diff, lA, lB)
    return len(list(diff))
def diff_bytes(file1, file2, return_str=False):
    """
    Compare the bytes of two files.

    Simulates the output of GNU diff.
    """
    texts = []
    for f in [file1, file2]:
        with open(f, 'rb') as f:
            text = f.read()
        text = text.replace(b'\r\n', b'\n')  # Ignore line breaks for Windows
        texts += [text.split(b'\n')]
    text1, text2 = texts

    output = []
    new_part = True
    num = 0
    for line in difflib.diff_bytes(difflib.unified_diff, text1, text2,
                                   fromfile=file1.encode(), tofile=file2.encode(),
                                   n=0, lineterm=b''):
        num += 1
        if num < 3:
            line = line.decode()
            line = line.replace('--- ', '<<< ')
            line = line.replace('+++ ', '>>> ')
            output += [line.encode()]
            continue
        flag = line[0:1]
        if flag == b'-':
            # line unique to sequence 1
            new_flag = b'< '
        elif flag == b'+':
            # line unique to sequence 2
            new_flag = b'> '
            if new_part:
                new_part = False
                output += [b'---']
        elif flag == b' ':
            # line common to both sequences
            # new_flag = b'  '
            continue
        elif flag == b'?':
            # line not present in either input sequence
            new_flag = b'? '
        elif flag == b'@':
            output += [re.sub(rb'@@ -([^ ]+) \+([^ ]+) @@', rb'\1c\2', line)]
            new_part = True
            continue
        else:
            new_flag = flag
        output += [new_flag + line[1:]]

    if return_str:
        return '\n'.join([repr(line)[2:-1] for line in output])
    else:
        return b'\n'.join(output)
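A brief usage sketch for the helper above; the file names and contents here are made up for illustration, and the expected output is approximate.

with open('old.txt', 'wb') as f:
    f.write(b'alpha\nbeta\n')
with open('new.txt', 'wb') as f:
    f.write(b'alpha\ngamma\n')

# Should print a GNU-diff-style report, roughly:
#   <<< old.txt
#   >>> new.txt
#   2c2
#   < beta
#   ---
#   > gamma
print(diff_bytes('old.txt', 'new.txt', return_str=True))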
def diff_score(a, b):
    la = a.splitlines(keepends=True)
    lb = b.splitlines(keepends=True)
    d = difflib.diff_bytes(difflib.context_diff, la, lb)
    sc = 0
    for ln in d:
        if ln.startswith(b'! ') or ln.startswith(b'+ ') or ln.startswith(b'- '):
            sc += 1
    return 1 - (sc / (len(la) + len(lb)))
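An illustrative call with assumed inputs: identical byte strings score 1.0, and the score drops toward 0.0 as more context-diff lines are flagged as changed.

# One line differs on each side, so the context diff marks two '! ' lines:
# sc == 2 and the score is 1 - 2/4 == 0.5.
score = diff_score(b'spam\neggs\n', b'spam\nham\n')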
def unidiff(a, b, filename_a=b'original', timestamp_a=b'',
            filename_b=b'modified', timestamp_b=b'', ignore_blanks=False):
    r"""Compare two sequences of lines; generate the resulting delta.

    Each sequence must contain individual single-line strings ending with
    newlines. Such sequences can be obtained from the `readlines()` method
    of file-like objects. The delta generated also consists of
    newline-terminated strings, ready to be printed as-is via the
    writeline() method of a file-like object.

    Note that the last line of a file may *not* have a newline; this is
    reported in the same way that GNU diff reports this. *This method only
    supports UNIX line ending conventions.*

    filename_a and filename_b are used to generate the header, allowing
    other tools to determine what 'files' were used to generate this
    output. timestamp_a and timestamp_b, when supplied, are expected to be
    last-modified timestamps to be inserted in the header, as floating
    point values since the epoch.
    """
    if isinstance(a, six.binary_type):
        a = a.splitlines()
    if isinstance(b, six.binary_type):
        b = b.splitlines()
    if isinstance(filename_a, six.text_type):
        filename_a = filename_a.encode('utf-8')
    if isinstance(filename_b, six.text_type):
        filename_b = filename_b.encode('utf-8')
    if not isinstance(timestamp_a, six.binary_type):
        timestamp_a = six.text_type(timestamp_a).encode('utf-8')
    if not isinstance(timestamp_b, six.binary_type):
        timestamp_b = six.text_type(timestamp_b).encode('utf-8')
    if ignore_blanks:
        a = [x for x in a if not BLANKS_REGEX.match(x)]
        b = [x for x in b if not BLANKS_REGEX.match(x)]

    if six.PY2:
        return difflib.unified_diff(a, b, filename_a, filename_b,
                                    timestamp_a, timestamp_b, lineterm=b"")
    else:
        return difflib.diff_bytes(difflib.unified_diff, a, b,
                                  filename_a, filename_b,
                                  timestamp_a, timestamp_b, lineterm=b"")
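A hedged usage sketch of unidiff above; the byte contents and file names are illustrative, and it assumes six is importable as in the original module (Python 3 path shown).

for line in unidiff(b'one\ntwo\n', b'one\n2\n',
                    filename_a='before.txt', filename_b='after.txt'):
    print(line.decode('utf-8'))
# Expected shape of the output (no timestamps were supplied, so none appear):
#   --- before.txt
#   +++ after.txt
#   @@ -1,2 +1,2 @@
#    one
#   -two
#   +2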
def test_byte_filenames(self):
    # somebody renamed a file from ISO-8859-2 to UTF-8
    fna = b'\xb3odz.txt'  # "łodz.txt"
    fnb = b'\xc5\x82odz.txt'

    # they transcoded the content at the same time
    a = [b'\xa3odz is a city in Poland.']
    b = [b'\xc5\x81odz is a city in Poland.']

    check = self.check
    unified = difflib.unified_diff
    context = difflib.context_diff
    check(difflib.diff_bytes(unified, a, b, fna, fnb))
    check(difflib.diff_bytes(context, a, b, fna, fnb))

    def assertDiff(expect, actual):
        # do not compare expect and equal as lists, because unittest
        # uses difflib to report difference between lists
        actual = list(actual)
        self.assertEqual(len(expect), len(actual))
        for e, a in zip(expect, actual):
            self.assertEqual(e, a)

    expect = [
        b'--- \xb3odz.txt',
        b'+++ \xc5\x82odz.txt',
        b'@@ -1 +1 @@',
        b'-\xa3odz is a city in Poland.',
        b'+\xc5\x81odz is a city in Poland.',
    ]
    actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
    assertDiff(expect, actual)

    # with dates (plain ASCII)
    datea = b'2005-03-18'
    dateb = b'2005-03-19'
    check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
    check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))

    expect = [
        # note the mixed encodings here: this is deeply wrong by every
        # tenet of Unicode, but it doesn't crash, it's parseable by
        # patch, and it's how UNIX(tm) diff behaves
        b'--- \xb3odz.txt\t2005-03-18',
        b'+++ \xc5\x82odz.txt\t2005-03-19',
        b'@@ -1 +1 @@',
        b'-\xa3odz is a city in Poland.',
        b'+\xc5\x81odz is a city in Poland.',
    ]
    actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
                                lineterm=b'')
    assertDiff(expect, actual)
def test_clean_dte_xml_ok_3(self) -> None:
    file_bytes = self.dte_bad_xml_3_xml_bytes
    xml_doc = xml_utils.parse_untrusted_xml(file_bytes)
    self.assertEqual(xml_doc.getroottree().getroot().tag, 'DTE')

    with self.assertRaises(xml_utils.XmlSchemaDocValidationError) as cm:
        validate_dte_xml(xml_doc)
    self.assertSequenceEqual(cm.exception.args, (
        "Element 'DTE': No matching global declaration available for the validation root., "
        "line 2",
    ))

    xml_doc_cleaned, modified = clean_dte_xml(
        xml_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )
    self.assertTrue(modified)

    # This will not raise.
    validate_dte_xml(xml_doc_cleaned)

    f = io.BytesIO()
    xml_utils.write_xml_doc(xml_doc_cleaned, f)
    file_bytes_rewritten = f.getvalue()
    del f

    xml_doc_rewritten = xml_utils.parse_untrusted_xml(file_bytes_rewritten)
    validate_dte_xml(xml_doc_rewritten)

    expected_file_bytes_diff = (
        b'--- \n',
        b'+++ \n',
        b'@@ -1,5 +1,5 @@\n',
        b'-<?xml version="1.0" encoding="windows-1252"?>',
        b'-<DTE version="1.0">',
        b"+<?xml version='1.0' encoding='WINDOWS-1252'?>",
        b'+<DTE xmlns="http://www.sii.cl/SiiDte" version="1.0">',
        b' <Documento ID="DTE-33-2336600">',
        b' <Encabezado>',
        b' <IdDoc>',
    )
    file_bytes_diff_gen = difflib.diff_bytes(
        dfunc=difflib.unified_diff,
        a=file_bytes.splitlines(),
        b=file_bytes_rewritten.splitlines())
    self.assertSequenceEqual(
        [diff_line for diff_line in file_bytes_diff_gen],
        expected_file_bytes_diff)
def compute_assignment(sA, dA, sB, dB):
    pmap = []
    la = len(sA)
    lb = len(sB)

    # Attempt to greedily assign an exact match with 0 weight (and
    # give the other choices for this commit a very large weight).
    # This speeds up the case where the patches are the same.
    eqA, eqB = split_away_same_patches(sA, dA, sB, dB)
    lhs1, rhs1 = compute_matching_assignment(
        [u for u, e in zip(sA, eqA) if e is None], dA,
        [v for v, e in zip(sB, eqB) if e is None], dB)

    imap = make_index_map(eqA, eqB)
    jmap = make_index_map(eqB, eqA)
    lhs = np.array(rebuild_match_list(eqA, lhs1, jmap))
    rhs = np.array(rebuild_match_list(eqB, rhs1, imap))

    # We assume the user is really more interested in the second
    # argument ("newer" version). To that end, we print the output in
    # the order of the RHS. To put the LHS commits that are no longer
    # in the RHS into a good place, we place them once we have seen
    # all of their predecessors in the LHS.
    new_on_lhs = (lhs >= lb)[:la]
    lhs_prior_counter = np.arange(la)

    def process_lhs_orphans():
        while True:
            assert (lhs_prior_counter >= 0).all()
            w = (lhs_prior_counter == 0) & new_on_lhs
            idx = w.nonzero()[0]
            if len(idx) == 0:
                break
            pmap.append((idx[0], None, None))
            new_on_lhs[idx[0]] = False
            lhs_prior_counter[idx[0] + 1:] -= 1

    for j, (u, i) in enumerate(zip(sB, rhs)):
        # now show an RHS commit
        process_lhs_orphans()
        if i < la:
            idiff = list(
                difflib.diff_bytes(difflib.unified_diff, dA[sA[i]], dB[u]))
            pmap.append((i, j, idiff))
            lhs_prior_counter[i + 1:] -= 1
        else:
            pmap.append((None, j, None))

    process_lhs_orphans()
    return pmap
def check_prune_result(expected):
    actual = sorted([int(x) for x in
                     exo([b'git', b'log', b'--pretty=format:%at']).out.splitlines()])
    if expected != actual:
        for x in expected:
            print('ex:', x, strftime('%Y-%m-%d-%H%M%S', localtime(x)), file=stderr)
        for line in diff_bytes(unified_diff,
                               [result_diffline(x) for x in expected],
                               [result_diffline(x) for x in actual],
                               fromfile=b'expected', tofile=b'actual'):
            sys.stderr.flush()
            byte_stream(sys.stderr).write(line)
    wvpass(expected == actual)
def is_content_equal(c1: Dict[str, bytes], c2: Dict[str, bytes]) -> bool:
    if len(c1.keys()) != len(c2.keys()):
        print("number of contents is not same")
        print(c1.keys())
        print(c2.keys())
        return False
    for key in c1.keys():
        if key not in c2:
            print(f"file does not exist: {key}")
            return False
        if c1[key] != c2[key]:
            print(f"file is not equal: {key}")
            for diff in difflib.diff_bytes(difflib.unified_diff,
                                           c1[key].splitlines(),
                                           c2[key].splitlines()):
                print(diff)
            return False
    return True
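A small illustrative call with in-memory contents; the keys and byte values are made up.

same = is_content_equal(
    {'a.txt': b'one\ntwo\n'},
    {'a.txt': b'one\nTWO\n'},
)
# Prints "file is not equal: a.txt" followed by the raw unified-diff byte
# lines, and `same` is False.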
def compare_to_formatted(self, filename_str: str) -> None:
    """Compare the expected formatted output to file contents."""
    # This string encode is from argparse, so we should be able to trust it.
    filename = filename_str.encode()
    actual = self.get_filelines(filename)
    expected = self.get_formatted_lines(filename)
    if self.edit_in_place:
        # If edit in place is used, the formatter will fix in place with
        # no stdout. So compare the before/after file for hook pass/fail
        expected = self.get_filelines(filename)
    diff = list(
        difflib.diff_bytes(difflib.unified_diff, actual, expected,
                           fromfile=b'original', tofile=b'formatted'))
    if len(diff) > 0:
        header = filename + b"\n" + 20 * b"=" + b"\n"
        self.stderr += header + b"\n".join(diff) + b"\n"
        self.returncode = 1
def __call__(
    self, id: str, from_version_at: str, to_version_at: str = None
) -> bytes:
    session = self.Session()
    document = session.documents.fetch(id)
    from_version = document.data(version_at=from_version_at).splitlines()
    if to_version_at:
        _to_version_at = {"version_at": to_version_at}
    else:
        _to_version_at = {}
    to_version = document.data(**_to_version_at).splitlines()
    diff = difflib.diff_bytes(
        difflib.unified_diff,
        from_version,
        to_version,
        fromfile=from_version_at.encode("utf-8"),
        tofile=to_version_at.encode("utf-8") if to_version_at else b"latest",
        lineterm=b"",
    )
    return b"\n".join(diff)
def _diff(logger, ln, l, rn, r):
    """Return the difference between two strings"""
    if l == r:
        # slightly faster path
        logger.debug("_diff '%s' and '%s' fast match", ln, rn)
        return []
    # compare
    diff = list(
        difflib.diff_bytes(difflib.unified_diff,
                           l.splitlines(),
                           r.splitlines(),
                           fromfile=ln.encode(),
                           tofile=rn.encode(),
                           lineterm=rb""))
    logger.debug("_diff: %s", diff)
    if not diff:
        # Always return a list.
        return []
    return diff
def _ndiff_matches(self, olines, dlines):
    """
    Uses difflib's ndiff to find matching lines in ancestor and alice or bob

    Args:
        olines - list of bytestrings of ancestor
        dlines - list of bytestrings of either alice or bob

    Returns:
        dictionary mapping matching line numbers in ancestor to other
    """
    on, dn = 0, 0
    matches = {}

    # See difflib.diff_bytes documentation
    # https://docs.python.org/3/library/difflib.html
    # Use this dfunc to allow ndiff to work on mixed or unknown encoded
    # byte strings
    def do_ndiff(alines, blines, fromfile, tofile, fromfiledate, tofiledate,
                 n, lineterm):
        return ndiff(alines, blines, linejunk=None, charjunk=None)

    for line in diff_bytes(do_ndiff, olines, dlines, b'ancestor', b'other',
                           b' ', b' ', n=-1, lineterm=b'\n'):
        # ndiff prefixes every line with a two-character code.
        dt = line[0:2]
        if dt == b'  ':
            on += 1
            dn += 1
            matches[on] = dn
        elif dt == b'+ ':
            dn += 1
        elif dt == b'- ':
            on += 1
    return matches
def test_byte_content(self):
    # if we receive byte strings, we return byte strings
    a = [b'hello', b'andr\xe9']      # iso-8859-1 bytes
    b = [b'hello', b'andr\xc3\xa9']  # utf-8 bytes

    unified = difflib.unified_diff
    context = difflib.context_diff

    check = self.check
    check(difflib.diff_bytes(unified, a, a))
    check(difflib.diff_bytes(unified, a, b))

    # now with filenames (content and filenames are all bytes!)
    check(difflib.diff_bytes(unified, a, a, b'a', b'a'))
    check(difflib.diff_bytes(unified, a, b, b'a', b'b'))

    # and with filenames and dates
    check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013'))
    check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013'))

    # same all over again, with context diff
    check(difflib.diff_bytes(context, a, a))
    check(difflib.diff_bytes(context, a, b))
    check(difflib.diff_bytes(context, a, a, b'a', b'a'))
    check(difflib.diff_bytes(context, a, b, b'a', b'b'))
    check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
    check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))
def run(self):
    self.detail(color(self.name, Color.UNDERLINE))
    test_input = self.read_file(self.input_file)
    self.detail(color('Command:', Color.BOLD) + ' ' + ' '.join(self.cmd))
    stdout, stderr, process = self.run_cmd(test_input)
    stdout = self.convert_output(stdout)
    stderr = self.convert_output(stderr)
    stdout_match = False
    stderr_match = False

    if path.isfile(self.stdout_file):
        expected_stdout = self.convert_output(self.read_file(self.stdout_file))
        if expected_stdout == stdout:
            stdout_match = True
        else:
            if self.kwargs.get("diff_mode", False):
                self.detail(color('STDOUT:', Color.YELLOW))
                for line in diff_bytes(unified_diff,
                                       expected_stdout.split(b'\n'),
                                       stdout.split(b'\n'),
                                       fromfile=b'Expected STDOUT',
                                       tofile=b'Received STDOUT'):
                    print(line)
            else:
                self.detail(color('Received STDOUT:', Color.YELLOW))
                self.detail(stdout.decode())
                self.detail(color('Expected STDOUT:', Color.YELLOW))
                self.detail(expected_stdout.decode())
    elif len(stdout) > 0:
        self.detail(color('Received STDOUT:', Color.YELLOW))
        self.detail(stdout.decode())
        self.detail(color('Missing STDOUT file: %s' % self.stdout_file, Color.YELLOW))
    else:
        stdout_match = True

    if path.isfile(self.stderr_file):
        expected_stderr = self.convert_output(self.read_file(self.stderr_file))
        if expected_stderr == stderr:
            stderr_match = True
        else:
            if self.kwargs.get("diff_mode", False):
                self.detail(color('STDERR:', Color.YELLOW))
                for line in diff_bytes(unified_diff,
                                       expected_stderr.split(b'\n'),
                                       stderr.split(b'\n'),
                                       fromfile=b'Expected STDERR',
                                       tofile=b'Received STDERR'):
                    print(line)
            else:
                self.detail(color('Received STDERR:', Color.YELLOW))
                self.detail(stderr.decode())
                self.detail(color('Expected STDERR:', Color.YELLOW))
                self.detail(expected_stderr.decode())
    elif len(stderr) > 0:
        self.detail(color('Received STDERR:', Color.YELLOW))
        self.detail(stderr.decode())
        self.detail(color('Missing STDERR file: %s' % self.stderr_file, Color.YELLOW))
    else:
        stderr_match = True

    self.success = stdout_match and stderr_match
    if self.success:
        self.detail(color('Success', Color.GREEN))
    else:
        self.detail(color('Failure', Color.RED))
def test_clean_dte_xml_ok_2(self) -> None:
    file_bytes = self.dte_bad_xml_2_xml_bytes
    xml_doc = xml_utils.parse_untrusted_xml(file_bytes)
    self.assertEqual(xml_doc.getroottree().getroot().tag, 'DTE')

    with self.assertRaises(xml_utils.XmlSchemaDocValidationError) as cm:
        validate_dte_xml(xml_doc)
    self.assertSequenceEqual(
        cm.exception.args,
        ("Element 'DTE': No matching global declaration available for the validation root., "
         "line 2", )
    )

    xml_doc_cleaned, modified = clean_dte_xml(
        xml_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )
    self.assertTrue(modified)

    # This will not raise.
    validate_dte_xml(xml_doc_cleaned)

    f = io.BytesIO()
    xml_utils.write_xml_doc(xml_doc_cleaned, f)
    file_bytes_rewritten = f.getvalue()
    del f

    xml_doc_rewritten = xml_utils.parse_untrusted_xml(file_bytes_rewritten)
    validate_dte_xml(xml_doc_rewritten)

    expected_file_bytes_diff = (
        b'--- \n',
        b'+++ \n',
        b'@@ -1,5 +1,5 @@\n',
        b'-<?xml version="1.0" encoding="ISO-8859-1"?>',
        b'-<DTE version="1.0">',
        b"+<?xml version='1.0' encoding='ISO-8859-1'?>",
        b'+<DTE xmlns="http://www.sii.cl/SiiDte" version="1.0">',
        b' <!-- O Win32 Chrome 73 central VERSION: v20190227 -->',
        b' <Documento ID="MiPE76399752-6048">',
        b' <Encabezado>',
        b'@@ -64,13 +64,13 @@\n',
        b' </Documento>',
        b' <Signature xmlns="http://www.w3.org/2000/09/xmldsig#">',
        b' <SignedInfo>',
        b'-<CanonicalizationMethod Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315" />',  # noqa: E501
        b'-<SignatureMethod Algorithm="http://www.w3.org/2000/09/xmldsig#rsa-sha1" />',
        b'+<CanonicalizationMethod Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315"/>',  # noqa: E501
        b'+<SignatureMethod Algorithm="http://www.w3.org/2000/09/xmldsig#rsa-sha1"/>',
        b' <Reference URI="#MiPE76399752-6048">',
        b' <Transforms>',
        b'-<Transform Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315" />',
        b'+<Transform Algorithm="http://www.w3.org/TR/2001/REC-xml-c14n-20010315"/>',
        b' </Transforms>',
        b'-<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1" />',
        b'+<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1"/>',
        b' <DigestValue>tk/D3mfO/KtdWyFXYZHe7dtYijg=</DigestValue>',
        b' </Reference>',
        b' </SignedInfo>',
    )
    file_bytes_diff_gen = difflib.diff_bytes(
        dfunc=difflib.unified_diff,
        a=file_bytes.splitlines(),
        b=file_bytes_rewritten.splitlines())
    self.assertSequenceEqual(
        [diff_line for diff_line in file_bytes_diff_gen],
        expected_file_bytes_diff
    )
# Get the program input. If NAME.input exists, read it.
# Otherwise, there is no input.
if os.path.exists(input_path):
    with open(input_path, 'rb') as f:
        input_text = f.read()
else:
    input_text = b''

# Invoke the test program and collect the merged stdout and stderr.
from subprocess import check_call, Popen, PIPE, STDOUT
p = Popen([exe_path], stdin=PIPE, stdout=PIPE, stderr=STDOUT)
stdout, _ = p.communicate(input_text)
actual = stdout.splitlines(1)

# Read the name.TEST file to get the expected output.
with open(test_path, 'rb') as f:
    expected = f.readlines()

if actual != expected:
    # Actual output differs from the expected output.
    # Format a context diff. The program output is a list of byte strings,
    # so the expected output is also read as byte strings.
    # The diff is then performed on byte strings, and the result is
    # converted to Unicode using what I hope is a lossless conversion.
    import difflib
    for line in difflib.diff_bytes(difflib.context_diff, expected, actual,
                                   b'Expected', b'Actual'):
        sys.stdout.write(line.decode('latin1'))
    sys.exit(1)
def check_reliability(service, output_dir, timeout):
    """
    Checks the reliability of a URL, and appends the result to a CSV file.

    Parameters:
        service (Service): The `Service` instance to check.
        output_dir (Path): The directory where the results CSV and downloaded
            files are stored.
        timeout (float): The timeout in seconds for the GET requests - if
            `None`, defaults to `DEFAULT_CHECK_INTERVAL`.
    """
    info(f"Checking {service.url}")
    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    db = ReliabilityDB(output_dir)
    checksum = None
    note = None
    diff_impossible = False
    try:
        with requests.get(service.url, timeout=timeout, stream=True,
                          allow_redirects=True) as r:
            # Best effort file name
            header_file_name = get_filename(r.headers.get("content-disposition"))
            last_segment = service.url.split("/")[-1]
            if header_file_name is not None:
                file_name = header_file_name
            elif ("&" not in last_segment and "?" not in last_segment
                  and (last_segment.lower().endswith(".gml")
                       or last_segment.lower().endswith(".zip"))):
                file_name = last_segment
            else:
                file_name = "download"

            content = r.content

            # Accept single file ZIP responses
            if last_segment.endswith("zip"):
                with zipfile.ZipFile(io.BytesIO(content)) as z:
                    files_info = z.infolist()
                    if len(files_info) == 1:
                        first_info = files_info[0]
                        with z.open(first_info) as f:
                            file_name = first_info.filename
                            content = f.read()
                    else:
                        diff_impossible = True
                        note = "Could not perform diff: response is multi-file ZIP."
                        if not file_name.lower().endswith("zip"):
                            file_name += ".zip"

            if not diff_impossible:
                # Attempt pretty-formatting to reduce diffs for compacted XML
                try:
                    doc = etree.parse(io.BytesIO(content))
                    content = etree.tostring(doc, encoding="utf8", pretty_print=True)
                except etree.ParseError:
                    diff_impossible = True
                    note = "Could not perform diff: invalid XML."

            checksum = hashlib.md5(content).hexdigest()
            db.add_check(ts, checksum=checksum, status=r.status_code, note=note)

            if checksum != db.latest_checksum:
                download_dir = output_dir / ts
                download_dir.mkdir()
                with open(download_dir / file_name, "wb") as f:
                    f.write(content)

                if db.latest_changed_ts is not None:
                    if diff_impossible:
                        diff_msg = note
                    else:
                        new_lines = content.splitlines()
                        with open(output_dir / db.latest_changed_ts
                                  / db.latest_changed_file_name, "rb") as f:
                            previous_lines = f.read().splitlines()
                        diff = difflib.diff_bytes(
                            difflib.unified_diff,
                            previous_lines,
                            new_lines,
                            db.latest_changed_ts.encode("utf-8"),
                            ts.encode("utf-8"),
                        )
                        diff_msg = [l for l in diff]
                        with open(download_dir / "diff", "wb") as f:
                            f.writelines(b"%b\n" % l for l in diff_msg)

                db.latest_changed_ts = ts
                db.latest_changed_file_name = file_name
                db.latest_checksum = checksum
    except requests.exceptions.Timeout:
        db.add_check(ts, timeout=True, note=note)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError):
        db.add_check(ts, conn_error=True, note=note)
    except zipfile.BadZipFile:
        db.add_check(ts, content_error=True, note="Bad Zip file")
def update_event(self, inp=-1):
    self.set_output_val(0, difflib.diff_bytes(
        self.input(0), self.input(1), self.input(2), self.input(3),
        self.input(4), self.input(5), self.input(6), self.input(7),
        self.input(8)))
difflib.unified_diff(a, b, fromfile="", tofile="", fromfiledate="",
                     tofiledate="", n=3, lineterm="\n")

Compare a and b (lists of strings); return a delta (a generator generating the
delta lines) in unified diff format.

Unified diffs are a compact way of showing just the lines that have changed
plus a few lines of context. The changes are shown in an inline style. The
number of context lines is set by n, which defaults to three.
"""

s1 = ["bacon\n", "eggs\n", "ham\n", "guido\n"]
s2 = ["python\n", "eggy\n", "hamster\n", "guido\n"]
sys.stdout.writelines(
    difflib.unified_diff(s1, s2, fromfile="before.py", tofile="after.py"))

"""
difflib.diff_bytes(dfunc, a, b, fromfile=b"", tofile=b"", fromfiledate=b"",
                   tofiledate=b"", n=3, lineterm=b"\n")

Compare a and b (lists of bytes objects) using dfunc; yield a sequence of
delta lines (also bytes) in the format returned by dfunc. dfunc must be a
callable, typically either unified_diff() or context_diff().

This allows you to compare data with unknown or inconsistent encoding. All
inputs except n must be bytes objects, not str.

difflib.IS_LINE_JUNK(line)

Returns True for ignorable lines. The line is ignorable if it is blank or
contains a single "#"; otherwise it is not ignorable. Used as the default for
the linejunk parameter in ndiff() in older versions.
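A minimal runnable sketch of the difflib.diff_bytes call described above; the file names and byte contents are made up for illustration.

import difflib
import sys

# Two versions of the same lines with different (unknown) encodings.
old = [b'bacon\n', b'andr\xe9\n']      # latin-1 bytes
new = [b'bacon\n', b'andr\xc3\xa9\n']  # utf-8 bytes

delta = difflib.diff_bytes(difflib.unified_diff, old, new,
                           fromfile=b'before.txt', tofile=b'after.txt')
for line in delta:
    # Each delta line is bytes; write it out without guessing an encoding.
    sys.stdout.buffer.write(line)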
def do_diff(self, ref_lines, test_lines, ref_file, test_file):
    return difflib.diff_bytes(difflib.unified_diff, ref_lines, test_lines,
                              fromfile=ref_file.encode('utf-8'),
                              tofile=test_file.encode('utf-8'))