def perform_diff(doc1text, doc2text):
    """Return the raw diff between two document texts.

    Runs diff_match_patch's module-level diff() with no time limit and
    with line-mode preprocessing disabled.
    """
    import diff_match_patch
    options = {"timelimit": 0, "checklines": False}
    return diff_match_patch.diff(doc1text, doc2text, **options)
def default_differ(text1, text2):
    """Yield (op, length) diff steps that turn text1 into text2.

    op is "=" (characters shared by both texts), "-" (characters deleted
    from text1) or "+" (characters inserted from text2); length is the
    number of characters the step covers.

    Prefers an installed diff_match_patch module (either the pure-Python
    class-based API or the C extension's module-level diff()); falls back
    to the standard library's difflib when neither is available.
    """
    try:
        import diff_match_patch
        if hasattr(diff_match_patch, "diff_match_patch"):
            # https://code.google.com/p/google-diff-match-patch/source/browse/trunk/python3/diff_match_patch.py
            # diff_main yields (op, text) pairs with op in {-1, 0, +1}.
            for op, optext in diff_match_patch.diff_match_patch().diff_main(text1, text2):
                if op == -1: op = "-"
                if op == +1: op = "+"
                if op == 0: op = "="
                yield (op, len(optext))
        else:
            # https://pypi.python.org/pypi/diff_match_patch_python/1.0.1
            # Already yields (op, length) pairs in our format.
            for x in diff_match_patch.diff(text1, text2):
                yield x
    except ImportError:
        import difflib
        # BUG FIX: SequenceMatcher's first positional parameter is isjunk,
        # so the original call passed text1 as a (non-callable) junk
        # predicate and crashed. The texts belong in the a/b slots.
        diff = difflib.SequenceMatcher(None, text1, text2, autojunk=False)
        for (tag, i1, i2, j1, j2) in diff.get_opcodes():
            if tag == "equal":
                yield ("=", i2-i1)
            elif tag == "insert":
                yield ("+", j2-j1)
            elif tag == "delete":
                yield ("-", i2-i1)
            elif tag == "replace":
                # A replace is expressed as a deletion followed by an insertion.
                yield ("-", i2-i1)
                yield ("+", j2-j1)
def compute_dmp_diff(a_text, b_text, timelimit=4):
    """Diff two texts that are both str or both bytes.

    Dispatches to the module-level diff()/diff_bytes() helper that matches
    the input type, then maps each raw opcode through diff_codes.

    Raises TypeError when the inputs are of mixed or unsupported types.
    """
    common = dict(checklines=False, timelimit=timelimit,
                  cleanup_semantic=True, counts_only=False)
    if isinstance(a_text, str) and isinstance(b_text, str):
        changes = diff(a_text, b_text, **common)
    elif isinstance(a_text, bytes) and isinstance(b_text, bytes):
        changes = diff_bytes(a_text, b_text, **common)
    else:
        raise TypeError("Both the texts should be either of type 'str' or 'bytes'.")
    return [(diff_codes[op], data) for op, data in changes]
def make_amendments(contract, new_version):
    # Diff the new contract version against the previous one and persist an
    # Amendment record for every changed region, scored with assess_risk.
    #
    # contract: model object with a version_set related manager — presumably a
    #   Django model; order_by('-created_at')[1] selects the second-newest
    #   version, i.e. the one before new_version (TODO confirm new_version is
    #   already saved at this point so index 1 is the true predecessor).
    # new_version: the freshly created version whose text is compared.
    nlp = spacy.load('en')
    old_version = contract.version_set.order_by('-created_at')[1]
    # Character-level diff yielding (op, length) with op in {"=", "-", "+"}.
    changes = diff(old_version.text, new_version.text, timelimit=0, checklines=False)
    cursor_old = 0   # current position in the old text
    cursor_new = 0   # current position in the new text
    working = False  # True while a deletion is pending, awaiting a possible paired insertion
    old_text_temp = ""
    old_text_length = 0
    for op, length in changes:
        if op == "=":
            # No change for the next few characters
            # If we are working but we reach here, the last change was just a deletion, so we save it
            if working:
                risk = assess_risk(nlp, "", old_text_temp)
                Amendment.create_and_save(new_version, old_text_temp, "", cursor_new, cursor_new + old_text_length, risk)
                working = False
            cursor_old += length
            cursor_new += length
        if op == "-":
            # Some text is deleted, we need to save it to add it to an amendment
            old_text_temp = old_version.text[cursor_old:cursor_old + length]
            old_text_length = length
            working = True
            cursor_old += length
        if op == "+":
            # Some text is added, we save it either way, and check if we were working
            new_text = new_version.text[cursor_new:cursor_new + length]
            if working:
                # If we were working, we save the old changed text
                # (a "-" immediately followed by "+" is treated as a replacement)
                risk = assess_risk(nlp, new_text, old_text_temp)
                Amendment.create_and_save(new_version, old_text_temp, new_text, cursor_new, cursor_new + length, risk)
                working = False
            else:
                # If not, we just save the new text
                risk = assess_risk(nlp, new_text, "")
                Amendment.create_and_save(new_version, "", new_text, cursor_new, cursor_new + length, risk)
            cursor_new += length
    # NOTE(review): if the diff ends with a "-" hunk (no trailing "=" or "+"),
    # that final deletion is never flushed into an Amendment — confirm whether
    # this is intentional.
def perform_diff(doc1text, doc2text):
    """Diff two document texts, supporting either diff_match_patch package.

    Tries the C extension's module-level diff() first; if the installed
    package is the pure-Python one (which lacks that attribute), falls back
    to its class-based API and applies semantic cleanup.
    """
    import diff_match_patch
    try:
        # https://pypi.org/project/diff_match_patch_python/
        result = diff_match_patch.diff(
            doc1text, doc2text, timelimit=0, checklines=False)
    except AttributeError:
        # https://pypi.org/project/diff-match-patch/
        engine = diff_match_patch.diff_match_patch()
        result = engine.diff_main(doc1text, doc2text)
        engine.diff_cleanupSemantic(result)
    return result
def get_diff_steps(a: str, b: str):
    """Yield steps to turn a into b.

    Example:
        >>> list(SpanUpdater.get_diff_steps("12 34 56", "12 78 34"))
        [('=', 3), ('+', 3), ('=', 2), ('-', 3)]

    Meaning: to turn a into b, keep the first 3 characters the same,
    insert three new characters (we don't care what), keep the next two
    characters, delete three characters.
    """
    try:
        steps = diff_match_patch.diff(
            a, b, checklines=False, cleanup_semantic=False, timelimit=0)
    except AttributeError as err:
        # The wrong diff_match_patch package exposes a class, not diff().
        raise AttributeError(
            "This may be caused by having the diff_match_patch package "
            "installed, which is incompatible with "
            "diff_match_patch_python.") from err
    return steps
def get_diff_google(a, b):
    """Return the differences between strings ``a`` and ``b`` as a list.

    Each list element is a two-item tuple describing the action to apply
    to ``a`` in order to obtain ``b``:

    - ``('=', length)`` -- a chunk of the given length is identical in both
    - ``('+', 'chunk')`` -- this chunk was added in the new text
    - ``('-', 'chunk')`` -- this chunk was removed from the old text

    Uses ``google-diff-match-patch``, which is fast but does not produce
    the prettiest diffs.
    """
    import diff_match_patch
    pos_a = 0
    pos_b = 0
    result = []
    for op, l in diff_match_patch.diff(a, b, timelimit=20, checklines=False):
        if op == '=':
            # Sanity check: equal hunks must actually match in both strings.
            assert a[pos_a:pos_a + l] == b[pos_b:pos_b + l]
            result.append(('=', l))
            pos_a += l
            pos_b += l
        elif op == '-':
            result.append(('-', a[pos_a:pos_a + l]))
            pos_a += l
        elif op == '+':
            result.append(('+', b[pos_b:pos_b + l]))
            pos_b += l
    return result
def compare_xml_text(doc1, doc2, timelimit=10):
    # Compare the text of two XML documents, marking up each document with new
    # <span> tags. The documents are modified in place.
    #
    # doc1/doc2: parsed XML trees — presumably lxml documents, given the use
    #   of addnext()/getroot() (confirm against callers).
    # timelimit: seconds allowed for the underlying diff.
    # Returns (doc1, doc2), both mutated in place.
    # NOTE(review): Python 2 code — `from StringIO import StringIO` and the
    # str/unicode handling in make_bytes do not exist in Python 3.

    def make_bytes(s):
        # Coerce unicode to a UTF-8 byte string (Python 2: `str` is bytes).
        if type(s) != str:
            s = s.encode("utf8")
        else:
            pass # assume already utf8
        return s

    def serialize_document(doc):
        # Flatten the document's text into one byte string, recording for each
        # run of text which node it came from and whether it was the node's
        # .text (texttype 0) or .tail (texttype 1).
        from StringIO import StringIO
        class State(object):
            pass
        state = State()
        state.text = StringIO()
        state.offsets = list()   # entries: [start offset, byte length, node, texttype]
        state.charcount = 0
        def append_text(text, node, texttype, state):
            if not text:
                return
            text = make_bytes(text)
            state.text.write(text)
            state.offsets.append([state.charcount, len(text), node, texttype])
            state.charcount += len(text)
        def recurse_on(node, state):
            # etree handles text oddly: node.text contains the text of the element, but if
            # the element has children then only the text up to its first child, and node.tail
            # contains the text after the element but before the next sibling. To iterate the
            # text in document order, we cannot use node.iter().
            append_text(node.text, node, 0, state) # 0 == .text
            for child in node:
                recurse_on(child, state)
            append_text(node.tail, node, 1, state) # 1 == .tail
        recurse_on(doc.getroot(), state)
        state.text = state.text.getvalue()
        return state

    doc1data = serialize_document(doc1)
    doc2data = serialize_document(doc2)

    def simplify_diff(diff_iter):
        # Simplify the diff by collapsing any regions with more changes than
        # similarities, so that small unchanged regions appear within the larger
        # set of changes (as changes, not as similarities).
        prev = []  # sliding window of the last two pending hunks
        for op, length in diff_iter:
            if len(prev) < 2:
                prev.append( (op, length) )
            else:
                # If the op two hunks ago is the same as the current hunk and
                # the total lengths of two hunks ago and the current is greater
                # than the length of the hunk in the middle...
                if op in ('-', '+') and prev[0][0] == op and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append( (op, prev[0][1] + prev[1][1] + length) )
                    prev.append( ('-' if op == '+' else '+', prev[1][1]) )
                    prev.pop(0)
                    prev.pop(0)
                # If the two hunks differ in op, combine them a different way.
                elif op in ('-', '+') and prev[0][0] in ('-', '+') and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append( (prev[0][0], prev[0][1] + prev[1][1]) )
                    prev.append( (op, prev[1][1] + length) )
                    prev.pop(0)
                    prev.pop(0)
                else:
                    # No collapsing: emit the oldest pending hunk, queue this one.
                    yield prev.pop(0)
                    prev.append( (op, length) )
        # Drain whatever is still pending at the end of the diff.
        for p in prev:
            yield p

    def reformat_diff(diff_iter):
        # Re-format the operations of the diffs to indicate the byte
        # offsets on the left and right.
        left_pos = 0
        right_pos = 0
        for op, length in diff_iter:
            left_len = length if op in ("-", "=") else 0
            right_len = length if op in ("+", "=") else 0
            yield (op, left_pos, left_len, right_pos, right_len)
            left_pos += left_len
            right_pos += right_len

    def slice_bytes(text, start, end):
        # Return the range [start:length] from the byte-representation of
        # the text string, returning unicode. If text is unicode, convert to
        # bytes, take the slice, and then convert back from UTF8 as best as
        # possible since we may have messed up the UTF8 encoding.
        return make_bytes(text)[start:end].decode("utf8", "replace")

    def mark_text(doc, offsets, pos, length, mode):
        # Wrap the text in doc at position pos and of byte length length
        # with a <span>, and set the class to mode.
        def make_wrapper(label=None):
            wrapper_node = lxml.etree.Element('span')
            wrapper_node.set('class', mode)
            #if label: wrapper_node.set('make_wrapper_label', label)
            return wrapper_node
        for i, (off, offlen, offnode, offtype) in enumerate(offsets):
            # Does the change intersect this span?
            if pos >= off+offlen or pos+length <= off:
                continue
            if pos == off and length >= offlen:
                # The text to mark is the whole part of this span,
                # plus possibly some more.
                if offtype == 0:
                    # It is the node's .text, meaning replace the text
                    # that exists up to the node's first child.
                    w = make_wrapper("A")
                    w.text = offnode.text
                    offnode.text = ""
                    offnode.insert(0, w)
                else:
                    # It is the node's .tail, meaning replace the text
                    # that exists after the element and before the next
                    # sibling.
                    w = make_wrapper("B")
                    offtail = offnode.tail # see below
                    offnode.addnext(w)
                    w.text = offtail
                    w.tail = None
                    offnode.tail = ""
            elif pos == off and length < offlen:
                # The text to mark starts here but ends early.
                if offtype == 0:
                    w = make_wrapper("C")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, 0, length)
                    w.set("txt", slice_bytes(offnode.text, 0, length))
                    w.tail = slice_bytes(offnode.text, length, offlen)
                    offnode.text = ""
                else:
                    w = make_wrapper("D")
                    offtail = offnode.tail # get it early to avoid any automatic space normalization
                    offnode.addnext(w) # add it early for the same reason
                    w.text = slice_bytes(offtail, 0, length)
                    w.tail = slice_bytes(offtail, length, offlen)
                    offnode.tail = ""
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off+length, offlen-length, w, 1)
            elif pos > off and pos+length >= off+offlen:
                # The text to mark starts part way into this span and ends
                # at the end (or beyond).
                if offtype == 0:
                    w = make_wrapper("E")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos-off, offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos-off)
                else:
                    w = make_wrapper("F")
                    offtail = offnode.tail # see above
                    offnode.addnext(w) # see above
                    w.text = slice_bytes(offtail, pos-off, offlen)
                    w.tail = None
                    offnode.tail = slice_bytes(offtail, 0, pos-off)
            elif pos > off and pos+length < off+offlen:
                # The text to mark starts part way into this span and ends
                # early.
                if offtype == 0:
                    w = make_wrapper("G")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos-off, (pos-off)+length)
                    w.tail = slice_bytes(offnode.text, (pos-off)+length, offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos-off)
                else:
                    #if len(make_bytes(offnode.tail)) != offlen: raise Exception(str(len(make_bytes(offnode.tail))) + "/" + str(offlen) + "/" + lxml.etree.tostring(offnode))
                    w = make_wrapper("H")
                    offtail = offnode.tail # see above
                    offnode.addnext(w) # see above
                    w.text = slice_bytes(offtail, pos-off, (pos-off)+length)
                    w.tail = slice_bytes(offtail, (pos-off)+length, offlen)
                    offnode.tail = slice_bytes(offtail, 0, pos-off)
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off+(pos-off)+length, offlen-(pos-off)-length, w, 1)
            else:
                raise Exception()
            if pos+length > off+offlen:
                # The change spills past this span: advance to the remainder
                # and keep scanning the following spans.
                d = off+offlen - pos
                pos += d
                length -= d
            if length <= 0:
                return

    def get_bounding_nodes(pos, length, offsets):
        # Return the (first, last) nodes whose text contains the endpoints of
        # the byte range [pos, pos+length), or None if no span contains them.
        nodes = []
        for off, offlen, offnode, offtype in offsets:
            if off <= pos < off+offlen:
                nodes.append(offnode)
            if off <= pos+length < off+offlen:
                nodes.append(offnode)
        if len(nodes) == 0:
            return None
        return nodes[0], nodes[-1]

    def mark_correspondence(leftnode, rightnode, idx, ab):
        # Cross-link the two nodes by id so corresponding regions of the two
        # documents can be aligned (e.g. for synchronized scrolling).
        if not leftnode.get("id"):
            leftnode.set("id", "left_%d%s" % (idx, ab))
        if not rightnode.get("id"):
            rightnode.set("id", "right_%d%s" % (idx, ab))
        leftnode.set("cw_" + ab, rightnode.get("id"))
        rightnode.set("cw_" + ab, leftnode.get("id"))

    import diff_match_patch
    diff = diff_match_patch.diff(doc1data.text, doc2data.text, timelimit=timelimit)
    diff = reformat_diff(simplify_diff(diff))
    idx = 0
    for op, left_pos, left_len, right_pos, right_len in diff:
        idx += 1
        left_nodes = get_bounding_nodes(left_pos, left_len, doc1data.offsets)
        right_nodes = get_bounding_nodes(right_pos, right_len, doc2data.offsets)
        if left_nodes and right_nodes:
            mark_correspondence(left_nodes[0], right_nodes[0], idx, "top")
            mark_correspondence(left_nodes[1], right_nodes[1], idx, "bot")
        # Unchanged hunks that genuinely match need no markup.
        if op == "=" and doc1data.text[left_pos:left_pos+left_len] == doc2data.text[right_pos:right_pos+right_len]:
            continue
        if left_len > 0:
            mark_text(doc1, doc1data.offsets, left_pos, left_len, "del" if right_len == 0 else "change")
        if right_len > 0:
            mark_text(doc2, doc2data.offsets, right_pos, right_len, "ins" if left_len == 0 else "change")
    return doc1, doc2
# Demonstrate the diff_match_patch C extension on two short strings.
# FIX: the original used Python 2 `print` statements, which are a
# SyntaxError under Python 3 (other code in this file — sys.stdin.buffer,
# `raise ... from` — already requires Python 3). Output is unchanged.
import diff_match_patch

left_text = "this is a test"
right_text = "this is not a test"
diff = diff_match_patch.diff(left_text, right_text, timelimit=15, checklines=False)
for op, length in diff:
    if op == "-":
        print("next", length, "characters are deleted")
    if op == "=":
        print("next", length, "characters are in common")
    if op == "+":
        print("next", length, "characters are inserted")
def differ(text1, text2, timelimit=0):
    """Yield diff operations between text1 and text2.

    timelimit: seconds allowed for the diff; 0 means no limit.

    FIX: `timelimit` was previously a free name not defined anywhere in the
    function, which raises NameError unless an unseen module-level global
    happens to exist. It is now a keyword parameter with a no-limit default,
    which keeps existing two-argument callers working.
    """
    # ensure we use the C++ Google DMP and can specify the time limit
    import diff_match_patch
    for x in diff_match_patch.diff(text1, text2, timelimit=timelimit):
        yield x
def compare_text(text2, text1w, word_map, text1w_len):
    # Estimate how much of text2 appears in text1 (text1 is pre-encoded).
    #
    # text2: the raw second text.
    # text1w: the first text already encoded by to_words — presumably one
    #   character per word, so the character diff below is a word diff
    #   (TODO confirm against to_words).
    # word_map: the word -> character mapping used to encode text1w.
    # text1w_len: number of words in text1 (i.e. len of text1w).
    # Returns (ratio1, ratio2, extract): fraction of text1 matched, fraction
    # of text2 matched, and the matched text2 passages joined by "...".

    # Clone so that the original can be reused on the next call.
    word_map = dict(word_map)
    # Convert the second text to an array of words using the same word to character mapping.
    text2w = to_words(text2, word_map)
    # Invert the word_map for from_words.
    word_map = {v: k for k, v in word_map.items()}
    # Perform diff, getting the "matching blocks" of text. These
    # blocks may be single words or the entire text2 document,
    # distributed anywhere in the text1 document.
    import diff_match_patch
    ops = diff_match_patch.diff(text1w, text2w, timelimit=0, checklines=False, cleanup_semantic=False)
    i1 = 0   # cursor into text1w
    i2 = 0   # cursor into text2w
    blocks = []
    for op, oplen in ops:
        if op != "=":
            # Advance the cursor on whichever side the hunk belongs to.
            if op == "-":
                i1 += oplen
            if op == "+":
                i2 += oplen
            continue
        # An "=" hunk is a matching block; record where it sits in both texts.
        blocks.append({
            "text1_start": i1,
            "text2_start": i2,
            "size": oplen,
        })
        i1 += oplen
        i2 += oplen
    # Annotate each block with its distance to preceding and surrounding
    # blocks.
    for i in range(1, len(blocks)):
        blocks[i]['text1_nbefore'] = blocks[i]['text1_start'] - (
            blocks[i - 1]['text1_start'] + blocks[i - 1]['size'])
        blocks[i]['text2_nbefore'] = blocks[i]['text2_start'] - (
            blocks[i - 1]['text2_start'] + blocks[i - 1]['size'])
        blocks[i - 1]['text1_nafter'] = blocks[i]['text1_nbefore']
        blocks[i - 1]['text2_nafter'] = blocks[i]['text2_nbefore']
    # We want to compute a number that indicates how much of text2
    # appears in text1. So we compute the number of words in text2
    # that appear in text1 and divide by the total number of words
    # in text2. Drop blocks that are so far away from surrounding
    # text that they probably don't represent a contiguous part of
    # copied text between the documents.
    matched_blocks = [
        b for b in blocks
        if b['size'] > 10 or (b['size'] > b.get("text1_nbefore", 0)
                              and b['size'] > b.get("text1_nafter", 0)
                              and b['size'] > b.get("text2_nbefore", 0)
                              and b['size'] > b.get("text2_nafter", 0))
    ]
    # Decode the matched text2 regions back into words for display.
    extract = "...".join(
        from_words(
            text2w[block['text2_start']:block['text2_start'] + block['size']],
            word_map) for block in matched_blocks)
    ratio1 = sum(b['size'] for b in matched_blocks) / float(text1w_len)
    ratio2 = sum(b['size'] for b in matched_blocks) / float(len(text2w))
    return ratio1, ratio2, extract
def compare_xml_text(doc1, doc2, timelimit=10):
    # Compare the text of two XML documents, marking up each document with new
    # <span> tags. The documents are modified in place.
    #
    # NOTE(review): this appears to be a duplicate of the other
    # compare_xml_text definition in this file — consider consolidating.
    # Python 2 code (`from StringIO import StringIO`).
    # Returns (doc1, doc2), both mutated in place.

    def make_bytes(s):
        # Coerce unicode to a UTF-8 byte string (Python 2: `str` is bytes).
        if type(s) != str:
            s = s.encode("utf8")
        else:
            pass # assume already utf8
        return s

    def serialize_document(doc):
        # Flatten the document's text into one byte string, recording which
        # node each run of text came from and whether it was .text (0) or .tail (1).
        from StringIO import StringIO
        class State(object):
            pass
        state = State()
        state.text = StringIO()
        state.offsets = list()   # entries: [start offset, byte length, node, texttype]
        state.charcount = 0
        def append_text(text, node, texttype, state):
            if not text:
                return
            text = make_bytes(text)
            state.text.write(text)
            state.offsets.append([state.charcount, len(text), node, texttype])
            state.charcount += len(text)
        def recurse_on(node, state):
            # etree handles text oddly: node.text contains the text of the element, but if
            # the element has children then only the text up to its first child, and node.tail
            # contains the text after the element but before the next sibling. To iterate the
            # text in document order, we cannot use node.iter().
            append_text(node.text, node, 0, state) # 0 == .text
            for child in node:
                recurse_on(child, state)
            append_text(node.tail, node, 1, state) # 1 == .tail
        recurse_on(doc.getroot(), state)
        state.text = state.text.getvalue()
        return state

    doc1data = serialize_document(doc1)
    doc2data = serialize_document(doc2)

    def simplify_diff(diff_iter):
        # Simplify the diff by collapsing any regions with more changes than
        # similarities, so that small unchanged regions appear within the larger
        # set of changes (as changes, not as similarities).
        prev = []  # sliding window of the last two pending hunks
        for op, length in diff_iter:
            if len(prev) < 2:
                prev.append((op, length))
            else:
                # If the op two hunks ago is the same as the current hunk and
                # the total lengths of two hunks ago and the current is greater
                # than the length of the hunk in the middle...
                if op in ('-', '+') and prev[0][0] == op and prev[1][0] == '=' \
                        and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append((op, prev[0][1] + prev[1][1] + length))
                    prev.append(('-' if op == '+' else '+', prev[1][1]))
                    prev.pop(0)
                    prev.pop(0)
                # If the two hunks differ in op, combine them a different way.
                elif op in ('-', '+') and prev[0][0] in ('-', '+') and prev[1][0] == '=' \
                        and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append((prev[0][0], prev[0][1] + prev[1][1]))
                    prev.append((op, prev[1][1] + length))
                    prev.pop(0)
                    prev.pop(0)
                else:
                    # No collapsing: emit the oldest pending hunk, queue this one.
                    yield prev.pop(0)
                    prev.append((op, length))
        # Drain whatever is still pending.
        for p in prev:
            yield p

    def reformat_diff(diff_iter):
        # Re-format the operations of the diffs to indicate the byte
        # offsets on the left and right.
        left_pos = 0
        right_pos = 0
        for op, length in diff_iter:
            left_len = length if op in ("-", "=") else 0
            right_len = length if op in ("+", "=") else 0
            yield (op, left_pos, left_len, right_pos, right_len)
            left_pos += left_len
            right_pos += right_len

    def slice_bytes(text, start, end):
        # Return the range [start:length] from the byte-representation of
        # the text string, returning unicode. If text is unicode, convert to
        # bytes, take the slice, and then convert back from UTF8 as best as
        # possible since we may have messed up the UTF8 encoding.
        return make_bytes(text)[start:end].decode("utf8", "replace")

    def mark_text(doc, offsets, pos, length, mode):
        # Wrap the text in doc at position pos and of byte length length
        # with a <span>, and set the class to mode.
        def make_wrapper(label=None):
            wrapper_node = lxml.etree.Element('span')
            wrapper_node.set('class', mode)
            #if label: wrapper_node.set('make_wrapper_label', label)
            return wrapper_node
        for i, (off, offlen, offnode, offtype) in enumerate(offsets):
            # Does the change intersect this span?
            if pos >= off + offlen or pos + length <= off:
                continue
            if pos == off and length >= offlen:
                # The text to mark is the whole part of this span,
                # plus possibly some more.
                if offtype == 0:
                    # It is the node's .text, meaning replace the text
                    # that exists up to the node's first child.
                    w = make_wrapper("A")
                    w.text = offnode.text
                    offnode.text = ""
                    offnode.insert(0, w)
                else:
                    # It is the node's .tail, meaning replace the text
                    # that exists after the element and before the next
                    # sibling.
                    w = make_wrapper("B")
                    offtail = offnode.tail # see below
                    offnode.addnext(w)
                    w.text = offtail
                    w.tail = None
                    offnode.tail = ""
            elif pos == off and length < offlen:
                # The text to mark starts here but ends early.
                if offtype == 0:
                    w = make_wrapper("C")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, 0, length)
                    w.set("txt", slice_bytes(offnode.text, 0, length))
                    w.tail = slice_bytes(offnode.text, length, offlen)
                    offnode.text = ""
                else:
                    w = make_wrapper("D")
                    offtail = offnode.tail # get it early to avoid any automatic space normalization
                    offnode.addnext(w) # add it early for the same reason
                    w.text = slice_bytes(offtail, 0, length)
                    w.tail = slice_bytes(offtail, length, offlen)
                    offnode.tail = ""
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off + length, offlen - length, w, 1)
            elif pos > off and pos + length >= off + offlen:
                # The text to mark starts part way into this span and ends
                # at the end (or beyond).
                if offtype == 0:
                    w = make_wrapper("E")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos - off, offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos - off)
                else:
                    w = make_wrapper("F")
                    offtail = offnode.tail # see above
                    offnode.addnext(w) # see above
                    w.text = slice_bytes(offtail, pos - off, offlen)
                    w.tail = None
                    offnode.tail = slice_bytes(offtail, 0, pos - off)
            elif pos > off and pos + length < off + offlen:
                # The text to mark starts part way into this span and ends
                # early.
                if offtype == 0:
                    w = make_wrapper("G")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos - off, (pos - off) + length)
                    w.tail = slice_bytes(offnode.text, (pos - off) + length, offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos - off)
                else:
                    #if len(make_bytes(offnode.tail)) != offlen: raise Exception(str(len(make_bytes(offnode.tail))) + "/" + str(offlen) + "/" + lxml.etree.tostring(offnode))
                    w = make_wrapper("H")
                    offtail = offnode.tail # see above
                    offnode.addnext(w) # see above
                    w.text = slice_bytes(offtail, pos - off, (pos - off) + length)
                    w.tail = slice_bytes(offtail, (pos - off) + length, offlen)
                    offnode.tail = slice_bytes(offtail, 0, pos - off)
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off + (pos - off) + length, offlen - (pos - off) - length, w, 1)
            else:
                raise Exception()
            if pos + length > off + offlen:
                # The change spills past this span: advance to the remainder
                # and keep scanning the following spans.
                d = off + offlen - pos
                pos += d
                length -= d
            if length <= 0:
                return

    def get_bounding_nodes(pos, length, offsets):
        # Return the (first, last) nodes whose text contains the endpoints of
        # the byte range [pos, pos+length), or None if no span contains them.
        nodes = []
        for off, offlen, offnode, offtype in offsets:
            if off <= pos < off + offlen:
                nodes.append(offnode)
            if off <= pos + length < off + offlen:
                nodes.append(offnode)
        if len(nodes) == 0:
            return None
        return nodes[0], nodes[-1]

    def mark_correspondence(leftnode, rightnode, idx, ab):
        # Cross-link the two nodes by id so corresponding regions of the two
        # documents can be aligned.
        if not leftnode.get("id"):
            leftnode.set("id", "left_%d%s" % (idx, ab))
        if not rightnode.get("id"):
            rightnode.set("id", "right_%d%s" % (idx, ab))
        leftnode.set("cw_" + ab, rightnode.get("id"))
        rightnode.set("cw_" + ab, leftnode.get("id"))

    import diff_match_patch
    diff = diff_match_patch.diff(doc1data.text, doc2data.text, timelimit=timelimit)
    diff = reformat_diff(simplify_diff(diff))
    idx = 0
    for op, left_pos, left_len, right_pos, right_len in diff:
        idx += 1
        left_nodes = get_bounding_nodes(left_pos, left_len, doc1data.offsets)
        right_nodes = get_bounding_nodes(right_pos, right_len, doc2data.offsets)
        if left_nodes and right_nodes:
            mark_correspondence(left_nodes[0], right_nodes[0], idx, "top")
            mark_correspondence(left_nodes[1], right_nodes[1], idx, "bot")
        # Unchanged hunks that genuinely match need no markup.
        if op == "=" and doc1data.text[left_pos:left_pos + left_len] == doc2data.text[
                right_pos:right_pos + right_len]:
            continue
        if left_len > 0:
            mark_text(doc1, doc1data.offsets, left_pos, left_len, "del" if right_len == 0 else "change")
        if right_len > 0:
            mark_text(doc2, doc2data.offsets, right_pos, right_len, "ins" if left_len == 0 else "change")
    return doc1, doc2
You may obtain a copy of the licence at http://www.apache.org/licences/licence-2.0 Unless required by applicable law or agreed to in writing, software distributed under the licence is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the licence for the specific language governing permissions and limitations under the licence.""" import struct import sys from diff_match_patch import diff if __name__ == "__main__": while True: oldLen, newLen = struct.unpack("=II", sys.stdin.buffer.read(8)) if not oldLen and not newLen: break # sentinal value oldText = sys.stdin.buffer.read(oldLen).decode("utf-8") newText = sys.stdin.buffer.read(newLen).decode("utf-8") res = "" for op, text in diff(oldText, newText, counts_only=False): if op == "+": res += text.rstrip() + "\n" sys.stdout.buffer.write(struct.pack("=I", len(res))) sys.stdout.buffer.write(res.encode("utf-8")) sys.stdin.flush() sys.stdout.flush()