コード例 #1
0
ファイル: pdf-diff.py プロジェクト: pbrumm/pdf-diff
def perform_diff(doc1text, doc2text):
    """Return the character-level diff between two extracted texts.

    Delegates to the C diff_match_patch binding with no time limit and
    line-mode preprocessing disabled.
    """
    import diff_match_patch
    options = {"timelimit": 0, "checklines": False}
    return diff_match_patch.diff(doc1text, doc2text, **options)
コード例 #2
0
ファイル: command_line.py プロジェクト: nigelzor/pdf-diff
def perform_diff(doc1text, doc2text):
    """Diff two document texts character by character.

    Uses diff_match_patch's module-level diff() with an unlimited time
    budget and without the line-based speedup.
    """
    # Deferred import: the dependency is only needed when a diff is run.
    import diff_match_patch
    run_diff = diff_match_patch.diff
    return run_diff(doc1text, doc2text, timelimit=0, checklines=False)
コード例 #3
0
ファイル: __init__.py プロジェクト: alfonsodiecko/PYTHON_DIST
def default_differ(text1, text2):
    """Yield (op, length) diff hunks transforming text1 into text2.

    op is "-" (delete from text1), "+" (insert from text2) or "="
    (run common to both). Prefers a diff_match_patch implementation
    when importable; otherwise falls back to difflib.
    """
    try:
        import diff_match_patch
        if hasattr(diff_match_patch, "diff_match_patch"):
            # Pure-Python google-diff-match-patch: diff_main yields
            # (op, text) with op in {-1, 0, +1}; map to our op codes
            # and report lengths rather than the text itself.
            op_map = {-1: "-", 1: "+", 0: "="}
            for op, text in diff_match_patch.diff_match_patch().diff_main(text1, text2):
                yield (op_map[op], len(text))
        else:
            # diff_match_patch_python C extension already yields
            # (op, length) pairs in the desired form.
            for hunk in diff_match_patch.diff(text1, text2):
                yield hunk
    except ImportError:
        import difflib
        # BUG FIX: SequenceMatcher's first positional parameter is
        # `isjunk`, so the original call bound text1 to isjunk and
        # text2 to `a`, diffing text2 against an empty string. Pass
        # the sequences by keyword.
        diff = difflib.SequenceMatcher(a=text1, b=text2, autojunk=False)
        for (tag, i1, i2, j1, j2) in diff.get_opcodes():
            if tag == "equal":
                yield ("=", i2 - i1)
            elif tag == "insert":
                yield ("+", j2 - j1)
            elif tag == "delete":
                yield ("-", i2 - i1)
            elif tag == "replace":
                # A replacement is reported as a delete then an insert.
                yield ("-", i2 - i1)
                yield ("+", j2 - j1)
コード例 #4
0
def compute_dmp_diff(a_text, b_text, timelimit=4):
    """Diff two texts of the same type and decode the opcodes.

    Both arguments must be str, or both bytes; otherwise TypeError is
    raised. Returns a list of (decoded_op, payload) pairs, where the
    raw op is translated through the module-level diff_codes mapping.
    """
    shared_opts = dict(checklines=False, timelimit=timelimit,
                       cleanup_semantic=True, counts_only=False)

    if isinstance(a_text, str) and isinstance(b_text, str):
        changes = diff(a_text, b_text, **shared_opts)
    elif isinstance(a_text, bytes) and isinstance(b_text, bytes):
        changes = diff_bytes(a_text, b_text, **shared_opts)
    else:
        raise TypeError("Both the texts should be either of type 'str' or 'bytes'.")

    return [(diff_codes[op], payload) for op, payload in changes]
コード例 #5
0
def make_amendments(contract, new_version):
    """Persist one Amendment per change between the contract's previous
    version and ``new_version``.

    Walks the (op, length) hunks from diff(), keeping a cursor into the
    old and new texts, and saves deletions, insertions, and replacements
    (a "-" hunk immediately followed by a "+") with a risk score from
    assess_risk().
    """
    nlp = spacy.load('en')
    # Second-newest version (index 1 of newest-first ordering) — assumes
    # new_version is already saved and at least two versions exist;
    # TODO confirm with callers.
    old_version = contract.version_set.order_by('-created_at')[1]

    # Hunks are (op, length) with op in {"=", "-", "+"}.
    changes = diff(old_version.text, new_version.text, timelimit=0, checklines=False)

    cursor_old = 0
    cursor_new = 0

    # True while a deletion is pending, waiting to see whether the next
    # hunk is an insertion (which would make it a replacement).
    working = False

    old_text_temp = ""
    old_text_length = 0

    for op, length in changes:
        if op == "=":
            # No change for the next few characters

            # If we are working but we reach here, the last change was just a deletion, so we save it
            if working:
                risk = assess_risk(nlp, "", old_text_temp)
                Amendment.create_and_save(new_version, old_text_temp, "", cursor_new, cursor_new + old_text_length,
                                          risk)
                working = False

            cursor_old += length
            cursor_new += length

        if op == "-":
            # Some text is deleted, we need to save it to add it to an amendment
            old_text_temp = old_version.text[cursor_old:cursor_old + length]
            old_text_length = length
            working = True

            cursor_old += length

        if op == "+":
            # Some text is added, we save it either way, and check if we were working
            new_text = new_version.text[cursor_new:cursor_new + length]
            if working:
                # If we were working, we save the old changed text
                risk = assess_risk(nlp, new_text, old_text_temp)
                Amendment.create_and_save(new_version, old_text_temp, new_text, cursor_new, cursor_new + length, risk)
                working = False
            else:
                # If not, we just save the new text
                risk = assess_risk(nlp, new_text, "")
                Amendment.create_and_save(new_version, "", new_text, cursor_new, cursor_new + length, risk)

            cursor_new += length
    # NOTE(review): a trailing "-" hunk (deletion at end of text, with no
    # following "=" or "+") leaves working=True and is never saved —
    # confirm whether that is intended.
コード例 #6
0
ファイル: command_line.py プロジェクト: rpavlik/pdf-diff
def perform_diff(doc1text, doc2text):
    """Diff two texts, supporting both known diff_match_patch modules.

    Tries the C extension's module-level diff() first; if that
    attribute is absent, falls back to the pure-Python class API and
    applies semantic cleanup to its output.
    """
    import diff_match_patch

    try:
        # https://pypi.org/project/diff_match_patch_python/
        result = diff_match_patch.diff(doc1text, doc2text,
                                       timelimit=0, checklines=False)
    except AttributeError:
        # https://pypi.org/project/diff-match-patch/
        engine = diff_match_patch.diff_match_patch()
        result = engine.diff_main(doc1text, doc2text)
        engine.diff_cleanupSemantic(result)

    return result
コード例 #7
0
 def get_diff_steps(a: str, b: str):
     """Yield the edit steps that turn ``a`` into ``b``.

     Example:
         >>> list(SpanUpdater.get_diff_steps("12 34 56", "12 78 34"))
         [('=', 3), ('+', 3), ('=', 2), ('-', 3)]

     Each step is an (op, length) pair: "=" keeps that many characters,
     "+" inserts that many (content irrelevant), "-" deletes that many.
     """
     options = dict(timelimit=0, checklines=False, cleanup_semantic=False)
     try:
         return diff_match_patch.diff(a, b, **options)
     except AttributeError as e:
         raise AttributeError(
             "This may be caused by having the diff_match_patch package "
             "installed, which is incompatible with "
             "diff_match_patch_python.") from e
コード例 #8
0
def get_diff_google(a, b):
    '''Return the differences between strings ``a`` and ``b`` as a list.

    Each list element is a two-item tuple describing the action needed
    to turn ``a`` into ``b``:

    - ``('=', length)`` — a chunk of that length common to ``a`` and ``b``

    - ``('+', 'chunk')`` — this chunk was added in the new text

    - ``('-', 'chunk')`` — this chunk was removed from the old text

    Uses ``google-diff-match-patch``, which is fast but does not
    produce the prettiest diffs.
    '''

    import diff_match_patch

    pos_a = 0
    pos_b = 0

    hunks = []

    for op, size in diff_match_patch.diff(a, b, timelimit=20, checklines=False):
        if op == '=':
            # Sanity check: the claimed-common runs really match.
            assert a[pos_a:pos_a + size] == b[pos_b:pos_b + size]
            hunks.append(('=', size))
            pos_a += size
            pos_b += size
        elif op == '-':
            hunks.append(('-', a[pos_a:pos_a + size]))
            pos_a += size
        elif op == '+':
            hunks.append(('+', b[pos_b:pos_b + size]))
            pos_b += size

    return hunks
コード例 #9
0
ファイル: diff.py プロジェクト: andreymal/mini_fiction
def get_diff_google(a, b):
    '''Compute the differences between ``a`` and ``b`` as a list of
    two-item tuples describing how to transform ``a`` into ``b``:

    - ``('=', length)`` — a run of that length shared by both strings

    - ``('+', 'chunk')`` — chunk inserted by the new text

    - ``('-', 'chunk')`` — chunk deleted from the old text

    Backed by ``google-diff-match-patch``: fast, though the diffs it
    produces are not the most readable.
    '''

    import diff_match_patch

    raw = diff_match_patch.diff(a, b, timelimit=20, checklines=False)

    ia, ib = 0, 0
    result = []

    for op, l in raw:
        if op == '+':
            result.append(('+', b[ib:ib + l]))
            ib += l
        elif op == '-':
            result.append(('-', a[ia:ia + l]))
            ia += l
        elif op == '=':
            # The two slices must agree for an "equal" hunk.
            assert a[ia:ia + l] == b[ib:ib + l]
            result.append(('=', l))
            ia += l
            ib += l

    return result
コード例 #10
0
def compare_xml_text(doc1, doc2, timelimit=10):
    """Diff the text content of two lxml documents and mark changes.

    Serializes each document's text in document order, diffs the two
    streams with diff_match_patch, then wraps changed runs in new
    <span class="del"/"ins"/"change"> elements and cross-links
    corresponding regions via id/cw_* attributes. Both documents are
    modified in place and returned.

    NOTE(review): this is Python-2-era code (`from StringIO import
    StringIO`, str/bytes assumptions in make_bytes) — confirm the
    target runtime before reuse.
    """
    # Compare the text of two XML documents, marking up each document with new
    # <span> tags. The documents are modified in place.
    
    def make_bytes(s):
        # Coerce to a UTF-8 byte string; offsets below are byte offsets.
        if type(s) != str:
            s = s.encode("utf8")
        else:
            pass # assume already utf8
        return s
    
    def serialize_document(doc):
        # Flatten the document's text into one string plus a table of
        # [offset, length, node, texttype] entries mapping byte ranges
        # back to the .text (0) or .tail (1) of their source node.
        from StringIO import StringIO
        class State(object):
            pass
        state = State()
        state.text = StringIO()
        state.offsets = list()
        state.charcount = 0
        def append_text(text, node, texttype, state):
            if not text: return
            text = make_bytes(text)
            state.text.write(text)
            state.offsets.append([state.charcount, len(text), node, texttype])
            state.charcount += len(text)
        def recurse_on(node, state):
            # etree handles text oddly: node.text contains the text of the element, but if
            # the element has children then only the text up to its first child, and node.tail
            # contains the text after the element but before the next sibling. To iterate the
            # text in document order, we cannot use node.iter().
            append_text(node.text, node, 0, state) # 0 == .text
            for child in node:
                recurse_on(child, state)
            append_text(node.tail, node, 1, state) # 1 == .tail
        recurse_on(doc.getroot(), state)
        state.text = state.text.getvalue()
        return state
        
    doc1data = serialize_document(doc1)
    doc2data = serialize_document(doc2)
    
    def simplify_diff(diff_iter):
        # Simplify the diff by collapsing any regions with more changes than
        # similarities, so that small unchanged regions appear within the larger
        # set of changes (as changes, not as similarities).
        prev = []
        for op, length in diff_iter:
            if len(prev) < 2:
                prev.append( (op, length) )
            else:
                # If the op two hunks ago is the same as the current hunk and
                # the total lengths of two hunks ago and the current is creater
                # than the length of the hunk in the middle...
                if op in ('-', '+') and prev[0][0] == op and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append( (op, prev[0][1] + prev[1][1] + length) )
                    prev.append( ('-' if op == '+' else '+', prev[1][1]) )
                    prev.pop(0)
                    prev.pop(0)
                    
                # If the two hunks differ in op, combine them a different way.
                elif op in ('-', '+') and prev[0][0] in ('-', '+') and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append( (prev[0][0], prev[0][1] + prev[1][1]) )
                    prev.append( (op, prev[1][1] + length) )
                    prev.pop(0)
                    prev.pop(0)
                
                else:
                    yield prev.pop(0)
                    prev.append( (op, length) )
        for p in prev:
            yield p
    
    def reformat_diff(diff_iter):
        # Re-format the operations of the diffs to indicate the byte
        # offsets on the left and right.
        left_pos = 0
        right_pos = 0
        for op, length in diff_iter:
            left_len = length if op in ("-", "=") else 0
            right_len = length if op in ("+", "=") else 0
            yield (op, left_pos, left_len, right_pos, right_len)
            left_pos += left_len
            right_pos += right_len
           
    def slice_bytes(text, start, end):
        # Return the range [start:length] from the byte-representation of
        # the text string, returning unicode. If text is unicode, convert to
        # bytes, take the slice, and then convert back from UTF8 as best as
        # possible since we may have messed up the UTF8 encoding.
       return make_bytes(text)[start:end].decode("utf8", "replace")
           
    def mark_text(doc, offsets, pos, length, mode):
       # Wrap the text in doc at position pos and of byte length length
       # with a <span>, and set the class to mode.
       def make_wrapper(label=None):
           wrapper_node = lxml.etree.Element('span')
           wrapper_node.set('class', mode)
           #if label: wrapper_node.set('make_wrapper_label', label)
           return wrapper_node
       # Four cases below, depending on whether the marked range starts at
       # the span boundary and whether it ends at/after the span's end.
       for i, (off, offlen, offnode, offtype) in enumerate(offsets):
           # Does the change intersect this span?
           if pos >= off+offlen or pos+length <= off: continue
           
           if pos == off and length >= offlen:
               # The text to mark is the whole part of this span,
               # plus possibly some more.
               if offtype == 0:
                   # It is the node's .text, meaning replace the text
                   # that exists up to the node's first child.
                   w = make_wrapper("A")
                   w.text = offnode.text
                   offnode.text = ""
                   offnode.insert(0, w)
               else:
                   # It is the node's .tail, meaning replace the text
                   # that exists after the element and before the next
                   # sibling.
                   w = make_wrapper("B")
                   offtail = offnode.tail # see below
                   offnode.addnext(w)
                   w.text = offtail
                   w.tail = None
                   offnode.tail = ""
           elif pos == off and length < offlen:
               # The text to mark starts here but ends early.
               if offtype == 0:
                   w = make_wrapper("C")
                   offnode.insert(0, w)
                   w.text = slice_bytes(offnode.text, 0, length)
                   w.set("txt", slice_bytes(offnode.text, 0, length))
                   w.tail = slice_bytes(offnode.text, length, offlen)
                   offnode.text = ""
               else:
                   w = make_wrapper("D")
                   offtail = offnode.tail # get it early to avoid any automatic space normalization
                   offnode.addnext(w) # add it early for the same reason
                   w.text = slice_bytes(offtail, 0, length)
                   w.tail = slice_bytes(offtail, length, offlen)
                   offnode.tail = ""
               # After this point we may come back to edit more text in this
               # node after this point. However, what was in this node at offset
               # x is now in the tail of the new wrapper node at position x-length.
               offsets[i] = (off+length, offlen-length, w, 1)
           elif pos > off and pos+length >= off+offlen:
               # The text to mark starts part way into this span and ends
               # at the end (or beyond).
               if offtype == 0:
                   w = make_wrapper("E")
                   offnode.insert(0, w)
                   w.text = slice_bytes(offnode.text, pos-off, offlen)
                   offnode.text = slice_bytes(offnode.text, 0, pos-off)
               else:
                   w = make_wrapper("F")
                   offtail = offnode.tail # see above
                   offnode.addnext(w) # see above
                   w.text = slice_bytes(offtail, pos-off, offlen)
                   w.tail = None
                   offnode.tail = slice_bytes(offtail, 0, pos-off)
           elif pos > off and pos+length < off+offlen:
               # The text to mark starts part way into this span and ends
               # early.
               if offtype == 0:
                   w = make_wrapper("G")
                   offnode.insert(0, w)
                   w.text = slice_bytes(offnode.text, pos-off, (pos-off)+length)
                   w.tail = slice_bytes(offnode.text, (pos-off)+length, offlen)
                   offnode.text = slice_bytes(offnode.text, 0, pos-off)
               else:
                   #if len(make_bytes(offnode.tail)) != offlen: raise Exception(str(len(make_bytes(offnode.tail))) + "/" + str(offlen) + "/" + lxml.etree.tostring(offnode))
                   w = make_wrapper("H")
                   offtail = offnode.tail # see above
                   offnode.addnext(w) # see above
                   w.text = slice_bytes(offtail, pos-off, (pos-off)+length)
                   w.tail = slice_bytes(offtail, (pos-off)+length, offlen)
                   offnode.tail = slice_bytes(offtail, 0, pos-off)
               # After this point we may come back to edit more text in this
               # node after this point. However, what was in this node at offset
               # x is now in the tail of the new wrapper node at position x-length.
               offsets[i] = (off+(pos-off)+length, offlen-(pos-off)-length, w, 1)
           else:
               raise Exception()
           
           if pos+length > off+offlen:
               d = off+offlen - pos
               pos += d
               length -= d
               if length <= 0: return
           
    def get_bounding_nodes(pos, length, offsets):
       # Return the (first, last) source nodes whose text contains the
       # range's endpoints, or None when neither endpoint hits a span.
       nodes = []
       for off, offlen, offnode, offtype in offsets:
           if off <= pos < off+offlen:
               nodes.append(offnode)
           if off <= pos+length < off+offlen:
               nodes.append(offnode)
       if len(nodes) == 0: return None
       return nodes[0], nodes[-1]
    def mark_correspondence(leftnode, rightnode, idx, ab):
        # Give both nodes stable ids (if missing) and point each at the
        # other via a cw_<ab> attribute so a UI can link the two sides.
        if not leftnode.get("id"): leftnode.set("id", "left_%d%s" % (idx, ab))
        if not rightnode.get("id"): rightnode.set("id", "right_%d%s" % (idx, ab))
        leftnode.set("cw_" + ab, rightnode.get("id"))
        rightnode.set("cw_" + ab, leftnode.get("id"))
           
    import diff_match_patch
    # Hunks are (op, length) with op in {"=", "-", "+"}; simplify_diff
    # merges noisy runs, reformat_diff adds byte offsets per side.
    diff = diff_match_patch.diff(doc1data.text, doc2data.text, timelimit=timelimit)
    diff = reformat_diff(simplify_diff(diff))
    idx = 0
    for op, left_pos, left_len, right_pos, right_len in diff:
        idx += 1
        left_nodes = get_bounding_nodes(left_pos, left_len, doc1data.offsets)
        right_nodes = get_bounding_nodes(right_pos, right_len, doc2data.offsets)
        if left_nodes and right_nodes:
            mark_correspondence(left_nodes[0], right_nodes[0], idx, "top")
            mark_correspondence(left_nodes[1], right_nodes[1], idx, "bot")
        
        if op == "=" and doc1data.text[left_pos:left_pos+left_len] == doc2data.text[right_pos:right_pos+right_len]: continue
        if left_len > 0: mark_text(doc1, doc1data.offsets, left_pos, left_len, "del" if right_len == 0 else "change")
        if right_len > 0: mark_text(doc2, doc2data.offsets, right_pos, right_len, "ins" if left_len == 0 else "change")
    
    return doc1, doc2
コード例 #11
0
import diff_match_patch

left_text = "this is a test"
right_text = "this is not a test"

# Character-level diff; each hunk is an (op, length) pair.
diff = diff_match_patch.diff(left_text, right_text, timelimit=15, checklines=False)

# BUG FIX: the original used Python 2 print statements, which are a
# SyntaxError on Python 3. print() calls produce identical output.
for op, length in diff:
	if op == "-":
		print("next", length, "characters are deleted")
	if op == "=":
		print("next", length, "characters are in common")
	if op == "+":
		print("next", length, "characters are inserted")
コード例 #12
0
 def differ(text1, text2):
     # Delegate to the C++ Google diff-match-patch binding so a time
     # limit can be specified. NOTE: `timelimit` is a free variable
     # resolved from the enclosing scope, not a parameter.
     import diff_match_patch
     yield from diff_match_patch.diff(text1, text2, timelimit=timelimit)
コード例 #13
0
def compare_text(text2, text1w, word_map, text1w_len):
    """Estimate how much of text2's content also appears in text1.

    text1w is text1 pre-encoded as one character per word; word_map is
    the word->char mapping used to build it. Returns
    (ratio1, ratio2, extract): matched words over text1's and text2's
    word counts, plus the matched text2 passages joined by "...".
    """
    # Work on a copy so the caller can reuse its word_map afterwards.
    word_map = dict(word_map)

    # Encode the second text with the same word-to-character mapping.
    text2w = to_words(text2, word_map)

    # from_words needs the inverse (char -> word) mapping.
    word_map = {v: k for k, v in word_map.items()}

    # Diff the two encoded texts; the "=" hunks are the matching blocks,
    # which may be single words or the entire text2 document scattered
    # anywhere in text1.
    import diff_match_patch
    ops = diff_match_patch.diff(text1w, text2w, timelimit=0,
                                checklines=False, cleanup_semantic=False)

    blocks = []
    pos1 = pos2 = 0
    for op, oplen in ops:
        if op == "=":
            blocks.append({
                "text1_start": pos1,
                "text2_start": pos2,
                "size": oplen,
            })
            pos1 += oplen
            pos2 += oplen
        elif op == "-":
            pos1 += oplen
        elif op == "+":
            pos2 += oplen

    # Annotate each block with its distance to the neighbouring blocks
    # in both texts.
    for i in range(1, len(blocks)):
        prev_b, cur = blocks[i - 1], blocks[i]
        cur['text1_nbefore'] = cur['text1_start'] - (prev_b['text1_start'] + prev_b['size'])
        cur['text2_nbefore'] = cur['text2_start'] - (prev_b['text2_start'] + prev_b['size'])
        prev_b['text1_nafter'] = cur['text1_nbefore']
        prev_b['text2_nafter'] = cur['text2_nbefore']

    # Keep only blocks that are large, or larger than the gaps around
    # them — isolated small matches probably are not copied text.
    matched_blocks = []
    for b in blocks:
        surrounded = (b['size'] > b.get("text1_nbefore", 0)
                      and b['size'] > b.get("text1_nafter", 0)
                      and b['size'] > b.get("text2_nbefore", 0)
                      and b['size'] > b.get("text2_nafter", 0))
        if b['size'] > 10 or surrounded:
            matched_blocks.append(b)

    extract = "...".join(
        from_words(
            text2w[b['text2_start']:b['text2_start'] + b['size']],
            word_map) for b in matched_blocks)
    matched_total = sum(b['size'] for b in matched_blocks)
    ratio1 = matched_total / float(text1w_len)
    ratio2 = matched_total / float(len(text2w))
    return ratio1, ratio2, extract
コード例 #14
0
def compare_xml_text(doc1, doc2, timelimit=10):
    """Diff the text content of two lxml documents and mark changes.

    Serializes each document's text in document order, diffs the two
    streams with diff_match_patch, then wraps changed runs in new
    <span class="del"/"ins"/"change"> elements and cross-links
    corresponding regions via id/cw_* attributes. Both documents are
    modified in place and returned.

    NOTE(review): Python-2-era code (`from StringIO import StringIO`,
    str/bytes assumptions in make_bytes) — confirm the target runtime.
    """
    # Compare the text of two XML documents, marking up each document with new
    # <span> tags. The documents are modified in place.

    def make_bytes(s):
        # Coerce to a UTF-8 byte string; offsets below are byte offsets.
        if type(s) != str:
            s = s.encode("utf8")
        else:
            pass  # assume already utf8
        return s

    def serialize_document(doc):
        # Flatten the document's text into one string plus a table of
        # [offset, length, node, texttype] entries mapping byte ranges
        # back to the .text (0) or .tail (1) of their source node.
        from StringIO import StringIO

        class State(object):
            pass

        state = State()
        state.text = StringIO()
        state.offsets = list()
        state.charcount = 0

        def append_text(text, node, texttype, state):
            if not text: return
            text = make_bytes(text)
            state.text.write(text)
            state.offsets.append([state.charcount, len(text), node, texttype])
            state.charcount += len(text)

        def recurse_on(node, state):
            # etree handles text oddly: node.text contains the text of the element, but if
            # the element has children then only the text up to its first child, and node.tail
            # contains the text after the element but before the next sibling. To iterate the
            # text in document order, we cannot use node.iter().
            append_text(node.text, node, 0, state)  # 0 == .text
            for child in node:
                recurse_on(child, state)
            append_text(node.tail, node, 1, state)  # 1 == .tail

        recurse_on(doc.getroot(), state)
        state.text = state.text.getvalue()
        return state

    doc1data = serialize_document(doc1)
    doc2data = serialize_document(doc2)

    def simplify_diff(diff_iter):
        # Simplify the diff by collapsing any regions with more changes than
        # similarities, so that small unchanged regions appear within the larger
        # set of changes (as changes, not as similarities).
        prev = []
        for op, length in diff_iter:
            if len(prev) < 2:
                prev.append((op, length))
            else:
                # If the op two hunks ago is the same as the current hunk and
                # the total lengths of two hunks ago and the current is creater
                # than the length of the hunk in the middle...
                if op in ('-', '+') and prev[0][0] == op and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append((op, prev[0][1] + prev[1][1] + length))
                    prev.append(('-' if op == '+' else '+', prev[1][1]))
                    prev.pop(0)
                    prev.pop(0)

                # If the two hunks differ in op, combine them a different way.
                elif op in ('-', '+') and prev[0][0] in ('-', '+') and prev[1][0] == '=' \
                    and prev[0][1] + length > (prev[1][1]-1)**1.4:
                    prev.append((prev[0][0], prev[0][1] + prev[1][1]))
                    prev.append((op, prev[1][1] + length))
                    prev.pop(0)
                    prev.pop(0)

                else:
                    yield prev.pop(0)
                    prev.append((op, length))
        for p in prev:
            yield p

    def reformat_diff(diff_iter):
        # Re-format the operations of the diffs to indicate the byte
        # offsets on the left and right.
        left_pos = 0
        right_pos = 0
        for op, length in diff_iter:
            left_len = length if op in ("-", "=") else 0
            right_len = length if op in ("+", "=") else 0
            yield (op, left_pos, left_len, right_pos, right_len)
            left_pos += left_len
            right_pos += right_len

    def slice_bytes(text, start, end):
        # Return the range [start:length] from the byte-representation of
        # the text string, returning unicode. If text is unicode, convert to
        # bytes, take the slice, and then convert back from UTF8 as best as
        # possible since we may have messed up the UTF8 encoding.
        return make_bytes(text)[start:end].decode("utf8", "replace")

    def mark_text(doc, offsets, pos, length, mode):
        # Wrap the text in doc at position pos and of byte length length
        # with a <span>, and set the class to mode.
        def make_wrapper(label=None):
            wrapper_node = lxml.etree.Element('span')
            wrapper_node.set('class', mode)
            #if label: wrapper_node.set('make_wrapper_label', label)
            return wrapper_node

        # Four cases below, depending on whether the marked range starts
        # at the span boundary and whether it ends at/after the span's end.
        for i, (off, offlen, offnode, offtype) in enumerate(offsets):
            # Does the change intersect this span?
            if pos >= off + offlen or pos + length <= off: continue

            if pos == off and length >= offlen:
                # The text to mark is the whole part of this span,
                # plus possibly some more.
                if offtype == 0:
                    # It is the node's .text, meaning replace the text
                    # that exists up to the node's first child.
                    w = make_wrapper("A")
                    w.text = offnode.text
                    offnode.text = ""
                    offnode.insert(0, w)
                else:
                    # It is the node's .tail, meaning replace the text
                    # that exists after the element and before the next
                    # sibling.
                    w = make_wrapper("B")
                    offtail = offnode.tail  # see below
                    offnode.addnext(w)
                    w.text = offtail
                    w.tail = None
                    offnode.tail = ""
            elif pos == off and length < offlen:
                # The text to mark starts here but ends early.
                if offtype == 0:
                    w = make_wrapper("C")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, 0, length)
                    w.set("txt", slice_bytes(offnode.text, 0, length))
                    w.tail = slice_bytes(offnode.text, length, offlen)
                    offnode.text = ""
                else:
                    w = make_wrapper("D")
                    offtail = offnode.tail  # get it early to avoid any automatic space normalization
                    offnode.addnext(w)  # add it early for the same reason
                    w.text = slice_bytes(offtail, 0, length)
                    w.tail = slice_bytes(offtail, length, offlen)
                    offnode.tail = ""
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off + length, offlen - length, w, 1)
            elif pos > off and pos + length >= off + offlen:
                # The text to mark starts part way into this span and ends
                # at the end (or beyond).
                if offtype == 0:
                    w = make_wrapper("E")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos - off, offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos - off)
                else:
                    w = make_wrapper("F")
                    offtail = offnode.tail  # see above
                    offnode.addnext(w)  # see above
                    w.text = slice_bytes(offtail, pos - off, offlen)
                    w.tail = None
                    offnode.tail = slice_bytes(offtail, 0, pos - off)
            elif pos > off and pos + length < off + offlen:
                # The text to mark starts part way into this span and ends
                # early.
                if offtype == 0:
                    w = make_wrapper("G")
                    offnode.insert(0, w)
                    w.text = slice_bytes(offnode.text, pos - off,
                                         (pos - off) + length)
                    w.tail = slice_bytes(offnode.text, (pos - off) + length,
                                         offlen)
                    offnode.text = slice_bytes(offnode.text, 0, pos - off)
                else:
                    #if len(make_bytes(offnode.tail)) != offlen: raise Exception(str(len(make_bytes(offnode.tail))) + "/" + str(offlen) + "/" + lxml.etree.tostring(offnode))
                    w = make_wrapper("H")
                    offtail = offnode.tail  # see above
                    offnode.addnext(w)  # see above
                    w.text = slice_bytes(offtail, pos - off,
                                         (pos - off) + length)
                    w.tail = slice_bytes(offtail, (pos - off) + length, offlen)
                    offnode.tail = slice_bytes(offtail, 0, pos - off)
                # After this point we may come back to edit more text in this
                # node after this point. However, what was in this node at offset
                # x is now in the tail of the new wrapper node at position x-length.
                offsets[i] = (off + (pos - off) + length,
                              offlen - (pos - off) - length, w, 1)
            else:
                raise Exception()

            if pos + length > off + offlen:
                d = off + offlen - pos
                pos += d
                length -= d
                if length <= 0: return

    def get_bounding_nodes(pos, length, offsets):
        # Return the (first, last) source nodes whose text contains the
        # range's endpoints, or None when neither endpoint hits a span.
        nodes = []
        for off, offlen, offnode, offtype in offsets:
            if off <= pos < off + offlen:
                nodes.append(offnode)
            if off <= pos + length < off + offlen:
                nodes.append(offnode)
        if len(nodes) == 0: return None
        return nodes[0], nodes[-1]

    def mark_correspondence(leftnode, rightnode, idx, ab):
        # Give both nodes stable ids (if missing) and point each at the
        # other via a cw_<ab> attribute so a UI can link the two sides.
        if not leftnode.get("id"): leftnode.set("id", "left_%d%s" % (idx, ab))
        if not rightnode.get("id"):
            rightnode.set("id", "right_%d%s" % (idx, ab))
        leftnode.set("cw_" + ab, rightnode.get("id"))
        rightnode.set("cw_" + ab, leftnode.get("id"))

    import diff_match_patch
    # Hunks are (op, length) with op in {"=", "-", "+"}; simplify_diff
    # merges noisy runs, reformat_diff adds byte offsets per side.
    diff = diff_match_patch.diff(doc1data.text,
                                 doc2data.text,
                                 timelimit=timelimit)
    diff = reformat_diff(simplify_diff(diff))
    idx = 0
    for op, left_pos, left_len, right_pos, right_len in diff:
        idx += 1
        left_nodes = get_bounding_nodes(left_pos, left_len, doc1data.offsets)
        right_nodes = get_bounding_nodes(right_pos, right_len,
                                         doc2data.offsets)
        if left_nodes and right_nodes:
            mark_correspondence(left_nodes[0], right_nodes[0], idx, "top")
            mark_correspondence(left_nodes[1], right_nodes[1], idx, "bot")

        if op == "=" and doc1data.text[left_pos:left_pos +
                                       left_len] == doc2data.text[
                                           right_pos:right_pos + right_len]:
            continue
        if left_len > 0:
            mark_text(doc1, doc1data.offsets, left_pos, left_len,
                      "del" if right_len == 0 else "change")
        if right_len > 0:
            mark_text(doc2, doc2data.offsets, right_pos, right_len,
                      "ins" if left_len == 0 else "change")

    return doc1, doc2
コード例 #15
0
ファイル: nvda_dmp.py プロジェクト: feerrenrut/nvda_dmp
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License."""

import struct
import sys

from diff_match_patch import diff

if __name__ == "__main__":
    # Protocol: read two little-endian uint32 lengths, then that many
    # UTF-8 bytes of old and new text; reply with a uint32 length
    # followed by the UTF-8 payload of newly inserted lines.
    while True:
        oldLen, newLen = struct.unpack("=II", sys.stdin.buffer.read(8))
        if not oldLen and not newLen:
            break  # sentinel value: two zero lengths end the session
        oldText = sys.stdin.buffer.read(oldLen).decode("utf-8")
        newText = sys.stdin.buffer.read(newLen).decode("utf-8")
        res = ""
        # Collect only the inserted ("+") hunks of the diff.
        for op, text in diff(oldText, newText, counts_only=False):
            if op == "+":
                res += text.rstrip() + "\n"
        encoded = res.encode("utf-8")
        # BUG FIX: announce the byte length of the encoded payload.
        # len(res) counts characters, which undercounts the bytes the
        # peer must read whenever the text contains non-ASCII.
        sys.stdout.buffer.write(struct.pack("=I", len(encoded)))
        sys.stdout.buffer.write(encoded)
        # Only stdout needs flushing; flushing the read-only stdin
        # stream (as the original did) is meaningless.
        sys.stdout.flush()