コード例 #1
0
def para_diff(
        actual,
        target,
        rearrange_phrases=False,
        min_rearrange_length=3,
        refuse_common_threshold=0,
        junk=None):
    """ Does a para_diff based on flat lists of DiffWord objects.

    Args:
        actual: Flat list of DiffWord objects of the actual text. None is
            treated as an empty list.
        target: Flat list of DiffWord objects of the target text. None is
            treated as an empty list.
        rearrange_phrases: If true, rearrange() is additionally run on the
            diff result.
        min_rearrange_length: Forwarded to rearrange().
        refuse_common_threshold: Forwarded to word_diff.word_diff() and to
            rearrange().
        junk: Forwarded to break_phrases() and rearrange(). If omitted (or
            None), a fresh empty list is used for each call.

    Returns:
        The object returned by word_diff.word_diff(), after it was handed to
        break_phrases() (and, optionally, rearrange()), with the attributes
        'num_paras_actual' and 'num_paras_target' set.
    """

    # Use a fresh list per call instead of a mutable default argument, which
    # would be shared between all calls (and with the helpers it is passed to).
    junk = [] if junk is None else junk
    actual = [] if actual is None else actual
    target = [] if target is None else target

    # Compute the phrases.
    result = word_diff.word_diff(
        actual, target, refuse_common_threshold=refuse_common_threshold)

    break_phrases(result, junk=junk)

    # Provide some metadata: the number of paragraphs is derived from the
    # 'para' attribute of the last word.
    # NOTE(review): assumes 'para' is a 0-based paragraph index and that the
    # words are ordered by paragraph -- confirm against DiffWord.
    result.num_paras_actual = actual[-1].para + 1 if actual else 0
    result.num_paras_target = target[-1].para + 1 if target else 0

    if rearrange_phrases:
        # Rearrange the phrases.
        rearrange(
            result, min_rearrange_length=min_rearrange_length,
            refuse_common_threshold=refuse_common_threshold,
            junk=junk)

    # NOTE: Splitting and merging the phrases to meet paragraph boundaries
    # (split_and_merge) is currently disabled.

    return result
コード例 #2
0
ファイル: para_diff.py プロジェクト: ckorzen/arxiv-benchmark
def para_diff(actual, target, junk=None):
    """ Diff for paragraphs. Returns a list of phrases / operations to do to
    translate the paragraphs in 'actual' to the paragraphs in 'target'.
    Allowed phrases / operations are: (1) Split paragraph, (2) Merge paragraph,
    (3) Rearrange phrase, (4) Insert phrase, (5) Delete phrase and (6) common
    phrases (phrases which are common to actual and target).
    Let us introduce a simple example to use throughout the documentation:

           actual                      target
    The quick fox jumps         The quick fox jumps
    over the lazy dog.          over the lazy dog.
                                Pack my box with an
    Pack my box with an         thirty dozen liquor
    twenty dozen liquor         jugs.
    jugs.

    The actual text consists of 2 paragraphs, target consists of only 1
    paragraph. Further they differ in the words twenty <-> thirty.

    Actually, 'actual' and 'target' have to be (arbitrarily nested) lists of
    words. In our case, both lists have to be once nested lists of words of
    paragraphs:
    actual = [['The', 'quick', 'fox' ...], ['Pack', 'my', ...]]
    target = [['The', 'quick', 'fox' ...]]

    'junk' is forwarded to rearr.rearrange() and divide_phrases_per_para().
    If it is omitted (or None), a fresh empty list is used for each call.
    """

    # Use a fresh list per call instead of a mutable default argument, which
    # would be shared between all calls (and with the helpers it is passed to).
    junk = [] if junk is None else junk

    # Flatten both lists of words to be able to do a word-based diff.
    # 'actual_flatten' and 'target_flatten' are now flat lists of tuples. Each
    # tuple (<word>, <flat_pos>, <pos_stack>) consists of:
    #   <word>      : The word
    #   <flat_pos>  : The pos of word in flat representation of original list.
    #   <pos_stack> : The position stack as list. The i-th element denotes the
    #                 position of the word in the original list at level i.
    # actual_flatten = [('The', 0, [0, 0]), ..., ('Pack', 8, [1, 0]), ...]
    # target_flatten = [('The', 0, [0, 0]), ..., ('Pack', 8, [0, 8]), ...]
    actual_flatten = flatten(actual)
    target_flatten = flatten(target)

    # Do a word-based diff on 'actual_flatten' and 'target_flatten'.
    # The result is a list of diff.DiffCommonPhrase and diff.DiffReplacePhrase
    # objects denoting the operations to perform to transform actual_flatten
    # into target_flatten.
    #
    # [= [('The', 0, [0, 0]), ('quick', 1, [0, 1]), ('fox', 2, [0, 2])],
    #    [('The', 0, [0, 0]), ('quick', 1, [0, 1]), ('fox', 2, [0, 2])]]
    # denotes a DiffCommonPhrase consisting of the related elements in
    # actual_flatten and target_flatten. It implies that "The quick fox"
    # occurs in both texts.
    #
    # [/ [('twenty', 13, [1, 5])], [('thirty', 13, [0, 13])]]
    # denotes a DiffReplacePhrase consisting of the related elements in
    # actual_flatten and target_flatten. It implies that we have to replace
    # "twenty" in actual by "thirty" in target.
    # One of the element lists in a DiffReplacePhrase may be empty, denoting
    # either an insertion or a deletion.
    # NOTE: The diff result does *not* respect any paragraph boundaries. Diff
    # considers both lists as a sequence of words. For example, a
    # diff.DiffCommonPhrase may span one or more paragraph boundaries.
    diff_phrases = diff.diff(actual_flatten, target_flatten)

    # Assume that there is a phrase that occurs in both lists, but its order
    # in actual doesn't correspond to the order in target, for example:
    #
    #   The quick fox jumps        Pack my box with an
    #   over the lazy dog.         twenty dozen liquor
    #                              jugs.
    #   Pack my box with an
    #   twenty dozen liquor        The quick fox jumps
    #   jugs.                      over the lazy dog.
    #
    # The phrase "The quick fox ..." is rearranged in both texts. Diff won't
    # find such a rearrangement. Instead it would state to delete the phrase
    # from actual and to insert it in target.
    # Try to find such phrases and to rearrange them.
    rearranged_phrases = rearr.rearrange(diff_phrases, junk)

    # Because diff doesn't know any paragraph boundaries (see above), we have
    # to divide the phrases to get phrases per paragraph.
    return divide_phrases_per_para(rearranged_phrases, junk)
コード例 #3
0
ファイル: para_diff.py プロジェクト: ckorzen/arxiv-benchmark
def para_diff(actual, target, junk=None):
    """ Finds the differences between the two given lists of paragraphs.

    'actual' and 'target' may be arbitrarily nested lists of words. 'junk' is
    forwarded to rearr.rearrange() and util.ignore_phrase(); if it is omitted
    (or None), a fresh empty list is used for each call.

    Returns a list with one entry per item of the rearrange result: a
    Commons, Rearranges or Replaces object, or the item itself if it is of
    none of the handled types.
    """

    # Use a fresh list per call instead of a mutable default argument, which
    # would be shared between all calls (and with the helpers it is passed to).
    junk = [] if junk is None else junk

    # 'actual' and 'target' may be arbitrarily nested list of words.
    # In our case, both lists are once nested list of words of paragraphs.
    # example: 'actual_paras' = [['words', 'of', 'first', 'paragraph'], [...]]

    # Flatten the list of words to be able to do a word-based diff.
    actual_flatten = flatten(actual)
    target_flatten = flatten(target)

    # 'actual_flatten' and 'target_flatten' are now flat lists of tuples. Each
    # tuple (<word>, <flat_pos>, <pos_stack>) consists of:
    #   <word>      : The word
    #   <flat_pos>  : The pos of word in flat representation of original list.
    #   <pos_stack> : The position stack as list. The i-th element denotes the
    #                 position of the word in the original list at level i.
    # example: flatten([['foo', 'bar'], ['baz']])
    #            = [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])]

    # Do a word-based diff on 'actual_flatten' and 'target_flatten'.
    # The result is a list of diff.Common and diff.Replace objects denoting
    # the operations to perform to transform actual_flatten into
    # target_flatten. Both objects contain the related elements in
    # 'actual_flatten' and 'target_flatten'.
    #
    # examples:
    #
    # (= [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])],
    #    [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])])
    # denotes a common object including the related elements in
    # actual_flatten and the elements in target_flatten. It implies that
    # "foo bar baz" occurs in both lists.
    #
    # (/ [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])],
    #    [('doo', 0, [0, 0])])
    # denotes a replace object including the related elements in
    # actual_flatten and the elements in target_flatten. It implies that
    # "foo bar baz" in 'actual' is replaced by "doo" in 'target'.
    #
    # One of the element lists in diff.Replace objects may be empty, denoting
    # either an insertion or a deletion.
    diff_result = diff.diff(actual_flatten, target_flatten)

    # There could be phrases that occur in both, 'actual_flatten' and
    # 'target_flatten' but their order doesn't correspond. Try to identify and
    # rearrange such phrases.
    rearrange_result = rearr.rearrange(diff_result, junk)

    # The rearrange result is now a flat list of diff.Common, diff.Replace and
    # rearr.Rearrange objects and doesn't meet any paragraph structures.
    # So we need to split (and merge) the objects now to get operations per
    # paragraph.

    para_result = []

    # Keep track of the previous actual item and the previous target item to
    # be able to decide where to split the objects.
    prev_item_actual = None
    prev_item_target = None

    for item in rearrange_result:
        # Wrap the item into its per-paragraph counterpart. Items of any
        # other type are passed on unchanged.
        if isinstance(item, diff.Common):
            item = Commons(item, prev_item_actual, prev_item_target)
        elif isinstance(item, rearr.Rearrange):
            item = Rearranges(item, prev_item_actual, prev_item_target)
        elif isinstance(item, diff.Replace):
            item = Replaces(item, prev_item_actual, prev_item_target)

        # Update the previous actual item and the previous target item from
        # the phrases of the current item.
        # TODO: Revisit how the previous items are obtained.
        if item and item.phrases:
            # For the actual side, consider only phrases that are neither
            # Delete objects nor ignored with respect to 'junk'.
            actual_phrases = [
                phrase for phrase in item.phrases
                if not isinstance(phrase, Delete)
                and not util.ignore_phrase(phrase, junk)]
            if actual_phrases:
                last_actual_phrase = actual_phrases[-1]
                if last_actual_phrase.items_actual:
                    prev_item_actual = last_actual_phrase.items_actual[-1]

            # For the target side, consider only phrases that are neither
            # Insert objects nor ignored with respect to 'junk'.
            target_phrases = [
                phrase for phrase in item.phrases
                if not isinstance(phrase, Insert)
                and not util.ignore_phrase(phrase, junk)]
            if target_phrases:
                last_target_phrase = target_phrases[-1]
                if last_target_phrase.items_target:
                    prev_item_target = last_target_phrase.items_target[-1]

        para_result.append(item)

    # NOTE: Merging the result (merge) is currently disabled; the unmerged
    # per-item result is returned.
    return para_result