コード例 #1
0
    def test_various_combinations(self):
        """Check diffs that combine a word insertion with a paragraph change."""
        # Each entry: (paragraphs of 'actual', paragraphs of 'target',
        # expected string form of the diff result).
        cases = (
            # Two paragraphs become one; the expected output contains "(==)".
            ([["foo"], ["bar"]],
             [["foo", "baz", "bar"]],
             "[(= [foo]), (+ [baz]), (==), (= [bar])]"),
            # One paragraph becomes two; the expected output contains "(‖)".
            ([["foo", "bar"]],
             [["foo", "baz"], ["bar"]],
             "[(= [foo]), (+ [baz]), (‖), (= [bar])]"),
        )
        for source_paras, target_paras, expected in cases:
            actual = str(para_diff(source_paras, target_paras))
            self.assertEqual(actual, expected)
コード例 #2
0
    def test_split_para(self):
        """Check that splitting one paragraph into several yields "(‖)" items."""
        # Each entry: (paragraphs of 'actual', paragraphs of 'target',
        # expected string form of the diff result).
        cases = (
            ([["foo", "bar"]],
             [["foo"], ["bar"]],
             "[(= [foo]), (‖), (= [bar])]"),
            ([["foo", "bar", "baz"]],
             [["foo"], ["bar"], ["baz"]],
             "[(= [foo]), (‖), (= [bar]), (‖), (= [baz])]"),
            ([["foo", "bar", "baz"]],
             [["foo", "bar"], ["baz"]],
             "[(= [foo, bar]), (‖), (= [baz])]"),
        )
        for source_paras, target_paras, expected in cases:
            actual = str(para_diff(source_paras, target_paras))
            self.assertEqual(actual, expected)
コード例 #3
0
    def test_equal_paras(self):
        """Check that identical inputs produce only "(= ...)" items."""
        # Each entry: (paragraphs of 'actual', paragraphs of 'target',
        # expected string form of the diff result).
        cases = (
            # Empty input on both sides: expected to print like an empty list.
            ([], [], str([])),
            ([["foo", "bar"]],
             [["foo", "bar"]],
             "[(= [foo, bar])]"),
            ([["foo", "bar"], ["baz", "boo"]],
             [["foo", "bar"], ["baz", "boo"]],
             "[(= [foo, bar]), (= [baz, boo])]"),
        )
        for source_paras, target_paras, expected in cases:
            actual = str(para_diff(source_paras, target_paras))
            self.assertEqual(actual, expected)
コード例 #4
0
ファイル: doc_diff.py プロジェクト: ckorzen/arxiv-benchmark
def doc_diff(actual, target, junk=None):
    """ Aligns the given string 'actual' against the given string 'target'.
    Both strings are seen as sequences of paragraphs. Returns a sequence of
    operations that are necessary to transform 'actual' into 'target'. The
    operations under consideration are:

    * Split paragraph.
    * Merge paragraph.
    * Delete paragraph.
    * Rearrange paragraph.
    * Insert word.
    * Delete word.
    * Rearrange word.

    Args:
        actual: The text to transform.
        target: The text to transform 'actual' into.
        junk: Optional list that is forwarded as 'to_protect' to the
            paragraph formatter, to para_diff and to the post-processing
            step. Defaults to an empty list.
            NOTE(review): its exact semantics are defined by those helpers
            and are not visible here -- confirm.

    Returns:
        The result of merge(apply_insert_delete_replace_rearrange(...))
        applied to the para_diff result.
    """
    # Create the default per call, so that no list object is shared between
    # separate calls of this function.
    if junk is None:
        junk = []

    # 'actual' and 'target' are strings and may be arranged in paragraphs
    # which are denoted by two newlines.

    # Extract the paragraphs from 'actual' and 'target' and format them (remove
    # special characters and transform all letters to lowercase letters).
    actual_paras = util.to_formatted_paragraphs(actual, to_protect=junk)
    target_paras = util.to_formatted_paragraphs(target, to_protect=junk)

    # 'actual_paras' and 'target_paras' are lists of lists of words, where each
    # inner list includes the (normalized) words of a paragraph.
    # example: 'actual_paras' = [['words', 'of', 'first', 'paragraph'], [...]]

    # Run para diff to get the basic paragraph operations to perform to
    # transform 'actual' into 'target'.
    diff_result = para_diff.para_diff(actual_paras, target_paras, junk)

    # Post-process the paragraph operations and merge the outcome.
    return merge(apply_insert_delete_replace_rearrange(diff_result, junk))
コード例 #5
0
ファイル: doc_diff.py プロジェクト: maxdippel/PDFDiVi
def doc_diff(actual, target, excludes=None, junk=None):
    """ Given two texts, 'actual' and 'target', this method outputs a sequence
    of phrases which can be used to determine the operations required to
    transform 'actual' into 'target'. A phrase is defined by a sequence of
    words. The texts are seen as sequences of paragraphs (text blocks separated
    by two or more newlines). Phrases live within paragraphs and do *not*
    exceed paragraph boundaries. We distinguish the following phrases:

    * CommonPhrase:
        A phrase that is common to both texts.
    * ReplacePhrase:
        A phrase x in 'actual' to replace by a phrase y in 'target'.
        Note that x or y could be empty, i.e. a ReplacePhrase could indeed
        represent a phrase to delete from 'actual' or a phrase to insert into
        'target'.
    * RearrangePhrase:
        A phrase that is common to both texts but whose position in the texts
        differs.

    On comparing, words are normalized, i.e. punctuation marks and any other
    special characters will be removed and all characters will be transformed
    to lowercase.
    You can exclude certain words from normalization by defining a list
    'excludes' of regular expressions that match the words you wish to
    exclude.

    Let us define the following running example to use throughout the whole
    documentation:

    actual                  target:
    --------------------    --------------------
    The quick, big fox      The big, quick fox
    eats the ice-cold
    sandwich.               eats the ice-cold
                            sandwich.

    Note, that 'actual' consists of one paragraph and 'target' consists of two
    paragraphs.

    Args:
        actual: The text to transform.
        target: The text to transform 'actual' into.
        excludes: Optional list of regular expressions, forwarded to the
            normalization. Defaults to an empty list.
        junk: Optional list, forwarded unchanged to para_diff. Defaults to an
            empty list.

    Returns:
        A dict with the keys "num_paras_actual", "num_paras_target",
        "num_words_actual", "num_words_target" (paragraph and word counts of
        the normalized texts) and "phrases" (the result of para_diff).
    """
    # TODO: Explain 'junk'.

    # Create the defaults per call, so that no list object is shared between
    # separate calls of this function.
    if excludes is None:
        excludes = []
    if junk is None:
        junk = []

    # Split 'actual' and 'target' into paragraphs and normalize the words.
    #
    # Result is a list of lists of DocWords, where the inner lists represent
    # the paragraphs (one list for each paragraph) with included words. Each
    # DocWord includes the normalized and the unnormalized version (with
    # trailing whitespaces) of a word:
    # paras_target = [[DocWord("the", "The "), DocWord("big", "big, "), ...],
    #                 [DocWord("eats", "eats "), DocWord("the", "the "), ...]]
    paras_actual = to_normalized_paras(actual, to_lower=True, excludes=excludes)
    paras_target = to_normalized_paras(target, to_lower=True, excludes=excludes)

    return {
        "num_paras_actual": len(paras_actual),
        "num_paras_target": len(paras_target),
        "num_words_actual": sum(len(x) for x in paras_actual),
        "num_words_target": sum(len(x) for x in paras_target),
        "phrases": para_diff.para_diff(paras_actual, paras_target, junk),
    }
コード例 #6
0
    def test_replace_para(self):
        """Check that inputs without common words yield "(/ ...)" items."""
        # Each entry: (paragraphs of 'actual', paragraphs of 'target',
        # expected string form of the diff result).
        cases = (
            ([["foo"]],
             [["bar"]],
             "[(/ [foo], [bar])]"),
            ([["foo", "bar"]],
             [["baz", "boo"]],
             "[(/ [foo, bar], [baz, boo])]"),
            ([["foo"], ["bar"]],
             [["baz"], ["boo"]],
             "[(/ [foo], [baz]), (/ [bar], [boo])]"),
            ([["foo", "bar"]],
             [["baz"]],
             "[(/ [foo, bar], [baz])]"),
            ([["foo"], ["bar"]],
             [["baz"]],
             "[(/ [foo], [baz]), (/ [bar], [])]"),
            # NOTE(review): in the next two cases 'actual' is the flat list
            # ["foo"] rather than [["foo"]] -- confirm that this is intended.
            (["foo"],
             [["baz", "boo"]],
             "[(/ [foo], [baz, boo])]"),
            (["foo"],
             [["baz"], ["boo"]],
             "[(/ [foo], [baz]), (/ [], [boo])]"),
        )
        for source_paras, target_paras, expected in cases:
            actual = str(para_diff(source_paras, target_paras))
            self.assertEqual(actual, expected)
コード例 #7
0
    def test_rearrange_para(self):
        """Check that a word moved inside a paragraph yields a "(<> ...)" item."""
        # Disabled cases, kept for reference (input -> expected output):
        #   para_diff([["foo"], ["bar"]], [["bar"], ["foo"]])
        #       -> "[(<> [foo]), (= [bar])]"
        #   para_diff([["foo"], ["bar"], ["baz"]], [["baz"], ["bar"], ["foo"]])
        #       -> "[(<> [foo]), (<> [bar]), (= [baz])]"
        #   para_diff([["foo", "bar", "baz"]], [["baz", "bar", "foo"]])
        #       -> "[(<> [foo]), (<> [bar]), (= [baz])]"
        #   para_diff([["foo", "bar", "baz", "boo"]], [["baz", "boo", "foo", "bar"]])
        #       -> "[(<> [foo, bar]), (= [baz, boo])]"

        # The only active case: "baz" moves from the end to the front.
        source_paras = [["foo", "bar", "baz"]]
        target_paras = [["baz", "foo", "bar"]]
        self.assertEqual(
            str(para_diff(source_paras, target_paras)),
            "[(= [foo, bar]), (<> [baz])]")
コード例 #8
0
def doc_diff(
        actual,
        target,
        rearrange_phrases=False,
        min_rearrange_length=3,
        refuse_common_threshold=0,
        junk=None):
    """ Does doc_diff based on flat lists of DiffWord objects.

    Args:
        actual: The flat list of DiffWord objects to transform.
        target: The flat list of DiffWord objects to transform 'actual' into.
        rearrange_phrases: Forwarded unchanged to para_diff.
        min_rearrange_length: Forwarded unchanged to para_diff.
        refuse_common_threshold: Forwarded unchanged to para_diff.
        junk: Optional list, forwarded to para_diff and to
            handle_diff_result. Defaults to an empty list.

    Returns:
        The object returned by para_diff, after it was passed to
        handle_diff_result.
    """
    # Create the default per call, so that no list object is shared between
    # separate calls of this function.
    if junk is None:
        junk = []

    # Run para_diff on given input.
    diff_result = para_diff.para_diff(
        actual,
        target,
        rearrange_phrases=rearrange_phrases,
        min_rearrange_length=min_rearrange_length,
        refuse_common_threshold=refuse_common_threshold,
        junk=junk)

    # Compute the number of operations.
    # NOTE(review): the return value is discarded, so this presumably updates
    # 'diff_result' in place -- confirm against handle_diff_result.
    handle_diff_result(diff_result, junk)

    return diff_result
コード例 #9
0
 def assert_equal(self, input1, input2, expected_str):
     """Diff both inputs and compare the bracketed, comma-separated string
     form of the resulting items with 'expected_str'."""
     rendered_items = ", ".join(str(item) for item in para_diff(input1, input2))
     self.assertEqual("[%s]" % rendered_items, expected_str)
コード例 #10
0
 def assert_equal(self, input1, input2, expected_str):
     """Run para_diff on the inputs and assert that its items, rendered as
     "[item, item, ...]", equal 'expected_str'."""
     diff_items = para_diff(input1, input2)
     item_strings = map(str, diff_items)
     actual_str = "[%s]" % ", ".join(item_strings)
     self.assertEqual(actual_str, expected_str)