def test_various_combinations(self):
    """
    Checks para_diff on inputs that combine a word insertion with a change
    of the paragraph boundaries.

    Each case compares the string form of the para_diff result against the
    expected rendering.
    """
    cases = (
        # 'actual' has two paragraphs, 'target' a single one that
        # additionally contains "baz".
        (
            [["foo"], ["bar"]],
            [["foo", "baz", "bar"]],
            "[(= [foo]), (+ [baz]), (==), (= [bar])]",
        ),
        # 'actual' has one paragraph, 'target' two; "baz" replaces nothing
        # but is new at the end of the first target paragraph.
        (
            [["foo", "bar"]],
            [["foo", "baz"], ["bar"]],
            "[(= [foo]), (+ [baz]), (‖), (= [bar])]",
        ),
    )

    for paras_actual, paras_target, expected in cases:
        self.assertEqual(str(para_diff(paras_actual, paras_target)), expected)
def test_split_para(self):
    """
    Checks para_diff when a single paragraph in 'actual' is distributed
    over several paragraphs in 'target' without any word changes.

    In the expected renderings, "(‖)" appears at every position where the
    target starts a new paragraph.
    """
    one_para = [["foo", "bar", "baz"]]
    cases = (
        # One paragraph of two words -> two paragraphs of one word each.
        (
            [["foo", "bar"]],
            [["foo"], ["bar"]],
            "[(= [foo]), (‖), (= [bar])]",
        ),
        # One paragraph of three words -> three single-word paragraphs.
        (
            one_para,
            [["foo"], ["bar"], ["baz"]],
            "[(= [foo]), (‖), (= [bar]), (‖), (= [baz])]",
        ),
        # One paragraph of three words -> a two-word and a one-word
        # paragraph.
        (
            one_para,
            [["foo", "bar"], ["baz"]],
            "[(= [foo, bar]), (‖), (= [baz])]",
        ),
    )

    for paras_actual, paras_target, expected in cases:
        self.assertEqual(str(para_diff(paras_actual, paras_target)), expected)
def test_equal_paras(self):
    """
    Checks para_diff when 'actual' and 'target' are identical: no input,
    one paragraph, and two paragraphs.
    """
    single = [["foo", "bar"]]
    double = [["foo", "bar"], ["baz", "boo"]]
    cases = (
        # Empty inputs must render like an empty list.
        ([], [], str([])),
        (single, [["foo", "bar"]], "[(= [foo, bar])]"),
        (
            double,
            [["foo", "bar"], ["baz", "boo"]],
            "[(= [foo, bar]), (= [baz, boo])]",
        ),
    )

    for paras_actual, paras_target, expected in cases:
        self.assertEqual(str(para_diff(paras_actual, paras_target)), expected)
def doc_diff(actual, target, junk=None):
    """
    Aligns the given string 'actual' against the given string 'target'.
    Both strings are seen as sequences of paragraphs. Returns a sequence of
    operations that are necessary to transform 'actual' into 'target'.
    The operations under consideration are:

    * Split paragraph.
    * Merge paragraph.
    * Delete paragraph.
    * Rearrange paragraph.
    * Insert word.
    * Delete word.
    * Rearrange word.

    Args:
        actual: The text to transform; paragraphs are denoted by two
            newlines.
        target: The text to transform 'actual' into; same format.
        junk: Optional list that is handed to the paragraph formatter as
            'to_protect' and forwarded to para_diff and to the
            insert/delete/replace/rearrange step. Defaults to an empty list.

    Returns:
        The result of merge(...) applied to the post-processed para_diff
        result.
    """
    # Use a fresh list per call: a literal [] default would be one shared
    # object that any callee could mutate for all later calls.
    if junk is None:
        junk = []

    # Extract the paragraphs from 'actual' and 'target' and format them
    # (remove special characters and transform all letters to lowercase
    # letters).
    actual_paras = util.to_formatted_paragraphs(actual, to_protect=junk)
    target_paras = util.to_formatted_paragraphs(target, to_protect=junk)

    # 'actual_paras' and 'target_paras' are lists of lists of words, where
    # each inner list includes the (normalized) words of a paragraph.
    # example: 'actual_paras' = [['words', 'of', 'first', 'paragraph'], [...]]

    # Run para diff to get the basic paragraph operations to perform to
    # transform 'actual' into 'target'.
    diff_result = para_diff.para_diff(actual_paras, target_paras, junk)

    return merge(apply_insert_delete_replace_rearrange(diff_result, junk))
def doc_diff(actual, target, excludes=None, junk=None):
    """
    Given two texts, 'actual' and 'target', this method outputs a sequence
    of phrases which can be used to determine the operations required to
    transform 'actual' into 'target'.

    A phrase is defined by a sequence of words. The texts are seen as
    sequences of paragraphs (text blocks separated by two or more
    newlines). Phrases live within paragraphs and do *not* exceed paragraph
    boundaries. We distinguish the following phrases:

    * CommonPhrase: A phrase that is common to both texts.
    * ReplacePhrase: A phrase x in 'actual' to replace by a phrase y in
      'target'. Note that x or y could be empty, i.e. a ReplacePhrase could
      indeed represent a phrase to delete from 'actual' or a phrase to
      insert into 'target'.
    * RearrangePhrase: A phrase that is common to both texts but whose
      position in the texts differs.

    On comparing, words are normalized, i.e. punctuation marks and any
    other special characters will be removed and all characters will be
    transformed to lowercase. You can exclude certain words from
    normalization by defining a list 'excludes' of related regular
    expressions that match the words you wish to exclude.

    Let us define the following running example to use throughout the
    whole documentation:

        actual                         target:
        --------------------           --------------------
        The quick, big fox             The big, quick fox
        eats the ice-cold sandwich.
                                       eats the ice-cold sandwich.

    Note that 'actual' consists of one paragraph and 'target' consists of
    two paragraphs.

    Args:
        actual: The text to transform.
        target: The text to transform 'actual' into.
        excludes: Optional list of regular expressions for words to exclude
            from normalization. Defaults to an empty list.
        junk: Optional list that is forwarded unchanged to
            para_diff.para_diff. Defaults to an empty list.

    Returns:
        A dict with the keys "num_paras_actual", "num_paras_target",
        "num_words_actual", "num_words_target" and "phrases" (the result
        of para_diff.para_diff).
    """
    # Use fresh lists per call: literal [] defaults would be shared objects
    # that any callee could mutate for all later calls.
    if excludes is None:
        excludes = []
    if junk is None:
        junk = []

    # TODO: Explain 'junk'.

    # Split 'actual' and 'target' into paragraphs and normalize the words.
    #
    # Result is a list of lists of DocWords, where the inner lists represent
    # the paragraphs (one list for each paragraph) with included words. Each
    # DocWord includes the normalized and the unnormalized version (with
    # trailing whitespaces) of a word:
    #
    # paras_target = [[DocWord("the", "The "), DocWord("big", "big, "), ...],
    #                 [DocWord("eats", "eats "), DocWord("the", "the "), ...]]
    paras_actual = to_normalized_paras(
        actual, to_lower=True, excludes=excludes)
    paras_target = to_normalized_paras(
        target, to_lower=True, excludes=excludes)

    return {
        "num_paras_actual": len(paras_actual),
        "num_paras_target": len(paras_target),
        "num_words_actual": sum(len(para) for para in paras_actual),
        "num_words_target": sum(len(para) for para in paras_target),
        "phrases": para_diff.para_diff(paras_actual, paras_target, junk),
    }
def test_replace_para(self):
    """
    Checks para_diff when 'actual' and 'target' share no words, so that
    every paragraph of 'actual' is rendered as a "(/ [...], [...])" item
    against the target words.

    Both arguments of para_diff are lists of paragraphs, i.e. lists of
    word lists; a paragraph without a counterpart is rendered with an
    empty word list "[]" on the other side.
    """
    cases = (
        # Same number of paragraphs on both sides.
        ([["foo"]], [["bar"]], "[(/ [foo], [bar])]"),
        (
            [["foo", "bar"]],
            [["baz", "boo"]],
            "[(/ [foo, bar], [baz, boo])]",
        ),
        (
            [["foo"], ["bar"]],
            [["baz"], ["boo"]],
            "[(/ [foo], [baz]), (/ [bar], [boo])]",
        ),
        # 'actual' has more words / paragraphs than 'target'.
        ([["foo", "bar"]], [["baz"]], "[(/ [foo, bar], [baz])]"),
        (
            [["foo"], ["bar"]],
            [["baz"]],
            "[(/ [foo], [baz]), (/ [bar], [])]",
        ),
        # 'target' has more words / paragraphs than 'actual'. The single
        # paragraph of 'actual' must be wrapped in the outer paragraph
        # list, like every other input here; a flat ["foo"] would hand
        # para_diff a bare string where a word list is expected.
        ([["foo"]], [["baz", "boo"]], "[(/ [foo], [baz, boo])]"),
        (
            [["foo"]],
            [["baz"], ["boo"]],
            "[(/ [foo], [baz]), (/ [], [boo])]",
        ),
    )

    for paras_actual, paras_target, expected in cases:
        self.assertEqual(str(para_diff(paras_actual, paras_target)), expected)
def test_rearrange_para(self):
    """
    Checks para_diff when the words of 'actual' reappear in 'target' in a
    different order.

    Only the last case is active. The cases listed in the comment below
    are disabled in the source; the reason is not recorded there.
    """
    # Disabled cases (actual paragraphs, target paragraphs, expected):
    #
    #   [["foo"], ["bar"]], [["bar"], ["foo"]]
    #       "[(<> [foo]), (= [bar])]"
    #   [["foo"], ["bar"], ["baz"]], [["baz"], ["bar"], ["foo"]]
    #       "[(<> [foo]), (<> [bar]), (= [baz])]"
    #   [["foo", "bar", "baz"]], [["baz", "bar", "foo"]]
    #       "[(<> [foo]), (<> [bar]), (= [baz])]"
    #   [["foo", "bar", "baz", "boo"]], [["baz", "boo", "foo", "bar"]]
    #       "[(<> [foo, bar]), (= [baz, boo])]"

    # Active case: the last word of the paragraph moves to the front.
    paras_actual = [["foo", "bar", "baz"]]
    paras_target = [["baz", "foo", "bar"]]
    rendered = str(para_diff(paras_actual, paras_target))
    self.assertEqual(rendered, "[(= [foo, bar]), (<> [baz])]")
def doc_diff(
        actual,
        target,
        rearrange_phrases=False,
        min_rearrange_length=3,
        refuse_common_threshold=0,
        junk=None):
    """
    Does doc_diff based on flat lists of DiffWord objects.

    Args:
        actual: The input to transform.
        target: The input to transform 'actual' into.
        rearrange_phrases: Forwarded unchanged to para_diff.para_diff.
        min_rearrange_length: Forwarded unchanged to para_diff.para_diff.
        refuse_common_threshold: Forwarded unchanged to
            para_diff.para_diff.
        junk: Optional list forwarded to para_diff.para_diff and to
            handle_diff_result. Defaults to an empty list.

    Returns:
        The object returned by para_diff.para_diff, after it was passed
        through handle_diff_result.
    """
    # Use a fresh list per call: a literal [] default would be one shared
    # object that any callee could mutate for all later calls.
    if junk is None:
        junk = []

    # Run para_diff on given input.
    diff_result = para_diff.para_diff(
        actual,
        target,
        rearrange_phrases=rearrange_phrases,
        min_rearrange_length=min_rearrange_length,
        refuse_common_threshold=refuse_common_threshold,
        junk=junk)

    # Compute the number of operations. The return value is not used, so
    # handle_diff_result presumably records its findings on 'diff_result'
    # itself -- TODO confirm.
    handle_diff_result(diff_result, junk)

    return diff_result
def assert_equal(self, input1, input2, expected_str):
    """
    Runs para_diff on the two inputs and asserts that the rendered result
    equals 'expected_str'.

    The result is rendered by taking str() of every returned item, joining
    the pieces with ", " and wrapping the whole in square brackets.
    """
    rendered_items = ", ".join(map(str, para_diff(input1, input2)))
    self.assertEqual("[%s]" % rendered_items, expected_str)