def test_blocks_splitting_token_case(self):
    """A token ('c') that repeats inside one witness must still yield the shared block."""
    collation = Collation()
    collation.add_witness("W1", "a c b c")
    collation.add_witness("W2", "a c b")
    algorithm = DekkerSuffixAlgorithm(collation)
    blocks = algorithm.get_non_overlapping_repeating_blocks()
    # "a c b": tokens 0-2 in W1, 5-7 in the combined token sequence (W2).
    self.assertIn(Block(RangeSet("0-2, 5-7")), blocks)
def test_non_overlapping_blocks_Hermans(self):
    """Hermans case, two witnesses: both shared phrases come back as blocks."""
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    algorithm = DekkerSuffixAlgorithm(collation)
    blocks = algorithm.get_non_overlapping_repeating_blocks()
    prefix_block = Block(RangeSet("0-8, 16-24"))  # a b c d F g h i !
    suffix_block = Block(RangeSet("11-14, 25-28"))  # q r s t
    self.assertIn(prefix_block, blocks)
    self.assertIn(suffix_block, blocks)
def test_non_overlapping_blocks_black_cat(self):
    """Two identical witnesses collapse to exactly one block covering all tokens."""
    collation = Collation()
    collation.add_witness("W1", "the black cat")
    collation.add_witness("W2", "the black cat")
    algorithm = DekkerSuffixAlgorithm(collation)
    blocks = algorithm.get_non_overlapping_repeating_blocks()
    # "the black cat" occupies tokens 0-2 (W1) and 4-6 (W2).
    expected_block = Block(RangeSet("0-2, 4-6"))
    self.assertEqual([expected_block], blocks)
def testPlainWitness(self):
    """A plain-content witness dict gets tokenized when added to a collation."""
    witness_data = {
        'id': 'A',
        'content': 'The quick brown fox jumped over the lazy dogs.',
    }
    collation = Collation()
    collation.add_witness(witness_data)
    # 9 words plus the trailing period -> 10 tokens.
    self.assertEqual(len(collation.witnesses[0].tokens()), 10)
def test_filter_potential_blocks(self):
    """Filtering removes every candidate block for this input, leaving none."""
    collation = Collation()
    collation.add_witness("W1", "a a")
    collation.add_witness("w2", "a")
    extsufarr = collation.to_extended_suffix_array()
    candidates = extsufarr.split_lcp_array_into_intervals()
    algorithm = DekkerSuffixAlgorithm(collation)
    # filter_potential_blocks mutates the candidate list in place.
    algorithm.filter_potential_blocks(candidates)
    self.assertFalse(candidates)
def test_block_witnesses_Hermans_case_two_witnesses(self):
    """Both witnesses' block witnesses report the same two blocks, in order.

    Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    use assertEqual instead.
    """
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    algorithm = DekkerSuffixAlgorithm(collation)
    expected = ["a b c d F g h i !", "q r s t"]
    # Both witnesses share the same block decomposition.
    for witness in collation.witnesses:
        block_witness = algorithm.get_block_witness(witness)
        self.assertEqual(expected, block_witness.debug())
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    """Transposed phrases between two witnesses are each detected as a block."""
    collation = Collation()
    collation.add_witness("W1", "the cat and the dog")
    collation.add_witness("W2", "the dog and the cat")
    algorithm = DekkerSuffixAlgorithm(collation)
    expected_blocks = [
        Block(RangeSet("0-1, 9-10")),  # the cat
        Block(RangeSet("3-4, 6-7")),   # the dog
        Block(RangeSet("2, 8")),       # and
    ]
    self.assertEqual(expected_blocks, algorithm.get_non_overlapping_repeating_blocks())
def test_split_lcp_intervals_into_smaller_intervals(self):
    """Three identical witnesses produce exactly two LCP intervals."""
    collation = Collation()
    for sigil in ("W1", "W2", "W3"):
        collation.add_witness(sigil, "the cat")
    extsufarr = collation.to_extended_suffix_array()
    split_intervals = extsufarr.split_lcp_array_into_intervals()
    self.assertIntervalIn(0, 2, 3, split_intervals)  # the cat
    self.assertIntervalIn(1, 1, 3, split_intervals)  # cat
    self.assertEqual(2, len(split_intervals), "More items: " + str(split_intervals))
def test_blocks_Hermans_case_three_witnesses(self):
    """Hermans case, three witnesses: all four shared blocks are detected."""
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_witness("W3", "a b c d E g h i ! q r s t")
    algorithm = DekkerSuffixAlgorithm(collation)
    blocks = algorithm.get_non_overlapping_repeating_blocks()
    for ranges in ("0-3, 16-19, 30-33",    # a b c d
                   "5-7, 21-23, 35-37",    # g h i
                   "10-14, 24-28, 38-42",  # ! q r s t
                   "4, 20"):               # F (only in W1 and W2)
        self.assertIn(Block(RangeSet(ranges)), blocks)
def test_block_witnesses_Hermans_case(self):
    """Each witness's block witness lists its blocks in reading order.

    Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    use assertEqual instead.
    """
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_witness("W3", "a b c d E g h i ! q r s t")
    algorithm = DekkerSuffixAlgorithm(collation)
    expected_per_witness = [
        ["a b c d", "F", "g h i", "! q r s t"],  # W1
        ["a b c d", "F", "g h i", "! q r s t"],  # W2
        ["a b c d", "g h i", "! q r s t"],       # W3 reads E, not F
    ]
    for witness, expected in zip(collation.witnesses, expected_per_witness):
        block_witness = algorithm.get_block_witness(witness)
        self.assertEqual(expected, block_witness.debug())
def testPretokenizedWitnessAdd(self):
    """A pre-tokenized witness keeps exactly one token per 'tokens' entry."""
    token_list = [
        {"t": "A", "ref": 123},
        {"t": "black and blue", "adj": True},
        {"t": "cat", "id": "xyz"},
        {"t": "bird", "id": "abc"},
    ]
    collation = Collation()
    collation.add_witness({"id": "A", "tokens": token_list})
    self.assertEqual(len(collation.witnesses[0].tokens()), 4)
def test_Hermans_case_variantgraph(self):
    """The variant graph built from blocks links start -> a -> b ... t -> end.

    Fix: assert_ is a deprecated alias (removed in Python 3.12);
    use assertTrue instead.
    """
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    graph = VariantGraph()
    algorithm = DekkerSuffixAlgorithm(collation)
    algorithm.build_variant_graph_from_blocks(graph, collation)
    a = graph.vertexWith("a")
    b = graph.vertexWith("b")
    t = graph.vertexWith("t")
    self.assertTrue(graph.edge_between(graph.start, a))
    self.assertTrue(graph.edge_between(a, b))
    self.assertTrue(graph.edge_between(t, graph.end))
def align_witnesses(self, witnesses):
    """Collate pre-tokenized witnesses, then rebuild the alignment table with
    the original (full) token dicts and hand it to self.transform_alignment.

    Each witness is a dict with an "id" (sigil) and a "tokens" list; a token's
    "n" (normalized form) is preferred over its "t" (text) for matching.
    """
    normalized_witnesses = []
    tokenized_witnesses = []
    for witness in witnesses:
        normalized_tokens = []
        tokenized_witness = []
        sigil = witness["id"]
        for token in witness["tokens"]:
            # Keep the full token dict for re-attachment after collation.
            tokenized_witness.append(token)
            # Collate on the normalized form when present, else the raw text.
            if "n" in token:
                normalized_tokens.append(token["n"])
            else:
                normalized_tokens.append(token["t"])
        normalized_witnesses.append(
            Witness(sigil, " ".join(normalized_tokens)))
        tokenized_witnesses.append(tokenized_witness)
    collation = Collation()
    for normalized_witness in normalized_witnesses:
        # Skip witnesses whose normalized content is empty.
        if normalized_witness.content:
            collation.add_witness(normalized_witness.sigil,
                                  normalized_witness.content)
    # NOTE(review): 'results' is never used or returned below — looks like
    # dead code; confirm before removing.
    results = {"witnesses": [], "table": [[]], "status": []}
    if len(collation.witnesses) > 0:
        at = collate(collation, output="novisualization",
                     segmentation=False)
        # Shadow table holding the original token dicts instead of the
        # normalized strings used during collation.
        tokenized_at = AlignmentTable(collation)
        for row, tokenized_witness in zip(at.rows, tokenized_witnesses):
            new_row = Row(row.header)
            tokenized_at.rows.append(new_row)
            # Walk the witness's tokens in step with its non-gap cells.
            token_counter = 0
            for cell in row.cells:
                if cell != "-":
                    if token_counter <= len(tokenized_witness) - 1:
                        new_row.cells.append(
                            tokenized_witness[token_counter])
                    token_counter += 1
                else:
                    # TODO: should probably be null or None instead, but that would break the rendering at the moment
                    new_row.cells.append({"t": "^"})
        alignment = json.loads(
            display_alignment_table_as_json(tokenized_at))
        self.transform_alignment(alignment)
def testBeckett(self):
    """A token omitted in witness 2 ('clock') shows as a gap in its row.

    Fixes: assertEquals is a deprecated alias (removed in Python 3.12) ->
    assertEqual; removed the dead trailing 'pass' and the commented-out
    unittest boilerplate.
    """
    collation = Collation()
    collation.add_witness("1", "The same clock as when for example Magee once died.")
    collation.add_witness("2", "The same as when for example Magee once died.")
    table = collate(collation, output="novisualization")
    self.assertEqual(["The same", "clock", "as when for example Magee once died."],
                     table.rows[0].to_list())
    self.assertEqual(["The same", "-", "as when for example Magee once died."],
                     table.rows[1].to_list())
    # TODO: further Beckett variants to cover later:
    #   "The same as when for example McKee once died ."
    #   "The same as when among others Darly once died & left him."
    #   "The same as when Darly among others once died and left him."
def testPretokenizedWitnessAdd(self):
    """Adding a pre-tokenized witness preserves its token count (4 entries)."""
    pretokenized = {
        "id": "A",
        "tokens": [
            {"t": "A", "ref": 123},
            {"t": "black and blue", "adj": True},
            {"t": "cat", "id": "xyz"},
            {"t": "bird", "id": "abc"},
        ],
    }
    collation = Collation()
    collation.add_witness(pretokenized)
    self.assertEqual(len(collation.witnesses[0].tokens()), 4)
def testDoubleTransposition1(self):
    """'the cat' and 'black' swap positions between the two witnesses.

    Fixes: assertEquals is a deprecated alias (removed in Python 3.12) ->
    assertEqual; removed the commented-out Java original.
    """
    collation = Collation()
    collation.add_witness("A", "the cat is black")
    collation.add_witness("B", "black is the cat")
    alignment_table = collate(collation, output="novisualization")
    self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
    self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
    # TODO: port the Java doubleTransposition2 ("a b" / "b a") and
    # doubleTransposition3 ("a b c" / "b a c") cases as well.
def test_witness_ranges_hermans_case(self):
    """Each witness gets a contiguous token range in the combined sequence.

    W1 (15 tokens) spans 0-14 and W2 starts at 16 — position 15 is
    presumably a witness-separator token; verify against Collation.

    Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    use assertEqual instead.
    """
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    self.assertEqual(RangeSet("0-14"), collation.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("16-28"), collation.get_range_for_witness("W2"))
def testPlainWitness(self):
    """A plain-content witness is tokenized into 10 tokens on add."""
    content = 'The quick brown fox jumped over the lazy dogs.'
    c = Collation()
    c.add_witness({'id': 'A', 'content': content})
    # Nine words plus the final period.
    self.assertEqual(len(c.witnesses[0].tokens()), 10)
def test_combined_string_hermans_case(self):
    """Witness contents are concatenated with a '$<n>' separator marker.

    Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    use assertEqual instead.
    """
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    # $ is meant to separate witnesses here
    self.assertEqual("a b c d F g h i ! K ! q r s t $1 a b c d F g h i ! q r s t",
                     collation.get_combined_string())