def test_scoring_with_properties_filter(self):
    """Collating with a properties_filter changes how token properties count.

    Without a filter the differing "rend" property keeps the two "token"
    readings apart; with the filter they align into one column.
    """
    witness_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "filler1"},
                    {"t": "token"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "token", "rend": "b"},
                    {"t": "filler2"},
                ],
            },
        ]
    }
    # Default collation: "token" vs "token"/"rend":"b" do not align.
    expected_output = """+---+---------+-------+---------+
| A | filler1 | token | -       |
| B | -       | token | filler2 |
+---+---------+-------+---------+"""
    table = collate_pretokenized_json(witness_input)
    self.assertEqual(expected_output, str(table))
    # Filtered collation: the "rend" mismatch is ignored, so they align.
    expected_output = """+---+---------+---------+
| A | filler1 | token   |
| B | token   | filler2 |
+---+---------+---------+"""
    table = collate_pretokenized_json(
        witness_input, properties_filter=self.match_properties)
    self.assertEqual(expected_output, str(table))
def testJSONOutput_empty_cells_in_output(self):
    """A gap in the alignment must appear as null in the JSON output."""
    witness_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    # Witness B has nothing aligned with "black", hence the [null] cell.
    expected_json = (
        '{"table": [[[{"ref": 123, "t": "A"}], '
        '[{"adj": true, "t": "black"}], [{"id": "xyz", "t": "cat"}]], '
        '[[{"t": "A"}], [null], [{"n": "cat", "t": "kitten."}]]], '
        '"witnesses": ["A", "B"]}'
    )
    actual_json = collate_pretokenized_json(witness_input, output="json")
    self.assertEqual(expected_json, actual_json)
def testSegmentationPretokenizedJSON(self):
    """segmentation=True is unsupported for pretokenized JSON input.

    The call must raise UnsupportedError.  The original assigned a lambda
    and passed it to assertRaises (un-idiomatic, flake8 E731); the
    assertRaises context manager expresses the same check directly.
    """
    json_in = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "stripy", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    with self.assertRaises(UnsupportedError):
        collate_pretokenized_json(json_in, segmentation=True)
def testPretokenizedWitness(self):
    """A multi-word token ("mousedog bird") must survive collation intact."""
    witnesses = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                    {"t": "bird", "id": "abc"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "mousedog bird", "adj": False},
                ],
            },
        ]
    }
    result = collate_pretokenized_json(witnesses)
    # Both witnesses align into the same four columns.
    self.assertEqual(len(result.rows[0].to_list()), 4)
    self.assertEqual(len(result.rows[1].to_list()), 4)
    # The second witness should have a token that reads 'mousedog bird'.
    self.assertIn("mousedog bird", result.rows[1].to_list())
def testHTMLOutputVerticalLayoutPretokenizedJSON(self):
    """Vertical plain-text layout: one column per witness, one row per rank.

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    json_in = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    expected_output = """\
+-------+---------+
|   A   |    B    |
+-------+---------+
|   A   |    A    |
+-------+---------+
| black |  white  |
+-------+---------+
|  cat  | kitten. |
+-------+---------+"""
    plain_text_output = str(collate_pretokenized_json(json_in, layout="vertical"))
    self.assertEqual(expected_output, plain_text_output)
def testHTMLOutputPretokenizedJSON(self):
    """Horizontal plain-table output: one row per witness."""
    witness_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    expected_plain_table = """\
+---+---+-------+---------+
| A | A | black | cat     |
| B | A | white | kitten. |
+---+---+-------+---------+"""
    actual_table = str(collate_pretokenized_json(witness_input, output="table"))
    self.assertEqual(expected_plain_table, actual_table)
def testJSONOutputPretokenizedJSON(self):
    """JSON output must carry the full token objects, properties included."""
    witness_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    expected_json = (
        '{"table": [[[{"ref": 123, "t": "A"}], '
        '[{"adj": true, "t": "black"}], [{"id": "xyz", "t": "cat"}]], '
        '[[{"t": "A"}], [{"adj": true, "t": "white"}], '
        '[{"n": "cat", "t": "kitten."}]]], '
        '"witnesses": ["A", "B"]}'
    )
    actual_json = collate_pretokenized_json(witness_input, output="json")
    self.assertEqual(expected_json, actual_json)
def test_near_matching_segmented(self):
    """Near matching combined with segmentation groups tokens into phrases.

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    result = collate_pretokenized_json(self.json_in, near_match=True,
                                       segmentation=True)
    self.assertEqual(
        ["I bought", "this glass, because it matches those dinner plates."],
        result.rows[0].to_list())
    self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list())
def test_near_matching(self):
    """Near matching without segmentation: one token per cell, gaps as None.

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    result = collate_pretokenized_json(self.json_in, near_match=True)
    self.assertEqual(
        ["I", "bought", "this", "glass", ",", "because", "it", "matches",
         "those", "dinner", "plates", "."],
        result.rows[0].to_list())
    # The shorter witness is padded with None where it has no reading.
    self.assertEqual(
        ["I", "bought", "those", "glasses", None, None, None, None, None,
         None, None, "."],
        result.rows[1].to_list())
def testHTMLOutputVerticalLayoutPretokenizedJSON(self):
    """Vertical plain-text layout: one column per witness, one row per rank.

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    json_in = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    expected_output = """\
+-------+---------+
|   A   |    B    |
+-------+---------+
|   A   |    A    |
+-------+---------+
| black |  white  |
+-------+---------+
|  cat  | kitten. |
+-------+---------+"""
    plain_text_output = str(
        collate_pretokenized_json(json_in, layout="vertical"))
    self.assertEqual(expected_output, plain_text_output)
def testPretokenizedWitness(self):
    """Collation must keep the multi-word token "mousedog bird" in one cell."""
    json_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                    {"t": "bird", "id": "abc"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "mousedog bird", "adj": False},
                ],
            },
        ]
    }
    result = collate_pretokenized_json(json_input)
    row_a = result.rows[0].to_list()
    row_b = result.rows[1].to_list()
    # Both witnesses align into the same four columns.
    self.assertEqual(len(row_a), 4)
    self.assertEqual(len(row_b), 4)
    # The second witness should have a token that reads 'mousedog bird'.
    self.assertIn("mousedog bird", row_b)
def testHTMLOutputPretokenizedJSON(self):
    """Horizontal plain-table output: one row per witness."""
    json_input = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    expected_plain_table = """\
+---+---+-------+---------+
| A | A | black | cat     |
| B | A | white | kitten. |
+---+---+-------+---------+"""
    rendered = str(collate_pretokenized_json(json_input, output="table"))
    self.assertEqual(expected_plain_table, rendered)
def testSegmentationPretokenizedJSON(self):
    """segmentation=True is unsupported for pretokenized JSON input.

    The call must raise UnsupportedError.  The original assigned a lambda
    and passed it to assertRaises (un-idiomatic, flake8 E731); the
    assertRaises context manager expresses the same check directly.
    """
    json_in = {
        "witnesses": [
            {
                "id": "A",
                "tokens": [
                    {"t": "A", "ref": 123},
                    {"t": "black", "adj": True},
                    {"t": "cat", "id": "xyz"},
                ],
            },
            {
                "id": "B",
                "tokens": [
                    {"t": "A"},
                    {"t": "white", "adj": True},
                    {"t": "stripy", "adj": True},
                    {"t": "kitten.", "n": "cat"},
                ],
            },
        ]
    }
    with self.assertRaises(UnsupportedError):
        collate_pretokenized_json(json_in, segmentation=True)
def test_near_matching_segmented(self):
    """Near matching combined with segmentation groups tokens into phrases.

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    result = collate_pretokenized_json(self.json_in, near_match=True,
                                       segmentation=True)
    self.assertEqual(
        ["I bought", "this glass, because it matches those dinner plates."],
        result.rows[0].to_list())
    self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list())
def test_near_matching(self):
    """Near matching without segmentation: one token per cell, gaps as "-".

    Fix: assertEquals is a deprecated alias of assertEqual and was removed
    in Python 3.12; use assertEqual.
    """
    result = collate_pretokenized_json(self.json_in, near_match=True)
    self.assertEqual(
        ["I", "bought", "this", "glass", ",", "because", "it", "matches",
         "those", "dinner", "plates", "."],
        result.rows[0].to_list())
    # The shorter witness is padded with "-" where it has no reading.
    self.assertEqual(
        ["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-",
         "-", "."],
        result.rows[1].to_list())