def test_hermans_witness_order_independence_case_two_witnesses(self):
    """Hermans two-witness case: the extra 'K !' in A becomes its own segment."""
    collation = Collation()
    collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
    alignment_table = collate(collation)
    # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
    self.assertEqual(["a b c d F g h i!", "K!", "q r s t"], alignment_table.rows[0].to_list())
    self.assertEqual(["a b c d F g h i!", "-", "q r s t"], alignment_table.rows[1].to_list())
 def testBeckett(self):
     """One-word ('clock') difference between two otherwise identical sentences."""
     collation = Collation()
     collation.add_plain_witness("1", "The same clock as when for example Magee once died.")
     collation.add_plain_witness("2", "The same as when for example Magee once died.")
     table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["The same", "clock", "as when for example Magee once died."], table.rows[0].to_list())
     self.assertEqual(["The same", None, "as when for example Magee once died."], table.rows[1].to_list())
 def testDoubleTransposition1(self):
     """'the cat' and 'black' swap positions around a shared 'is'."""
     collation = Collation()
     collation.add_plain_witness("A", "the cat is black")
     collation.add_plain_witness("B", "black is the cat")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
     self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
Example #4
0
    def testHTMLOutputPretokenizedJSON(self):
        json_in = {
      "witnesses" : [
        {
          "id" : "A",
          "tokens" : [
              { "t" : "A", "ref" : 123 },
              { "t" : "black" , "adj" : True },
              { "t" : "cat", "id" : "xyz" }
          ]
        },
        {
          "id" : "B",
          "tokens" : [
              { "t" : "A" },
              { "t" : "white" , "adj" : True },
              { "t" : "kitten.", "n" : "cat" }
          ]
        }
      ]
    }
        expected_plain_table = """\
+---+---+-------+---------+
| A | A | black | cat     |
| B | A | white | kitten. |
+---+---+-------+---------+"""
        plain_table = str(collate(json_in, output="table"))
        self.assertEqual(expected_plain_table, plain_table)
Example #5
0
    def testJSONOutputPretokenizedJSON(self):
        """JSON output round-trips each witness's original token properties."""
        json_in = {
            "witnesses": [
                {
                    "id": "A",
                    "tokens": [
                        {"t": "A", "ref": 123},
                        {"t": "black", "adj": True},
                        {"t": "cat", "id": "xyz"},
                    ],
                },
                {
                    "id": "B",
                    "tokens": [
                        {"t": "A"},
                        {"t": "white", "adj": True},
                        {"t": "kitten.", "n": "cat"},
                    ],
                },
            ]
        }
        # Expected structure: table[row][column] is a list of token dicts.
        expected_json = {
            "table": [
                [
                    [{"ref": 123, "t": "A"}],
                    [{"adj": True, "t": "black"}],
                    [{"id": "xyz", "t": "cat"}],
                ],
                [
                    [{"t": "A"}],
                    [{"adj": True, "t": "white"}],
                    [{"n": "cat", "t": "kitten."}],
                ],
            ],
            "witnesses": ["A", "B"],
        }
        json_out = collate(json_in, output="json")
        self.assertEqual(expected_json, json.loads(json_out))
Example #6
0
 def testDoubleTransposition1(self):
     """Duplicate of testDoubleTransposition1: transposed 'the cat'/'black' around 'is'."""
     collation = Collation()
     collation.add_plain_witness("A", "the cat is black")
     collation.add_plain_witness("B", "black is the cat")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
     self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
Example #7
0
    def testSegmentationPretokenizedJSON(self):
        """With segmentation=True, adjacent unmatched tokens merge into one cell."""
        json_in = {
            "witnesses": [
                {
                    "id": "A",
                    "tokens": [
                        {"t": "A", "ref": 123},
                        {"t": "black", "adj": True},
                        {"t": "cat", "id": "xyz"},
                    ],
                },
                {
                    "id": "B",
                    "tokens": [
                        {"t": "A"},
                        {"t": "white", "adj": True},
                        {"t": "stripy", "adj": True},
                        {"t": "kitten.", "n": "cat"},
                    ],
                },
            ]
        }
        # "white" and "stripy" share a single segment cell in witness B.
        json_expected = {"table": [[[{"ref": 123, "t": "A"}], [{"adj": True, "t": "black"}], [{"id": "xyz", "t": "cat"}]], [[{"t": "A"}], [{"adj": True, "t": "white"}, {"adj": True, "t": "stripy"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}
        json_out = collate(json_in, output="json", segmentation=True)
        # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
        self.assertEqual(json_expected, json.loads(json_out))
Example #8
0
    def test_near_matching_accidentally_incorrect_long(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness(
            "B", "The brown fox jumps over there that dog.")
        alignment_table = str(
            collate(collation,
                    near_match=True,
                    segmentation=False,
                    scheduler=scheduler))
        self.assertTask("build column for rank", ["this", "6"], scheduler[0])
        self.assertTask("build column for rank", ["this", "7"], scheduler[1])
        self.assertTask("move node from prior rank to rank with best match",
                        ["this", "6", "7"], scheduler[2])
        self.assertTask("build column for rank", ["over", "5"], scheduler[3])
        self.assertTask("build column for rank", ["over", "6"], scheduler[4])
        self.assertEquals(5, len(scheduler))
        expected = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | -     | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
        self.assertEquals(expected, alignment_table)
Example #9
0
    def testHTMLOutputVerticalLayoutPretokenizedJSON(self):
        json_in = {
      "witnesses" : [
        {
          "id" : "A",
          "tokens" : [
              { "t" : "A", "ref" : 123 },
              { "t" : "black" , "adj" : True },
              { "t" : "cat", "id" : "xyz" }
          ]
        },
        {
          "id" : "B",
          "tokens" : [
              { "t" : "A" },
              { "t" : "white" , "adj" : True },
              { "t" : "kitten.", "n" : "cat" }
          ]
        }
      ]
    }
        expected_output = """\
+-------+---------+
|   A   |    B    |
+-------+---------+
|   A   |    A    |
+-------+---------+
| black |  white  |
+-------+---------+
|  cat  | kitten. |
+-------+---------+"""
        plain_text_output = str(collate(json_in, layout="vertical"))
        self.assertEquals(expected_output, plain_text_output)
Example #10
0
    def testJSONOutputPretokenizedJSON(self):
        """JSON output includes internal bookkeeping fields (_sigil, _token_array_position).

        Removed the stale commented-out expectation that predated those fields.
        """
        json_in = {
            "witnesses": [
                {
                    "id": "A",
                    "tokens": [
                        {"t": "A", "ref": 123},
                        {"t": "black", "adj": True},
                        {"t": "cat", "id": "xyz"},
                    ],
                },
                {
                    "id": "B",
                    "tokens": [
                        {"t": "A"},
                        {"t": "white", "adj": True},
                        {"t": "kitten.", "n": "cat"},
                    ],
                },
            ]
        }
        # Note the jump from position 2 to 4: positions are indices into the global
        # token array, which separates witnesses — TODO confirm against collatex docs.
        expected_json = {
            "table": [
                [
                    [{"ref": 123, "_sigil": "A", "t": "A", "_token_array_position": 0}],
                    [{"adj": True, "_sigil": "A", "t": "black", "_token_array_position": 1}],
                    [{"id": "xyz", "_sigil": "A", "t": "cat", "_token_array_position": 2}],
                ],
                [
                    [{"_sigil": "B", "t": "A", "_token_array_position": 4}],
                    [{"adj": True, "_sigil": "B", "t": "white", "_token_array_position": 5}],
                    [{"n": "cat", "_sigil": "B", "t": "kitten.", "_token_array_position": 6}],
                ],
            ],
            "witnesses": ["A", "B"],
        }
        json_out = collate(json_in, output="json")
        self.assertEqual(expected_json, json.loads(json_out))
Example #11
0
    def testHTMLOutputPretokenizedJSON(self):
        """Collate pretokenized JSON input and check the ASCII table rendering.

        NOTE(review): the name says HTML but output="table" yields plain text;
        also duplicates an earlier test of the same name in this file.
        """
        json_in = {
      "witnesses" : [
        {
          "id" : "A",
          "tokens" : [
              { "t" : "A", "ref" : 123 },
              { "t" : "black" , "adj" : True },
              { "t" : "cat", "id" : "xyz" }
          ]
        },
        {
          "id" : "B",
          "tokens" : [
              { "t" : "A" },
              { "t" : "white" , "adj" : True },
              { "t" : "kitten.", "n" : "cat" }
          ]
        }
      ]
    }
        # One row per witness; the 't' property supplies the visible cell text.
        expected_plain_table = """\
+---+---+-------+---------+
| A | A | black | cat     |
| B | A | white | kitten. |
+---+---+-------+---------+"""
        plain_table = str(collate(json_in, output="table"))
        self.assertEqual(expected_plain_table, plain_table)
Example #12
0
    def testHTMLOutputVerticalLayoutPretokenizedJSON(self):
        """Render pretokenized JSON witnesses in vertical layout (one column per witness).

        NOTE(review): duplicates an earlier test of the same name in this file.
        """
        json_in = {
      "witnesses" : [
        {
          "id" : "A",
          "tokens" : [
              { "t" : "A", "ref" : 123 },
              { "t" : "black" , "adj" : True },
              { "t" : "cat", "id" : "xyz" }
          ]
        },
        {
          "id" : "B",
          "tokens" : [
              { "t" : "A" },
              { "t" : "white" , "adj" : True },
              { "t" : "kitten.", "n" : "cat" }
          ]
        }
      ]
    }
        # Header row carries the witness sigla; each following row is one aligned rank.
        expected_output = """\
+-------+---------+
|   A   |    B    |
+-------+---------+
|   A   |    A    |
+-------+---------+
| black |  white  |
+-------+---------+
|  cat  | kitten. |
+-------+---------+"""
        plain_text_output = str(collate(json_in, layout="vertical"))
        self.assertEqual(expected_output, plain_text_output)
 def testPretokenizedWitness(self):
     """Pretokenized input keeps its multi-word token and aligns into four cells."""
     tokens_a = [
         {"t": "A", "ref": 123},
         {"t": "black", "adj": True},
         {"t": "cat", "id": "xyz"},
         {"t": "bird", "id": "abc"},
     ]
     tokens_b = [
         {"t": "A"},
         {"t": "white", "adj": True},
         {"t": "mousedog bird", "adj": False},
     ]
     pretokenized_witness = {
         "witnesses": [
             {"id": "A", "tokens": tokens_a},
             {"id": "B", "tokens": tokens_b},
         ]
     }
     result = collate(pretokenized_witness, segmentation=False)
     # Both witness rows align into four cells.
     for row in (result.rows[0], result.rows[1]):
         self.assertEqual(4, len(row.to_list()))
     # The second witness should have a token that reads 'mousedog bird'.
     self.assertIn("mousedog bird", str(result.rows[1].to_list()))
 def test_exact_matching(self):
     """Exact matching splits shared and divergent segments between two witnesses."""
     collation = Collation()
     witness_texts = {
         "A": "I bought this glass , because it matches those dinner plates",
         "B": "I bought those glasses",
     }
     for sigil, text in witness_texts.items():
         collation.add_plain_witness(sigil, text)
     alignment_table = collate(collation)
     # Cells keep trailing whitespace; a witness gap is rendered as None.
     row_a = alignment_table.rows[0].to_list_of_strings()
     row_b = alignment_table.rows[1].to_list_of_strings()
     self.assertEqual(["I bought ", "this glass , because it matches ", "those ", "dinner plates"], row_a)
     self.assertEqual(["I bought ", None, "those ", "glasses"], row_b)
 def test_near_matching(self):
     """With default segmentation, near matching merges the divergent tail into one cell."""
     collation = Collation()
     collation.add_plain_witness("A", "I bought this glass, because it matches those dinner plates")
     collation.add_plain_witness("B", "I bought those glasses")
     alignment_table = collate(collation, near_match=True)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["I bought", "this glass, because it matches those dinner plates"],
                      alignment_table.rows[0].to_list())
     self.assertEqual(["I bought", "those glasses"], alignment_table.rows[1].to_list())
Example #16
0
 def test_exact_matching(self):
     """Duplicate exact-matching check, phrased as a row-by-row comparison."""
     collation = Collation()
     collation.add_plain_witness("A", "I bought this glass , because it matches those dinner plates")
     collation.add_plain_witness("B", "I bought those glasses")
     rows = collate(collation).rows
     expected_rows = [
         ["I bought ", "this glass , because it matches ", "those ", "dinner plates"],
         ["I bought ", None, "those ", "glasses"],
     ]
     for expected, row in zip(expected_rows, rows):
         self.assertEqual(expected, row.to_list_of_strings())
Example #17
0
 def test_hermans_witness_order_independence_case_two_witnesses(self):
     """Hermans two-witness case via to_list_of_strings (whitespace-preserving cells)."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["a b c d F g h i ! ", "K ! ", "q r s t"],
                      alignment_table.rows[0].to_list_of_strings())
     self.assertEqual(["a b c d F g h i ! ", None, "q r s t"],
                      alignment_table.rows[1].to_list_of_strings())
 def test_witness_order(self):
     """'a' and 'b' get separate columns; only C attests both."""
     collation = Collation()
     collation.add_plain_witness("A", "x a y")
     collation.add_plain_witness("B", "x b y")
     collation.add_plain_witness("C", "x a b y")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["x", "a", "-", "y"], alignment_table.rows[0].to_list())
     self.assertEqual(["x", "-", "b", "y"], alignment_table.rows[1].to_list())
     self.assertEqual(["x", "a", "b", "y"], alignment_table.rows[2].to_list())
 def test_hermans_witness_order_independence_case_three_witnesses(self):
     """Hermans three-witness case: E/F variation plus the extra '! K' segment in A."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("C", "a b c d E g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["a b c d ", "F ", "g h i ", "! K ", "! q r s t"], alignment_table.rows[0].to_list_of_strings())
     self.assertEqual(["a b c d ", "F ", "g h i ", None, "! q r s t"], alignment_table.rows[1].to_list_of_strings())
     self.assertEqual(["a b c d ", "E ", "g h i ", None, "! q r s t"], alignment_table.rows[2].to_list_of_strings())
 def testThisMorningExample(self):
     """Smoke test for transposition detection ('this morning' changes position)."""
     collation = Collation()
     collation.add_plain_witness(
         "A", "This morning the cat observed little birds in the trees.")
     collation.add_plain_witness(
         "B",
         "The cat was observing birds in the little trees this morning, it observed birds for two hours."
     )
     # NOTE(review): no assertions — this only verifies that collate() with
     # detect_transpositions=True runs without raising. TODO: assert on the result.
     alignment_table = collate(collation, detect_transpositions=True)
Example #21
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEqual(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)

        def test_near_matching_clash(self):
            # If the previous rank has a vertex with more than one witness, where at least
            # one witness is a candidate for being moved, don't move it if any of the
            # witnesses has a node at the new rank.
            #
            # If there were only A and B, we'd move cce away from bbb to align with cce.
            # Witness C should prevent this.
            self.maxDiff = None
            collation = Collation()
            collation.add_plain_witness("A", "aaa bbb ccc ddd")
            collation.add_plain_witness("B", "aaa cce ddd")
            collation.add_plain_witness("C", "aaa cce ccc ddd")
            alignment_table = str(collate(collation, near_match=True, segmentation=False))
            expected = """\
    +---+-----+-----+-----+-----+
    | A | aaa | bbb | ccc | ddd |
    | B | aaa | cce | -   | ddd |
    | C | aaa | cce | ccc | ddd |
    +---+-----+-----+-----+-----+"""
            self.assertEqual(expected, alignment_table)
Example #22
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEquals(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)

        def test_near_matching_clash(self):
            # If the previous rank has a vertex with more than one witness, where at least
            # one witness is a candidate for being moved, don't move it if any of the
            # witnesses has a node at the new rank.
            #
            # If there were only A and B, we'd move cce away from bbb to align with cce.
            # Witness C should prevent this.
            self.maxDiff = None
            collation = Collation()
            collation.add_plain_witness("A", "aaa bbb ccc ddd")
            collation.add_plain_witness("B", "aaa cce ddd")
            collation.add_plain_witness("C", "aaa cce ccc ddd")
            alignment_table = str(collate(collation, near_match=True, segmentation=False))
            expected = """\
    +---+-----+-----+-----+-----+
    | A | aaa | bbb | ccc | ddd |
    | B | aaa | cce | -   | ddd |
    | C | aaa | cce | ccc | ddd |
    +---+-----+-----+-----+-----+"""
            self.assertEqual(expected, alignment_table)
    def test_near_matching_accidentally_correct_long(self):
        """Near matching keeps 'this' aligned with 'that' in a long sentence."""
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness("B", "The brown fox jumps over that there dog.")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        # 'this' pairs with 'that'; the gap in A falls under B's extra 'there'.
        expected = """\
+---+-----+-------+-----+-------+------+------+-------+-----+---+
| A | The | brown | fox | jumps | over | this | -     | dog | . |
| B | The | brown | fox | jumps | over | that | there | dog | . |
+---+-----+-------+-----+-------+------+------+-------+-----+---+"""
        self.assertEqual(expected, alignment_table)
Example #24
0
 def test_near_matching(self):
     """Duplicate near-matching segmentation test (reflowed source)."""
     collation = Collation()
     collation.add_plain_witness(
         "A", "I bought this glass, because it matches those dinner plates")
     collation.add_plain_witness("B", "I bought those glasses")
     alignment_table = collate(collation, near_match=True)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(
         ["I bought", "this glass, because it matches those dinner plates"],
         alignment_table.rows[0].to_list())
     self.assertEqual(["I bought", "those glasses"],
                      alignment_table.rows[1].to_list())
    def test_near_matching_accidentally_incorrect_short(self):
        """Near matching aligns 'this' with 'that' (not 'there') in the short case."""
        collation = Collation()
        collation.add_plain_witness("A", "over this dog")
        collation.add_plain_witness("B", "over there that dog")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        # The gap in A falls under B's extra 'there'.
        expected = """\
+---+------+-------+------+-----+
| A | over | -     | this | dog |
| B | over | there | that | dog |
+---+------+-------+------+-----+"""
        self.assertEqual(expected, alignment_table)
 def test_storage_of_tokens_on_variant_graph(self):
     """Variant-graph vertices carry a per-witness 'tokens' attribute."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "a d c")
     variant_graph = collate(collation, output="graph")
     # Vertices 0 and 1 hold no tokens — presumably the start/end sentinel
     # vertices of the graph; TODO confirm against the collatex graph model.
     self.assertEqual("{}", str(variant_graph.vertex_attributes(0)["tokens"]))
     self.assertEqual("{}", str(variant_graph.vertex_attributes(1)["tokens"]))
     # TODO: testing node 2 is difficult because of random order of tokens
     self.assertEqual("{'A': b}", str(variant_graph.vertex_attributes(3)["tokens"]))
     # TODO: testing node 4 is difficult because of random order of tokens
     self.assertEqual("{'B': d}", str(variant_graph.vertex_attributes(5)["tokens"]))
Example #27
0
    def test_near_matching_accidentally_incorrect_short(self):
        """Duplicate of the short near-matching case: 'this' aligns with 'that'."""
        collation = Collation()
        collation.add_plain_witness("A", "over this dog")
        collation.add_plain_witness("B", "over there that dog")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        # The gap in A falls under B's extra 'there'.
        expected = """\
+---+------+-------+------+-----+
| A | over | -     | this | dog |
| B | over | there | that | dog |
+---+------+-------+------+-----+"""
        self.assertEqual(expected, alignment_table)
Example #28
0
    def test_near_matching_accidentally_correct_long(self):
        """Duplicate of the long near-matching case: 'this' stays paired with 'that'."""
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness("B", "The brown fox jumps over that there dog.")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        # The gap in A falls under B's extra 'there'.
        expected = """\
+---+-----+-------+-----+-------+------+------+-------+-----+---+
| A | The | brown | fox | jumps | over | this | -     | dog | . |
| B | The | brown | fox | jumps | over | that | there | dog | . |
+---+-----+-------+-----+-------+------+------+-------+-----+---+"""
        self.assertEqual(expected, alignment_table)
Example #29
0
    def test_near_matching_rank_0(self):
        """Regression: 'this' near-matches 'thin' at the final rank, leaving the gap first."""
        # find_prior_node() should check ranks back through 0, not 1
        collation = Collation()
        collation.add_plain_witness("A", "this")
        collation.add_plain_witness("B", "there thin")
        output = str(collate(collation, near_match=True, segmentation=False))
        expected = """\
+---+-------+------+
| A | -     | this |
| B | there | thin |
+---+-------+------+"""
        self.assertEqual(expected, output)
Example #30
0
 def test_hermans_witness_order_independence_case_three_witnesses(self):
     """Three-witness Hermans case via to_list (segment cells, no trailing spaces)."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("C", "a b c d E g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["a b c d", "F", "g h i", "! K", "! q r s t"],
                      alignment_table.rows[0].to_list())
     self.assertEqual(["a b c d", "F", "g h i", None, "! q r s t"],
                      alignment_table.rows[1].to_list())
     self.assertEqual(["a b c d", "E", "g h i", None, "! q r s t"],
                      alignment_table.rows[2].to_list())
Example #31
0
 def test_witness_order(self):
     """Witness-order test via to_list_of_strings; gaps are None here, not '-'."""
     collation = Collation()
     collation.add_plain_witness("A", "x a y")
     collation.add_plain_witness("B", "x b y")
     collation.add_plain_witness("C", "x a b y")
     alignment_table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(["x ", "a ", None, "y"],
                      alignment_table.rows[0].to_list_of_strings())
     self.assertEqual(["x ", None, "b ", "y"],
                      alignment_table.rows[1].to_list_of_strings())
     self.assertEqual(["x ", "a ", "b ", "y"],
                      alignment_table.rows[2].to_list_of_strings())
Example #32
0
    def test_near_matching_rank_0(self):
        """Duplicate regression test: prior-node search must reach rank 0."""
        # find_prior_node() should check ranks back through 0, not 1
        collation = Collation()
        collation.add_plain_witness("A", "this")
        collation.add_plain_witness("B", "there thin")
        output = str(collate(collation, near_match=True, segmentation=False))
        expected = """\
+---+-------+------+
| A | -     | this |
| B | there | thin |
+---+-------+------+"""
        self.assertEqual(expected, output)
Example #33
0
 def testBeckett(self):
     """Duplicate Beckett test (reflowed source): single-word 'clock' difference."""
     collation = Collation()
     collation.add_plain_witness(
         "1", "The same clock as when for example Magee once died.")
     collation.add_plain_witness(
         "2", "The same as when for example Magee once died.")
     table = collate(collation)
     # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12).
     self.assertEqual(
         ["The same", "clock", "as when for example Magee once died."],
         table.rows[0].to_list())
     self.assertEqual(
         ["The same", None, "as when for example Magee once died."],
         table.rows[1].to_list())
Example #34
0
def text_reducer(data_in, **kwargs):
    '''Reduce a list of text into an alignment table
    Parameters
    ----------
    data : list
        A list of strings to be aligned

    Returns
    -------
    reduction : dict
        A dictionary with the following keys:

        *   `aligned_text`: A list of lists containing the aligned text.
            There is one list for each identified word, and each of those lists contains
            one item for each user that entered text. If the user did not transcribe
            a word an empty string is used.
        *   `number_views`: Number of volunteers who entered non-blank text
        *   `consensus_score`: The average number of users whose text agreed.
            Note, if `consensus_score` is the same as `number_views` every user agreed with each other
    '''
    reduction = {}
    if data_in:  # truthiness check replaces len(data_in) > 0
        user_ids_input = kwargs.pop('user_id')
        idx, data, gold_standard = zip(*data_in)
        # map the extract indices back to the users who produced them
        user_ids = [user_ids_input[i] for i in idx]
        witness_keys = []
        collation = col.Collation()
        for index, text in enumerate(data):
            key = str(index)
            witness_keys.append(key)
            collation.add_plain_witness(key, text)
        alignment_table = col.collate(collation,
                                      near_match=True,
                                      segmentation=False)
        # One inner list per aligned word; '' for witnesses missing that word.
        aligned_text = [
            [str(cols.tokens_per_witness.get(key, [''])[0])
             for key in witness_keys]
            for cols in alignment_table.columns
        ]
        consensus_score_value, consensus_text = consensus_score(aligned_text)
        reduction = {
            'aligned_text': aligned_text,
            'number_views': len(data),
            'consensus_score': consensus_score_value,
            'consensus_text': consensus_text,
            'gold_standard': list(gold_standard),
            'user_ids': user_ids
        }
    return reduction
Example #35
0
 def testDoubleTransposition2(self):
     """Either the 'a' can align or the 'b' can. See also #3 below."""
     collation = Collation()
     collation.add_plain_witness("A", "a b")
     collation.add_plain_witness("B", "b a")
     alignment_table = collate(collation)
     witness_a_list = alignment_table.rows[0].to_list()
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertEqual(len(witness_a_list), 3)
     witness_b_list = alignment_table.rows[1].to_list()
     self.assertEqual(len(witness_b_list), 3)
     # exactly one column holds the same token in both witnesses
     matching_tokens = [witness_a_list[idx] for idx in range(3)
                        if witness_a_list[idx] == witness_b_list[idx]]
     self.assertEqual(len(matching_tokens), 1)
 def testDoubleTransposition2(self):
     """Either the 'a' can align or the 'b' can. See also #3 below."""
     collation = Collation()
     collation.add_plain_witness("A", "a b")
     collation.add_plain_witness("B", "b a")
     alignment_table = collate(collation)
     witness_a_list = alignment_table.rows[0].to_list()
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertEqual(len(witness_a_list), 3)
     witness_b_list = alignment_table.rows[1].to_list()
     self.assertEqual(len(witness_b_list), 3)
     # exactly one column holds the same token in both witnesses
     matching_tokens = [a for a, b in zip(witness_a_list, witness_b_list)
                        if a == b]
     self.assertEqual(len(matching_tokens), 1)
 def test_storage_of_tokens_on_variant_graph(self):
     """Graph vertices record, per witness sigil, the tokens placed on them."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "a d c")
     variant_graph = collate(collation, output="graph")
     self.assertEqual("{}", str(variant_graph.start.tokens))
     self.assertEqual("{}", str(variant_graph.end.tokens))
     # 'b' and 'd' are in one witness each
     self.assertEqual("{'A': [b]}", str(variant_graph.vertexWith('b').tokens))
     self.assertEqual("{'B': [d]}", str(variant_graph.vertexWith('d').tokens))
     # 'a' and 'c' are in both witnesses
     for shared in ('a', 'c'):
         vertex_tokens = variant_graph.vertexWith(shared).tokens
         for sigil in ('A', 'B'):
             self.assertEqual("[%s]" % shared, str(vertex_tokens[sigil]))
 def test_storage_of_tokens_on_variant_graph(self):
     """Start/end vertices carry no tokens; variant vertices carry one witness each."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "a d c")
     variant_graph = collate(collation, output="graph")
     attrs = variant_graph.vertex_attributes
     self.assertEqual("{}", str(attrs(0)["tokens"]))
     self.assertEqual("{}", str(attrs(1)["tokens"]))
     # TODO: testing node 2 is difficult because of random order of tokens
     self.assertEqual("{'A': b}", str(attrs(3)["tokens"]))
     # TODO: testing node 4 is difficult because of random order of tokens
     self.assertEqual("{'B': d}", str(attrs(5)["tokens"]))
    def align_witnesses(self, witnesses):
        """Collate pre-tokenized witnesses and re-attach the original tokens.

        Each witness is a dict with an "id" (sigil) and a list of "tokens";
        a token's normalized form is its "n" key when present, else "t".
        The witnesses are collated on their normalized text, then the rows of
        the resulting table are rebuilt with the original token dicts so no
        annotation carried by the tokens is lost. The JSON rendering of that
        rebuilt table is finally passed to self.transform_alignment().
        """
        normalized_witnesses = []
        tokenized_witnesses = []
        for witness in witnesses:
            normalized_tokens = []
            tokenized_witness = []
            sigil = witness["id"]
            for token in witness["tokens"]:
                tokenized_witness.append(token)
                # prefer the explicit normalization "n" over the surface "t"
                if "n" in token:
                    normalized_tokens.append(token["n"])
                else:
                    normalized_tokens.append(token["t"])
            normalized_witnesses.append(
                Witness(sigil, " ".join(normalized_tokens)))
            tokenized_witnesses.append(tokenized_witness)
        collation = Collation()
        for normalized_witness in normalized_witnesses:
            # empty witnesses would break collation, so skip them
            if normalized_witness.content:
                collation.add_witness(normalized_witness.sigil,
                                      normalized_witness.content)

        # NOTE(review): results is built but never used in this method --
        # possibly dead code or used via side channel; confirm
        results = {"witnesses": [], "table": [[]], "status": []}

        if len(collation.witnesses) > 0:
            at = collate(collation,
                         output="novisualization",
                         segmentation=False)
            tokenized_at = AlignmentTable(collation)
            # rows and tokenized_witnesses are in the same witness order,
            # so zip pairs each alignment row with its original token list
            for row, tokenized_witness in zip(at.rows, tokenized_witnesses):

                new_row = Row(row.header)
                tokenized_at.rows.append(new_row)
                token_counter = 0

                for cell in row.cells:
                    if cell != "-":
                        # consume the next original token for a filled cell
                        if token_counter <= len(tokenized_witness) - 1:
                            new_row.cells.append(
                                tokenized_witness[token_counter])
                            token_counter += 1
                    else:
                        # TODO: should probably be null or None instead, but that would break the rendering at the moment
                        new_row.cells.append({"t": "^"})

            alignment = json.loads(
                display_alignment_table_as_json(tokenized_at))
            self.transform_alignment(alignment)
Example #40
0
 def test_storage_of_tokens_on_variant_graph(self):
     """Each graph vertex records which witnesses contributed its token."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "a d c")
     variant_graph = collate(collation, output="graph")
     self.assertEqual("{}", str(variant_graph.start.tokens))
     self.assertEqual("{}", str(variant_graph.end.tokens))
     # 'b' and 'd' are in one witness each
     expectations = {'b': "{'A': [b]}", 'd': "{'B': [d]}"}
     for token, expected in expectations.items():
         self.assertEqual(expected,
                          str(variant_graph.vertexWith(token).tokens))
     # 'a' and 'c' are in both witnesses
     for token in ('a', 'c'):
         tokens = variant_graph.vertexWith(token).tokens
         self.assertEqual("[" + token + "]", str(tokens['A']))
         self.assertEqual("[" + token + "]", str(tokens['B']))
Example #41
0
 def testJSONOutput_empty_cells_in_output(self):
     """A token missing from witness B shows up as None in the JSON table."""
     json_in = {
         "witnesses": [
             {"id": "A",
              "tokens": [{"t": "A", "ref": 123},
                         {"t": "black", "adj": True},
                         {"t": "cat", "id": "xyz"}]},
             {"id": "B",
              "tokens": [{"t": "A"},
                         {"t": "kitten.", "n": "cat"}]},
         ]
     }
     expected_json = {
         "table": [
             [[{"ref": 123, "t": "A"}],
              [{"adj": True, "t": "black"}],
              [{"id": "xyz", "t": "cat"}]],
             [[{"t": "A"}],
              None,
              [{"n": "cat", "t": "kitten."}]],
         ],
         "witnesses": ["A", "B"],
     }
     json_out = collate(json_in, output="json")
     self.assertEqual(expected_json, json.loads(json_out))
 def testDoubleTransposition3(self):
     # Tricky. Aligning a and c can work; so can aligning b and c. Both
     # are equally valid, and both can crop up.
     # Let's test that each row has four values, and that two of the
     # columns have identical values, and that c is one of those columns.
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "b a c")
     alignment_table = collate(collation)
     witness_a_list = alignment_table.rows[0].to_list()
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertEqual(len(witness_a_list), 4)
     witness_b_list = alignment_table.rows[1].to_list()
     self.assertEqual(len(witness_b_list), 4)
     # collect columns where both witnesses hold the same token
     matching_tokens = [a for a, b in zip(witness_a_list, witness_b_list)
                        if a == b]
     self.assertEqual(len(matching_tokens), 2)
     self.assertIn("c", matching_tokens)
Example #43
0
 def testDoubleTransposition3(self):
     # Tricky. Aligning a and c can work; so can aligning b and c. Both
     # are equally valid, and both can crop up.
     # Let's test that each row has four values, and that two of the
     # columns have identical values, and that c is one of those columns.
     collation = Collation()
     collation.add_plain_witness("A", "a b c")
     collation.add_plain_witness("B", "b a c")
     alignment_table = collate(collation)
     witness_a_list = alignment_table.rows[0].to_list()
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertEqual(len(witness_a_list), 4)
     witness_b_list = alignment_table.rows[1].to_list()
     self.assertEqual(len(witness_b_list), 4)
     # collect columns where both witnesses hold the same token
     matching_tokens = [witness_a_list[idx] for idx in range(4)
                        if witness_a_list[idx] == witness_b_list[idx]]
     self.assertEqual(len(matching_tokens), 2)
     self.assertIn("c", matching_tokens)
Example #44
0
    def test_near_matching_accidentally_incorrect_long(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness("B", "The brown fox jumps over there that dog.")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        self.assertTask("build column for rank", ["this", "6"], scheduler[0])
        self.assertTask("build column for rank", ["this", "7"], scheduler[1])
        self.assertTask("move node from prior rank to rank with best match", ["this", "6", "7"], scheduler[2])
        self.assertTask("build column for rank", ["over", "5"], scheduler[3])
        self.assertTask("build column for rank", ["over", "6"], scheduler[4])
        self.assertEquals(5, len(scheduler))
        expected = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | -     | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
        self.assertEquals(expected, alignment_table)
Example #45
0
    def test_near_matching_nonclash(self):
        # If the previous rank has a vertex with more than one witness, where at least
        # one witness is a candidate for being moved, don't move it if any of the
        # witnesses has a node at the new rank.
        #
        # If there were only A and B, we'd move cce away from bbb to align with cce.
        # Witness C should prevent this.
        self.maxDiff = None
        collation = Collation()
        for siglum, text in (("A", "aaa bbb ccc ddd"),
                             ("B", "aaa cce ddd"),
                             ("C", "aaa cce ddd")):
            collation.add_plain_witness(siglum, text)
        actual = str(collate(collation, near_match=True, segmentation=False))
        expected = "\n".join((
            "+---+-----+-----+-----+-----+",
            "| A | aaa | bbb | ccc | ddd |",
            "| B | aaa | -   | cce | ddd |",
            "| C | aaa | -   | cce | ddd |",
            "+---+-----+-----+-----+-----+",
        ))
        self.assertEqual(expected, actual)
    def test_near_matching_nonclash(self):
        # If the previous rank has a vertex with more than one witness, where at least
        # one witness is a candidate for being moved, don't move it if any of the
        # witnesses has a node at the new rank.
        #
        # If there were only A and B, we'd move cce away from bbb to align with cce.
        # Witness C should prevent this.
        self.maxDiff = None
        witnesses = {"A": "aaa bbb ccc ddd",
                     "B": "aaa cce ddd",
                     "C": "aaa cce ddd"}
        collation = Collation()
        for siglum in ("A", "B", "C"):
            collation.add_plain_witness(siglum, witnesses[siglum])
        actual = str(collate(collation, near_match=True, segmentation=False))
        expected = ("+---+-----+-----+-----+-----+\n"
                    "| A | aaa | bbb | ccc | ddd |\n"
                    "| B | aaa | -   | cce | ddd |\n"
                    "| C | aaa | -   | cce | ddd |\n"
                    "+---+-----+-----+-----+-----+")
        self.assertEqual(expected, actual)
Example #47
0
def frd_collate(col_obj):
    """Collate each chunk of *col_obj*'s files and persist the results.

    For every ``chunk_nr`` group a collation is built from the rows'
    ``id``/``text`` columns, rendered as HTML and TEI, and stored on a
    ``FrdCollationSample`` keyed by ``<hashes>__<counter>``.

    Returns the ``col_obj`` passed in, unchanged.
    """
    df = chunks_to_df(col_obj.files())
    # enumerate replaces the hand-maintained counter; groupby yields
    # (group_key, group_frame) pairs, so unpack directly
    for counter, (_, cur_df) in enumerate(df.groupby('chunk_nr'), start=1):
        col_sample_id = f"{col_obj.hashes()}__{counter:03}"
        collation = collatex.Collation()
        for _, row in cur_df.iterrows():
            print(row['id'])
            collation.add_plain_witness(row['id'], row['text'])
        table = collatex.collate(collation)
        col_sample, _ = FrdCollationSample.objects.get_or_create(
            title_slug=col_sample_id, parent_col=col_obj)
        col_sample.data_html = visualize_table_vertically_with_colors(
            table, collation)
        col_sample.data_tei = export_alignment_table_as_tei(table, collation)
        col_sample.save()

    return col_obj
Example #48
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(
            collate(collation,
                    near_match=True,
                    segmentation=False,
                    scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match",
                        ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEquals(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)
Example #49
0
 def testPretokenizedWitness(self):
     """Pretokenized input keeps multi-word tokens intact in the output."""
     pretokenized_witness = {
         "witnesses": [
             {"id": "A",
              "tokens": [{"t": "A", "ref": 123},
                         {"t": "black", "adj": True},
                         {"t": "cat", "id": "xyz"},
                         {"t": "bird", "id": "abc"}]},
             {"id": "B",
              "tokens": [{"t": "A"},
                         {"t": "white", "adj": True},
                         {"t": "mousedog bird", "adj": False}]},
         ]
     }
     result = collate(pretokenized_witness, segmentation=False)
     for row_index in (0, 1):
         self.assertEqual(len(result.rows[row_index].to_list()), 4)
     # The second witness should have a token that reads 'mousedog bird'.
     self.assertIn("mousedog bird", str(result.rows[1].to_list()))
 def testJSONOutput_empty_cells_in_output(self):
     """Gaps appear as None; tokens carry _sigil and _token_array_position."""
     json_in = {
         "witnesses": [
             {"id": "A",
              "tokens": [{"t": "A", "ref": 123},
                         {"t": "black", "adj": True},
                         {"t": "cat", "id": "xyz"}]},
             {"id": "B",
              "tokens": [{"t": "A"},
                         {"t": "kitten.", "n": "cat"}]},
         ]
     }
     expected_json = {
         "table": [
             [[{"ref": 123, "_sigil": "A", "t": "A",
                "_token_array_position": 0}],
              [{"adj": True, "_sigil": "A", "t": "black",
                "_token_array_position": 1}],
              [{"id": "xyz", "_sigil": "A", "t": "cat",
                "_token_array_position": 2}]],
             [[{"_sigil": "B", "t": "A", "_token_array_position": 4}],
              None,
              [{"n": "cat", "_sigil": "B", "t": "kitten.",
                "_token_array_position": 5}]],
         ],
         "witnesses": ["A", "B"],
     }
     json_out = collate(json_in, output="json")
     self.assertEqual(expected_json, json.loads(json_out))
 def test_longer_example(self):
     # Smoke test: collating two longer sentences must complete without raising.
     collation = Collation()
     collation.add_plain_witness("A", "The quick brown fox jumps over the dog.")
     collation.add_plain_witness("B", "The brown fox jumps over the lazy dog.")
     collate(collation)
Example #52
0
    def test_near_matching_three_witnesses(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 012345 efgh")
        collation.add_plain_witness("B", "abcd 0xxxxx 01xxxx 01234x 012xxx 0123xx efgh")
        collation.add_plain_witness("C", "abcd 01xxxx zz23xx efgh")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 6); this is activeRank
        # Find the first witness (alphabetically by siglum) with a gap at that rank (A)
        # Get the first token to the left of the gap for the first gappy witness ("012345" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from current rank (2) through activeRank (6), inclusive
        self.assertTask("build column for rank", ["012345", "2"], scheduler[0])
        self.assertTask("build column for rank", ["012345", "3"], scheduler[1])
        self.assertTask("build column for rank", ["012345", "4"], scheduler[2])
        self.assertTask("build column for rank", ["012345", "5"], scheduler[3])
        self.assertTask("build column for rank", ["012345", "6"], scheduler[4])
        # The best (max()) fit of "012345" among all ranks between current rank 2 and activeRank 6
        #   is at rank 4, so move "012345" from current rank 2 to rank 4
        self.assertTask("move node from prior rank to rank with best match", ["012345", "2", "4"], scheduler[5])
        # Find next (alphabetically) witness with a gap at activeRank (still 6), which is witness C
        # Get the first token to the left of the gap ("zz23xx" in C at rank 4)
        #   and check whether to move it
        # Calculate strength of match for all columns from current rank (4) through activeRank (6), inclusive
        self.assertTask("build column for rank", ["zz23xx", "4"], scheduler[6])
        self.assertTask("build column for rank", ["zz23xx", "5"], scheduler[7])
        self.assertTask("build column for rank", ["zz23xx", "6"], scheduler[8])
        # The best (max()) fit of "zz23xx" among all ranks between current rank 4 and activeRank 6
        #   is at rank 6, so move "zz23xx" from current rank 4 to rank 6
        self.assertTask("move node from prior rank to rank with best match", ["zz23xx", "4", "6"], scheduler[9])
        # No more gaps at rank 6, so advance to rank 5, which has gaps in witnesses A and C
        # First gap (alphabetically by siglum) at rank 5 is in witness A, where left node is "012345" at rank 4
        self.assertTask("build column for rank", ["012345", "4"], scheduler[10])
        self.assertTask("build column for rank", ["012345", "5"], scheduler[11])
        # Match is closest at current rank 4, so don't move the node
        # Next gap at rank 5 is in witness C, where left node is "01xxxx" at rank 3
        self.assertTask("build column for rank", ["01xxxx", "3"], scheduler[12])
        self.assertTask("build column for rank", ["01xxxx", "4"], scheduler[13])
        self.assertTask("build column for rank", ["01xxxx", "5"], scheduler[14])
        # Exact match at current rank 3, so don't move it
        # No more gaps at rank 5, so advance to rank 4, which has a gap in witness C,
        #   where left node is "01xxxx" at rank 3
        self.assertTask("build column for rank", ["01xxxx", "3"], scheduler[15])
        self.assertTask("build column for rank", ["01xxxx", "4"], scheduler[16])
        # Exact match at rank 3, so don't move it
        # No more gaps at rank 4, so advance to rank 3, where only gap is in witness A, with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[17])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[18])
        self.assertTask("build column for rank", ["abcd", "3"], scheduler[19])
        # Exact match at rank 1, so don't move it
        # No more gaps at rank 3, so advance to rank 2, with gaps in witnesses A and C and "abcd" at rank 1
        # Check witness A first
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[20])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[21])
        # Exact match at rank 1, so don't move it
        # Check witness C
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[22])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[23])
        # Exact match at rank 1, so don't move it
        # No more gaps at rank 2, no gaps at rank 1
        self.assertEquals(24, len(scheduler))
        expected = """\
+---+------+--------+--------+--------+--------+--------+------+
| A | abcd | -      | -      | 012345 | -      | -      | efgh |
| B | abcd | 0xxxxx | 01xxxx | 01234x | 012xxx | 0123xx | efgh |
| C | abcd | -      | 01xxxx | -      | -      | zz23xx | efgh |
+---+------+--------+--------+--------+--------+--------+------+"""
        self.assertEqual(expected, alignment_table)
def optics_line_text_reducer(data_by_frame, **kwargs_optics):
    '''Reduce the line-text extracts as a list of lines of text.

    Parameters
    ----------
    data_by_frame : dict
        A dictionary returned by :meth:`process_data`
    kwargs :
        * `See OPTICS <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html>`_
        * `min_samples` : The smallest number of transcribed lines needed to form a cluster.
          `auto` will set this value based on the number of volunteers who transcribed on a page within a subject.
        * `xi` : Determines the minimum steepness on the reachability plot that constitutes a cluster boundary.
        * `angle_eps` : How close the angle of two lines need to be in order to be placed in the same angle cluster.
          Note: This will only change the order of the lines.
        * `gutter_eps` : How close the `x` position of the start of two lines need to be in order to be placed in the same column cluster.
          Note: This will only change the order of the lines.
        * `min_line_length` : The minimum length a transcribed line of text needs to be in order to be used in the reduction.
        * `low_consensus_threshold` : The minimum consensus score allowed to be considered "done".
        * `minimum_views` : A value that is passed along to the front-end to set when lines should turn grey (has no effect on aggregation)

    Returns
    -------
    reduction : dict
        A dictionary with one key for each `frame` of the subject that have lists as values.
        Each item of the list represents one line transcribed of text and is a dictionary
        with these keys:

        * `clusters_x` : the `x` position of each identified word
        * `clusters_y` : the `y` position of each identified word
        * `clusters_text` : A list of lists containing the text at each cluster position
          There is one list for each identified word, and each of those lists contains
          one item for each user that identified the cluster. If the user did not transcribe
          the word an empty string is used.
        * `line_slope`: The slope of the line of text in degrees
        * `number_views` : The number of users that transcribed the line of text
        * `consensus_score` : The average number of users whose text agreed for the line
          Note, if `consensus_score` is the same as `number_views` every user agreed with each other
        * `user_ids`: List of panoptes user ids in the same order as `clusters_text`
        * `gold_standard`: List of bools indicating if a transcription was made in frontends
          gold standard mode
        * `slope_label`: integer indicating what slope cluster the line belongs to
        * `gutter_label`: integer indicating what gutter cluster (i.e. column) the line belongs to
        * `low_consensus` : True if the `consensus_score` is less than the threshold set by the
          `low_consensus_threshold` keyword

        For the entire subject the following is also returned:
        * `low_consensus_lines` : The number of lines with low consensus
        * `transcribed_lines` : The total number of lines transcribed on the subject

        Note: the image coordinate system has y increasing downward.
    '''
    user_ids_input = np.array(kwargs_optics.pop('user_id'))
    low_consensus_threshold = kwargs_optics.pop('low_consensus_threshold')
    # minimum_views only matters to the front-end display; discard it here
    _ = kwargs_optics.pop('minimum_views')
    output = defaultdict(list)
    min_samples_orig = kwargs_optics.pop('min_samples')
    angle_eps = kwargs_optics.pop('angle_eps')
    gutter_eps = kwargs_optics.pop('gutter_eps')
    max_eps = kwargs_optics.pop('max_eps', np.inf)
    if max_eps is None:
        max_eps = np.inf
    # NOTE(review): max_eps is popped and normalized but never forwarded to
    # OPTICS below -- confirm whether it should remain in kwargs_optics
    low_consensus_lines = 0
    number_of_lines = 0
    for frame, value in data_by_frame.items():
        frame_unordered = []
        X = np.array(value['X'])
        data = np.array(value['data'])
        if X.size > 0:
            # column 1 of X holds the user index for each extract
            num_users = len(np.unique(X[:, 1]))
            ext_index = np.array(extractor_index(X[:, 1]))
        else:
            num_users = 0
            ext_index = np.array([])
        if min_samples_orig == 'auto':
            min_samples = get_min_samples(num_users)
        else:
            # never cluster with fewer than 2 samples
            min_samples = max(2, min_samples_orig)
        if num_users >= min_samples:
            db = OPTICS(metric=metric,
                        metric_params={'data_in': data},
                        min_samples=min_samples,
                        **kwargs_optics)
            with warnings.catch_warnings():
                # OPTICS can emit RuntimeWarnings on degenerate input; silence them
                warnings.filterwarnings('ignore', category=RuntimeWarning)
                db.fit(X)
            clean_labels = remove_user_duplication(db.labels_,
                                                   db.core_distances_, X[:, 1])
            for label in np.unique(clean_labels):
                cdx = clean_labels == label
                if label == -1:
                    # noise values are assigned to clusters of one
                    frame_unordered += cluster_of_one(X[cdx], data,
                                                      user_ids_input,
                                                      ext_index[cdx].tolist())
                else:
                    xs = [data[int(i)]['x'] for i in X[cdx, 0]]
                    ys = [data[int(i)]['y'] for i in X[cdx, 0]]
                    # median dot positions across the cluster's members
                    xm = np.median(xs, axis=0)
                    ym = np.median(ys, axis=0)
                    slope = np.rad2deg(
                        np.arctan2(ym[-1] - ym[0], xm[-1] - xm[0]))
                    collation = col.Collation()
                    witness_keys = []
                    clusters_text = []
                    user_ids = []
                    gold_standard = []
                    for row in X[cdx]:
                        index = int(row[0])
                        user_index = int(row[1])
                        text = data[index]['text'][0]
                        gs = data[index]['gold_standard']
                        # only non-blank transcriptions become witnesses
                        if text.strip() != '':
                            key = str(index)
                            witness_keys.append(key)
                            user_ids.append(user_ids_input[user_index])
                            gold_standard.append(gs)
                            collation.add_plain_witness(key, text)
                    if len(collation.witnesses) > 0:
                        alignment_table = col.collate(collation,
                                                      near_match=True,
                                                      segmentation=False)
                        for cols in alignment_table.columns:
                            word_dict = cols.tokens_per_witness
                            word_list = []
                            for key in witness_keys:
                                # '' for witnesses missing this aligned word
                                word_list.append(
                                    str(word_dict.get(key, [''])[0]))
                            clusters_text.append(word_list)
                    consensus_score_value, consensus_text = consensus_score(
                        clusters_text)
                    low_consensus = consensus_score_value < low_consensus_threshold
                    if low_consensus:
                        low_consensus_lines += 1
                    value = {
                        'clusters_x': xm.tolist(),
                        'clusters_y': ym.tolist(),
                        'clusters_text': clusters_text,
                        'number_views': cdx.sum(),
                        'line_slope': slope,
                        'consensus_score': consensus_score_value,
                        'consensus_text': consensus_text,
                        'user_ids': user_ids,
                        'extract_index': ext_index[cdx].tolist(),
                        'gold_standard': gold_standard,
                        'low_consensus': low_consensus,
                        'flagged': low_consensus
                    }
                    number_of_lines += 1
                    frame_unordered.append(value)
        else:
            # not enough data to cluster so assign each extract
            # to its own cluster
            frame_unordered += cluster_of_one(X, data, user_ids_input,
                                              ext_index.tolist())
            if len(frame_unordered) > 0:
                low_consensus_lines += 1
                number_of_lines += 1
        output[frame] = order_lines(frame_unordered,
                                    angle_eps=angle_eps,
                                    gutter_eps=gutter_eps)
        output['low_consensus_lines'] = low_consensus_lines
        output['transcribed_lines'] = number_of_lines
        output['reducer'] = 'optics_line_text_reducer'
    return dict(output)
Example #54
0
'''
Created on Apr 20, 2014

Darwin Integration test

@author: Ronald Haentjens Dekker
'''
import json
from collatex import Collation, collate

if __name__ == '__main__':
    # Read the source JSON data into a dictionary.  Using a context manager
    # guarantees the file handle is closed even if json.load() raises
    # (the original open()/close() pair leaked the handle on a parse error).
    with open('darwin_chapter1_para1.json') as json_data:
        data = json.load(json_data)
    # pprint(data)

    # Generate a collation object from the dictionary.
    collation = Collation.create_from_dict(data)

    print(collate(collation))

#     write_dot(graph.graph, "rawoutput") 
Example #55
0
 def testThisMorningExample(self):
     """Collate two witnesses of the 'this morning' sentence with
     transposition detection enabled."""
     witnesses = [
         ("A", "This morning the cat observed little birds in the trees."),
         ("B",
          "The cat was observing birds in the little trees this morning, it observed birds for two hours."),
     ]
     collation = Collation()
     for sigil, content in witnesses:
         collation.add_plain_witness(sigil, content)
     alignment_table = collate(collation, detect_transpositions=True)
def align_words(word_line, xy_line, text_line, kwargs_cluster, kwargs_dbscan):
    '''Take the annotations for one line of text, align the words across
    transcriptions, and find the end-points of the line.

    Parameters
    ----------
    word_line : np.array
        An nx1 array with the x-position of each dot in the rotated coordinate frame.
    xy_line : np.array
        An nx2 array with the non-rotated (x, y) positions of each dot.
    text_line : np.array
        An nx1 array with the text for each dot.
    kwargs_cluster : dict
        A dictionary containing the `eps_*` and `min_word_count` keywords
        used by this function.
    kwargs_dbscan : dict
        A dictionary containing all the other DBSCAN keywords.

    Returns
    -------
    clusters_x : list
        A list with the start and end x-position of the line.
    clusters_y : list
        A list with the start and end y-position of the line.
    clusters_text : list
        A list-of-lists with the words transcribed at each dot cluster found. One
        list per cluster.  Columns seen by fewer than `min_word_count`
        witnesses are recorded as empty strings.
    '''
    clusters_x = []
    clusters_y = []
    clusters_text = []
    # ignore min_samples when trying to find the end points of a line:
    # temporarily force min_samples=1 so even a single dot forms a cluster
    min_samples = kwargs_dbscan.pop('min_samples', 1)
    # cluster the dots along the rotated x-axis to locate word positions
    db_words = DBSCAN(eps=kwargs_cluster['eps_word'],
                      min_samples=1,
                      **kwargs_dbscan).fit(word_line)
    # put min_samples back in (kwargs_dbscan is owned by the caller)
    kwargs_dbscan['min_samples'] = min_samples
    word_labels = sort_labels(db_words.labels_, word_line)
    if len(word_labels) > 1:
        # keep only the first and last word clusters: their centroids are
        # taken as the start and end points of the line
        word_labels = [word_labels[0], word_labels[-1]]
        for word_label in word_labels:
            wdx = db_words.labels_ == word_label
            # end-point is the mean (x, y) of all dots in the cluster
            word_x, word_y = xy_line[wdx].mean(axis=0)
            clusters_x.append(float(word_x))
            clusters_y.append(float(word_y))
        # align the word sequences of all non-empty transcriptions with
        # collatex; each transcription becomes a "witness" keyed by index
        collation = col.Collation()
        witness_key = []
        for tdx, t in enumerate(text_line):
            if t.strip() != '':
                key = str(tdx)
                collation.add_plain_witness(key, t)
                witness_key.append(key)
        if len(collation.witnesses) > 0:
            alignment_table = col.collate(collation,
                                          near_match=True,
                                          segmentation=False)
            # each column of the alignment table is one word cluster
            for cols in alignment_table.columns:
                word_dict = cols.tokens_per_witness
                word_list = []
                for key in witness_key:
                    # only keep a column if enough witnesses transcribed it;
                    # otherwise record an empty string for every witness
                    if len(word_dict) >= kwargs_cluster['min_word_count']:
                        word_list.append(str(word_dict.get(key, [''])[0]))
                    else:
                        word_list.append('')
                clusters_text.append(word_list)
            # fix memory leak by deleting this
            del alignment_table
    return clusters_x, clusters_y, clusters_text