Example #1
    def __init__(self, ner_tokens, corpus_tokens, res_name="unknown"):
        assert ner_tokens and corpus_tokens and res_name

        self.res_name = res_name

        print("Calculating statistics ...")
        corpus_tokens = list(corpus_tokens)  # materialize: it is iterated several times
        self.corpus_defined_entities = len(corpus_tokens)

        self.intersections = 0
        # Counter / matched-token-list pairs for every match category.
        self.exact_match, self._exact_match = 0, []
        self.single_entity, self._single_entity = 0, []
        self.multiple_entities, self._multiple_entities = 0, []

        self.noise_in_entity, self._noise_in_entity = 0, []
        self.not_completed_entity, self._not_completed_entity = 0, []

        self.exact_match_type_error, self._exact_match_type_error = 0, []
        self.exact_match_no_type_error, self._exact_match_no_type_error = 0, []
        self.wrong_grouped_entities, self._wrong_grouped_entities = 0, []

        ner_tokens_set = TokenSet(ner_tokens)
        # Visit every NER token that overlaps at least one corpus token;
        # t.matched holds the corpus tokens it intersects.
        for t in ner_tokens_set.tokens(TokenSet.IntersectedTokens(corpus_tokens)):
            # Inclusive character span of the NER token.
            ner_off1, ner_off2 = t[1], t[1] + t[2] - 1
            if len(t.matched) > 1:
                # One NER token covers several corpus entities.
                self.multiple_entities += 1
                ss_off1, ss_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
                es_off1, es_off2 = t.matched[-1][1], t.matched[-1][1] + t.matched[-1][2] - 1

                if ss_off1 == ner_off1 and es_off2 == ner_off2:
                    # The NER token starts on the first and ends on the last
                    # corpus entity; if its length equals the sum of the
                    # entity lengths plus one separator character between
                    # each pair, it is corpus entities wrongly grouped as one.
                    l = len(t.matched) - 1
                    for m in t.matched:
                        l += m[2]
                    if l == t[2]:
                        self.wrong_grouped_entities += 1
            else:
                # The NER token overlaps exactly one corpus entity.
                self.single_entity += 1
                c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1

                if ner_off1 == c_off1 and ner_off2 == c_off2:
                    # Boundaries match exactly; now check the entity type.
                    self.exact_match += 1
                    self._exact_match.append(t)

                    if t.matched[0][3] == t[3]:
                        self.exact_match_no_type_error += 1
                        self._exact_match_no_type_error.append(t)
                    else:
                        self.exact_match_type_error += 1
                        self._exact_match_type_error.append(t)

                elif ner_off1 < c_off1 or ner_off2 > c_off2:
                    # The NER token sticks out past the corpus entity.
                    self.noise_in_entity += 1
                    self._noise_in_entity.append(t)
                elif ner_off1 > c_off1 or ner_off2 < c_off2:
                    # The NER token covers only part of the corpus entity.
                    self.not_completed_entity += 1
                    self._not_completed_entity.append(t)

            self.intersections += 1
    
        print "Step 1 is done"
        self.not_in_corpus = len([e for e in ner_tokens_set.tokens(NotIntersectedSet(corpus_tokens))])
        print "Step 2 is done"
        self.not_in_ner    = len([e for e in TokenSet(corpus_tokens).tokens(NotIntersectedSet(ner_tokens))])
        print "Step 3 is done"
Example #2
    def test_token_set(self):
        tokens = (["Amsterdam", 0, 9,
                   Token.NE_LOC], ["FIFA", 30, 4, Token.NE_ORG],
                  ["Something", -1, 4, 0], ["Amstel", 73, 6, Token.NE_MISC])
        s = TokenSet(tokens)

        self.assertEqual(s[0], tokens[0])
        self.assertEqual(s[1], tokens[1])
        self.assertEqual(s[2], tokens[2])
        self.assertEqual(s[3], tokens[3])
        self.assertEqual(len(s), len(tokens))

        def f():
            return s[4]

        self.assertRaises(IndexError, f)

        def f():
            s[0] = ("", 1, 1, 0)

        self.assertRaises(NotImplementedError, f)

        self.assertEqual(tokens[0] in s, True)
        self.assertEqual(tokens[1] in s, True)
        self.assertEqual(tokens[2] in s, True)
        self.assertEqual(tokens[3] in s, True)
        self.assertEqual(("", 2, 2, 2) in s, False)

        i = 0
        for t in s:
            self.assertEqual(tokens[i], t)
            i += 1
        self.assertEqual(len(tokens), i)

        i = 0
        for t in s.tokens():
            self.assertEqual(tokens[i], t)
            i += 1
        self.assertEqual(len(tokens), i)

        r = [e for e in s.tokens(Token.NE_BITS)]
        self.assertEqual(len(tokens) - 1, len(r))
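        # Token.NE_BITS is presumably the mask of all NE type bits, so the
        # only token it filters out is ["Something", -1, 4, 0] with type 0.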

        r = [e for e in s.tokens(Token.NE_ORG)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.UndefPosition()
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[2])

        rule = TokenSet.InInterval(20, 53)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.InInterval(20, 58)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.InInterval(20, 59)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[1])
        self.assertEqual(r[1], tokens[-1])
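        # InInterval(start, length) evidently keeps only tokens lying
        # entirely inside start .. start + length - 1: 20 + 59 - 1 = 78 just
        # reaches the end of "Amstel" (73..78), so it appears here, while
        # length 58 above fell one character short.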

        def f1():
            TokenSet.InInterval(-1, 52)

        def f2():
            TokenSet.InInterval(0, 0)

        def f3():
            TokenSet.InInterval(0, -1)

        self.assertRaises(AssertionError, f1)
        self.assertRaises(AssertionError, f2)
        self.assertRaises(AssertionError, f3)

        rule = TokenSet.NOT(TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(3, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])
        self.assertEqual(r[2], tokens[3])

        rule = TokenSet.OR(TokenSet.Type(Token.NE_ORG),
                           TokenSet.Type(Token.NE_LOC))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])

        rule = TokenSet.AND(TokenSet.InInterval(0, 35),
                            TokenSet.Type(Token.NE_LOC))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[0])

        rule = TokenSet.AND(
            TokenSet.InInterval(0, 35),
            TokenSet.OR(TokenSet.Type(Token.NE_LOC),
                        TokenSet.Type(Token.NE_ORG)))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])

        rule = TokenSet.NOT(
            TokenSet.OR(TokenSet.Type(Token.NE_ORG),
                        TokenSet.Type(Token.NE_LOC)))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        tokens_to_compare = (["Amsterdam", 0, 9,
                              Token.NE_LOC], ["FIFA", 30, 4, Token.NE_ORG])
        rule = TokenSet.EqualTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))

        rule = TokenSet.NOT(TokenSet.EqualTokens(tokens_to_compare))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        rule = TokenSet.OR(TokenSet.EqualTokens(tokens_to_compare),
                           TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(3, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])
        self.assertEqual(r[2], tokens[2])

        rule = TokenSet.AND(TokenSet.EqualTokens(tokens_to_compare),
                            TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(0, len(r))

        tokens_to_compare = (["nsjdjsh", 0, 9, Token.NE_LOC],
                             ["sdsd", 30, 4, Token.NE_ORG], ['dssd', -1, 4, 0])
        rule = TokenSet.EqualByPositionTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))
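        # EqualByPositionTokens evidently compares offsets and lengths only:
        # the garbled texts still match "Amsterdam" and "FIFA", while the
        # token at the undefined position -1 matches nothing.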

        tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
        rule = TokenSet.IntersectedTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))

        tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
        rule = TokenSet.NOT(TokenSet.IntersectedTokens(tokens_to_compare))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        tokens_to_compare = (["ABC", 33, 41, Token.NE_LOC], )
        rule = TokenSet.IntersectedTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[1])
        self.assertEqual(r[1], tokens[3])
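
The last two checks pin down the overlap rule: the probe ["ABC", 33, 41, ...] spans inclusive offsets 33..73, so it touches "FIFA" (30..33) and "Amstel" (73..78) but misses "Amsterdam" (0..8). Below is a standalone sketch of that predicate, consistent with the off1/off2 arithmetic in Example #1; the function name spans_overlap is an assumption, not the library's API.

def spans_overlap(a_start, a_len, b_start, b_len):
    # Two [start, length] spans overlap iff each one starts no later
    # than the other one's inclusive end offset.
    a_end = a_start + a_len - 1
    b_end = b_start + b_len - 1
    return a_start <= b_end and b_start <= a_end

assert spans_overlap(33, 41, 30, 4)      # 33..73 touches "FIFA" at 30..33
assert spans_overlap(33, 41, 73, 6)      # ... and "Amstel" at 73..78
assert not spans_overlap(33, 41, 0, 9)   # ... but not "Amsterdam" at 0..8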