Example #1
def save_noise_entities(ner_id, corpus, model=None):
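    """Write intersecting-but-inexact NER tokens to report files for inspection:
    tokens overlapping several corpus entities go to "<ner_id>.<lang>.<model>.multiple.txt",
    tokens overlapping exactly one corpus entity without matching it exactly go to
    "<ner_id>.<lang>.<model>-single.txt"."""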
    a  = load_all_recognized_tokens(ner_id, corpus.lang, model)
    ct = corpus.ne_tokens()
 
    if model is None: model = 'default'
    f1 = open(ner_id + "." + corpus.lang + "." + model + ".multiple.txt", "w")
    f2 = open(ner_id + "." + corpus.lang + "." + model + "-single.txt", "w")

    for t in TokenSet(a).tokens(TokenSet.MatchIntersectedSet(ct)):
        ner_off1, ner_off2 = t[1], t[1] + t[2] - 1

        # multiple intersection
        if len(t.matched) > 1:
            ss_off1, ss_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
            es_off1, es_off2 = t.matched[-1][1], t.matched[-1][1] + t.matched[-1][2] - 1
        
            f1.write(str(t) + "\n")
            for m in t.matched:
                f1.write(">>> " + str(Token(m)) + "\n")
        else:
            #  single intersection
            c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1

            # exact matches carry no noise and are not of interest here
            if ner_off1 == c_off1 and ner_off2 == c_off2: continue
                
            f2.write(str(t) + "\n")
            for m in t.matched:
                f2.write(">>> " + str(Token(m)) + "\n")
    f1.close()
    f2.close()
Example #2
def calc_recognized_types_distribution_for_wrongly_recognized_entities(ner_id, lang='nl', model=None):
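    """Print the LOC/PER/ORG/MISC distribution (count and percentage) of tokens
    whose position matched a corpus entity but whose NE type did not."""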
    t = load_matched_wrongtyped_tokens(ner_id, lang, model)
    s = TokenSet(t)
    misc = s.tokens(Token.NE_MISC)
    loc  = s.tokens(Token.NE_LOC)
    per  = s.tokens(Token.NE_PER)
    org  = s.tokens(Token.NE_ORG)
    print "======== %s Recognized entities type distribution :" % ner_id
    print "LOCATIONS    : %4d  %3d" % (len(loc), (len(loc)*100)/len(t)) 
    print "PERSONS      : %4d  %3d" % (len(per), (len(per)*100)/len(t)) 
    print "ORGANIZATION : %4d  %3d" % (len(org), (len(org)*100)/len(t)) 
    print "MISC         : %4d  %3d" % (len(misc),(len(misc)*100)/len(t)) 
    print "============================="
    print "AMOUNT       : %4d  100" % len(t) 
Example #3
def calc_types_distribution_for_completly_wrongly_recognized_entities(ner_id, lang='nl', model=None):
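    """Print the LOC/PER/ORG/MISC distribution of recognized tokens whose
    position coincides with no correctly matched token, i.e. recognitions
    that are completely wrong."""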
    class NotMatchLocationSet(TokenSet.MatchSet):
        def __init__(self, tokens):
            super(NotMatchLocationSet, self).__init__(tokens, False)
        
        def match_tokens(self, token1, token2):
            return token1[1] >= 0 and token2[1] >= 0 and (token1[1] != token2[1] or token1[2] != token2[2])  
    
    a = load_all_recognized_tokens(ner_id, lang, model)
    r = load_all_matched_tokens(ner_id, lang, model)
    nm = NotMatchLocationSet(r)  # tokens that share no position with any correctly matched token
    s = TokenSet(TokenSet(a).tokens(nm))

    misc = s.tokens(Token.NE_MISC)
    loc  = s.tokens(Token.NE_LOC)
    per  = s.tokens(Token.NE_PER)
    org  = s.tokens(Token.NE_ORG)
    print "======== %s Recognized entities type distribution :" % ner_id
    print "LOCATIONS    : %4d  %3d" % (len(loc), (len(loc)*100)/len(s)) 
    print "PERSONS      : %4d  %3d" % (len(per), (len(per)*100)/len(s)) 
    print "ORGANIZATION : %4d  %3d" % (len(org), (len(org)*100)/len(s)) 
    print "MISC         : %4d  %3d" % (len(misc),(len(misc)*100)/len(s)) 
    print "============================="
    print "AMOUNT       : %4d  100" % len(s) 
Example #4
    def __init__(self, ner_tokens, corpus_tokens, res_name="unknown"):
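        """Compare NER output (ner_tokens) with gold corpus annotations
        (corpus_tokens) and collect statistics: exact matches with and without
        type errors, noisy and incomplete spans, wrongly grouped entities, and
        tokens present on only one side."""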
        assert ner_tokens and corpus_tokens and res_name
        
        self.res_name = res_name
        
        print "Calculating statistics .... "
        corpus_tokens = [t for t in corpus_tokens]
        self.corpus_defined_entities = len(corpus_tokens)  
        
        self.intersections = 0
        self.exact_match, self._exact_match = 0, []
        self.single_entity, self._single_entity = 0, []
        self.multiple_entities, self._multiple_entities = 0, []
        
        self.noise_in_entity, self._noise_in_entity = 0, []
        self.not_completed_entity, self._not_completed_entity = 0, []
        
        self.exact_match_type_error, self._exact_match_type_error = 0, []
        self.exact_match_no_type_error, self._exact_match_no_type_error = 0, []
        self.wrong_grouped_entities, self._wrong_grouped_entities = 0, []

        ner_tokens_set = TokenSet(ner_tokens)
        for t in ner_tokens_set.tokens(TokenSet.IntersectedTokens(corpus_tokens)):
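            # a token is [text, start, length, type]; spans compare on inclusive end offsets (start + length - 1)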
            ner_off1, ner_off2 = t[1], t[1] + t[2] - 1
            if len(t.matched) > 1: 
                self.multiple_entities += 1
                ss_off1, ss_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
                es_off1, es_off2 = t.matched[-1][1], t.matched[-1][1] + t.matched[-1][2] - 1
            
                if ss_off1 == ner_off1 and es_off2 == ner_off2:
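                    # the NER span runs exactly from the first to the last matched corpus
                    # entity; if the matched lengths plus one-character gaps add up to the
                    # NER length, several gold entities were merged into one NER token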
                    l = len(t.matched) - 1
                    for m in t.matched:
                        l += m[2]
                    if l == t[2]: self.wrong_grouped_entities += 1
            else:
                self.single_entity += 1
                c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1

                if ner_off1 == c_off1 and ner_off2 == c_off2:
                    self.exact_match += 1
                    self._exact_match.append(t)
                    
                    if t.matched[0][3] == t[3]:
                        self.exact_match_no_type_error += 1
                        self._exact_match_no_type_error.append(t)
                    else:
                        self.exact_match_type_error += 1
                        self._exact_match_type_error.append(t)
            
                elif ner_off1 < c_off1 or ner_off2 > c_off2: 
                    self.noise_in_entity += 1
                    self._noise_in_entity.append(t)
                elif ner_off1 > c_off1 or ner_off2 < c_off2: 
                    self.not_completed_entity += 1
                    self._not_completed_entity.append(t)
        
            self.intersections += 1
    
        print "Step 1 is done"
        self.not_in_corpus = len([e for e in ner_tokens_set.tokens(TokenSet.NOT(TokenSet.IntersectedTokens(corpus_tokens)))])
        print "Step 2 is done"
        self.not_in_ner    = len([e for e in TokenSet(corpus_tokens).tokens(TokenSet.NOT(TokenSet.IntersectedTokens(ner_tokens)))])
        print "Step 3 is done"
Example #5
def f3():
    TokenSet.InInterval(0, -1)
Example #6
def f1():
    TokenSet.InInterval(-1, 52)
Example #7
def f2():
    TokenSet.InInterval(0, 0)
Example #8
    def test_token_set(self):
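        # each token is [text, start offset, length, NE type]; "Something" has an
        # undefined position (start == -1)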
        tokens = (["Amsterdam", 0, 9,
                   Token.NE_LOC], ["FIFA", 30, 4, Token.NE_ORG],
                  ["Something", -1, 4, 0], ["Amstel", 73, 6, Token.NE_MISC])
        s = TokenSet(tokens)

        self.assertEqual(s[0], tokens[0])
        self.assertEqual(s[1], tokens[1])
        self.assertEqual(s[2], tokens[2])
        self.assertEqual(s[3], tokens[3])
        self.assertEqual(len(s), len(tokens))

        def f():
            return s[4]

        self.assertRaises(IndexError, f)

        def f():
            s[0] = ("", 1, 1, 0)

        self.assertRaises(NotImplementedError, f)

        self.assertEqual(tokens[0] in s, True)
        self.assertEqual(tokens[1] in s, True)
        self.assertEqual(tokens[2] in s, True)
        self.assertEqual(tokens[3] in s, True)
        self.assertEqual(("", 2, 2, 2) in s, False)

        i = 0
        for t in s:
            self.assertEqual(tokens[i], t)
            i += 1
        self.assertEqual(len(tokens), i)

        i = 0
        for t in s.tokens():
            self.assertEqual(tokens[i], t)
            i += 1
        self.assertEqual(len(tokens), i)

        r = [e for e in s.tokens(Token.NE_BITS)]
        self.assertEqual(len(tokens) - 1, len(r))

        r = [e for e in s.tokens(Token.NE_ORG)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.UndefPosition()
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[2])

        rule = TokenSet.InInterval(20, 53)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.InInterval(20, 58)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[1])

        rule = TokenSet.InInterval(20, 59)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[1])
        self.assertEqual(r[1], tokens[-1])

        def f1():
            TokenSet.InInterval(-1, 52)

        def f2():
            TokenSet.InInterval(0, 0)

        def f3():
            TokenSet.InInterval(0, -1)

        self.assertRaises(AssertionError, f1)
        self.assertRaises(AssertionError, f2)
        self.assertRaises(AssertionError, f3)

        rule = TokenSet.NOT(TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(3, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])
        self.assertEqual(r[2], tokens[3])

        rule = TokenSet.OR(TokenSet.Type(Token.NE_ORG),
                           TokenSet.Type(Token.NE_LOC))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])

        rule = TokenSet.AND(TokenSet.InInterval(0, 35),
                            TokenSet.Type(Token.NE_LOC))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(1, len(r))
        self.assertEqual(r[0], tokens[0])

        rule = TokenSet.AND(
            TokenSet.InInterval(0, 35),
            TokenSet.OR(TokenSet.Type(Token.NE_LOC),
                        TokenSet.Type(Token.NE_ORG)))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])

        rule = TokenSet.NOT(
            TokenSet.OR(TokenSet.Type(Token.NE_ORG),
                        TokenSet.Type(Token.NE_LOC)))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        tokens_to_compare = (["Amsterdam", 0, 9,
                              Token.NE_LOC], ["FIFA", 30, 4, Token.NE_ORG])
        rule = TokenSet.EqualTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))

        rule = TokenSet.NOT(TokenSet.EqualTokens(tokens_to_compare))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        rule = TokenSet.OR(TokenSet.EqualTokens(tokens_to_compare),
                           TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(3, len(r))
        self.assertEqual(r[0], tokens[0])
        self.assertEqual(r[1], tokens[1])
        self.assertEqual(r[2], tokens[2])

        rule = TokenSet.AND(TokenSet.EqualTokens(tokens_to_compare),
                            TokenSet.UndefPosition())
        r = [e for e in s.tokens(rule)]
        self.assertEqual(0, len(r))

        tokens_to_compare = (["nsjdjsh", 0, 9, Token.NE_LOC],
                             ["sdsd", 30, 4, Token.NE_ORG], ['dssd', -1, 4, 0])
        rule = TokenSet.EqualByPositionTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))

        tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
        rule = TokenSet.IntersectedTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], Token(tokens[0]))
        self.assertEqual(r[1], Token(tokens[1]))

        tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
        rule = TokenSet.NOT(TokenSet.IntersectedTokens(tokens_to_compare))
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[2])
        self.assertEqual(r[1], tokens[3])

        tokens_to_compare = (["ABC", 33, 41, Token.NE_LOC], )
        rule = TokenSet.IntersectedTokens(tokens_to_compare)
        r = [e for e in s.tokens(rule)]
        self.assertEqual(2, len(r))
        self.assertEqual(r[0], tokens[1])
        self.assertEqual(r[1], tokens[3])
Example #9
    def validate_corpus_tokens(self, corpus, tokens):
        assert len(tokens) > 0
        ts = TokenSet(corpus.ne_tokens())
        r = [e for e in ts.tokens(TokenSet.EqualByPositionTokens(tokens))]
Example #10
import re

# (TokenSet is provided by the surrounding module; its import is not shown in this excerpt)
ENGLISH_STOP_WORDS = [
    # ... (earlier entries omitted in this excerpt) ...
    'mustn', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once',
    'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over',
    'own', 're', 's', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some',
    'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
    'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'we',
    'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'with', 'won', 'would', 'wouldn', 'you', 'your', 'yours',
    'yourself', 'yourselves'
]


class SkipStopWords(TokenSet.Match):
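    """Filter rule that drops English stop words and any token matching the
    mask (by default a single Latin letter)."""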
    def __init__(self,
                 stop_words=ENGLISH_STOP_WORDS,
                 mask=re.compile(r"^[a-z]$", re.U | re.I)):
        assert stop_words
        self.stop_words, self.mask = stop_words, mask

    def match(self, t):
        r = (t[0].lower() in self.stop_words) or (self.mask
                                                  and self.mask.match(t[0]))
        return not r


if __name__ == "__main__":
    from gravity.tae.tokenizer import WordTokenizer
    txt = "Andrei cannot drive a car if he has more than 0.5 pro-mile of alcohol !"
    for t in TokenSet(WordTokenizer()(txt)).tokens(SkipStopWords()):
        print t
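
SkipStopWords illustrates the general pattern for custom filter rules: subclass TokenSet.Match and implement match(t), returning True for tokens that should pass. A minimal sketch of another such rule follows; it is not part of the original source and assumes only the match(t) protocol and the AND combinator demonstrated above:

class SkipShortTokens(TokenSet.Match):
    """Hypothetical rule: keep only tokens whose text has at least min_len characters."""
    def __init__(self, min_len=3):
        assert min_len > 0
        self.min_len = min_len

    def match(self, t):
        # a token is [text, start, length, type]
        return len(t[0]) >= self.min_len

# rules compose with the combinators shown in the tests above, e.g.:
#   TokenSet(WordTokenizer()(txt)).tokens(TokenSet.AND(SkipStopWords(), SkipShortTokens(4)))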