def test_threshold_filter(self):
    """
    Test the basic functionality of the threshold filter.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = EntityExtractor()
    scorer = TFScorer()
    filter = ThresholdFilter(0.75)
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates)
    self.assertEqual(1, scores.get('erdogan', 0))
    self.assertEqual(0.5, scores.get('damascus', 0))

    scores = filter.filter(scores)
    self.assertTrue('erdogan' in scores)
    self.assertFalse('damascus' in scores)

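    # A note on the arithmetic above: TFScorer appears to normalize by the
    # number of documents, so 'erdogan' (in 2 of 2 posts) scores 1 and
    # 'damascus' (in 1 of 2) scores 0.5. ThresholdFilter(0.75) then retains
    # only candidates whose score reaches the threshold.
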
def test_sorting(self):
    """
    Test that the resolver sorts the tokens in descending order of score.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    """
    Ensure that the more common candidates are ranked towards the beginning.
    """
    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    self.assertTrue(scores)
    resolved, unresolved = Resolver().resolve(scores)
    self.assertEqual(set(scores.keys()), set(resolved))
    self.assertEqual([], unresolved)
    self.assertEqual('tottenham', resolved[0])
    self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))

def test_zero_threshold(self):
    """
    Test that when a threshold of zero is given, all candidate participants are retained.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = EntityExtractor()
    scorer = TFScorer()
    filter = ThresholdFilter(0)
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates)
    self.assertEqual(1, scores.get('erdogan', 0))
    self.assertEqual(0.5, scores.get('damascus', 0))

    scores = filter.filter(scores)
    self.assertTrue('erdogan' in scores)
    self.assertTrue('damascus' in scores)

def test_sorting(self):
    """
    Test that the resolver sorts the tokens in descending order of score.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
    self.assertEqual('tottenham', resolved[0])
    self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
    self.assertEqual(set(['falter', 'against', 'hotspur', 'unable',
                          'avoid', 'defeat', 'lose', 'again']),
                     set(resolved[3:]))

def test_repeated_tokens(self):
    """
    Test that when tokens are repeated, the frequency that is returned is the term frequency.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = TokenExtractor(tokenizer=tokenizer)
    scorer = TFScorer()
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates, normalize_scores=False)
    self.assertEqual(2, scores.get('erdogan'))

def test_normalization(self):
    """
    Test that when normalization is disabled, the returned scores are raw integer counts.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = TokenExtractor()
    scorer = TFScorer()
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates, normalize_scores=False)
    self.assertEqual(2, scores.get('erdogan'))

def test_score_across_multiple_documents(self):
    """
    Test that the score is the term frequency accumulated across multiple documents.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = TokenExtractor(tokenizer=tokenizer)
    scorer = TFScorer()
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates, normalize_scores=False)
    self.assertEqual(3, scores.get('erdogan'))

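    # In the test above, 'erdogan' appears once in the first post and twice in
    # the second, so with normalize_scores=False the scorer returns the raw
    # term frequency of 3 accumulated across the corpus.
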
def test_max_score(self):
    """
    Test that the maximum score is 1 when normalization is enabled.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = TokenExtractor()
    scorer = TFScorer()
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates)
    self.assertTrue(all(score <= 1 for score in scores.values()))

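    # With normalization enabled (the default), no candidate can exceed a score
    # of 1; presumably the raw frequency is divided by the size of the corpus.
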
def test_score_of_unknown_token(self):
    """
    Test that the score of an unknown token is 0.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(stem=False)
    posts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    extractor = TokenExtractor()
    scorer = TFScorer()
    candidates = extractor.extract(corpus)
    scores = scorer.score(candidates)
    self.assertFalse(scores.get('unknown'))

def test_high_threshold(self):
    """
    Test that when the threshold is high, it excludes all candidates.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=2, stem=True, stopwords=list(stopwords.words("english")))
    posts = [
        "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
        "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    resolver = WikipediaSearchResolver(TF(), tokenizer, 1, corpus)
    resolved, unresolved = resolver.resolve(scores)
    self.assertFalse(len(resolved))
    self.assertEqual(set(scores.keys()), set(unresolved))

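    # The third constructor argument appears to be a similarity threshold: at
    # 1, no Wikipedia page is ever similar enough to a candidate, so every
    # candidate ends up unresolved.
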
def test_resolve_all(self):
    """
    Test that when resolving candidates, all of them are returned.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    """
    Ensure that all candidates are resolved.
    """
    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    self.assertTrue(scores)
    resolved, unresolved = Resolver().resolve(scores)
    self.assertEqual(set(scores.keys()), set(resolved))
    self.assertEqual([], unresolved)

def test_empty_corpus(self):
    """
    Test that when an empty corpus is given, all candidates are unresolved.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, []).resolve(scores)
    self.assertEqual(len(scores), len(unresolved))

def test_unknown_token(self):
    """
    Test that when an unknown candidate is given, it is unresolved.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, corpus).resolve({'unknown': 1})
    self.assertTrue('unknown' in unresolved)

def test_low_threshold(self):
    """
    Test that when the threshold is not zero, it excludes some ambiguous candidates.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=2, stem=True, stopwords=list(stopwords.words("english")))
    posts = [
        "Memphis mum about his future at Lyon after the Dutch footballer wins it for the Ligue 1 team",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    resolver = WikipediaNameResolver(TF(), tokenizer, 0.4, corpus)
    resolved, unresolved = resolver.resolve(scores)
    self.assertTrue('Memphis' in unresolved)

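    # 'Memphis' is ambiguous (it is a city as well as the mononym of the Dutch
    # footballer Memphis Depay mentioned in the post), so with a threshold of
    # 0.4 the name resolver appears to leave it unresolved rather than commit
    # to the wrong Wikipedia page.
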
def test_all_resolved_or_unresolved(self):
    """
    Test that every named entity is either resolved or unresolved, with no candidates lost.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Burnley",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
    resolved, unresolved = resolver.resolve(scores)
    self.assertEqual(len(scores), len(resolved + unresolved))

def test_sorting(self):
    """
    Test that the resolver sorts the named entities in descending order of score.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=3, stem=True, case_fold=True)
    posts = [
        "In the most heated football match of the season, Liverpool falter against Manchester City",
        "Liverpool unable to avoid defeat to Watford, Manchester City close in on football title",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = WikipediaSearchResolver(TF(), tokenizer, 0, corpus).resolve(scores)
    self.assertEqual('Liverpool F.C.', resolved[0])
    self.assertEqual('Manchester City F.C.', resolved[1])
    self.assertEqual('Watford F.C.', resolved[2])

def test_all_resolved_or_unresolved(self):
    """
    Test that every named entity is either resolved or unresolved, with no candidates lost.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=2, stem=True, stopwords=list(stopwords.words("english")))
    posts = [
        "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
        "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    resolver = WikipediaSearchResolver(TF(), tokenizer, 0, corpus)
    resolved, unresolved = resolver.resolve(scores)
    self.assertEqual(len(scores), len(resolved + unresolved))

def test_token_resolver(self):
    """
    Test the token resolver.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
    self.assertTrue('manchester' in resolved)
    self.assertTrue('united' in resolved)
    self.assertTrue('tottenham' in resolved)
    self.assertTrue('hotspur' in resolved)

def test_different_tokenizer(self):
    """
    Test that when a tokenizer different from the one used in extraction is given, the resolver uses it.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
    self.assertTrue('to' in resolved)

    resolved, unresolved = TokenResolver(Tokenizer(min_length=3, stem=False), corpus).resolve(scores)
    self.assertTrue('to' in unresolved)

def test_wikipedia_name_resolver(self):
    """
    Test the Wikipedia name resolver.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=False)
    posts = [
        "Manchester United falter against Burnley",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
    resolved, unresolved = resolver.resolve(scores)
    self.assertTrue('manchester united' in resolved)
    self.assertTrue('burnley' in resolved)

def test_sorting_ambiguous(self):
    """
    Test that the resolver sorts the named entities in descending order of score,
    but ambiguous candidates are at the end.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
    posts = [
        "Manchester United falter against Manchester City",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = EntityExtractor().extract(corpus, binary=True)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = WikipediaNameResolver(TF(), tokenizer, 0, corpus).resolve(scores)
    self.assertEqual('manchester united', resolved[0])
    self.assertEqual('manchester city', resolved[1])
    self.assertEqual('tottenham', resolved[2])

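    # On frequency alone, 'tottenham' (two posts) would outrank 'manchester
    # city' (one post). Because 'tottenham' is an ambiguous name, however, the
    # resolver appears to defer it to the end of the resolved list.
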
def test_case_folding(self):
    """
    Test that when case-folding is set, the case does not matter.
    In this test, the stem 'report' can be formed by:

    #. Reporters - appears twice
    #. reporters - appears twice
    #. reports - appears three times

    Without case-folding, 'reports' would be chosen to represent the token 'report' since it appears three times, whereas 'Reporters' and 'reporters' appear only twice each.
    With case-folding, 'reports' still appears three times, but 'reporters' now appears four times.
    """

    """
    Create the test data.
    """
    tokenizer = Tokenizer(min_length=1, stem=True)
    posts = [
        "Reporters Without Borders issue statement after reporters are harassed",
        "Reporters left waiting all night long: reports",
        "Two reporters injured before gala: reports",
        "Queen reacts: reports of her falling ill exaggerated",
    ]
    corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

    candidates = TokenExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0).filter(scores)
    resolved, unresolved = TokenResolver(tokenizer, corpus, case_fold=False).resolve(scores)
    self.assertTrue('reports' in resolved)

    resolved, unresolved = TokenResolver(tokenizer, corpus, case_fold=True).resolve(scores)
    self.assertTrue('reporters' in resolved)

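    # Tallying the posts above: 'Reporters' opens the first two posts (twice),
    # 'reporters' appears in the first and third (twice), and 'reports' closes
    # the last three (three times). Case-folding merges the first two tallies
    # into four occurrences of 'reporters', which then outnumbers 'reports'.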