Example #1
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertFalse('damascus' in scores)
Example #2
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertTrue('damascus' in scores)
Example #3
    def test_repeated_tokens(self):
        """
        Test that when tokens are repeated, the frequency that is returned is the term frequency.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor(tokenizer=tokenizer)
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(2, scores.get('erdogan'))
Example #4
    def test_score_of_unknown_token(self):
        """
        Test that the score of an unknown token is 0.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)
        self.assertFalse(scores.get('unknown'))
Example #5
    def test_max_score(self):
        """
        Test that the maximum score is 1 when normalization is enabled.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)
        self.assertTrue(all(score <= 1 for score in scores.values()))
Example #6
    def test_normalization(self):
        """
        Test that when normalization is disabled, the returned scores are integers.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(2, scores.get('erdogan'))
Example #7
    def test_score_across_multiple_documents(self):
        """
        Test that the score is based on term frequency.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor(tokenizer=tokenizer)
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(3, scores.get('erdogan'))
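
The snippets above are individual methods from unittest test cases, so they cannot be run on their own. Below is a minimal, self-contained scaffold sketching how one of them (Example #7) could be run standalone; the module paths in the imports are assumptions about the library layout and may need adjusting for your installation.

import unittest

# Assumed module paths; adjust them to match the library installed in your environment.
from nlp.document import Document
from nlp.tokenizer import Tokenizer
from apd.extractors.local.token_extractor import TokenExtractor
from apd.scorers.local.tf_scorer import TFScorer


class TestTFScorer(unittest.TestCase):
    """
    A minimal wrapper around the snippet from Example #7.
    """

    def test_score_across_multiple_documents(self):
        """
        Test that the score is based on term frequency.
        """

        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        # Extract candidate tokens from each document and score them by raw term frequency.
        extractor = TokenExtractor(tokenizer=tokenizer)
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(3, scores.get('erdogan'))


if __name__ == '__main__':
    unittest.main()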