Example #1
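These snippets are test methods lifted from their classes, so the imports and test-case scaffolding are not shown. A plausible preamble is sketched below; the module paths are assumptions, not taken from the source, and should be adjusted to the actual package layout.

    import json
    import os
    import unittest

    from nltk.corpus import stopwords

    # Assumed module paths; the real package layout may differ.
    from nlp.document import Document
    from nlp.tokenizer import Tokenizer
    from nlp.term_weighting import TF
    from apd.extractors import EntityExtractor
    from apd.scorers import TFScorer
    from apd.filters import ThresholdFilter
    from apd.resolvers import WikipediaNameResolver, WikipediaSearchResolver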
    def test_extract_from_text(self):
        """
        Test that the named entities returned by the entity extractor appear in the corresponding tweet.
        """
        """
        Load the corpus.
        """
        filename = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                '..', 'tests', 'corpora', 'understanding',
                                'CRYCHE-100.json')
        corpus = []
        with open(filename) as f:
            for line in f:
                tweet = json.loads(line)

                # Unwrap retweets so the text comes from the original tweet.
                while "retweeted_status" in tweet:
                    tweet = tweet["retweeted_status"]

                # Extended tweets carry the untruncated text in 'full_text'.
                if "extended_tweet" in tweet:
                    text = tweet["extended_tweet"].get("full_text",
                                                       tweet.get("text", ""))
                else:
                    text = tweet.get("text", "")

                document = Document(text)
                corpus.append(document)

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        for document, candidate_set in zip(corpus, candidates):
            text = document.text.lower().replace('\n', ' ').replace('  ', ' ')
            self.assertTrue(
                all(candidate in text for candidate in candidate_set))
Example #2
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        threshold_filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = threshold_filter.filter(scores)
        self.assertIn('erdogan', scores)
        self.assertNotIn('damascus', scores)
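The scores in this test follow from simple document-frequency arithmetic: 'erdogan' appears in both posts (2/2 = 1.0) while 'damascus' appears in one (1/2 = 0.5), so the 0.75 threshold retains only the former. A minimal sketch of both components, consistent with these assertions but hypothetical in its details (whether the real filter's comparison is strict, for instance, is not visible here; counting per-document presence and counting raw term occurrences give the same numbers in this corpus, since each name appears at most once per post):

    def tf_score(candidates):
        # Score each candidate by the fraction of documents that mention it.
        counts = {}
        for candidate_set in candidates:
            for candidate in set(candidate_set):
                counts[candidate] = counts.get(candidate, 0) + 1
        return {candidate: count / len(candidates)
                for candidate, count in counts.items()}

    def threshold_filter(scores, threshold):
        # Retain candidates whose score meets the threshold; a threshold of
        # zero retains everything, as the next example asserts.
        return {candidate: score for candidate, score in scores.items()
                if score >= threshold}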
Example #3
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        threshold_filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = threshold_filter.filter(scores)
        self.assertIn('erdogan', scores)
        self.assertIn('damascus', scores)
Example #4
    def test_empty_corpus(self):
        """
        Test the entity extractor with an empty corpus.
        """

        extractor = EntityExtractor()
        candidates = extractor.extract([])
        self.assertFalse(len(candidates))
Example #5
    def test_named_entity_at_start(self):
        """
        Test that the entity extractor is capable of extracting named entities at the start of a sentence.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Liverpool falter again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertTrue("liverpool" in set(candidates[0]))
Example #6
    def test_named_entity_at_end(self):
        """
        Test that the entity extractor is capable of extracting named entities at the end of a sentence.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Spiral continues for Lyon",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertTrue("lyon" in set(candidates[0]))
Example #7
    def test_multiple_sentences(self):
        """
        Test that the entity extractor is capable of extracting named entities from multiple sentences.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "The downward spiral continues for Lyon. Bruno Genesio under threat.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertEqual(set(["lyon", "bruno genesio"]), set(candidates[0]))
Example #8
    def test_repeated_named_entities(self):
        """
        Test that the entity extractor does not filter named entities that appear multiple times.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "The downward spiral continues for Lyon. Lyon coach Bruno Genesio under threat.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertEqual(set(["lyon", "bruno genesio"]), set(candidates[0]))
Example #9
    def test_entity_extractor(self):
        """
        Test the entity extractor with normal input.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Liverpool falter against Tottenham Hotspur",
            "Mourinho under pressure as Tottenham follow with a loss",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertEqual(set(["liverpool", "tottenham hotspur"]),
                         set(candidates[0]))
        self.assertEqual(set(["mourinho", "tottenham"]), set(candidates[1]))
Example #10
    def test_comma_separated_entities(self):
        """
        Test that comma-separated named entities are returned individually.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Memphis Depay, Oumar Solet, Leo Dubois and Youssouf Kone all out injured",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertEqual(
            set(["memphis depay", "oumar solet", "leo dubois", "youssouf kone"]),
            set(candidates[0]))
Example #11
    def test_return_length(self):
        """
        Test that the entity extractor returns as many token sets as the number of documents given.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Liverpool falter against Tottenham Hotspur",
            "",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        self.assertEqual(2, len(candidates))
        self.assertEqual(set(["liverpool", "tottenham hotspur"]),
                         set(candidates[0]))
        self.assertEqual(set([]), set(candidates[1]))
Example #12
    def test_low_threshold(self):
        """
        Test that a low but non-zero threshold excludes some ambiguous candidates.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "Memphis mum about his future at Lyon after the Dutch footballer wins it for the Ligue 1 team",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0.4, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertIn('Memphis', unresolved)
Example #13
    def test_all_resolved_or_unresolved(self):
        """
        Test that the resolver accounts for every named entity, either resolving it or leaving it unresolved.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Burnley",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertEqual(len(scores), len(resolved + unresolved))
Example #14
    def test_all_resolved_or_unresolved(self):
        """
        Test that the resolver accounts for every named entity, either resolving it or leaving it unresolved.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
            "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaSearchResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertEqual(len(scores), len(resolved + unresolved))
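Both variants of this test (one against WikipediaNameResolver, one against WikipediaSearchResolver) check the same invariant: resolve() partitions its input, so the two returned lists together account for every scored candidate exactly once. Stated as a standalone check (a hypothetical helper, not part of the library):

    def is_partition(scores, resolved, unresolved):
        # The resolver may drop nothing and duplicate nothing.
        return len(resolved) + len(unresolved) == len(scores)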
Example #15
    def test_sorting(self):
        """
        Test that the resolver sorts the named entities in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=True, case_fold=True)
        posts = [
            "In the most heated football match of the season, Liverpool falter against Manchester City",
            "Liverpool unable to avoid defeat to Watford, Manchester City close in on football title"
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = WikipediaSearchResolver(TF(), tokenizer, 0,
                                                       corpus).resolve(scores)
        self.assertEqual('Liverpool F.C.', resolved[0])
        self.assertEqual('Manchester City F.C.', resolved[1])
        self.assertEqual('Watford F.C.', resolved[2])
Example #16
    def test_wikipedia_name_resolver(self):
        """
        Test the Wikipedia name resolver.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Burnley",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)

        self.assertIn('manchester united', resolved)
        self.assertIn('burnley', resolved)
Example #17
    def test_sorting_ambiguous(self):
        """
        Test that the resolver sorts the named entities in descending order of score, placing ambiguous candidates at the end.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Manchester City",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = WikipediaNameResolver(TF(), tokenizer, 0,
                                                     corpus).resolve(scores)
        self.assertEqual('manchester united', resolved[0])
        self.assertEqual('manchester city', resolved[1])
        self.assertEqual('tottenham', resolved[2])
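One way to reproduce this ordering is to resolve candidates in descending order of score and defer any name that hits a disambiguation page until after the unambiguous ones. The sketch below is hypothetical: it uses the third-party wikipedia package rather than whatever client the real WikipediaNameResolver wraps, and the library's actual ambiguity handling may differ.

    import wikipedia

    def resolve_by_name(scores):
        # Process higher-scored candidates first.
        resolved, ambiguous, unresolved = [], [], []
        for candidate in sorted(scores, key=scores.get, reverse=True):
            try:
                # Look for a page whose title matches the name exactly.
                wikipedia.page(candidate, auto_suggest=False)
                resolved.append(candidate)
            except wikipedia.DisambiguationError:
                ambiguous.append(candidate)   # resolvable, but deprioritised
            except wikipedia.PageError:
                unresolved.append(candidate)  # no page with this exact name
        return resolved + ambiguous, unresolved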
Example #18
    def test_high_threshold(self):
        """
        Test that when the threshold is high, it excludes all candidates.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
            "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaSearchResolver(TF(), tokenizer, 1, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertFalse(len(resolved))
        self.assertEqual(set(scores.keys()), set(unresolved))
Example #19
    def test_binary_named_entities(self):
        """
        Test that the entity extractor considers the entity type when the binary option is turned off, and ignores it when the option is turned on.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "The downward spiral continues for Lyon. Rudi Garcia under threat.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor(binary=False)
        candidates = extractor.extract(corpus)
        # With binary=False, 'Rudi' and 'Garcia' mistakenly get different types.
        self.assertEqual(set(["lyon", "rudi", "garcia"]), set(candidates[0]))

        extractor = EntityExtractor(binary=True)
        candidates = extractor.extract(corpus)
        self.assertEqual(set(["lyon", "rudi garcia"]), set(candidates[0]))