Exemple #1
0
    def test_search_with_space(self):
        """
        Test that a search term containing whitespace is considered as one term.

        The search must return at least one article, and the exact term must be among the returned articles.
        """

        term = 'Ciprian Tătărușanu'
        articles = search.collect(term)
        self.assertTrue(len(articles))
        # assertIn reports the term and the returned articles when the check fails.
        self.assertIn(term, articles)
Exemple #2
0
    def test_search_unknown_term(self):
        """
        Test that searching for an unknown term returns no results.

        The term is a random string of 32 lowercase ASCII letters and digits.
        """

        alphabet = string.ascii_lowercase + string.digits
        characters = [random.choice(alphabet) for _ in range(32)]
        unknown_term = ''.join(characters)
        results = search.collect(unknown_term)
        self.assertFalse(len(results))
Exemple #3
0
    def test_search_with_ampersand(self):
        """
        Test that a search term containing an ampersand is considered as a term and yields results.
        """

        articles = search.collect('calvin & hobbes')
        self.assertTrue(len(articles))
Exemple #4
0
    def test_search_with_accent(self):
        """
        Test that searching for a term with accented characters returns results.
        """

        articles = search.collect('Tătărușanu')
        self.assertTrue(len(articles))
Exemple #5
0
    def test_search_large_limit(self):
        """
        Test that a limit larger than 50 is honoured.

        A search with a limit of 100 is expected to return exactly 100 articles.
        """

        limit = 100
        articles = search.collect('Lyon', limit)
        self.assertEqual(limit, len(articles))
Exemple #6
0
    def test_search_limit(self):
        """
        Test that the search limit is respected.

        A search with a limit of 10 is expected to return exactly 10 articles.
        """

        limit = 10
        articles = search.collect('Lyon', limit)
        self.assertEqual(limit, len(articles))
Exemple #7
0
    def test_search_multiple_terms(self):
        """
        Test that a search with a list of several terms returns results.
        """

        terms = ['Lyon', 'Bordeaux']
        articles = search.collect(terms)
        self.assertTrue(len(articles))
Exemple #8
0
    def test_search_one_term(self):
        """
        Test that a search with a single term returns results.
        """

        articles = search.collect('Lyon')
        self.assertTrue(len(articles))
Exemple #9
0
    def test_search_no_terms(self):
        """
        Test that a search with an empty list of terms returns an empty list of articles.
        """

        no_terms = []
        articles = search.collect(no_terms)
        self.assertEqual([], articles)
Exemple #10
0
    def resolve(self, candidates, *args, **kwargs):
        """
        Resolve the given candidates to articles found through search.

        Candidates are processed in descending order of their score.
        For each candidate, up to five pages are fetched with `search.collect`.
        Pages that have a year outside brackets in their title, and pages that are not of type `info.ArticleType.NORMAL`, are discarded.
        The remaining pages are scored with `_compute_score`, and the best-scoring page resolves the candidate if its score is at least `self.threshold`.
        A candidate whose best page was already resolved by an earlier candidate is reported as unresolved.

        :param candidates: The candidates to resolve.
                           The candidates are the keys, and their scores are the values.
        :type candidates: dict

        :return: A tuple containing the resolved and unresolved candidates respectively.
                 Resolved candidates are returned as article titles, unresolved candidates as the original candidates.
        :rtype: tuple of lists
        """

        resolved_candidates, unresolved_candidates = [], []

        # Process the candidates from the highest-scoring to the lowest-scoring.
        ranked = sorted(candidates.keys(), key=candidates.get, reverse=True)
        for candidate in ranked:
            # Get the possible pages for the candidate.
            # Brackets are removed only temporarily, to check whether the non-bracket part of the title has a year in it.
            # The page title itself is retained as-is so that its information can still be collected.
            # Most often, pages with years in them are not entities, although exceptions exist, such as `TSG 1899 Hoffenheim`.
            pages = [
                page for page in search.collect(candidate, limit=5)
                if not self._has_year(self._remove_brackets(page))
            ]

            # Fetch the page types and keep only normal articles.
            # The lookup is skipped when there are no pages left to look up.
            if pages:
                types = info.types(pages)
                pages = [
                    page for page, page_type in types.items()
                    if page_type is info.ArticleType.NORMAL
                ]

            # Score any remaining pages based on their relevance.
            scores = {}
            if pages:
                articles = text.collect(pages, introduction_only=True)
                candidate_document = Document(
                    candidate,
                    self.tokenizer.tokenize(candidate),
                    scheme=self.scheme)

                for page, introduction in articles.items():
                    # Bracketed text is removed since it does not convey important information.
                    # Tokens that are part of the candidate name are removed from the sentence.
                    introduction = self._remove_brackets(introduction)
                    sentence = self._get_first_sentence(introduction)
                    tokens = [
                        token for token in self.tokenizer.tokenize(sentence)
                        if token not in candidate_document.dimensions
                    ]
                    # The document's text is the bracket-free introduction, but its tokens come from the first sentence only.
                    sentence_document = Document(introduction,
                                                 tokens,
                                                 scheme=self.scheme)
                    title_document = Document(page,
                                              self.tokenizer.tokenize(page),
                                              scheme=self.scheme)
                    scores[page] = self._compute_score(candidate_document,
                                                       title_document,
                                                       sentence_document)

            # Get the most relevant article.
            # No scores exist if no page remained or no text was collected; the candidate is then unresolved.
            # On ties, `max` returns the first best-scoring page, like the stable descending sort it replaces.
            if scores:
                article, score = max(scores.items(), key=lambda item: item[1])
                if score >= self.threshold and article not in resolved_candidates:
                    resolved_candidates.append(article)
                    continue

            unresolved_candidates.append(candidate)

        return (resolved_candidates, unresolved_candidates)