Example #1
    def test_get_introduction_only(self):
        """
        Test that when only the introduction is requested, it is returned.
        """

        page = 'Olympique Lyon'
        introduction = text.collect(page, introduction_only=True)
        extract = text.collect(page, introduction_only=False)
        self.assertLess(len(introduction[page]), len(extract[page]))
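The tests above and below assume that `text.collect` returns a dictionary that maps each requested page title to its extract. A minimal usage sketch under that assumption; the import path is itself an assumption, since the tests only show the module as `text`:

# Assumed import path for the wikinterface text module.
from wikinterface import text

page = 'Olympique Lyonnais'
extracts = text.collect(page, introduction_only=True)
print(extracts[page][:100])  # the first 100 characters of the introduction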
Example #2
    def test_collect_none(self):
        """
        Test that when no pages are given, an empty dictionary is returned.
        """

        extracts = text.collect([])
        self.assertFalse(len(extracts))
Example #3
    def _disambiguate(self, pages):
        """
        Disambiguate a candidate by finding the page that is most similar to the domain.
        The function returns that page's title and the associated score.
        Only one page is returned: the one with the highest score.

        :param pages: A list of page titles.
        :type pages: list of str

        :return: A tuple containing the most similar page and its similarity score.
        :rtype: tuple
        """
        """
        Get the first section of each page.
        Then, convert them into documents.
        """
        pages = text.collect(pages, introduction_only=True)
        for page, introduction in pages.items():
            pages[page] = Document(introduction,
                                   self.tokenizer.tokenize(introduction),
                                   scheme=self.scheme)
            pages[page].normalize()
        """
        Rank the page scores in descending order.
        Then, choose the best page and return it alongside its score.
        """
        scores = {
            page: vector_math.cosine(introduction, self.domain)
            for page, introduction in pages.items()
        }
        article, score = sorted(scores.items(),
                                key=lambda score: score[1],
                                reverse=True)[0]
        return (article, score)
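The ranking step at the end of `_disambiguate` can be illustrated in isolation: given a mapping from page titles to similarity scores, the chosen page is simply the one with the highest score. A minimal sketch with made-up scores:

# Made-up similarity scores between candidate pages and the domain.
scores = {'Lyon': 0.12, 'Olympique Lyonnais': 0.78, 'Lyon (disambiguation)': 0.05}

# Equivalent to sorting in descending order and taking the first item.
article, score = max(scores.items(), key=lambda item: item[1])
assert (article, score) == ('Olympique Lyonnais', 0.78)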
Example #4
    def test_redirection(self):
        """
        Test that pages may redirect, but the original page titles are retained.
        """

        page = 'Olympique Lyon'
        extracts = text.collect(page)
        self.assertTrue(page in extracts)
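The behaviour being tested implies that the collector maps redirect targets back to the titles that were originally requested. With the MediaWiki API, a query made with redirect resolution enabled reports a `redirects` list of `from`/`to` pairs; a hedged sketch of applying such a mapping (the data is illustrative, not the library's actual code):

# Extracts keyed by the resolved (target) titles, plus the redirect pairs
# reported by the API; both are illustrative values.
extracts = {'Olympique Lyonnais': 'Olympique Lyonnais is a French football club.'}
redirects = [{'from': 'Olympique Lyon', 'to': 'Olympique Lyonnais'}]

# Re-key each extract under the title that was originally requested.
for redirect in redirects:
    if redirect['to'] in extracts:
        extracts[redirect['from']] = extracts.pop(redirect['to'])

assert 'Olympique Lyon' in extracts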
Example #5
    def test_get_page_with_accent(self):
        """
        Test that pages that contain an accent in their title are retrieved normally.
        """

        page = 'Ciprian Tătărușanu'
        extracts = text.collect(page, introduction_only=True)
        self.assertEqual(1, len(extracts))
        self.assertTrue(page in extracts)
        self.assertGreater(len(extracts[page]), 100)
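Accented titles work as long as page titles are percent-encoded before being placed in the API URL, which the collector presumably does internally. A minimal sketch of that encoding step with the standard library:

from urllib.parse import quote

page = 'Ciprian Tătărușanu'
print(quote(page))  # Ciprian%20T%C4%83t%C4%83ru%C8%99anu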
Example #6
    def test_get_multiple_introductions(self):
        """
        Test that when multiple introductions are requested, they are all returned.
        """

        pages = ['Olympique Lyonnais', 'Borussia Dortmund']
        extracts = text.collect(pages, introduction_only=True)
        self.assertEqual(2, len(extracts))
        self.assertEqual(set(pages), set(list(extracts.keys())))
        self.assertTrue(all(len(text) for text in extracts.values()))
Example #7
    def test_get_long_list(self):
        """
        Test that when getting a long list (longer than the stagger value), all pages are retrieved.
        """

        pages = [
            'Anthony Lopes', 'Mapou Yanga-Mbiwa', 'Joachim Andersen', 'Rafael',
            'Jason Denayer', 'Marcelo', 'Martin Terrier', 'Houssem Aouar',
            'Moussa Dembélé', 'Bertrand Traoré', 'Memphis Depay',
            'Thiago Mendes', 'Léo Dubois', 'Oumar Solet',
            'Jeff Reine-Adélaïde', 'Rayan Cherki', 'Bruno Guimarães',
            'Amine Gouiri', 'Marçal', 'Karl Toko Ekambi', 'Jean Lucas',
            'Kenny Tete', 'Maxence Caqueret', 'Camilo Reijers de Oliveira',
            'Maxwel Cornet', 'Youssouf Koné', 'Lucas Tousart',
            'Ciprian Tătărușanu', 'Boubacar Fofana'
        ]

        extracts = text.collect(pages, introduction_only=True)
        self.assertEqual(len(pages), len(extracts))
        self.assertEqual(set(pages), set(list(extracts.keys())))
        self.assertTrue(all(len(text) > 100 for text in extracts.values()))
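The docstring's stagger value suggests that long lists are split into batches, each fetched separately, with the results merged. A minimal sketch of that batching pattern; `collect_batch` and the stagger size of 20 are hypothetical:

def collect_staggered(pages, collect_batch, stagger=20):
    """Collect the pages in batches of at most `stagger` titles and merge the results."""
    extracts = {}
    for i in range(0, len(pages), stagger):
        extracts.update(collect_batch(pages[i:i + stagger]))
    return extracts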
Example #8
    def _add_to_graph(self, graph, outgoing_links, threshold=0):
        """
        Add the links to the graph.
        The function fetches the text of each article and uses it to add nodes and weighted edges to the graph.

        .. note::

            The weight of an edge is `1 - similarity`:
            the higher the similarity, the lower the weight,
            so more shortest paths pass through that edge.

        :param graph: The graph to which to add the new nodes and edges.
        :type graph: :class:`~nx.Graph`
        :param outgoing_links: The dictionary of links.
                               The keys should be the source articles.
                               The values should be the outgoing links from these articles.
        :type outgoing_links: dict
        :param threshold: The minimum similarity between the source and target articles to add an edge between them.
        :type threshold: float
        """
        """
        Get the text from all articles.
        """
        sources = list(outgoing_links.keys())
        targets = [
            link for link_set in outgoing_links.values() for link in link_set
        ]
        articles = text.collect(sources + targets, introduction_only=True)
        """
        Convert each article into a document.
        Each document is based only on the article's first sentence.
        """
        documents = {}
        for title, introduction in articles.items():
            introduction = self._remove_brackets(introduction)
            introduction = self._get_first_sentence(introduction)
            document = Document(introduction,
                                self.tokenizer.tokenize(introduction),
                                scheme=self.scheme)
            document.normalize()
            documents[title] = document
        """
        Add the nodes first, and then the edges to the graph.
        This is done by going through all the outgoing links.
        If a link has a collected document, the similarity between the source article and that link is computed.
        If the similarity exceeds the threshold, an edge is added between the two.
        """
        for source, targets in outgoing_links.items():
            if source not in documents:
                continue

            if source not in graph.nodes:
                graph.add_node(source, document=documents[source])

            for target in targets:
                if target not in documents:
                    continue

                if target not in graph.nodes:
                    graph.add_node(target, document=documents[target])

                if source in documents and target in documents:
                    similarity = vector_math.cosine(documents[source],
                                                    documents[target])
                    if similarity > threshold:
                        graph.add_edge(source, target, weight=(1 - similarity))
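The note on edge weights can be illustrated with networkx: since an edge's weight is `1 - similarity`, a more similar pair of articles gets a cheaper edge, so weighted shortest paths prefer routes through similar articles. A small sketch with made-up similarities:

import networkx as nx

graph = nx.Graph()
graph.add_edge('A', 'B', weight=1 - 0.9)  # very similar articles: cheap edge
graph.add_edge('A', 'C', weight=1 - 0.2)  # dissimilar articles: expensive edge
graph.add_edge('C', 'B', weight=1 - 0.2)

# The direct edge (weight 0.1) beats the detour through 'C' (total weight 1.6).
print(nx.shortest_path(graph, 'A', 'B', weight='weight'))  # ['A', 'B']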
Example #9
    def resolve(self, candidates, *args, **kwargs):
        """
        Resolve the given candidates.
        They are sorted according to their score.

        :param candidates: The candidates to resolve.
        :type candidates: list

        :return: A tuple containing the resolved and unresolved candidates respectively.
        :rtype: tuple of lists
        """

        resolved_candidates, unresolved_candidates = [], []
        """
        Get the possible pages for each candidate.
        From each of these pages, remove the brackets because this information is secondary.
        If there are years outside the brackets, then the page can be excluded.
        Most often, pages with years in them are not entities.
        Unfortunately, exceptions exist, such as with the name `TSG 1899 Hoffenheim`.
        """
        candidates = sorted(candidates.keys(),
                            key=lambda candidate: candidates.get(candidate),
                            reverse=True)
        for candidate in candidates:
            """
            The page name is retained as-is when checking for the year.
            If a page has brackets in its title, they are kept;
            they are only removed temporarily to check whether the non-bracketed part contains a year.
            In this way, the pages and their text can still be collected under their original titles.
            """
            pages = search.collect(candidate, limit=5)
            pages = [
                page for page in pages
                if not self._has_year(self._remove_brackets(page))
            ]
            """
            Fetch the page types.
            Disambiguation, list or missing pages are removed altogether.
            If any pages remain at this point, get their text and score the pages based on relevance to the corpus.
            """
            types = info.types(pages)
            pages = [
                page for page, type in types.items()
                if type is info.ArticleType.NORMAL
            ]
            if len(pages):
                articles = text.collect(pages, introduction_only=True)
                candidate_document = Document(
                    candidate,
                    self.tokenizer.tokenize(candidate),
                    scheme=self.scheme)
                """
                To calculate the score, bracketed text is removed since it does not convey important information.
                Tokens that are part of the candidate name are removed from the sentence.
                """
                scores = {}
                for page, introduction in articles.items():
                    introduction = self._remove_brackets(introduction)
                    sentence = self._get_first_sentence(introduction)
                    tokens = self.tokenizer.tokenize(sentence)
                    tokens = [
                        token for token in tokens
                        if token not in candidate_document.dimensions
                    ]
                    sentence_document = Document(introduction,
                                                 tokens,
                                                 scheme=self.scheme)

                    title_document = Document(page,
                                              self.tokenizer.tokenize(page),
                                              scheme=self.scheme)
                    scores[page] = self._compute_score(candidate_document,
                                                       title_document,
                                                       sentence_document)
                """
                Get the most relevant article.
                If it exceeds the threshold, then the candidate is resolved to that article.
                If it fails to exceed the threshold, the candidate is added to the unresolved candidates.
                """
                article, score = sorted(scores.items(),
                                        key=lambda score: score[1],
                                        reverse=True)[0]
                if score >= self.threshold and article not in resolved_candidates:
                    resolved_candidates.append(article)
                    continue

            unresolved_candidates.append(candidate)

        return (resolved_candidates, unresolved_candidates)
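The final resolution step can be illustrated on its own: a candidate is resolved to its best-scoring article only if that score meets the threshold; otherwise the candidate stays unresolved. A minimal sketch with made-up scores and a threshold of 0.5:

threshold = 0.5
best = {'Memphis Depay': ('Memphis Depay', 0.82),  # best article and score per candidate
        'the keeper': ('Goalkeeper', 0.31)}        # made-up values

resolved, unresolved = [], []
for candidate, (article, score) in best.items():
    if score >= threshold and article not in resolved:
        resolved.append(article)
    else:
        unresolved.append(candidate)

print(resolved)    # ['Memphis Depay']
print(unresolved)  # ['the keeper']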