Code example #1
    def test_get_no_links(self):
        """
        Test that when no links are requested, nothing is returned.
        """

        self.assertEqual({}, links.collect([]))
        self.assertEqual([], links.collect([], separate=False))
Code example #2
    def test_get_links_with_redirects_same_as_redirects(self):
        """
        Test that the links collected through a redirect are the same as the links collected from the target page.
        """

        redirect_articles = links.collect('Olympique Lyon', separate=False)
        articles = links.collect('Olympique Lyonnais', separate=False)
        self.assertEqual(set(articles), set(redirect_articles))
Code example #3
    def test_links_introduction_less_than_whole(self):
        """
        Test that when getting the links from the introduction, there are fewer links than when fetching from the whole article.
        """

        intro_articles = links.collect('Olympique Lyonnais',
                                       separate=False,
                                       introduction_only=True)
        articles = links.collect('Olympique Lyonnais',
                                 separate=False,
                                 introduction_only=False)
        self.assertLess(len(intro_articles), len(articles))
Code example #4
    def test_collective_links_same_as_individual(self):
        """
        Test that getting links from pages individually results in the same links as when fetched collectively.
        """

        results = []
        pages = [
            'Orléans', 'Claude Allègre', 'Côté', 'Māori naming customs',
            'García (surname)'
        ]
        for page in pages:
            results = results + links.collect(page, separate=False)

        self.assertEqual(set(results),
                         set(links.collect(pages, separate=False)))
Code example #5
    def test_links_introduction_subset_whole(self):
        """
        Test that when getting links from the introduction, the links are a subset of the links in the whole article.
        """

        intro_articles = links.collect('Olympique Lyonnais',
                                       separate=False,
                                       introduction_only=True)
        articles = links.collect('Olympique Lyonnais',
                                 separate=False,
                                 introduction_only=False)
        articles = [article.lower() for article in articles]
        self.assertTrue(
            all(article.lower() in articles for article in intro_articles))
Code example #6
    def test_get_many_links(self):
        """
        Test that when many links are requested, all of them are returned.
        """

        pages = [
            'Anthony Lopes', 'Mapou Yanga-Mbiwa', 'Joachim Andersen', 'Rafael',
            'Jason Denayer', 'Marcelo', 'Martin Terrier', 'Houssem Aouar',
            'Moussa Dembélé', 'Bertrand Traoré', 'Memphis Depay',
            'Thiago Mendes', 'Léo Dubois', 'Oumar Solet',
            'Jeff Reine-Adélaïde', 'Rayan Cherki', 'Bruno Guimarães',
            'Amine Gouiri', 'Marçal', 'Karl Toko Ekambi', 'Jean Lucas',
            'Kenny Tete', 'Maxence Caqueret', 'Camilo Reijers de Oliveira',
            'Maxwel Cornet', 'Youssouf Koné', 'Lucas Tousart',
            'Ciprian Tătărușanu', 'Boubacar Fofana', 'Roman Bürki',
            'Dan-Axel Zagadou', 'Achraf Hakimi', 'Thomas Delaney',
            'Jadon Sancho', 'Mahmoud Dahoud', 'Mario Götze', 'Marco Reus',
            'Raphaël Guerreiro', 'Nico Schulz', 'Mats Hummels',
            'Manuel Akanji', 'Erling Braut Håland', 'Leonardo Balerdi',
            'Julian Brandt', 'Mateu Morey', 'Thorgan Hazard', 'Marwin Hitz',
            'Łukasz Piszczek', 'Emre Can', 'Axel Witsel', 'Marcel Schmelzer',
            'Giovanni Reyna', 'Eric Oelschlägel'
        ]

        articles = links.collect(pages, separate=True)
        self.assertEqual(set(pages), set(articles.keys()))
        self.assertTrue(all(len(link_set) for link_set in articles.values()))
Code example #7
    def test_get_links_with_redirects(self):
        """
        Test that when getting links from a page that redirects, the collection follows the redirect and keys the results by the requested title.
        """

        page = 'Olympique Lyon'
        articles = links.collect(page, separate=True)
        self.assertTrue(page in articles)
Code example #8
    def test_recursive_links_not_separated(self):
        """
        Test that getting recursive links without separating them works.
        """

        page = 'Olympique Lyonnais'
        articles = links.collect_recursive(page, 1, separate=False)
        self.assertEqual(list, type(articles))
        self.assertEqual(set(articles), set(links.collect(page,
                                                          separate=False)))
Code example #9
    def test_recursive_links_level_one(self):
        """
        Test that recursive collection with a level of 1 returns the same results as normal link collection.
        """

        page = 'Olympique Lyonnais'
        articles = links.collect_recursive(page, 1, separate=False)
        self.assertEqual(set(articles), set(links.collect(page,
                                                          separate=False)))
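
Examples #8 and #9 together pin down one property of collect_recursive: with a level of 1 it behaves exactly like a plain collect call. The snippet below is a minimal sketch of the implied semantics, assuming only what the two tests show; the function name and the de-duplication strategy are illustrative, not the library's actual implementation.

def collect_recursive_sketch(page, level):
    """
    A sketch: collect links from ``page``, then from the articles it
    links to, repeating ``level`` times. With ``level=1`` this reduces
    to a plain ``links.collect(page, separate=False)`` call, which is
    exactly what the two tests above assert.
    """

    found = links.collect(page, separate=False)
    for _ in range(level - 1):
        # ``links.collect`` accepts a list of pages (see example #4),
        # so the newly-found articles can be expanded in one call.
        found = list(set(found) | set(links.collect(found, separate=False)))
    return found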
Code example #10
    def test_links_introduction_only(self):
        """
        Test getting links from the introduction only.
        """

        page = 'Olympique Lyon'
        articles = links.collect(page, separate=True, introduction_only=True)
        self.assertTrue(page in articles)
        self.assertTrue(len(articles.get(page)))
Code example #11
    def test_recursive_links_separated(self):
        """
        Test that getting recursive links and separating them by article works.
        """

        page = 'Olympique Lyonnais'
        articles = links.collect_recursive(page, 1, separate=True)
        self.assertTrue(page in articles)
        self.assertEqual(set(articles.get(page)),
                         set(links.collect(page, separate=True).get(page)))
Code example #12
    def test_get_unseparated_links(self):
        """
        Test that when the links are requested to be unseparated, a list is returned.
        """

        page = 'Olympique Lyonnais'
        articles = links.collect(page, separate=False)
        self.assertEqual(list, type(articles))
        self.assertTrue('Fernando Marçal' in articles)
        self.assertTrue('AS Saint-Étienne' in articles)
Code example #13
    def test_get_separated_links(self):
        """
        Test that when the links are requested to be separated, a dictionary is returned.
        """

        page = 'Olympique Lyonnais'
        articles = links.collect(page, separate=True)
        self.assertEqual(dict, type(articles))
        self.assertTrue(page in articles)
        self.assertTrue('Fernando Marçal' in articles.get(page))
        self.assertTrue('AS Saint-Étienne' in articles.get(page))
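
Taken together, examples #12 and #13 pin down the return shape of links.collect: a flat list of link titles when separate=False, and a dictionary keyed by the requested page when separate=True. A short summary, with the caveat that the import path is an assumption (the tests only show the links name):

from wikinterface import links  # assumed import path

# Unseparated: one flat list of link titles.
flat = links.collect('Olympique Lyonnais', separate=False)

# Separated: a dictionary mapping each requested page to its links.
grouped = links.collect('Olympique Lyonnais', separate=True)

assert isinstance(flat, list) and isinstance(grouped, dict)
assert 'Olympique Lyonnais' in grouped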
Code example #14
    def test_seed_set_with_accents(self):
        """
        Test that seed sets that include accents return results.
        """

        pages = [
            'Orléans', 'Claude Allègre', 'Côté', 'Māori naming customs',
            'García (surname)'
        ]
        articles = links.collect(pages, separate=True)
        self.assertEqual(set(pages), set(articles.keys()))
        self.assertTrue(all(len(link_set) for link_set in articles.values()))
Code example #15
    def test_link_with_accents(self):
        """
        Test that links that include an accent are still returned.
        """

        page = 'French name'
        articles = links.collect(page, separate=False)
        self.assertTrue('Orléans' in articles)
        self.assertTrue('Claude Allègre' in articles)
        self.assertTrue('Côté' in articles)
        self.assertTrue('Māori naming customs' in articles)
        self.assertTrue('García (surname)' in articles)
Code example #16
    def test_get_separated_links_multiple_pages(self):
        """
        Test that when the links from multiple pages are requested to be separated, a dictionary with multiple keys is returned.
        """

        pages = ['Olympique Lyonnais', 'Michelangelo']
        articles = links.collect(pages, separate=True)
        self.assertEqual(dict, type(articles))
        self.assertEqual(set(pages), set(articles.keys()))
        self.assertTrue('Olympique Lyonnais' in articles)
        self.assertTrue('Michelangelo' in articles)
        self.assertTrue(
            'AS Saint-Étienne' in articles.get('Olympique Lyonnais'))
Code example #17
    def test_get_links_multiple_pages(self):
        """
        Test that when links are requested from multiple pages, all links are returned.
        """

        pages = ['Olympique Lyonnais', 'Michelangelo']
        articles = links.collect(pages, separate=True)
        self.assertEqual(set(pages), set(articles.keys()))
        self.assertTrue('Olympique Lyonnais' in articles)
        self.assertTrue('Michelangelo' in articles)
        self.assertTrue(
            'AS Saint-Étienne' in articles.get('Olympique Lyonnais'))
        self.assertTrue('Leonardo da Vinci' in articles.get('Michelangelo'))
Code example #18
    def resolve(self, candidates, *args, **kwargs):
        """
        Resolve the given candidates.
        The resolved candidates are sorted in descending order of their score.
        However, ambiguous candidates that are resolved through disambiguation appear at the end.

        :param candidates: The candidates to resolve.
                           The candidates should be in the form of a dictionary.
                           The keys should be the candidates, and the values the scores.
        :type candidates: dict

        :return: A tuple containing the resolved and unresolved candidates respectively.
        :rtype: tuple of lists
        """

        resolved_candidates, unresolved_candidates = [], []

        candidates = sorted(candidates, key=candidates.get, reverse=True)
        resolved, unresolved, ambiguous = self._resolve_unambiguous_candidates(
            candidates)
        resolved_candidates.extend(resolved)
        unresolved_candidates.extend(unresolved)
        """
        Get the potential disambiguations of the ambiguous candidates.
        Then, find the best page for each candidate.
        If its similarity with the domain is sufficiently high, the candidate is resolved.
        """
        ambiguous = links.collect(ambiguous, introduction_only=False)
        for candidate, pages in ambiguous.items():
            """
            If there are candidate pages, get the most similar page.
            If the most similar page exceeds the similarity threshold, resolve the candidate to that page.
            Otherwise, the candidate cannot be resolved.
            """
            if len(pages) > 0:
                page, score = self._disambiguate(pages)
                if score >= self.threshold:
                    resolved_candidates.append(page)
                    continue

            unresolved_candidates.append(candidate)

        return (resolved_candidates, unresolved_candidates)
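
A hedged usage sketch of the resolve method above; the resolver class name, its constructor arguments, and the candidate scores are hypothetical placeholders, since the excerpt shows only the method itself:

# Hypothetical setup: a resolver with a similarity threshold and a domain.
resolver = WikipediaResolver(threshold=0.5)

# Candidates map names to scores; higher-scoring candidates are resolved first.
candidates = {'Olympique Lyonnais': 0.9, 'Bordeaux': 0.4, 'Lyon': 0.2}
resolved, unresolved = resolver.resolve(candidates)

# ``resolved`` contains pages accepted directly or through disambiguation;
# ``unresolved`` contains candidates whose best page fell below the threshold.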
Code example #19
    def extrapolate(self, participants, *args, **kwargs):
        """
        Extrapolate the given participants.

        :param participants: The participants to extrapolate.
                             It is assumed that all participants were resolved using a Wikipedia resolver.
                             This means that all participants share their name with a Wikipedia page.
        :type participants: list of str

        :return: The new participants, in descending order of their relevance to the domain.
        :rtype: list of str
        """

        extrapolated = {}
        """
        Create an empty graph.
        This graph will host all resolved participants and candidate participants during extrapolation.
        """
        graph = nx.Graph()
        """
        Get the first-level links.
        Then, filter the links to retain only the most frequent ones (up to ``first_level_links``), excluding links that contain a year or start with 'List of'.
        """
        first_level = links.collect(participants,
                                    introduction_only=False,
                                    separate=True)
        link_frequency = self._link_frequency(first_level)
        link_frequency = {
            link: frequency
            for link, frequency in link_frequency.items()
            if not self._has_year(self._remove_brackets(link))
        }
        link_frequency = {
            link: frequency
            for link, frequency in link_frequency.items()
            if not link.startswith('List of')
        }
        link_frequency = sorted(link_frequency,
                                key=link_frequency.get,
                                reverse=True)
        frequent_links = link_frequency[:self.first_level_links]
        first_level = {
            article: [
                link for link in first_level.get(article)
                if link in frequent_links
            ]
            for article in first_level
        }
        self._add_to_graph(graph,
                           first_level,
                           threshold=self.first_level_similarity)
        """
        Repeat the process a second time.
        This time, the filter identifies the cut-off at the ``second_level_links``-th most frequent link.
        Once more, articles with a year in the title are excluded.
        Articles that have already been seen are not considered.
        """
        second_level = links.collect(frequent_links,
                                     introduction_only=False,
                                     separate=True)
        link_frequency = self._link_frequency(second_level)
        link_frequency = {
            link: frequency
            for link, frequency in link_frequency.items()
            if not self._has_year(self._remove_brackets(link))
        }
        link_frequency = {
            link: frequency
            for link, frequency in link_frequency.items()
            if not link.startswith('List of')
        }
        if link_frequency:
            if len(link_frequency) >= self.second_level_links:
                cutoff = sorted(link_frequency.values(),
                                reverse=True)[self.second_level_links - 1]
            else:
                cutoff = max(link_frequency.values())
            frequent_links = [
                link for link in link_frequency
                if link_frequency.get(link) >= cutoff
            ]
            frequent_links = [
                link for link in frequent_links if link not in graph.nodes
            ]
            second_level = {
                article: [
                    link for link in second_level.get(article)
                    if link in frequent_links
                ]
                for article in second_level
            }
            self._add_to_graph(graph,
                               second_level,
                               threshold=self.second_level_similarity)
        """
        Partition the graph into communities.
        The process is repeated until there are at least as many partitions as the square root of the number of nodes in the graph.
        Nodes from partitions with more than 3 members are considered to be participants.
        The exceptions are:

            #. nodes that are also normal terms
            #. nodes that have a year in the title
        """
        communities = community.girvan_newman(
            graph, most_valuable_edge=self._most_central_edge)
        partitions = list(next(communities))
        while len(partitions) < math.sqrt(len(graph.nodes)):
            partitions = list(next(communities))

        partitions = [
            partition for partition in partitions if len(partition) > 3
        ]
        extrapolated = [node for partition in partitions for node in partition]
        extrapolated = [
            participant for participant in extrapolated
            if participant not in participants
        ]
        extrapolated = [
            participant for participant in extrapolated
            if participant.strip().lower() not in words.words()
        ]
        extrapolated = [
            participant for participant in extrapolated
            if not self._has_year(participant)
        ]
        """
        Calculate a score for each candidate participant, retaining those having a score that exceeds the threshold.
        Moreover, exclude candidates that were provided in the resolved participants.
        Return the candidates in descending order of relevance.
        """
        extrapolated = {
            participant:
            vector_math.cosine(self.domain,
                               graph.nodes[participant]['document'])
            for participant in extrapolated
        }
        extrapolated = {
            participant: score
            for participant, score in extrapolated.items()
            if score >= self.threshold
        }
        extrapolated = sorted(extrapolated,
                              key=extrapolated.get,
                              reverse=True)
        return extrapolated
        return extrapolated
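
The extrapolate method relies on several private helpers that the excerpt does not show. The sketches below are hypothetical reconstructions inferred purely from the call sites above: _link_frequency counts how often each link appears across the collected articles, _has_year checks a title for a four-digit year, and _most_central_edge follows the usual Girvan-Newman pattern of removing the edge with the highest betweenness centrality.

import re
from collections import Counter

import networkx as nx

    def _link_frequency(self, articles):
        """
        Count how often each link appears across all collected articles.
        ``articles`` maps each article to the list of links found in it,
        as returned by ``links.collect`` with ``separate=True``.
        """

        return dict(Counter(link
                            for page_links in articles.values()
                            for link in page_links))

    def _has_year(self, title):
        """
        Check whether the given title contains a four-digit year.
        """

        return bool(re.search(r'\b\d{4}\b', title))

    def _most_central_edge(self, graph):
        """
        Find the edge to remove next: the one with the highest betweenness
        centrality, assuming edges carry a 'weight' attribute.
        """

        centrality = nx.edge_betweenness_centrality(graph, weight='weight')
        return max(centrality, key=centrality.get)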