Exemple #1
0
def _update_wikigraph(neighbours: list[str], bfs_objects: tuple[list, _Queue],
                      sources_info: tuple[int, int, int], wikigraph: WikiGraph,
                      curr_name: str) -> int:
    """Connect the page <curr_name> to its <neighbours> in <wikigraph> and return
    the number of new vertices that were added to <wikigraph> in the process.

    <bfs_objects> is the pair (visited, queue) owned by the caller; both objects
    are mutated in place. <sources_info> is the triple
    (sources found so far, total sources wanted, maximum new sources per page).

    The neighbours are handled in order. Handling stops early as soon as the
    overall number of sources reaches the total wanted, or this page has
    contributed its per-page maximum of new vertices.
    """
    already_found, wanted_total, page_limit = sources_info
    seen_links, pending = bfs_objects

    # Every new vertex counts towards both the per-page limit and the overall
    # total, so one counter is enough for both stopping conditions.
    added = 0

    for link in neighbours:
        if already_found + added >= wanted_total or added >= page_limit:
            break

        title = get_title(link)

        if link not in seen_links:
            # first time this link shows up: schedule it for a later visit
            pending.enqueue(link)
            seen_links.append(link)

            # only vertices that are new to the graph count as found sources
            if not wikigraph.is_vertex_in_graph(title):
                wikigraph.add_vertex(title, link)
                added += 1

        # the edge is added for every handled neighbour, including links that
        # had already been visited
        wikigraph.add_edge(curr_name, title)

    return added
Exemple #2
0
def build_wikigraph(starting_url: str, num_sources: int,
                    sources_per_page: int) -> WikiGraph:
    """Return a WikiGraph grown outwards from the article at <starting_url>.

    The graph always contains a vertex for <starting_url>. The search then tries
    to collect <num_sources> further articles, taking at most <sources_per_page>
    new articles from any single page. An article is adjacent to every article
    it was found to link to.

    NOTE: Fewer than <num_sources> sources may be returned: the search also ends
    when there are no more queued pages to expand (for example when Wikipedia
    links to deleted pages, or there simply are not enough links around
    <starting_url>).

    The pages are expanded in Breadth-First-Search order.
    """
    # pages waiting to be expanded, in the order they were discovered
    pending = _Queue()

    # every link that has been queued so far, so no page is queued twice
    seen_links = [starting_url]

    graph = WikiGraph()

    # seed the search with the starting article
    pending.enqueue(starting_url)
    graph.add_vertex(get_title(starting_url), starting_url)

    # running total of sources added to the graph (the start page is not counted)
    total_found = 0

    # keep expanding while there is a page left and the target is not yet met
    while not pending.is_empty() and total_found < num_sources:
        page_url = pending.dequeue()
        page_name = get_title(page_url)

        total_found += _update_wikigraph(
            get_adjacent_urls(page_url),
            (seen_links, pending),
            (total_found, num_sources, sources_per_page),
            graph,
            page_name,
        )

    return graph
def test_build_wikigraph() -> None:
    """Check that build_wikigraph, started from the 'Cade (horse)' article with
    6 sources and 2 sources per page, yields the expected vertices and the
    expected neighbours for every vertex.

    NOTE(review): the expected values presumably mirror the live Wikipedia pages
    at the time of writing -- confirm they are still current if this fails.
    """
    actual_graph = build_wikigraph('https://en.wikipedia.org/wiki/Cade_(horse)',
                                   6, 2)

    # quick sanity check on the start page before building the full expectation
    assert actual_graph.get_neighbours('Cade (horse)') == {'Godolphin Arabian',
                                                           'Bald Galloway'}

    # (vertex name, article url), in the order the vertices are added
    vertices = [
        ('Cade (horse)', 'https://en.wikipedia.org/wiki/Cade_(horse)'),
        ('Godolphin Arabian', 'https://en.wikipedia.org/wiki/Godolphin_Arabian'),
        ('Bald Galloway', 'https://en.wikipedia.org/wiki/Bald_Galloway'),
        ('George Stubbs', 'https://en.wikipedia.org/wiki/George_Stubbs'),
        ('Francis Godolphin, 2nd Earl of Godolphin',
         'https://en.wikipedia.org/wiki/'
         'Francis_Godolphin,_2nd_Earl_of_Godolphin'),
        ('Stallion (horse)', 'https://en.wikipedia.org/wiki/Stallion_(horse)'),
        ('Kingdom of Great Britain',
         'https://en.wikipedia.org/wiki/Kingdom_of_Great_Britain'),
    ]

    # (page, article linked from that page)
    edges = [
        ('Cade (horse)', 'Godolphin Arabian'),
        ('Cade (horse)', 'Bald Galloway'),
        ('Godolphin Arabian', 'George Stubbs'),
        ('Godolphin Arabian', 'Francis Godolphin, 2nd Earl of Godolphin'),
        ('Bald Galloway', 'Stallion (horse)'),
        ('Bald Galloway', 'Kingdom of Great Britain'),
    ]

    expected_graph = WikiGraph()
    for name, url in vertices:
        expected_graph.add_vertex(name, url)
    for page, linked in edges:
        expected_graph.add_edge(page, linked)

    assert expected_graph.get_all_vertices() == actual_graph.get_all_vertices()

    # every vertex must have exactly the expected neighbours
    for name, _ in vertices:
        assert expected_graph.get_neighbours(name) == actual_graph.get_neighbours(name)