def test_build_wikigraph() -> None:
    """Test build_wikigraph for the correct vertex names, correct number of vertices, and correct
    neighbours for each vertices
    """
    wikigraph = build_wikigraph('https://en.wikipedia.org/wiki/Cade_(horse)',
                                6, 2)

    cade_neighbours_actual = wikigraph.get_neighbours('Cade (horse)')
    cade_neighbours_expected = {'Godolphin Arabian', 'Bald Galloway'}

    assert cade_neighbours_actual == cade_neighbours_expected

    expected_graph = WikiGraph()

    expected_graph.add_vertex('Cade (horse)',
                              'https://en.wikipedia.org/wiki/Cade_(horse)')
    expected_graph.add_vertex(
        'Godolphin Arabian', 'https://en.wikipedia.org/wiki/Godolphin_Arabian')
    expected_graph.add_vertex('Bald Galloway',
                              'https://en.wikipedia.org/wiki/Bald_Galloway')
    expected_graph.add_vertex('George Stubbs',
                              'https://en.wikipedia.org/wiki/George_Stubbs')
    expected_graph.add_vertex(
        'Francis Godolphin, 2nd Earl of Godolphin',
        'https://en.wikipedia.org/wiki/'
        'Francis_Godolphin,_2nd_Earl_of_Godolphin')
    expected_graph.add_vertex(
        'Stallion (horse)', 'https://en.wikipedia.org/wiki/Stallion_(horse)')
    expected_graph.add_vertex(
        'Kingdom of Great Britain',
        'https://en.wikipedia.org/wiki/Kingdom_of_Great_Britain')

    expected_graph.add_edge('Cade (horse)', 'Godolphin Arabian')
    expected_graph.add_edge('Cade (horse)', 'Bald Galloway')
    expected_graph.add_edge('Godolphin Arabian', 'George Stubbs')
    expected_graph.add_edge('Godolphin Arabian',
                            'Francis Godolphin, 2nd Earl of Godolphin')
    expected_graph.add_edge('Bald Galloway', 'Stallion (horse)')
    expected_graph.add_edge('Bald Galloway', 'Kingdom of Great Britain')

    assert expected_graph.get_all_vertices() == wikigraph.get_all_vertices()
    for vertex in expected_graph.get_all_vertices():
        assert expected_graph.get_neighbours(vertex) == \
            wikigraph.get_neighbours(vertex)
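The test above relies on a WikiGraph class that is not shown in this snippet. A minimal sketch of the interface it exercises; every method name and behaviour here is inferred from the calls in the test, not taken from the real implementation:

class WikiGraph:
    """Sketch (assumed) of the graph interface the tests exercise."""

    def __init__(self) -> None:
        # vertex name -> (article URL, set of neighbouring vertex names)
        self._vertices = {}

    def add_vertex(self, name: str, url: str) -> None:
        if name not in self._vertices:
            self._vertices[name] = (url, set())

    def add_edge(self, name1: str, name2: str) -> None:
        # undirected, as implied by the symmetric neighbour assertions
        self._vertices[name1][1].add(name2)
        self._vertices[name2][1].add(name1)

    def get_neighbours(self, name: str) -> set:
        return set(self._vertices[name][1])

    def get_all_vertices(self) -> set:
        return set(self._vertices)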
Example #2
    def test_find_path(self):
        '''Basic test to see if a short path between two pages can be found.'''
        wiki_graph = WikiGraph()
        start, end = ('Tom Hanks', 'Will I Am')
        path = wiki_graph.find_path(start, end)

        self.assertTrue(path, "Path not found")
        self.assertEqual(path.start, start)
        self.assertEqual(path.end, end)
        self.assertTrue(len(path) < 6, "Path too long len=%d" % len(path))
Example #3
def build_wikigraph(starting_url: str, num_sources: int,
                    sources_per_page: int) -> WikiGraph:
    """ Return a Graph with all the sources and the <starting_url> as its vertex.

    Find <num_sources> number of sources from the <starting_url> Wikipedia article.

    If one wikipedia article contains the link to another wikipedia article,
    then they are adjacent.

    NOTE: This function may find fewer than <num_sources> sources in some cases,
    since Wikipedia may have deleted pages without updating the links that point
    to them, or there simply aren't that many links surrounding <starting_url>.

    (Implemented with a breadth-first search.)
    """
    # queue of article URLs whose links still need to be expanded
    q = _Queue()

    curr_url = starting_url

    # ACCUMULATOR visited keeps track of the vertices we have already visited to make
    # sure we don't enter an infinite loop
    visited = []

    # ACCUMULATOR wiki_graph_so_far builds up our wikigraph
    wiki_graph_so_far = WikiGraph()

    # ACCUMULATOR sources_found keeps track of the number of sources found
    sources_found = 0

    # Add initial article to queue, visited, and our wikigraph
    q.enqueue(curr_url)
    visited.append(curr_url)
    curr_name = get_title(curr_url)
    wiki_graph_so_far.add_vertex(curr_name, curr_url)

    # stop when the queue is empty or when we have found the desired
    # number of sources
    while not (q.is_empty() or sources_found >= num_sources):

        # Reassign curr_url to the next item in the queue
        curr_url = q.dequeue()
        curr_name = get_title(curr_url)

        # find the neighbouring links on the article for curr_url
        neighbours = get_adjacent_urls(curr_url)

        bfs_objects = (visited, q)
        sources_info = (sources_found, num_sources, sources_per_page)
        new_sources_found = _update_wikigraph(neighbours, bfs_objects,
                                              sources_info, wiki_graph_so_far,
                                              curr_name)
        sources_found += new_sources_found

    return wiki_graph_so_far
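build_wikigraph delegates the per-page bookkeeping to a _update_wikigraph helper that is not shown. A sketch of what that helper plausibly does, reconstructed from the call site above; the tuple unpacking, the return value, and the assumption that get_adjacent_urls returns a mapping of article titles to URLs are all guesses, not the original code:

def _update_wikigraph(neighbours: dict, bfs_objects: tuple, sources_info: tuple,
                      wiki_graph_so_far: WikiGraph, curr_name: str) -> int:
    """Sketch (assumed): add up to <sources_per_page> unvisited neighbours of
    curr_name to the graph and the queue; return how many were added."""
    visited, q = bfs_objects
    sources_found, num_sources, sources_per_page = sources_info
    added = 0
    for name, url in neighbours.items():
        if added >= sources_per_page or sources_found + added >= num_sources:
            break
        if url not in visited:
            visited.append(url)
            q.enqueue(url)
            wiki_graph_so_far.add_vertex(name, url)
            wiki_graph_so_far.add_edge(curr_name, name)
            added += 1
    return added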
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="Find a path between two Wikipedia pages via their links.")
    parser.add_argument("--start",
                        help="Title of valid wikipedia page to start from.",
                        type=str,
                        required=True)
    parser.add_argument("--end",
                        help="Title of valid wikipedia page to reach.",
                        type=str,
                        required=True)
    args = parser.parse_args()

    wiki_graph = WikiGraph()
    print("Searching:  '%s' -> '%s'" % (args.start, args.end))
    path = wiki_graph.find_path(args.start, args.end)
    if path:
        print(path.info)
    else:
        print("Failed Search.")
Example #5
    def test_find_path_benchmark(self):
        wiki_graph = WikiGraph(print_requests=True)
        total_requests = 0
        total_time = 0
        failures = []
        # Loop through and test whether paths exist
        for page in samplepages:
            (start, end) = (page, "Homunculus")
            path = wiki_graph.find_path(start, end)
            if path.degree == -1:
                failures.append(path)
            total_requests += path.requests
            total_time += path.time

        print("Total Failures:", len(failures))
        print(failures)
        print("Total requests:", total_requests)
        print("Avg number of requests per path: %.2f" %
              (total_requests / len(top10pages)))
        print("Total time: ", total_time)
        print("Avg time per path: %.2f" % (total_time / len(top10pages)))
Example #6
def demo_wikigraph():
    wg = WikiGraph()
    wg.set_graph(DEMO_GRAPH_NODE_CHILDREN)
    return wg
Example #7
def demo_wikigraph_nm():
    wg = WikiGraph(optimize_memory=False)
    wg.set_graph(DEMO_GRAPH_NODE_CHILDREN)
    return wg
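DEMO_GRAPH_NODE_CHILDREN is not defined in this snippet. Given its name and the set_graph call, it is plausibly a prebuilt node-to-children adjacency mapping; a hypothetical stand-in for experimentation:

DEMO_GRAPH_NODE_CHILDREN = {
    'A': ['B', 'C'],   # hypothetical demo data, not the original constant
    'B': ['D'],
    'C': ['D'],
    'D': [],
}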
Example #8
def main():
    # initialise args from cli
    parser = argparse.ArgumentParser(
        description="For a given sample of articles, find a path from each to"
        " a central end article. Write the output to a given csv file.")
    parser.add_argument("-o",
                        "--outfile",
                        help="Filename to save the results to.",
                        type=str,
                        default="wikiresults.json")
    parser.add_argument("-x",
                        "--center",
                        help="Title of valid wiki page to center all nodes on",
                        type=str,
                        default="Homunculus")
    parser.add_argument("-k",
                        "--sample_size",
                        help="Sample size of k pages to search from. "
                        "(Only applies when sample source is not given)",
                        type=int,
                        default=1)
    parser.add_argument(
        "-s",
        "--sample_source",
        help="Filename containing newline delimited list of valid "
        "wiki article titles if not specified sample defaults "
        "to random selection from wikimedia api. ",
        type=str)
    parser.add_argument("-v",
                        action='store_true',
                        help="add to display titles of page requests made.")
    args = parser.parse_args()

    wiki_graph = WikiGraph(print_requests=args.v)

    # choose the search sample: load it from a file if given, else pick at random
    if args.sample_source:
        sample = load_sample(args.sample_source)
        size = len(sample)
    else:
        sample = wiki_graph.random_sample(args.sample_size)
        size = args.sample_size

    with open(args.outfile, mode='w') as outfile:
        writer = csv.DictWriter(outfile, ["start", "end", "path", "degree"])
        writer.writeheader()

        total_time = datetime.now()
        total_requests = 0

        for i, page in enumerate(sample):
            print("%d/%d Searching: '%s' -> '%s'" %
                  (i + 1, size, page, args.center))
            path = wiki_graph.find_path(page, args.center)
            print(path.info)
            writer.writerow(path.data)
            total_requests += path.requests

        total_time = (datetime.now() - total_time).total_seconds()
        print("Finished Totals: "
              "N={}. Time={}. Requests={}.".format(args.sample_size,
                                                   total_time, total_requests))