def test_build_wikigraph() -> None:
    """Test build_wikigraph for the correct vertex names, correct number of
    vertices, and correct neighbours for each vertex.
    """
    wikigraph = build_wikigraph('https://en.wikipedia.org/wiki/Cade_(horse)', 6, 2)

    cade_neighbours_actual = wikigraph.get_neighbours('Cade (horse)')
    cade_neighbours_expected = {'Godolphin Arabian', 'Bald Galloway'}

    assert cade_neighbours_actual == cade_neighbours_expected

    expected_graph = WikiGraph()
    expected_graph.add_vertex('Cade (horse)', 'https://en.wikipedia.org/wiki/Cade_(horse)')
    expected_graph.add_vertex(
        'Godolphin Arabian', 'https://en.wikipedia.org/wiki/Godolphin_Arabian')
    expected_graph.add_vertex('Bald Galloway', 'https://en.wikipedia.org/wiki/Bald_Galloway')
    expected_graph.add_vertex('George Stubbs', 'https://en.wikipedia.org/wiki/George_Stubbs')
    expected_graph.add_vertex(
        'Francis Godolphin, 2nd Earl of Godolphin',
        'https://en.wikipedia.org/wiki/Francis_Godolphin,_2nd_Earl_of_Godolphin')
    expected_graph.add_vertex(
        'Stallion (horse)', 'https://en.wikipedia.org/wiki/Stallion_(horse)')
    expected_graph.add_vertex(
        'Kingdom of Great Britain', 'https://en.wikipedia.org/wiki/Kingdom_of_Great_Britain')

    expected_graph.add_edge('Cade (horse)', 'Godolphin Arabian')
    expected_graph.add_edge('Cade (horse)', 'Bald Galloway')
    expected_graph.add_edge('Godolphin Arabian', 'George Stubbs')
    expected_graph.add_edge('Godolphin Arabian', 'Francis Godolphin, 2nd Earl of Godolphin')
    expected_graph.add_edge('Bald Galloway', 'Stallion (horse)')
    expected_graph.add_edge('Bald Galloway', 'Kingdom of Great Britain')

    assert expected_graph.get_all_vertices() == wikigraph.get_all_vertices()
    assert expected_graph.get_neighbours(
        'Cade (horse)') == wikigraph.get_neighbours('Cade (horse)')
    assert expected_graph.get_neighbours(
        'Godolphin Arabian') == wikigraph.get_neighbours('Godolphin Arabian')
    assert expected_graph.get_neighbours(
        'Bald Galloway') == wikigraph.get_neighbours('Bald Galloway')
    assert expected_graph.get_neighbours(
        'George Stubbs') == wikigraph.get_neighbours('George Stubbs')
    assert expected_graph.get_neighbours('Francis Godolphin, 2nd Earl of Godolphin') \
        == wikigraph.get_neighbours('Francis Godolphin, 2nd Earl of Godolphin')
    assert expected_graph.get_neighbours(
        'Stallion (horse)') == wikigraph.get_neighbours('Stallion (horse)')
    assert expected_graph.get_neighbours(
        'Kingdom of Great Britain') == wikigraph.get_neighbours('Kingdom of Great Britain')
def test_find_path(self):
    '''Basic test to see if find_path returns a short path between two pages.'''
    wiki_graph = WikiGraph()
    start, end = ('Tom Hanks', 'Will I Am')
    path = wiki_graph.find_path(start, end)
    self.assertEqual(path.start, start)
    self.assertEqual(path.end, end)
    self.assertTrue(path, "Path not found")
    self.assertTrue(len(path) < 6, "Path too long len=%d" % len(path))
def build_wikigraph(starting_url: str, num_sources: int, sources_per_page: int) -> WikiGraph:
    """Return a WikiGraph with the <starting_url> article and all of its
    sources as vertices.

    Find <num_sources> sources starting from the <starting_url> Wikipedia
    article. If one Wikipedia article contains a link to another, the two
    articles are adjacent.

    NOTE: This function may return fewer than <num_sources> sources in some
    cases, since Wikipedia may have deleted pages without updating the links
    that point to them, or there simply aren't that many links surrounding
    <starting_url>.

    (Implemented with the Breadth-First Search algorithm.)
    """
    # tells us which vertex we should next add to the graph
    q = _Queue()
    curr_url = starting_url

    # ACCUMULATOR visited keeps track of the vertices we have already visited
    # to make sure we don't enter an infinite loop
    visited = []

    # ACCUMULATOR wiki_graph_so_far builds up our wikigraph
    wiki_graph_so_far = WikiGraph()

    # ACCUMULATOR sources_found keeps track of the number of sources found
    sources_found = 0

    # Add the initial article to the queue, visited, and our wikigraph
    q.enqueue(curr_url)
    visited.append(curr_url)
    curr_name = get_title(curr_url)
    wiki_graph_so_far.add_vertex(curr_name, curr_url)

    # we stop either when the queue is empty or when we have found the
    # desired number of sources
    while not (q.is_empty() or sources_found >= num_sources):
        # Reassign curr_url to the next item in the queue
        curr_url = q.dequeue()
        curr_name = get_title(curr_url)

        # find the neighbouring links on the article at curr_url
        neighbours = get_adjacent_urls(curr_url)

        bfs_objects = (visited, q)
        sources_info = (sources_found, num_sources, sources_per_page)
        new_sources_found = _update_wikigraph(neighbours, bfs_objects, sources_info,
                                              wiki_graph_so_far, curr_name)
        sources_found += new_sources_found

    return wiki_graph_so_far
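# A minimal sketch of the _update_wikigraph helper called above. Its real body
# is not shown in this snippet, so everything below is an assumption: we take
# <neighbours> to be a dict mapping article titles to urls (one plausible shape
# for what get_adjacent_urls returns), add at most <sources_per_page> unvisited
# articles as new vertices, and report how many new sources were added.
def _update_wikigraph(neighbours: dict, bfs_objects: tuple, sources_info: tuple,
                      wiki_graph_so_far: WikiGraph, curr_name: str) -> int:
    visited, q = bfs_objects
    sources_found, num_sources, sources_per_page = sources_info
    new_sources_found = 0
    for name, url in neighbours.items():
        # stop once this page's quota or the overall budget is exhausted
        if new_sources_found >= sources_per_page \
                or sources_found + new_sources_found >= num_sources:
            break
        if url not in visited:
            visited.append(url)
            q.enqueue(url)
            wiki_graph_so_far.add_vertex(name, url)
            new_sources_found += 1
        # the neighbour is in the graph either way, so record the adjacency
        wiki_graph_so_far.add_edge(curr_name, name)
    return new_sources_found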
def main():
    parser = argparse.ArgumentParser(
        description="Find a path between two Wikipedia pages via their links.")
    parser.add_argument("--start",
                        help="Title of a valid Wikipedia page to start from.",
                        type=str, required=True)
    parser.add_argument("--end",
                        help="Title of a valid Wikipedia page to reach.",
                        type=str, required=True)
    args = parser.parse_args()

    wiki_graph = WikiGraph()
    print("Searching: '%s' -> '%s'" % (args.start, args.end))
    path = wiki_graph.find_path(args.start, args.end)
    if path:
        print(path.info)
    else:
        print("Failed Search.")
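# Example invocation, assuming this script is saved as wikipath.py
# (hypothetical filename):
#
#   python wikipath.py --start "Tom Hanks" --end "Will I Am"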
def test_find_path_benchmark(self):
    wiki_graph = WikiGraph(print_requests=True)
    total_requests = 0
    total_time = 0
    failures = []

    # Loop through and test whether paths exist
    for page in samplepages:
        (start, end) = (page, "Homunculus")
        path = wiki_graph.find_path(start, end)
        if path.degree == -1:
            failures.append(path)
        total_requests += path.requests
        total_time += path.time

    print("Total Failures:", len(failures))
    print(failures)
    print("Total requests:", total_requests)
    print("Avg number of requests per path: %.2f"
          % (total_requests / len(samplepages)))
    print("Total time: ", total_time)
    print("Avg time per path: %.2f" % (total_time / len(samplepages)))
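# The samplepages constant iterated above is defined elsewhere in the test
# module; a hypothetical stand-in for running this benchmark locally could be:
samplepages = ["Python (programming language)", "Alan Turing", "Philosophy"]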
def demo_wikigraph():
    """Return a WikiGraph preloaded with the demo graph."""
    wg = WikiGraph()
    wg.set_graph(DEMO_GRAPH_NODE_CHILDREN)
    return wg
def demo_wikigraph_nm():
    """Return a WikiGraph preloaded with the demo graph, with memory
    optimisation disabled."""
    wg = WikiGraph(optimize_memory=False)
    wg.set_graph(DEMO_GRAPH_NODE_CHILDREN)
    return wg
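# DEMO_GRAPH_NODE_CHILDREN is defined elsewhere; assuming set_graph accepts an
# adjacency mapping of page title -> child page titles, a plausible shape is:
DEMO_GRAPH_NODE_CHILDREN = {
    'A': ['B', 'C'],
    'B': ['D'],
    'C': ['D', 'E'],
    'D': [],
    'E': [],
}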
def main():
    # initialise args from the cli
    parser = argparse.ArgumentParser(
        description="For a given sample of articles find a path from each to a"
                    " central end article. Write the output to a given csv file.")
    parser.add_argument("-o", "--outfile",
                        help="Filename to save the results to.",
                        type=str, default="wikiresults.csv")
    parser.add_argument("-x", "--center",
                        help="Title of a valid wiki page to center all nodes on.",
                        type=str, default="Homunculus")
    parser.add_argument("-k", "--sample_size",
                        help="Sample size of k pages to search from. "
                             "(Only applies when a sample source is not given.)",
                        type=int, default=1)
    parser.add_argument("-s", "--sample_source",
                        help="Filename containing a newline-delimited list of "
                             "valid wiki article titles. If not specified, the "
                             "sample defaults to a random selection from the "
                             "Wikimedia API.",
                        type=str)
    parser.add_argument("-v", action='store_true',
                        help="Add to display the titles of page requests made.")
    args = parser.parse_args()

    wiki_graph = WikiGraph(print_requests=True)

    # resolve the search sample source
    if args.sample_source:
        sample = load_sample(args.sample_source)
        size = len(sample)
    else:
        sample = wiki_graph.random_sample(args.sample_size)
        size = args.sample_size

    with open(args.outfile, mode='w') as outfile:
        writer = csv.DictWriter(outfile, ["start", "end", "path", "degree"])
        writer.writeheader()
        total_time = datetime.now()
        total_requests = 0
        for i, page in enumerate(sample):
            print("%d/%d Searching: '%s' -> '%s'"
                  % (i + 1, size, page, args.center))
            path = wiki_graph.find_path(page, args.center)
            print(path.info)
            writer.writerow(path.data)
            total_requests += path.requests

    total_time = (datetime.now() - total_time).total_seconds()
    print("Finished Totals: "
          "N={}. Time={}. Requests={}.".format(size, total_time, total_requests))
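# A minimal sketch of the load_sample helper used above, assuming the sample
# source is a plain-text file with one article title per line (the real
# implementation is not shown in this snippet).
def load_sample(filename: str) -> list:
    with open(filename) as f:
        return [line.strip() for line in f if line.strip()]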