def test_remove_out_links(self):
    """An update() with an empty out-link list clears a doc's stored out-links.

    Registers doc "1" (site-1) with one out-link, verifies it is stored,
    then re-updates the same doc with no out-links and verifies the count
    drops to zero.
    """
    update_driver = queries.Driver()
    update_driver.update("1", "https://www.site-1.com", ["https://www.site-2.com"])
    # BUG FIX: the original asserted on site-0's out-links, but site-0 was
    # never updated in this test — the out-link registered above belongs to
    # site-1, so that is the URL whose out-links must be checked.
    assert len(update_driver.get_outlinks("https://www.site-1.com")) == 1
    # Re-update the same doc with an empty list: the old out-link must be removed.
    update_driver.update("1", "https://www.site-1.com", [])
    assert len(update_driver.get_outlinks("https://www.site-1.com")) == 0
def test_unknown_url(self):
    """A docid that was never passed to update() gets a PageRank score of 0."""
    driver = queries.Driver()
    # Seed the graph with a single known doc so run_pagerank has data.
    driver.update("0", "https://www.site-0.com", ["https://www.site-1.com"])
    driver.run_pagerank()
    # Query a docid ("100") that does not exist in the graph.
    scores = driver.get_pagerank(["100"])
    assert len(scores) == 1
    assert scores["100"] == 0
def test_efficiency(self):
    """1000 consecutive update() calls must complete within 15 seconds."""
    driver = queries.Driver()
    data = generateTestData(1000)
    started = time.time()
    for doc_id, record in data.items():
        driver.update(doc_id, record["url"], record["out_links"])
    elapsed = time.time() - started
    assert elapsed < 15
def test_pagerank_efficiency(self):
    """run_pagerank() plus get_pagerank() over 100 docs must finish within 1 second.

    Renamed from ``test_efficiency``: SOURCE defines two methods with that
    name (see the update-efficiency test above), so the later definition
    silently shadowed the earlier one and only one of them ever ran.
    """
    rank_driver = queries.Driver()
    testData = generateTestData(100)
    # Populate the graph first — only the ranking phase is timed below.
    for docid in testData:
        rank_driver.update(docid, testData[docid]["url"], testData[docid]["out_links"])
    start = time.time()
    rank_driver.run_pagerank()
    # Result intentionally discarded; this test only measures elapsed time.
    rank_driver.get_pagerank(testData.keys())
    end = time.time()
    assert (end - start) < 1
def test_single(self):
    """A page that three other pages link to receives a positive PageRank."""
    driver = queries.Driver()
    # site-0 has no out-links; sites 1-3 all point at site-0.
    driver.update("0", "https://www.site-0.com", [])
    for doc_id in ("1", "2", "3"):
        driver.update(doc_id, "https://www.site-" + doc_id + ".com", ["https://www.site-0.com"])
    driver.run_pagerank()
    scores = driver.get_pagerank(["0"])
    assert len(scores) == 1
    assert scores["0"] > 0
# --- Module bootstrap: Flask app, crawler endpoints, and seed URL graph ---
# NOTE(review): `Flask` and `queries` are referenced below but no
# `from flask import Flask` / `import queries` is visible in this chunk —
# confirm they are imported elsewhere, otherwise this module fails at import.
import sys
from CrawlManager import CrawlManager
import time

app = Flask(__name__)

# Crawler services that work is distributed across, alternating via `alternator`.
CRAWLING_ENDPOINTS = ["http://lspt-crawler1.cs.rpi.edu", "http://lspt-crawler3.cs.rpi.edu:3333"]
alternator = 0 #Flag to alternate between endpoints
# Cap on links handed out per request (usage not visible in this chunk).
MAX_LINKS = 10

# Seed URLs that bootstrap the crawl frontier and the link graph.
crawl_links = ['http://rpi.edu', 'http://cs.rpi.edu', 'http://info.rpi.edu',
               'http://admissions.rpi.edu', 'http://rpiathletics.com' ,
               'https://research.rpi.edu', 'https://news.rpi.edu',
               'https://studentlife.rpi.edu', 'https://giving.rpi.edu',
               'https://studenthealth.rpi.edu', 'https://sexualviolence.rpi.edu/',
               'https://sll.rpi.edu/', 'https://union.rpi.edu']

graph = queries.Driver() #Neo4j Graph Interface Initialization

#Initialize link manager
graph.add_initial_urls(crawl_links)
manager = CrawlManager(graph)
for link in crawl_links:
    manager.add(link)

''' Alternate between crawling endpoints @modifies alternator between 0 and 1 @return endpoint address '''
def get_crawling_endpoint():
    # Flip the round-robin flag between 0/False and 1/True on each call.
    global alternator
    alternator = not alternator
    # NOTE(review): the function appears truncated at the end of this chunk —
    # its doc says it returns an endpoint address, but no return statement is
    # visible here; presumably `return CRAWLING_ENDPOINTS[alternator]` — confirm.