def testCrawlNoResponse(self):
    # NO CONNECTION RESPONSE, 602 error
    test_url = "https://test"
    test_json = dict()
    outer_links = []
    code = 602
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlGoodIrrelevantResponse(self):
    # GOOD RESPONSE, NO RPI RELEVANCE, 600 error
    test_url = "https://onebananas.com/"
    test_json = dict()
    outer_links = []
    code = 600
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlBadResponse(self):
    # BAD RESPONSE, 404 error
    test_url = "http://paper.com/users"
    test_json = dict()
    outer_links = []
    code = 404
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlGoodRelevantResponse(self):
    # GOOD RESPONSE, RPI-RELEVANT PAGE, 200 status
    test_url = "https://science.rpi.edu/"
    test_json = dict()
    outer_links = []
    code = 200
    text = None
    date = soup.find_recrawl_date()
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    # A successful crawl should populate outbound links and plain text,
    # so the result should differ from the empty response template.
    self.assertNotEqual(sorted(test_json['outbound-links']), sorted(proper_response['outbound-links']))
    self.assertIsNotNone(test_json['plain-text'])
    return
def testCrawlRobots(self):
    # There is no robots.txt at the link below; crawl_robots should return an empty list
    url = "http://blog.davidstea.com/robots.txt"
    disallow_list = soup.crawl_robots(url)
    self.assertEqual(disallow_list, [])

    # There is a valid robots.txt at the link below, which contains disallowed links.
    # Make sure that the disallowed links are scraped and removed during the crawl process
    url = "https://science.rpi.edu"
    disallow_list = soup.crawl_robots(url)
    self.assertNotEqual(disallow_list, [])
    server_response = dict()
    soup.crawl(url, server_response)
    for unallowed_link in disallow_list:
        for link in server_response['outbound-links']:
            self.assertNotIn(unallowed_link, link)

    # There is no robots.txt at the link below; crawl_robots should return an empty list
    url = "www.google.com"
    disallow_list = soup.crawl_robots(url)
    self.assertEqual(disallow_list, [])
    return
def scrape_link(link, json_object):
    # call the crawl algorithm on the given link
    json_object = crawl(link, json_object)
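
# The tests above build their expected result with make_response, which is defined
# elsewhere in the project. A minimal sketch of what such a helper might look like is
# shown below, assuming the response dict carries the 'outbound-links' and 'plain-text'
# keys the tests read, plus hypothetical 'url', 'response-code', and 'recrawl-date'
# fields; the exact key names and structure are assumptions, not the project's
# confirmed format.
def make_response_sketch(url, outer_links, code, text, date):
    # Package one crawl result as a dict keyed by field name.
    return {
        'url': url,
        'outbound-links': outer_links,
        'response-code': code,
        'plain-text': text,
        'recrawl-date': date,
    }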