def make_map(arg_tuple, driver_path="/usr/lib/chromium-browser/chromedriver"):
    """Build and populate a UrlMap for a single site.

    Shaped for use with a single-argument mapper (e.g. multiprocessing
    Pool.map), hence the packed tuple.

    Args:
        arg_tuple: ``(url, max_nodes, dynamic)`` — the site to crawl, the
            cap on crawl iterations, and whether to render dynamic
            (JavaScript) pages.
        driver_path: Location of the chromedriver binary. Defaults to the
            previously hard-coded Chromium path, so existing callers are
            unaffected.

    Returns:
        The populated UrlMap instance.
    """
    # Unpack in one step instead of indexing the tuple piecewise.
    url, max_nodes, dynamic = arg_tuple
    # NOTE(review): url is passed as both the site root and the start page —
    # presumably UrlMap(site, driver_path, start_url, ...); confirm signature.
    url_map = UrlMap(url, driver_path, url, dynamic_pages=dynamic)
    url_map.create_map(total_iterations=max_nodes)
    return url_map
def setUp(self):
    """Create a fresh UrlMap wired with the article route patterns."""
    def echo_handler(*args, **kwargs):
        # Hand back exactly what the dispatcher passed in, so tests can
        # assert on the captured positional and keyword arguments.
        return args, kwargs

    self.urlmap = UrlMap()
    patterns = (
        r'^articles/2003/$',
        r'^articles/(\d{4})/$',
        r'^articles/(\d{4})/(\d{2})/$',
        r'^articles/(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/$',
    )
    # Registration order matters for dispatch precedence; keep it as-is.
    for pattern in patterns:
        self.urlmap.add(pattern, echo_handler)
# Quick manual smoke test: crawl a small site and print the resulting map.
from urlmapper import UrlMap
import time
import json
from operator import itemgetter

# Path to the local chromedriver binary used by the crawler.
mypath = "/Users/Tilley/Downloads/chromedriver"
myurl = "https://sjrfire.com"
myurlstart = "https://sjrfire.com"
# myurl = "https://youtube.com"
# myurl = "https://reddit.com"

# creates our UrlMap object
url_map = UrlMap(myurl, mypath, myurlstart, dynamic_pages=False)
# Crawl at most 5 pages, then dump the resulting site map.
url_map.create_map(total_iterations=5)
print(url_map.this_map)
import unittest
import sys
sys.path.append("..")
from urlmapper import UrlMap

myurl1 = "https://google.com/"
myurl2 = "https://reddit.com/"
# Path to the local chromedriver binary used by the crawler.
mypath = "/Users/Tilley/Downloads/chromedriver"

# Module-level fixtures: three maps crawled with different iteration caps.
# NOTE(review): building these at import time hits the network before any
# test runs — confirm this is intentional and acceptable for the test suite.
mymap1 = UrlMap(myurl1, mypath, myurl1, dynamic_pages=False)
mymap1.create_map(total_iterations=10)
mymap2 = UrlMap(myurl2, mypath, myurl2)
mymap2.create_map(total_iterations=0)
mymap3 = UrlMap(myurl1, mypath)
mymap3.create_map()  # total_iterations defaults to 30
# print(mymap1.get_map())
# print(len(mymap1.this_map.keys()))


class TestUrlMap(unittest.TestCase):

    # test to see that we get the correct amount of nodes
    def test_map_max_nodes(self):
        self.assertEqual(len(mymap1.this_map.keys()), 10)
        self.assertEqual(len(mymap2.get_map().keys()), 0)
        self.assertEqual(len(mymap3.this_map.keys()), 30)
        # -1 goes forever and cannot be tested

    # test to see that our queue remains intact after termination
# January 2020 # This module serves to display the previously scraped websites network # from urlmapper import UrlMap import time import json from operator import itemgetter mypath = "/Users/Tilley/Downloads/chromedriver" myurl = "https://sjrfire.com" myurlstart = "https://sjrfire.com" # myurl = "https://youtube.com" # myurl = "https://reddit.com" # creates our UrlMap object url_map = UrlMap(myurl, mypath, myurlstart, dynamic_pages=False) url_map.create_map() # site_mapping = url_map.get_map() # site_map_json_list = url_map.json_list # site_map_json = json.dumps(site_map_json_list, indent=4) # print(site_map_json) # with open('site_map.json', 'w', encoding='utf-8') as f: # json.dump(site_map_json, f, ensure_ascii=False, indent=4) # gets data and formats it so d3.js can read it properly llu = url_map.d3_json_links_list nlu = url_map.d3_json_nodes_list tlu = url_map.json_time nodes_and_links = {}
class TestUrlMap(unittest.TestCase):
    """Unit tests for UrlMap path matching and argument capture."""

    def setUp(self):
        """Register the article route patterns against a fresh UrlMap."""
        def rettest(*args, **kwargs):
            # Echo back whatever the dispatcher passed so tests can assert
            # on the captured positional and keyword arguments.
            return args, kwargs
        self.urlmap = UrlMap()
        self.urlmap.add(r'^articles/2003/$', rettest)
        self.urlmap.add(r'^articles/(\d{4})/$', rettest)
        self.urlmap.add(r'^articles/(\d{4})/(\d{2})/$', rettest)
        self.urlmap.add(
            r'^articles/(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/$',
            rettest)

    def test_noargs(self):
        # A literal pattern should match with no captured arguments.
        path = 'articles/2003/'
        self.assertEqual(self.urlmap.map_path(path).call(), ((), {}))

    def test_oneargs(self):
        # A single unnamed group is delivered as one positional argument.
        path = 'articles/2004/'
        self.assertEqual(self.urlmap.map_path(path).call(), (('2004',), {}))

    def test_kwargss(self):
        # Named groups are delivered as keyword arguments.
        path = 'articles/2004/12/01/'
        self.assertEqual(self.urlmap.map_path(path).call(),
                         ((), {'day': '01', 'month': '12', 'year': '2004'}))

    def test_urlnotmatch(self):
        # A path matching no registered pattern must raise UrlNotFound.
        path = 'article/2004/12/01/'
        self.assertRaises(UrlNotFound, self.urlmap.map_path, path)