class TestLinks(unittest.TestCase):
    """Exercise every public method of the Links class in links.py."""

    def setUp(self) -> None:
        # Fresh Links instance backed by a new DB connection for each test.
        self.exec = Links(DB.connect())

    def test_insert(self):
        """insert() should store a url for a page id and return None."""
        DB.setup()
        DB.seed()
        self.assertIsNone(self.exec.insert(1, 'https://facebook.com'))

    def test_select(self):
        """select() should return the seeded rows."""
        DB.setup()
        DB.seed()
        self.assertIsNotNone(self.exec.select())

    def test_find_with_page_id(self):
        """find_with_page_id() returns None when no link matches."""
        # Reset the schema first so the lookup runs against a known-empty
        # table; the original omitted this, making the result depend on
        # whichever test happened to run before it.
        DB.setup()
        self.assertIsNone(self.exec.find_with_page_id(1))

    def test_delete_by_page_id(self):
        """delete_by_page_id() returns None after deleting."""
        # Same determinism fix as above: start from a known schema state.
        DB.setup()
        DB.seed()
        self.assertIsNone(self.exec.delete_by_page_id(1))

    def tearDown(self) -> None:
        # Drop the Links reference so every test starts clean.
        self.exec = None
class TestLinks(unittest.TestCase):
    """Tests for the Links class: insert, select and delete of extracted urls."""

    def setUp(self) -> None:
        # New Links instance with its own DB connection per test.
        self.links = Links(DB.new_connect())

    def test_insert(self):
        """insert() stores an extracted url for a page id and returns None."""
        DB().setup()
        DB.seed()
        self.assertIsNone(self.links.insert(2, 'https://www.wikipedia.com'))

    def test_select(self):
        """select() returns the rows stored for the given id."""
        # The original passed a stray ``None`` that was silently consumed
        # as the assertion's ``msg`` parameter; removed.
        self.assertIsNotNone(self.links.select(1))

    def test_delete(self):
        """delete() removes the rows for a page id and returns None."""
        self.assertIsNone(self.links.delete(1))

    def tearDown(self) -> None:
        # Release the Links instance after each test.
        self.links = None
def web_scraper(page_id):
    """Scrape up to 10 https links from the page identified by *page_id*.

    Validates that the id exists in the pages table, flags the page as
    being scraped, replaces any previously stored links for that page
    with at most ten freshly scraped ones, then clears the flag.

    :param page_id: primary key of the page row whose url is scraped.
    :raises TypeError: if the id is not present in the pages table.
        NOTE(review): ValueError would be the conventional type here,
        but callers may already catch TypeError, so it is preserved.
    :return: None.
    """
    # Build the Pages interface once instead of opening two connections.
    pages = Pages(DB.connect())
    existing_ids = [pid[0] for pid in pages.select_id()]
    if page_id not in existing_ids:
        raise TypeError('Id does not exist.')
    url = pages.select_url(page_id)
    # Mark the page as "scraping in progress".
    DB.pages().update(True, page_id)
    value = requests.get(url)
    soup = BeautifulSoup(value.text, 'html.parser')
    scraped = [link['href'] for link in soup.find_all('a', href=True)
               if link['href'].startswith('https')]
    # Replace any links previously stored for this page.
    DB.links().delete_by_page_id(page_id)
    # Reuse one Links instance/connection for all inserts instead of
    # opening a new connection on every loop iteration.
    links = Links(DB.connect())
    for item in scraped[:10]:
        links.insert(page_id, item)
    # Clear the "scraping in progress" flag.
    DB.pages().update(False, page_id)
def links(cls):
    """Build a Links interface for running SQL against the links table.

    The original docstring claimed this returns None; it actually
    returns a Links instance bound to a fresh connection.

    :return Links: a Links instance wrapping ``cls.connect()``.
    """
    return Links(cls.connect())
class MyTestLinks(unittest.TestCase):
    """Verify the behaviour of each method exposed by the Links class."""

    def setUp(self) -> None:
        # One Links instance per test, bound to a fresh DB connection.
        self.DB = Links(DB.connect())

    def test_select(self):
        """select() yields a non-None result."""
        self.assertIsNotNone(self.DB.select())

    def test_fetch(self):
        """fetch() yields a non-None result."""
        self.assertIsNotNone(self.DB.fetch())

    def test_insert(self):
        """insert() of a page id and url yields a non-None result."""
        self.assertIsNotNone(self.DB.insert(2, 'https://rb.gy/zd2xxz'))

    def test_delete(self):
        """delete() by id returns None."""
        self.assertIsNone(self.DB.delete(1))

    def tearDown(self) -> None:
        # Drop the reference so each test starts from scratch.
        self.DB = None
class TestDB(unittest.TestCase):
    """Tests for the links table helpers: insert/select/delete by id and page_id."""

    def setUp(self) -> None:
        # Fresh Links instance bound to a new DB connection per test.
        self.exec = Links(DB.connect())

    def test_insert(self):
        """Test insert into links table."""
        DB.setup()
        DB.seed()
        # assertIsNone replaces the assertEqual(value, None) anti-idiom.
        self.assertIsNone(self.exec.insert(1, 'https://www.google.com/'))

    def test_select(self):
        """Test select from links table."""
        DB.setup()
        self.assertIsNotNone(self.exec.select())

    def test_select_by_id(self):
        """Test selection of specific data from links table by id."""
        DB.setup()
        DB.seed()
        self.exec.insert(1, 'https://www.google.com/')
        value = self.exec.select_by_id(1)
        self.assertIsNotNone(value)
        # Strict type check kept on purpose (exact tuple, not a subclass).
        self.assertEqual(type(value), tuple)

    def test_select_by_page_id(self):
        """Test selection of specific data from links table by page_id."""
        DB.setup()
        value = self.exec.select_by_page_id(1)
        self.assertIsNotNone(value)
        self.assertEqual(type(value), list)

    def test_delete_by_id(self):
        """Test deletion of specific data in links table by id."""
        DB.setup()
        self.assertIsNone(self.exec.delete_by_id(1))

    def test_delete_by_page_id(self):
        """Test deletion of specific data in links table by page_id."""
        DB.setup()
        DB.seed()
        self.assertIsNone(self.exec.delete_by_page_id(1))

    def tearDown(self) -> None:
        # Release the Links instance after each test.
        self.exec = None
class TestLinks(TestCase):
    """Test each and every method in the Links class."""

    def setUp(self):
        """Set up the Links instance and database connections."""
        # BUG FIX: the original assigned only ``self.pages`` but setUp
        # itself and every test read ``self.links``, raising
        # AttributeError before any assertion ran. Both names now refer
        # to the same instance so all existing references keep working.
        self.pages = Links()
        self.links = self.pages
        self.conn_server = DB.only_server()
        self.conn = self.links.connect()
        self.cursor = self.conn.cursor()

    def test_connect(self):
        """Test connecting to postgresql server is successful."""
        self.assertIsNotNone(self.conn)

    def test_insert(self):
        """Test data provided is inserted into database."""
        self.assertIsNotNone(self.pages.insert())

    def test_select(self):
        """Test select returns all data from the database."""
        self.assertIsNotNone(self.links.select())

    def test_find(self):
        """Test find returns the row with the id provided."""
        data = self.links.find(1)
        self.assertIsNotNone(data)
        self.assertEqual(type(data), tuple)

    def test_update(self):
        """Test update by id with params returns None."""
        self.assertIsNone(self.links.update('True', 1))

    def test_delete(self):
        """Test data is deleted by id and returns None."""
        self.assertIsNone(self.links.delete(1))

    def tearDown(self):
        """Tear down connections created for testing purposes."""
        self.links.close()
def spider(page_id):
    """Scrape the url linked to *page_id* and store at most 10 links.

    Looks up the url for the page, flags the page as being scraped,
    replaces any previously stored links for the page with up to ten
    freshly scraped ``http``/``https`` hrefs, then clears the flag.

    :param page_id: integer primary key of the page to scrape.
    :raises ValueError: if page_id is not a non-zero int, or if no url
        is stored for it.
    :return: None.
    """
    # Exact int check kept (isinstance would also admit bool).
    if type(page_id) != int or page_id == 0:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        # BUG FIX: the original *returned* the exception object instead
        # of raising it, so callers silently received a ValueError value.
        raise ValueError('Page Id not found')
    url = get_url[0]
    # Set is_scraping to True where id == page_id.
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    all_links = [link['href'] for link in soup.find_all('a', href=True)
                 if link['href'].startswith('http')]
    # Drop any links already stored for this page before re-inserting.
    DB.links().delete_by_page_id(page_id)
    # Reuse one Links instance/connection for the whole batch instead of
    # opening a fresh connection on every iteration.
    links_table = Links(DB().connect())
    for link in all_links[:10]:
        links_table.insert(page_id, link)
    # Set is_scraping back to False where id == page_id.
    DB.pages().update_by_id(False, page_id)
def setUp(self) -> None:
    # Bind a fresh Links instance, backed by a new DB connection,
    # to ``self.exec`` before each test runs.
    self.exec = Links(DB.connect())
def links(cls):
    """Return a Links interface built from a fresh connection.

    NOTE(review): unlike the other factory variants, this one also feeds
    the result of ``pages().select()`` into Links — presumably the
    current page rows; confirm against the Links constructor.
    """
    return Links(cls.connect(), cls.pages().select())
def setUp(self) -> None:
    # Fresh Links instance backed by a new database connection per test.
    self.links = Links(DB.new_connect())
def links(cls):
    """Return a reference to the links interface, bound to a newly
    opened database connection."""
    return Links(cls.new_connect())
def setUp(self) -> None:
    # Set up the Links class: bind an instance with a fresh DB
    # connection to ``self.exec`` before each test.
    self.exec = Links(DB.connect())
def links(cls):
    """Return a Links interface that runs its SQL over a newly
    established connection."""
    return Links(cls.connect())
def setUp(self):
    """Set up the Links instance and database connections for the tests."""
    # BUG FIX: ``self.links`` was read two lines below (and by the
    # tests) but never assigned, so setUp raised AttributeError.
    # Alias it to the same instance as ``self.pages`` so both names
    # keep working for existing references.
    self.pages = Links()
    self.links = self.pages
    self.conn_server = DB.only_server()
    self.conn = self.links.connect()
    self.cursor = self.conn.cursor()