def scraping_function(id):
    """Scrape the url stored for page ``id`` and persist up to 10 links.

    Fetches the url from the pages table, marks the page as being
    scraped, extracts at most 10 https hyperlinks from the page,
    replaces any links previously stored for the page in the links
    table, then clears the scraping flag.

    :param id: (int) The id at which the url to be scraped is retrieved.
    :return: None
    :raises TypeError: Raised when the id is not found in the pages table.
    """
    try:
        # Retrieve the url first: a missing id fails here, before the
        # scraping flag is ever set.
        url = DB.pages().fetch(id)
        DB.pages().update(True, id)
        try:
            r = requests.get(url[0])
            soup = BeautifulSoup(r.text, features='html.parser')
            # startswith avoids matching urls that merely contain
            # "https" somewhere in the middle (the original used `in`).
            link_list = [link['href']
                         for link in soup.find_all('a', href=True)
                         if link['href'].startswith('https')]
            # Replace whatever links were saved for this page before.
            DB.links().delete(id)
            for hyperlink in link_list[:10]:
                DB.links().insert(id, hyperlink)
        finally:
            # Clear the flag even if the request or an insert fails, so
            # the page is not left permanently marked as in-progress.
            DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
def web_scraper(page_id):
    """Scrape up to 10 https links from the page with ``page_id``.

    Validates that ``page_id`` exists in the pages table, marks the page
    as being scraped, extracts at most 10 https hyperlinks from its url,
    replaces the links previously saved for that page, then clears the
    scraping flag.

    :param page_id: (int) id of the page whose url should be scraped.
    :raises TypeError: If ``page_id`` does not exist in the pages table.
    """
    all_ids = Pages(DB.connect()).select_id()
    # select_id() returns row tuples; flatten to plain ids for the check.
    known_ids = [row[0] for row in all_ids]
    if page_id not in known_ids:
        raise TypeError('Id does not exist.')
    url = Pages(DB.connect()).select_url(page_id)
    DB.pages().update(True, page_id)
    try:
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')
        list_urls = [link['href']
                     for link in soup.find_all('a', href=True)
                     if link['href'].startswith('https')]
        # Replace previously stored links with at most 10 fresh ones.
        DB.links().delete_by_page_id(page_id)
        for item in list_urls[:10]:
            Links(DB.connect()).insert(page_id, item)
    finally:
        # Clear the scraping flag even if the request or an insert
        # fails (the original left it stuck at True on any exception).
        DB.pages().update(False, page_id)
def scrape(id):
    """Scrape the url of page ``id`` and store up to 10 external links.

    Marks the page as being scraped, fetches its url, collects at most
    10 hyperlinks starting with http, and replaces the links previously
    saved for the page.

    :param id: (int) id of the page row whose url will be scraped.
    """
    # NOTE(review): the flag value is passed as the string 'True' here,
    # while sibling implementations pass a bool — confirm what the
    # pages table actually expects.
    DB.pages().update('True', id)
    try:
        url = DB().pages().fetch(id)
        page = requests.get(url[0])
        soup = BeautifulSoup(page.text, features='html.parser')
        a_soup = soup.find_all('a', href=True)
        # startswith avoids matching urls that merely contain "http"
        # somewhere in the middle (the original used `in`).
        ext_links = [link.get("href") for link in a_soup
                     if link.get("href").startswith("http")]
        DB.links().delete(id)
        for hyperlink in ext_links[:10]:
            DB.links().insert(hyperlink, id)
    finally:
        # Bug fix: the original never reset the scraping flag, leaving
        # the page permanently marked as in-progress.
        DB.pages().update('False', id)
def scrape(id):
    '''Scrape the page with ``id`` and save up to 10 https links.

    Fetches the page record with the page id provided, raises if no page
    with the id is found, updates the page's is_scraping attribute to
    'True', fetches the HTML content at the page url using requests,
    parses it to extract hyperlinks (maximum 10), deletes links
    previously saved for the page, saves the newly extracted links to
    the links table, and resets is_scraping to 'False'.

    :param id: (int) id of the page to scrape.
    :return: A success banner string, or None when an error was caught.
    '''
    try:
        record = DB.pages().fetch(id)
        if len(record) == 0:
            # Bug fix: the original raised a bare `Exception` with no
            # message, so the printed error below was an empty string.
            raise ValueError('No page found with id {}'.format(id))
        address = record[0][0]
        DB().pages().update(id, 'True')
        web_request = requests.get(address)
        soup = BeautifulSoup(web_request.text, features='html.parser')
        # ^https anchors the match to the start of the href.
        list_of_links = [link['href']
                         for link in soup.find_all('a', href=True)
                         if re.search("^https", link['href'])]
        DB().links().delete(id)
        for url in list_of_links[:10]:
            DB().links().insert(url, id)
        DB().pages().update(id, 'False')
        return '===============Successfully scraped================'
    except Exception as e:
        # NOTE(review): broad catch preserved so callers keep getting
        # None instead of an exception, but the error printed now
        # carries real context.
        print(e)
def spider(page_id):
    '''Scrape the url linked to ``page_id`` and store up to 10 links.

    Takes a page id, selects the url linked to it, marks the page as
    being scraped, collects hyperlinks starting with http from the
    fetched page, replaces the links previously stored for the page
    (at most 10 are inserted), then clears the scraping flag.

    :param page_id: (int) id of the page to scrape; must be a non-zero int.
    :raises ValueError: If ``page_id`` is invalid or not found.
    '''
    if not isinstance(page_id, int) or page_id == 0:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        # Bug fix: the original *returned* the ValueError instance
        # instead of raising it, so callers never saw the failure.
        raise ValueError('Page Id not found')
    url = get_url[0]
    # set is_scraping to True where id == page_id
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    all_links = [link['href']
                 for link in soup.find_all('a', href=True)
                 if link['href'].startswith('http')]
    # Delete any rows already stored for this page before re-inserting.
    DB.links().delete_by_page_id(page_id)
    for link in all_links[:10]:
        # Insert each link into the links table
        Links(DB().connect()).insert(page_id, link)
    # set is_scraping to False where id == page_id
    DB.pages().update_by_id(False, page_id)
class TestDatabase(TestCase):
    '''Tests for the database (db) helper functions.'''

    def setUp(self):
        # Fresh DB wrapper for every test.
        self.db = DB()

    def test_connection(self):
        '''connect() should return a non-None connection object.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''setup() creates the schema and returns None.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''seed() populates the tables and returns None.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''pages() exposes a selectable pages interface.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''links() exposes a selectable links interface.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''Close the connection after every test.

        Bug fix: the original named this ``TearDown``, which unittest
        never calls, so connections were leaked between tests.
        '''
        self.db.connect().close()
def test_pages(self):
    '''pages() should return a usable (non-None) interface object.'''
    pages_interface = DB.pages()
    self.assertIsNotNone(pages_interface)
# Show examples of how you would use ALL your implementations here
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

# Bug fix: this instantiation was commented out, so every db.* call
# below raised NameError at runtime.
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()

pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# Example celery wiring kept for reference (disabled):
#
# app = Celery('main', broker=config('CELERY_BROKER'),
#              backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)

# spider_scrap(1)
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
# Broker and backend urls are read from the environment via
# python-decouple's config().
app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))


@app.task
def test():
    # Kick off a scrape of page id 1 through the celery worker.
    return spider(1)


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
# NOTE(review): update_by_id() is called with no arguments here, but
# other call sites pass (flag, page_id) — confirm this isn't a TypeError.
DB.pages().update_by_id()

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
def test_pages(self):
    '''Test pages interface.'''
    # Bug fix: the original passed None as assertIsNotNone's *msg*
    # argument (mimicking a two-argument comparison assert); only the
    # value under test belongs here.
    self.assertIsNotNone(DB.pages())