import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.pages import Pages
from src.db.links import Links  # assumed path, mirroring src.db.pages


def web_scraper(page_id):
    """Accepts a page id, checks that it exists in the pages table, and
    scrapes at most 10 links from the page at that id's url."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]
    if page_id not in new_all_id:
        raise ValueError('Id does not exist.')
    url = Pages(DB.connect()).select_url(page_id)
    # flag the page as being scraped
    DB.pages().update(True, page_id)
    value = requests.get(url)
    soup = BeautifulSoup(value.text, 'html.parser')
    list_urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('https'):
            list_urls.append(link['href'])
    new_list_urls = list_urls[:10]
    # replace any links previously stored for this page
    DB.links().delete_by_page_id(page_id)
    for item in new_list_urls:
        Links(DB.connect()).insert(page_id, item)
    # scraping finished; reset the flag
    DB.pages().update(False, page_id)
def scraping_function(page_id):
    """
    Implements the web scraper that inserts into the links table.

    :param page_id (int): The id at which the url to be scraped is retrieved.
    :return: None
    :raises TypeError: if the id is not found in the pages table.
    """
    try:
        # retrieve the url from the pages table and flag the page as scraping
        url = DB.pages().fetch(page_id)
        DB.pages().update(True, page_id)
        link_list = []
        r = requests.get(url[0])
        # scrape the page for https hyperlinks; startswith avoids matching
        # 'https' buried in the middle of an unrelated href
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                link_list.append(link['href'])
        links = link_list[:10]
        # replace any links previously stored for this page
        DB.links().delete(page_id)
        for i in links:
            DB.links().insert(page_id, i)
        DB.pages().update(False, page_id)
        return None
    except TypeError:
        # an unknown id presumably makes fetch return None, so url[0]
        # raises the TypeError that is re-raised here with a clear message
        raise TypeError('Id not found in Pages Table')
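# A quick demonstration of the error contract documented above, assuming a
# set-up and seeded database; 999 is an illustrative id with no row in the
# pages table, so the function surfaces its TypeError.
try:
    scraping_function(999)
except TypeError as err:
    print(err)  # -> Id not found in Pages Table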
import re

import requests
from bs4 import BeautifulSoup

from src.db import DB


def spider_scrap(page_id):
    '''Receives a page_id and inserts that page's links into the links table.'''
    page_ids = [i[0] for i in DB().pages().select()]
    if page_id in page_ids:
        url = DB().pages().fetch_url(page_id)
    else:
        raise ValueError('page_id not valid')
    # update is_scraping to true
    DB().pages().update_id_true(page_id)
    # fetch the html content at the page url
    page = requests.get(url[0])
    # parse the html content to extract a maximum of 10 hyperlinks
    soup = BeautifulSoup(page.text, features='html.parser')
    links_list = []
    for link in soup.find_all('a', href=True):
        links = link['href']
        if re.search("^https", links):
            links_list.append(links)
    link_url = links_list[:10]
    DB.links().delete(page_id)
    # save the newly extracted links to the database for the page
    for item in link_url:
        DB.links().insert(page_id, item)
    DB().pages().update_id_false(page_id)


# print(spider_scrap(1))
import requests
from bs4 import BeautifulSoup

from src.db import DB


def scrape(page_id):
    # flag the page as being scraped (a boolean, not the string 'True')
    DB.pages().update(True, page_id)
    url = DB().pages().fetch(page_id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    # replace any links previously stored for this page
    DB.links().delete(page_id)
    for i in new_links:
        # insert takes the page id first, then the link
        DB.links().insert(page_id, i)
    # scraping finished; reset the flag
    DB.pages().update(False, page_id)
import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.links import Links  # assumed path, mirroring src.db.pages


def spider(page_id):
    '''
    Takes a page id, selects the url linked to that page id, and runs the
    scraper. The scraper fetches the url and collects the urls found on the
    page; a maximum of 10 links are inserted into the database.
    '''
    if type(page_id) is not int or page_id < 1:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        raise ValueError('Page Id not found')
    url = get_url[0]
    all_links = []
    # set is_scraping to True where id == page_id
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('http'):
            all_links.append(link['href'])
    # if the page id is already in the links table, delete its rows first
    DB.links().delete_by_page_id(page_id)
    for link in all_links[:10]:
        # insert each link into the links table
        Links(DB().connect()).insert(page_id, link)
    # set is_scraping to False where id == page_id
    DB.pages().update_by_id(False, page_id)
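# A minimal sketch of driving spider end to end, assuming the database has
# already been set up and seeded and that a page with id 1 exists (both
# assumptions; the id is illustrative). The rows printed afterwards come
# from the links table that spider just repopulated.
spider(1)
for row in DB.links().select():
    print(row)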
from unittest import TestCase

from src.db import DB


class TestDatabase(TestCase):
    '''Class to test the database (db) functions'''

    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''the teardown function for all the tests; unittest only calls this
        hook when it is spelled tearDown, not TearDown.'''
        self.db.connect().close()
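# The class above follows stdlib unittest conventions, so it can be run with
# the standard runner; assuming it lives in its own test module, a minimal
# entry point is:
if __name__ == '__main__':
    import unittest
    unittest.main()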
def test_links(self):
    self.assertIsNotNone(DB.links())
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

# scratch script exercising the DB interface
db = DB()  # must be assigned (not commented out) for the calls below
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# app = Celery('main', broker=config('CELERY_BROKER'),
#              backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)


# spider_scrap(1)
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
app = Celery('main', broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def test():
    return spider(1)


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
DB.pages().update_by_id(True, 1)  # example args: update_by_id(is_scraping, id)

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
from celery import Celery
from decouple import config

from src.db import DB
from src.scraper import scraping_function  # assumed module path

app = Celery('spider', broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task()
def cel_spider():
    """
    Implements the celery task for the spider.

    :return None: Returns the value of the scraping function, which is None.
    """
    return scraping_function(2)


# ---- Usage of DB class ----
#
# DB.serv_conn()
#
# DB.connect()
# DB.setup()
# DB.seed()
#
# ---- Usage of pages.py ----
# DB.pages().select()
# DB.pages().fetch(2)
# DB.pages().update(True, 2)
#
# ---- Usage of links.py ----
# DB.links().select()
# DB.links().fetch()
# DB.links().insert(1, 'https://www.facebook.com')
DB.links().delete(1)

# scraping_function(3)
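# A sketch of exercising the task above, assuming this module is saved as
# main.py and CELERY_BROKER / CELERY_BACKEND point at a running broker.
# Start a worker first, e.g. `celery -A main worker --loglevel=info`, then
# enqueue the task from another shell:
#
#     from main import cel_spider
#     result = cel_spider.delay()
#     result.get(timeout=60)  # blocks until the spider run finishes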
def test_links(self):
    '''
    Test links interface
    '''
    self.assertIsNotNone(DB.links())