import requests
from bs4 import BeautifulSoup
# DB, Pages and Links are the project's own database helpers; their import
# path is not shown in this snippet.


def web_scraper(page_id):
    """Accept a page id, check that it exists in the pages table, and scrape
    up to 10 https links from that page into the links table."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]
    if page_id not in new_all_id:
        raise TypeError('Id does not exist.')

    url = Pages(DB.connect()).select_url(page_id)
    # Flag the page as being scraped while the request is in progress.
    DB.pages().update(True, page_id)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect only https links from the page.
    list_urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('https'):
            list_urls.append(link['href'])

    # Keep the first 10 links and replace any previously stored ones.
    new_list_urls = list_urls[:10]
    DB.links().delete_by_page_id(page_id)
    for item in new_list_urls:
        Links(DB.connect()).insert(page_id, item)

    # Clear the in-progress flag once the links have been stored.
    DB.pages().update(False, page_id)
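# A minimal usage sketch of the scraper above, assuming a page with id 1
# already exists in the pages table; the except branch mirrors the TypeError
# raised for an unknown id. The id value is illustrative only.
if __name__ == '__main__':
    try:
        web_scraper(1)   # fetch the page and store up to 10 links for id 1
    except TypeError as err:
        print(err)       # 'Id does not exist.'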
def pages(cls):
    """
    Returns the helper that executes the SQL scripts for the pages table.

    :return Pages: A Pages instance bound to a new database connection.
    """
    return Pages(cls.connect())
def setUp(self) -> None:
    # Set up the Pages class.
    self.exec = Pages(DB.connect())
def setUp(self) -> None:
    self.exec = Pages(DB.connect())
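# A hypothetical test that could follow this setUp. It assumes select_id()
# returns rows whose first column is the page id, as web_scraper expects
# above; the test name and assertions are illustrative, not from the source.
def test_select_id_returns_rows(self):
    rows = list(self.exec.select_id())
    for row in rows:
        self.assertIsNotNone(row[0])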
def task():
    return web_scraper(Pages(DB.connect()).find_url(1))
def pages(cls):
    # Returns a reference to the pages interface.
    conn = cls.connect()
    page = Pages(conn)
    return page
def setUp(self):
    """Set up all the necessary classes and connections."""
    self.pages = Pages()
    self.conn_server = DB.only_server()
    self.conn = self.pages.connect()
    self.cursor = self.conn.cursor()
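# A possible tearDown counterpart to this setUp, assuming the cursor and the
# two connections expose the usual DB-API close() methods; releasing them in
# this order is an assumption, not taken from the source.
def tearDown(self):
    self.cursor.close()
    self.conn.close()
    self.conn_server.close()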
def pages(cls):
    # Returns a reference to the pages interface.
    result = cls.new_connect()
    pages = Pages(result)
    return pages
def setUp(self) -> None:
    self.pages = Pages(DB.new_connect())
def pages(cls):
    # Returns a Pages helper bound to a fresh connection.
    conn = cls.connect()
    return Pages(conn)
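# A brief usage sketch of the classmethod above, assuming it is exposed as
# DB.pages() (as web_scraper uses it) and that a page with id 1 exists; both
# calls appear in the scraper, and the id value here is illustrative only.
url = DB.pages().select_url(1)   # look up the stored url for page 1
DB.pages().update(True, 1)       # mark that page as being scraped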