Example #1
def scraping_function(id):
    """
  This function implements the web scraper that inserts into the liks table.
  :param
  id(int): The id at which the url to be scraped is retrieved.
  :return:
  None: Returns None
  :raises:
  TypeError: Raises a TypeError
  """
    try:
        # retrieves the url from the pages table
        url = DB.pages().fetch(id)
        # marks the page as currently being scraped (is_scraping = True)
        DB.pages().update(True, id)

        link_list = []
        r = requests.get(url[0])
        # scrapes the url for hyperlinks
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if 'https' in link['href']:
                link_list.append(link['href'])
        # keep at most 10 links, replacing any previously saved for this page
        links = link_list[:10]
        DB.links().delete(id)
        for i in links:
            DB.links().insert(id, i)
        # mark the page as no longer being scraped (is_scraping = False)
        DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
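A minimal sketch of how this function might be driven. The module paths follow the src.db/src.spider layout hinted at in Examples 8 and 9 and are assumptions, as is the page id used.

# Hypothetical driver for scraping_function; module paths are assumed from
# the later examples, and page id 1 is an arbitrary seeded row.
from src.db import DB
from src.spider import scraping_function

db = DB()
db.connect()
db.setup()
db.seed()

try:
    scraping_function(1)   # fills the links table for page id 1
except TypeError as err:
    print(err)             # 'Id not found in Pages Table'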
Example #2
def web_scraper(page_id):
    """This function accepts the id,checks if it is within the list of ids in the database, and
    scrapes only 10 links on that particular link page"""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]

    if page_id not in new_all_id:
        raise TypeError('Id does not exist.')

    else:
        url = Pages(DB.connect()).select_url(page_id)
        DB.pages().update(True, page_id)
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')

        list_urls = []
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                list_urls.append(link['href'])

        new_list_urls = list_urls[:10]
        DB.links().delete_by_page_id(page_id)

        for item in new_list_urls:
            Links(DB.connect()).insert(page_id, item)

        DB.pages().update(False, page_id)
Example #3
def scrape(id):
    '''Fetches the url stored for the given page id, scrapes up to 10 external
    links from it, and replaces the page's entries in the links table.'''
    # mark the page as being scraped, then fetch its url
    DB.pages().update('True', id)
    url = DB().pages().fetch(id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    DB.links().delete(id)
    for i in new_links:
        DB.links().insert(i, id)
Example #4
def scrape(id):
    '''The scrape function fetches the page record with the page id provided,
    raises an exception if no page with that id is found,
    updates the page's is_scraping attribute to true,
    fetches the HTML content at the page url using requests,
    parses the fetched HTML content to extract hyperlinks (maximum 10),
    deletes existing links that may have been previously saved for the page,
    saves the newly extracted links to the links table in the database,
    and updates the page's is_scraping attribute back to false.
    '''
    try:
        the_url = DB.pages().fetch(id)
        if len(the_url) == 0:
            raise Exception
        the_url = the_url[0]
        address = the_url[0]
        DB().pages().update(id, 'True')
        web_request = requests.get(address)
        soup = BeautifulSoup(web_request.text, features='html.parser')
        list_of_links = []
        for link in soup.find_all('a', href=True):
            links = link['href']
            if re.search("^https", links):
                list_of_links.append(links)
        linksy = list_of_links[:10]
        DB().links().delete(id)
        for url in linksy:
            DB().links().insert(url, id)
        DB().pages().update(id, 'False')
        return '===============Successfully scraped================'
    except Exception as e:
        print(e)
Example #5
def spider(page_id):
    ''' Takes a page id, selects the url linked to page id and runs the scraper
      Scraper takes url and returns a list of urls scraped,
      a maximum of 10 links are inserted into the database '''

    if type(page_id) != int or page_id == 0:
        raise ValueError('Page Id is not valid')

    get_url = DB.pages().get_url(page_id)

    if get_url is None:
        raise ValueError('Page Id not found')

    else:
        url = get_url[0]
        all_links = []

        # set is_scraping to True where id == page_id
        DB.pages().update_by_id(True, page_id)

        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        for link in soup.find_all('a', href=True):

            if link['href'].startswith('http'):
                all_links.append(link['href'])

        # delete any rows already stored in the links table for this page id
        DB.links().delete_by_page_id(page_id)

        for link in all_links[:10]:
            # Insert each link into the links table
            Links(DB().connect()).insert(page_id, link)

        # set is_scraping to False where id == page_id
        DB.pages().update_by_id(False, page_id)
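A short, hypothetical set of calls illustrating spider()'s input validation and normal path; the page ids are arbitrary example values, not ones confirmed by the seed data.

# Hypothetical usage of spider(); page ids 1 and 0 are arbitrary examples.
spider(1)            # scrapes and stores up to 10 links for page id 1
try:
    spider(0)        # a zero id is rejected before any database work
except ValueError as err:
    print(err)       # 'Page Id is not valid'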
Example #6
class TestDatabase(TestCase):
    '''Class to test the database (db) functions'''
    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''the teardown function for all the tests.'''
        self.db.connect().close()
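One way to run this suite is programmatically with the standard unittest runner; the sketch below assumes TestDatabase is importable from wherever the test file lives.

# Run TestDatabase with the built-in unittest runner (assumes the class is
# available in the current module or has been imported from its test file).
import unittest

suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestDatabase)
unittest.TextTestRunner(verbosity=2).run(suite)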
Example #7
    def test_pages(self):
        self.assertIsNotNone(DB.pages())
Example #8
# Show examples of how you would use ALL your implementations here
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config
#
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()
dd = DB.new_connect()
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))
links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))
# #
# app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#   return spider_scrap(1)

# spider_scrap(1)
Example #9
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
app = Celery('main',
             broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def test():
    return spider(1)


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
DB.pages().update_by_id(True, 1)

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
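In normal use the Celery task above would be queued on the broker rather than called inline; a minimal sketch, assuming a worker is running against the configured CELERY_BROKER and CELERY_BACKEND.

# Queue the task asynchronously and wait for its result; .delay() and
# AsyncResult.get() are standard Celery APIs, and the 30-second timeout
# is an arbitrary choice for this sketch.
result = test.delay()
print(result.get(timeout=30))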
Example #10
    def test_pages(self):
        ''' Test pages interface '''
        self.assertIsNotNone(DB.pages())