def scraping_function(id):
    """Scrape the url stored for page ``id`` and persist up to 10 links.

    Fetches the url from the pages table, marks the page as being
    scraped, extracts at most 10 https hyperlinks from the page,
    replaces any links previously stored for the page in the links
    table, then clears the scraping flag.

    :param id: (int) The id at which the url to be scraped is retrieved.
    :return: None
    :raises TypeError: Raised when the id is not found in the pages table.
    """
    try:
        # Retrieve the url first: a missing id fails here, before the
        # scraping flag is ever set.
        url = DB.pages().fetch(id)
        DB.pages().update(True, id)
        try:
            r = requests.get(url[0])
            soup = BeautifulSoup(r.text, features='html.parser')
            # startswith avoids matching urls that merely contain
            # "https" somewhere in the middle (the original used `in`).
            link_list = [link['href']
                         for link in soup.find_all('a', href=True)
                         if link['href'].startswith('https')]
            # Replace whatever links were saved for this page before.
            DB.links().delete(id)
            for hyperlink in link_list[:10]:
                DB.links().insert(id, hyperlink)
        finally:
            # Clear the flag even if the request or an insert fails, so
            # the page is not left permanently marked as in-progress.
            DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
def web_scraper(page_id):
    """Scrape up to 10 https links from the page with ``page_id``.

    Validates that ``page_id`` exists in the pages table, marks the page
    as being scraped, extracts at most 10 https hyperlinks from its url,
    replaces the links previously saved for that page, then clears the
    scraping flag.

    :param page_id: (int) id of the page whose url should be scraped.
    :raises TypeError: If ``page_id`` does not exist in the pages table.
    """
    all_ids = Pages(DB.connect()).select_id()
    # select_id() returns row tuples; flatten to plain ids for the check.
    known_ids = [row[0] for row in all_ids]
    if page_id not in known_ids:
        raise TypeError('Id does not exist.')
    url = Pages(DB.connect()).select_url(page_id)
    DB.pages().update(True, page_id)
    try:
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')
        list_urls = [link['href']
                     for link in soup.find_all('a', href=True)
                     if link['href'].startswith('https')]
        # Replace previously stored links with at most 10 fresh ones.
        DB.links().delete_by_page_id(page_id)
        for item in list_urls[:10]:
            Links(DB.connect()).insert(page_id, item)
    finally:
        # Clear the scraping flag even if the request or an insert
        # fails (the original left it stuck at True on any exception).
        DB.pages().update(False, page_id)
def scrape(id):
    """Scrape the url of page ``id`` and store up to 10 external links.

    Marks the page as being scraped, fetches its url, collects at most
    10 hyperlinks starting with http, and replaces the links previously
    saved for the page.

    :param id: (int) id of the page row whose url will be scraped.
    """
    # NOTE(review): the flag value is passed as the string 'True' here,
    # while sibling implementations pass a bool — confirm what the
    # pages table actually expects.
    DB.pages().update('True', id)
    try:
        url = DB().pages().fetch(id)
        page = requests.get(url[0])
        soup = BeautifulSoup(page.text, features='html.parser')
        a_soup = soup.find_all('a', href=True)
        # startswith avoids matching urls that merely contain "http"
        # somewhere in the middle (the original used `in`).
        ext_links = [link.get("href") for link in a_soup
                     if link.get("href").startswith("http")]
        DB.links().delete(id)
        for hyperlink in ext_links[:10]:
            DB.links().insert(hyperlink, id)
    finally:
        # Bug fix: the original never reset the scraping flag, leaving
        # the page permanently marked as in-progress.
        DB.pages().update('False', id)
def scrape(id):
    '''Scrape the page with ``id`` and save up to 10 https links.

    Fetches the page record with the page id provided, raises if no page
    with the id is found, updates the page's is_scraping attribute to
    'True', fetches the HTML content at the page url using requests,
    parses it to extract hyperlinks (maximum 10), deletes links
    previously saved for the page, saves the newly extracted links to
    the links table, and resets is_scraping to 'False'.

    :param id: (int) id of the page to scrape.
    :return: A success banner string, or None when an error was caught.
    '''
    try:
        record = DB.pages().fetch(id)
        if len(record) == 0:
            # Bug fix: the original raised a bare `Exception` with no
            # message, so the printed error below was an empty string.
            raise ValueError('No page found with id {}'.format(id))
        address = record[0][0]
        DB().pages().update(id, 'True')
        web_request = requests.get(address)
        soup = BeautifulSoup(web_request.text, features='html.parser')
        # ^https anchors the match to the start of the href.
        list_of_links = [link['href']
                         for link in soup.find_all('a', href=True)
                         if re.search("^https", link['href'])]
        DB().links().delete(id)
        for url in list_of_links[:10]:
            DB().links().insert(url, id)
        DB().pages().update(id, 'False')
        return '===============Successfully scraped================'
    except Exception as e:
        # NOTE(review): broad catch preserved so callers keep getting
        # None instead of an exception, but the error printed now
        # carries real context.
        print(e)
def spider(page_id):
    '''Scrape the url linked to ``page_id`` and store up to 10 links.

    Takes a page id, selects the url linked to it, marks the page as
    being scraped, collects hyperlinks starting with http from the
    fetched page, replaces the links previously stored for the page
    (at most 10 are inserted), then clears the scraping flag.

    :param page_id: (int) id of the page to scrape; must be a non-zero int.
    :raises ValueError: If ``page_id`` is invalid or not found.
    '''
    if not isinstance(page_id, int) or page_id == 0:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        # Bug fix: the original *returned* the ValueError instance
        # instead of raising it, so callers never saw the failure.
        raise ValueError('Page Id not found')
    url = get_url[0]
    # set is_scraping to True where id == page_id
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    all_links = [link['href']
                 for link in soup.find_all('a', href=True)
                 if link['href'].startswith('http')]
    # Delete any rows already stored for this page before re-inserting.
    DB.links().delete_by_page_id(page_id)
    for link in all_links[:10]:
        # Insert each link into the links table
        Links(DB().connect()).insert(page_id, link)
    # set is_scraping to False where id == page_id
    DB.pages().update_by_id(False, page_id)
class TestDatabase(TestCase):
    '''Tests for the database (db) helper functions.'''

    def setUp(self):
        # Fresh DB wrapper for every test.
        self.db = DB()

    def test_connection(self):
        '''connect() should return a non-None connection object.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''setup() creates the schema and returns None.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''seed() populates the tables and returns None.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''pages() exposes a selectable pages interface.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''links() exposes a selectable links interface.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''Close the connection after every test.

        Bug fix: the original named this ``TearDown``, which unittest
        never calls, so connections were leaked between tests.
        '''
        self.db.connect().close()
def test_pages(self):
    '''pages() should return a usable (non-None) interface object.'''
    pages_interface = DB.pages()
    self.assertIsNotNone(pages_interface)
# Show examples of how you would use ALL your implementations here
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

# Bug fix: this instantiation was commented out, so every db.* call
# below raised NameError at runtime.
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()

pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# Example celery wiring kept for reference (disabled):
#
# app = Celery('main', broker=config('CELERY_BROKER'),
#              backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)

# spider_scrap(1)
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
# Broker and backend urls are read from the environment via
# python-decouple's config().
app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))


@app.task
def test():
    # Kick off a scrape of page id 1 through the celery worker.
    return spider(1)


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
# NOTE(review): update_by_id() is called with no arguments here, but
# other call sites pass (flag, page_id) — confirm this isn't a TypeError.
DB.pages().update_by_id()

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
def test_pages(self):
    '''Test pages interface.'''
    # Bug fix: the original passed None as assertIsNotNone's *msg*
    # argument (mimicking a two-argument comparison assert); only the
    # value under test belongs here.
    self.assertIsNotNone(DB.pages())