import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.pages import Pages
from src.db.links import Links  # assumed path, mirroring src.db.pages


def web_scraper(page_id):
    """Accepts a page id, checks that it exists in the pages table, and
    scrapes at most 10 links from the page at that id's url."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]
    if page_id not in new_all_id:
        raise ValueError('Id does not exist.')
    url = Pages(DB.connect()).select_url(page_id)
    # flag the page as being scraped
    DB.pages().update(True, page_id)
    value = requests.get(url)
    soup = BeautifulSoup(value.text, 'html.parser')
    list_urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('https'):
            list_urls.append(link['href'])
    new_list_urls = list_urls[:10]
    # replace any links previously stored for this page
    DB.links().delete_by_page_id(page_id)
    for item in new_list_urls:
        Links(DB.connect()).insert(page_id, item)
    # scraping finished; reset the flag
    DB.pages().update(False, page_id)
def scraping_function(page_id):
    """
    Implements the web scraper that inserts into the links table.

    :param page_id (int): The id at which the url to be scraped is retrieved.
    :return: None
    :raises TypeError: if the id is not found in the pages table.
    """
    try:
        # retrieve the url from the pages table and flag the page as scraping
        url = DB.pages().fetch(page_id)
        DB.pages().update(True, page_id)
        link_list = []
        r = requests.get(url[0])
        # scrape the page for https hyperlinks; startswith avoids matching
        # 'https' buried in the middle of an unrelated href
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                link_list.append(link['href'])
        links = link_list[:10]
        # replace any links previously stored for this page
        DB.links().delete(page_id)
        for i in links:
            DB.links().insert(page_id, i)
        DB.pages().update(False, page_id)
        return None
    except TypeError:
        # an unknown id presumably makes fetch return None, so url[0]
        # raises the TypeError that is re-raised here with a clear message
        raise TypeError('Id not found in Pages Table')
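# A quick demonstration of the error contract documented above, assuming a
# set-up and seeded database; 999 is an illustrative id with no row in the
# pages table, so the function surfaces its TypeError.
try:
    scraping_function(999)
except TypeError as err:
    print(err)  # -> Id not found in Pages Table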
import re

import requests
from bs4 import BeautifulSoup

from src.db import DB


def spider_scrap(page_id):
    '''Receives a page_id and inserts that page's links into the links table.'''
    page_ids = [i[0] for i in DB().pages().select()]
    if page_id in page_ids:
        url = DB().pages().fetch_url(page_id)
    else:
        raise ValueError('page_id not valid')
    # update is_scraping to true
    DB().pages().update_id_true(page_id)
    # fetch the html content at the page url
    page = requests.get(url[0])
    # parse the html content to extract a maximum of 10 hyperlinks
    soup = BeautifulSoup(page.text, features='html.parser')
    links_list = []
    for link in soup.find_all('a', href=True):
        links = link['href']
        if re.search("^https", links):
            links_list.append(links)
    link_url = links_list[:10]
    DB.links().delete(page_id)
    # save the newly extracted links to the database for the page
    for item in link_url:
        DB.links().insert(page_id, item)
    DB().pages().update_id_false(page_id)


# print(spider_scrap(1))
import requests
from bs4 import BeautifulSoup

from src.db import DB


def scrape(page_id):
    # flag the page as being scraped (a boolean, not the string 'True')
    DB.pages().update(True, page_id)
    url = DB().pages().fetch(page_id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    # replace any links previously stored for this page
    DB.links().delete(page_id)
    for i in new_links:
        # insert takes the page id first, then the link
        DB.links().insert(page_id, i)
    # scraping finished; reset the flag
    DB.pages().update(False, page_id)
import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.links import Links  # assumed path, mirroring src.db.pages


def spider(page_id):
    '''
    Takes a page id, selects the url linked to that page id, and runs the
    scraper. The scraper fetches the url and collects the urls found on the
    page; a maximum of 10 links are inserted into the database.
    '''
    if type(page_id) is not int or page_id < 1:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        raise ValueError('Page Id not found')
    url = get_url[0]
    all_links = []
    # set is_scraping to True where id == page_id
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('http'):
            all_links.append(link['href'])
    # if the page id is already in the links table, delete its rows first
    DB.links().delete_by_page_id(page_id)
    for link in all_links[:10]:
        # insert each link into the links table
        Links(DB().connect()).insert(page_id, link)
    # set is_scraping to False where id == page_id
    DB.pages().update_by_id(False, page_id)
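# A minimal sketch of driving spider end to end, assuming the database has
# already been set up and seeded and that a page with id 1 exists (both
# assumptions; the id is illustrative). The rows printed afterwards come
# from the links table that spider just repopulated.
spider(1)
for row in DB.links().select():
    print(row)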
from unittest import TestCase

from src.db import DB


class TestDatabase(TestCase):
    '''Class to test the database (db) functions'''

    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''the teardown function for all the tests; unittest only calls this
        hook when it is spelled tearDown, not TearDown.'''
        self.db.connect().close()
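# The class above follows stdlib unittest conventions, so it can be run with
# the standard runner; assuming it lives in its own test module, a minimal
# entry point is:
if __name__ == '__main__':
    import unittest
    unittest.main()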
def test_links(self):
    self.assertIsNotNone(DB.links())
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

# scratch script exercising the DB interface
db = DB()  # must be assigned (not commented out) for the calls below
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# app = Celery('main', broker=config('CELERY_BROKER'),
#              backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)


# spider_scrap(1)
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
app = Celery('main', broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def test():
    return spider(1)


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
DB.pages().update_by_id(True, 1)  # example args: update_by_id(is_scraping, id)

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
from celery import Celery
from decouple import config

from src.db import DB
from src.scraper import scraping_function  # assumed module path

app = Celery('spider', broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task()
def cel_spider():
    """
    Implements the celery task for the spider.

    :return None: Returns the value of the scraping function, which is None.
    """
    return scraping_function(2)


# ---- Usage of DB class ----
#
# DB.serv_conn()
#
# DB.connect()
# DB.setup()
# DB.seed()
#
# ---- Usage of pages.py ----
# DB.pages().select()
# DB.pages().fetch(2)
# DB.pages().update(True, 2)
#
# ---- Usage of links.py ----
# DB.links().select()
# DB.links().fetch()
# DB.links().insert(1, 'https://www.facebook.com')
DB.links().delete(1)

# scraping_function(3)
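# A sketch of exercising the task above, assuming this module is saved as
# main.py and CELERY_BROKER / CELERY_BACKEND point at a running broker.
# Start a worker first, e.g. `celery -A main worker --loglevel=info`, then
# enqueue the task from another shell:
#
#     from main import cel_spider
#     result = cel_spider.delay()
#     result.get(timeout=60)  # blocks until the spider run finishes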
def test_links(self):
    '''
    Test links interface
    '''
    self.assertIsNotNone(DB.links())