Python scrape_html Exemples, scrape.scrape_html Python Exemples

Exemple #1

0

Afficher le fichier

def scrape_links_rbc():
    """Collect article links from the rbc.ua news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Every anchor found inside a ``div`` whose class is
    ``content-section`` contributes its ``href`` value unchanged.

    Returns:
        list: Unique ``href`` values in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    url = 'https://www.rbc.ua/rus/news'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for div in html.find_all('div', {'class': 'content-section'}):
        # href=True restricts the match to anchors that carry an href
        # attribute; indexing a bare <a> with a['href'] raises KeyError and
        # would abort the whole scrape.
        for a in div.find_all('a', href=True):
            links.add(a['href'])

    return list(links)

Exemple #2

0

Afficher le fichier

def scrape_links_Gordon():
    """Collect article links from the gordonua.com news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Anchors inside every ``div`` with class ``lenta_head`` are
    resolved against ``https://gordonua.com``.

    Returns:
        list: Unique absolute URLs in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    from urllib.parse import urljoin

    base = 'https://gordonua.com'
    url = 'https://gordonua.com/ukr/news.html'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for div in html.find_all('div', {'class': 'lenta_head'}):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in div.find_all('a', href=True):
            # urljoin gives the same result as plain concatenation for
            # root-relative paths, but leaves already-absolute hrefs intact
            # instead of gluing the site prefix in front of them.
            links.add(urljoin(base, a['href']))

    return list(links)

Exemple #3

0

Afficher le fichier

def scrape_links_Glavcom():
    """Collect article links from the glavcom.ua news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Anchors inside every ``ul`` with class ``list`` are resolved
    against ``https://glavcom.ua``.

    Returns:
        list: Unique absolute URLs in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    from urllib.parse import urljoin

    base = 'https://glavcom.ua'
    url = 'https://glavcom.ua/news.html'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for ul in html.find_all('ul', {'class': 'list'}):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in ul.find_all('a', href=True):
            # urljoin matches plain concatenation for root-relative paths
            # and keeps already-absolute hrefs from being double-prefixed.
            links.add(urljoin(base, a['href']))

    return list(links)

Exemple #4

0

Afficher le fichier

def scrape_links_NewsOne():
    """Collect article links from the newsone.ua news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Anchors inside every ``h2`` element are resolved against
    ``https://www.newsone.ua``.

    Returns:
        list: Unique absolute URLs in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    from urllib.parse import urljoin

    # The listing is requested from the bare domain while links are emitted
    # with the "www." host, exactly as the original code did.
    base = 'https://www.newsone.ua'
    url = 'https://newsone.ua/news.html'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for h2 in html.find_all('h2'):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in h2.find_all('a', href=True):
            # urljoin matches plain concatenation for root-relative paths
            # and keeps already-absolute hrefs from being double-prefixed.
            links.add(urljoin(base, a['href']))

    return list(links)

Exemple #5

0

Afficher le fichier

def scrape_links_Pravda():
    """Collect article links from the pravda.com.ua news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Anchors inside every ``div`` with class ``article`` are
    inspected; hrefs that already start with ``https`` are dropped and the
    remaining ones are prefixed with ``https://www.pravda.com.ua``.

    Returns:
        list: Unique prefixed URLs in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    url = 'https://www.pravda.com.ua/news/'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for div in html.find_all('div', {'class': 'article'}):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in div.find_all('a', href=True):
            href = a['href']
            # NOTE(review): absolute https links are skipped here —
            # presumably because they point off-site; confirm intent.
            if href.startswith('https'):
                continue
            links.add('https://www.pravda.com.ua' + href)

    return list(links)

Exemple #6

0

Afficher le fichier

def scrape_links_Unian():
    """Collect article links from the unian.ua main-news listing page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Every anchor found inside a ``div`` whose class attribute is
    ``gallery-item news-inline-item`` contributes its ``href`` unchanged.

    Returns:
        list: Unique ``href`` values in no particular order. Empty when
        ``scrape_html`` returns ``None`` or when nothing matches.
    """
    url = 'https://www.unian.ua/detail/main_news'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for div in html.find_all('div',
                             {'class': 'gallery-item news-inline-item'}):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in div.find_all('a', href=True):
            links.add(a['href'])

    return list(links)

Exemple #7

0

Afficher le fichier

Fichier : getting_data.py Projet : pazzzych/FakeNews

def scrape_data(url):
    """Extract a title and the body text from the page at *url*.

    The page is fetched with ``scrape_html`` (defined elsewhere). When it
    returns ``None`` both results stay empty strings.

    Args:
        url: Address handed to ``scrape_html``.

    Returns:
        list: ``[title, text]`` where ``title`` is the text of the last
        ``h1`` on the page with newlines removed, and ``text`` is the
        concatenated text of every ``p`` element in document order.
    """
    page = scrape_html(url)
    if page is None:
        return ['', '']

    soup = BeautifulSoup(page, 'html.parser')

    # When several h1 elements exist, the last one wins.
    headings = soup.findAll("h1")
    title = headings[-1].text.replace('\n', '') if headings else ''

    text = ''.join(paragraph.text for paragraph in soup.findAll("p"))

    return [title, text]

Exemple #8

0

Afficher le fichier

def scrape_links_Obozrevatel():
    """Collect article links from the obozrevatel.com front page.

    The page is fetched with ``scrape_html`` (defined elsewhere; this
    function only relies on it returning ``None`` when there is nothing to
    parse). Every anchor inside an ``article`` element contributes its
    ``href``, except hrefs ending in ``/``, which are discarded.

    Returns:
        list: Unique ``href`` values in no particular order, none ending
        in ``/``. Empty when ``scrape_html`` returns ``None`` or when
        nothing matches.
    """
    url = 'https://www.obozrevatel.com/ukr/'
    response = scrape_html(url)
    if response is None:
        return []

    html = BeautifulSoup(response, 'html.parser')
    links = set()
    for article in html.find_all("article"):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on a['href'].
        for a in article.find_all('a', href=True):
            href = a['href']
            # Filter while collecting: removing items from a list that is
            # being iterated skips the element after each removal, so some
            # trailing-slash links used to slip through.
            if href.endswith('/'):
                continue
            links.add(href)

    return list(links)