def getTournaments():

    import requests
    import re
    from BeautifulSoup import BeautifulSoup

    # https://smash.gg/tournaments?per_page=30&filter=%7B%22name%22%3A%22%22%2C%22past%22%3Atrue%2C%22upcoming%22%3Afalse%7D&page=1

    maxPage = 2  # placeholder; see the pagination sketch after this function

    urlBase = "https://smash.gg/tournaments?per_page=30&filter=%7B%22name%22%3A%22%22%2C%22past%22%3Atrue%2C%22upcoming%22%3Afalse%7D&page="

    for i in range(1, maxPage + 1):  # pages are 1-indexed

        url = urlBase + str(i)

        # Do request, tolerating truncated responses
        print url
        try:
            data = requests.get(url).text
        except requests.exceptions.ChunkedEncodingError:
            print "Error fetching page: " + url
            continue
        print data

        # Keep a copy of the raw HTML for debugging
        with open('requesttest.txt', 'w') as f:
            f.write(data)

        soup = BeautifulSoup(data)
        divs = soup.findAll("div", {"class": "TournamentCardContainer"})
        print len(divs)
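
One way to replace the hard-coded maxPage above: fetch page 1 and take the largest page number that appears in the pager links. A minimal sketch, assuming the pager anchors carry a "page=<n>" query parameter (that selector is an assumption, not verified against smash.gg's actual markup):

import re
import requests
from BeautifulSoup import BeautifulSoup

def getMaxPage(urlBase):
    # Read page 1 and scan every link for a "page=<n>" parameter.
    soup = BeautifulSoup(requests.get(urlBase + "1").text)
    pages = [1]
    for a in soup.findAll("a", href=True):
        match = re.search(r"[?&]page=(\d+)", a["href"])
        if match:
            pages.append(int(match.group(1)))
    return max(pages)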
Example #2
import scraperwiki
from BeautifulSoup import BeautifulSoup

def scrape_page(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    # print soup.prettify()
    # link_table = soup.find("div", {"class": "alphabet_list clearfix"})
    profiles = soup.findAll("div", {"class": "profileFriendsText"})  # the sections where the friends are listed
    for profile in profiles:
        link = profile.find("a")  # the anchor inside each friend entry
        if link and link.has_key('href'):
            next_url = link['href']
            print next_url
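
A hedged sketch of how these printed URLs could drive a bounded crawl; it assumes scrape_page is modified to return the collected hrefs rather than print them (the crawl function and its names are illustrative, not part of the original):

def crawl(start_url, limit=50):
    # Breadth-first walk over friend pages, capped at `limit` distinct profiles.
    seen = set()
    queue = [start_url]
    while queue and len(seen) < limit:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        for next_url in scrape_page(url):  # assumes scrape_page returns hrefs
            if next_url not in seen:
                queue.append(next_url)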
Example #4
import re
from BeautifulSoup import BeautifulSoup
from anonBrowser import anonBrowser  # mechanize-based wrapper class defined alongside this script

def printLinks(url):
    ab = anonBrowser()
    ab.anonymize()
    page = ab.open(url)
    html = page.read()
    try:
        print '[+] Printing links from regex.'
        link_finder = re.compile('href="(.*?)"')
        links = link_finder.findall(html)
        for link in links:
            print link
    except:
        pass
    try:
        print '\n[+] Printing links from BeautifulSoup.'
        soup = BeautifulSoup(html)
        links = soup.findAll(name='a')
        for link in links:
            if link.has_key('href'):
                print link['href']
    except:
        pass
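
printLinks depends on the anonBrowser wrapper; the same two-pass comparison (naive regex versus the parser) works with plain urllib2 if anonymization is not needed. A minimal standalone sketch (printLinksPlain is an illustrative name, not part of the original):

import re
import urllib2
from BeautifulSoup import BeautifulSoup

def printLinksPlain(url):
    html = urllib2.urlopen(url).read()
    # Pass 1: naive regex over the raw HTML.
    for link in re.findall('href="(.*?)"', html):
        print link
    # Pass 2: let the parser find anchor tags.
    soup = BeautifulSoup(html)
    for link in soup.findAll('a', href=True):
        print link['href']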
import scraperwiki                          # ScraperWiki helper library
from BeautifulSoup import BeautifulSoup     # HTML parser for the fetched pages

print "Top 10 countries by currency"

Page = scraperwiki.scrape('http://www.xe.com/')
Source = BeautifulSoup(Page)
scraperwiki.metadata.save('columns', ['country name', 'currency name', 'worldrank', 'highest denomination till date', 'year of currency establishment', 'trading', 'mobile currency site'])
MainTable = Source.find("table")            # first table on the page, assumed to hold the currency rows
RowDetails = MainTable.findAll("tr")
print "****Scraping started*****"
for row in RowDetails:
    Dicrecord = {}  # dictionary to store one currency's details
    Columns = row.findAll("td")
    if len(Columns) >= 9:  # skip header and short rows before indexing
        Dicrecord['country name'] = Columns[0].text
        Dicrecord['currency name'] = Columns[1].text
        Dicrecord['highest denomination till date'] = Columns[2].text
        Dicrecord['year of currency establishment'] = Columns[3].text
        Dicrecord['trading'] = Columns[5].text
        Dicrecord['mobile currency site'] = Columns[8].text
        scraperwiki.datastore.save(["top 10 currency"], Dicrecord)
        print Dicrecord
print "****Scraping completed*****"