import re

import requests
from bs4 import BeautifulSoup


def fetchData(index):
    # Scrape the Unity "Made With" stories page and return details
    # for the story at `index`.
    info = requests.get('https://unity.com/madewith')
    soup = BeautifulSoup(info.text, "lxml")
    link = soup.find_all('div', attrs={
        'class': 'section-home-stories--item-image',
        'style': True
    })
    images = []
    for count, pics in enumerate(link):
        # the background image URL is embedded in the style attribute
        images.append("https://unity.com" +
                      re.findall(r"\('(/sites/.*)'\)", pics['style'])[0])
    projects = [
        proj.text for proj in soup.find_all(
            'div', attrs={'class': 'section-home-stories--item-title'})
    ]
    authors = [
        auth.text for auth in soup.find_all(
            'div', attrs={'class': 'section-home-stories--item-studio'})
    ]
    urls = []
    for links in soup.find_all('article'):
        urls.append("https://unity.com" + links.find('a').attrs['href'])
    page = requests.get(urls[index])
    soup2 = BeautifulSoup(page.text, "lxml")
    headers = [
        head.text
        for head in soup2.find_all('div', attrs={'class': 'title-large'})
    ]
    texts = [
        txt.text
        for txt in soup2.find_all('div', attrs={'class': 'section-article-text'})
    ]
    fetched = [{
        'project': projects[index],
        'author': authors[index],
        'link': urls[index],
        'bg': images[index],
        'h1': headers[0],
        'h2': headers[1],
        'h3': headers[2],
        'h1text': texts[0],
        'h2text': texts[1],
        'h3text': texts[2],
    }]
    return fetched
from bs4 import BeautifulSoup


def get_pages_count(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('span', class_='mhide')
    if pagination:
        # the last 'mhide' span holds the page count; cast so both
        # branches return an int
        return int(pagination[-1].get_text())
    else:
        return 1
import requests
from bs4 import BeautifulSoup


def check_price(T_price):
    # URL, headers, and send_mail() are expected to be defined at module level
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find(id="productTitle").get_text()
    price = soup.find(id="priceblock_ourprice").get_text()
    # naive: assumes the numeric price fits in the first five characters
    converted_price = float(price[:5])
    if converted_price < T_price:
        send_mail()
import codecs

from bs4 import BeautifulSoup


def update_pom(self, pom_location, service_name):
    # replace the <name> element in the POM file with the service name
    with open(pom_location, "r") as file:
        file_content = file.readlines()
    content = "".join(file_content)
    beautify_content = BeautifulSoup(content, "lxml")
    beautify_content.find('name').string = service_name
    file_pointer = codecs.open(pom_location, "w", "utf-8")
    file_pointer.write(str(beautify_content))
    file_pointer.close()
import bs4
import requests


def downloadXkcd(startComic, endComic):
    for urlNumber in range(startComic, endComic + 1):
        # download the webpage
        print('Downloading page http://xkcd.com/%s...' % (urlNumber))
        res = requests.get('http://xkcd.com/%s' % (urlNumber))
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Get the URL of the comic image
        comicElem = soup.select('#comic img')
        if comicElem == []:
            print('Could not find comic image.')
import re

import requests
from bs4 import BeautifulSoup


def trackPrice(self):
    page = requests.get(self.__url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.3"})
    if page:  # a truthy Response means the request succeeded (status < 400)
        soup1 = BeautifulSoup(page.content, "html.parser")
        soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
        element = soup2.find(self.__webData["tag"],
                             attrs=self.__webData["attributes"])
        if element:
            if self.__webData["inside"] == False:
                text = element.getText()
            else:
                text = element[self.__webData["inside"]]
            text2 = text.strip().replace(",", ".")
            # a bare $ is a regex anchor, so match the currency symbols
            # in a character class instead
            price = float(re.sub(r"[$€]", "", text2))
            self.__price = price
            if self.__price <= self.__desirePrice:
                self.sendEmail()
        else:
            print("No element detected")
    else:
        print("No web detected")
    print("Track done")
from bs4 import BeautifulSoup


def __scan_pom_file(self, pom_location):
    # map each dependency's groupId to its artifactId
    with open(pom_location, "r") as file:
        file_content = file.readlines()
    content = "".join(file_content)
    beautify_content = BeautifulSoup(content, "lxml")
    results = beautify_content.find_all("dependency")
    dependencies = {}
    key = None
    value = None
    for res in results:
        for child in res.children:
            if child.name == "groupid":
                key = str(child.contents[0])
            if child.name == "artifactid":
                value = str(child.contents[0])
            if key is not None and value is not None:
                dependencies[key] = value
                key = None
                value = None
    return dependencies
from bs4 import BeautifulSoup


def get_content(html):
    # HOST is expected to be defined at module level
    soup = BeautifulSoup(html, 'html.parser')
    # get the links from the block we need, by its class
    items = soup.find_all('a', class_='na-card-item')
    cars = []  # the list we will fill inside the loop
    for item in items:
        # Check that the data exists; if not, substitute a fallback text
        uah_price = item.find('span', class_='size15')
        if uah_price:
            # replace() strips the unwanted character
            uah_price = uah_price.get_text().replace('*', '')
        else:
            uah_price = 'Price on request'
        # End of the check
        cars.append({
            # strip=True removes leading and trailing whitespace
            'title': item.find('div', class_='na-card-name').get_text(strip=True),
            'link': HOST + item.find('span', class_='link').get('href'),
            'usd_price': item.find('strong', class_='green').get_text(),
            'uah_price': uah_price,
            # here we grab the text node that follows the pin icon
            'city': item.find('svg', class_='svg_116_pin').find_next(string=True),
        })
    return cars
""" import bs4, requests # bs4.BeautifulSoup() function needs to be called with a string containing the HTML # it will parse. The bs4.BeautifulSoup() function returns is a BeautifulSoup object res = requests.get('http://nostarch.com') res.raise_for_status() noStarchSoup = bs4.BeautifulSoup(res.text) # text attribute of teh response is passed tp bs4.beautifulsoup type(noStarchSoup) # <class 'bs4.BeautifulSoup'> # HTML file can also be loaded from the hard drive examplefile = open('example.html') exampleSoup = bs4.beautifulSoup(examplefile) type(exampleSoup) # <class 'bs4.BeautifulSoup'> # select() method can be used find the element of interest # select method with CSS selector if the element # these selectors are like regex, they specify an HTML pattern to look for in HTML pages # ex:soup.select('#author') The element with an id attribute of author # The select() method will return a list of HTML Tag objects. # The list will contain one Tag object for every match in the BeautifulSoup object’s HTML. # Tag values can be passed to the str() function to show the HTML TAGs they represent. # Tag values also have an attrs attribute that shows all the HTML attributes of the TAG as a dictionary # getText() can be used with the tag object to check teh text value they represent # The get() method for Tag objects makes it simple to access attribute values of a TAG element. # The method is passed an attribute name and returns that attribute’s # value. examplefile = open('example.html')
5. do the same thing for http://isitchristmas.com
6. do the same thing for http://emerging-media.info/class/
7. try a url that you know is missing; what status code do you get?
8. notice all of the repeated code? how could we reduce the redundant code?
   (one possible sketch appears at the end of this snippet)
"""

# import the bs4 module
import bs4

# create the following html string and assign it to a variable named unordered_list
unordered_list = """
<ul>
    <li>one</li>
    <li>two</li>
</ul>
"""

# create a beautiful soup object using the html above
soup = bs4.BeautifulSoup(unordered_list, 'html.parser')

# print out a formatted version of the soup object
print(soup.prettify())

# print out the ul tag
print(soup.ul)

# print out the first li in the ul
print("===")
# print out the string that's between the first li tags
print(soup.ul.li.string)

# create the following html string and assign it to a variable named paragraphs
paragraphs = """
<div>
    <p>This is a paragraph.</p>
    <p>So is <strong>this</strong>.</p>
    <p class="foo">This has a <strong>class</strong> attribute!</p>
</div>
"""
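# One possible answer to question 8 above: a hedged sketch, assuming the
# repeated code in the exercises was "request a URL, check the status, parse
# the HTML". The helper name get_soup is hypothetical, not part of the
# original exercises.
import bs4, requests

def get_soup(url):
    # fetch the page, fail loudly on a bad status code, and return parsed soup
    res = requests.get(url)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, 'html.parser')

# usage: the same three lines no longer repeat for every site
# christmas_soup = get_soup('http://isitchristmas.com')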
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as beautifulSoup

# https://www.youtube.com/watch?v=XQgXKtPSzUI youtube video for reference of script

my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=StoreIM&Depa=1&Category=38'

# Opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML Parser
page_soup = beautifulSoup(page_html, "html.parser")

# Grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping \n"
f.write(headers)

for container in containers:
    # use the current container, not always the first one
    divWithInfo = container.find("div", "item-info")
    brand = divWithInfo.div.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
#! python3
import webbrowser, bs4, requests, sys

print("Googling...")  # shown while the search page is loading
res = requests.get('https://www.google.com/search?q=' + ' '.join(sys.argv[1:]))
res.raise_for_status()  # res.status_code == requests.codes.ok

googleSoup = bs4.BeautifulSoup(res.text, 'html.parser')
linkElems = googleSoup.select('.r a')
tabnums = min(5, len(linkElems))  # returns the minimum of the two numbers
for i in range(tabnums):
    webbrowser.open('http://google.com' + linkElems[i].get('href'))
import os
import requests  # HTTP requests and protocol
import shutil  # file manipulation, copying, etc.
from bs4 import BeautifulSoup

with requests.Session() as rs:
    p = rs.post('#URL#', data=#ADD DATA VARB(dict)#)
    for e in range(1, page + 1):
        requete = rs.get("#URL#" + "#page#" + str(e))
        content = requete.content
        soup = BeautifulSoup(content, 'html.parser')
        for i in soup.find('div', '#ADD HTML#').findAll('EX: img'):
            url = i['src'].replace('//', 'https://')
            file = url.rsplit('/', 1)[1]
            rs.headers.update("#dict URL#")
            ret = rs.get(url, stream=True)
            # save the file to disk
            with open(file, 'wb') as out_file:
                ret.raw.decode_content = True
                shutil.copyfileobj(ret.raw, out_file)
from bs4 import BeautifulSoup
import requests

source = requests.get("http://coreyms.com").text
soup = BeautifulSoup(source, 'html.parser')
print(soup.prettify())

article = soup.find('article')
# print(article.prettify())

# headline = article.h2.a.text
# print(headline)

summary = article.find('div', class_='entry-content').p.text
print(summary)
from bs4 import BeautifulSoup
import requests

source = requests.get("http://coreyms.com").text
soup = BeautifulSoup(source, 'lxml')
print(soup.prettify())

article = soup.find('article')
# print(article.prettify())

# headline = article.h2.a.text
# print(headline)

summary = article.find('div', class_='entry-content').p.text
print(summary)