Example #1
0
def test_find_multiple(fake_html_1):
    """find() in multi-match mode returns every matching <div>."""
    matches = Soup(fake_html_1).find("div", {"class": "baz"})
    assert len(matches) == 2
    assert str(matches[1]) == '<div class="baz">Oh No!</div>'
Example #2
0
def make_soup(query, page=1):
    """Fetch one page of search results for *query* and wrap it in a Soup.

    page defaults to the first results page.
    """
    search_params = {'q': query, 'commit': 'Search', 'page': page}
    return Soup(get(URL, search_params))
Example #3
0
def test_find_with_attrs(fake_html_1):
    """An attrs dict narrows find() to the tag with the matching id."""
    tag = Soup(fake_html_1).find("p", {"id": "blarg"})
    assert str(tag) == '<p id="blarg">Try for 2</p>'
Example #4
0
from gazpacho import get, Soup
from selenium import webdriver
import re
import urllib
import time
import pandas as pd

# Open the Pinterest board in Firefox and scroll to the bottom repeatedly
# so the lazily-loaded images are added to the DOM before we read the source.
browser = webdriver.Firefox()
browser.get("https://www.pinterest.ca/catherine8610/greek-pottery/")
for i in range(1, 5000):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Parse the fully-scrolled page with gazpacho.
soup = Soup(browser.page_source)

# Pull every https...jpg URL out of the raw HTML.
greek_images = re.findall(r'https:\/\/[^"]+?\.jpg', str(soup))

# De-duplicate via pandas while keeping a list, so each URL's position can
# later be used as part of its filename.
greek_df = pd.DataFrame(greek_images)
greek_urls = greek_df[0].unique().tolist()

# Download every second image (the odd list positions, matching the
# original [1::2] slice).  enumerate() gives the position directly, so the
# generated filenames are identical to the old greek_urls.index(url) lookup
# (the list is unique) without an O(n) search per URL.
for index, url in enumerate(greek_urls):
    if index % 2 == 0:
        continue
    try:
        urllib.request.urlretrieve(
            url, f"/Users/username/Greece/greek{index}.jpg")
    except Exception:
        # Best-effort: skip images that fail to download, but unlike the
        # original bare `except:` do not swallow KeyboardInterrupt/SystemExit.
        pass

browser = webdriver.Firefox()
browser.get(
    "https://www.pinterest.ca/armidaptaylor/italian-renaissance-ceramicsceramiche-rinascimenta/"
)
for i in range(1, 2500):
Example #5
0
def make_soup():
    """Load the Amazon wishlist page via the shared Selenium browser and parse it."""
    wishlist = 'https://www.amazon.ca/hz/wishlist/ls/17CRUYWGYZ5Y2'
    browser.get(wishlist)
    return Soup(browser.page_source)
Example #6
0
def test_find_strict(fake_html_2):
    """strict=True requires an exact attribute match, not a partial one."""
    divs = Soup(fake_html_2).find("div", {"class": "foo"}, strict=True, mode="all")
    assert len(divs) == 1
Example #7
0
from gazpacho import get, Soup

# Fetch the team-spending page and parse it.
url = "https://scrape.world/spend"
html = get(url)
soup = Soup(html)

# Each data row in the spending table carries class "tmx".
trs = soup.find("tr", {"class": "tmx"})


def parse_tr(tr):
    """Extract a (team, cap_hit) pair from one spending-table row."""
    team = tr.find("td", {"data-label": "TEAM"}).text
    raw = tr.find("td", {"data-label": "TODAYS CAP HIT"}).text
    # Strip thousands separators, then drop the leading currency character
    # before converting to float.
    spend = float(raw.replace(",", "")[1:])
    return team, spend


# Parse every row into (team, spend) tuples and show them.
spend = [parse_tr(tr) for tr in trs]

print(spend)
Example #8
0
def test_find_no_match_first(fake_html_1):
    """mode="first" returns None when nothing matches."""
    soup = Soup(fake_html_1)
    result = soup.find("a", mode="first")
    # `is None`, not `== None` — identity is the idiomatic (PEP 8) None check.
    assert result is None
Example #9
0
def test_find_no_match_all(fake_html_1):
    """mode="all" yields an empty list when nothing matches."""
    hits = Soup(fake_html_1).find("a", mode="all")
    assert hits == []
Example #10
0
def test_find_mutliple_imgs(fake_html_3):
    """Multiple <img> tags are all returned, each with parsed attrs."""
    images = Soup(fake_html_3).find("img")
    second = images[1]
    assert second.attrs["src"] == "bye.jpg"
Example #11
0
def test_remove_tags(fake_html_4):
    """remove_tags() strips all markup, leaving only the text content."""
    stripped = Soup(fake_html_4).remove_tags()
    expected = "I like soup and I really like cold soup I guess hot soup is okay too"
    assert stripped == expected
Example #12
0
from gazpacho import get
from gazpacho import Soup
import pandas as pd
import time

# Fantasy-hockey projections for one position group ('F' = forwards).
position = 'F'

base = f'https://www.cbssports.com/fantasy/hockey/stats'
url = f'{base}/{position}/2019/restofseason/projections/'
html = get(url)
soup = Soup(html)
# HTML: <tr class="TableBase-bodyTr ">
# The trailing space in the class value below is deliberate — it must match
# the site's markup exactly.
rows = soup.find('tr', {'class': 'TableBase-bodyTr '})
row = rows[0]
print(row)
Example #13
0
from gazpacho import Soup, get

# Fetch the books page and parse it.
url = "https://scrape.world/books"
html = get(url)
soup = Soup(html)
# partial=True makes the class value a partial match, so "book-" catches
# every book card (presumably classes like "book-0", "book-1", ... — confirm
# against the page markup).
books = soup.find("div", {"class": "book-"}, partial=True)


def parse(book):
    """Return a (name, price) pair for one book card."""
    name = book.find("h4").text
    price_text = book.find("p").text
    # Drop the leading currency character, then keep the first token.
    price = float(price_text[1:].split(" ")[0])
    return name, price


# Parse every book card; in a notebook/REPL this bare expression echoes
# the resulting list of (name, price) pairs.
[parse(book) for book in books]
Example #14
0
def make_soup(url):
    """Download *url* and return the HTML wrapped in a gazpacho Soup."""
    return Soup(get(url))
Example #15
0
def test_find_text(fake_html_1):
    """.text exposes a matched tag's inner text."""
    tag = Soup(fake_html_1).find("p", {"id": "blarg"})
    assert tag.text == "Try for 2"
Example #16
0
def test_find_no_match_auto(fake_html_1):
    """mode="auto" returns None when nothing matches."""
    soup = Soup(fake_html_1)
    result = soup.find("a", mode="auto")
    # `is None`, not `== None` — identity is the idiomatic (PEP 8) None check.
    assert result is None
Example #17
0
def test_find_nested_groups(fake_html_2):
    """Nested tags that both match are returned as separate results."""
    found = Soup(fake_html_2).find("div", {"class": "foo"})
    assert len(found) == 2
Example #18
0
from gazpacho import Soup
from tqdm import tqdm
import time
import random

# step 1: get the html and make it parsable

symbol = "LDCAP"
url = f"https://www.hsx.com/security/view/{symbol}"
soup = Soup.get(url)

# step 2: get the name and current price

# The name <p> reads like "Name (SYMBOL)"; split off everything from " (".
name = soup.find("p", {"class": "security_name"}).text.split(" (")[0]
# [2:] drops a two-character prefix before the number (presumably a currency
# marker such as "H$" — confirm against the live page).
value = float(soup.find("p", {'class': "value"}).text[2:])

# step 3: find movies

lis = soup.find("ul", {"class": "credit"}).find("li")
li = lis[1]
# The movie's ticker is the last path segment of the credit link's href.
movie_symbol = li.find("span").find("a").attrs["href"].split("/")[-1]

# step 4: get movie price (which is really just step 1 + 2)

url = f"https://www.hsx.com/security/view/{movie_symbol}"
soup = Soup.get(url)

name = soup.find("p", {"class": "security_name"}).text.split(" (")[0]
value = float(soup.find("p", {'class': "value"}).text[2:])

print(name, value)
Example #19
0
def test_find_nested_empty_tag(fake_html_3):
    """Empty/self-closing tags are still found and counted."""
    anchors = Soup(fake_html_3).find("a", {"class": "foo"})
    assert len(anchors) == 2
Example #20
0
def price(symbol):
    """Return the current HSX price for *symbol* as a float."""
    page = Soup.get(f"https://www.hsx.com/security/view/{symbol}")
    # The price text carries a two-character prefix before the number.
    return float(page.find("p", {'class': "value"}).text[2:])
Example #21
0
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

# Fix: this example uses Soup (below) but never imported it — the script
# would raise NameError at the final line without this import.
from gazpacho import Soup

url = "http://localhost:5000/protected"

# Headless Firefox driven through geckodriver.
options = Options()
options.headless = True
browser = Firefox(executable_path="/usr/local/bin/geckodriver", options=options)
browser.get(url)

# username
username = browser.find_element_by_id("username")
username.clear()
username.send_keys("admin")

# password
password = browser.find_element_by_name("password")
password.clear()
password.send_keys("admin")

# submit
browser.find_element_by_xpath("/html/body/main/form/button").click()

# refetch the page now that the session is authenticated
browser.get(url)

# gazpacho: parse the rendered page and extract the quote's text
html = browser.page_source
soup = Soup(html)
soup.find("blockquote").remove_tags()
Example #22
0
def test_find(fake_html_1):
    """A bare tag-name search returns the matching element."""
    span = Soup(fake_html_1).find("span")
    assert str(span) == "<span>Hi</span>"
Example #23
0
from gazpacho import get, Soup
import re
import pandas as pd
from itertools import groupby
from bs4 import BeautifulSoup
from collections import Counter

# Fetch the episode index page and collect each episode's link.
url = 'https://seinfeldscripts.com/seinfeld-scripts.html'
html = get(url)
soup = Soup(html)
# The second <table> on the page holds the episode links.
table = soup.find('table')[1]
refs = table.find('a')
links = [i.attrs['href'] for i in refs]
# Some hrefs contain stray spaces; strip them so the URLs are usable.
links = [i.replace(" ", "") for i in links]
# Bare expression: notebook-style echo of the cleaned link list.
links


def scrape_script(episode):
    """Fetch one episode page and return its cleaned script lines."""
    page_url = 'https://seinfeldscripts.com/' + str(episode)
    soup = Soup(get(page_url))
    content = soup.find('div', {'id': 'content'})
    paragraphs = content.find('p')
    # Strip markup from each paragraph, then normalize the text.
    stripped = [p.remove_tags() for p in paragraphs]
    joined = same_line(stripped)
    no_newlines = [line.replace('\n', '') for line in joined]
    single_spaced = [re.sub(' +', ' ', line) for line in no_newlines]
    rejoined = same_line(single_spaced)
    # Drop bracketed stage directions such as "[Jerry enters]".
    return [re.sub('\[.*?\]', '', line) for line in rejoined]
Example #24
0
def test_find_first(fake_html_1):
    """mode="first" returns only the first matching tag."""
    first = Soup(fake_html_1).find("p", mode="first")
    assert str(first) == "<p>'IDK!'</p>"
Example #25
0
from gazpacho import get, Soup

url = "https://scrape.world/titans"

html = get(url)
soup = Soup(html)

# Skip the first <b> (presumably a header row — confirm against the page)
# and keep the rest.
titans = soup.find("b")[1:]
# Each remaining <b> wraps an <a> whose text is the titan's name.
titans = [t.find("a").text for t in titans]

print(titans)
Example #26
0
from urllib.request import urlopen
from gazpacho import get, Soup
from PIL import Image  # pip install pillow
import pytesseract  # pip install pytesseract

# Fetch the OCR demo page and parse it.
base = 'http://localhost:5000'
url = base + '/ocr'
html = get(url)
soup = Soup(html)

# Collect the relative src paths of every <img> on the page.
# (The original called soup.find("img") twice, discarding the first result —
# the redundant call is removed.)
imgs = soup.find('img')
paths = [i.attrs['src'] for i in imgs]

# Download each image and load it into a PIL Image.
images = []
for path in paths:
    im = Image.open(urlopen(base + path))
    images.append(im)

# OCR each image and join the recognized text (join avoids quadratic +=).
text = ''.join(pytesseract.image_to_string(image) for image in images)

print(text)