def test_find_multiple(fake_html_1):
    """find() with an attrs filter returns every matching div, in order."""
    divs = Soup(fake_html_1).find("div", {"class": "baz"})
    assert len(divs) == 2
    assert str(divs[1]) == '<div class="baz">Oh No!</div>'
def make_soup(query, page=1):
    """Fetch one page of search results for *query* and return it parsed.

    `page` defaults to the first page of results.
    """
    search_params = {'q': query, 'commit': 'Search', 'page': page}
    return Soup(get(URL, search_params))
def test_find_with_attrs(fake_html_1):
    """An id attribute filter narrows find() to the single matching tag."""
    match = Soup(fake_html_1).find("p", {"id": "blarg"})
    assert str(match) == '<p id="blarg">Try for 2</p>'
from gazpacho import get, Soup
from selenium import webdriver
import re
import urllib
import time
import pandas as pd

# Scrape pin images from a Pinterest board: scroll repeatedly to force
# lazy-loaded pins into the DOM, harvest every .jpg URL from the rendered
# page, then download the files locally.
browser = webdriver.Firefox()
browser.get("https://www.pinterest.ca/catherine8610/greek-pottery/")
for i in range(1, 5000):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
soup = Soup(browser.page_source)
# Every absolute URL ending in .jpg anywhere in the page markup.
greek_images = re.findall(r'https:\/\/[^"]+?\.jpg', str(soup))
# Round-trip through a DataFrame purely to deduplicate while keeping order.
greek_df = pd.DataFrame(greek_images)
greek_urls = greek_df[0].unique().tolist()
# NOTE(review): only every second URL is fetched ([1::2]) -- presumably to
# skip a duplicate resolution of each pin; confirm against the page markup.
for url in greek_urls[1::2]:
    try:
        # NOTE(review): `import urllib` alone does not import the
        # urllib.request submodule -- this likely raises AttributeError
        # unless another module imported urllib.request first; should be
        # `import urllib.request` at the top.
        urllib.request.urlretrieve(
            url, f"/Users/username/Greece/greek{greek_urls.index(url)}.jpg")
    # NOTE(review): bare except silently skips failed downloads; consider
    # narrowing to the specific network/OS errors and logging them.
    except:
        pass

# Second board, same flow (Italian Renaissance ceramics).
browser = webdriver.Firefox()
browser.get(
    "https://www.pinterest.ca/armidaptaylor/italian-renaissance-ceramicsceramiche-rinascimenta/"
)
# NOTE(review): the snippet is truncated here -- this loop's body (presumably
# the same scroll call as above) lies outside this chunk.
for i in range(1, 2500):
def make_soup():
    """Load the Amazon wishlist page through the shared browser and parse it."""
    wishlist = 'https://www.amazon.ca/hz/wishlist/ls/17CRUYWGYZ5Y2'
    browser.get(wishlist)
    return Soup(browser.page_source)
def test_find_strict(fake_html_2):
    """strict=True matches only tags whose attrs equal the query exactly."""
    matches = Soup(fake_html_2).find("div", {"class": "foo"}, strict=True, mode="all")
    assert len(matches) == 1
from gazpacho import get, Soup

# Scrape each team's cap spend from the scrape.world practice table.
url = "https://scrape.world/spend"
soup = Soup(get(url))
trs = soup.find("tr", {"class": "tmx"})


def parse_tr(tr):
    """Pull a (team, cap_hit) pair out of one table row."""
    team = tr.find("td", {"data-label": "TEAM"}).text
    raw = tr.find("td", {"data-label": "TODAYS CAP HIT"}).text
    # Drop thousands separators, then the leading "$", before parsing.
    spend = float(raw.replace(",", "")[1:])
    return team, spend


spend = [parse_tr(tr) for tr in trs]
print(spend)
def test_find_no_match_first(fake_html_1):
    """mode="first" returns None when no tag matches."""
    soup = Soup(fake_html_1)
    result = soup.find("a", mode="first")
    # PEP 8: compare against the None singleton with `is`, not `==`.
    assert result is None
def test_find_no_match_all(fake_html_1):
    """mode="all" yields an empty list when nothing matches."""
    assert Soup(fake_html_1).find("a", mode="all") == []
# NOTE(review): "mutliple" typo kept in the name so the pytest test id stays stable.
def test_find_mutliple_imgs(fake_html_3):
    """find() collects every <img>, preserving document order."""
    images = Soup(fake_html_3).find("img")
    assert images[1].attrs["src"] == "bye.jpg"
def test_remove_tags(fake_html_4):
    """remove_tags() strips all markup and returns the flattened text."""
    flattened = Soup(fake_html_4).remove_tags()
    expected = "I like soup and I really like cold soup I guess hot soup is okay too"
    assert flattened == expected
from gazpacho import get
from gazpacho import Soup
import pandas as pd
import time

# Pull rest-of-season fantasy hockey projections for one position from CBS.
position = 'F'
base = f'https://www.cbssports.com/fantasy/hockey/stats'
url = f'{base}/{position}/2019/restofseason/projections/'
soup = Soup(get(url))

# HTML: <tr class="TableBase-bodyTr ">
# The trailing space inside the class value is deliberate -- it mirrors the
# site's markup exactly.
rows = soup.find('tr', {'class': 'TableBase-bodyTr '})
row = rows[0]
print(row)
from gazpacho import Soup, get

url = "https://scrape.world/books"
soup = Soup(get(url))
# partial=True lets the "book-" prefix match longer class names.
books = soup.find("div", {"class": "book-"}, partial=True)


def parse(book):
    """Return a (title, price) pair scraped from one book card."""
    name = book.find("h4").text
    price_text = book.find("p").text
    # e.g. "$12.99 plus shipping" -> 12.99
    price = float(price_text[1:].split(" ")[0])
    return name, price


[parse(book) for book in books]
def make_soup(url):
    """GET *url* and return its HTML parsed into a Soup."""
    return Soup(get(url))
def test_find_text(fake_html_1):
    """.text exposes a tag's inner text without any markup."""
    tag = Soup(fake_html_1).find("p", {"id": "blarg"})
    assert tag.text == "Try for 2"
def test_find_no_match_auto(fake_html_1):
    """mode="auto" falls back to None when no tag matches."""
    soup = Soup(fake_html_1)
    result = soup.find("a", mode="auto")
    # PEP 8: compare against the None singleton with `is`, not `==`.
    assert result is None
def test_find_nested_groups(fake_html_2):
    """find() returns both the outer and the nested tag that match."""
    found = Soup(fake_html_2).find("div", {"class": "foo"})
    assert len(found) == 2
from gazpacho import Soup
from tqdm import tqdm
import time
import random


def _security_page(symbol):
    """Fetch the HSX page for *symbol*; return (soup, name, current price).

    Extracted because the original script scraped star and movie pages with
    two identical copies of this code ("step 4 ... is really just step 1 + 2").
    """
    url = f"https://www.hsx.com/security/view/{symbol}"
    soup = Soup.get(url)
    name = soup.find("p", {"class": "security_name"}).text.split(" (")[0]
    # Price text carries a two-character currency prefix before the number.
    value = float(soup.find("p", {'class': "value"}).text[2:])
    return soup, name, value


# step 1 + 2: get the star's page, name and current price
symbol = "LDCAP"
soup, name, value = _security_page(symbol)

# step 3: find movies credited on the star's page; pick the one at index 1
lis = soup.find("ul", {"class": "credit"}).find("li")
li = lis[1]
movie_symbol = li.find("span").find("a").attrs["href"].split("/")[-1]

# step 4: the movie's page is scraped exactly the same way
soup, name, value = _security_page(movie_symbol)
print(name, value)
def test_find_nested_empty_tag(fake_html_3):
    """Empty / self-closing tags are still matched by find()."""
    anchors = Soup(fake_html_3).find("a", {"class": "foo"})
    assert len(anchors) == 2
def price(symbol):
    """Return the current HSX price for *symbol* as a float."""
    soup = Soup.get(f"https://www.hsx.com/security/view/{symbol}")
    # Price text carries a two-character currency prefix before the number.
    return float(soup.find("p", {'class': "value"}).text[2:])
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

# Log in to the locally served /protected page with a headless Firefox, then
# hand the authenticated page source to gazpacho for parsing.
url = "http://localhost:5000/protected"
options = Options()
options.headless = True
browser = Firefox(executable_path="/usr/local/bin/geckodriver", options=options)
browser.get(url)
# username field: clear any prefilled value, type the credential
username = browser.find_element_by_id("username")
username.clear()
username.send_keys("admin")
# password field, located by name rather than id
password = browser.find_element_by_name("password")
password.clear()
password.send_keys("admin")
# submit the login form
browser.find_element_by_xpath("/html/body/main/form/button").click()
# refetch: reload the page now that the session is authenticated
browser.get(url)
# gazpacho: parse the authenticated page and flatten the blockquote to text
html = browser.page_source
# NOTE(review): Soup is not imported in this snippet -- presumably
# `from gazpacho import Soup` appears elsewhere in the file; verify.
soup = Soup(html)
soup.find("blockquote").remove_tags()
def test_find(fake_html_1):
    """A bare tag-name search returns the matching element."""
    span = Soup(fake_html_1).find("span")
    assert str(span) == "<span>Hi</span>"
from gazpacho import get, Soup
import re
import pandas as pd
from itertools import groupby
from bs4 import BeautifulSoup
from collections import Counter

# Pre-compiled patterns, hoisted out of the per-line loops. Raw strings fix
# the invalid-escape DeprecationWarning the original '\[.*?\]' literal emits.
_MULTISPACE = re.compile(r' +')
_BRACKETED = re.compile(r'\[.*?\]')

url = 'https://seinfeldscripts.com/seinfeld-scripts.html'
html = get(url)
soup = Soup(html)
table = soup.find('table')[1]
refs = table.find('a')
links = [i.attrs['href'] for i in refs]
links = [i.replace(" ", "") for i in links]
links  # NOTE(review): bare expression -- only displays in a notebook cell


def scrape_script(episode):
    """Download one episode page and return its script as cleaned lines.

    Relies on `same_line` (defined elsewhere in this project) to merge
    continuation lines back together.
    """
    url = 'https://seinfeldscripts.com/' + str(episode)
    html = get(url)
    soup = Soup(html)
    table = soup.find('div', {'id': 'content'})
    script = table.find('p')
    # Strip tags, merge wrapped lines, then normalise whitespace.
    scrip = [i.remove_tags() for i in script]
    lines = same_line(scrip)
    scri = [i.replace('\n', '') for i in lines]
    spaces = [_MULTISPACE.sub(' ', i) for i in scri]
    lines = same_line(spaces)
    # Drop stage directions such as "[Jerry enters]".
    bracks = [_BRACKETED.sub('', i) for i in lines]
    return bracks
def test_find_first(fake_html_1):
    """mode="first" returns a single tag rather than a list."""
    first_p = Soup(fake_html_1).find("p", mode="first")
    assert str(first_p) == "<p>'IDK!'</p>"
from gazpacho import get, Soup

# List the titan names from the scrape.world practice page.
url = "https://scrape.world/titans"
soup = Soup(get(url))
# Skip the first <b>; every remaining one wraps an <a> with the name.
titans = [bold.find("a").text for bold in soup.find("b")[1:]]
print(titans)
from urllib.request import urlopen
from gazpacho import get, Soup
from PIL import Image  # pip install pillow
import pytesseract  # pip install pytesseract

# Fetch the /ocr page, download every image it references, and OCR them all
# into one text blob.
base = 'http://localhost:5000'
url = base + '/ocr'
html = get(url)
soup = Soup(html)
# Fixed: the original called soup.find("img") twice, discarding the first
# result -- one call is enough.
imgs = soup.find('img')
paths = [i.attrs['src'] for i in imgs]

# Download each referenced image.
images = []
for path in paths:
    url = base + path
    im = Image.open(urlopen(url))
    images.append(im)

# OCR every image and concatenate the recognised text (join avoids the
# quadratic += string build-up).
text = ''.join(pytesseract.image_to_string(image) for image in images)
print(text)