import re

import requests
from bs4 import BeautifulSoup


def fetchData(index):
    # Scrape the Unity "Made With" stories page and return details
    # for the story at `index`.
    info = requests.get('https://unity.com/madewith')
    soup = BeautifulSoup(info.text, "lxml")
    link = soup.find_all('div', attrs={
        'class': 'section-home-stories--item-image',
        'style': True
    })
    images = []
    for count, pics in enumerate(link):
        # the background image URL is embedded in the style attribute
        images.append("https://unity.com" +
                      re.findall(r"\('(/sites/.*)'\)", pics['style'])[0])
    projects = [
        proj.text for proj in soup.find_all(
            'div', attrs={'class': 'section-home-stories--item-title'})
    ]
    authors = [
        auth.text for auth in soup.find_all(
            'div', attrs={'class': 'section-home-stories--item-studio'})
    ]
    urls = []
    for links in soup.find_all('article'):
        urls.append("https://unity.com" + links.find('a').attrs['href'])
    page = requests.get(urls[index])
    soup2 = BeautifulSoup(page.text, "lxml")
    headers = [
        head.text
        for head in soup2.find_all('div', attrs={'class': 'title-large'})
    ]
    texts = [
        txt.text
        for txt in soup2.find_all('div', attrs={'class': 'section-article-text'})
    ]
    fetched = [{
        'project': projects[index],
        'author': authors[index],
        'link': urls[index],
        'bg': images[index],
        'h1': headers[0],
        'h2': headers[1],
        'h3': headers[2],
        'h1text': texts[0],
        'h2text': texts[1],
        'h3text': texts[2],
    }]
    return fetched
from bs4 import BeautifulSoup


def get_pages_count(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('span', class_='mhide')
    if pagination:
        # the last 'mhide' span holds the page count; cast so both
        # branches return an int
        return int(pagination[-1].get_text())
    else:
        return 1
import requests
from bs4 import BeautifulSoup


def check_price(T_price):
    # URL, headers, and send_mail() are expected to be defined at module level
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find(id="productTitle").get_text()
    price = soup.find(id="priceblock_ourprice").get_text()
    # naive: assumes the numeric price fits in the first five characters
    converted_price = float(price[:5])
    if converted_price < T_price:
        send_mail()
import codecs

from bs4 import BeautifulSoup


def update_pom(self, pom_location, service_name):
    # replace the <name> element in the POM file with the service name
    with open(pom_location, "r") as file:
        file_content = file.readlines()
    content = "".join(file_content)
    beautify_content = BeautifulSoup(content, "lxml")
    beautify_content.find('name').string = service_name
    file_pointer = codecs.open(pom_location, "w", "utf-8")
    file_pointer.write(str(beautify_content))
    file_pointer.close()
import bs4
import requests


def downloadXkcd(startComic, endComic):
    for urlNumber in range(startComic, endComic + 1):
        # download the webpage
        print('Downloading page http://xkcd.com/%s...' % (urlNumber))
        res = requests.get('http://xkcd.com/%s' % (urlNumber))
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Get the URL of the comic image
        comicElem = soup.select('#comic img')
        if comicElem == []:
            print('Could not find comic image.')
import re

import requests
from bs4 import BeautifulSoup


def trackPrice(self):
    page = requests.get(self.__url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.3"})
    if page:  # a truthy Response means the request succeeded (status < 400)
        soup1 = BeautifulSoup(page.content, "html.parser")
        soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
        element = soup2.find(self.__webData["tag"],
                             attrs=self.__webData["attributes"])
        if element:
            if self.__webData["inside"] == False:
                text = element.getText()
            else:
                text = element[self.__webData["inside"]]
            text2 = text.strip().replace(",", ".")
            # a bare $ is a regex anchor, so match the currency symbols
            # in a character class instead
            price = float(re.sub(r"[$€]", "", text2))
            self.__price = price
            if self.__price <= self.__desirePrice:
                self.sendEmail()
        else:
            print("No element detected")
    else:
        print("No web detected")
    print("Track done")
from bs4 import BeautifulSoup


def __scan_pom_file(self, pom_location):
    # map each dependency's groupId to its artifactId
    with open(pom_location, "r") as file:
        file_content = file.readlines()
    content = "".join(file_content)
    beautify_content = BeautifulSoup(content, "lxml")
    results = beautify_content.find_all("dependency")
    dependencies = {}
    key = None
    value = None
    for res in results:
        for child in res.children:
            if child.name == "groupid":
                key = str(child.contents[0])
            if child.name == "artifactid":
                value = str(child.contents[0])
            if key is not None and value is not None:
                dependencies[key] = value
                key = None
                value = None
    return dependencies
from bs4 import BeautifulSoup


def get_content(html):
    # HOST is expected to be defined at module level
    soup = BeautifulSoup(html, 'html.parser')
    # get the links from the block we need, by its class
    items = soup.find_all('a', class_='na-card-item')
    cars = []  # the list we will fill inside the loop
    for item in items:
        # Check that the data exists; if not, substitute a fallback text
        uah_price = item.find('span', class_='size15')
        if uah_price:
            # replace() strips the unwanted character
            uah_price = uah_price.get_text().replace('*', '')
        else:
            uah_price = 'Price on request'
        # End of the check
        cars.append({
            # strip=True removes leading and trailing whitespace
            'title': item.find('div', class_='na-card-name').get_text(strip=True),
            'link': HOST + item.find('span', class_='link').get('href'),
            'usd_price': item.find('strong', class_='green').get_text(),
            'uah_price': uah_price,
            # here we grab the text node that follows the pin icon
            'city': item.find('svg', class_='svg_116_pin').find_next(string=True),
        })
    return cars
""" import bs4, requests # bs4.BeautifulSoup() function needs to be called with a string containing the HTML # it will parse. The bs4.BeautifulSoup() function returns is a BeautifulSoup object res = requests.get('http://nostarch.com') res.raise_for_status() noStarchSoup = bs4.BeautifulSoup(res.text) # text attribute of teh response is passed tp bs4.beautifulsoup type(noStarchSoup) # <class 'bs4.BeautifulSoup'> # HTML file can also be loaded from the hard drive examplefile = open('example.html') exampleSoup = bs4.beautifulSoup(examplefile) type(exampleSoup) # <class 'bs4.BeautifulSoup'> # select() method can be used find the element of interest # select method with CSS selector if the element # these selectors are like regex, they specify an HTML pattern to look for in HTML pages # ex:soup.select('#author') The element with an id attribute of author # The select() method will return a list of HTML Tag objects. # The list will contain one Tag object for every match in the BeautifulSoup object’s HTML. # Tag values can be passed to the str() function to show the HTML TAGs they represent. # Tag values also have an attrs attribute that shows all the HTML attributes of the TAG as a dictionary # getText() can be used with the tag object to check teh text value they represent # The get() method for Tag objects makes it simple to access attribute values of a TAG element. # The method is passed an attribute name and returns that attribute’s # value. examplefile = open('example.html')
5. do the same thing for http://isitchristmas.com
6. do the same thing for http://emerging-media.info/class/
7. try a url that you know is missing; what status code do you get?
8. notice all of the repeated code? how could we reduce the redundant code?
   (one possible sketch appears at the end of this snippet)
"""

# import the bs4 module
import bs4

# create the following html string and assign it to a variable named unordered_list
unordered_list = """
<ul>
    <li>one</li>
    <li>two</li>
</ul>
"""

# create a beautiful soup object using the html above
soup = bs4.BeautifulSoup(unordered_list, 'html.parser')

# print out a formatted version of the soup object
print(soup.prettify())

# print out the ul tag
print(soup.ul)

# print out the first li in the ul
print("===")
# print out the string that's between the first li tags
print(soup.ul.li.string)

# create the following html string and assign it to a variable named paragraphs
paragraphs = """
<div>
    <p>This is a paragraph.</p>
    <p>So is <strong>this</strong>.</p>
    <p class="foo">This has a <strong>class</strong> attribute!</p>
</div>
"""
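# One possible answer to question 8 above: a hedged sketch, assuming the
# repeated code in the exercises was "request a URL, check the status, parse
# the HTML". The helper name get_soup is hypothetical, not part of the
# original exercises.
import bs4, requests

def get_soup(url):
    # fetch the page, fail loudly on a bad status code, and return parsed soup
    res = requests.get(url)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, 'html.parser')

# usage: the same three lines no longer repeat for every site
# christmas_soup = get_soup('http://isitchristmas.com')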
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as beautifulSoup

# https://www.youtube.com/watch?v=XQgXKtPSzUI youtube video for reference of script

my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=StoreIM&Depa=1&Category=38'

# Opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML Parser
page_soup = beautifulSoup(page_html, "html.parser")

# Grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping \n"
f.write(headers)

for container in containers:
    # use the current container, not always the first one
    divWithInfo = container.find("div", "item-info")
    brand = divWithInfo.div.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
#! python3
import webbrowser, bs4, requests, sys

print("Googling...")  # shown while the search page is loading
res = requests.get('https://www.google.com/search?q=' + ' '.join(sys.argv[1:]))
res.raise_for_status()  # res.status_code == requests.codes.ok

googleSoup = bs4.BeautifulSoup(res.text, 'html.parser')
linkElems = googleSoup.select('.r a')
tabnums = min(5, len(linkElems))  # returns the minimum of the two numbers
for i in range(tabnums):
    webbrowser.open('http://google.com' + linkElems[i].get('href'))
import os
import requests  # HTTP requests and protocol
import shutil  # file manipulation, copying, etc.
from bs4 import BeautifulSoup

with requests.Session() as rs:
    p = rs.post('#URL#', data=#ADD DATA VARB(dict)#)
    for e in range(1, page + 1):
        requete = rs.get("#URL#" + "#page#" + str(e))
        content = requete.content
        soup = BeautifulSoup(content, 'html.parser')
        for i in soup.find('div', '#ADD HTML#').findAll('EX: img'):
            url = i['src'].replace('//', 'https://')
            file = url.rsplit('/', 1)[1]
            rs.headers.update("#dict URL#")
            ret = rs.get(url, stream=True)
            # save the file to disk
            with open(file, 'wb') as out_file:
                ret.raw.decode_content = True
                shutil.copyfileobj(ret.raw, out_file)
from bs4 import BeautifulSoup
import requests

source = requests.get("http://coreyms.com").text
soup = BeautifulSoup(source, 'html.parser')
print(soup.prettify())

article = soup.find('article')
# print(article.prettify())

# headline = article.h2.a.text
# print(headline)

summary = article.find('div', class_='entry-content').p.text
print(summary)
from bs4 import BeautifulSoup
import requests

source = requests.get("http://coreyms.com").text
soup = BeautifulSoup(source, 'lxml')
print(soup.prettify())

article = soup.find('article')
# print(article.prettify())

# headline = article.h2.a.text
# print(headline)

summary = article.find('div', class_='entry-content').p.text
print(summary)