from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as bSoup


def grab100():
    result = []  # For end result
    for pageCounter in range(1, 2):
        # Create url address
        url = 'https://www.amazon.com/Best-Sellers-Books-Biographies/zgbs/books/2' + str(pageCounter)
        # Connect to page
        connect = uRequest(url)
        response = connect.read()
        connect.close()
        # Parse response and grab data
        pResponse = bSoup(response, 'html.parser')
        bookContainer = pResponse.findAll('li', {'class': 'book'})
        # Grab data
        for book in bookContainer:
            bookTitle = book.findAll('a', {'class': 'bookTitle'})[0].text
            bookAuthor = book.findAll('a', {'itemprop': 'name'})[0].text
            bookRank = book.findAll('div', {'class': 'sprite'})[0].text
            bookStatsBox = book.findAll('div', {'class': 'book-stats'})[0].findAll('span', {'class': 'font-szary-4a'})
            bookReaders = bookStatsBox[0].text
            bookOpinions = bookStatsBox[1].text
            bookRate = bookStatsBox[2].text
            # Strip reserved characters from the text fields
            reserved_chars = ('★', '⬈', '⬊', '⬌', '\'', '"')
            reserved_list = [bookTitle, bookAuthor, bookRank]
            free_list = []
            for element in reserved_list:
                for rChar in reserved_chars:
                    if rChar in element:
                        element = element.replace(rChar, '')
                free_list.append(element)
            # Add to end result
            result.append((free_list[0], free_list[1], free_list[2], bookReaders, bookOpinions, bookRate))
    print('Successfully downloaded data from website\n\n')
    return result
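# A minimal usage sketch (assumed, not part of the original script): call
# grab100() and persist the returned tuples with the standard csv module.
import csv

books = grab100()
with open('top_books.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(['Title', 'Author', 'Rank', 'Readers', 'Opinions', 'Rate'])
    writer.writerows(books)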
# Imports inferred from the call pattern below (uRequest takes a headers
# argument and is passed to uUrlopen, so it must alias urllib's Request class):
from urllib.request import Request as uRequest, urlopen as uUrlopen
from urllib.parse import quote_plus as uQuote_plus
from bs4 import BeautifulSoup


def _search_(self, startNum, displayNum):
    sortWay = 'sort=' + self.sortWay
    start = '&start=' + str(startNum)
    display = '&display=' + str(displayNum)
    # Take the search word entered by the user and convert it to UTF-8 form
    # with quote_plus.
    query = '&query=' + uQuote_plus(self.searchWord)
    fullURL = self.defaultURL + sortWay + start + display + query
    # Build a Request object carrying the header information before making the
    # HTTP request; this is the standard way to pass headers to a server with
    # the urllib module.
    req = uRequest(fullURL, headers=self.headers)
    # Pass the Request object to urlopen so the HTTP request to the server
    # includes the header information.
    f = uUrlopen(req)
    resultXML = f.read()
    xmlsoup = BeautifulSoup(resultXML, 'lxml')
    return xmlsoup
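# A hedged sketch of the class this method appears to belong to. The
# sort/start/display/query parameters match the Naver Open API search
# endpoints, so the endpoint URL, header names, and attribute values below are
# illustrative assumptions, not taken from the original source.
class NaverSearch:
    def __init__(self, searchWord, clientId, clientSecret):
        self.defaultURL = 'https://openapi.naver.com/v1/search/blog.xml?'  # assumed endpoint
        self.sortWay = 'sim'  # assumed: sort by similarity/relevance
        self.searchWord = searchWord
        self.headers = {
            'X-Naver-Client-Id': clientId,          # assumed header names for
            'X-Naver-Client-Secret': clientSecret,  # the Naver Open API
        }

    _search_ = _search_  # reuse the method defined above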
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as parser


def scrapensave(myUrl, num, num2=0):
    # Open connection to the web-page, read, and close.
    uClient = uRequest(myUrl)
    pageHtml = uClient.read()
    uClient.close()
    # HTML "page soup" is stored here.
    page_soup = parser(pageHtml, "html.parser")
    body_div = page_soup.find(
        "div",
        {"class": "field field-name-body field-type-text-with-summary field-label-above"})
    if num2:
        text = body_div.getText()[num:num2]
    else:
        text = body_div.getText()[num:]
    # Return text in lowered form for uniformity when doing calculations.
    return text.lower()
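# A minimal usage sketch (the URL and slice offsets are placeholders, not from
# the original): grab an article body and do a simple word count on it.
text = scrapensave('https://example.com/some-article', 100, 2000)
words = text.split()
print(len(words), 'words, starting with:', words[:10])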
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as Soup
import pyautogui
import tkinter as tk

the_url = 'https://forlap.ristekdikti.go.id/mahasiswa/detail/MzM2QUQ5NjQtNzAxQi00QTA4LUEyRkUtNTRBODNBRURCQjg3'

# open connection
uClient = uRequest(the_url)
Tgtpage = uClient.read()
uClient.close()

# html parse
soup_page = Soup(Tgtpage, "html.parser")

# grab the div holding the general information
containers = soup_page.find_all("div", {"class": "main"})

file_name = "profile.csv"
berkas = open(file_name, "w")

for container in containers:
    # grab the position (jabatan) from the div named "title"
    Jabatan_contain = container.find("div", {"class": "title"})
    jabatan = Jabatan_contain.find(text=True)
    # grab the table data from the table named "table1" inside the div
    tabel_contain = container.find("table", {"class": "table1"})
    tabel_row = tabel_contain.find_all("tr")
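    # A hedged continuation (assumed; the original fragment stops here): walk
    # the table rows, join the cell texts, and write them to the open profile.csv.
    for row in tabel_row:
        cells = [cell.get_text(strip=True) for cell in row.find_all("td")]
        if cells:
            berkas.write(",".join(cells) + "\n")

berkas.close()  # close the csv after all containers (assumed cleanup)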
from urllib.request import urlopen as uRequest


def downloadRawHtml(url):
    webClient = uRequest(url)
    page_rawhtml = webClient.read()
    webClient.close()
    return page_rawhtml
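# A minimal usage sketch (the URL is a placeholder): download a page and hand
# the raw bytes to BeautifulSoup for parsing.
from bs4 import BeautifulSoup

raw = downloadRawHtml('https://example.com')
page = BeautifulSoup(raw, 'html.parser')
print(page.title)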
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

# name the output file to write to local disk
out_filename = "zaful_reviews.csv"
# header of csv file to be written
headers = "SKU,Individual Rating,Number of Pictures,Comment,Date Stamp,Color/Size,Overall Fit,Height,Waist,Hips,Bust\n"

# opens file, and writes headers
f = open(out_filename, "w", encoding="utf-8")
f.write(headers)

for page in range(1, 6):  # loops over each page
    if page == 1:  # page 1 has a different url format
        my_url = 'https://www.zaful.com/w/floral-dresses/e_5/'
    else:
        my_url = f'https://www.zaful.com/w/floral-dresses/e_5/g_{page}.html'

    # opening up connection, grabbing page, then close
    first_uClient = uRequest(my_url)
    page_html = first_uClient.read()
    first_uClient.close()

    # html parser
    floral_dresses_soup = soup(page_html, "html.parser")

    # grabs each product
    floral_dresses = floral_dresses_soup.findAll(
        "li", {"class": "js_proList_item logsss_event_ps"})
    # floral_dresses = floral_dresses[8:11]

    for floral_dress in floral_dresses:  # loops over items on page
        sku = str(
            floral_dress.find('strong', class_='my_shop_price')['data-sku'].strip())
datestayreview = eachreviews.find(
    "span",
    {"class": "location-review-review-list-parts-EventDate__event_date--1epHa"}).text[14:]
# print(datestayreview)  # prints e.g. March 2020

datepostreview = eachreviews.find(
    "div",
    {"class": "social-member-event-MemberEventOnObjectBlock__event_type--3njyv"})
datepostreview = datepostreview.find("span").text[-8:]
# print(datepostreview)  # prints e.g. Mar 2020

tabdelimiter = (userreview + "\t" + datepostreview + "\t" + starsreview + "\t"
                + titlereviews + "\t" + textreviews + "\t" + datestayreview + "\n")
filewrite.write(tabdelimiter)

filewrite.close()


# Intro to Web Scraping with Python and Beautiful Soup
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

myurl = "https://www.newegg.com/p/pl?d=graphics+cards"

# opening up connection, grab the webpage
uClient = uRequest(myurl)
rawhtml = uClient.read()
# closing connection
uClient.close()

# html parse of the webpage's html code
htmlparsed = soup(rawhtml, "html.parser")

print(htmlparsed.h1)  # prints <h1 class="page-title-text">"graphics cards"</h1>
print(htmlparsed.p)   # prints <p>Newegg.com - A great place to buy computers, computer parts, electronics, software, accessories, and DVDs online. With great prices, fast shipping, and top-rated customer service - Newegg shopping upgraded ™</p>

# the instructor says to inspect the website's html code and its elements
# print(htmlparsed.body)  # prints the whole body
print(htmlparsed.body.span)  # prints <span class="noCSS">Skip to:</span>

# The instructor found the div class named item-container, which holds the html
# code for one of the graphics cards and its complete information. A sample of
# that div is saved in /home/mar/python/divclassitemcontainersample.html
classcontainer = htmlparsed.findAll("div", {"class": "item-container"})
print(len(classcontainer))  # prints 41, which includes the four graphics cards under "You May Also Be Interested In:"
print(type(classcontainer[0].div))  # prints <class 'bs4.element.Tag'>
print(classcontainer[0])
def downloadRawHtml(self):
    webClient = uRequest(self.rootUrl + self.podcastId)
    page_rawhtml = webClient.read()
    webClient.close()
    self.page = page_rawhtml
""" Created on Fri May 17 16:32:06 2019 @author: stephaniejones """ # https://github.com/jsubroto/billboard-hot-100-web-scraper/blob/master/billboard_hot_100_websraper.py from urllib.request import urlopen as uRequest from bs4 import BeautifulSoup as soup import http.client conn = http.client.HTTPSConnection("www.sanjamar.com") conn.request("GET", "/") r1 = conn.getresponse() print(r1.status, r1.reason) url = 'https://www.billboard.com/charts/hot-100' # Opening up connection, grabbing the page uClient = uRequest(url) page_html = uClient.read() # Offloads content into a variable uClient.close() # Close the client # HTML parsing page_soup = soup(page_html, "html.parser") # Grabs all information related to the top 100 songs containers = page_soup.select('article[class*=chart]') # *= means contains filename = 'billboard_hot_100.csv' f = open(filename, 'w') # w = write headers = 'Song, Artist, Last Week, Peak Position, Weeks on Chart\n' f.write(headers)
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

my_url = "https://ng.indeed.com/jobs?q=solar&l&vjk=9b861d63eaeecede"

# Opening connection and grabbing the page at the URL
uClient = uRequest(my_url)
# Read the downloaded page and store it in a variable
page_html = uClient.read()
# Close the connection so we don't leave it open
uClient.close()

# page parser
page_soup = soup(page_html, "html.parser")


def extract_job_title_from_result(soup):
    jobs = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            jobs.append(a["title"])
    return jobs


# pass the parsed page, not the BeautifulSoup class alias
extract_job_title_from_result(page_soup)


def extract_company_from_result(soup):
    # body assumed by analogy with the title extractor above; the original
    # fragment cuts off after the def line
    companies = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        for span in div.find_all(name="span", attrs={"class": "company"}):
            companies.append(span.get_text(strip=True))
    return companies
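# A minimal usage sketch (assumed, not in the original): pair the two columns
# up and load them into the pandas DataFrame the unused `pd` import hints at.
jobs = extract_job_title_from_result(page_soup)
companies = extract_company_from_result(page_soup)
df = pd.DataFrame(list(zip(jobs, companies)), columns=["job_title", "company"])
print(df.head())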
def __init__(self, url: str):
    self.url = url
    u_client = uRequest(url)
    self.page_html = u_client.read()
    self.page_soup = soup(self.page_html, features="lxml")
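# A hedged, self-contained sketch: the fragment above shows only a constructor,
# so the class name PageScraper and the import aliases below are assumptions
# added to make it runnable.
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup


class PageScraper:
    def __init__(self, url: str):
        self.url = url
        u_client = uRequest(url)
        self.page_html = u_client.read()
        self.page_soup = soup(self.page_html, features="lxml")


scraper = PageScraper('https://example.com')
print(scraper.page_soup.title)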
# (continuation of the zaful scraper's per-page loop, so `page` comes from the
# enclosing `for page in range(...)`; `import json` is needed at module level)
deal_str = ''
sale_str = ''
rating = ''
review_total = ''
true_fit_percentage = ''
too_small_percentage = ''
too_large_percentage = ''
review_info_str = ''
color_str = ''

if page == 1:  # page 1 has a different url format
    my_url = 'https://www.zaful.com/w/floral-dresses/e_5/'
else:
    my_url = f'https://www.zaful.com/w/floral-dresses/e_5/g_{page}.html'

# opening up connection, grabbing page, then close
first_uClient = uRequest(my_url)
page_html = first_uClient.read()
first_uClient.close()

# html parser
floral_dresses_soup = soup(page_html, "html.parser")

# grabs each product
floral_dresses = floral_dresses_soup.findAll(
    "li", {"class": "js_proList_item logsss_event_ps"})
# floral_dresses = floral_dresses[0:15]

for floral_dress in floral_dresses:  # loops over items on page
    js_data = json.loads(
        floral_dress.div.a.img['data-logsss-browser-value'].replace("\'", "\""))
    rank = str(js_data['bv']['rank'])
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as bSoup

myUrl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

uClient = uRequest(myUrl)
data = uClient.read()
uClient.close()

pageData = bSoup(data, "html.parser")
containers = pageData.find_all("div", {"class": "item-container"})

fileName = "products.csv"
f = open(fileName, "w")
header = "Brand, ProductName, shipping\n"
f.write(header)

print(pageData.h1)
# print(len(containers))
# print(containers[1].find("div", {"class": "item-info"}))

for container in containers:
    brand = container.find("div", {"class": "item-info"}).div.a.img["title"]
    titleContainer = container.findAll("a", {"class": "item-title"})
    productTitle = titleContainer[0].text
    shippingContainer = container.findAll("li", {"class": "price-ship"})
    shipping = shippingContainer[0].text.strip()
    print(brand)
    print(productTitle)
    print(shipping)
    f.write(brand + "," + productTitle.replace(",", "|") + "," + shipping + "\n")

f.close()  # flush and close the csv (missing in the original)
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uRequest

file_name = 'fsktm.csv'
f = open(file_name, 'w')
f.write('Title,Name,URL,Department,Phone Number,Email,Expertise\n')

inital_url = 'https://umexpert.um.edu.my/cv_search_page.php?selCat=01&txtname=&fak=C&dept=-&page_no={}'

for i in range(1, 10):
    uClient = uRequest(inital_url.format(str(i)))
    temp_page = uClient.read()
    uClient.close()
    soup_page = soup(temp_page, "html.parser")
    containers = soup_page.findAll("table", {"style": "border:1px solid #CCC; border-bottom:"})
    for container in containers:
        name = container.strong.a.text.strip()
        title = container.strong.find_next("br").next.strip()
        department = (container.strong.find_next("br").next.next.next.strip()
                      .replace("\r\n ", "").replace(',', '.'))
        phone_num = email = ''  # avoid a NameError when a field is missing below
        try:
            phone_num = (container.find("i", {"class": "fa fa-phone-square"})
                         .next.replace("\xa0", "").replace(" ", "").replace("\t", ""))
        except AttributeError:
            print('Phone number error, {}'.format(name))
        try:
            email = (container.find("i", {"class": "fa fa-envelope"})
                     .next.replace("\xa0", "").replace(" ", "").replace("\t", ""))
        except AttributeError:
            print('Email error, {}'.format(name))
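        # A hedged continuation (assumed; the original fragment never writes the
        # rows it extracts): compose the csv row from the fields above. URL and
        # Expertise are not extracted in this fragment, so they are left blank.
        f.write(','.join([title, name, '', department, phone_num, email, '']) + '\n')

f.close()  # close the csv once all pages are done (assumed cleanup)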
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

# The destination URL to be scraped (this is newegg graphics cards)
myURL = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics+cards"

uClient = uRequest(myURL)  # Opening the connection to the website, grabbing the page
pageHTML = uClient.read()  # Loading page contents into a variable
uClient.close()            # Close the connection

# Parsing the webpage data with BS
pageSoup = soup(pageHTML, "html.parser")

# Store each desired item to a list
containers = pageSoup.findAll("div", {"class": "item-cell"})

# Creating a new csv file to store data
fileName = "products.csv"
f = open(fileName, 'w')

# Setting the headers for columns
headers = "brand, productName, price, shipPrice\n"
# Adding the headers to the csv file
f.write(headers)

# Loop through each item
for container in containers:
    # Getting the brand of the product
    brand = container.div.div.a.img["title"]
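    # A hedged continuation sketch (assumed; the fragment stops after the brand).
    # These selectors are guesses modeled on the other Newegg script above, not
    # taken from this original.
    name_tag = container.find("a", {"class": "item-title"})
    price_tag = container.find("li", {"class": "price-current"})
    ship_tag = container.find("li", {"class": "price-ship"})
    productName = name_tag.text if name_tag else ''
    price = price_tag.text.strip() if price_tag else ''
    shipPrice = ship_tag.text.strip() if ship_tag else ''
    # swap commas in the product name so the csv columns stay aligned
    f.write(brand + "," + productName.replace(",", "|") + "," + price + "," + shipPrice + "\n")

f.close()  # close the csv when done (assumed cleanup)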