# Imports assumed for this example (the same aliases appear in Example #13 below)
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as bSoup


def grab100():
	result = []		# For end result



	for pageCounter in range(1, 2):
		# Create URL address
		url = 'https://www.amazon.com/Best-Sellers-Books-Biographies/zgbs/books/2' + str(pageCounter)


		# Connect to page
		connect  = uRequest(url)
		response = connect.read()
		connect.close()



		# Parse response and grab data
		pResponse 		= bSoup(response, 'html.parser')
		bookContainer 	= pResponse.findAll('li', {'class':'book'})
		booksContent 	= []



		# Grab data
		for book in bookContainer:
			bookTitle 		= book.findAll('a', {'class':'bookTitle'})[0].text
			bookAuthor		= book.findAll('a', {'itemprop':'name'})[0].text
			bookRank 		= book.findAll("div", {"class":"sprite"})[0].text

			bookStatsBox 	= book.findAll("div", {"class":"book-stats"})[0].findAll("span", {"class":"font-szary-4a"})
			bookReaders 	= bookStatsBox[0].text
			bookOpinions 	= bookStatsBox[1].text
			bookRate 		= bookStatsBox[2].text


			# Delete reserved characters
			reserved_chars = ('★', '⬈', '⬊', '⬌','\'', '\"')
			reserved_list = [bookTitle, bookAuthor, bookRank]
			free_list = []

			for element in reserved_list:
				for rChar in reserved_chars:
					if rChar in element:
						element = element.replace(rChar, '')
				free_list.append(element)


			# Add to end result
			result.append((free_list[0], free_list[1], free_list[2], bookReaders, bookOpinions, bookRate))



	print('Successfully downloaded data from website\n\n')
	return result
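A minimal usage sketch for grab100(), relying on the aliases imported above; the books.csv file name and the column headers here are illustrative, not taken from the original script:

if __name__ == '__main__':
    books = grab100()
    with open('books.csv', 'w', encoding='utf-8') as f:
        f.write('Title,Author,Rank,Readers,Opinions,Rate\n')
        for title, author, rank, readers, opinions, rate in books:
            # Replace commas inside fields so the plain CSV columns stay aligned
            fields = (title, author, rank, readers, opinions, rate)
            f.write(','.join(field.replace(',', '|') for field in fields) + '\n')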
Example #2
    def _search_(self, startNum, displayNum):
        sortWay = 'sort=' + self.sortWay
        start = '&start=' + str(startNum)
        display = '&display=' + str(displayNum)
        query = '&query=' + uQuote_plus(
            self.searchWord
        )  # Take the user's search term and encode it for UTF-8 URLs with quote_plus.
        fullURL = self.defaultURL + sortWay + start + display + query

        # Build a request object with the header information before making the HTTP request.
        # This is the standard way to pass headers to the server with the urllib module.
        req = uRequest(fullURL, headers=self.headers)
        # Pass the created request object to urlopen; the HTTP request is then sent to the server with the headers included.

        f = uUrlopen(req)
        resultXML = f.read()
        xmlsoup = BeautifulSoup(resultXML, 'lxml')

        return xmlsoup
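The translated comments describe the header-carrying request pattern. A self-contained sketch of that pattern, with example.com, a User-Agent header, and fixed query parameters standing in for the instance attributes (defaultURL, headers, sortWay, searchWord) used above:

from urllib.parse import quote_plus as uQuote_plus
from urllib.request import Request as uRequest, urlopen as uUrlopen
from bs4 import BeautifulSoup

# Placeholder stand-ins for the instance attributes used in _search_
defaultURL = 'https://example.com/search?'
headers = {'User-Agent': 'Mozilla/5.0'}
searchWord = 'python'

fullURL = defaultURL + 'sort=date&start=1&display=10&query=' + uQuote_plus(searchWord)
req = uRequest(fullURL, headers=headers)  # the Request object carries the headers
f = uUrlopen(req)                         # urlopen sends the request, headers included
xmlsoup = BeautifulSoup(f.read(), 'lxml')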
def scrapensave(myUrl, num, num2=0):
    # Open connection to the web-page, read, and close.
    uClient = uRequest(myUrl)
    pageHtml = uClient.read()
    uClient.close()

    # HTML "page soup" is stored here.
    page_soup = parser(pageHtml, "html.parser")
    body_div = page_soup.find(
        "div",
        {"class": "field field-name-body field-type-text-with-summary field-label-above"})
    body_text = body_div.getText()
    # Slice to the requested range; num2 == 0 means "slice to the end".
    text = body_text[num:num2] if num2 else body_text[num:]

    # Return text in lowered form for uniformity when doing calculations.
    return text.lower()
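A small usage sketch for scrapensave(), assuming uRequest is urllib.request.urlopen and parser is the BeautifulSoup alias used above; the URL and slice bounds are placeholders:

from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as parser

# Grab characters 100-2000 of the article body, lower-cased
sample = scrapensave("https://example.com/some-article", 100, 2000)
print(sample[:80])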
Example #4
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as Soup

import pyautogui
import tkinter as tk

the_url = 'https://forlap.ristekdikti.go.id/mahasiswa/detail/MzM2QUQ5NjQtNzAxQi00QTA4LUEyRkUtNTRBODNBRURCQjg3'

#open connection
uClient = uRequest(the_url)
Tgtpage = uClient.read()
uClient.close()

#html parse
soup_page = Soup(Tgtpage, "html.parser")

#grab the div containing the general information
containers = soup_page.find_all("div", {"class": "main"})

file_name = "profile.csv"
berkas = open(file_name, "w")

for container in containers:
    #grab the position from the div named "title"
    Jabatan_contain = container.find("div", {"class": "title"})
    jabatan = Jabatan_contain.find(text=True)

    #grab the table data named "table1" in the div
    tabel_contain = container.find("table", {"class": "table1"})
    tabel_row = tabel_contain.find_all("tr")
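    # A hedged continuation sketch: write the section title and each table row to
    # profile.csv; treating every <tr> as a list of <td> cells is a generic-HTML
    # assumption, not something read off this particular page.
    berkas.write(jabatan.strip() + "\n")
    for row in tabel_row:
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        berkas.write(",".join(cells) + "\n")

berkas.close()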
Example #5
def downloadRawHtml(url):
    webClient = uRequest(url)
    page_rawhtml = webClient.read()
    webClient.close()
    return page_rawhtml
# name the output file to write to local disk
out_filename = "zaful_reviews.csv"
# header of csv file to be written
headers = "SKU,Individual Rating,Number of Pictures,Comment,Date Stamp,Color/Size,Overall Fit,Height,Waist,Hips,Bust\n"

# opens file, and writes headers
f = open(out_filename, "w", encoding="utf-8")
f.write(headers)

for page in range(1, 6):  # loops over each page
    if page == 1:  # page 1 has different url format
        my_url = 'https://www.zaful.com/w/floral-dresses/e_5/'
    else:
        my_url = f'https://www.zaful.com/w/floral-dresses/e_5/g_{page}.html'
    # opening up connection, grabbing page, then close
    first_uClient = uRequest(my_url)
    page_html = first_uClient.read()
    first_uClient.close()
    # html parser
    floral_dresses_soup = soup(page_html, "html.parser")
    # grabs each product
    floral_dresses = floral_dresses_soup.findAll(
        "li", {"class": "js_proList_item logsss_event_ps"})
    # floral_dresses = floral_dresses[8:11]

    for floral_dress in floral_dresses:  # loops over items on page

        sku = str(
            floral_dress.find('strong',
                              class_='my_shop_price')['data-sku'].strip())
Example #7
        datestayreview = eachreviews.find("span", {"class": "location-review-review-list-parts-EventDate__event_date--1epHa"}).text[14:]
        #print(datestayreview) #print March 2020
        datepostreview = eachreviews.find("div", {"class": "social-member-event-MemberEventOnObjectBlock__event_type--3njyv"})
        datepostreview = datepostreview.find("span").text[-8:]
        #print(datepostreview) #print Mar 2020
        #tabdelimiter = userreview+"\t"+datepostreview+"\t"+starsreview+"\t"+titlereviews+"\t"+textreviews+"\t"+datestayreview
        tabdelimiter = userreview + "\t" + datepostreview + "\t" + starsreview + "\t" + titlereviews + "\t" + textreviews + "\t" + datestayreview + "\n"
        filewrite.write(tabdelimiter)
filewrite.close()

#Intro to Web Scraping with Python and Beautiful Soup
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup
myurl = "https://www.newegg.com/p/pl?d=graphics+cards"
#opening up connection, grab the webpage
uClient = uRequest(myurl)
rawhtml = uClient.read()
#closing connection
uClient.close()
#html parse or the html code for the webpage
htmlparsed = soup(rawhtml, "html.parser")
print(htmlparsed.h1) #print <h1 class="page-title-text">"graphics cards"</h1>
print(htmlparsed.p) #print <p>Newegg.com - A great place to buy computers, computer parts, electronics, software, accessories, and DVDs online. With great prices, fast shipping, and top-rated customer service - Newegg shopping upgraded ™</p>
#instructor says inspect website's html code and its elements
#print(htmlparsed.body) #prints all the body
print(htmlparsed.body.span) #print <span class="noCSS">Skip to:</span>
#instructor found the div class named item-container which has the html code for one of the graphics card and its complete information.  I saved a sample of the div class named item-container in file name /home/mar/python/divclassitemcontainersample.html
classcontainer = htmlparsed.findAll("div", {"class": "item-container"})
print(len(classcontainer)) #print 41.  The 41 includes the four graphics cards under "You May Also Be Interested In:"
print(type(classcontainer[0].div)) #print <class 'bs4.element.Tag'>
print(classcontainer[0])
Example #8
    def downloadRawHtml(self):
        webClient = uRequest(self.rootUrl + self.podcastId)
        page_rawhtml = webClient.read()
        webClient.close()
        self.page = page_rawhtml
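A minimal sketch of the surrounding class this method assumes; the class name, base URL, and podcast id are hypothetical, and uRequest is taken to be urllib.request.urlopen:

from urllib.request import urlopen as uRequest

class PodcastPage:
    def __init__(self, rootUrl, podcastId):
        self.rootUrl = rootUrl      # base URL of the podcast site (placeholder)
        self.podcastId = podcastId  # path fragment for one podcast (placeholder)
        self.page = None

    def downloadRawHtml(self):      # same method as above, shown in context
        webClient = uRequest(self.rootUrl + self.podcastId)
        page_rawhtml = webClient.read()
        webClient.close()
        self.page = page_rawhtml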
Example #9
"""
Created on Fri May 17 16:32:06 2019

@author: stephaniejones
"""
# https://github.com/jsubroto/billboard-hot-100-web-scraper/blob/master/billboard_hot_100_websraper.py

from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

import http.client

conn = http.client.HTTPSConnection("www.sanjamar.com")
conn.request("GET", "/")
r1 = conn.getresponse()
print(r1.status, r1.reason)

url = 'https://www.billboard.com/charts/hot-100'

# Opening up connection, grabbing the page
uClient = uRequest(url)
page_html = uClient.read() # Offloads content into a variable
uClient.close() # Close the client

# HTML parsing
page_soup = soup(page_html, "html.parser")

# Grabs all information related to the top 100 songs
containers = page_soup.select('article[class*=chart]') # *= means contains

filename = 'billboard_hot_100.csv'
f = open(filename, 'w') # w = write

headers = 'Song, Artist, Last Week, Peak Position, Weeks on Chart\n'

f.write(headers)
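The `*=` ("contains") attribute selector used above can be checked on a tiny made-up snippet:

from bs4 import BeautifulSoup

demo_html = '<article class="chart-row"></article><article class="other"></article>'
demo_soup = BeautifulSoup(demo_html, "html.parser")
print(len(demo_soup.select('article[class*=chart]')))  # 1 -- only the class containing "chart" matches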
Example #10
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

my_url="https://ng.indeed.com/jobs?q=solar&l&vjk=9b861d63eaeecede"

#Opening connection and grabbing the page at the URL
uClient = uRequest(my_url)

#Read the downloaded page and storing it in a variable
page_html = uClient.read()

#close the connection so we don't leave it opened
uClient.close()

#page parser
page_soup = soup(page_html, "html.parser")


def extract_job_title_from_result(soup):
    jobs = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            jobs.append(a["title"])
    return jobs

extract_job_title_from_result(page_soup)



def extract_company_from_result(soup):
Example #11
def __init__(self, url: str):
    self.url = url
    u_client = uRequest(url)
    self.page_html = u_client.read()
    self.page_soup = soup(self.page_html, features="lxml")
    deal_str = ''
    sale_str = ''
    rating = ''
    review_total = ''
    true_fit_percentage = ''
    too_small_percentage = ''
    too_large_percentage = ''
    review_info_str = ''
    color_str = ''
    if page == 1:  # page 1 has different url format
        my_url = 'https://www.zaful.com/w/floral-dresses/e_5/'
    else:
        my_url = f'https://www.zaful.com/w/floral-dresses/e_5/g_{page}.html'

    # opening up connection, grabbing page, then close
    first_uClient = uRequest(my_url)
    page_html = first_uClient.read()
    first_uClient.close()
    # html parser
    floral_dresses_soup = soup(page_html, "html.parser")
    # grabs each product
    floral_dresses = floral_dresses_soup.findAll(
        "li", {"class": "js_proList_item logsss_event_ps"})

    # floral_dresses = floral_dresses[0:15]
    for floral_dress in floral_dresses:  # loops over items on page

        js_data = json.loads(
            floral_dress.div.a.img['data-logsss-browser-value'].replace(
                "\'", "\""))
        rank = str(js_data['bv']['rank'])
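Example #11 also relies on `import json`, which the excerpt does not show. The replace before json.loads is needed because the data-logsss-browser-value attribute stores JSON written with single quotes; a tiny standalone check of that trick (the sample string is made up):

import json

raw = "{'bv': {'rank': 3}}"                 # single-quoted pseudo-JSON, like the attribute value
parsed = json.loads(raw.replace("'", '"'))  # valid JSON once the quotes are swapped
print(parsed['bv']['rank'])                 # 3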
Example #13
import bs4
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as bSoup
myUrl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
uClient = uRequest(myUrl)
data = uClient.read()
uClient.close()
pageData = bSoup(data, "html.parser")

containers = pageData.find_all("div", {"class": "item-container"})
fileName = "products.csv"
f = open(fileName, "w")
header = "Brand, ProductName, shipping\n"
f.write(header)
print(pageData.h1)
# print(len(containers))
# print(containers[1].find("div", {"class": "item-info"}))

for container in containers:
    brand = container.find("div", {"class": "item-info"}).div.a.img["title"]
    titleContainer = container.findAll("a", {"class": "item-title"})
    productTitle = titleContainer[0].text
    shippingContainer = container.findAll("li", {"class": "price-ship"})
    shipping = shippingContainer[0].text.strip()
    print(brand)
    print(productTitle)
    print(shipping)
    f.write(brand + "," + productTitle.replace(",", "|") + "," + shipping +
            "\n")
Example #14
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uRequest

file_name = 'fsktm.csv'
f = open(file_name, 'w')
f.write('Title,Name,URL,Department,Phone Number,Email,Expertise\n')

inital_url = 'https://umexpert.um.edu.my/cv_search_page.php?selCat=01&txtname=&fak=C&dept=-&page_no={}'

for i in range(1, 10):
    # global str
    uClient = uRequest(inital_url.format(str(i)))
    temp_page = uClient.read()
    uClient.close()

    soup_page = soup(temp_page, "html.parser")
    containers = soup_page.findAll("table", {"style": "border:1px solid #CCC; border-bottom:"})
    for container in containers: 
        name = container.strong.a.text.strip()
        title = container.strong.find_next("br").next.strip()
        department = container.strong.find_next("br").next.next.next.strip().replace("\r\n                           ", "").replace(',', '.')
        
        try:
            phone_num = container.find("i", {"class": "fa fa-phone-square"}).next.replace("\xa0", "").replace(" ", "").replace("\t", "")
        except:
            print('Phone number error, {}'.format(name))

        try:
            email = container.find("i", {"class": "fa fa-envelope"}).next.replace("\xa0", "").replace(" ", "").replace("\t", "")
        except:
            print('Email error, {}'.format(name))
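One caveat in the try/except blocks above: when a lookup fails, phone_num or email stays unbound for that row. A hedged variant that catches AttributeError specifically and falls back to an empty string (shown for the phone number; the email block would mirror it):

try:
    phone_num = container.find("i", {"class": "fa fa-phone-square"}).next.replace("\xa0", "").replace(" ", "").replace("\t", "")
except AttributeError:
    phone_num = ''  # keep a value so any later write of this row still works
    print('Phone number error, {}'.format(name))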
Example #15
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

# The destination URL to be scraped (This is newegg graphics cards)
myURL = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics+cards"

uClient = uRequest(myURL)	# Opening the connection to the website, grabbing the page
pageHTML = uClient.read()	# Loading page contents into variable
uClient.close()				    # Close the connection 

# Parsing the webpage data with BS
pageSoup = soup(pageHTML, "html.parser")	

# Store each desired item to a list
containers = pageSoup.findAll("div", {"class":"item-cell"})

# Creating a new csv file to store data
fileName = "products.csv"      
f = open(fileName, 'w')

# Setting the headers for columns
headers = "brand, productName, price, shipPrice\n"

# Adding the headers to the csv file
f.write(headers)

# Loop through each item
for container in containers:
	# Getting the brand of the product
	brand = container.div.div.a.img["title"]