def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"

    # opening up connection, grabbing the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})
    for container in containers:
        fields = containerScrape(container)
        csv_string = ",".join(fields[0:7])
        if fields[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(fields[1])
            print(csv_string)
            f.write(csv_string + "\n")
    containers.clear()
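# mainPageScrape assumes UReq/soup aliases, a module-level descriptionlog
# list, and a containerScrape helper, none of which are shown in this
# collection. Below is a hypothetical sketch of those pieces, matching the
# Newegg item-container markup used by the other snippets; the field
# choices are assumptions, not the project's real containerScrape.
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup

descriptionlog = []

def containerScrape(container):
    # Pull seven comma-safe fields out of one item container.
    brand = container.div.div.a.img["title"]
    title = container.findAll("a", {"class": "item-title"})[0].text
    price = container.findAll("li", {"class": "price-current"})[0].text.strip()
    ship = container.findAll("li", {"class": "price-ship"})[0].text.strip()
    fields = [brand, title, price, ship, "", "", ""]
    return [fld.replace(",", "|") for fld in fields]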
def parse_page(url):
    # fetch the url and return it as a parsed BeautifulSoup tree
    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')
    return page_parsed
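# A quick usage sketch for parse_page, assuming Ureq and Bsoup are the
# urlopen/BeautifulSoup aliases used throughout these snippets.
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as Bsoup

page_soup = parse_page('https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38')
print(page_soup.title)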
def getMostBoughtData():
    my_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-bought-by-hedge-funds/'

    # saves the information from the url into the client
    Client = Req(my_url)
    page_html = Client.read()
    Client.close()

    # parses the html of the website
    page_soup = soup(page_html, "html.parser")

    # finds all the parts of the webpage that hold the titles
    symbols = page_soup.findAll("h2", {"class": "Fz(m)"})
    tickers = page_soup.findAll("a", {"class": "Fw(b)"})
    prices_html = page_soup.findAll(
        "td", {"class": "data-col2 Ta(end) Pstart(10px) Pend(6px) Fw(b)"})
    changes_html = page_soup.findAll(
        "td", {"class": "data-col4 Ta(end) Pstart(10px) Pend(6px)"})
    volumes_html = page_soup.findAll(
        "td", {"class": "data-col6 Ta(end) Pstart(10px) Pend(6px)"})
    avg_volumes_html = page_soup.findAll(
        "td", {"class": "data-col7 Ta(end) Pstart(10px) Pend(6px)"})

    # symbol count, parsed from the first two characters of the second h2 heading
    sym = int(symbols[1].text[0:2])

    # skip navigation links ("Tech", "News", long names) until the first
    # real ticker link is found
    x = 0
    data = []
    while x < 48 - 1:
        if len(tickers[x].text) > 5 or tickers[x].text == "Tech" or tickers[x].text == "News":
            x = x + 1
        else:
            break

    # collect (ticker, price, change, volume, avg volume) for each symbol
    y = x
    while x < y + sym:
        ticker = tickers[x].text
        price = prices_html[x - (y + sym)].text
        change_str = changes_html[x - (y + sym)].text
        # drop the leading '+' from positive changes
        change = change_str[1:] if change_str.startswith('+') else change_str
        volume = volumes_html[x - (y + sym)].text
        avg_volume = avg_volumes_html[x - (y + sym)].text
        data.append((ticker, price, change, volume, avg_volume))
        x = x + 1
    return data
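# Hedged usage sketch: print the watchlist rows returned by
# getMostBoughtData, assuming the Req/soup aliases below.
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

for ticker, price, change, volume, avg_volume in getMostBoughtData():
    print(ticker, price, change, volume, avg_volume)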
def get_insider_trading_data():
    my_url = 'http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd=730&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=1000&page=1'
    Client = Req(my_url)
    page_html = Client.read()
    Client.close()
    page_soup = soup(page_html, "html.parser")

    # ticker links are the anchors with an UnTip() mouseout handler
    findsT = page_soup.findAll("a", {"onmouseout": "UnTip()"})
    # all table cells; the insider's title lives at a fixed offset
    findsTi = page_soup.findAll("td")
    # right-aligned cells hold the numeric columns (price, quantity, owned)
    findsNum = page_soup.findAll("td", {"align": "right"})

    # hard-coded offsets and strides into the screener table:
    x = 0   # row counter
    y = 22  # price of the stock when bought
    z = 23  # quantity of stocks purchased
    w = 24  # number of stocks already owned
    a = 72  # title of the person insider trading
    data = []
    while x < 1000:
        findT = findsT[x]["href"].replace("/", "")
        findP = findsNum[y].text
        findQ = findsNum[z].text
        findO = findsNum[w].text
        findTitle = findsTi[a].text
        data.append((findT, findTitle, findP, findQ, findO))
        x += 1
        y += 12
        z += 12
        w += 12
        a += 17
    return data
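# Same idea for the openinsider screener: a short, assumed usage example
# (relies on the same Req/soup aliases as the sketch above).
for ticker, title, price, qty, owned in get_insider_trading_data():
    print(ticker, title, price, qty, owned)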
def getPage(self):
    # fetch self.url and cache the raw html on the instance
    uClient = UReq(self.url)
    self.page_html = uClient.read()
    uClient.close()
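# getPage is a method, so it presumably sits on a small scraper class.
# A hypothetical enclosing class (the class name and constructor are
# assumptions; only getPage itself appears in the original):
from urllib.request import urlopen as UReq

class PageScraper:
    def __init__(self, url):
        self.url = url
        self.page_html = None

    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()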
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"

# opening url and grabbing the page
uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = soup(page_html, "html.parser")
#print(page_soup.h1)  # prints h1
#print(page_soup.p)   # prints paragraphs
#print(page_soup.body.div)

# grab each product
containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# open the output file and write the column headers
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand, product name, shipping\n"
f.write(headers)

# the 3 lines below inspect the first (0th) container
#container = containers[0]
#print(container.a)
#print(container.div.div.a.img["title"])  # will return the title
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

uclient = Req(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products_newegg.csv"
f = open(filename, "w")
headers = "Brand, Product_name\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    print("Brand: " + brand)
    print("Product Name: " + product_name)
    # commas in the product name would break the csv, so swap them for pipes
    f.write(brand + "," + product_name.replace(",", "|") + "\n")

f.close()
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'
    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()

            # opening up connection, grabbing the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()

            # html parsing
            page_soup = soup(page_html, "html.parser")

            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})
            for container in containers:
                fields = containerScrape(container)
                csv_string = ",".join(fields[0:7])
                if fields[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount = duplicateCount + 1
                else:
                    descriptionlog.append(fields[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()

            # once the duplicate count climbs this high, Newegg has started
            # reiterating data, so stop
            if duplicateCount > 100:
                print()
                print("Duplicate Count Is " + str(duplicateCount) +
                      ". This Suggests The Data Is Being Reiterated. The Script Will Stop.")
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break
            page = page + 1
        except IndexError:
            print()
            page = page + 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
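# Hypothetical driver tying mainPageScrape and remainingPagesScrape
# together. The csv header names are assumptions; descriptionlog and
# containerScrape are the module-level pieces sketched after
# mainPageScrape above.
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup

if __name__ == "__main__":
    f = open("processors.csv", "w")
    f.write("Brand,Description,Price,Shipping,Rating,Reviews,Link\n")
    mainPageScrape(f)
    remainingPagesScrape(f)  # closes f once it detects reiterated data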
from urllib.request import urlopen as UR
from bs4 import BeautifulSoup as soup
import re

URL_SIMP = 'https://www.newegg.com/Xbox-One-Systems/SubCategory/ID-3216'

URL_CLIENT = UR(URL_SIMP)
PAGE = URL_CLIENT.read()
URL_CLIENT.close()

PAGE_SOUP = soup(PAGE, "html.parser")
PAGE_CONTENT = PAGE_SOUP.findAll("div", {"class": "item-container"})

filename = "newegg.csv"
f = open(filename, "w")
headers = "PRICE, SHIP, NAME\n"
f.write(headers)

for CONTENT in PAGE_CONTENT:
    PRICE_DATA = CONTENT.findAll("li", {"class": "price-current"})
    PRICE = PRICE_DATA[0].text
    # strip everything but digits and the decimal point
    PRICE_SIMP = re.sub(r"[^\d.]", "", PRICE)

    SHIP_CONTENT = CONTENT.findAll("li", {"class": "price-ship"})
    SHIP = SHIP_CONTENT[0].text.strip()

    NAME = CONTENT.img["title"]

    print("Price: $" + PRICE_SIMP)
    print("Shipping Cost: " + SHIP)
    print("Name of Product: " + NAME)
    print("\n")

    # write one csv row per product, matching the headers above
    f.write(PRICE_SIMP + ", " + SHIP + ", " + NAME.replace(",", "|") + "\n")

f.close()
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# opening up connection, grabbing the page
Uclient = Ureq(my_url)
# it offloads the content into a variable
page_html = Uclient.read()
# close the connection
Uclient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

for container in containers:
    brand = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    print("brand: " + brand)
    print("product_name: " + product_name)
    print("shipping: " + shipping)
def scrape():
    # build the search url from the Entry widget's text
    concat = Sentry.get()
    #my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
    my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
    my_url = my_url.replace(' ', '+')

    uClient = Ureq(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = Soup(page_html, "html.parser")

    # grab each product container
    containers = page_soup.findAll("div", {"class": "item-container"})

    # save the results as a csv named after the search term
    fileName = "{}.csv".format(concat)
    f = open(fileName, "w")
    headers = "BRAND , PRICES , SAVES , TITLES , LINK \n"
    f.write(headers)

    # the first few containers are ads/filters, so skip to the real items
    for container in containers[4:]:
        # brand name and link
        brand_container = container.findAll("a", {"class": "item-brand"})
        brand = brand_container[0].img["title"]
        hyper = brand_container[0]["href"]

        # name of the item being sold
        title = container.a.img["title"]

        # current price; the crude pattern matches strings like ">1,999"
        price_container = container.findAll("li", {"class": "price-current"})
        price_container2 = price_container[0].strong
        price = re.findall(r'.\d.\d\d\d', str(price_container2))
        prices = ''.join(price)
        if prices == "":
            prices = "Not Available"

        # percent saved, if the item is discounted
        save_container = container.findAll("span", {"class": "price-save-percent"})
        save = re.findall(r'\d\d.', str(save_container))
        saves = ''.join(save)
        if saves == '':
            saves = "None"

        # show the results in the listboxes
        brandlistbox.insert(END, " : " + brand)
        pricelistbox.insert(END, "₱ " + prices)
        savelistbox.insert(END, saves)
        Listbox4.insert(END, " : " + title)
        hyperlink.insert(END, ' ' + hyper)

        # write one csv row, stripping commas and the stray '>' from the
        # price match so the columns stay aligned
        f.write(brand.replace(',', '') + ", " +
                prices.replace(',', '.').replace('>', ' ') + ',' +
                saves.replace('None', '0%') + ', ' +
                title.replace(',', '') + ', ' + hyper + "\n")
    f.close()

    new_win = Button(window, width=10, text="New_Win", command=mainwindow,
                     height=1, font="Jokerman", relief=RAISED,
                     activebackground="LightBlue1", background='sky blue')
    new_win.place(x=105, y=90)
    messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

f = open('data.csv', 'w')

url = 'https://www.dicksmith.co.nz/dn/shop/phones/iphone/?page=1'
Client = Req(url)
page_html = Client.read()
Client.close()

page_soup = soup(page_html, 'html.parser')
containers = page_soup.find_all('div', class_='_1umis')
#container = containers[0]

f.write('Phone, Rating, Price\n')
for container in containers:
    Phone = container.find('a', itemprop='url').text
    if container.find('meta', itemprop='ratingValue') is None:
        Rating = 'no rating'
    else:
        Rating = container.find('meta', itemprop='ratingValue')['content']
    Price = container.find('span', itemprop='price')['content']
    f.write(Phone.replace(',', ' |') + ', ' + Rating + ', ' + Price + '\n')
    print(Phone.replace(',', ' |') + ', ' + Rating + ', ' + Price.replace(',', '') + '\n')
f.close()
def GetUrl(url):
    # fetch the page and return it as a parsed BeautifulSoup tree
    Uclient = Req(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
# Declare my_url variable
my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"

# Load my_url contents into Scrapee variable
Scrapee = Req(my_url)

# Extract html to variable Scrapee_html
Scrapee_html = Scrapee.read()

# Close web page
Scrapee.close()

# Parse html into node tree and strip html tags, store as variable Scrapee_soup
Scrapee_soup = soup(Scrapee_html, "html.parser")

# Find matching class data and store into three variables
Scrapee_soup_model = Scrapee_soup.findAll(
    "span", {"class": "srp-list-item-basic-info-model"})
Scrapee_soup_price = Scrapee_soup.findAll(
    "span", {"class": "srp-list-item-price"})
Scrapee_soup_location = Scrapee_soup.findAll(
    "div", {"class": "srp-list-item-dealership-location"})
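# Hypothetical follow-up: the three findAll lists run parallel per
# listing, so zipping them prints each car's model, price, and location
# together (assuming the page returns equal-length lists).
for model, price, location in zip(Scrapee_soup_model, Scrapee_soup_price,
                                  Scrapee_soup_location):
    print(model.text.strip(), "|", price.text.strip(), "|", location.text.strip())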
def fetchPage(params={}):
    get = params.get
    link = get("link")
    ret_obj = {}

    if get("post_data"):
        log("called for : " + repr(params['link']))
    else:
        log("called for : " + repr(params))

    # give up after three failed attempts
    if not link or int(get("error", "0")) > 2:
        log("giving up")
        ret_obj["status"] = 500
        return ret_obj

    if get("post_data"):
        if get("hide_post_data"):
            log("Posting data")
        else:
            log("Posting data: " + urlencode(get("post_data")))
        request = HTTPRequest(link, urlencode(get("post_data")))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    else:
        log("Got request")
        request = HTTPRequest(link)

    if get("headers"):
        for head in get("headers"):
            request.add_header(head[0], head[1])

    request.add_header('User-Agent', USERAGENT)
    if get("cookie"):
        request.add_header('Cookie', get("cookie"))
    if get("refering"):
        request.add_header('Referer', get("refering"))

    try:
        log("connecting to server...")
        con = OpenRequest(request)
        ret_obj["header"] = con.info()
        ret_obj["new_url"] = con.geturl()
        if get("no-content", "false") == "false":
            inputdata = con.read()
            ret_obj["content"] = inputdata.decode("utf-8")
        con.close()
        log("Done")
        ret_obj["status"] = 200
        return ret_obj
    except HTTPError as e:
        err = str(e)
        log("HTTPError : " + err)
        log("HTTPError - Headers: " + str(e.headers) + " - Content: " + str(e.fp.read()))
        # retry with an incremented error count; fetchPage gives up after 3
        params["error"] = str(int(get("error", "0")) + 1)
        ret = fetchPage(params)
        if "content" not in ret and e.fp:
            ret["content"] = e.fp.read()
        return ret

    ret_obj["status"] = 500
    return ret_obj
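# fetchPage leans on names bound elsewhere (HTTPRequest, OpenRequest,
# HTTPError, urlencode, USERAGENT, log). One plausible set of Python 3
# bindings is sketched below; note the function reads like Python 2-era
# code, so under Python 3 the urlencode(...) POST body would also need
# an .encode() before being passed to Request.
from urllib.request import Request as HTTPRequest, urlopen as OpenRequest
from urllib.error import HTTPError
from urllib.parse import urlencode

USERAGENT = "Mozilla/5.0"  # placeholder user-agent string

def log(msg):
    print(msg)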
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open connection, grab the page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')
contents = soup.title
print(contents)
eg. Google Candybar
The webpage will be opened in your laptop.
Send "Close", Chrome browser will be killed.
"""
from urllib.request import urlopen as URL
import urllib.error as err
import webbrowser
import os

my_api = "647242931:AAG7wgAy4Fn-IWrgKn0RskeSnrowZcv6AOc"
temp = ""

while True:
    open_URL = 'https://api.telegram.org/bot' + my_api + '/getupdates'
    try:
        client_URL = URL(open_URL)
        data_URL = client_URL.read().decode('utf-8')
        client_URL.close()

        # crude parse: grab the text of the last update out of the JSON reply
        getstring = data_URL.split("{")
        getdata = getstring[-1].split("\"")
        data = getdata[-2]

        if data == temp:
            pass  # no new message since the last poll
        else:
            print("data received:", data)
            temp = data
            if data[0:6] == "google" or data[0:6] == "Google":
                url = "http://www.google.co.in/search?q=" + data[6:]
                webbrowser.open(url)
            elif data[0:5] == "close" or data[0:5] == "Close":
                print("Closed")
                #os.system('TASKKILL /F /IM chrome.exe')  # works for windows
            else: