def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"
    # opening up connection grabbing the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})
    for container in containers:
        list = containerScrape(container)
        csv_string = (list[0] + "," + list[1] + "," + list[2] + "," + list[3] + ","
                      + list[4] + "," + list[5] + "," + list[6])
        if list[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(list[1])
            print(csv_string)
            f.write(csv_string + "\n")
    containers.clear()
def reporttohub(self, requestdata):
    if requestdata == "":
        webrequest = GenerateWebRequest(self.urlendpoint)
    else:
        webrequest = GenerateWebRequest(self.urlendpoint,
                                        data=requestdata.encode('ascii', 'ignore'))
    tries = 0
    outcome = ""
    while tries < self.maximumtrieslimit:
        try:
            outcome = GetWebPage(webrequest)
            tries = 99999
        except WebError as errorobject:
            tries = tries + 1
            print("Error accessing Hub: ", errorobject.reason)
    if tries == 99999:
        outcome = outcome.read()
        outcome = outcome.decode('utf-8', 'ignore')
    else:
        print("Gave up accessing Hub")
    return outcome
def parse_page(url):
    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')
    return page_parsed
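# A minimal usage sketch for parse_page above, assuming its aliases come from the
# usual imports (Ureq = urllib.request.urlopen, Bsoup = bs4.BeautifulSoup).
# The URL is one of the example pages used elsewhere in this collection.
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as Bsoup

page_parsed = parse_page("http://pythonscraping.com/pages/page1.html")
print(page_parsed.title.get_text())  # quick sanity check: print the page title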
def get_json(url):
    req = Request(url, headers=hdr)
    page = Ureq(req)
    try:
        js = page.read().decode()
        js = json.loads(js)
    except:
        js = None
    return js
def getMostBoughtData():
    my_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-bought-by-hedge-funds/'
    # saves the information from the url into the client
    Client = Req(my_url)
    # saves the raw page html
    page_html = Client.read()
    Client.close()
    # parses the html of the website
    page_soup = soup(page_html, "html.parser")
    # finds all the parts of the webpage that would hold the titles
    symbols = page_soup.findAll("h2", {"class": "Fz(m)"})
    tickers = page_soup.findAll("a", {"class": "Fw(b)"})
    prices_html = page_soup.findAll(
        "td", {"class": "data-col2 Ta(end) Pstart(10px) Pend(6px) Fw(b)"})
    changes_html = page_soup.findAll(
        "td", {"class": "data-col4 Ta(end) Pstart(10px) Pend(6px)"})
    volumes_html = page_soup.findAll(
        "td", {"class": "data-col6 Ta(end) Pstart(10px) Pend(6px)"})
    avg_volumes_html = page_soup.findAll(
        "td", {"class": "data-col7 Ta(end) Pstart(10px) Pend(6px)"})
    sym = int(symbols[1].text[0:2])
    x = 0
    data = []
    # skip over links that are not ticker symbols
    while x < 48 - 1:
        if len(tickers[x].text) > 5 or tickers[x].text == "Tech" or tickers[x].text == "News":
            x = x + 1
        else:
            break
    y = x
    while x < y + sym:
        ticker = tickers[x].text
        price = prices_html[x - (y + sym)].text
        change_str = changes_html[x - (y + sym)].text
        if change_str.startswith('+'):
            change = change_str[1:]
        else:
            change = change_str
        volume = volumes_html[x - (y + sym)].text
        avg_volume = avg_volumes_html[x - (y + sym)].text
        data.append((ticker, price, change, volume, avg_volume))
        x = x + 1
    return data
def json_from_url(url, params=None):
    try:
        from urllib.request import urlopen as Urlopen
        from urllib.parse import quote as Quote
    except ImportError:
        from urllib import pathname2url as Quote
        from urllib2 import urlopen as Urlopen
    if params:
        url += Quote(params)
    res = Urlopen(url)
    body = res.read().decode()
    return json.loads(body)
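# A minimal usage sketch for json_from_url above, assuming `json` is already imported
# at module level (the function itself only imports the urlopen/quote helpers lazily).
# The endpoint is a hypothetical placeholder, not an API used elsewhere in this code.
import json

data = json_from_url("https://example.com/api/search?q=", params="graphics card")
print(data)  # parsed JSON arrives as a dict or list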
def __init__(self, url):
    """ Class constructor """
    try:
        response = UrlOpen(url)
        content = response.read()
        data = Load(content.decode("UTF-8"))
        self.bitcoin_data(data)
    except HTTPError as error:
        print("Error Code: {}".format(error.code))
def hockey_bet():
    # Pull in url for schedule
    # TODO: Check date, and if it is not during the season, exit function
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    game = [team.get_text() for team in game]
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date', 'Visitor', 'Home']
    # Clean team names into readable format
    row_count = 0
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    while row_count < len(df):
        if visitor[2][row_count] == None:
            df['Visitor'][row_count] = visitor[1][row_count]
        elif visitor[2][row_count] != None:
            df['Visitor'][row_count] = visitor[2][row_count]
        if home[2][row_count] == None:
            df['Home'][row_count] = home[1][row_count]
        elif home[2][row_count] != None:
            df['Home'][row_count] = home[2][row_count]
        row_count += 1
    # Only select todays games
    todays_date = datetime.now().strftime('%Y-%m-%d')
    todays_games = df[df['Date'] == todays_date]
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor', 'Home']]
    return todays_games
def get_insider_trading_data():
    my_url = 'http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd=730&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=1000&page=1'
    Client = Req(my_url)
    page_html = Client.read()
    Client.close()
    page_soup = soup(page_html, "html.parser")
    # finds the ticker locations using this method
    findsT = page_soup.findAll("a", {"onmouseout": "UnTip()"})
    # finds the title of the person that was insider trading
    findsTi = page_soup.findAll("td")
    # finds the numeric columns (price, quantity, shares owned)
    findsNum = page_soup.findAll("td", {"align": "right"})
    x = 0    # index into the ticker links
    y = 22   # finds the price of the stock when bought
    z = 23   # finds the quantity of stocks purchased
    w = 24   # finds the number of stocks already owned
    a = 72   # finds the title of the person insider trading
    data = []
    while x < 1000:
        findT = findsT[x]
        findT = findT["href"].replace("/", "")
        findP = findsNum[y].text
        findQ = findsNum[z].text
        findO = findsNum[w].text
        findTi = findsTi[a].text
        data.append((findT, findTi, findP, findQ, findO))
        x += 1
        y += 12
        z += 12
        w += 12
        a += 17
    return data
def getCount(switch, phrase):
    print('Checking for phrase:' + str(phrase))
    Count = 0
    try:
        headers = {}
        headers['User-Agent'] = useragent()
        if switch == 'err':
            return 0
        query = URLENCODE({'q': phrase})
        URL = SEARCH['Google'] % query
        REQ = REQUEST(URL, headers=headers)
        RESP = OPEN(REQ)
        RESULTS = RESP.read().decode("utf-8")
        JSON_RES = LOAD(RESULTS)
        # Suspected Terms of Service abuse
        if JSON_RES['responseStatus'] == 403:
            if switch == 'False':
                print("Suspicion detected by Google-Sleep for 30 seconds")
                time.sleep(30)  # Sleep 30 seconds after blockage.
                getCount('True', phrase)
            elif switch == 'True':
                print("Suspicion detected by Google-Sleep for 60 seconds")
                time.sleep(60)  # Sleep again, at least execution stays safe
                getCount('err', phrase)
        elif JSON_RES['responseStatus'] == 200:
            # Everything is good
            DATA = JSON_RES['responseData']
            if len(DATA['results']) > 1:
                # Looks stupid but this is how it works
                count = DATA['cursor']['estimatedResultCount']
                Count = int(count)
            else:
                Count = 1
        time.sleep(3)
        return Count
    except Exception as ex:
        print(str(ex))
def baseball_bet():
    # Set the current date in a readable form and the form used for the html
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')
    # Set Opening Day date
    opening_day = "03-26-2020"
    # Parse OD date
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    # Set current date
    present = datetime.now()
    # If it is before OD, return from function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return
    # Set url for todays date if season has already started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html
    # Make sure that there are actually games being played
    # If there are not, the url will not work
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except:
        print('There are no games being played on this day.')
        return
    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []
    # Fix dates given into readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = ""
        for i in game_date:
            sent_str += str(i) + "-"
        sent_str = sent_str[:-1]
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        date = date.strftime('%m-%d-%Y')
        game_date_list.append(date)
    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    game_list = []
    for item in game:
        # The abbrvs are only the last three characters in the str
        item = item[-3:]
        game_list.append(item)
    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week = game_list[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date]
    # Apply the lambda function that will clean the team names into more colloquial names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    # return data frame of games that are being played today
    return todays_games
import bs4
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
uclient = Req(my_url)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products_newegg.csv"
f = open(filename, "w")
headers = "Brand, Product_name\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    print("Brand: " + brand)
    print("Product Name: " + product_name)
    f.write(brand + "," + product_name.replace(",", "|") + "\n")
def scrape():
    ####################################################################################
    concat = Sentry.get()
    # my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
    my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
    my_url = my_url.replace(' ', '+')
    ####################################################################################
    uClient = Ureq(my_url)
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = Soup(page_html, "html.parser")
    # grab each item container
    containers = page_soup.findAll("div", {"class": "item-container"})
    # manufacturer = page_soup.findAll("label", {"class": "form-checkbox"})
    # print(manufacturer)
    # print(len(containers))
    # print(containers[5:])
    # container = containers[5]
    # ---------------------------------------- save the csv file
    fileName = "{}.csv".format(concat)
    f = open(fileName, "w")
    headers = "BRAND , PRICES , SAVES , TITLES , LINK \n"
    # f.write(headers)
    for container in containers[4:]:
        # ---------------------------------------------------------
        brand_container = container.findAll("a", {"class": "item-brand"})
        brand = brand_container[0].img["title"]  # brand name
        # -------------------------------------------------------------------
        may_know = container.findAll("a", {"class": "item-title"})
        # print(may_know)
        ####################################################################
        title = container.a.img["title"]  # name of the item being sold
        # print(container)
        #######################################################
        hyper = brand_container[0]["href"]
        # hyper = container.findAll("div", {"class": "item-info"})
        # hyper = hypers.a
        # print(hyper)
        # --------------------------------------------------------------
        price_container = container.findAll("li", {"class": "price-current"})
        price_container2 = price_container[0].strong
        price = re.findall(r'.\d.\d\d\d', str(price_container2))
        prices = ''.join(price)
        # ------------------------------------------------------------------------
        save_container = container.findAll("span", {"class": "price-save-percent"})
        save = re.findall(r'\d\d.', str(save_container))
        saves = ''.join(save)
        if saves == '':
            saves = "None"
        if prices == "":
            prices = "Not Available"
        brandlistbox.insert(END, " : " + brand)
        pricelistbox.insert(END, "₱ " + prices)
        savelistbox.insert(END, saves)
        Listbox4.insert(END, " : " + title)
        hyperlink.insert(END, ' ' + hyper)
        # -------------------------------------------------------------------------
        f.write(brand.replace(',', '') + ", " +
                prices.replace(',', '.').replace('0', '1').replace('>', ' ') + ',' +
                saves.replace('', '').replace('None', '0%') + ', ' +
                title.replace(',', '') + ', ' + hyper + "\n")
    f.close()
    new_win = Button(window, width=10, text="New_Win", command=mainwindow,
                     height=1, font="Jokerman", relief=RAISED,
                     activebackground="LightBlue1", background='sky blue')
    new_win.place(x=105, y=90)
    messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
import re
from statistics import mean
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

# Declare my_url variable
my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"
# Load my_url contents into Scrapee variable
Scrapee = Req(my_url)
# Extract html to variable Scrapee_html
Scrapee_html = Scrapee.read()
# Close web page
Scrapee.close()
# Parse html into node tree and strip html tags, store as variable Scrapee_soup
Scrapee_soup = soup(Scrapee_html, "html.parser")
# Find matching class data and store into three variables
Scrapee_soup_model = Scrapee_soup.findAll(
    "span", {"class": "srp-list-item-basic-info-model"})
Scrapee_soup_price = Scrapee_soup.findAll("span",
def retrieve_raw(self, request):
    request = self.auth.add_authorization_header(request)
    req = UrlLibOpen(request)
    res = json.loads(req.read().decode('utf-8'))
    return (req, res)
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open connection to the page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')
contents = soup.title
print(contents)
from urllib.request import urlopen as URL
import bs4
from bs4 import BeautifulSoup as BS

# Created a tuple of the URLs which need to be scraped
GetURL = ("https://www.moneycontrol.com/india/stockpricequote/computers-software/infosys/IT",
          "https://www.moneycontrol.com/india/stockpricequote/computers-software/tataconsultancyservices/TCS")

# Blank list to store values
printlist = []

for eachURL in GetURL:
    html = URL(eachURL)
    HTML = html.read()
    GetHTMLData = BS(HTML, 'lxml')
    title = GetHTMLData.title
    # print(title)
    text = title.get_text()
    printlist.append(text)
    rows = GetHTMLData.find_all('span')
    for row in rows:
        Price_Item = row.get("id")
        if (Price_Item == "Bse_Prc_tick") or (Price_Item == "Nse_Prc_tick"):
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"

# opening url and grabbing the page
uClint = Ureq(my_url)
page_html = uClint.read()
uClint.close()

# html parser
page_soup = soup(page_html, "html.parser")
# print(page_soup.h1)  # prints H1
# print(page_soup.p)  # prints paragraphs
# print(page_soup.body.div)

# grab each product
containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# to open a file
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand, product name, shipping\n"
f.write(headers)

# below 3 lines of code are for container 1, that is the 0th
# container = containers[0]
# print(container.a)
# print(container.div.div.a.img["title"])  # will return the title
#! /usr/bin/env python3.6
from urllib.request import urlopen as Uop
from bs4 import BeautifulSoup as BS4Soup

html = Uop("http://pythonscraping.com/pages/page1.html")
bs_obj = BS4Soup(html.read(), "html.parser")

print("bs_obj.html.body.h1 = ", bs_obj.html.body.h1)
print("bs_obj.html.h1 = ", bs_obj.html.h1)
print("bs_obj.body.h1 = ", bs_obj.body.h1)
print("bs_obj.h1 = ", bs_obj.h1)

# print(bs_obj)
# print(html.read())

# bs_obj_local_html = BS4Soup("./simple_prac.html", "html.parser")
# print("bs_obj_local_html.h2 = ", bs_obj_local_html.h2)
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as Soup
from selenium import webdriver

# The Website
driver = webdriver.Firefox()
driver.get(
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
my_url = (
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
# ----------------------------------------------------------------------------------------------------------------------
# Uclient downloads the Url which is stored in the variable my_url
Uclient = UReq(my_url)
# This reads my HTML which has been downloaded
Html = Uclient.read()
# Closes the HTML to prevent the console from crashing
Uclient.close()
# -----------------------------------------------------------------------------------------------------------------------
# Parses the HTML
Page_soup = Soup(Html, "html.parser")
# Grabs each product
# mobile = driver.find_elements_by_xpath('//span[@class="call-modal__phone_number"]')
modals = driver.find_elements_by_xpath('//*[@data-testid="lpv-call-button"]')
containers = Page_soup.findAll("div", {"class": "ListItem__Root-sc-1i3osc0-1 hMPXKC"})

# Creating the file, the headers and the name of the file
filename = "properties.csv"
f = open(filename, "w")
headers = "Property name" + '|' + "Location" + '|' + "Price" + '|' + "Bedrooms" + '|' + "PhoneNumber" + "\n"
def GetUrl(url):
    Uclient = Req(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# opening up connection, grabbing the page
Uclient = Ureq(my_url)
# it offloads the content into a variable
page_html = Uclient.read()
# close the connection
Uclient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

for container in containers:
    brand = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    print("brand: " + brand)
    print("product_name: " + product_name)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# importing libraries
from datetime import datetime
from datetime import timedelta
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup

now = datetime.now()
todaysDay = now.day
todaysMonth = now.month

# Fetching altomfotball webpage
my_url = "http://www.altomfotball.no/element.do?cmd=tournament&tournamentId=1&useFullUrl=false"
uClient = UReq(my_url)
oversiktsside = uClient.read()
uClient.close()

# Tables for upcoming matches
datoliste = []
rundeliste = []
konkurranseliste = []
hjemmelagliste = []
stillingliste = []
bortelagliste = []
kanalliste = []

# HTML parsing
page_soup = soup(oversiktsside, "html.parser")

# fills the individual lists with information about the upcoming round
def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    results = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    results = [team.get_text() for team in results]
    results_drop = ['LOG']
    results = [results for results in results if results not in results_drop]
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date', 'Visitor', 'Home']
    # Clean team names into readable format
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week = results[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    row_count = 0
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    while row_count < len(df):
        if visitor[2][row_count] == None:
            df['Visitor'][row_count] = visitor[1][row_count]
        elif visitor[2][row_count] != None:
            df['Visitor'][row_count] = visitor[2][row_count]
        if home[2][row_count] == None:
            df['Home'][row_count] = home[1][row_count]
        elif home[2][row_count] != None:
            df['Home'][row_count] = home[2][row_count]
        row_count += 1
    # Only select todays games
    df_1.columns = ['Visitor_Goals', 'Home_Goals', 'Attendance', 'Time']
    total_df = pd.concat([df, df_1], axis=1, join='inner')
    win_count = 0
    win_list = []
    while win_count < len(total_df):
        if (total_df['Visitor_Goals'][win_count]) > (total_df['Home_Goals'][win_count]):
            win_list.append(total_df['Visitor'][win_count])
        elif (total_df['Home_Goals'][win_count]) > (total_df['Visitor_Goals'][win_count]):
            win_list.append(total_df['Home'][win_count])
        elif ((total_df['Home_Goals'][win_count]) != '' and
              (total_df['Visitor_Goals'][win_count]) != '' and
              (total_df['Home_Goals'][win_count]) == (total_df['Visitor_Goals'][win_count])):
            win_list.append('Tie')
        else:
            win_list.append('Incomplete')
        win_count += 1
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
def basketball_bet():
    # Get the current month and day in order to get the games playing today
    current_month_text = datetime.now().strftime('%B').lower()
    current_day = datetime.now().strftime('%d')
    # Pull the url based on the current month
    try:
        url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-' + current_month_text + '.html'
    except:
        print('There are currently no basketball games being played today')
        return
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns
    row_count = 0
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    while row_count < len(df_1):
        df_1['Date'][row_count] = new[2][row_count][:-1]
        row_count += 1
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time (EST)', 'Stat1', 'Stat2', 'Stat3']
    df = df['Game_Time (EST)']
    # Concat the dataframes to get desired data
    todays_games = pd.concat([df_1, df], axis=1, join='inner')
    todays_games = todays_games[todays_games['Date'] == current_day]
    # If there are no games being played, exit function
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
        return
    # Clean team names into more readable forms
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor', 'Home', 'Game_Time (EST)']]
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    # Return games being played today
    return todays_games
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'
    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()
            # opening up connection grabbing the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()
            # html parsing
            page_soup = soup(page_html, "html.parser")
            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})
            for container in containers:
                list = containerScrape(container)
                csv_string = (list[0] + "," + list[1] + "," + list[2] + "," + list[3] + ","
                              + list[4] + "," + list[5] + "," + list[6])
                if list[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount = duplicateCount + 1
                else:
                    descriptionlog.append(list[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()
            if duplicateCount > 100:
                print()
                print("Duplicate Count Is " + str(duplicateCount) +
                      ". This Suggests The Data Is Being Reiterated. The Script Will Stop.")
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break
            page = page + 1
        except IndexError as e:
            print()
            page = page + 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
def getPage(self):
    uClient = UReq(self.url)
    self.page_html = uClient.read()
    uClient.close()
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as Req

url = "https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"
Client = Req(url)
page_html = Client.read()
page_soup = soup(page_html, "html.parser")
# print(page_soup.contents)

containers = page_soup.findAll("div", {"class": "_3O0U0u"})
container = containers[0]
print(container.div.img['alt'])

price = container.findAll("div", {"class": "col col-5-12 _2o7WAb"})
print(price[0].text)

rating = container.findAll("div", {"class": "hGSR34"})
print(rating[0].text)

filename = "flpphn.csv"
f = open(filename, "w")
headers = "Products_Name,Pricing,Ratings\n"
def fetchPage(params={}):
    get = params.get
    link = get("link")
    ret_obj = {}
    if get("post_data"):
        log("called for : " + repr(params['link']))
    else:
        log("called for : " + repr(params))
    if not link or int(get("error", "0")) > 2:
        log("giving up")
        ret_obj["status"] = 500
        return ret_obj
    if get("post_data"):
        if get("hide_post_data"):
            log("Posting data")
        else:
            log("Posting data: " + urlencode(get("post_data")))
        request = HTTPRequest(link, urlencode(get("post_data")))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    else:
        log("Got request")
        request = HTTPRequest(link)
    if get("headers"):
        for head in get("headers"):
            request.add_header(head[0], head[1])
    request.add_header('User-Agent', USERAGENT)
    if get("cookie"):
        request.add_header('Cookie', get("cookie"))
    if get("refering"):
        request.add_header('Referer', get("refering"))
    try:
        log("connecting to server...")
        con = OpenRequest(request)
        ret_obj["header"] = con.info()
        ret_obj["new_url"] = con.geturl()
        if get("no-content", "false") == u"false" or get("no-content", "false") == "false":
            inputdata = con.read()
            ret_obj["content"] = inputdata.decode("utf-8")
        con.close()
        log("Done")
        ret_obj["status"] = 200
        return ret_obj
    except HTTPError as e:
        err = str(e)
        log("HTTPError : " + err)
        log("HTTPError - Headers: " + str(e.headers) + " - Content: " + e.fp.read())
        params["error"] = str(int(get("error", "0")) + 1)
        ret = fetchPage(params)
        if "content" not in ret and e.fp:
            ret["content"] = e.fp.read()
        return ret
    ret_obj["status"] = 500
    return ret_obj
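# A minimal usage sketch for fetchPage above, assuming the helpers it references
# (HTTPRequest, OpenRequest, urlencode, HTTPError, log, USERAGENT) are this module's
# urllib aliases, logger, and user-agent string. The URL and header are illustrative.
result = fetchPage({
    "link": "http://pythonscraping.com/pages/page1.html",
    "headers": [("Accept-Language", "en-US")],
})
if result.get("status") == 200:
    print(result["new_url"])
    print(result["content"][:200])  # first 200 characters of the fetched page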
# Iterate through every page on https://pitchfork.com/reviews/albums/
while True:
    url = (base_url_main_page + "?page=" + str(page_numbers))
    # iterate through until no page is found. Ignore other HTTP response errors
    try:
        response = Ureq(url)
    except urllib.error.HTTPError as e:
        error_message = e.read()
        if e.getcode() == 404:
            sys.exit("No page found")
        else:
            print(error_message)
    else:
        page_html = response.read()
        page_soup = soup(page_html, "html.parser")
        url_names = page_soup.findAll("div", class_="review")
        count = 0
        # enter urls of album reviews
        for item in url_names:
            url_name = url_names[count].a["href"]
            album_url = (base_url_album_pages + url_name)
            # ignore HTTP response errors
            try:
                album_response = Ureq(album_url)
            except urllib.error.HTTPError as ea:
def basketball_win(date):
    current_month = date[0:2]
    current_day = date[3:5]
    string = current_month
    current_month_text = datetime.strptime(string, "%m")
    current_month_text = datetime.strftime(current_month_text, "%B").lower()
    # Pull the url based on the current month
    try:
        url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-' + current_month_text + '.html'
    except:
        print('There are currently no basketball games being played today')
        return
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns
    row_count = 0
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    while row_count < len(df_1):
        df_1['Date'][row_count] = new[2][row_count][:-1]
        row_count += 1
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time', 'Visitor_Points', 'Home_Points', 'Stat3']
    df.drop(columns=['Stat3'], inplace=True)
    total_df = pd.concat([df_1, df], axis=1, join='inner')
    win_list = []
    row_count = 0
    for row in total_df['Date']:
        if (total_df['Visitor_Points'][row_count]) > (total_df['Home_Points'][row_count]):
            win_list.append(total_df['Visitor'][row_count])
        elif (total_df['Home_Points'][row_count]) > (total_df['Visitor_Points'][row_count]):
            win_list.append(total_df['Home'][row_count])
        elif ((total_df['Home_Points'][row_count]) != '' and
              (total_df['Visitor_Points'][row_count]) != '' and
              (total_df['Home_Points'][row_count]) == (total_df['Visitor_Points'][row_count])):
            win_list.append('Tie')
        else:
            win_list.append('Incomplete')
        row_count += 1
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == current_day]
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    todays_games['Winner'] = todays_games['Winner'].apply(lambda x: teams_dict[x])
    return todays_games
def football_bet():
    # Ensure that the football season is currently going on
    # (compare real dates rather than raw strings)
    today = datetime.now()
    season_end = datetime.strptime('February 2 2020', '%B %d %Y')
    season_start = datetime.strptime('September 10 2020', '%B %d %Y')
    if today > season_end and today < season_start:
        print(
            "The next football season hasn't begun yet. Please come back on September 10."
        )
        return
    elif today < season_end:
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'
    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]
    # Set todays date that will be used to select todays games
    date = datetime.now().strftime('%B %d')
    # Clean stats
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3', 'Stat4', 'Stat5', 'Stat6']
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week = teams_win_loss[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']
    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1, join='inner')
    # Select only games being played today
    todays_games = football[football['Date'] == date]
    # Return dataframe
    return todays_games