# Imports assumed from the surrounding script (not shown in this fragment)
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd


def football_bet():
    # Ensure that the football season is currently going on. Real datetimes are
    # compared here; the original compared an ISO date string against
    # 'February 2 2020', which never matches lexically.
    today = datetime.now()
    season_end = datetime(2020, 2, 2)
    season_start = datetime(2020, 9, 10)
    if season_end < today < season_start:
        print("The next football season hasn't begun yet. Please come back on September 10.")
        return
    elif today < season_end:
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'

    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]

    # Set today's date, used below to select today's games
    date = datetime.now().strftime('%B %d')

    # Bin the flat stats list into rows of 8 (one row per game)
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end + 1  # note: skips one element between rows, as in the original
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3', 'Stat4', 'Stat5', 'Stat6']

    # Bin the team/date list into rows of 4
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week = teams_win_loss[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']

    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1, join='inner')

    # Select only games being played today and return the dataframe
    todays_games = football[football['Date'] == date]
    return todays_games
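# The fixed-width binning done with while-loops above (and again in the hockey
# and baseball functions below) can be written as a small slicing helper.
# A minimal sketch; `chunk` is a hypothetical name, not part of the original:
def chunk(items, size):
    """Split a flat list into consecutive rows of `size` elements."""
    return [items[i:i + size] for i in range(0, len(items), size)]

# e.g. pd.DataFrame(chunk(teams_win_loss, 4),
#                   columns=['Day_Of_Week', 'Date', 'Home', 'Visitor'])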
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

f = open('data.csv', 'w')
url = 'https://www.dicksmith.co.nz/dn/shop/phones/iphone/?page=1'
Client = Req(url)
page_html = Client.read()
Client.close()
page_soup = soup(page_html, 'html.parser')
containers = page_soup.find_all('div', class_='_1umis')
f.write('Phone, Rating, Price \n')
for container in containers:
    Phone = container.find('a', itemprop='url').text
    if container.find('meta', itemprop='ratingValue') is None:
        Rating = 'no rating'
    else:
        Rating = container.find('meta', itemprop='ratingValue')['content']
    Price = container.find('span', itemprop='price')['content']
    f.write(Phone.replace(',', ' |') + ', ' + Rating + ', ' + Price + "\n")
    print(Phone.replace(',', ' |') + ', ' + Rating + ',' + Price.replace(',', '') + "\n")
f.close()
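# The manual comma-escaping above (replace(',', ' |')) can be avoided with the
# stdlib csv module, which quotes fields for you. A minimal sketch with the
# same three columns; the sample row is illustrative only:
import csv

with open('data.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['Phone', 'Rating', 'Price'])
    writer.writerow(['Apple iPhone 11, 64GB', 'no rating', '1099'])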
from bs4 import BeautifulSoup as soup
import requests  # imported but unused in this snippet
from urllib.request import urlopen as UReq

# Question 1: how many products come up when searching for "breaking benjamin"? (see the URL)
my_url = 'https://www.merchbar.com/search?q=breaking%20benjamin&p=1'
uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
# print(page_soup.title)

# grab each product
containers = page_soup.findAll("div", {"class": "col-md-4 col-6"})
print(len(containers))  # the answer is 20

divWithInfo = containers[0].find("div", "MerchTile.module__brandName")
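# CSS-module class names like "MerchTile.module__brandName" usually carry a
# hashed suffix in the live markup, so an exact class match can silently return
# None. A sketch of a more tolerant prefix match (bs4 accepts a compiled regex):
import re
divWithInfo = containers[0].find("div", class_=re.compile(r"^MerchTile"))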
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as Soup
from selenium import webdriver

# The website
driver = webdriver.Firefox()
driver.get(
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
my_url = (
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)

# ----------------------------------------------------------------------------
# UReq downloads the URL stored in the variable my_url
Uclient = UReq(my_url)
# Read the downloaded HTML
Html = Uclient.read()
# Close the connection
Uclient.close()
# ----------------------------------------------------------------------------
# Parse the HTML
Page_soup = Soup(Html, "html.parser")

# Grab each product
# mobile = driver.find_elements_by_xpath('//span[@class="call-modal__phone_number"]')
modals = driver.find_elements_by_xpath('//*[@data-testid="lpv-call-button"]')
containers = Page_soup.findAll("div", {"class": "ListItem__Root-sc-1i3osc0-1 hMPXKC"})

# Create the output file
filename = "properties.csv"
f = open(filename, "w")
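# Dubizzle renders its listings with JavaScript, so the urlopen() HTML and the
# selenium-rendered DOM can disagree. Parsing the driver's rendered source is
# the safer route here; a sketch using the same class selector as above:
Page_soup = Soup(driver.page_source, "html.parser")
containers = Page_soup.findAll("div", {"class": "ListItem__Root-sc-1i3osc0-1 hMPXKC"})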
# Assumes `from urllib.request import urlopen as Req` and
# `from bs4 import BeautifulSoup as Soup` at module level.
def GetUrl(url):
    # Download a page and return it parsed as a BeautifulSoup tree
    Uclient = Req(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
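# Example use of the GetUrl helper above (the URL is borrowed from the
# pythonscraping snippet later in this collection and is only illustrative):
page = GetUrl("http://pythonscraping.com/pages/page1.html")
print(page.title)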
def getPage(self):
    # Fetch self.url and keep the raw HTML on the instance
    uClient = UReq(self.url)
    self.page_html = uClient.read()
    uClient.close()
from urllib.request import urlopen as URL
import bs4
from bs4 import BeautifulSoup as BS

# Tuple of the pages to scrape
GetURL = ("https://www.moneycontrol.com/india/stockpricequote/computers-software/infosys/IT",
          "https://www.moneycontrol.com/india/stockpricequote/computers-software/tataconsultancyservices/TCS")

# Blank list to store values
printlist = []

for eachURL in GetURL:
    html = URL(eachURL)
    HTML = html.read()
    GetHTMLData = BS(HTML, 'lxml')
    title = GetHTMLData.title
    # print(title)
    text = title.get_text()
    printlist.append(text)
    rows = GetHTMLData.find_all('span')
    for row in rows:
        Price_Item = row.get("id")
        if (Price_Item == "Bse_Prc_tick") or (Price_Item == "Nse_Prc_tick"):
            # The original snippet is truncated here; collecting the price text
            # is an assumed completion consistent with the surrounding code.
            printlist.append(row.get_text())
#!/usr/bin/env python3
# Import Python libraries: locale, regex, statistics, urllib.request and Beautiful Soup
import locale
import re
from statistics import mean
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

# Declare my_url variable
my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"
# Load my_url contents into Scrapee variable
Scrapee = Req(my_url)
# Extract html to variable Scrapee_html
Scrapee_html = Scrapee.read()
# Close web page
Scrapee.close()
# Parse html into node tree and strip html tags, store as variable Scrapee_soup
Scrapee_soup = soup(Scrapee_html, "html.parser")
# Find matching class data and store into three variables
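# A minimal sketch of what the imports above point toward: collect the listing
# prices, strip non-digits with re, and average them with statistics.mean.
# The class name "srp-list-item__price" is an assumption, not taken from the
# original script:
prices = Scrapee_soup.find_all("span", class_="srp-list-item__price")
values = [float(re.sub(r"[^\d.]", "", p.get_text()))
          for p in prices if re.search(r"\d", p.get_text())]
if values:
    print("Average listed price:", mean(values))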
# Assumes the same Ureq/soup/pd imports as football_bet() above.
def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'

    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    results = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    results = [team.get_text() for team in results]
    results_drop = ['LOG']
    results = [r for r in results if r not in results_drop]
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']

    # Clean data: bin the game list into rows of 3
    game = [g for g in game if g not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date', 'Visitor', 'Home']

    # Bin the results list into rows of 4
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week = results[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)

    # Clean team names into readable format (keep the last word of each name)
    row_count = 0
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df['Visitor'][row_count] = visitor[1][row_count]
        else:
            df['Visitor'][row_count] = visitor[2][row_count]
        if home[2][row_count] is None:
            df['Home'][row_count] = home[1][row_count]
        else:
            df['Home'][row_count] = home[2][row_count]
        row_count += 1

    df_1.columns = ['Visitor_Goals', 'Home_Goals', 'Attendance', 'Time']
    total_df = pd.concat([df, df_1], axis=1, join='inner')

    # Decide each game's winner. Goals arrive as strings, so compare as ints
    # (the original compared strings, which mis-orders e.g. '10' < '9').
    win_count = 0
    win_list = []
    while win_count < len(total_df):
        vg = total_df['Visitor_Goals'][win_count]
        hg = total_df['Home_Goals'][win_count]
        if vg != '' and hg != '':
            if int(vg) > int(hg):
                win_list.append(total_df['Visitor'][win_count])
            elif int(hg) > int(vg):
                win_list.append(total_df['Home'][win_count])
            else:
                win_list.append('Tie')
        else:
            win_list.append('Incomplete')
        win_count += 1
    total_df['Winner'] = win_list

    # Only select today's games
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
#! /usr/bin/env python3.6
from urllib.request import urlopen as Uop
from bs4 import BeautifulSoup as BS4Soup

html = Uop("http://pythonscraping.com/pages/page1.html")
bs_obj = BS4Soup(html.read(), "html.parser")

print("bs_obj.html.body.h1 = ", bs_obj.html.body.h1)
print("bs_obj.html.h1 = ", bs_obj.html.h1)
print("bs_obj.body.h1 = ", bs_obj.body.h1)
print("bs_obj.h1 = ", bs_obj.h1)

# print(bs_obj)
# print(html.read())

# bs_obj_local_html = BS4Soup("./simple_prac.html", "html.parser")
# print("bs_obj_local_html.h2 = ", bs_obj_local_html.h2)
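# Note on the commented-out lines above: BeautifulSoup parses the string it is
# given, so passing "./simple_prac.html" parses that literal path as markup
# rather than reading the file. A sketch (assumes the local file exists):
# with open("./simple_prac.html") as fp:
#     bs_obj_local_html = BS4Soup(fp, "html.parser")
#     print("bs_obj_local_html.h2 = ", bs_obj_local_html.h2)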
# Assumes the same datetime/Ureq/soup/pd imports as football_bet() above,
# plus a `teams_dict` mapping defined elsewhere in the script.
def baseball_bet():
    # Current date in a readable form and in the form used by the URL
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')

    # Set Opening Day date (fixing the original's misspelled variable and its
    # undefined `OP` reference)
    opening_day = "03-26-2020"
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    present = datetime.now()

    # If it is before Opening Day, return from the function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return

    # URL for today's date once the season has started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html

    # Make sure that there are actually games being played;
    # if there are not, the request will fail
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are no games being played on this day.')
        return

    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []

    # Convert the dates embedded in the hrefs into a readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = ""
        for i in game_date:
            sent_str += str(i) + "-"
        sent_str = sent_str[:-1]
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        date = date.strftime('%m-%d-%Y')
        game_date_list.append(date)

    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    game_list = []
    for item in game:
        # The abbreviations are the last three characters of the string
        game_list.append(item[-3:])

    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week = game_list[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date]

    # Map abbreviations to more colloquial team names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])

    # Return data frame of games that are being played today
    return todays_games
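# `teams_dict` is referenced above but defined elsewhere in the script. Based
# on how it is used, it maps three-letter ESPN abbreviations to colloquial
# team names; a purely illustrative sketch:
teams_dict = {'NYY': 'Yankees', 'BOS': 'Red Sox', 'LAD': 'Dodgers'}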
"""
(1) Collect all the customer reviews of the product [2019 Dell laptop](https://www.amazon.com/Dell-Inspiron-5000-5570-Laptop/dp/B07N49F51N/ref=sr_1_11?crid=1IJ7UWF2F4GHH&keywords=dell%2Bxps%2B15&qid=1580173569&sprefix=dell%2Caps%2C181&sr=8-11&th=1) on Amazon.
(2) Collect the top 100 User Reviews of the film [Joker](https://www.imdb.com/title/tt7286456/reviews?ref_=tt_urv) from IMDB.
(3) Collect the abstracts of the top 100 research papers by using the query [natural language processing](https://citeseerx.ist.psu.edu/search?q=natural+language+processing&submit.x=0&submit.y=0&sort=rlv&t=doc) from CiteSeerX.
(4) Collect the top 100 tweets by using hashtag ["#wuhancoronovirus"](https://twitter.com/hashtag/wuhancoronovirus) from Twitter.
"""
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup
import csv
import pandas as pd

my_url = 'https://www.imdb.com/title/tt7286456/reviews?ref_=tt_urv'
uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()  # the original was missing the call parentheses
page_soup = soup(page_html, "html.parser")

with open('joker.csv', 'w', newline='') as file:
    file_input = csv.writer(file)
    file_input.writerow(["Number", "User Name", "Description"])
    # Query the name/review nodes once, then index into them; the original
    # re-ran findAll on every iteration and always took element 0. Writing the
    # rows is an assumed completion matching the header written above.
    names = page_soup.findAll("span", {"class": "display-name-link"})
    reviews = page_soup.findAll("div", {"class": "review-container"})
    for x in range(min(100, len(names), len(reviews))):
        file_input.writerow([x + 1, names[x].text, reviews[x].text])
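# A static reviews page only carries the first ~25 reviews, so reaching the top
# 100 requires triggering IMDB's "Load More" control. A selenium sketch (the
# same pre-4.0 selenium API used elsewhere in this collection); the element id
# "load-more-trigger" is an assumption about IMDB's markup:
from selenium import webdriver
import time

driver = webdriver.Firefox()
driver.get(my_url)
for _ in range(3):  # three extra loads ≈ 100 reviews
    driver.find_element_by_id("load-more-trigger").click()
    time.sleep(2)   # crude wait for the ajax-loaded reviews
page_soup = soup(driver.page_source, "html.parser")
driver.quit()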
# tkinter names (Button, window, mainwindow, RAISED, messagebox) and the
# Ureq/Soup/re imports come from the surrounding GUI script.
def scrape():
    ####################################################################
    concat = Sentry.get()
    # my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
    my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
    my_url = my_url.replace(' ', '+')
    ####################################################################
    uClient = Ureq(my_url)
    page_html = uClient.read()
    uClient.close()

    # HTML parsing
    page_soup = Soup(page_html, "html.parser")

    # Grab each container
    containers = page_soup.findAll("div", {"class": "item-container"})
    # manufacturer = page_soup.findAll("label", {"class": "form-checkbox"})
    # print(manufacturer)
    # print(len(containers))
    # print(containers[5:])
    # container = containers[5]

    # ---------------------------------------- save the csv file
    fileName = "Online_Sales.csv"
    f = open(fileName, "w")
    headers = "BRAND , PRICES , SAVES , TITLES , LINK \n"
    # f.write(headers)

    for container in containers[4:25]:
        # Brand name
        brand_container = container.findAll("a", {"class": "item-brand"})
        brand = brand_container[0].img["title"]
        # Product title
        may_know = container.findAll("a", {"class": "item-title"})
        # print(may_know)
        title = container.a.img["title"]
        # Link to the product
        hyper = brand_container[0]["href"]
        # hyper = container.findAll("div", {"class": "item-info"})
        # print(hyper)
        # Current price, e.g. "$1,299"
        price_container = container.findAll("li", {"class": "price-current"})
        price_container2 = price_container[0].strong
        price = re.findall(r'.\d.\d\d\d', str(price_container2))
        prices = ''.join(price)
        # Percent saved
        save_container = container.findAll("span", {"class": "price-save-percent"})
        save = re.findall(r'\d\d.', str(save_container))
        saves = ''.join(save)
        if saves == '':
            saves = "None"
        if prices == "":
            prices = "Not Available"
        # Note: the replace('0', '1') below is kept from the original but looks
        # like a leftover hack; it rewrites every 0 in a price as a 1.
        f.write(
            brand.replace(',', '') + ", " +
            prices.replace(',', '.').replace('0', '1').replace('>', ' ') + ',' +
            saves.replace('None', '0%') + ', ' +
            title.replace(',', '') + ', ' + hyper + "\n")
    f.close()

    new_win = Button(window, width=10, text="New_Win", command=mainwindow,
                     height=1, font="Jokerman", relief=RAISED,
                     activebackground="LightBlue1", background='sky blue')
    new_win.place(x=105, y=90)
    messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
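# The r'.\d.\d\d\d' pattern above only matches four-digit prices such as
# "$1,299". A broader sketch for inside the loop, pulling any digit/comma/
# decimal run out of the <strong> tag instead:
price_text = price_container2.get_text() if price_container2 else ''
match = re.search(r'[\d,]+(?:\.\d+)?', price_text)
prices = match.group(0) if match else ''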
from urllib.request import urlopen as UR
from bs4 import BeautifulSoup as soup
import re

URL_SIMP = 'https://www.newegg.com/Xbox-One-Systems/SubCategory/ID-3216'
URL_CLIENT = UR(URL_SIMP)
PAGE = URL_CLIENT.read()
URL_CLIENT.close()
PAGE_SOUP = soup(PAGE, "html.parser")
PAGE_CONTENT = PAGE_SOUP.findAll("div", {"class": "item-container"})

filename = "newegg.csv"
f = open(filename, "w")
headers = "PRICE, SHIP, NAME\n"
f.write(headers)

for CONTENT in PAGE_CONTENT:
    PRICE_DATA = CONTENT.findAll("li", {"class": "price-current"})
    PRICE = PRICE_DATA[0].text
    PRICE_SIMP = re.sub(r"[^\d.]", "", PRICE)
    SHIP_CONTENT = CONTENT.findAll("li", {"class": "price-ship"})
    SHIP = SHIP_CONTENT[0].text.strip()
    NAME = CONTENT.img["title"]
    print("Price: " + "$" + PRICE_SIMP)
    print("Shipping Cost: " + SHIP)
    print("Name of Product: " + NAME)
    print("\n")
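# The loop above writes a header to newegg.csv but never writes rows or closes
# the file. A minimal completion sketch, shown commented since the first line
# belongs at the end of the loop body and the second after the loop:
#     f.write(PRICE_SIMP + ", " + SHIP.replace(",", "") + ", " + NAME.replace(",", "|") + "\n")
# f.close()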
import pandas as pd
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as Soup

my_url = 'https://batdongsan.com.vn/nha-dat-can-thue'

# Grab the page
Uclient = Req(my_url)
page_html = Uclient.read()
Uclient.close()

# Parse the downloaded HTML
page_soup = Soup(page_html, "html.parser")

# Select the listing containers
containers = page_soup.findAll("div", {"class": "branch"})

final = []
final1 = []
final3 = []
for i in range(len(containers)):
    store = containers[i].findAll("div", {"class": "branch-name"})
    text = store[0].text
    print("Title : " + text)
    store1 = containers[i].findAll("div", {"class": "branch-add"})
    text1 = store1[0].text
    print("Info : " + text1)
    final.append(text)
    final1.append(text1)
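# pandas is imported above but never used; a minimal sketch that collects the
# scraped titles and info into a DataFrame and saves them (the output filename
# is an assumption):
df = pd.DataFrame({'Title': final, 'Info': final1})
df.to_csv('batdongsan_listings.csv', index=False)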
"""
Run the script and connect to "@kitrak_bot" in the Telegram app.
Send a message like "Google <something you want to search>", e.g. "Google Candybar";
the results page will be opened on your laptop.
Send "Close" and the Chrome browser will be killed.
"""
from urllib.request import urlopen as URL
import urllib.error as err
import webbrowser
import os

my_api = "647242931:AAG7wgAy4Fn-IWrgKn0RskeSnrowZcv6AOc"
temp = ""

while True:
    open_URL = 'https://api.telegram.org/bot' + my_api + '/getupdates'
    try:
        client_URL = URL(open_URL)
        data_URL = client_URL.read().decode('utf-8')
        client_URL.close()
        getstring = data_URL.split("{")
        getdata = getstring[-1].split("\"")
        data = getdata[-2]
        if data == temp:
            pass
        else:
            print("data received:", data)
            temp = data
            if data[0:6] == "google" or data[0:6] == "Google":
                url = "http://www.google.co.in/search?q=" + data[6:]
                webbrowser.open(url)
            # the original sliced data[0:6] for the 5-character "Close",
            # which only matched the bare word; data[0:5] matches consistently
            elif data[0:5] == "close" or data[0:5] == "Close":
                print("Closed")
                # The snippet is truncated here; per the docstring, Chrome is
                # killed. The Windows taskkill command is an assumption:
                os.system("taskkill /im chrome.exe /f")
    except err.URLError:
        # Completion assumed from the `urllib.error` import above: ignore
        # transient network errors and keep polling
        pass