def scrape(player_type, stats, filename):
    """Scrape each MLB player's most-recent splits table from mlb.com and
    write one CSV row per split.

    Parameters
    ----------
    player_type : str
        'P' keeps only pitchers; 'H' keeps every non-pitcher.
    stats : list of str
        Column headers written as the first CSV line.
    filename : str
        Output file name, created under stat_scraper/generated_stats/.
    """
    my_url = 'https://www.mlb.com'
    client = urequest(my_url + '/players')
    page_html = client.read()
    client.close()
    page_soup = soup(page_html, "html.parser")
    player_list = page_soup.findAll("li", {"class": "p-related-links__item"})

    # `with` guarantees the CSV is flushed/closed even if scraping raises.
    with open('stat_scraper/generated_stats/' + filename, "w") as f:
        f.write(",".join(stats) + '\n')
        for player in player_list:
            player_url = my_url + player.a['href']
            try:
                client = urequest(player_url)
                page_html = client.read()
                client.close()
            except Exception:
                # Best-effort scrape: skip players whose page fails to load.
                # (Narrowed from a bare `except:`, which also trapped
                # KeyboardInterrupt/SystemExit.)
                continue
            page_soup = soup(page_html, "html.parser")
            player_recent_html = page_soup.findAll(
                "div", {'class': 'player-splits--last player-splits--last-x'})
            if not player_recent_html:
                continue  # no recent-splits section on this player's page

            position_html = page_soup.find("div", {'class': 'player-header--vitals'})
            position = position_html.ul.li.text
            # Filter by requested player type (guard clauses replace the
            # original nested if/elif pyramid; same selection logic).
            if player_type == 'P' and position != player_type:
                continue
            if player_type == 'H' and position == 'P':
                continue

            name = page_soup.find("span", {
                'class': 'player-header--vitals-name'
            }).text
            table = player_recent_html[0].div.div.div.div.table.tbody
            rows = table.findChildren(['th', 'tr'])
            for row in rows:
                cells = row.findChildren('td')
                row_values = [name, position] + [cell.span.text for cell in cells]
                f.write(",".join(row_values) + '\n')
def scrape_pitching(years):
    """Scrape per-team standard pitching statistics from baseball-reference
    for each season in *years* and write them to
    stat_scraper/generated_stats/team_pitching_statistics.csv, appending a
    1/0 'Postseason' flag per team.

    Parameters
    ----------
    years : iterable of int
        Seasons to scrape, e.g. range(2010, 2020).
    """
    my_url = 'https://www.baseball-reference.com/leagues/MLB/'
    pitching_columns = ['Team', '#P', 'PAge', 'RA/G', 'W', 'L', 'W-L%', 'ERA',
                        'G', 'GS', 'GF', 'CG', 'tSho', 'cSho', 'SV', 'IP', 'H',
                        'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP',
                        'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9',
                        'SO/W', 'LOB', 'Postseason']
    # `with` guarantees the CSV is flushed/closed even if a season's page
    # fails mid-scrape (the original leaked the handle on any exception).
    with open('stat_scraper/generated_stats/team_pitching_statistics.csv', "w") as f:
        f.write(",".join(pitching_columns) + '\n')
        for year in years:
            client = urequest(my_url + str(year) + '.shtml')
            page_html = client.read()
            client.close()
            page_soup = soup(page_html, "html.parser")

            # baseball-reference hides this table inside an HTML comment;
            # extract the comment text and parse it as its own document.
            standard_pitching_html = page_soup.find(
                "div", {'id': 'all_teams_standard_pitching'})
            comment_holder = soup(str(standard_pitching_html), 'lxml')
            pitching_comment = comment_holder.findAll(
                text=lambda text: isinstance(text, Comment))[0]
            pitching_soup = soup(pitching_comment, 'lxml')
            standard_pitching_html = pitching_soup.find(
                'div', {'id': 'div_teams_standard_pitching'})
            pitching_rows = standard_pitching_html.table.tbody.findChildren(['tr'])
            pitching_rows = pitching_rows[:-1]  # drop the league-total footer row

            # The postseason bracket is likewise embedded in a comment;
            # collect every team that appears in any playoff series.
            postseason_html = page_soup.find("div", {'id': 'all_postseason'})
            comment_holder = soup(str(postseason_html), 'lxml')
            postseason_comment = comment_holder.findAll(
                text=lambda text: isinstance(text, Comment))[0]
            postseason_table = soup(postseason_comment, 'lxml').find(
                'div', {'id': 'div_postseason'}).table.tbody
            postseason_teams = []
            for row in postseason_table.findChildren(['tr']):
                # Third <td> of each series row holds the two team links.
                teams = row.find_all('td')[2].findAll('a')
                for team in (teams[0], teams[1]):
                    if team.text not in postseason_teams:
                        postseason_teams.append(team.text)

            for row in pitching_rows:
                team_name = row.th.a['title']
                cells = row.findChildren('td')
                row_values = [str(year) + ' ' + team_name]
                row_values += [cell.text for cell in cells]
                row_values.append('1' if team_name in postseason_teams else '0')
                f.write(",".join(row_values) + '\n')
def create_soup_page(url: str):
    """Fetch *url* and return it parsed via URequest.html_to_soup.

    Returns None if the request or parsing fails (the error is printed,
    matching the original best-effort behavior).
    """
    page_soup = None
    u_client = None
    try:
        u_client = urequest(url)
        page_html = u_client.read()
        page_soup = URequest.html_to_soup(page_html)
    except Exception as e:
        print(e)
    finally:
        # BUG FIX: if urequest() itself raised, u_client is still None and
        # the unconditional close() raised AttributeError from the finally
        # block, masking the real error. Close only an opened connection.
        if u_client is not None:
            u_client.close()
    return page_soup
def scrape_hitting(years):
    """Scrape per-team standard batting statistics from baseball-reference
    for each season in *years* and write them to
    stat_scraper/generated_stats/team_hitting_statistics.csv, appending a
    1/0 'Postseason' flag per team.

    Parameters
    ----------
    years : iterable of int
        Seasons to scrape, e.g. range(2010, 2020).
    """
    my_url = 'https://www.baseball-reference.com/leagues/MLB/'
    hitting_columns = ['Team', '#Bat', 'BatAge', 'R/G', 'G', 'PA', 'AB', 'R',
                       'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO',
                       'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
                       'SH', 'SF', 'IBB', 'LOB', 'Postseason']
    # `with` guarantees the CSV is flushed/closed even if a season's page
    # fails mid-scrape (the original leaked the handle on any exception).
    with open('stat_scraper/generated_stats/team_hitting_statistics.csv', "w") as f:
        f.write(",".join(hitting_columns) + '\n')
        for year in years:
            client = urequest(my_url + str(year) + '.shtml')
            page_html = client.read()
            client.close()
            page_soup = soup(page_html, "html.parser")

            # Unlike the pitching table, the batting table is in the live DOM.
            standard_batting_html = page_soup.find(
                "div", {'id': 'all_teams_standard_batting'}).find(
                'div', {'id': 'div_teams_standard_batting'})

            # The postseason bracket is embedded inside an HTML comment;
            # extract the comment text and parse it as its own document,
            # then collect every team that appears in any playoff series.
            postseason_html = page_soup.find("div", {'id': 'all_postseason'})
            comment_holder = soup(str(postseason_html), 'lxml')
            postseason_comment = comment_holder.findAll(
                text=lambda text: isinstance(text, Comment))[0]
            postseason_table = soup(postseason_comment, 'lxml').find(
                'div', {'id': 'div_postseason'}).table.tbody
            postseason_teams = []
            for row in postseason_table.findChildren(['tr']):
                # Third <td> of each series row holds the two team links.
                teams = row.find_all('td')[2].findAll('a')
                for team in (teams[0], teams[1]):
                    if team.text not in postseason_teams:
                        postseason_teams.append(team.text)

            batting_rows = standard_batting_html.table.tbody.findChildren(['tr'])
            batting_rows = batting_rows[:-1]  # drop the league-total footer row
            for row in batting_rows:
                team_name = row.th.a['title']
                cells = row.findChildren('td')
                row_values = [str(year) + ' ' + team_name]
                row_values += [cell.text for cell in cells]
                row_values.append('1' if team_name in postseason_teams else '0')
                f.write(",".join(row_values) + '\n')
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urequest

# Flipkart search-results scraper: pulls product name, price, and rating
# for iPhone listings and records them in products.csv.
url = 'https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
uClient = urequest(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

# Each search result card lives in a "_3O0U0u" container.
containers = page_soup.findAll("div", {"class": "_3O0U0u"})

filename = "products.csv"
headers = "Product_Name, Price, Rating\n"
# BUG FIX: the original opened the file and wrote the header but never
# wrote a product row and never closed the handle, so the scraped data
# was printed to stdout and lost. Rows are now written and the file is
# closed via `with`.
with open(filename, "w") as f:
    f.write(headers)
    for container in containers:
        product_name = container.div.img["alt"]
        price_container = container.findAll("div", {"class": "col col-5-12 _2o7WAb"})
        price = price_container[0].text
        rating_container = container.findAll("div", {"class": "hGSR34"})
        rating = rating_container[0].text

        print("product_name:" + product_name)
        print("price:" + price)
        print("rating:" + rating)

        # Strip commas from the fields so the comma-separated row stays
        # well-formed.
        f.write(product_name.replace(",", " ") + ","
                + price.replace(",", "") + ","
                + rating.replace(",", " ") + "\n")
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as soup

# Open University module-listing scraper: prints every module title and
# its study level from the public course catalogue page.
modulePage = 'http://www.open.ac.uk/courses/modules'
OU = "http://www.open.ac.uk"

# Fetch the listing page and parse it into a navigable tree.
uClient = urequest(modulePage)
pageHtml = uClient.read()
uClient.close()
pageSoup = soup(pageHtml, "html.parser")

# Module titles sit in "int-grid7" cells on the page; print each one.
for module in pageSoup.findAll("div", {"class": "int-grid7"}):
    print(module.text)

# Study levels sit in the adjacent "int-grid5" cells; print each one.
for level in pageSoup.findAll("div", {"class": "int-grid5"}):
    print(level.text)
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as bsoup

# Newegg graphics-card category scraper: collects brand, product name,
# and shipping info per listing, intended to be written to a CSV.
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards'

# Opening up connection, accessing web page
uClient = urequest(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = bsoup(page_html, "html.parser")

# grab each product — every listing card is an "item-container" div
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "Newegg_products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping\n"
f.write(headers)

for container in containers:
    # Brand is the title attribute of the logo image in the branding div.
    branding = container.findAll("div", {"class": "item-branding"})
    brand = branding[0].a.img["title"]
    # Product name is the text of the item-title link.
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    # NOTE(review): shipping_container is extracted but never used in the
    # visible code, no data row is written to `f`, and the file is never
    # closed here — this script appears truncated in this view; confirm
    # the remainder (row write + f.close()) exists before relying on the CSV.