def test_all():
    """Test the nanzao.com scraper on every article linked from the index page."""
    root = "http://www.nanzao.com"
    main = scraper.get_soup(root)
    if main:
        print "Fetched index page"
        all_links = get_links(main)
        num_links = len(all_links)
        print "num_links", num_links
        i = 0
        for url, cat in all_links:
            print "URL AND CAT"
            print url, cat
            soup = scraper.get_soup(root + url)
            if soup:
                publishdate = get_publishdate(soup)
                print "PUBLISHDATE"
                print publishdate
                content = get_content(soup)
                print "CONTENT"
                print content
                meta = scraper.get_meta(soup)
                print "META"
                print meta
                print "ARTICLE NUM", i
                i += 1
        print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES: ", i
        print "TOTAL NUMBER OF ARTICLES", num_links
def test_all(): main = scraper.get_soup("http://caixin.com/") links = get_links(main) num_links = len(links) i = 0 for link in links: # add this 1 line print link print i url, category, date = link print url print category print date article = scraper.get_soup(url) meta = scraper.get_meta(article) print meta content = get_content(article) if content: for line in get_content(article): print line else: print "PHOTOS ONLY" i += 1 print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
def grab_game_ids(self):
    # start_date = datetime(2021, 10, 19)
    start_date = datetime.now() - timedelta(3)
    end_date = datetime.now() - timedelta(1)
    logging.info('-----Grabbing gameids-----')
    self.games_to_add = {}
    formatted_dates = util.get_list_of_formatted_dates(start_date, end_date)
    for formatted_date in formatted_dates:
        # "https://www.espn.com/nba/scoreboard/_/date/20220128"
        url = "{0}{1}".format(SCOREBOARD_PREFIX, formatted_date)
        logging.info('Grabbing soup for {}'.format(url))
        soup = scraper.get_soup(url)
        # DO NOT update self.Games; we only update when we've actually updated the record
        if formatted_date not in self.Games or formatted_date not in self.games_to_add:
            self.games_to_add[formatted_date] = []
        # grab game ids and append
        a_tags = soup.findAll(
            'a', {
                'class':
                'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100'
            })
        for a_tag in a_tags:
            game_id = a_tag['href'].split("/")[-1]
            if game_id not in self.games_to_add[formatted_date] and (
                    formatted_date not in self.Games
                    or game_id not in self.Games[formatted_date]):
                logging.info(
                    'Appending game_id {} to {} games to add list'.format(
                        game_id, formatted_date))
                self.games_to_add[formatted_date].append(str(game_id))
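# The method above assumes a util.get_list_of_formatted_dates helper that
# returns one "YYYYMMDD" string per day (the format the scoreboard URL in
# the comment expects). A minimal sketch of such a helper, assuming an
# inclusive date range; the real signature may differ:
def get_list_of_formatted_dates(start_date, end_date):
    """Return every date from start_date through end_date as 'YYYYMMDD'."""
    dates = []
    current = start_date
    while current <= end_date:
        dates.append(current.strftime("%Y%m%d"))
        current += timedelta(days=1)
    return dates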
def scrape_pf(url, search):
    """
    Scrapes the headline and answer from a PolitiFact fact check.

    Returns a data dictionary with the link, headline, answer, generated
    score, and source.
    """
    soup = get_soup(url)
    headline = soup.find("meta", {"name": "twitter:title"})["content"]
    answer = soup.find("div", {"class": "meter"}).a.img["alt"]
    score = get_pf_score(
        search,
        title_text=soup.find("h1", {
            "class": "article__title"
        }).get_text().strip(),
        statement_text=soup.find("div", {
            "class": "statement__text"
        }).get_text().strip(),
        article_text=soup.find("div", {
            "class": "article__text"
        }).get_text().strip(),
        sources_text=soup.find("div", {
            "class": "widget__content"
        }).get_text().strip())
    return {
        "link": url,
        "headline": headline,
        "answer": answer,
        "score": score,
        "source": "@PolitiFact"
    }
def get_player_urls(url):
    """Scrape every player row from the per-game stats table and insert each
    player's info into the player_info table."""
    soup = scraper.get_soup(url)
    players = soup.find("table", {
        "id": "per_game_stats"
    }).findAll("tr", {"class": "full_table"})
    for player in players:
        player_url = BASE + player.findAll("a")[0]["href"]
        player_stats = get_player_info(player_url)
        db.insert(stat=player_stats, table="player_info")
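# Hypothetical usage, assuming BASE = "https://www.basketball-reference.com"
# and the usual season per-game URL pattern (not taken from this code):
# get_player_urls(BASE + "/leagues/NBA_2019_per_game.html")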
def test_all(): print "test11" main = scraper.get_soup("http://www.infzm.com/") print "test22" links = get_links(main) num_links = len(links) i = 0 for link in links: print i print link article = scraper.get_soup(link) meta = scraper.get_meta(article) print meta content = get_content(article) print content i += 1 print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
def test_get_soup(self):
    sleep = self.mocker.patch.object(time, 'sleep')
    httpretty.register_uri(httpretty.GET,
                           'https://www.wiz.pl/134.html',
                           status=200,
                           body='<div>welcome in wiz</div>')
    bs = get_soup('https://www.wiz.pl/134.html')
    assert sleep.call_count == 1
    assert bs == BeautifulSoup(b'<div>welcome in wiz</div>', 'html.parser')
def test_all(): main = scraper.get_soup("http://cyol.com") links = get_links(main) num_links = len(links) #import pdb; pdb.set_trace() i = 0 for link in links: print i url, category, date = link print url print category print date article = scraper.get_soup(url) meta = scraper.get_meta(article) print meta author = get_author(article) print author content = get_content(article) print content i += 1 print("SUCCESFULLY RETRIEVED " + str(i) + "/" + str(num_links))
def test_all(): """Test scraper on all articles""" main = scraper.get_soup("http://www.huanqiu.com/") if main: print "got the main soup" all_links = get_links(main) num_links = len(all_links) i = 0 for link in all_links: # add this one line print link category = get_category(link) print category soup = scraper.get_soup(link) if soup: encoded_content = [line for line in get_content(soup)] all_meta = scraper.get_meta(soup) author = get_author(soup) publishdate = get_publishdate(soup) parsedate = time.strftime("%Y-%m-%d %H:%M:%S") #for testing if encoded_content: for p in encoded_content: print p print all_meta print author print publishdate print parsedate print "ARTICLE NUMBER", i i += 1 print "NUMBER OF SUCCESSFULLY FETCHED ARTICLES: ", i print "TOTAL NUMBER OF ARTICLES", num_links
def scrape_day(month, day, year):
    """Scrape every box score for the given day and insert it into the games table."""
    # build the URL from the arguments (a hardcoded test URL previously shadowed them)
    url = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(
        month, day, year)
    soup = scraper.get_soup(url)
    if soup is None:
        # LOG
        return
    a_tags = soup.find("div", {"class": "game_summaries"}).findAll("a")
    for a_tag in a_tags:
        href = a_tag["href"]
        if "boxscore" in href and a_tag.text == 'Box Score':
            base = "https://www.basketball-reference.com"
            bs = scrape_box_score(base + href)
            db.insert(bs, table="games")
def scrape(h_tag, link_tag, text_tag, target_url):
    """Scrape headings, texts, and links from a website and record the results."""
    url = target_url
    target = scraper.create_target(target_url)
    page = scraper.get_page(target)
    soup = scraper.get_soup(page)
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)
    db_functions.create_scrape(url, h_tag, text_tag, link_tag)
    for h in headings:
        db_functions.create_result(url, h_tag, h, '', link_tag)
    for t in texts:
        db_functions.create_result(url, text_tag, '', t, link_tag)
    links = scraper.get_links(soup, link_tag)
    for l in links:
        db_functions.create_result(url, link_tag, '', '', l)
    db_functions.print_scrape(url)
    db_functions.print_records(url)
def search_pf(headline):
    """
    Searches PolitiFact for a particular headline and returns the results.
    """
    query = {"q": headline}
    query_string = urllib.urlencode(query)
    url = "http://www.politifact.com/search/statements/?" + query_string
    search_soup = get_soup(url)
    results = search_soup.findAll("li", "search-results__item")
    pages = []
    for result in results:
        link = "http://www.politifact.com" + result.find(
            "a", "search-results__link")['href']
        page = scrape_pf(link, headline)
        pages.append(page)
        # be polite: pause between consecutive fact-check scrapes
        time.sleep(3)
    return pages
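# Hypothetical usage of the two PolitiFact helpers above, tying search_pf
# and scrape_pf together (the headline is made up for illustration):
# for page in search_pf("Some claim to fact-check"):
#     print page["headline"], "->", page["answer"]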
def scrape_box_score(box_score_url):
    scraper.sleep(3, 8)
    soup = scraper.get_soup(box_score_url)
    if soup is None:
        # LOG
        return
    home, away = get_teams(soup)
    st = soup.find("meta", {"name": "Description"})["content"]
    st = st[st.find("(") + 1:st.rfind(")")]
    home_score = int(st[st.rfind("(") + 1:])
    away_score = int(st[:st.find(")")])
    game_date = soup.find("div", {"class": "scorebox_meta"}).find("div").text
    game_date = datetime.strptime(game_date, "%I:%M %p, %B %d, %Y")
    season = game_date.year
    if game_date.month > 8:
        season += 1
    game_date = game_date.strftime("%Y%m%d")
    stats = []
    tables = soup.findAll(
        "table", {"class": ["sortable", "stats_table", "now_sortable"]})
    for table in tables:
        tbody = table.find("tbody")
        trs = tbody.findAll("tr")
        team = table.get("id").split("_")[1].upper()
        for i, tr in enumerate(trs):
            player_stats = {
                "game_date": game_date,
                "opp": away if team == home else home,
                "team": team,
                "home_score": home_score,
                "away_score": away_score,
                "season": season,
            }
            tds = tr.findAll("td")
            try:
                pid = tr.th.a["href"].split("/")[-1][:-5]
                starter = i < 5
                player_stats["pid"] = pid
                player_stats["starter"] = starter
                # Inefficient, but makes the data cleaner
                for index, stat in enumerate(stats):
                    if stat["pid"] == pid:
                        player_stats = stat
                        stats.remove(stats[index])
                        break
                # stuff the dict with number vals
                for td in tds:
                    if td["data-stat"] == "mp":
                        m, s = td.text.split(":")
                        player_stats["play_time_raw"] = td.text
                        player_stats["mp"] = int(m)
                        player_stats["sp"] = int(m) * 60 + int(s)
                    else:
                        player_stats[td["data-stat"]] = scraper.get_number_type(
                            td.text)
                stats.append(player_stats)
            except TypeError:
                # rows without a player link (headers, team totals) land here
                pass
            except Exception as e:
                logging.error("Exception {} caught".format(e))
    return stats
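# The season bookkeeping above maps a game date to the calendar year the
# season ends in: games from September onward count toward the following
# year. The same rule as a hypothetical standalone helper, with worked
# examples:
def season_for(game_date):
    """E.g. 2019-02-15 -> 2019 (the 2018-19 season);
    2019-11-01 -> 2020 (the 2019-20 season)."""
    return game_date.year + 1 if game_date.month > 8 else game_date.year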
def scrape_pbp(pbp_url):
    scraper.sleep(3, 8)
    soup = scraper.get_soup(pbp_url)
    if soup is None:
        # LOG
        return
    home, away = get_teams(soup)
    trs = soup.find("table", {"id": "pbp"}).findAll("tr")
    game_date = soup.find("div", {"class": "scorebox_meta"}).find("div").text
    game_date = datetime.strptime(game_date, "%I:%M %p, %B %d, %Y")
    season = game_date.year
    if game_date.month > 8:
        season += 1
    game_date = game_date.strftime("%Y%m%d")
    stats = []
    quarter = 1
    for tr in trs:
        stat = {}
        if tr.get("id") is not None:
            quarter = int(tr.get("id")[1:])
        tds = tr.findAll("td")
        tds_size = len(tds)
        if tds_size == 6:
            for i in range(6):
                td_text = tds[i].text
                # skip if no text
                if len(td_text) <= 1:
                    continue
                # PLAYS
                if i in [1, 5]:
                    parse_play(tds[i], stat)
                # stuff we don't care about (home/away point differential (extractable))
                elif i in [2, 4]:
                    continue
                # SCORES
                elif i == 3:
                    away_score, home_score = td_text.split("-")
                    stat["away_score"] = int(away_score)
                    stat["home_score"] = int(home_score)
                # i == 0
                else:
                    stat["play_time_raw"] = td_text
                    # the fraction on the clock is tenths of a second,
                    # so parse "MM:SS.T" as minutes plus fractional seconds
                    minutes, seconds = td_text.split(":")
                    stat["play_time"] = float(minutes) * 60 + float(seconds)
        elif tds_size == 2:
            stat["play_time_raw"] = tds[0].text
            minutes, seconds = tds[0].text.split(":")
            stat["play_time"] = float(minutes) * 60 + float(seconds)
            stat["play"] = tds[1].text
        if stat:
            stat["quarter"] = quarter
            stat["game_date"] = game_date
            stat["season"] = season
            stats.append(stat)
    return stats
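# The clock parsing appears in both branches above; a hypothetical helper
# they could share, assuming clock strings like "11:24.0" (minutes, then
# seconds with tenths):
def parse_clock(clock_text):
    """Convert a play-by-play clock string 'MM:SS.T' to seconds."""
    minutes, seconds = clock_text.split(":")
    return float(minutes) * 60 + float(seconds)

# e.g. parse_clock("11:24.0") -> 684.0, parse_clock("0:59.7") -> 59.7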
from datetime import date, timedelta

from scraper import get_soup

# Get yesterday's date and build the payload for the GET request
date = date.today() - timedelta(1)
payload = {
    'month': str(date.month),
    'day': str(date.day),
    'year': str(date.year)
}
soup = get_soup(payload)
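# This snippet passes a payload rather than a URL, so this project's
# get_soup evidently builds the request itself. A minimal sketch of such a
# wrapper, assuming it hits the basketball-reference box-scores page (the
# same month/day/year parameters used elsewhere here) via requests; the
# real implementation may differ:
import requests
from bs4 import BeautifulSoup

def get_soup(payload):
    resp = requests.get("https://www.basketball-reference.com/boxscores/",
                        params=payload)
    if resp.status_code != 200:
        return None
    return BeautifulSoup(resp.text, "html.parser")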
def check_for_live_games(self):
    # scenario 1: we've already checked or are in test mode
    if self.now < self.next_live_game_check or self.test_mode:
        return
    # scenario 2: there are games that are live; don't re-check until tomorrow at 8 AM
    nba_game_times_url = "https://www.google.com/search?q=nba+games+today"
    soup = scraper.get_soup(nba_game_times_url)
    logging.info('Grabbed soup for {}'.format(nba_game_times_url))
    spans = soup.findAll('span', {'class': 'imso-medium-font'})
    if len(spans) > 0:
        self.sleep_time = 0
        # use timedelta so month/year boundaries roll over correctly
        self.next_live_game_check = (self.now + timedelta(days=1)).replace(
            hour=8, minute=0, second=0, microsecond=0)
        logging.info(
            'There are live games, don\'t re-check for live games until tomorrow at 8 AM ({})'
            .format(self.next_live_game_check))
        return
    # scenario 3: there are games today but they aren't being played yet.
    # Sleep until 10 minutes before the first game start time
    game_times = soup.findAll('div', {'class': 'imspo_mt__mtc-no'})
    first_game_time = None
    for game_time in game_times:
        game_time = game_time.find(
            'div', {
                'class':
                'imspo_mt__ndl-p imspo_mt__pm-inf imspo_mt__pm-infc imso-medium-font'
            })
        if game_time is not None:
            first_game_time = game_time.text.strip()
            break
    if first_game_time:
        first_game_time_text = first_game_time
        first_game_time = datetime.strptime(first_game_time, '%I:%M %p')
        first_game_time = datetime(self.now.year, self.now.month,
                                   self.now.day, first_game_time.hour,
                                   first_game_time.minute, 0)
        # sleep until ten minutes before the next game time
        self.sleep_time = (first_game_time - self.now).seconds - 600
        logging.info(
            'There are games today but they aren\'t being played yet. Sleep until 10 minutes before the first game start time at {} ({} seconds)'
            .format(first_game_time_text, self.sleep_time))
        return
    # scenario 4: there are (1) no live games today or (2) games scheduled for later today. Check tomorrow at 8 AM
    today = soup.find(
        'div', {
            'class':
            'imspo_mt__pm-inf imspo_mt__pm-infc imspo_mt__date imso-medium-font'
        })
    if today is not None:
        today = today.text.strip().lower() == 'today'
        if not today:
            self.next_live_game_check = (self.now + timedelta(days=1)).replace(
                hour=8, minute=0, second=0, microsecond=0)
            self.sleep_time = (self.next_live_game_check - self.now).seconds
            logging.info(
                'There are no games today. Check tomorrow at 8 AM ({} seconds)'
                .format(self.sleep_time))
            return
    # scenario 5: shit's broken
    logging.error(
        'Unknown scenario encountered while checking for live games from Google')
    return
def get_dk_spreads(self, file):
    if self.test_mode:
        with open(file, "r") as f:
            soup = f.read()
        soup = BeautifulSoup(soup, "html.parser")
    else:
        draft_kings_sportsbook_url = "https://sportsbook.draftkings.com/leagues/basketball/88670846"
        attempts = 6
        loop = attempts
        success = False
        while loop > 0:
            try:
                soup = scraper.get_soup(draft_kings_sportsbook_url)
                logging.info('Grabbed soup for {}'.format(
                    draft_kings_sportsbook_url))
                loop = 0
                success = True
            except:
                logging.info('Failed dk scrape {}'.format(
                    draft_kings_sportsbook_url))
                loop = loop - 1
        if not success:
            # report the real attempt count, not the exhausted loop counter
            logging.critical(
                'Failed scrape after {} attempts'.format(attempts))
            return
    tbody = soup.find('tbody', {'class': 'sportsbook-table__body'})
    trs = tbody.findAll('tr')
    if len(trs) % 2 != 0:
        logging.warning(
            'Found an uneven amount of rows while parsing DraftKings for spreads'
        )
    self.sleep_time = self.min_sleep_time
    raw_times = tbody.findAll('span', {'class': 'event-cell__time'})
    raw_quarters = tbody.findAll('span', {'class': 'event-cell__period'})
    if len(raw_times) != len(raw_quarters):
        # there are live games, re-check
        if len(raw_times) > 0 or len(raw_quarters) > 0:
            self.sleep_time = self.min_sleep_time
        # there are no live games, just return and wait for the next sleep cycle
        logging.warning(
            'raw_times != raw_quarters, returning with {}s sleep'.format(
                self.sleep_time))
        return
    times = [raw_time.text.strip() for raw_time in raw_times]
    quarters = [raw_quarter.text.strip() for raw_quarter in raw_quarters]
    self.dk_spreads = []
    class_team = {'class': 'event-cell__name-text'}
    class_spread = {'class': 'sportsbook-outcome-cell__line'}
    class_spread_odds = {'class': 'sportsbook-odds american default-color'}
    class_ml = {'class': 'sportsbook-odds american no-margin default-color'}
    class_score = {'class': 'event-cell__score'}
    # read per game, so two rows at a time
    for i in range(0, len(trs) - 1, 2):
        home_tr, away_tr = trs[i + 1], trs[i]
        # game hasn't started
        home_score, away_score = home_tr.find('span', class_score), away_tr.find(
            'span', class_score)
        if home_score is None or away_score is None:
            continue
        # the following fields are guaranteed to exist
        home_team, away_team = home_tr.find(
            'div', class_team).text.strip(), away_tr.find(
                'div', class_team).text.strip()
        home_score, away_score = int(home_score.text.strip()), int(
            away_score.text.strip())
        quarter = quarters[i]
        time_left = times[i]
        # the following fields are not guaranteed to exist, so skip if they don't
        try:
            home_spread, away_spread = home_tr.find(
                'span', class_spread).text.strip(), away_tr.find(
                    'span', class_spread).text.strip()
            home_spread_odds, away_spread_odds = home_tr.find(
                'span', class_spread_odds).text.strip(), away_tr.find(
                    'span', class_spread_odds).text.strip()
            home_ml, away_ml = home_tr.find(
                'span', class_ml).text.strip(), away_tr.find(
                    'span', class_ml).text.strip()
        except:
            logging.warning(
                'No odds for {} vs. {} game with a score of {}-{}{} in the {}.'
                .format(
                    home_team, away_team, home_score, away_score,
                    ' with {} left'.format(time_left), quarter))
            continue
        game_spreads = {
            'home_team': home_team,
            'home_score': home_score,
            'home_spread': home_spread,
            'home_spread_odds': home_spread_odds,
            'home_ml': home_ml,
            'away_team': away_team,
            'away_score': away_score,
            'away_spread': away_spread,
            'away_spread_odds': away_spread_odds,
            'away_ml': away_ml,
            'quarter': quarter,
            'time_left': time_left,
        }
        self.dk_spreads.append(game_spreads)
    return
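# The retry loop above requests the page up to six times before giving up.
# A hypothetical generalization of that pattern as a standalone helper
# (the retry count and backoff are assumptions, not values from this code):
def get_soup_with_retries(url, attempts=6, backoff=2):
    """Try scraper.get_soup(url) up to attempts times, sleeping backoff
    seconds between failures; return None if every attempt fails."""
    for attempt in range(1, attempts + 1):
        try:
            return scraper.get_soup(url)
        except Exception as e:
            logging.info('Attempt {}/{} failed for {}: {}'.format(
                attempt, attempts, url, e))
            time.sleep(backoff)
    return None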
def update_records(self):
    logging.info('-----Updating records-----')
    # only grab info from games_to_add; Games ONLY contains gameids that have been added to Records
    for game_date in self.games_to_add:
        logging.info('Updating records for: {}'.format(game_date))
        for game_id in self.games_to_add[game_date]:
            # https://www.espn.com/nba/matchup?gameId=401360182
            url = "{0}{1}".format(MATCHUP_PREFIX, game_id)
            soup = scraper.get_soup(url)
            logging.info('Grabbed soup for {}'.format(url))
            text = soup.prettify()
            try:
                # grab home/away teams, date
                home = text[text.find("espn.gamepackage.homeTeamName = ") +
                            len("espn.gamepackage.homeTeamName = "):]
                home = home[:home.find(";")].replace('"', '')
                away = text[text.find("espn.gamepackage.awayTeamName = ") +
                            len("espn.gamepackage.awayTeamName = "):]
                away = away[:away.find(";")].replace('"', '')
                date = text[text.find("espn.gamepackage.timestamp = ") +
                            len("espn.gamepackage.timestamp = "):]
                date = date[:date.find(";")].replace('"', '')[:10]
                date = datetime.strptime(date, "%Y-%m-%d") - timedelta(1)
                date = "{0}-{1}-{2}".format(date.year, date.month, date.day)
                home_score = int(
                    soup.find('div', {
                        'class': 'score icon-font-before'
                    }).text)
                away_score = int(
                    soup.find('div', {
                        'class': 'score icon-font-after'
                    }).text)
                # grab lead/deficits for each team
                # reasonable assumption (?): index 0 is ALWAYS away and index 1 is ALWAYS home
                tds = soup.find("tr", {
                    "data-stat-attr": "largestLead"
                }).find_all("td")[1:]
                home_lead = int(tds[1].text.strip())
                away_lead = int(tds[0].text.strip())
                records = {
                    home: {
                        "game_date": date,
                        "game_id": game_id,
                        "largest_lead": home_lead,
                        "largest_deficit": away_lead,
                        "opp_team": away,
                        "score": home_score,
                        "opp_score": away_score,
                        "win": home_score > away_score,
                    },
                    away: {
                        "game_date": date,
                        "game_id": game_id,
                        "largest_lead": away_lead,
                        "largest_deficit": home_lead,
                        "opp_team": home,
                        "score": away_score,
                        "opp_score": home_score,
                        "win": away_score > home_score,
                    },
                }
                # update the records with non-duplicates
                for team in records:
                    skip = False
                    if team in self.Records:
                        for record in self.Records[team]:
                            if record["game_id"] == game_id:
                                logging.info(
                                    "Skipping game_id {}".format(game_id))
                                skip = True
                                break
                        if not skip:
                            self.Records[team].append(records[team])
                    else:
                        self.Records[team] = [records[team]]
                    if not skip:
                        logging.info(
                            "Appending game_id {} to self.Records (record data) and self.Games (game_id) for team {} and date {}"
                            .format(game_id, team, game_date))
                        # only update self.Games if we've added it to the record;
                        # guard so the game_id isn't appended once per team
                        if game_date not in self.Games:
                            self.Games[game_date] = [game_id]
                        elif game_id not in self.Games[game_date]:
                            self.Games[game_date].append(game_id)
            except Exception as e:
                logging.error(
                    "Error with game_id: {0} with exception {1}".format(
                        game_id, e))
                continue
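# The three marker lookups above share one pattern: find the JavaScript
# assignment "espn.gamepackage.<name> = <value>;" in the prettified HTML
# and strip the quotes. A hypothetical helper that factors it out:
def extract_gamepackage_value(text, name):
    """Return the value assigned to espn.gamepackage.<name> in text."""
    marker = "espn.gamepackage.{} = ".format(name)
    value = text[text.find(marker) + len(marker):]
    return value[:value.find(";")].replace('"', '')

# e.g. extract_gamepackage_value(soup.prettify(), "homeTeamName")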