from sqlalchemy.orm.exc import NoResultFound

def make_player(player_url):
    try:
        return db.query(Player).filter(Player.url == player_url).one()
    except NoResultFound:
        # not stored yet; really make it
        pass
    if not ("http://" in player_url or "https://" in player_url):
        return None
    # uniform url protocol
    player_url = player_url.replace("http://", "https://")
    print "new player", player_url
    # find more information
    r = requests.get(player_url)
    soup = BeautifulSoup(r.content, "html.parser")
    cat = soup.find("div", {"class": "player-wins"}).findChildren()[2].contents[0].lower().split()
    if "ms" not in cat:
        print "> not men's singles"
        return None
    player = Player()
    try:
        ages = soup.find("div", {"class": "player-age"}).findChildren()[-1].contents[0].strip().split("/")
        player.birthyear = ages[2]
        player.birthmonth = ages[1]
        player.birthday = ages[0]
    except (AttributeError, IndexError):
        # no birth information on the profile
        pass
    hand = soup.find("div", {"class": "player-handed"}).findChildren()[-1].contents[0].strip().lower()
    if hand != "n/a":
        player.handedness = "right" if "right" in hand else "left"
    info = soup.find("div", {"class": "player-profile-country-wrap"})
    # create player
    player.name = info.findChildren()[1].contents[0].strip().lower()
    player.country = info.findChildren()[0].attrs['title'].lower()
    player.gender = "male"
    player.playertype = "singles"
    player.url = player_url
    print player.name, player.country, player.gender, player.birth(), player.handedness
    db.add(player)
    db.commit()
    return player
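# Minimal usage sketch: make_player() returns the cached row when the URL is
# already in the database, creates a new Player after scraping the profile
# page, or returns None for non-men's-singles profiles. The player URL below
# is hypothetical, and db/requests/BeautifulSoup are assumed to be set up as
# in make_player() above.
if __name__ == "__main__":
    p = make_player("https://bwfbadminton.com/player/12345/some-player")
    if p is not None:
        print p.name, p.country, p.handedness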
def delete_tournament(name):
    r = db.query(Tournament).filter(Tournament.name == name).one()
    matches = db.query(Match).filter(Tournament.url == r.url).all()
    for match in matches:
        ms1 = match.stat1
        ms2 = match.stat2
        sets = match.sets
        # delete the children before the match itself
        for s in sets:
            db.delete(s)
        db.delete(ms1)
        db.delete(ms2)
        db.delete(match)
    db.commit()
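# Hedged usage sketch: Query.one() raises NoResultFound when the name is
# unknown and MultipleResultsFound when it is ambiguous, so a caller may want
# to guard the delete; delete_tournament_safe is an illustrative wrapper, not
# part of the original code.
from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound

def delete_tournament_safe(name):
    try:
        delete_tournament(name)
    except (NoResultFound, MultipleResultsFound):
        db.rollback()
        print "could not delete", name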
from sqlalchemy.orm.exc import NoResultFound

def make_scrape_page(year, week, page):
    url = make_url(year, week, page)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    rows = soup.findAll("tr")
    del rows[::2]  # remove every second row (spacers)
    for row in rows:
        tds = row.findAll("td")
        rank = int(tds[0].contents[0].strip())
        country = tds[1].findChildren()[0].findChildren()[0].contents[0].strip().upper()
        name = tds[2].findChildren()[0].findChildren()[0].findChildren()[0].contents[0].strip().lower()
        winslosses = tds[4].contents[0].strip().split(" - ")
        wins = int(winslosses[0])
        losses = int(winslosses[1])
        try:
            money = float(tds[5].contents[0].strip().replace(',', '').replace('$', ''))
        except ValueError:
            # N/A
            money = 0.0
        pointstournaments = tds[6].findChildren()[0].contents[0].strip().replace(',', '').split(" / ")
        points = int(pointstournaments[0])
        tournaments = int(pointstournaments[1])
        player_url = tds[2].find("a").attrs['href']
        try:
            player = get_player_url(player_url)
        except NoResultFound:
            # assuming get_player_url() looks up the row with Query.one()
            player = make_player(player_url)
        if player is None:
            # player is no longer an active men's singles player
            continue
        print ">", player.name
        # skip if this week was already scraped
        hasrank = len(db.query(PlayerRanking).filter(
            PlayerRanking.player == player,
            PlayerRanking.week == week,
            PlayerRanking.year == year).all()) > 0
        if hasrank:
            print "has rank", year, ":", week
            continue
        ranking = PlayerRanking()
        ranking.player = player
        ranking.week = week
        ranking.year = year
        ranking.rank = rank
        ranking.wins = wins
        ranking.losses = losses
        ranking.points = points
        ranking.tournaments = tournaments
        ranking.prizemoney = money
        db.add(ranking)
        db.commit()
        print "> stored rank", year, ":", week
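# Hedged driver sketch: a ranking week spans several pages, so a caller would
# loop over them. make_url()'s 1-indexed paging scheme and the default of 10
# pages are assumptions here, not scraped from the site.
def scrape_week(year, week, pages=10):
    for page in range(1, pages + 1):
        make_scrape_page(year, week, page)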
from db.db_handler import db
from db.models import *
import datetime

now = datetime.datetime.now()
thisyear = now.year
thisweek = now.isocalendar()[1]

# remove rankings stored for future weeks of the current year
rankings = db.query(PlayerRanking).filter(
    PlayerRanking.year == thisyear,
    PlayerRanking.week > thisweek).all()
for r in rankings:
    db.delete(r)
print "removed", len(rankings), "rankings"
db.commit()
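# Alternative sketch: the same cleanup as one bulk DELETE via SQLAlchemy's
# Query.delete(). This assumes no ORM-level cascade rules depend on
# per-object deletes; otherwise the loop above is the safer choice.
def remove_future_rankings_bulk():
    n = db.query(PlayerRanking).filter(
        PlayerRanking.year == thisyear,
        PlayerRanking.week > thisweek).delete(synchronize_session=False)
    db.commit()
    print "removed", n, "rankings"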
import re
import traceback
import requests

def make_scrape_page(year):
    url = make_url(year)
    r = requests.get(url, verify=False)
    soup = BeautifulSoup(r.content, "html.parser")
    rows = soup.findAll("tr")
    for tr in rows[1:]:  # skip the header row
        tds = tr.findAll("td")
        week = tds[0].contents[0].strip()
        try:
            country = tds[1].findChildren()[0].findChildren()[0].contents[0].strip()
        except IndexError:
            continue
        dates = tds[2].contents[0].strip()
        name = tds[3].findChildren()[0].findChildren()[0].contents[0].strip()
        url = tds[3].findChildren()[0].findChildren()[0].attrs['href']
        money = tds[4].findChildren()[0].contents[0].strip()
        if "bwfbadminton.com" not in url:
            continue
        if money == "-":
            prizemoney = 0
        else:
            prizemoney = int(re.sub(r'[^\d.]', '', money))
        category = tds[5].findChildren()[0].findChildren()[0].contents[0].strip()
        city = tds[6].findChildren()[0].contents[0].strip()
        tours = db.query(Tournament).filter(
            Tournament.name == name, Tournament.year == year).all()
        has_tournament = len(tours) > 0
        if has_tournament:
            continue
        t = Tournament()
        t.week = week
        t.start_date = dates.split("-")[0]
        t.end_date = dates.split("-")[1]
        t.name = name
        t.url = url
        t.country = country
        t.prizemoney = prizemoney
        t.category = category
        t.city = city
        t.year = year
        print "new tournament", t.name
        print t.url

        def go(t):
            try:
                make_scrape_tournament(t)
            except requests.exceptions.SSLError:
                print "bwfbadminton.com is down"
                return False
            except requests.exceptions.ConnectionError:
                # page does not exist
                print "bad connection"
                return False
            except Exception:
                # e.g., a timeout; retry
                print "<<<<TRY AGAIN>>>>>"
                traceback.print_exc()
                return go(t)
            return True

        success = go(t)
        if success:
            db.add(t)
            db.commit()
        else:
            db.rollback()
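# Hedged sketch: go() retries unknown errors by recursing, which has no upper
# bound if a tournament page keeps failing. An iterative variant with a retry
# cap keeps the same behaviour for the known failure modes while guaranteeing
# termination; go_capped and its max_tries parameter are assumptions, not
# part of the original code.
def go_capped(t, max_tries=5):
    for _ in range(max_tries):
        try:
            make_scrape_tournament(t)
            return True
        except (requests.exceptions.SSLError,
                requests.exceptions.ConnectionError):
            # unrecoverable for this tournament
            return False
        except Exception:
            traceback.print_exc()  # e.g., a timeout; try again
    return False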