/
ultra.py
257 lines (247 loc) · 11.3 KB
/
ultra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
"""Do some Stuff
"""
###
import csv
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# Pandas display options: each of the settings below is set to None.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)
#classes
class Runner:
    """One participant parsed from a results page, plus their listed races.

    Attributes set by ``__init__``:
        events: list of race dicts; starts empty and is filled by the caller.
        url: the page URL the runner was read from.
        expectedfinish: the ``expected`` value handed to the constructor.
        first, last: first two space-separated words of the heading line.
        age, division: digits and leading letter of the trailing ``M34``-style
            token, or ``''`` when the token does not have that shape.
        total: first word of the second heading line.
    """

    def __init__(self, array, url, expected):
        """Build a runner from the lines of a heading.

        Args:
            array: sequence of strings; ``array[0]`` is split on single spaces
                into first name, last name and a trailing gender/age token,
                ``array[1]`` supplies ``total`` (its first word).
            url: stored unchanged as ``self.url``.
            expected: stored unchanged as ``self.expectedfinish``.

        Raises:
            IndexError: if ``array`` has fewer than two items or ``array[0]``
                contains no space.
        """
        heading_words = array[0].split(' ')
        age_token = heading_words[-1]
        # Assignment order is kept stable because callers print vars(self).
        self.events = []
        self.url = url
        self.expectedfinish = expected
        self.first = heading_words[0]
        self.last = heading_words[1]
        self.age = self.just_age(age_token)
        self.total = array[1].split(' ')[0]
        self.division = self.m_or_f(age_token)

    def m_or_f(self, age):
        """Return the first character of ``age`` if it holds ``M``/``F`` plus 1-3 digits.

        Returns ``''`` for entries without a recognisable gender/age token.
        """
        if re.search(r'[MF]\d{1,3}', age):
            return age[:1]
        return ''

    def just_age(self, age):
        """Return ``age`` without its first character if it holds ``M``/``F`` plus digits.

        Returns ``''`` for entries without a recognisable gender/age token.
        """
        if re.search(r'[MF]\d{1,3}', age):
            return age[1:]
        return ''

    def _completed(self):
        """Return the events whose ``status`` is ``'Complete'``, in list order."""
        return [event for event in self.events if event['status'] == 'Complete']

    def real_total(self):
        """Return how many events have status ``'Complete'``."""
        return len(self._completed())

    def to_dict(self, current):
        """Flatten the runner into one row for the output table.

        Args:
            current: the current race's distance element. It must expose
                ``.text`` and support the ``in`` operator; the membership test
                uses this argument rather than a module-level global.

        Returns:
            dict with the column names used by the output table. Several
            columns are still placeholders (``''`` or a fixed string).
        """
        ran_this_distance = any(
            event['distance'] in current and event['status'] != 'Future'
            for event in self.events
        )
        return {
            'first': self.first,
            'last': self.last,
            'expectedfinish': self.expectedfinish,
            'age': self.age,
            'races previously run': self.real_total(),
            'Gender': self.division,
            'current race distance in miles': current.text,
            'months since last race': '',
            'last race distance in miles': self.lastmiles(),
            'difference in last race to current race': '',
            # Fixed placeholder text; no pace is computed for this column.
            'last race pace per mile': 'lastracepacepermile',
            'ever run farther than distance of current race': '',
            'ever run distance of current race': ran_this_distance,
        }

    def to_miles(self, Km):
        """Convert kilometres to miles, rounded to two decimals."""
        return round(Km * 0.621371, 2)

    def lastrace(self):
        """Return the first ``'Complete'`` event in ``self.events``.

        NOTE(review): presumably the most recent race, if the page lists the
        newest first - confirm against the scraped ordering.

        Raises:
            IndexError: if no event is complete.
        """
        return self._completed()[0]

    def lastmiles(self):
        """Return the distance in miles of the first ``'Complete'`` event.

        The distance label is upper-cased before matching so that labels such
        as ``"50 Miler"`` and ``"50k"`` are recognised. A number directly
        followed by ``K`` is treated as kilometres and converted; a label
        containing ``MILER`` yields its first number as-is. Decimal values are
        supported.

        Returns:
            float or int miles, or ``None`` when there is no completed event or
            the label is not a recognised distance.
        """
        try:
            completed = self._completed()
            if not completed:
                return None
            distance = completed[0]['distance']
            # Debug trace of the raw distance label.
            print(f"!!{distance}")
            label = distance.upper()
            kilometres = re.search(r'(\d+(?:\.\d+)?)\s*K', label)
            if kilometres:
                return self.to_miles(float(kilometres.group(1)))
            if 'MILER' in label:
                number = re.search(r'\d+(?:\.\d+)?', label)
                if number is None:
                    return None
                miles = float(number.group())
                # Whole-mile labels stay integers ("100 MILER" -> 100).
                return int(miles) if miles.is_integer() else miles
            return None
        except (AttributeError, KeyError, TypeError):
            # Best effort: malformed event records give "unknown", not a crash.
            return None
# 'current race distance in miles': racedist.text,
# 'current race trail or road': '',# how to find this? scrape page and look for key phrases?
# !'gender': r.division,
# !'age': r.age,
# !'races previously run': len(r.events),
# 'months since last race': '', #do some math? count futures
# 'last race distance in miles': '',#do some math? ^^
# 'difference in last race to current race': '', #do some math?^^^^
# 'last race pace per mile': '',# make function work, last non-future
# #'last race elevation gain': '',
# #'last race trail or road': '',
# 'ever run farther than distance of current race': '',# parse data and y/n
# 'ever run distance of current race': bool([ele for ele in run.events if(ele['distance'] in racedist)]),# parse data and y/n
# #'temperature last race': '',
# }
class Race:
    """One row of a participant's race list.

    ``status`` is always set to ``'Complete'``, ``'Future'``, ``'DNS'`` or
    ``'DNF'``. Which other attributes exist (``location``, ``endtime``,
    ``age``, ``pace``) depends on the shape of the row; see ``__init__``.
    """

    def __init__(self, array):
        """Parse the text lines of a race row.

        Args:
            array: list of strings. ``array[0]`` is a ``-``-separated title
                (name first; distance, location and an optional status after
                it), ``array[1]`` is the date. Longer rows also carry a finish
                time and, for ranked finishes, an ``"...: <age>"`` item at
                index 4.

        Raises:
            IndexError: if the row or its title has fewer pieces than the
                matching branch reads.
        """
        title = [piece.strip() for piece in array[0].split('-')]
        tail = title[-1]
        # Attribute assignment order is kept stable because callers use vars().
        self.name = title[0]
        self.date = array[1].strip()
        if len(array) <= 2:
            # Two-line rows: did-not-start / did-not-finish, or a future race.
            if tail in ('DNS', 'DNF'):
                self.status = tail
                self.distance = title[-3]
                self.location = title[-2]
            else:
                self.distance = title[-2]
                self.status = 'Future'
        elif len(array) < 4:
            # Three-line rows: an odd case where the time sits at index 2.
            self.endtime = array[2]
            self.distance = title[-2]
            self.location = tail
            self.status = 'Complete'
            self.pace = self.get_pace(self.endtime, self.distance)
        else:
            # Most completed races end up here.
            self.distance = title[1]
            self.endtime = array[3]
            if tail != 'No Rank Finish':
                self.age = array[4].split(': ')[1]
            self.location = tail
            self.status = 'Complete'
            self.pace = self.get_pace(self.endtime, self.distance)

    def to_miles(self, Km):
        """Convert kilometres to miles, rounded to two decimals."""
        return round(Km * 0.621371, 2)

    def get_pace(self, dur, dis):
        """Return the pace in minutes per mile, rounded to two decimals.

        Args:
            dur: finish time as ``"H:MM[:SS]"``. Only the hour and minute
                fields are used; a seconds field is ignored.
            dis: distance label. Recognised, case-insensitively: a number
                followed by ``K`` (kilometres), a number with ``MILER``,
                ``HALF MARATHON`` (13.1 miles) and ``MARATHON`` (26.2 miles).

        Returns:
            float pace, or the string ``'No Clue'`` when the time or the
            distance cannot be interpreted (timed events such as "12 Hrs"
            included).
        """
        try:
            label = dis.upper()
            if not re.search(r'\d+:\d+', dur):
                return 'No Clue'
            clock = dur.split(':')
            minutes = int(clock[0]) * 60 + float(clock[1])
            kilometres = re.search(r'(\d+(?:\.\d+)?)\s*K', label)
            if kilometres:
                miles = self.to_miles(float(kilometres.group(1)))
            elif 'MILER' in label:
                miles = float(re.search(r'\d+(?:\.\d+)?', label).group())
            # HALF MARATHON must be tested first: it also contains MARATHON.
            elif 'HALF MARATHON' in label:
                miles = 13.1
            elif 'MARATHON' in label:
                miles = 26.2
            else:
                return 'No Clue'
            return round(minutes / miles, 2)
        except (AttributeError, IndexError, TypeError, ValueError, ZeroDivisionError):
            # Best effort: unparseable rows yield a marker instead of crashing.
            return 'No Clue'
BASEURL = 'https://ultrasignup.com'
EVENTURL = 'https://ultrasignup.com/register.aspx?did=79789'
#URL = 'https://ultrasignup.com/entrants_event.aspx?did=79789'
URL = 'https://ultrasignup.com/entrants_event.aspx?did=80402'

# Fetch and parse the entrants page. The timeout keeps an unresponsive server
# from hanging the script forever.
page = requests.get(URL, timeout=30)
soup = bs(page.text, "html.parser")
racedist = soup.find('span', {'class': 'distances'})
nameofthisrace = soup.find('h1', {'class': 'event-title'})
# Sometimes there are multiple races per page; then the distance list contains
# commas and the selected event link is used instead.
if "," in racedist.text:
    racedist = soup.find('a', {'class': 'event_selected_link'})
ultragrid = soup.find_all('table', {"class": "ultra_grid"})[0]
runners = []
tr = ultragrid.find_all('tr')
for t in tr:
    # Collect the row's cell texts; a cell starting with "Results" contributes
    # the hrefs of its links instead of its text.
    cells = []
    result_links = []
    for f in t.find_all('td'):
        cell_text = f.text.strip()
        if re.match('Results', cell_text):
            for link in f.find_all('a'):
                result_links.append(link['href'])
                cells.append(link['href'])
        else:
            cells.append(cell_text)
    if not result_links:
        # Rows without a results link (e.g. header rows) have nothing to follow.
        continue
    # The listed race count is compared below against the number of completed
    # races found on the participant page, to pick the right person among all
    # users sharing the entrant's name.
    expectedfinish = cells[3]
    numofracesfromrace = cells[2]
    runner_page = BASEURL + result_links[0]
    opts = Options()
    opts.add_argument('--headless')
    browser = Firefox(options=opts)
    try:
        browser.get(runner_page)
        # NOTE(review): presumably gives the page's scripts time to render the
        # accordion - confirm, or replace with an explicit wait.
        time.sleep(1)
        acchead = browser.find_elements(By.CLASS_NAME, 'accordion-heading')
        accordion = browser.find_elements(By.CLASS_NAME, 'accordion-content')
        for num, heading in enumerate(acchead):
            run = Runner(heading.text.split('\n'), runner_page, expectedfinish)
            print(run.url)
            for r in accordion[num].find_elements(By.CLASS_NAME, 'rowlines'):
                run.events.append(vars(Race(r.text.split('\n'))))
            complete = [e for e in run.events if e['status'] == 'Complete']
            for ele in run.events:
                # Only consider people who are signed up for this race in the future.
                if not ((ele['name'] in nameofthisrace) and ele['status'] == 'Future'):
                    continue
                if numofracesfromrace == '':
                    numofracesfromrace = 0
                listed = int(numofracesfromrace)
                if len(complete) == listed:
                    # The listed count is sometimes wrong; it is the number
                    # stored by the site, not a count of the visible results.
                    verdict = 'Match!'
                elif len(complete) > 5 and len(complete) == listed - 1:
                    # Catches off-by-one discrepancies, maybe contested results
                    # that are no longer posted.
                    verdict = 'Match! in elif'
                else:
                    continue
                print(verdict)
                runners.append(run.to_dict(racedist))
                print(vars(run))
    finally:
        # quit() ends the whole driver session even when parsing fails, so no
        # headless Firefox is left running per entrant.
        browser.quit()
frame = pd.DataFrame(runners)
# #do the work to make the proper table
# rundict = {'predicted pace per mile of current race': Race.get_pace('', r.expectedfinish, racedist.text),# use the fucntions in class?
# 'current race distance in miles': racedist.text,
# 'current race trail or road': '',# how to find this? scrape page and look for key phrases?
# 'gender': r.division,
# 'age': r.age,
# 'races previously run': len(r.events),
# 'months since last race': '', #do some math? count futures
# 'last race distance in miles': '',#do some math? ^^
# 'difference in last race to current race': '', #do some math?^^^^
# 'last race pace per mile': '',# make function work, last non-future
# #'last race elevation gain': '',
# #'last race trail or road': '',
# 'ever run farther than distance of current race': '',# parse data and y/n
# 'ever run distance of current race': bool([ele for ele in run.events if(ele['distance'] in racedist)]),# parse data and y/n
# #'temperature last race': '',
# }
#add r to a new table with cleaned data! dust hands!!!