-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
134 lines (123 loc) · 4.69 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
# Any copyright is dedicated to the Public Domain.
# http://creativecommons.org/publicdomain/zero/1.0/
import html5lib
import json
import os
import re
import sys
import requests
import urlparse
known_sessions_file = os.path.join(os.path.dirname(__file__), 'known_sessions')
def get_user_id(doc):
    """Extract the logged-in user's numeric id from the page DOM.

    The page embeds an inline <script> containing `var user = {...}`;
    we locate that script, pull out the JSON object, and return its
    'id' field as an int.

    Raises ValueError if the script or the JSON object cannot be found.
    """
    # Guard firstChild against None: empty inline scripts have no text child.
    scripts = [s for s in doc.getElementsByTagName('script')
               if not s.hasAttribute('src')
               and s.firstChild is not None
               and 'var user =' in s.firstChild.wholeText]
    if not scripts:
        # `raise 'string'` is a TypeError at runtime (string exceptions were
        # removed in Python 2.6); raise a real exception object instead.
        raise ValueError("Can't find user id script")
    m = re.search(r'\{.*\}', scripts[0].firstChild.wholeText)
    if not m:
        raise ValueError("Can't find user id data")
    return int(json.loads(m.group(0))['id'])
def get_data(doc):
    """Extract the session index list from the page DOM.

    The sessions page embeds an inline <script> containing
    `var index_data = [...]`; we locate it, pull out the JSON array,
    and return it parsed (a list whose entries start with a session id).

    Raises ValueError if the script or the JSON array cannot be found.
    """
    # Guard firstChild against None: empty inline scripts have no text child.
    scripts = [s for s in doc.getElementsByTagName('script')
               if not s.hasAttribute('src')
               and s.firstChild is not None
               and 'var index_data =' in s.firstChild.wholeText]
    if not scripts:
        # String exceptions are a runtime TypeError; raise a real exception.
        raise ValueError("Can't find index_data script")
    m = re.search(r'\[.*\]', scripts[0].firstChild.wholeText)
    if not m:
        raise ValueError("Can't find index_data data")
    return json.loads(m.group(0))
def get_csrf_token(doc):
    """Read the Rails CSRF token from the page's <meta> tags.

    Rails puts the parameter name in a `csrf-param` meta and the token
    value in a `csrf-token` meta. Returns a one-entry dict
    {param_name: token_value}, ready to merge into a POST body.

    Raises ValueError if either meta tag is missing.
    """
    metas = doc.getElementsByTagName('meta')
    params = [m.getAttribute('content') for m in metas if m.getAttribute('name') == 'csrf-param']
    if not params:
        # String exceptions are a runtime TypeError; raise a real exception.
        raise ValueError('No csrf-param')
    tokens = [m.getAttribute('content') for m in metas if m.getAttribute('name') == 'csrf-token']
    if not tokens:
        raise ValueError('No csrf-token')
    # Dict literal instead of dict([(k, v)]).
    return {params[0]: tokens[0]}
def read_known_sessions():
    """Return the set of already-downloaded session ids (ints).

    Reads one id per line from the known_sessions file; returns an
    empty set if the file does not exist yet.
    """
    if not os.path.isfile(known_sessions_file):
        return set()
    with open(known_sessions_file, 'rb') as f:
        # Skip blank lines so an empty file or a trailing newline doesn't
        # crash with int('') -> ValueError.
        return set(int(line.strip()) for line in f if line.strip())
def write_known_sessions(data):
    """Persist the set of downloaded session ids, one per line, sorted."""
    lines = [str(session_id) for session_id in sorted(data)]
    with open(known_sessions_file, 'wb') as out:
        out.write('\n'.join(lines))
def check_download_session(url, download_dir, cookies):
    """Download one session export into download_dir unless already present.

    Issues a HEAD request to learn the server-side filename from the
    Content-Disposition header, then GETs the body only if we don't
    already have that file locally.

    Returns True when the file exists locally (pre-existing or fetched
    now), False when the server response is unusable.
    """
    r = requests.head(url, cookies=cookies)
    if r.status_code != 200 or 'Content-Disposition' not in r.headers:
        return False
    m = re.search(r'filename="(.+)"', r.headers['Content-Disposition'])
    if not m:
        return False
    # Distinct names: the original reused `f` for both the remote filename
    # and the open file handle below, shadowing it.
    basename = m.group(1)
    filename = os.path.join(download_dir, basename)
    if os.path.isfile(filename):
        return True
    # Not cached yet -- fetch the full body.
    # Parenthesized print works as a statement in Py2 and a call in Py3.
    print("Fetching %s" % basename)
    r2 = requests.get(url, cookies=cookies)
    if r2.status_code != 200:
        return False
    with open(filename, 'wb') as out:
        out.write(r2.content)
    return True
def fetch_data(download_dir):
    """Log in to Runtastic and download new session .tcx files.

    Flow: fetch the index page for a CSRF token, sign in with the
    credentials from the auth file, follow the sport-sessions link to
    obtain the user id and session index, query the sessions API, then
    download each session not already listed in known_sessions.

    Raises RuntimeError on any failed HTTP step or missing page element.
    """
    # Fetch the index page to get a CSRF token.
    r = requests.get('https://www.runtastic.com/')
    if r.status_code != 200:
        # `raise 'string'` is a TypeError at runtime; raise real exceptions.
        raise RuntimeError('Fetching index page failed: %d' % r.status_code)
    cookies = dict(r.cookies)
    doc = html5lib.parse(r.text, treebuilder='dom')
    csrf = get_csrf_token(doc)
    # Now log in.
    user, pw = read_user_pass()
    login = dict(csrf)
    login['user[email]'] = user
    login['user[password]'] = pw
    r2 = requests.post('https://www.runtastic.com/en/d/users/sign_in.json', data=login, cookies=cookies)
    if r2.status_code != 200:
        raise RuntimeError('Sign-in request failed: %d' % r2.status_code)
    cookies.update(r2.cookies)
    j = r2.json()
    if not j['success']:
        raise RuntimeError('Login failed')
    doc = html5lib.parse(j['update'], treebuilder='dom')
    # Find the sport-sessions page and fetch it to get a User ID
    # and a list of session IDs.
    links = [l.getAttribute('href') for l in doc.getElementsByTagName('a') if l.getAttribute('href').endswith('/sport-sessions')]
    if not links:
        # Guard the links[0] lookup instead of dying with IndexError.
        raise RuntimeError('No sport-sessions link found after login')
    sessions_url = urlparse.urljoin(r2.url, links[0])
    r3 = requests.get(sessions_url, cookies=cookies)
    if r3.status_code != 200:
        raise RuntimeError('Fetching sessions page failed: %d' % r3.status_code)
    cookies.update(r3.cookies)
    doc = html5lib.parse(r3.text, treebuilder='dom')
    uid = get_user_id(doc)
    data = get_data(doc)
    # Now hit the API to get data about each session.
    request_data = dict(csrf)
    request_data['user_id'] = uid
    request_data['items'] = ','.join(str(d[0]) for d in data)
    r4 = requests.post('https://www.runtastic.com/api/run_sessions/json',
                       cookies=cookies,
                       data=request_data)
    if r4.status_code != 200:
        raise RuntimeError('Sessions API request failed: %d' % r4.status_code)
    cookies.update(r4.cookies)
    sessions = r4.json()
    known_sessions = read_known_sessions()
    for s in sessions:
        if s['id'] in known_sessions:
            continue
        # Only record ids whose .tcx file actually landed on disk.
        if check_download_session(urlparse.urljoin(r4.url, s['page_url']) + '.tcx', download_dir, cookies):
            known_sessions.add(s['id'])
    write_known_sessions(known_sessions)
def read_user_pass():
    """Read credentials from the 'auth' file next to this script.

    The file holds the email on line 1 and the password on line 2;
    returns [email, password] with surrounding whitespace stripped.
    """
    auth_file = os.path.join(os.path.dirname(__file__), 'auth')
    # Context manager so the handle is closed (the original leaked it).
    with open(auth_file) as f:
        return [x.strip() for x in f.read().splitlines()]
if __name__ == '__main__':
    # Entry point: exactly one argument, the target download directory.
    args = sys.argv[1:]
    if len(args) != 1:
        print('Usage: scrape.py <download directory>')
        sys.exit(1)
    fetch_data(args[0])