-
Notifications
You must be signed in to change notification settings - Fork 0
/
rides_into_postgis.py
158 lines (133 loc) · 4.15 KB
/
rides_into_postgis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# this script is separate because we need to keep clearing memory
# to get it to work. there are too many entries...
from itertools import groupby
from datetime import datetime, timedelta
def dedup(ls, key=lambda x: x, keep='first'):
gb = groupby(sorted(ls, key=key), key=key)
if keep == 'last':
return [reversed(g).next() for _, g in gb]
else:
return [g.next() for _, g in gb]
def round_time(dt, res):
# res is in seconds
seconds = dt.hour * 60 * 60 + dt.minute * 60 + dt.second
rounding = (seconds + res / 2) // res * res
return dt + timedelta(0, rounding - seconds, -dt.microsecond)
data_base = '/Users/julienclancy/Desktop/RIPS 2015/databases'
# just download the zipped folder and unpack in the data_base route
import json
import ijson # for very large files
import os
import fiona
import gc
from Levenshtein import distance as leven_dist
from os import listdir, chdir, getcwd
from os.path import join as path_join
from datetime import datetime
from sqlalchemy import create_engine, case
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
# all times are in EST
# all units are metric
import models
from models import Base, Weather, BlockGroup, BikeStation, BikeRide, \
SubwayStation, SubwayDelay, Location
engine = create_engine('postgresql://localhost/dc', convert_unicode=True)
db_session = scoped_session(sessionmaker(
autocommit=False,
autoflush=False,
bind=engine
))
os.chdir(os.path.join(data_base, 'bikeshare'))
stations_hs = {s.name: s for s in db_session.query(BikeStation).all()}
with open('station_aliases.json', 'r') as f:
aliases = json.load(f)
ride_ls = []
failed_ls = []
def find_station(stat_n):
# finds the bike station using the Levenshtein distance if necessary
if stat_n in stations_hs:
return stations_hs[stat_n]
elif stat_n.strip() in stations_hs:
return stations_hs[stat_n.strip()]
elif stat_n in aliases:
return stations_hs[aliases[stat_n]]
else:
print 'couldn\'t find "%s"' % stat_n
return min(stations_hs.values(), key=lambda x: leven_dist(x, stat_n))
f = open('ride_data.json', 'r')
dct = ijson.items(f, 'item')
# ijson lets us stream the file rather than open it all at once --- too big
i = 0
failed = 0
for r in dct:
i += 1
if i % 10000 == 0:
print i
ride = BikeRide()
try:
ride.start_station = find_station(r['start station'])
ride.end_station = find_station(r['end station'])
except:
print 'failed on station', r['start station'], 'or', r['end station']
try:
st_dt = datetime.strptime(r['start date'], "%Y-%m-%dT%H:%M:%S")
ed_dt = datetime.strptime(r['end date'], "%Y-%m-%dT%H:%M:%S")
ride.start_date = st_dt
ride.end_date = ed_dt
ride.duration = r['duration']
ride.subscribed = False if r['user type'] == 'casual' else True
# ride_ls.append(ride)
except :
failed += 1
print 'fail number', failed
print {'ride': r, 'exception': e}
failed_ls.append({'ride': r, 'exception': e})
continue
# if i % 100000 == 0:
# db_session.add_all(ride_ls)
# db_session.commit()
# del ride_ls
# gc.collect()
# ride_ls = []
# print 'loaded part', i / 100000, 'of about 50'
f.close()
# db_session.add_all(ride_ls)
# db_session.commit()
with open('failed.json', 'w') as f:
json.dump(failed_ls, f)
"""
for r in dct:
i += 1
if i % 10000 == 0:
print i
try:
ride = BikeRide()
st_dt = datetime.strptime(r['start date'], "%Y-%m-%dT%H:%M:%S")
ed_dt = datetime.strptime(r['end date'], "%Y-%m-%dT%H:%M:%S")
ride.start_date = st_dt
ride.end_date = ed_dt
ride.duration = r['duration']
ride.subscribed = False if r['user type'] == 'casual' else True
ride.start_station = find_station(r['start station'])
ride.end_station = find_station(r['end station'])
# ride_ls.append(ride)
except Exception as e:
failed += 1
print 'fail number', failed
print {'ride': r, 'exception': e}
failed_ls.append({'ride': r, 'exception': e})
continue
# if i % 100000 == 0:
# db_session.add_all(ride_ls)
# db_session.commit()
# del ride_ls
# gc.collect()
# ride_ls = []
# print 'loaded part', i / 100000, 'of about 50'
f.close()
# db_session.add_all(ride_ls)
# db_session.commit()
with open('failed.json', 'w') as f:
json.dump(failed_ls, f)
"""