def make_valid(outpath): times = [] for i, line in enumerate(taxi_it('train')): time = line['timestamp'] latitude = line['latitude'] if len(latitude) == 0: continue duration = 15 * (len(latitude) - 1) times.append((i, int(time), int(time + duration))) if i % 1000 == 0: print times[-1] with sqlite3.connect(outpath) as timedb: c = timedb.cursor() c.execute(''' CREATE TABLE trip_times (trip INTEGER, begin INTEGER, end INTEGER) ''') print "Adding data..." c.executemany('INSERT INTO trip_times(trip, begin, end) VALUES(?, ?, ?)', times) timedb.commit() print "Creating index..." c.execute('''CREATE INDEX trip_begin_index ON trip_times (begin)''')
def compute_number_coordinates(): n_coordinates = 0 for ride in taxi_it('train'): n_coordinates += len(ride['latitude']) print n_coordinates return n_coordinates
def make_valid(outpath): times = [] for i, line in enumerate(taxi_it('train')): time = line['timestamp'] latitude = line['latitude'] if len(latitude) == 0: continue duration = 15 * (len(latitude) - 1) times.append((i, int(time), int(time + duration))) if i % 1000 == 0: print times[-1] with sqlite3.connect(outpath) as timedb: c = timedb.cursor() c.execute(''' CREATE TABLE trip_times (trip INTEGER, begin INTEGER, end INTEGER) ''') print "Adding data..." c.executemany( 'INSERT INTO trip_times(trip, begin, end) VALUES(?, ?, ?)', times) timedb.commit() print "Creating index..." c.execute('''CREATE INDEX trip_begin_index ON trip_times (begin)''')
def compute_number_coordinates(): # Count the number of coordinates n_coordinates = 0 for ride in taxi_it('train'): n_coordinates += len(ride['latitude']) print n_coordinates return n_coordinates
def make_valid(cutfile, outpath): cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts print "Number of cuts:", len(cuts) valid = [] for line in taxi_it('train'): time = line['timestamp'] latitude = line['latitude'] longitude = line['longitude'] if len(latitude) == 0: continue for ts in cuts: if time <= ts and time + 15 * (len(latitude) - 1) >= ts: # keep it n = (ts - time) / 15 + 1 line.update({ 'latitude': latitude[:n], 'longitude': longitude[:n], 'destination_latitude': latitude[-1], 'destination_longitude': longitude[-1], 'travel_time': 15 * (len(latitude)-1) }) valid.append(line) break print "Number of trips in validation set:", len(valid) file = h5py.File(outpath, 'a') clen = file['trip_id'].shape[0] alen = len(valid) for field in _fields: dset = file[field] dset.resize((clen + alen,)) for i in xrange(alen): dset[clen + i] = valid[i][field] splits = file.attrs['split'] slen = splits.shape[0] splits = numpy.resize(splits, (slen+len(_fields),)) for (i, field) in enumerate(_fields): splits[slen+i]['split'] = ('cuts/%s' % cutfile).encode('utf8') splits[slen+i]['source'] = field.encode('utf8') splits[slen+i]['start'] = clen splits[slen+i]['stop'] = alen splits[slen+i]['indices'] = None splits[slen+i]['available'] = True splits[slen+i]['comment'] = '.' file.attrs['split'] = splits file.flush() file.close()
def make_valid(cutfile, outpath): cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts print "Number of cuts:", len(cuts) valid = [] for line in taxi_it('train'): time = line['timestamp'] latitude = line['latitude'] longitude = line['longitude'] if len(latitude) == 0: continue for ts in cuts: if time <= ts and time + 15 * (len(latitude) - 1) >= ts: # keep it n = (ts - time) / 15 + 1 line.update({ 'latitude': latitude[:n], 'longitude': longitude[:n], 'destination_latitude': latitude[-1], 'destination_longitude': longitude[-1], 'travel_time': 15 * (len(latitude) - 1) }) valid.append(line) break print "Number of trips in validation set:", len(valid) file = h5py.File(outpath, 'a') clen = file['trip_id'].shape[0] alen = len(valid) for field in _fields: dset = file[field] dset.resize((clen + alen, )) for i in xrange(alen): dset[clen + i] = valid[i][field] splits = file.attrs['split'] slen = splits.shape[0] splits = numpy.resize(splits, (slen + len(_fields), )) for (i, field) in enumerate(_fields): splits[slen + i]['split'] = ('cuts/%s' % cutfile).encode('utf8') splits[slen + i]['source'] = field.encode('utf8') splits[slen + i]['start'] = clen splits[slen + i]['stop'] = alen splits[slen + i]['indices'] = None splits[slen + i]['available'] = True splits[slen + i]['comment'] = '.' file.attrs['split'] = splits file.flush() file.close()
def extract_coordinates(n_coordinates=None): if n_coordinates is None: n_coordinates = compute_number_coordinates() coordinates = np.zeros((n_coordinates, 2), dtype="float32") c = 0 for ride in taxi_it('train'): for point in zip(ride['latitude'], ride['longitude']): coordinates[c] = point c += 1 print c cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
def extract_coordinates(n_coordinates=None): """Extract coordinates from the dataset and store them in a numpy array""" if n_coordinates is None: n_coordinates = compute_number_coordinates() coordinates = np.zeros((n_coordinates, 2), dtype="float32") c = 0 for ride in taxi_it('train'): for point in zip(ride['latitude'], ride['longitude']): coordinates[c] = point c += 1 print c cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
#!/usr/bin/env python
from data.hdf5 import taxi_it
from visualizer import Vlist, Point

if __name__ == '__main__':
    # Render every GPS point of the test set as a heat map.
    heat = Vlist(heatmap=True)
    for trip in taxi_it('test'):
        for (lat, lon) in zip(trip['latitude'], trip['longitude']):
            heat.append(Point(lat, lon))
    heat.save('test positions')
import numpy import cPickle import scipy.misc import os from sklearn.cluster import MeanShift, estimate_bandwidth from sklearn.datasets.samples_generator import make_blobs from itertools import cycle import data from data.hdf5 import taxi_it from data.transformers import add_destination dests = [] for v in taxi_it("train"): if len(v['latitude']) == 0: continue dests.append([v['latitude'][-1], v['longitude'][-1]]) pts = numpy.array(dests) with open(os.path.join(data.path, "arrivals.pkl"), "w") as f: cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL) print "Doing clustering" bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000) print bw bw = 0.001 ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5) ms.fit(pts) cluster_centers = ms.cluster_centers_
#!/usr/bin/env python
from data.hdf5 import taxi_it
from visualizer import Vlist, Point

if __name__ == '__main__':
    stands = taxi_it('stands')
    next(stands)  # drop the leading placeholder entry
    markers = Vlist()
    for (i, stand) in enumerate(stands):
        label = 'Stand (%d): %s' % (i + 1, stand['stands_name'])
        markers.append(Point(stand['stands_latitude'],
                             stand['stands_longitude'],
                             label))
    markers.save('stands')
import numpy import cPickle import scipy.misc import os from sklearn.cluster import MeanShift, estimate_bandwidth from sklearn.datasets.samples_generator import make_blobs from itertools import cycle import data from data.hdf5 import taxi_it from data.transformers import add_destination print "Generating arrival point list" dests = [] for v in taxi_it("train"): if len(v['latitude']) == 0: continue dests.append([v['latitude'][-1], v['longitude'][-1]]) pts = numpy.array(dests) with open(os.path.join(data.path, "arrivals.pkl"), "w") as f: cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL) print "Doing clustering" bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000) print bw bw = 0.001 # ( ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5) ms.fit(pts) cluster_centers = ms.cluster_centers_
#!/usr/bin/env python
from data.hdf5 import taxi_it
from visualizer import Vlist, Point

if __name__ == '__main__':
    points = Vlist()
    for (idx, entry) in enumerate(taxi_it('stands')):
        if idx == 0:
            # Skip the leading "no stand" entry.
            continue
        points.append(Point(entry['stands_latitude'],
                            entry['stands_longitude'],
                            'Stand (%d): %s' % (idx, entry['stands_name'])))
    points.save('stands')
#!/usr/bin/env python
from data.hdf5 import taxi_it
from visualizer import Vlist, Point

_sample_size = 5000

if __name__ == '__main__':
    # Sample the destinations of the first _sample_size non-empty trips.
    dests = Vlist(cluster=True)
    for trip in taxi_it('train'):
        lat = trip['latitude']
        if len(lat) > 0:
            dests.append(Point(lat[-1], trip['longitude'][-1]))
            if len(dests) >= _sample_size:
                break
    dests.save('destinations (cluster)')
    # Re-render the same sample as a heat map.
    dests.cluster = False
    dests.heatmap = True
    dests.save('destinations (heatmap)')
#!/usr/bin/env python
from data.hdf5 import taxi_it
from visualizer import Vlist, Point

_sample_size = 5000

if __name__ == '__main__':
    points = Vlist(cluster=True)
    for ride in taxi_it('train'):
        if not len(ride['latitude']):
            continue
        points.append(Point(ride['latitude'][-1], ride['longitude'][-1]))
        if len(points) >= _sample_size:
            # Stop once the sample is large enough.
            break
    points.save('destinations (cluster)')
    points.cluster = False
    points.heatmap = True
    points.save('destinations (heatmap)')