コード例 #1
0
ファイル: make_time_index.py プロジェクト: DragonCircle/taxi
def make_valid(outpath):
    times = []
    for i, line in enumerate(taxi_it('train')):
        time = line['timestamp']
        latitude = line['latitude']

        if len(latitude) == 0:
            continue

        duration = 15 * (len(latitude) - 1)

        times.append((i, int(time), int(time + duration)))
        if i % 1000 == 0:
            print times[-1]


    with sqlite3.connect(outpath) as timedb:
        c = timedb.cursor()
        c.execute('''
                CREATE TABLE trip_times
                    (trip INTEGER, begin INTEGER, end INTEGER)
        ''')
        print "Adding data..."
        c.executemany('INSERT INTO trip_times(trip, begin, end) VALUES(?, ?, ?)', times)
        timedb.commit()
        print "Creating index..."
        c.execute('''CREATE INDEX trip_begin_index ON trip_times (begin)''')
コード例 #2
0
def compute_number_coordinates():
    n_coordinates = 0
    for ride in taxi_it('train'):
        n_coordinates += len(ride['latitude'])
    print n_coordinates

    return n_coordinates
コード例 #3
0
def make_valid(outpath):
    times = []
    for i, line in enumerate(taxi_it('train')):
        time = line['timestamp']
        latitude = line['latitude']

        if len(latitude) == 0:
            continue

        duration = 15 * (len(latitude) - 1)

        times.append((i, int(time), int(time + duration)))
        if i % 1000 == 0:
            print times[-1]

    with sqlite3.connect(outpath) as timedb:
        c = timedb.cursor()
        c.execute('''
                CREATE TABLE trip_times
                    (trip INTEGER, begin INTEGER, end INTEGER)
        ''')
        print "Adding data..."
        c.executemany(
            'INSERT INTO trip_times(trip, begin, end) VALUES(?, ?, ?)', times)
        timedb.commit()
        print "Creating index..."
        c.execute('''CREATE INDEX trip_begin_index ON trip_times (begin)''')
コード例 #4
0
ファイル: maps.py プロジェクト: DragonCircle/taxi
def compute_number_coordinates():

    # Count the number of coordinates
    n_coordinates = 0
    for ride in taxi_it('train'):
        n_coordinates += len(ride['latitude'])
    print n_coordinates

    return n_coordinates
コード例 #5
0
ファイル: make_valid_cut.py プロジェクト: DragonCircle/taxi
def make_valid(cutfile, outpath):
    cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts

    print "Number of cuts:", len(cuts)

    valid = []

    for line in taxi_it('train'):
        time = line['timestamp']
        latitude = line['latitude']
        longitude = line['longitude']

        if len(latitude) == 0:
            continue

        for ts in cuts:
            if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
                # keep it
                n = (ts - time) / 15 + 1
                line.update({
                    'latitude': latitude[:n],
                    'longitude': longitude[:n],
                    'destination_latitude': latitude[-1],
                    'destination_longitude': longitude[-1],
                    'travel_time': 15 * (len(latitude)-1)
                })
                valid.append(line)
                break

    print "Number of trips in validation set:", len(valid)
    
    file = h5py.File(outpath, 'a')
    clen = file['trip_id'].shape[0]
    alen = len(valid)
    for field in _fields:
        dset = file[field]
        dset.resize((clen + alen,))
        for i in xrange(alen):
            dset[clen + i] = valid[i][field]

    splits = file.attrs['split']
    slen = splits.shape[0]
    splits = numpy.resize(splits, (slen+len(_fields),))
    for (i, field) in enumerate(_fields):
        splits[slen+i]['split'] = ('cuts/%s' % cutfile).encode('utf8')
        splits[slen+i]['source'] = field.encode('utf8')
        splits[slen+i]['start'] = clen
        splits[slen+i]['stop'] = alen
        splits[slen+i]['indices'] = None
        splits[slen+i]['available'] = True
        splits[slen+i]['comment'] = '.'
    file.attrs['split'] = splits

    file.flush()
    file.close()
コード例 #6
0
def make_valid(cutfile, outpath):
    cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts

    print "Number of cuts:", len(cuts)

    valid = []

    for line in taxi_it('train'):
        time = line['timestamp']
        latitude = line['latitude']
        longitude = line['longitude']

        if len(latitude) == 0:
            continue

        for ts in cuts:
            if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
                # keep it
                n = (ts - time) / 15 + 1
                line.update({
                    'latitude': latitude[:n],
                    'longitude': longitude[:n],
                    'destination_latitude': latitude[-1],
                    'destination_longitude': longitude[-1],
                    'travel_time': 15 * (len(latitude) - 1)
                })
                valid.append(line)
                break

    print "Number of trips in validation set:", len(valid)

    file = h5py.File(outpath, 'a')
    clen = file['trip_id'].shape[0]
    alen = len(valid)
    for field in _fields:
        dset = file[field]
        dset.resize((clen + alen, ))
        for i in xrange(alen):
            dset[clen + i] = valid[i][field]

    splits = file.attrs['split']
    slen = splits.shape[0]
    splits = numpy.resize(splits, (slen + len(_fields), ))
    for (i, field) in enumerate(_fields):
        splits[slen + i]['split'] = ('cuts/%s' % cutfile).encode('utf8')
        splits[slen + i]['source'] = field.encode('utf8')
        splits[slen + i]['start'] = clen
        splits[slen + i]['stop'] = alen
        splits[slen + i]['indices'] = None
        splits[slen + i]['available'] = True
        splits[slen + i]['comment'] = '.'
    file.attrs['split'] = splits

    file.flush()
    file.close()
コード例 #7
0
def extract_coordinates(n_coordinates=None):
    if n_coordinates is None:
        n_coordinates = compute_number_coordinates()
    coordinates = np.zeros((n_coordinates, 2), dtype="float32")
    c = 0
    for ride in taxi_it('train'):
        for point in zip(ride['latitude'], ride['longitude']):
            coordinates[c] = point
            c += 1
    print c
    cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
コード例 #8
0
ファイル: maps.py プロジェクト: DragonCircle/taxi
def extract_coordinates(n_coordinates=None):
    """Extract coordinates from the dataset and store them in a numpy array"""

    if n_coordinates is None:
        n_coordinates = compute_number_coordinates()

    coordinates = np.zeros((n_coordinates, 2), dtype="float32")

    c = 0
    for ride in taxi_it('train'):
        for point in zip(ride['latitude'], ride['longitude']):
            coordinates[c] = point
            c += 1

    print c

    cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
コード例 #9
0
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point

if __name__ == '__main__':
    # Gather every (latitude, longitude) pair of the 'test' trips into a
    # Vlist created with heatmap=True and save it as 'test positions'.
    points = Vlist(heatmap=True)
    for trip in taxi_it('test'):
        for coords in zip(trip['latitude'], trip['longitude']):
            points.append(Point(*coords))
    points.save('test positions')
コード例 #10
0
ファイル: cluster_des.py プロジェクト: JayDhi/Des_predict
import numpy
import cPickle
import scipy.misc
import os

from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs
from itertools import cycle

import data
from data.hdf5 import taxi_it
from data.transformers import add_destination

dests = []
for v in taxi_it("train"):
    if len(v['latitude']) == 0: continue
    dests.append([v['latitude'][-1], v['longitude'][-1]])
pts = numpy.array(dests)

with open(os.path.join(data.path, "arrivals.pkl"), "w") as f:
    cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL)

print "Doing clustering"
bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
print bw
bw = 0.001

ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(pts)
cluster_centers = ms.cluster_centers_
コード例 #11
0
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point

if __name__ == '__main__':
    stands = taxi_it('stands')
    # Drop the first entry before numbering the remaining ones from 1.
    next(stands)
    points = Vlist()
    for number, stand in enumerate(stands, 1):
        marker = Point(stand['stands_latitude'], stand['stands_longitude'],
                       'Stand (%d): %s' % (number, stand['stands_name']))
        points.append(marker)
    points.save('stands')
コード例 #12
0
ファイル: cluster_arrival.py プロジェクト: DragonCircle/taxi
import numpy
import cPickle
import scipy.misc
import os

from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs
from itertools import cycle

import data
from data.hdf5 import taxi_it
from data.transformers import add_destination

print "Generating arrival point list"
dests = []
for v in taxi_it("train"):
    if len(v['latitude']) == 0: continue
    dests.append([v['latitude'][-1], v['longitude'][-1]])
pts = numpy.array(dests)

with open(os.path.join(data.path, "arrivals.pkl"), "w") as f:
    cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL)

print "Doing clustering"
bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
print bw
bw = 0.001 # (

ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(pts)
cluster_centers = ms.cluster_centers_
コード例 #13
0
ファイル: test_positions.py プロジェクト: DragonCircle/taxi
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point


if __name__ == '__main__':
    # Save all positions of the 'test' trips as one Vlist built with
    # heatmap=True.
    points = Vlist(heatmap=True)
    for trip in taxi_it('test'):
        positions = zip(trip['latitude'], trip['longitude'])
        for (latitude, longitude) in positions:
            points.append(Point(latitude, longitude))
    points.save('test positions')
コード例 #14
0
ファイル: stands.py プロジェクト: DragonCircle/taxi
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point


if __name__ == '__main__':
    stand_it = taxi_it('stands')
    next(stand_it)  # Ignore the "no stand" entry

    points = Vlist()
    for (index, stand) in enumerate(stand_it):
        # Labels are numbered from 1, hence index + 1.
        points.append(
            Point(stand['stands_latitude'],
                  stand['stands_longitude'],
                  'Stand (%d): %s' % (index + 1, stand['stands_name'])))
    points.save('stands')
コード例 #15
0
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point
# Maximum number of destination points collected before the loop stops.
_sample_size = 5000

if __name__ == '__main__':
    points = Vlist(cluster=True)
    for trip in taxi_it('train'):
        latitudes = trip['latitude']
        # Trips without points have no last position to record.
        if len(latitudes) == 0:
            continue
        points.append(Point(latitudes[-1], trip['longitude'][-1]))
        if len(points) >= _sample_size:
            break
    # Save the same points twice: first with cluster=True, then with the
    # cluster flag off and the heatmap flag on.
    points.save('destinations (cluster)')
    points.cluster = False
    points.heatmap = True
    points.save('destinations (heatmap)')
コード例 #16
0
ファイル: destinations.py プロジェクト: DragonCircle/taxi
#!/usr/bin/env python

from data.hdf5 import taxi_it
from visualizer import Vlist, Point


# Collection stops once this many destination points have been gathered.
_sample_size = 5000

if __name__ == '__main__':
    points = Vlist(cluster=True)
    for ride in taxi_it('train'):
        # Skip rides that have no recorded position.
        if not len(ride['latitude']) > 0:
            continue
        destination = Point(ride['latitude'][-1], ride['longitude'][-1])
        points.append(destination)
        if len(points) >= _sample_size:
            break
    # First output uses the cluster flag, second output the heatmap flag.
    points.save('destinations (cluster)')
    points.cluster = False
    points.heatmap = True
    points.save('destinations (heatmap)')