def process_trip(x, start_time):
    """Build the feature list for one trip from its first two points.

    Parameters
    ----------
    x : array indexed as ``x[i, j]``
        Track points, one row per point, two coordinates per row.
        NOTE(review): presumably (longitude, latitude) -- confirm
        against the caller and ``haversineKaggle``.
    start_time : float
        Seconds since the epoch; converted with ``time.localtime``, so
        the weekday/hour features depend on the local time zone.

    Returns
    -------
    list
        ``[weekday, hour, x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]``
        where ``d_st``/``h_st`` are distance/heading from the first point
        to ``CITY_CENTER`` and ``v_mn``/``head`` are distance/heading from
        the first to the second point (both 0 for a single-point track).
    """
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading between the first two points; they stay 0
    # when the track has only one point
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[1, :])[0]
        head = heading(x[0, :], x[1, :])
    # distance and heading from the first point to the city center.
    # The result of haversineKaggle is unwrapped with [0] just like
    # above, so that every entry of the feature list is a scalar.
    d_st = haversineKaggle(x[0, :], CITY_CENTER)[0]
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data
Esempio n. 2
0
def process_trip(x, start_time):
    """Build the feature list for one trip from its first and last point.

    Parameters
    ----------
    x : array indexed as ``x[i, j]``
        Track points, one row per point, two coordinates per row.
        NOTE(review): presumably (longitude, latitude) -- confirm
        against the caller and ``haversineKaggle``.
    start_time : float
        Seconds since the epoch; converted with ``time.localtime``, so
        the weekday/hour features depend on the local time zone.

    Returns
    -------
    list
        ``[weekday, hour, x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]``
        where ``d_st``/``h_st`` are distance/heading from the first point
        to ``CITY_CENTER`` and ``v_mn``/``head`` are distance/heading from
        the first to the last point (both 0 for a single-point track).
    """
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading from the first to the last point; they stay 0
    # when the track has only one point
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[-1, :])[0]
        head = heading(x[0, :], x[-1, :])
    # distance and heading from the first point to the city center.
    # The result of haversineKaggle is unwrapped with [0] just like
    # above, so that every entry of the feature list is a scalar.
    d_st = haversineKaggle(x[0, :], CITY_CENTER)[0]
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data
Esempio n. 3
0
def process_trip(x, start_time):
    """Build the feature list for a trip described by a single point.

    Parameters
    ----------
    x : array indexed as ``x[0]``, ``x[1]``
        The two coordinates of the point.
        NOTE(review): presumably (longitude, latitude) -- confirm
        against the caller and ``haversineKaggle``.
    start_time : float
        Seconds since the epoch; converted with ``time.localtime``, so
        the weekday/hour features depend on the local time zone.

    Returns
    -------
    list
        ``[weekday, hour, x[0], x[1], d_st, head]`` where ``d_st`` and
        ``head`` are distance and heading from the point to
        ``CITY_CENTER``.
    """
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading from the point to the city center.
    # The other process_trip variants unwrap the single-point result of
    # haversineKaggle with [0]; do the same here so the feature list
    # holds a scalar instead of an array-like.
    d_st = haversineKaggle(x, CITY_CENTER)[0]
    head = heading(x, CITY_CENTER[0])
    data += [x[0], x[1], d_st, head]
    return data
def process_trip(x, start_time):
    """Return time-of-week and city-center features for a single point.

    Parameters
    ----------
    x : array indexed as ``x[0]``, ``x[1]``
        The two coordinates of the point.
        NOTE(review): presumably (longitude, latitude) -- confirm
        against the caller and ``haversineKaggle``.
    start_time : float
        Seconds since the epoch; ``time.localtime`` converts it using
        the local time zone.

    Returns
    -------
    list
        ``[weekday, hour, x[0], x[1], d_st, head]``; ``d_st`` is the
        distance and ``head`` the heading from the point to
        ``CITY_CENTER``.
    """
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance from the point to the city center; take element 0 of the
    # haversineKaggle result (as the sibling variants do for a single
    # point) so that a scalar ends up in the feature list
    d_st = haversineKaggle(x, CITY_CENTER)[0]
    # heading from the point towards the city center
    head = heading(x, CITY_CENTER[0])
    data += [x[0], x[1], d_st, head]
    return data
Esempio n. 5
0
def process_trip(x, start_time):
    """Summarise a (partial) track as a flat list of features.

    Parameters
    ----------
    x : array indexed as ``x[i, j]`` with a ``shape`` attribute
        Track points, one row per point, two coordinates per row.
        NOTE(review): presumably (longitude, latitude) -- confirm
        against the caller and ``haversineKaggle``.
    start_time : float
        Seconds since the epoch; converted with ``time.localtime``, so
        the weekday/hour features depend on the local time zone.

    Returns
    -------
    list
        ``[weekday, hour, n_points, x[0, 0], x[0, 1], x[-1, 0], x[-1, 1],
        d_start, h_start, d_cut, h_cut, total, median, last, heading]``.
        The last four entries describe the point-to-point steps and are
        0 when the track has a single point.
    """
    local = time.localtime(start_time)

    # Step-based features keep their 0 defaults for single-point tracks.
    total_len = step_median = step_last = last_heading = 0
    n_points = x.shape[0]
    if n_points > 1:
        # distances between consecutive points
        steps = haversineKaggle(x[:-1, :], x[1:, :])
        total_len = np.sum(steps)
        step_median = np.median(steps)
        step_last = steps[-1]
        last_heading = heading(x[-2, :], x[-1, :])

    first_pt = x[0, :]
    last_pt = x[-1, :]
    center_pt = CITY_CENTER[0]

    return [
        local.tm_wday,
        local.tm_hour,
        n_points,
        x[0, 0],
        x[0, 1],
        x[-1, 0],
        x[-1, 1],
        # first point relative to the city center
        haversineKaggle(first_pt, CITY_CENTER)[0],
        heading(first_pt, center_pt),
        # last (cut) point relative to the city center; note the heading
        # is taken from the center towards the point here
        haversineKaggle(last_pt, CITY_CENTER)[0],
        heading(center_pt, last_pt),
        total_len,
        step_median,
        step_last,
        last_heading,
    ]
Esempio n. 6
0
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from utils import haversineKaggle

           
# Directory holding the pre-processed training CSV files.
DATA_DIR = '../data'

t0 = time.time()
for filename in ['train_pp_N1.csv', 'train_pp_N2.csv', 'train_pp_N3.csv',
                 'train_pp_RND.csv']:
    print('reading training data from %s ...' % filename)

    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    # per-row distance between the (xs, ys) and (xe, ye) columns
    d1 = haversineKaggle(df[['xs', 'ys']].values, df[['xe', 'ye']].values)

    # create training set; the target is log(len * 15 + 1)
    y = np.log(df['len']*15 + 1)
    # remove non-predictive features
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    # the builtin float replaces np.float, which was only an alias for it
    # and was removed in NumPy 1.24
    X = np.array(df, dtype=float)

    # clean data by removing long distance tracks: keep only rows whose
    # distance is below the 99.9th percentile
    th1 = np.percentile(d1, [99.9])
    X = X[(d1<th1), :]
    y = y[(d1<th1)]

    print('training a random forest regressor ...')
    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)
Esempio n. 7
0
    # Fragment of a loop body (the loop header is not part of this excerpt).
    # Nothing to train on when the file does not exist.
    if not os.path.isfile(filename):
        continue

    df = pd.read_csv(filename)
    # Fewer than 1000 rows: report and move on to the next iteration.
    if df.shape[0] < 1000:
        print('skipping key point %i (%i)' % (id_, df.shape[0]))
        continue

    # factorize categorical columns in training set
    #df['CALL_TYPE'], ct_index = pd.factorize(df['CALL_TYPE'])
    #df = df[df['CALL_TYPE'] == 0]    # A=2, B=1, C=0
    # fill all NaN values with -1
    #df = df.fillna(-1)

    # remove long distance tracks: keep only rows whose (xs, ys) -> (xe, ye)
    # distance is below the 99.9th percentile
    d1 = haversineKaggle(df[['xs', 'ys']], df[['xe', 'ye']])
    th1 = np.percentile(d1, [99.9])
    df = df.loc[d1 < th1]

    # target is log(len * 15 + 1), flattened to a 1-D array
    y = np.ravel(np.log(df['len'] * 15 + 1))
    # drop the columns that are not used as features (including the target)
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    # NOTE(review): np.float was removed in NumPy 1.24; the builtin float
    # is the drop-in replacement.
    X = np.array(df, dtype=np.float)

    print('training classifier of key point %i  (sz=%i) ...' %
          (id_, X.shape[0]))
    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)
    clf.fit(X, y)
    # NOTE(review): the prediction is made on X_tst[id_, :], which is
    # defined outside this excerpt -- presumably the test sample for
    # this key point; confirm.
    pred_rf = clf.predict(X_tst[id_, :])

    # second model on the same training data (call is cut off in this excerpt)
    clf = GradientBoostingRegressor(n_estimators=200,
Esempio n. 8
0
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from utils import haversineKaggle

# Directory holding the pre-processed training CSV files.
DATA_DIR = '../data'

t0 = time.time()
for filename in [
        'train_pp_N1.csv', 'train_pp_N2.csv', 'train_pp_N3.csv',
        'train_pp_RND.csv'
]:
    print('reading training data from %s ...' % filename)

    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    # per-row distance between the (xs, ys) and (xe, ye) columns
    d1 = haversineKaggle(df[['xs', 'ys']].values, df[['xe', 'ye']].values)

    # create training set; the target is log(len * 15 + 1)
    y = np.log(df['len'] * 15 + 1)
    # remove non-predictive features
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    # the builtin float replaces np.float, which was only an alias for it
    # and was removed in NumPy 1.24
    X = np.array(df, dtype=float)

    # clean data by removing long distance tracks: keep only rows whose
    # distance is below the 99.9th percentile
    th1 = np.percentile(d1, [99.9])
    X = X[(d1 < th1), :]
    y = y[(d1 < th1)]

    print('training a random forest regressor ...')
    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)

    # NOTE(review): from here on df, d1, y, X and clf are all recomputed,
    # so the values built above are discarded. The path checked and read
    # below is the bare file name, not the DATA_DIR path used above, and
    # id_ / X_tst are not defined in the visible code -- confirm that
    # these two halves really belong in the same loop.
    if not os.path.isfile(filename):
        continue

    df = pd.read_csv(filename)
    # Fewer than 1000 rows: report and move on to the next file.
    if df.shape[0] < 1000:
        print('skipping key point %i (%i)' % (id_, df.shape[0]))
        continue

    # factorize categorical columns in training set
    #df['CALL_TYPE'], ct_index = pd.factorize(df['CALL_TYPE'])
    #df = df[df['CALL_TYPE'] == 0]    # A=2, B=1, C=0
    # fill all NaN values with -1
    #df = df.fillna(-1)

    # remove long distance tracks: keep only rows whose distance is below
    # the 99.9th percentile
    d1 = haversineKaggle(df[['xs', 'ys']], df[['xe', 'ye']])
    th1 = np.percentile(d1, [99.9])
    df = df.loc[d1 < th1]

    # target is log(len * 15 + 1), flattened to a 1-D array
    y = np.ravel(np.log(df['len']*15 + 1))
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    # builtin float instead of the removed np.float alias (see above)
    X = np.array(df, dtype=float)

    print('training classifier of key point %i  (sz=%i) ...' % (id_, X.shape[0]))
    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)
    clf.fit(X, y)
    pred_rf = clf.predict(X_tst[id_, :])

    # second model trained on the same data
    clf = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=21)
    clf.fit(X, y)