# Helpers haversineKaggle, heading and CITY_CENTER come from utils.py;
# a hedged sketch of all three follows after the four variants below.
import time

import numpy as np


def process_trip(x, start_time):
    # variant: speed proxy and heading taken from the first two GPS points
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[1, :])[0]
        head = heading(x[0, :], x[1, :])
    # distance and heading from the start point to the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data
def process_trip(x, start_time):
    # variant: speed proxy and heading taken between the first and last points
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[-1, :])[0]
        head = heading(x[0, :], x[-1, :])
    # distance and heading from the start point to the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data
def process_trip(x, start_time):
    # variant: only a single (lon, lat) point is available
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading from that point to the city center
    d_st = haversineKaggle(x, CITY_CENTER)
    head = heading(x, CITY_CENTER[0])
    data += [x[0], x[1], d_st, head]
    return data
def process_trip(x, start_time):
    # variant: full partial track with per-step statistics
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # cumulative sum of distance, plus median / last-step speed proxies
    d_cs = 0
    vcar = 0
    vmed = 0
    head = 0
    if x.shape[0] > 1:
        d1 = haversineKaggle(x[:-1, :], x[1:, :])
        d_cs = np.sum(d1)
        vmed = np.median(d1)
        vcar = d1[-1]
        head = heading(x[-2, :], x[-1, :])
    # distance and heading from the start and cutting points to the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)[0]
    h_st = heading(x[0, :], CITY_CENTER[0])
    d_cut = haversineKaggle(x[-1, :], CITY_CENTER)[0]
    h_cut = heading(CITY_CENTER[0], x[-1, :])
    data += [x.shape[0], x[0, 0], x[0, 1], x[-1, 0], x[-1, 1],
             d_st, h_st, d_cut, h_cut, d_cs, vmed, vcar, head]
    return data
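# The feature functions above rely on helpers imported from utils.py, which is
# not shown in this snippet. Below is a minimal, self-contained sketch of what
# haversineKaggle, heading, and CITY_CENTER plausibly look like; the signatures
# are inferred from usage (points are (lon, lat) pairs or row-wise arrays of
# them), so treat this as an illustrative approximation, not the repository's
# actual implementation.
CITY_CENTER = np.array([[-8.62, 41.16]])   # assumed (lon, lat) near Porto's center


def haversineKaggle(p1, p2):
    # great-circle distance in km between (lon, lat) points or point arrays
    p1 = np.radians(np.atleast_2d(p1))
    p2 = np.radians(np.atleast_2d(p2))
    dlon = p2[:, 0] - p1[:, 0]
    dlat = p2[:, 1] - p1[:, 1]
    a = (np.sin(dlat / 2) ** 2
         + np.cos(p1[:, 1]) * np.cos(p2[:, 1]) * np.sin(dlon / 2) ** 2)
    return 6371 * 2 * np.arcsin(np.sqrt(a))


def heading(p1, p2):
    # initial bearing in radians from p1 towards p2
    p1 = np.radians(np.asarray(p1, dtype=float))
    p2 = np.radians(np.asarray(p2, dtype=float))
    dlon = p2[..., 0] - p1[..., 0]
    y = np.sin(dlon) * np.cos(p2[..., 1])
    x = (np.cos(p1[..., 1]) * np.sin(p2[..., 1])
         - np.sin(p1[..., 1]) * np.cos(p2[..., 1]) * np.cos(dlon))
    return np.arctan2(y, x)


# Example: featurize a toy three-point track with the last process_trip variant.
track = np.array([[-8.62, 41.16], [-8.61, 41.15], [-8.60, 41.14]])
features = process_trip(track, start_time=1400000000)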
import os
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from utils import haversineKaggle

DATA_DIR = '../data'

t0 = time.time()
for filename in ['train_pp_N1.csv', 'train_pp_N2.csv',
                 'train_pp_N3.csv', 'train_pp_RND.csv']:
    print('reading training data from %s ...' % filename)
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    d1 = haversineKaggle(df[['xs', 'ys']].values, df[['xe', 'ye']].values)

    # create training set: log of travel time (15 s between GPS samples)
    y = np.log(df['len'] * 15 + 1)

    # remove non-predictive features
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    X = np.array(df, dtype=float)

    # clean data by removing long-distance tracks
    th1 = np.percentile(d1, 99.9)
    X = X[d1 < th1, :]
    y = y[d1 < th1]

    print('training a random forest regressor ...')
    # initialize the random forest regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)
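    # The fitting/prediction step is truncated above; a minimal sketch of how
    # the loop plausibly continues: fit on the cleaned set, predict in log
    # space, and invert y = log(len * 15 + 1) to recover seconds. `X_tst` is a
    # hypothetical test matrix with the same columns as X, introduced here
    # only for illustration.
    clf.fit(X, y)
    pred = clf.predict(X_tst)        # hypothetical test features
    travel_time = np.exp(pred) - 1   # back-transform to travel time in seconds
    print('done with %s, elapsed %.1f s' % (filename, time.time() - t0))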
# Per-key-point training. This fragment runs inside a loop over candidate end
# points, with `id_`, `filename`, and the test matrix `X_tst` defined by the
# surrounding script; imports mirror the previous script, plus
# GradientBoostingRegressor from sklearn.ensemble.
if not os.path.isfile(filename):
    continue
df = pd.read_csv(filename)
if df.shape[0] < 1000:
    print('skipping key point %i (%i)' % (id_, df.shape[0]))
    continue

# factorize categorical columns in training set
#df['CALL_TYPE'], ct_index = pd.factorize(df['CALL_TYPE'])
#df = df[df['CALL_TYPE'] == 0]  # A=2, B=1, C=0
# fill all NaN values with -1
#df = df.fillna(-1)

# remove long-distance trips
d1 = haversineKaggle(df[['xs', 'ys']], df[['xe', 'ye']])
th1 = np.percentile(d1, 99.9)
df = df.loc[d1 < th1]

y = np.ravel(np.log(df['len'] * 15 + 1))
df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
X = np.array(df, dtype=float)

print('training classifier of key point %i (sz=%i) ...' % (id_, X.shape[0]))
clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=21)
clf.fit(X, y)
pred_rf = clf.predict(X_tst[id_, :])

clf = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=21)
clf.fit(X, y)
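# The source cuts off after the gradient boosting fit. A hedged sketch of a
# plausible follow-up (not confirmed by the source): predict with the second
# model as well, average the two log-space predictions, and back-transform.
pred_gb = clf.predict(X_tst[id_, :])
pred = np.exp((pred_rf + pred_gb) / 2.0) - 1   # blended travel time in seconds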