def process_train():
    '''
    Restrict the raw training set to trips inside the configured bounding box.

    Reads ``../data/train.csv`` via ``read_data``, keeps the rows whose pickup
    point and then whose drop-off point satisfy
    ``lon_min <= longitude <= lon_max`` and ``lat_min <= latitude <= lat_max``
    (bounds inclusive), and writes the result to
    ``../processed_data/train.csv`` without the index column. The row count is
    printed before filtering and after each of the two filters.

    ``lon_min``, ``lon_max``, ``lat_min`` and ``lat_max`` are not defined in
    this function; they are expected to be provided by the enclosing module.

    :return: None
    '''
    df = read_data('../data/train.csv')
    print('Total samples:', len(df))

    # Stage 1: pickup point must lie inside the box.
    pickup_inside = (
        (lon_min <= df.pickup_longitude) & (df.pickup_longitude <= lon_max)
        & (lat_min <= df.pickup_latitude) & (df.pickup_latitude <= lat_max)
    )
    df = df.loc[pickup_inside].reset_index(drop=True)
    print('After preprocessing1', len(df))

    # Stage 2: drop-off point must lie inside the box as well.
    dropoff_inside = (
        (lon_min <= df.dropoff_longitude) & (df.dropoff_longitude <= lon_max)
        & (lat_min <= df.dropoff_latitude) & (df.dropoff_latitude <= lat_max)
    )
    df = df.loc[dropoff_inside].reset_index(drop=True)
    # Distinct label so the two stages can be told apart in the log
    # (the second message used to repeat the stage-1 label).
    print('After preprocessing2', len(df))

    df.to_csv('../processed_data/train.csv', index=False)
def get_df_routes(yy=2016, mm=3, day=1, hh_start=9, hh_end=10):
    '''
    Query a route for every trip picked up in a given hour window of one day
    and dump the results to a JSON file.

    Trips are read from ``../data/train.csv`` via ``read_data``. A trip is
    selected when ``hh_start:00:00 <= pickup_datetime < hh_end:00:00`` on the
    given date; ``hh_end=24`` means midnight at the end of that day.

    Output written with ``output_json`` to
    ``../processed_data/routes/<yy>-<mm>-<day>_<hh_start>-<hh_end>.json``::

        {'year':, 'month':, 'day':, 'start_hour':, 'end_hour':,
         'routes': [<route>, ...]}

    Each ``<route>`` is the dict returned by ``get_routes(origin, destination)``
    (per the original notes it carries 'distance', 'google_duration',
    'routes': [[lon, lat]] and 'pre_road_time' -- not verifiable from here)
    extended with 'pickup_datetime', 'dropoff_datetime' (both formatted by
    ``datetime_tostr``) and 'duration' (drop-off minus pickup, in seconds).

    :param yy: year of the day to process
    :param mm: month of the day to process
    :param day: day of month to process
    :param hh_start: first hour of the window (inclusive)
    :param hh_end: hour at which the window ends (exclusive); 24 for end of day
    :return: None
    '''
    df = read_data('../data/train.csv')

    time_start = datetime.datetime(yy, mm, day, hh_start)
    if hh_end != 24:
        time_end = datetime.datetime(yy, mm, day, hh_end)
    else:
        # datetime() rejects hour=24, so express it as the next midnight.
        time_end = datetime.datetime(yy, mm, day) + datetime.timedelta(days=1)

    # Half-open window [time_start, time_end). The previous code subtracted one
    # second from the end AND compared with '<', which silently dropped every
    # pickup in the last second of the window.
    df = df[(time_start <= df.pickup_datetime) & (df.pickup_datetime < time_end)]
    print(len(df))
    df = df.reset_index(drop=True)

    trip_columns = [
        'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude',
        'pickup_datetime', 'dropoff_datetime',
    ]
    routes = []
    # One pass over the rows instead of six label lookups per row.
    for i, (p_lon, p_lat, d_lon, d_lat, picked, dropped) in enumerate(
            df[trip_columns].itertuples(index=False, name=None)):
        out = get_routes([p_lon, p_lat], [d_lon, d_lat])
        out['pickup_datetime'] = datetime_tostr(picked)
        out['dropoff_datetime'] = datetime_tostr(dropped)
        out['duration'] = (dropped - picked).total_seconds()
        routes.append(out)
        print('Row %d: duration = %f' % (i, out['duration']))

    result = {
        'year': yy,
        'month': mm,
        'day': day,
        'start_hour': hh_start,
        'end_hour': hh_end,
        'routes': routes,
    }
    output_json(
        result,
        '../processed_data/routes/%d-%d-%d_%d-%d.json' % (yy, mm, day, hh_start, hh_end))
def cal_distance_ratio():
    '''
    Estimate the ratio between the Google distance of a trip and the distance
    ``cal_distance`` computes between its pickup and drop-off points.

    Reads ``../processed_data/test_train_google_19999.csv`` via ``read_data``
    and ignores rows whose ``google_distance`` is 1000000 or more. All ratios
    and their minimum are printed; only ratios within [1, 2] are kept, written
    to ``../statistics/distance_ratio.csv`` (column ``ratio``) and averaged.

    :return: mean of the kept ratios
    '''
    trips = read_data('../processed_data/test_train_google_19999.csv')
    trips = trips[trips.google_distance < 1000000].reset_index(drop=True)

    ratio = []
    columns = zip(
        trips.google_distance.values,
        trips.pickup_longitude.values, trips.pickup_latitude.values,
        trips.dropoff_longitude.values, trips.dropoff_latitude.values)
    for google, p_lon, p_lat, d_lon, d_lat in columns:
        ratio.append(google / cal_distance([p_lon, p_lat], [d_lon, d_lat]))

    print(ratio)
    print(min(ratio))

    # Chained comparison on purpose: it also discards NaN ratios.
    kept = [value for value in ratio if 1 <= value <= 2]
    pd.DataFrame({'ratio': kept}).to_csv('../statistics/distance_ratio.csv', index=False)

    # Value noted by the original author next to this return: 1.3540919895360521
    return np.mean(kept)
def cal_day_order_speed():
    '''
    Calculate the hourly order number and speed of every pickup date.

    Reads ``../processed_data/train.csv`` via ``read_data``. The travelled
    distance of a trip is estimated as
    ``distance_ratio * cal_distance(pickup, dropoff)``. For every date and
    pickup hour, ``order`` is the number of trips and ``speed`` is the summed
    estimated distance divided by the summed ``trip_duration`` (taken from the
    ``duration`` column when ``trip_duration`` is absent).

    For every date a DataFrame['hour', 'speed', 'order'] is printed and written
    to ``../processed_data/order_speed/<date>.csv``.

    :return: None
    '''
    df = read_data('../processed_data/train.csv')
    print('Total samples:', len(df))
    distance_ratio = 1.3540919895360521

    df['date'] = [datetime_tostr(t, '%Y-%m-%d') for t in df.pickup_datetime]
    df['hour'] = [t.hour for t in df.pickup_datetime]
    # Walk the four coordinate columns in lockstep instead of doing four
    # label lookups per row.
    df['actual_distance'] = [
        distance_ratio * cal_distance([p_lon, p_lat], [d_lon, d_lat])
        for p_lon, p_lat, d_lon, d_lat in zip(
            df.pickup_longitude.values, df.pickup_latitude.values,
            df.dropoff_longitude.values, df.dropoff_latitude.values)
    ]
    df['order'] = 1
    if 'trip_duration' not in df.columns:
        df['trip_duration'] = df.duration

    summed_columns = ['actual_distance', 'trip_duration', 'order']
    # groupby yields the dates in sorted order and splits the frame once,
    # rather than re-filtering the whole frame for every date.
    for day, day_df in df.groupby('date'):
        print('Day %s: total order = %d' % (day, len(day_df)))
        # Plain groupby sum; avoids passing the NumPy callable np.sum as an
        # aggregation function, which recent pandas versions deprecate.
        tables = day_df.groupby('hour')[summed_columns].sum()
        tables['speed'] = tables.actual_distance / tables.trip_duration
        tables = tables[['speed', 'order']].reset_index()
        print(tables)
        tables.to_csv('../processed_data/order_speed/%s.csv' % day, index=False)
def cal_daily_order():
    '''
    Count the orders of every pickup date.

    Reads ``../processed_data/train.csv`` via ``read_data``, counts the rows
    per date and writes a DataFrame['date', 'count'] to
    ``../processed_data/daily_order_count/daily_order.csv``. The table is
    printed after it has been written.

    :return: None
    '''
    orders = read_data('../processed_data/train.csv')
    print('Total samples:', len(orders))

    date_labels = []
    for stamp in orders.pickup_datetime:
        date_labels.append(datetime_tostr(stamp, '%Y-%m-%d'))
    orders['date'] = date_labels

    pivoted = pd.pivot_table(
        orders[['date', 'id']], index=['date'], values=['id'], aggfunc=[len])
    # Drop the aggregation level, turn the date index into a column and
    # relabel the counted 'id' column.
    per_day = pivoted['len'].reset_index().rename(
        index=str, columns={'index': 'date', 'id': 'count'})

    per_day.to_csv('../processed_data/daily_order_count/daily_order.csv', index=False)
    print(per_day)
def _positions_by_hour(day_df, hour_col, lon_col, lat_col):
    # Bucket the [lon, lat] pairs of one day's trips by the hour in hour_col.
    # Every hour '0'..'23' gets a key, hours without trips map to [].
    buckets = {str(hour): [] for hour in range(24)}
    for hour, rows in day_df.groupby(hour_col):
        # int() keeps the key as '6' even if the hour column was read as float.
        buckets[str(int(hour))] = rows[[lon_col, lat_col]].values.tolist()
    return buckets


def output_daily_hour_origin_destination():
    '''
    Write the pickup and drop-off positions of every pickup date, split by hour.

    Reads ``../processed_data/train.csv`` via ``read_data`` and, for every
    date, passes the following structure to ``output_json`` with the target
    ``../processed_data/daily_hourly_od/<date>.json``::

        {'pick': {'0': [[lon, lat], ...], ..., '23': [...]},
         'drop': {'0': [[lon, lat], ...], ..., '23': [...]}}

    'pick' is keyed by the hour of ``pickup_datetime`` and 'drop' by the hour
    of ``dropoff_datetime``; positions keep the row order of the input.

    :return: None
    '''
    df = read_data('../processed_data/train.csv')
    print('Total samples:', len(df))
    df['date'] = [datetime_tostr(t, '%Y-%m-%d') for t in df.pickup_datetime]
    df['pick_hour'] = [t.hour for t in df.pickup_datetime]
    df['drop_hour'] = [t.hour for t in df.dropoff_datetime]

    # Split the frame once per date (sorted order) instead of scanning the
    # whole frame 48 times per date with boolean masks.
    # NOTE(review): 'date' is the PICKUP date, so a trip that ends after
    # midnight is listed under its drop-off hour in the pickup day's file.
    # This matches the previous behaviour -- confirm it is intended.
    for day, day_df in df.groupby('date'):
        out = {
            'pick': _positions_by_hour(
                day_df, 'pick_hour', 'pickup_longitude', 'pickup_latitude'),
            'drop': _positions_by_hour(
                day_df, 'drop_hour', 'dropoff_longitude', 'dropoff_latitude'),
        }
        output_json(out, '../processed_data/daily_hourly_od/%s.json' % day)