def process_train():
    '''
    Keep only the training trips whose pickup AND dropoff both lie inside the
    bounding box, then save them to '../processed_data/train.csv'.

    The bounding box comes from the module-level names lon_min, lon_max,
    lat_min and lat_max; all four bounds are inclusive.
    The number of remaining rows is printed after each filtering step.
    :return: None (the result is written to disk without the index column)
    '''
    df = read_data('../data/train.csv')
    print('Total samples:', len(df))

    # Step 1: pickup point must be inside the box (between() is inclusive).
    pickup_inside = (df.pickup_longitude.between(lon_min, lon_max)
                     & df.pickup_latitude.between(lat_min, lat_max))
    df = df.loc[pickup_inside]
    print('After preprocessing1', len(df))

    # Step 2: dropoff point must be inside the box as well.
    dropoff_inside = (df.dropoff_longitude.between(lon_min, lon_max)
                      & df.dropoff_latitude.between(lat_min, lat_max))
    df = df.loc[dropoff_inside]
    # Distinct label so the two counts can be told apart in the log.
    print('After preprocessing2', len(df))

    df = df.reset_index(drop=True)
    df.to_csv('../processed_data/train.csv', index=False)
def get_df_routes(yy=2016, mm=3, day=1, hh_start=9, hh_end=10):
    '''
    Query a route for every trip of '../data/train.csv' whose pickup time lies
    in [hh_start:00:00, hh_end:00:00) of the given day, and write the result to
    '../processed_data/routes/<yy>-<mm>-<day>_<hh_start>-<hh_end>.json'.

    Output {'year':, 'month':, 'day':, 'start_hour':, 'end_hour':,
            'routes': [one dict per trip]}
    Each per-trip dict is whatever get_routes(origin, destination) returns,
    extended with 'pickup_datetime', 'dropoff_datetime' (both formatted by
    datetime_tostr) and 'duration' (dropoff - pickup, in seconds).
    NOTE(review): earlier notes list the get_routes keys as 'distance',
    'google_duration', 'routes' ([[lon, lat]]) and 'pre_road_time' -- confirm
    against get_routes itself.
    :param yy: year of the day to extract
    :param mm: month of the day to extract
    :param day: day of month to extract
    :param hh_start: first hour of the window (inclusive)
    :param hh_end: hour at which the window ends (exclusive); 24 means
                   "until the end of the day"
    :return: None
    :raises ValueError: if hh_end is outside 0..24
    '''
    if not 0 <= hh_end <= 24:
        raise ValueError('hh_end must be in 0..24')

    df = read_data('../data/train.csv')
    window_start = datetime.datetime(yy, mm, day, hh_start)
    # Exclusive upper bound.  Adding a timedelta to midnight also covers
    # hh_end == 24 (next day's midnight) and keeps trips picked up in the
    # last second of the window, which "end - 1s" combined with "<" dropped.
    window_end = (datetime.datetime(yy, mm, day) +
                  datetime.timedelta(hours=hh_end))
    df = df[(window_start <= df.pickup_datetime)
            & (df.pickup_datetime < window_end)]
    print(len(df))

    routes = []
    trips = zip(df.pickup_longitude, df.pickup_latitude,
                df.dropoff_longitude, df.dropoff_latitude,
                df.pickup_datetime, df.dropoff_datetime)
    for row, (pick_lon, pick_lat, drop_lon, drop_lat, picked,
              dropped) in enumerate(trips):
        out = get_routes([pick_lon, pick_lat], [drop_lon, drop_lat])
        out['pickup_datetime'] = datetime_tostr(picked)
        out['dropoff_datetime'] = datetime_tostr(dropped)
        out['duration'] = (dropped - picked).total_seconds()
        routes.append(out)
        print('Row %d: duration = %f' % (row, out['duration']))

    result = {
        'year': yy,
        'month': mm,
        'day': day,
        'start_hour': hh_start,
        'end_hour': hh_end,
        'routes': routes,
    }
    output_json(
        result, '../processed_data/routes/%d-%d-%d_%d-%d.json' %
        (yy, mm, day, hh_start, hh_end))
def cal_distance_ratio():
    '''
    Compare the Google route distance of every trip with the distance that
    cal_distance() gives between its pickup and dropoff points.

    Trips with google_distance >= 1000000 are ignored.  All ratios and their
    minimum are printed; only ratios within [1, 2] are written to
    '../statistics/distance_ratio.csv' (column 'ratio') and averaged.
    :return: mean of the kept ratios
    '''
    trips = read_data('../processed_data/test_train_google_19999.csv')
    trips = trips[trips.google_distance < 1000000].reset_index(drop=True)

    # Iterate the underlying arrays so every operand stays a numpy scalar,
    # exactly like element access on the columns.
    columns = zip(trips.google_distance.values,
                  trips.pickup_longitude.values,
                  trips.pickup_latitude.values,
                  trips.dropoff_longitude.values,
                  trips.dropoff_latitude.values)
    all_ratios = []
    for google, pick_lon, pick_lat, drop_lon, drop_lat in columns:
        direct = cal_distance([pick_lon, pick_lat], [drop_lon, drop_lat])
        all_ratios.append(google / direct)
    print(all_ratios)
    print(min(all_ratios))

    kept = [value for value in all_ratios if 1 <= value <= 2]
    frame = pd.DataFrame({'ratio': kept})
    frame.to_csv('../statistics/distance_ratio.csv', index=False)
    return np.mean(kept)  # 1.3540919895360521
def cal_day_order_speed():
    '''
    Compute, for every pickup date, the number of orders and the speed per
    pickup hour.

    Speed of an hour = sum(actual_distance) / sum(trip_duration), where
    actual_distance is cal_distance(pickup, dropoff) scaled by a fixed ratio.
    For every day:
        print and save a DataFrame['hour', 'speed', 'order']
        to '../processed_data/order_speed/<date>.csv'
    :return:
    '''
    trips = read_data('../processed_data/train.csv')
    print('Total samples:', len(trips))

    distance_ratio = 1.3540919895360521
    summed = ['actual_distance', 'trip_duration', 'order']

    trips['date'] = [
        datetime_tostr(stamp, '%Y-%m-%d') for stamp in trips['pickup_datetime']
    ]
    trips['hour'] = [stamp.hour for stamp in trips['pickup_datetime']]
    endpoints = zip(trips['pickup_longitude'].values,
                    trips['pickup_latitude'].values,
                    trips['dropoff_longitude'].values,
                    trips['dropoff_latitude'].values)
    trips['actual_distance'] = [
        distance_ratio * cal_distance([pick_lon, pick_lat],
                                      [drop_lon, drop_lat])
        for pick_lon, pick_lat, drop_lon, drop_lat in endpoints
    ]
    trips['order'] = 1
    # Some inputs name the duration column 'duration' instead.
    if 'trip_duration' not in trips.columns:
        trips['trip_duration'] = trips.duration

    for day in np.unique(trips['date']):
        day_trips = trips[trips['date'] == day]
        print('Day %s: total order = %d' % (day, len(day_trips)))
        hourly = pd.pivot_table(day_trips[['hour'] + summed],
                                index=['hour'],
                                values=summed,
                                aggfunc=[np.sum])['sum']
        hourly['speed'] = hourly['actual_distance'] / hourly['trip_duration']
        hourly = hourly[['speed', 'order']].reset_index()
        hourly = hourly.rename(index=str, columns={'index': 'hour'})
        print(hourly)
        target = '../processed_data/order_speed/%s.csv' % day
        hourly.to_csv(target, index=False)
def cal_daily_order():
    '''
    Count the trips picked up on each calendar date.

    Reads '../processed_data/train.csv', then saves and prints a
    DataFrame['date', 'count'] at
    '../processed_data/daily_order_count/daily_order.csv'.
    :return:
    '''
    trips = read_data('../processed_data/train.csv')
    print('Total samples:', len(trips))

    trips['date'] = [
        datetime_tostr(stamp, '%Y-%m-%d') for stamp in trips['pickup_datetime']
    ]
    # One row per date; len() over the 'id' column gives the trip count.
    per_day = pd.pivot_table(trips[['date', 'id']],
                             index=['date'],
                             values=['id'],
                             aggfunc=[len])
    per_day = per_day['len'].reset_index()
    per_day = per_day.rename(index=str,
                             columns={'index': 'date', 'id': 'count'})

    target = '../processed_data/daily_order_count/daily_order.csv'
    per_day.to_csv(target, index=False)
    print(per_day)
def output_daily_hour_origin_destination():
    '''
    Dump, for every pickup date, the pickup and dropoff coordinates grouped
    by hour to '../processed_data/daily_hourly_od/<date>.json'.

    For every day:
        {'pick': {'6': [[lon, lat]]}, 'drop': {'6': [[lon, lat]]}}
    Every hour key '0'..'23' is present; an hour without trips maps to [].
    NOTE(review): a trip is assigned to the date of its PICKUP even in the
    'drop' table, so a dropoff after midnight is listed under the previous
    day -- confirm this is intended.
    :return:
    '''
    df = read_data('../processed_data/train.csv')
    print('Total samples:', len(df))

    df['date'] = [datetime_tostr(t, '%Y-%m-%d') for t in df.pickup_datetime]
    df['pick_hour'] = [t.hour for t in df.pickup_datetime]
    df['drop_hour'] = [t.hour for t in df.dropoff_datetime]

    for day in np.unique(df.date):
        # Select the day's trips once instead of re-scanning the whole
        # table for each of the 24 hours and both directions.
        day_df = df[df.date == day]

        picks = {}
        drops = {}
        for hour in range(24):
            picked = day_df[day_df.pick_hour == hour]
            dropped = day_df[day_df.drop_hour == hour]
            picks[str(hour)] = [
                [lon, lat]
                for lon, lat in zip(picked.pickup_longitude.values,
                                    picked.pickup_latitude.values)
            ]
            drops[str(hour)] = [
                [lon, lat]
                for lon, lat in zip(dropped.dropoff_longitude.values,
                                    dropped.dropoff_latitude.values)
            ]

        out = {'pick': picks, 'drop': drops}
        output_json(out, '../processed_data/daily_hourly_od/%s.json' % day)