Example #1
def quick_load_bj_taxi_external(timeslots, description, length):
    """
    load external data quickly, if time slots is the same
    :param timeslots: the demanded time
    :param description: global or local
    :return:
    """
    start_time = str(timeslots[0][0])
    end_time = str(timeslots[-1][0])
    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'

    filename = 'bj_taxi_external_{}_{}_{}_{}.h5'.format(
        description, length, start_time, end_time)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray([utils.encode_time(batch) for batch in timeslots])

    # if the same, load directly
    if 'timeslots' in f and (encode_time == f['timeslots'][()]).all():

        print('cache load bj taxi {} external data from {} to {}'.format(
            description, start_time, end_time))
        print('-' * 30)

        vacation = f['vacation'][()]
        dayOfWeek = f['dayOfWeek'][()]
        weather = f['weather'][()]
        continuous_external = f['continuous_external'][()]
        hour = f['hour'][()]

        f.close()
        return vacation, hour, dayOfWeek, weather, continuous_external

    f.close()

    # else calculate then cache
    f = h5py.File(os.path.join(path, filename), 'w')

    print('calculate bj taxi {} external data from {} to {}'.format(
        description, start_time, end_time))
    if description == 'global':
        vacation, hour, dayOfWeek, weather, continuous_external = \
            load_bj_taxi_external(timeslots, 'bj taxi global external data')
    else:
        vacation, hour, dayOfWeek, weather, continuous_external = \
            load_bj_taxi_external(timeslots, 'bj taxi local external data')

    print('cache bj taxi {} external data from {} to {}'.format(
        description, start_time, end_time))

    f['timeslots'] = encode_time
    f['vacation'] = vacation
    f['hour'] = hour
    f['dayOfWeek'] = dayOfWeek
    f['weather'] = weather
    f['continuous_external'] = continuous_external

    f.close()
    return vacation, hour, dayOfWeek, weather, continuous_external
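
A minimal usage sketch for the loader above. The slot strings and the length value are illustrative assumptions rather than values taken from the original pipeline (real batches come from utils.get_flow_data), and the snippet assumes the module-level imports these examples rely on (os, math, h5py, numpy as np, and the project's utils module) are in place.

# Hedged usage sketch: demo_slots and the length of 2 are made-up values.
demo_slots = [['2015110101', '2015110102'], ['2015110102', '2015110103']]
vacation, hour, dayOfWeek, weather, continuous_external = \
    quick_load_bj_taxi_external(demo_slots, 'global', 2)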
Example #2
def quick_load_ny_bike_external(timeslots, description, length):
    """
    load external data quickly, if time slots is the same
    :param timeslots: the demanded time
    :param description: global or local
    :return:
    """
    start_time = str(timeslots[0][0])
    end_time = str(timeslots[-1][0])
    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'

    filename = 'ny_bike_external_{}_{}_{}_{}.h5'.format(
        description, length, start_time, end_time)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray([utils.encode_time(batch) for batch in timeslots])

    # if the same, load directly
    if 'timeslots' in f and (encode_time == f['timeslots'][()]).all():

        print('cache load ny bike {} external data from {} to {}'.format(
            description, start_time, end_time))
        print('-' * 30)

        dayOfWeek = f['dayOfWeek'][()]
        hour = f['hour'][()]

        f.close()
        return hour, dayOfWeek

    f.close()

    # else calculate then cache
    f = h5py.File(os.path.join(path, filename), 'w')

    print('calculate ny bike {} external data from {} to {}'.format(
        description, start_time, end_time))
    if description == 'global':
        hour, dayOfWeek = load_ny_bike_external(
            timeslots, 'ny bike global external data')
    else:
        hour, dayOfWeek = load_ny_bike_external(timeslots,
                                                'ny bike local external data')

    print('cache ny bike {} external data from {} to {}'.format(
        description, start_time, end_time))

    f['timeslots'] = encode_time
    f['hour'] = hour
    f['dayOfWeek'] = dayOfWeek

    f.close()
    return hour, dayOfWeek
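
A brief usage sketch with the same caveat as above: the slot strings and the length value are illustrative assumptions only.

# Hedged usage sketch for the NY bike loader; values are illustrative only.
demo_slots = [['2016070101', '2016070102'], ['2016070102', '2016070103']]
hour, dayOfWeek = quick_load_ny_bike_external(demo_slots, 'local', 2)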
Example #3
def quick_get_data(dataset, predict_time, index_cut, neighbor_size,
                   lstm_seq_len, flow_gate_len, att_lstm_num,
                   att_lstm_seq_len):
    """
    load cached STDN baseline data quickly when the cached predict_time and
    tensor shapes match the requested configuration; otherwise recompute and
    cache them
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])
    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'

    # include att_lstm_seq_len in the cache name so that different
    # configurations do not collide on the same file
    filename = '{}_baseline_stdn_neighbor_{}_{}_{}_{}_{}.h5'.format(
        dataset, neighbor_size, lstm_seq_len, flow_gate_len, att_lstm_num,
        att_lstm_seq_len)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray([utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load stdn {} data from {} to {}, neighbor is {}'.format(
            dataset, start_time, end_time, neighbor_size))
        print('-' * 30)

        # data_name: module-level collection of the STDN tensor key names
        train_dict = {key + '_train': f[key + '_train'][()]
                      for key in data_name}
        test_dict = {key + '_test': f[key + '_test'][()] for key in data_name}
        external_dim = f['external_dim'][()]

        # only reuse the cache when the stored tensor shapes match the
        # requested configuration
        if (train_dict['lstm_input_train'].shape[1] == lstm_seq_len
                and train_dict['flow_input_train'].shape[3] == neighbor_size * 2 + 1
                and train_dict['att_nbhd_input_train'].shape[1] == att_lstm_num * att_lstm_seq_len
                and train_dict['att_lstm_input_train'].shape[1] == att_lstm_num):
            f.close()
            return train_dict, test_dict, external_dim

    f.close()

    # else calculate then cache
    f = h5py.File(os.path.join(path, filename), 'w')

    print('calculate stdn {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))
    train_dict, test_dict, external_dim = get_data(
        dataset, predict_time, index_cut, neighbor_size, lstm_seq_len,
        flow_gate_len, att_lstm_num, att_lstm_seq_len)

    print('cache stdn {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    f['predict_time'] = encode_time

    for data in [test_dict, train_dict]:
        for key, value in data.items():
            f[key] = value

    f['external_dim'] = external_dim

    f.close()
    return train_dict, test_dict, external_dim
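
A sketch of how this STDN cache helper might be called. The dataset name and hyper-parameter values below are assumptions, and predict_time/index_cut are placeholders for the outputs of the project's flow-loading utilities.

# Hedged usage sketch: 'bj_taxi' and the hyper-parameters are assumed values;
# predict_time and index_cut are expected to come from the flow loaders
# elsewhere in this project.
train_dict, test_dict, external_dim = quick_get_data(
    'bj_taxi', predict_time, index_cut, neighbor_size=2,
    lstm_seq_len=7, flow_gate_len=4, att_lstm_num=3, att_lstm_seq_len=3)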
Example #4
def quick_get_data(dataset, predict_time, index_cut, neighbor_size, len_close,
                   len_period, len_trend):
    """
    load st-res data quickly if there is the file
    :param predict_time:
    :param index_cut:
    :param neighbor_size:
    :param len_close:
    :param len_period:
    :param len_trend:
    :return:
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])
    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'

    filename = '{}_baseline_stres_neighbor_{}_{}_{}_{}.h5'.format(
        dataset, neighbor_size, len_close, len_period, len_trend)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray(
        [utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load st-res {} data from {} to {}, neighbor is {}'.format(
            dataset, start_time, end_time, neighbor_size))
        print('-' * 30)

        X_train = f['X_train'][()]
        X_test = f['X_test'][()]
        Y_train = f['Y_train'][()]
        Y_test = f['Y_test'][()]
        T_train = f['T_train'][()]
        T_test = f['T_test'][()]
        External_train = f['External_train'][()]
        External_test = f['External_test'][()]
        external_dim = f['external_dim'][()]

        f.close()
        return X_train, X_test, Y_train, Y_test, T_train, T_test, External_train, External_test, external_dim

    f.close()

    # else calculate then cache
    f = h5py.File(os.path.join(path, filename), 'w')

    print('calculate st-res {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    X_train, X_test, Y_train, Y_test, T_train, T_test, External_train, External_test, external_dim = \
        get_data(dataset, predict_time, index_cut, neighbor_size, len_close, len_period, len_trend)

    encode_T_train = np.asarray(
        [utils.encode_time(batch) for batch in T_train])
    encode_T_test = np.asarray([utils.encode_time(batch) for batch in T_test])

    print('cache st-res {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    f['predict_time'] = encode_time
    f['X_train'] = X_train
    f['X_test'] = X_test
    f['Y_train'] = Y_train
    f['Y_test'] = Y_test
    f['T_train'] = encode_T_train
    f['T_test'] = encode_T_test
    f['External_train'] = External_train
    f['External_test'] = External_test
    f['external_dim'] = external_dim

    f.close()
    return X_train, X_test, Y_train, Y_test, T_train, T_test, External_train, External_test, external_dim
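
A comparable sketch for the ST-ResNet variant; the dataset name and sequence lengths are illustrative assumptions, and predict_time/index_cut are placeholders for the flow-loader outputs.

# Hedged usage sketch: parameter values are assumptions, not project defaults.
X_train, X_test, Y_train, Y_test, T_train, T_test, \
    External_train, External_test, external_dim = quick_get_data(
        'bj_taxi', predict_time, index_cut, neighbor_size=2,
        len_close=3, len_period=1, len_trend=1)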
Example #5
def quick_get_data(dataset, predict_time, hours, days, weeks):
    """
    load astgcn data quickly if there is the file
    :param predict_time:
    :param index_cut:
    :param neighbor_size:
    :param hours:
    :param days:
    :param weeks:
    :param predict_len: 1 default
    :return:
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])
    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'

    filename = '{}_baseline_astgcn_{}_{}_{}.h5'.format(dataset, hours, days,
                                                       weeks)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray(
        [utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load astgcn {} data from {} to {}'.format(
            dataset, start_time, end_time))
        print('-' * 30)

        X_train = f['X_train'][()]
        X_test = f['X_test'][()]
        Y_train = f['Y_train'][()]
        Y_test = f['Y_test'][()]
        T_train = f['T_train'][()]
        T_test = f['T_test'][()]

        f.close()
        return X_train, X_test, Y_train, Y_test, T_train, T_test

    f.close()

    # else calculate then cache
    f = h5py.File(os.path.join(path, filename), 'w')

    print('calculate astgcn {} data from {} to {}'.format(
        dataset, start_time, end_time))

    X_train, X_test, Y_train, Y_test, T_train, T_test = \
        get_data(dataset, predict_time, hours, days, weeks)

    encode_T_train = np.asarray(
        [utils.encode_time(batch) for batch in T_train])
    encode_T_test = np.asarray([utils.encode_time(batch) for batch in T_test])

    print('cache astgcn {} data from {} to {}'.format(dataset, start_time,
                                                      end_time))

    f['predict_time'] = encode_time
    f['X_train'] = X_train
    f['X_test'] = X_test
    f['Y_train'] = Y_train
    f['Y_test'] = Y_test
    f['T_train'] = encode_T_train
    f['T_test'] = encode_T_test

    f.close()
    return X_train, X_test, Y_train, Y_test, T_train, T_test
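
A matching sketch for the ASTGCN helper; the dataset name and sequence lengths are assumed values, and predict_time is a placeholder for the flow-loader output.

# Hedged usage sketch: the hours/days/weeks values are illustrative only.
X_train, X_test, Y_train, Y_test, T_train, T_test = quick_get_data(
    'ny_bike', predict_time, hours=3, days=1, weeks=1)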
Example #6
def load_ny_bike(proportion_test,
                 len_global=7,
                 len_local=4,
                 neighbor_size=3,
                 region_num=5):
    """
    load all the data
    args:
        len_global: the time length of global data
        len_local: the time length of local data
        neighbor_size: the local size, size = (val * 2 + 1) * (val * 2 + 1)
        region_num: how many regions that a map contains
    ret:
        train set and test set, including:
        1. global external and flow data
        2. local external and flow data
        3. ground truth
    """
    date, data, mmn, index = load_ny_bike_flow()

    # get global and local flow data, ground truth and the corresponding date
    (global_flow, stack_local_flow, ground_truth, current_local_flow,
     index_cut, predict_time, global_timeslots, local_timeslots) = \
        utils.get_flow_data(date, data, len_global, len_local, neighbor_size,
                            region_num, unit_len=24, width=16, height=8)

    # get global and local external data
    g_hour, g_dayOfWeek = quick_load_ny_bike_external(global_timeslots,
                                                      'global', len_global)

    t_hour, t_dayOfWeek = quick_load_ny_bike_external(local_timeslots, 'local',
                                                      len_local)

    # encode the time strings as byte strings
    predict_time = np.asarray(utils.encode_time(predict_time))
    global_timeslots = np.asarray(
        [utils.encode_time(batch) for batch in global_timeslots])
    local_timeslots = np.asarray(
        [utils.encode_time(batch) for batch in local_timeslots])

    # split into train and test sets according to len_test
    data_dict = {
        'global_flow': global_flow,
        'stack_local_flow': stack_local_flow,
        'ground_truth': ground_truth,
        'current_local_flow': current_local_flow,
        'index_cut': index_cut,
        'predict_time': predict_time,
        'global_timeslots': global_timeslots,
        'local_timeslots': local_timeslots,
        'g_hour': g_hour,
        'g_dayOfWeek': g_dayOfWeek,
        't_hour': t_hour,
        't_dayOfWeek': t_dayOfWeek
    }

    data_dict = utils.duplicate_data(data_dict, region_num)

    total_length = g_dayOfWeek.shape[0]
    len_test = math.ceil(total_length * proportion_test)
    len_train = total_length - len_test
    print('train set length {:d}\ntest set length {:d}\n{}'.format(
        len_train, len_test, '-' * 30))

    train_set, test_set, data_name = utils.divide_train_and_test(
        len_test, data_dict)

    print('predict start: {}\npredict end: {}'.format(
        data_dict["predict_time"][0].decode('utf-8'),
        data_dict["predict_time"][-1].decode('utf-8')))
    print('-' * 30)

    return train_set, test_set, mmn, data_name
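
A one-line usage sketch for the full NY bike loader; the 0.1 test proportion is an assumed value, not a project default.

# Hedged usage sketch: 10% test split is an assumption.
train_set, test_set, mmn, data_name = load_ny_bike(proportion_test=0.1)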
Example #7
def load_rdw_ny_bike(proportion_test,
                     len_recent=4,
                     len_daily=4,
                     len_week=4,
                     neighbor_size=2,
                     region_num=5):
    """
    load all the data
    args:
        len_global: the time length of global data
        len_local: the time length of local data
        neighbor_size: the local size, size = (val * 2 + 1) * (val * 2 + 1)
        region_num: how many regions that a map contains
    ret:
        train set and test set, including:
        1. global external and flow data
        2. local external and flow data
        3. ground truth
    """
    date, data, mmn, index = load_ny_bike_flow()

    # get recent, daily, weekly and current flow data, the ground truth and
    # the corresponding dates
    (recent_local_flow, daily_local_flow, week_local_flow, ground_truth,
     current_local_flow, index_cut, predict_time, recent_time, daily_time,
     week_time, current_time) = \
        utils.get_flow_rdw_data(date, data, len_recent, len_daily, len_week,
                                neighbor_size, region_num, unit_len=24,
                                width=16, height=8)

    # get recent, daily, week, current external data
    recent_hour, recent_dayOfWeek = quick_load_ny_bike_external(
        recent_time, 'recent', len_recent)
    daily_hour, daily_dayOfWeek = quick_load_ny_bike_external(
        daily_time, 'daily', len_daily)
    week_hour, week_dayOfWeek = quick_load_ny_bike_external(
        week_time, 'week', len_week)
    current_hour, current_dayOfWeek = quick_load_ny_bike_external(
        current_time, 'current', 1)

    # encode the time strings as byte strings
    predict_time = np.asarray(utils.encode_time(predict_time))
    recent_time = np.asarray(
        [utils.encode_time(batch) for batch in recent_time])
    daily_time = np.asarray([utils.encode_time(batch) for batch in daily_time])
    week_time = np.asarray([utils.encode_time(batch) for batch in week_time])
    current_time = np.asarray(
        [utils.encode_time(batch) for batch in current_time])

    # split into train and test sets according to len_test
    data_dict = {
        'recent_local_flow': recent_local_flow,
        'daily_local_flow': daily_local_flow,
        'week_local_flow': week_local_flow,
        'current_local_flow': current_local_flow,
        'ground_truth': ground_truth,
        'index_cut': index_cut,
        'predict_time': predict_time,
        'recent_time': recent_time,
        'daily_time': daily_time,
        'week_time': week_time,
        'current_time': current_time,
        'recent_hour': recent_hour,
        'recent_dayOfWeek': recent_dayOfWeek,
        'daily_hour': daily_hour,
        'daily_dayOfWeek': daily_dayOfWeek,
        'week_hour': week_hour,
        'week_dayOfWeek': week_dayOfWeek,
        'current_hour': current_hour,
        'current_dayOfWeek': current_dayOfWeek
    }

    data_dict = utils.duplicate_rdw_data(data_dict, region_num)

    total_length = current_time.shape[0]
    len_test = math.ceil(total_length * proportion_test)
    len_train = total_length - len_test
    print('train set length {:d}\ntest set length {:d}\n{}'.format(
        len_train, len_test, '-' * 30))

    train_set, test_set, data_name = utils.divide_train_and_test(
        len_test, data_dict)

    print('predict start: {}\npredict end: {}'.format(
        data_dict["predict_time"][0].decode('utf-8'),
        data_dict["predict_time"][-1].decode('utf-8')))
    print('-' * 30)

    return train_set, test_set, mmn, data_name
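
The recent/daily/weekly variant can be driven the same way; the split ratio below is again an assumption.

# Hedged usage sketch: the split ratio and the default lengths are assumptions.
train_set, test_set, mmn, data_name = load_rdw_ny_bike(proportion_test=0.1)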