Code Example #1
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    clock = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        if window != 'hourly':
            # Aggregate hourly consumption into daily totals and keep one
            # holiday flag per day. Note that clock_values is never defined on
            # this branch, so this variant is only usable with window == 'hourly'.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = input_days
        else:
            # Hour-of-day "clock" feature in [0, 1], repeated for every full
            # day in the series.
            clock_values = np.concatenate([np.linspace(0, 1, 24)] *
                                          (len(sub_df) // 24),
                                          axis=0)
            step = input_days * 24
        for start_idx in range(len(consumption) - step - 1):
            is_day_off.append(days_off[start_idx:start_idx + step])
            past_consumption_values = consumption[start_idx:start_idx + step]
            mean_value, std_value = np.mean(past_consumption_values), np.std(
                past_consumption_values)
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(consumption[start_idx + step] /
                                      mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            clock.append(clock_values[start_idx:start_idx + step])

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    cluster_features_v2 = np.repeat(cluster_features_v2,
                                    is_day_off.shape[1],
                                    axis=1)
    clock = np.array(clock, dtype=np.float32)
    clock = np.expand_dims(clock, 2)

    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'clock': clock,
    }
    return x, future_consumption
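These snippets come from a larger module and rely on shared helpers and imports that are not shown (WINDOW_TO_PRED_DAYS, group_sum, get_cluster_features_v2, tqdm_notebook, numpy). The sketch below is a guess at what they look like, inferred purely from how the examples call them; the values in WINDOW_TO_PRED_DAYS and the body of group_sum are assumptions, not the original implementations.

import numpy as np
from tqdm import tqdm_notebook  # notebook-friendly progress bar


# Assumed prediction horizons in days for each forecast window; the original
# module may use different values.
WINDOW_TO_PRED_DAYS = {'hourly': 1, 'daily': 7, 'weekly': 14}


def group_sum(values, group_size):
    # Sum consecutive blocks of `group_size` values, e.g. 24 hourly readings
    # into one daily total. Inferred from usage; not the original code.
    values = np.asarray(values, dtype=np.float32)
    n_groups = len(values) // group_size
    return values[:n_groups * group_size].reshape(n_groups,
                                                  group_size).sum(axis=1)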
Code Example #2
def _prepare_cluster_features_v2(series_id):
    cluster_ohe = get_cluster_features_v2(series_id)
    cluster_ohe = np.array(cluster_ohe, dtype=np.float32)
    cluster_ohe = np.expand_dims(cluster_ohe, axis=0)
    return cluster_ohe
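Assuming get_cluster_features_v2 returns a flat one-hot vector for the series' cluster, this helper only adds a leading batch axis so the result can be fed to a model alongside batched inputs. A hypothetical shape check, for some series_id already in scope:

# Hypothetical: for a length-N one-hot cluster vector, the helper returns an
# array of shape (1, N), ready to be stacked with other per-sample features.
cluster_ohe = _prepare_cluster_features_v2(series_id)
assert cluster_ohe.ndim == 2 and cluster_ohe.shape[0] == 1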
Code Example #3
def prepare_data_for_train(df,
                           metadata,
                           input_days,
                           window,
                           only_working_days,
                           verbose=True):
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    weekday = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = 1
            past_samples = input_days
            future_samples = WINDOW_TO_PRED_DAYS[window]
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        # Slide a window over the series and keep only the samples whose
        # target day matches the requested regime (working day vs. day off).
        for start_idx in range(
                0,
                len(consumption) - future_samples - past_samples + step, step):
            target_is_day_off = days_off[start_idx + past_samples]
            if target_is_day_off and only_working_days:
                continue
            if not target_is_day_off and not only_working_days:
                continue
            is_day_off.append(days_off[start_idx:start_idx + past_samples])
            weekday.append(weekdays[start_idx:start_idx + past_samples])
            past_consumption_values = consumption[start_idx:start_idx +
                                                  past_samples]
            mean_value = np.mean(past_consumption_values)
            mean_value *= normalization_factor(
                days_off[start_idx:start_idx + past_samples],
                1 - int(only_working_days))
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + past_samples:start_idx + past_samples +
                            future_samples] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    cluster_features_v2 = np.repeat(cluster_features_v2,
                                    is_day_off.shape[1],
                                    axis=1)
    weekday = np.array(weekday, dtype=np.float32)
    weekday = np.expand_dims(weekday, 2)
    weekday /= 7.

    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'weekday': weekday,
    }
    return x, future_consumption
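This variant adds an only_working_days switch, which suggests that separate training sets (and presumably separate models) are built for working days and for days off. A hypothetical call pattern; df, metadata and the argument values are placeholders, not taken from the original project:

# Hypothetical usage: build one dataset per regime so that a working-day model
# and a day-off model can be trained separately. Argument values are
# illustrative only.
x_work, y_work = prepare_data_for_train(
    df, metadata, input_days=7, window='daily', only_working_days=True)
x_off, y_off = prepare_data_for_train(
    df, metadata, input_days=7, window='daily', only_working_days=False)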
Code Example #4
def prepare_data_for_train(df, input_days, window, verbose=True):
    """
    Returns
    --------

    ::

        x = {
            'past_consumption': past_consumption,
            'cluster_features_v2': cluster_features_v2,
            'past_weekday': past_weekday,
            'future_weekday': future_weekday,
            'past_day_off': past_day_off,
            'future_day_off': future_day_off,
        }
        return x, future_consumption
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    past_day_off, future_day_off = [], []
    past_weekday, future_weekday = [], []
    cluster_features_v2 = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values

        if window != 'hourly':
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0,
                len(consumption) - future_samples - past_samples + step, step):
            past_idx = start_idx + past_samples
            past_day_off.append(days_off[start_idx:past_idx])
            past_weekday.append(weekdays[start_idx:past_idx])

            future_idx = past_idx + future_samples
            if window == 'weekly':
                # Weekly targets span whole weeks: reshape the day-off flags
                # to one row per predicted week and keep one weekday value
                # per week.
                future_day_off.append(
                    np.reshape(days_off[past_idx:future_idx], (2, -1)))
                future_weekday.append(weekdays[past_idx:future_idx:7])
            else:
                future_day_off.append(days_off[past_idx:future_idx])
                future_weekday.append(weekdays[past_idx:future_idx])

            past_consumption_values = consumption[start_idx:past_idx]
            mean_value = np.mean(past_consumption_values)
            mean_value *= normalization_factor(past_day_off[-1],
                                               future_day_off[-1])
            if window == 'weekly':
                mean_value *= 7
            past_consumption.append(past_consumption_values / mean_value)
            if window == 'weekly':
                future_consumption.append(
                    group_sum(consumption[past_idx:future_idx], 7) /
                    mean_value)
            else:
                future_consumption.append(consumption[past_idx:future_idx] /
                                          mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            # TODO: I should do refinement on weekly predictions

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)

    past_day_off = np.array(past_day_off, dtype=np.float32)
    past_day_off = np.expand_dims(past_day_off, 2)
    future_day_off = np.array(future_day_off, dtype=np.float32)
    if window != 'weekly':
        future_day_off = np.expand_dims(future_day_off, 2)

    past_weekday = [[_weekday_ohe(weekday) for weekday in week]
                    for week in past_weekday]
    past_weekday = np.array(past_weekday, dtype=np.float32)
    future_weekday = [[_weekday_ohe(weekday) for weekday in week]
                      for week in future_weekday]
    future_weekday = np.array(future_weekday, dtype=np.float32)

    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)

    x = {
        'past_consumption': past_consumption,
        'cluster_features_v2': cluster_features_v2,
        'past_weekday': past_weekday,
        'future_weekday': future_weekday,
        'past_day_off': past_day_off,
        'future_day_off': future_day_off,
    }
    return x, future_consumption
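_weekday_ohe is not shown; from the way it is mapped over integer weekday values, it presumably one-hot encodes a pandas-style weekday. A plausible sketch, written as an assumption rather than the original helper:

def _weekday_ohe(weekday):
    # Assumed helper: one-hot encode a pandas weekday (0 = Monday ... 6 = Sunday)
    # into a length-7 float vector.
    ohe = np.zeros(7, dtype=np.float32)
    ohe[int(weekday)] = 1.0
    return ohe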
Code Example #5
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, data_trend, metadata_ohe = [], [], []
    metadata_days_off = []
    cluster_id_ohe, cluster_features_v2 = [], []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
        series_is_day_off = [
            int(value) for value in sub_df.is_holiday.values[::24]
        ]
        for start_idx in range(
                len(series_is_day_off) - input_days - pred_days + 1):
            is_day_off.append(series_is_day_off[start_idx:start_idx +
                                                input_days + pred_days])
            val_idx = start_idx + input_days
            if window == 'hourly':
                # One row of 24 hourly values per past day.
                x = np.reshape(consumption[start_idx * 24:val_idx * 24],
                               newshape=(-1, 24))
                y = consumption[val_idx * 24:(val_idx + pred_days) * 24]
            else:
                # One daily total per past day, tiled across channels
                # (2 for weekly, 7 otherwise) to mirror the target's shape.
                x = consumption[start_idx:val_idx]
                x = np.expand_dims(x, axis=1)
                y = consumption[val_idx:val_idx + pred_days]
                if window == 'weekly':
                    x = np.repeat(x, 2, axis=1)
                    y = group_sum(y, 7)
                else:
                    x = np.repeat(x, 7, axis=1)
            y_mean = np.mean(y)
            past_consumption.append(x / y_mean)
            future_consumption.append(y / y_mean)
            # Data trend
            if window == 'hourly':
                _data_trend = group_sum(
                    consumption[start_idx * 24:val_idx * 24], 24)
            else:
                _data_trend = consumption[start_idx:val_idx].copy()
            _data_trend /= np.mean(_data_trend)
            data_trend.append(_data_trend)
            metadata_ohe.append(_get_metadata_ohe(metadata, series_id))
            metadata_days_off.append(
                _get_metadata_days_off(metadata, series_id))
            cluster_id_ohe.append(get_cluster_ohe(series_id))
            cluster_features_v2.append(get_cluster_features_v2(series_id))

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.transpose(past_consumption, axes=(0, 2, 1))
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off[is_day_off == 0] = -1  # working days -> -1, days off stay +1
    data_trend = np.array(data_trend, dtype=np.float32)
    metadata_ohe = np.array(metadata_ohe, dtype=np.float32)
    metadata_days_off = np.array(metadata_days_off, dtype=np.float32)
    cluster_id_ohe = np.array(cluster_id_ohe, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'data_trend': data_trend,
        'metadata_ohe': metadata_ohe,
        'metadata_days_off': metadata_days_off,
        'cluster_id_ohe': cluster_id_ohe,
        'cluster_features_v2': cluster_features_v2,
    }
    return x, future_consumption
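The returned x is a dict of named float32 arrays, the kind of input Keras accepts when a model's Input layers carry matching names. A hypothetical training call, assuming df, metadata and a compiled Keras model are already in scope:

# Hypothetical: Keras matches the dict keys ('past_consumption', 'is_day_off',
# 'data_trend', ...) to Input layers with the same names.
x, y = prepare_data_for_train(df, metadata, input_days=7, window='daily')
model.fit(x, y, batch_size=64, epochs=10, validation_split=0.1)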
Code Example #6
def test_that_all_series_id_have_v2_features(all_series_ids):
    # Passes as long as no series_id raises (e.g. a KeyError) when its v2
    # cluster features are looked up.
    for series_id in all_series_ids:
        get_cluster_features_v2(series_id)
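The test depends on an all_series_ids fixture that is not included here. A plausible pytest fixture; the file name and the way the dataframe is loaded are assumptions:

import pandas as pd
import pytest


@pytest.fixture
def all_series_ids():
    # Assumed fixture: collect every series_id from the training data so the
    # test exercises the full set. The CSV path is a placeholder.
    df = pd.read_csv('train.csv')
    return df.series_id.unique()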