Example #1
0
def _prepare_data_trend(window, df):
    """Build a normalized daily-trend input with a leading batch axis.

    The hourly consumption series is summed into daily totals, divided by
    its mean, cast to float32 and returned with shape (1, n_days).

    NOTE: ``window`` is accepted for interface symmetry with the other
    ``_prepare_*`` helpers but is not used here.
    """
    daily_totals = group_sum(df.consumption.values, 24)
    daily_totals = daily_totals / np.mean(daily_totals)
    trend = np.asarray(daily_totals, dtype=np.float32)
    return np.expand_dims(trend, axis=0)
 def _daily_predict(self, series_id, consumption, weekdays, dates):
     """Predict the next 7 daily consumption values for one series.

     For each forecast offset (0-6 days ahead), a lookup key is built from
     the day-off pattern of the trailing input days plus the target day's
     day-off flag.  The key is shortened from the left until a matching set
     of trained weights is found; the prediction is then the dot product of
     the recent daily-summed consumption with those weights.

     Raises:
         KeyError: if no suffix of the key exists in the trained data.
     """
     is_day_off = self._get_is_day_off(weekdays, series_id, dates)
     # Keep only the flags for the trailing input window.
     is_day_off = is_day_off[-self._input_days:]
     org_key = ''.join([str(i) for i in is_day_off])
     pred = []
     for offset in range(7):
         weekday = weekdays[-1]
         date = dates[-1]
         # Advance weekday/date to the target day, offset + 1 days ahead.
         for _ in range(offset + 1):
             weekday = _get_next_weekday(weekday)
         date = _get_next_date(date, offset + 1)
         key = org_key + str(self._is_day_off(weekday, series_id, date))
         while 1:
             if key in self.train_data['daily'][offset]:
                 break
             else:
                 # Fall back to a shorter history: drop the oldest flag.
                 # print(key, 'not found')
                 key = key[1:]
             if not len(key):
                 msg = 'Key not found: %s\tWindow: %s\tOffset: %s' % (
                     org_key, 'daily', offset)
                 raise KeyError(msg)
         # len(key) - 1 chars describe input days (last char is the target),
         # so take that many trailing days of hourly consumption.
         x = consumption[-(len(key) - 1) * 24:]
         x = group_sum(x, 24)
         x = np.expand_dims(x, axis=0)
         weights = self.train_data['daily'][offset][key]['weights']
         # print(consumption.shape, x.shape, weights.shape, key)
         pred.append(x.dot(weights)[0])
     return np.array(pred)
Example #3
0
def _prepare_past_consumption(window, df):
    past_consumption = df.consumption.values.copy()
    if window != 'hourly':
        past_consumption = group_sum(past_consumption, 24)
    mean_consumption = np.mean(past_consumption)
    past_consumption /= mean_consumption
    past_consumption = np.expand_dims(past_consumption, axis=1)
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, axis=0)
    return past_consumption, mean_consumption
Example #4
0
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build training tensors from per-series consumption histories.

    For every series, slides an ``input_days`` window over the (possibly
    daily-aggregated) consumption and emits mean-normalized past values,
    the next step as the target, day-off flags, per-series cluster
    features and an hour-of-day "clock" feature.

    Parameters
    ----------
    df : frame with series_id, consumption and is_holiday columns.
    metadata : unused here; kept for signature compatibility with callers.
    input_days : length of the history window, in days.
    window : 'hourly' keeps hourly resolution; any other value sums the
        consumption into daily totals.
    verbose : when True, wrap the series loop in a tqdm progress bar.

    Returns
    -------
    (x, future_consumption) where x maps 'past_consumption', 'is_day_off',
    'cluster_features_v2' and 'clock' to float32 arrays.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]  # NOTE(review): unused below
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    clock = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = input_days
            # BUGFIX: clock_values was previously undefined on this branch,
            # so any non-hourly window raised NameError when appending the
            # clock feature below.  Hour-of-day is meaningless for daily
            # aggregates, so emit a constant-zero clock instead.
            clock_values = np.zeros(len(consumption), dtype=np.float32)
        else:
            # One 0..1 ramp per day, repeated over all whole days.
            clock_values = np.concatenate([np.linspace(0, 1, 24)] *
                                          (len(sub_df) // 24),
                                          axis=0)
            step = input_days * 24
        for start_idx in range(len(consumption) - step - 1):
            is_day_off.append(days_off[start_idx:start_idx + step])
            past_consumption_values = consumption[start_idx:start_idx + step]
            # The original also computed the (unused) std of the window;
            # only the mean is needed for normalization.
            mean_value = np.mean(past_consumption_values)
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(consumption[start_idx + step] /
                                      mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            clock.append(clock_values[start_idx:start_idx + step])

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Broadcast the static per-series features across all time steps.
    cluster_features_v2 = np.repeat(cluster_features_v2,
                                    is_day_off.shape[1],
                                    axis=1)
    clock = np.array(clock, dtype=np.float32)
    clock = np.expand_dims(clock, 2)

    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'clock': clock,
    }
    return x, future_consumption
Example #5
0
def _prepare_past_consumption(window, df):
    """Normalize consumption and zero-pad it for the forecast horizon.

    For non-hourly windows the series is summed into daily totals first.
    The series is divided by its mean, extended with zeros covering the
    prediction span, and returned as a float32 array of shape
    (1, n_steps + pred_size, 1) together with the normalization mean.
    """
    values = df.consumption.values.copy()
    if window != 'hourly':
        values = group_sum(values, 24)
    mean_consumption = np.mean(values)
    values = values / mean_consumption
    pred_size = WINDOW_TO_PRED_DAYS[window]
    if window == 'hourly':
        pred_size *= 24  # horizon measured in hours, not days
    padded = np.concatenate([values, np.zeros(pred_size)])
    padded = np.asarray(padded[:, np.newaxis], dtype=np.float32)
    return padded[np.newaxis, ...], mean_consumption
Example #6
0
def _prepare_past_consumption(window, df):
    consumption = df.consumption.values
    if window == 'hourly':
        past_consumption = np.reshape(consumption, newshape=(-1, 24))
    else:
        past_consumption = group_sum(consumption, 24)
        past_consumption = np.expand_dims(past_consumption, axis=1)
        if window == 'weekly':
            past_consumption = np.repeat(past_consumption, 2, axis=1)
        else:
            past_consumption = np.repeat(past_consumption, 7, axis=1)
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, axis=0)
    past_consumption = np.transpose(past_consumption, axes=(0, 2, 1))
    return past_consumption
    def _prepare_weekly_data(self, df):
        """Collect weekly training samples for every series in ``df``.

        For each series and each history length, slides over the daily
        consumption and stores (normalized history, normalized weekly
        total) pairs under a key describing the day-off pattern, once for
        each of the two forecast weeks.
        """
        for series_id in tqdm_notebook(df.series_id.unique(),
                                       desc='Preparing data'):
            sub_df = df[df.series_id == series_id]
            daily = group_sum(sub_df.consumption.values, 24)
            is_day_off = self._get_is_day_off_from_df(sub_df)
            for input_days in range(1, 1 + self._input_days):
                n_positions = len(is_day_off) - input_days - 14
                for start_idx in range(n_positions):
                    end_idx = start_idx + input_days
                    key = ''.join([
                        str(flag)
                        for flag in is_day_off[start_idx:end_idx]
                    ])

                    x = np.expand_dims(daily[start_idx:end_idx], axis=0)
                    # Next 14 days collapsed into two weekly totals.
                    y = group_sum(daily[end_idx:end_idx + 14], 7)
                    y_mean = np.mean(y)
                    for offset in range(2):
                        self._add_train_data(x / y_mean,
                                             [y[offset] / y_mean],
                                             'weekly', offset, key)
def visualize_idx(idx, train, train_arrange, preds, metadata):
    """Plot predictions vs. actual consumption for one validation row.

    Draws the predicted series in green (line plus batch-start markers)
    and the actual consumption colored by day-off status (orange = day
    off, blue = working day), titled with the row's NMAE.

    NOTE(review): only 'hourly' and 'daily' windows assign ``batch_size``;
    a 'weekly' row would raise NameError below — confirm callers never
    pass weekly rows here.
    """
    row = train_arrange.loc[idx]
    df = train[train.series_id == row['series_id']]
    # Slice out the train + validation span of this row's series.
    consumption = df.consumption.values[
        row['train_start_idx']:row['val_end_idx']]
    dates = df.timestamp.values[row['train_start_idx']:row['val_end_idx']]
    weekdays = df.weekday.values[row['train_start_idx']:row['val_end_idx']]

    if row['window'] == 'hourly':
        batch_size = 24
    elif row['window'] == 'daily':
        batch_size = 1
        # Collapse hourly rows to one sample per day.
        weekdays = weekdays[::24]
        dates = dates[::24]
        consumption = group_sum(consumption, 24)

    # Predictions overlay the tail of the date range.
    plt.plot(dates[-len(preds[idx]):], preds[idx], color='green', lw=3)
    plt.plot(dates[-len(preds[idx]):][::batch_size],
             preds[idx][::batch_size],
             'o',
             color='green',
             lw=3)
    # Draw the actual series one day (batch) at a time so each segment
    # can be colored by that day's day-off status.
    for i in range(len(dates) // batch_size):
        weekday = weekdays[i * batch_size]
        if _is_day_off(row['series_id'], weekday, metadata):
            color = 'orange'
        else:
            color = 'blue'
        plt.plot(dates[i * batch_size:(i + 1) * batch_size + 1],
                 consumption[i * batch_size:(i + 1) * batch_size + 1],
                 color=color)
        # Marker on the first point of the segment only.
        plt.plot(dates[i * batch_size:(i) * batch_size + 1],
                 consumption[i * batch_size:(i) * batch_size + 1],
                 'o',
                 color=color)
    plt.title('%i Nmae: %.3f' % (idx, row['nmae']))
Example #9
0
def prepare_data_for_train(df,
                           metadata,
                           input_days,
                           window,
                           only_working_days,
                           verbose=True):
    """Build training tensors, keeping only one class of target day.

    Slides a window over each series and keeps only samples whose first
    predicted step matches ``only_working_days`` (True keeps working-day
    targets, False keeps day-off targets).  Past and future consumption
    are normalized by the mean of the past window, adjusted by
    ``normalization_factor``.

    Parameters
    ----------
    df : frame with series_id, consumption, is_holiday, weekday columns.
    metadata : unused here; kept for signature compatibility.
    input_days : history window length, in days.
    window : 'hourly' keeps hourly resolution; otherwise daily totals.
    only_working_days : filter described above.
    verbose : show a tqdm progress bar over series.

    Returns
    -------
    (x, future_consumption) where x maps 'past_consumption',
    'is_day_off', 'cluster_features_v2' and 'weekday' to float32 arrays.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    weekday = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            # BUGFIX: weekdays must be subsampled to daily resolution like
            # days_off; the original indexed the hourly weekday array with
            # daily offsets (cf. the sibling prepare_data_for_train that
            # does weekdays[::24]).
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0,
                len(consumption) - future_samples - past_samples + step, step):
            # Skip samples whose target day is of the excluded class.
            if days_off[start_idx + past_samples] and only_working_days:
                continue
            if not days_off[start_idx +
                            past_samples] and not only_working_days:
                continue
            is_day_off.append(days_off[start_idx:start_idx + past_samples])
            weekday.append(weekdays[start_idx:start_idx + past_samples])
            past_consumption_values = consumption[start_idx:start_idx +
                                                  past_samples]
            mean_value = np.mean(past_consumption_values)
            # Rescale the mean by how the past mix of off/working days
            # relates to the target day's class.
            mean_value *= normalization_factor(
                days_off[start_idx:start_idx + past_samples],
                1 - int(only_working_days))
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + past_samples:start_idx + past_samples +
                            future_samples] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Broadcast the static per-series features across all time steps.
    cluster_features_v2 = np.repeat(cluster_features_v2,
                                    is_day_off.shape[1],
                                    axis=1)
    weekday = np.array(weekday, dtype=np.float32)
    weekday = np.expand_dims(weekday, 2)
    weekday /= 7.  # scale weekday index into [0, 1)

    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'weekday': weekday,
    }
    return x, future_consumption
Example #10
0
def prepare_data_for_train(df, input_days, window, verbose=True):
    """Build training tensors with separate past/future weekday and
    day-off features, normalized per sample.

    Returns
    --------

    ::

        x = {
            'past_consumption': past_consumption,
            'cluster_features_v2': cluster_features_v2,
            'past_weekday': past_weekday,
            'future_weekday': future_weekday,
            'past_day_off': past_day_off,
            'future_day_off': future_day_off,
        }
        return x, future_consumption
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    past_day_off, future_day_off = [], []
    past_weekday, future_weekday = [], []
    cluster_features_v2 = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values

        if window != 'hourly':
            # Collapse hourly rows to daily resolution.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0,
                len(consumption) - future_samples - past_samples + step, step):
            past_idx = start_idx + past_samples
            past_day_off.append(days_off[start_idx:past_idx])
            past_weekday.append(weekdays[start_idx:past_idx])

            future_idx = past_idx + future_samples
            if window == 'weekly':
                # Weekly horizon spans two weeks: (2, 7) day-off flags and
                # one weekday per week.
                future_day_off.append(
                    np.reshape(days_off[past_idx:future_idx], (2, -1)))
                future_weekday.append(weekdays[past_idx:future_idx:7])
            else:
                future_day_off.append(days_off[past_idx:future_idx])
                future_weekday.append(weekdays[past_idx:future_idx])

            past_consumption_values = consumption[start_idx:past_idx]
            mean_value = np.mean(past_consumption_values)
            # Uses the day-off slices appended just above for this sample.
            mean_value *= normalization_factor(past_day_off[-1],
                                               future_day_off[-1])
            if window == 'weekly':
                # Targets are weekly sums of 7 daily values, so scale the
                # normalization mean accordingly.
                mean_value *= 7
            past_consumption.append(past_consumption_values / mean_value)
            if window == 'weekly':
                future_consumption.append(
                    group_sum(consumption[past_idx:future_idx], 7) /
                    mean_value)
            else:
                future_consumption.append(consumption[past_idx:future_idx] /
                                          mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            # TODO: I should do refinement on weekly predictions

    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)

    past_day_off = np.array(past_day_off, dtype=np.float32)
    past_day_off = np.expand_dims(past_day_off, 2)
    future_day_off = np.array(future_day_off, dtype=np.float32)
    if window != 'weekly':
        # Weekly day-off features are already (n, 2, 7); others need the
        # trailing channel axis added.
        future_day_off = np.expand_dims(future_day_off, 2)

    # One-hot encode weekday indices.
    past_weekday = [[_weekday_ohe(weekday) for weekday in week]
                    for week in past_weekday]
    past_weekday = np.array(past_weekday, dtype=np.float32)
    future_weekday = [[_weekday_ohe(weekday) for weekday in week]
                      for week in future_weekday]
    future_weekday = np.array(future_weekday, dtype=np.float32)

    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)

    x = {
        'past_consumption': past_consumption,
        'cluster_features_v2': cluster_features_v2,
        'past_weekday': past_weekday,
        'future_weekday': future_weekday,
        'past_day_off': past_day_off,
        'future_day_off': future_day_off,
    }
    return x, future_consumption
Example #11
0
def test_group_sum(x, group_size, output):
    """Check that group_sum reduces ``x`` to the expected ``output``."""
    result = group_sum(x, group_size)
    assert all(output == result)
Example #12
0
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build training tensors with trend, metadata and cluster features.

    Slides an ``input_days`` window over each series and, per sample,
    emits consumption inputs shaped for the given window, a normalized
    daily trend, metadata one-hot encodings, day-off encodings and
    cluster features.  Consumption is normalized by the mean of the
    target values of the sample.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, data_trend, metadata_ohe = [], [], []
    metadata_days_off = []
    cluster_id_ohe, cluster_features_v2 = [], []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
        # One day-off flag per day, regardless of window.
        series_is_day_off = [
            int(value) for value in sub_df.is_holiday.values[::24]
        ]
        for start_idx in range(
                len(series_is_day_off) - input_days - pred_days + 1):
            # Flags cover both the input and the prediction days.
            is_day_off.append(series_is_day_off[start_idx:start_idx +
                                                input_days + pred_days])
            val_idx = start_idx + input_days
            if window == 'hourly':
                # (n_days, 24): one channel per hour of day.
                x = np.reshape(consumption[start_idx * 24:val_idx * 24],
                               newshape=(-1, 24))
                y = consumption[val_idx * 24:(val_idx + pred_days) * 24]
            else:
                x = consumption[start_idx:val_idx]
                x = np.expand_dims(x, axis=1)
                y = consumption[val_idx:val_idx + pred_days]
                if window == 'weekly':
                    # Two weekly targets; inputs repeated over 2 channels.
                    x = np.repeat(x, 2, axis=1)
                    y = group_sum(y, 7)
                else:
                    x = np.repeat(x, 7, axis=1)
            y_mean = np.mean(y)
            past_consumption.append(x / y_mean)
            future_consumption.append(y / y_mean)
            # Data trend: daily totals of the input window, mean-normalized.
            if window == 'hourly':
                _data_trend = group_sum(
                    consumption[start_idx * 24:val_idx * 24], 24)
            else:
                _data_trend = consumption[start_idx:val_idx].copy()
            # NOTE(review): in-place divide assumes a float dtype here —
            # confirm group_sum returns floats, else this raises on ints.
            _data_trend /= np.mean(_data_trend)
            data_trend.append(_data_trend)
            metadata_ohe.append(_get_metadata_ohe(metadata, series_id))
            metadata_days_off.append(
                _get_metadata_days_off(metadata, series_id))
            cluster_id_ohe.append(get_cluster_ohe(series_id))
            cluster_features_v2.append(get_cluster_features_v2(series_id))

    past_consumption = np.array(past_consumption, dtype=np.float32)
    # (n, steps, channels) -> (n, channels, steps).
    past_consumption = np.transpose(past_consumption, axes=(0, 2, 1))
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    # Encode working days as -1 rather than 0.
    is_day_off[is_day_off == 0] = -1
    data_trend = np.array(data_trend, dtype=np.float32)
    metadata_ohe = np.array(metadata_ohe, dtype=np.float32)
    metadata_days_off = np.array(metadata_days_off, dtype=np.float32)
    cluster_id_ohe = np.array(cluster_id_ohe, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'data_trend': data_trend,
        'metadata_ohe': metadata_ohe,
        'metadata_days_off': metadata_days_off,
        'cluster_id_ohe': cluster_id_ohe,
        'cluster_features_v2': cluster_features_v2,
    }
    return x, future_consumption