Example #1
import matplotlib.pyplot as plt

import utils

def test_score(predicted, actual, plot_file):
    # compute the SMAPE of each predicted series against its actual series
    scores = [utils.smape(actual=a, predicted=p)
              for p, a in zip(predicted, actual)]
    plt.figure(figsize=(10, 8))
    plt.plot(range(len(scores)), scores, 'b-', label='score')
    plt.legend(loc='lower right')
    plt.savefig(plot_file)
    plt.show()
    return scores
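The utils.smape helper is not shown in these examples; a minimal sketch, assuming the conventional symmetric-MAPE definition reported on a 0-200 scale, might look like this:

import numpy as np

def smape(actual, predicted):
    # symmetric mean absolute percentage error on a 0-200 scale
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    denom = np.abs(actual) + np.abs(predicted)
    # count points where both values are zero as zero error
    safe_denom = np.where(denom == 0, 1.0, denom)
    ratio = np.where(denom == 0, 0.0, 2.0 * np.abs(predicted - actual) / safe_denom)
    return 100.0 * np.mean(ratio)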
Example #2
    def error(self, data, times=None, metric='mape'):
        """
        Model prediction error.

        data : array-like
            Observed values to compare the simulation against.
        times : array-like, optional
            Times at which to simulate. Defaults to ``numpy.arange(len(data))``.
        metric : str
            Error metric to use. One of "mape", "smape", "logaccratio", or
            "rmse". Default: "mape".
        """
        if times is None:
            times = numpy.arange(len(data))
        y = self.simulate(times)
        if metric == 'mape':
            return mape(y, data)
        elif metric == 'smape':
            return smape(y, data)
        elif metric == 'logaccratio':
            return logaccratio(y, data)
        elif metric == 'rmse':
            return numpy.sqrt(self.cost_)
        else:
            raise ValueError("No such metric: {}".format(metric))
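The mape and logaccratio helpers used above are likewise not shown. Hedged sketches, assuming MAPE in percent and the log-accuracy ratio as the mean absolute log of the predicted-to-actual ratio (the scaling and the use of the mean rather than the median are assumptions), could be:

import numpy

def mape(predicted, actual):
    # mean absolute percentage error, in percent (assumes no zeros in actual)
    predicted = numpy.asarray(predicted, dtype=float)
    actual = numpy.asarray(actual, dtype=float)
    return 100.0 * numpy.mean(numpy.abs((actual - predicted) / actual))

def logaccratio(predicted, actual):
    # mean absolute log accuracy ratio (assumes strictly positive values)
    predicted = numpy.asarray(predicted, dtype=float)
    actual = numpy.asarray(actual, dtype=float)
    return numpy.mean(numpy.abs(numpy.log(predicted / actual)))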
Example #3
def stacked_lstm_multi_step_forecast(series,
                                     validation_series,
                                     input_length,
                                     horizon,
                                     del_outliers=False,
                                     normalize=False,
                                     plot=False):
    """
    Perform forecasting of a time series using an lstm neural network. The network is trained using samples of shape
    input_length (corresponding to the last input_length days) to predict an array of horizon values (corresponding to
    horizon days). In this case, the network predicts horizon days at the time. Performance of the trained network is
    assessed on a validation series. The size of the validation series must be horizon.

    :param series:
    :param validation_series:
    :param input_length:
    :param horizon:
    :param del_outliers:
    :param normalize:
    :param plot:
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)

    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)

    else:
        scaler = None

    # the input sequence is our data; np.log1p is applied and MAE loss is used to approximate the SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict n_steps_out
    n_steps_in, n_steps_out = input_length, horizon

    # split into samples
    train_samples, train_targets = split_sequence(train_series, n_steps_in,
                                                  n_steps_out)

    # univariate series: each timestep carries a single feature (the series value)
    n_features = 1
    train_samples = train_samples.reshape(
        (train_samples.shape[0], train_samples.shape[1], n_features))

    # create the model
    model = Sequential()
    model.add(
        LSTM(256, activation='relu', input_shape=(n_steps_in, n_features)))

    # we predict n_steps_out values
    model.add(Dense(n_steps_out))

    # with data transformed by log1p (and later inverted with expm1), 'mae' approximates the SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction

    # input is the last n_steps_in values of the train series (working_series is not log1p transformed)
    validation_in_sample = np.log1p(
        np.array(working_series.values[-n_steps_in:]))
    validation_in_sample = validation_in_sample.reshape(
        (1, n_steps_in, n_features))
    validation_forecast = model.predict(validation_in_sample, verbose=0)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:

        # first reverse log1p using expm1
        validation_forecast = np.expm1(validation_forecast)

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast

    else:

        # save the forecast in the dataframe (flatten the (1, horizon) prediction to a 1-D array)
        forecast_dataframe['forecast'] = np.expm1(validation_forecast).flatten()

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of LSTM with input size " + str(n_steps_in) +
                  " output size " + str(n_steps_out))

        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']
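The split_sequence helper used above to build the training samples is not included in this snippet; a minimal sketch of such a windowing function, assuming it returns (samples, targets) NumPy arrays, is:

import numpy as np

def split_sequence(sequence, n_steps_in, n_steps_out):
    # slide a window over the series: each sample holds n_steps_in consecutive
    # values and each target holds the following n_steps_out values
    sequence = np.asarray(sequence)
    samples, targets = [], []
    for start in range(len(sequence) - n_steps_in - n_steps_out + 1):
        samples.append(sequence[start:start + n_steps_in])
        targets.append(sequence[start + n_steps_in:start + n_steps_in + n_steps_out])
    return np.array(samples), np.array(targets)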
Example #4
def nn_with_past_outliers_single_step_forecast(series,
                                               validation_series,
                                               input_length,
                                               horizon,
                                               del_outliers=False,
                                               normalize=False,
                                               plot=False):
    """
    Perform forecasting of a time series using a simple neural network with a single 128 neurons hidden layer.
    The network is trained using samples of shape input_length (corresponding to the last input_length days) to predict
    an array of horizon values (corresponding to horizon days). In this case, the network predicts one day at the time.
    Performance of the trained network is assessed on a validation series. This is computed by repeating one day
    predictions and shifting the input values. The size of the validation series must be horizon.

    This function differs from nn_single_step_forecast as in addition to the last input_length days, we also use the
    value from the same day the previous year as an input to the network. This value is normalized but contains the
    outliers. The hope is to gain information from the previous year.

    :param series:
    :param validation_series:
    :param input_length:
    :param horizon:
    :param del_outliers:
    :param normalize:
    :param plot:
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)

    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
        scaler_bis, working_series_with_outliers = normalize_series(series)
    else:
        scaler = None
        working_series_with_outliers = series

    # the input sequence is our data; np.log1p is applied and MAE loss is used to approximate the SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict one step
    n_steps_in, n_steps_out = input_length, 1

    # split into samples, adding the sample from the previous year
    # the multi-step implementation can be reused here since single-step is a special case of multi-step
    train_samples, train_targets = split_sequence_nn_with_past_outliers_multi_step(
        train_series, working_series_with_outliers, n_steps_in, n_steps_out)

    # create the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=n_steps_in + 1))

    # we predict n_steps_out values
    model.add(Dense(n_steps_out))

    # with data transformed by log1p (and later inverted with expm1), 'mae' approximates the SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction

    # we start by applying log1p to the working series; new one-day predictions will be appended to this series
    # as they are produced and will be used as inputs for the next forecasting step
    working_series_values = np.log1p(working_series.values)

    # perform horizon predictions
    for i in range(horizon):
        # input contains the value from the previous year for the forecast day
        validation_in_sample = np.append(
            np.array(working_series_with_outliers[-365 + 1]),
            np.array(working_series_values[-n_steps_in:]))
        validation_in_sample = validation_in_sample.reshape(
            (1, n_steps_in + 1))

        validation_forecast = model.predict(validation_in_sample, verbose=0)

        working_series_values = np.append(working_series_values,
                                          validation_forecast)

        working_series_with_outliers = np.append(working_series_with_outliers,
                                                 validation_forecast)

    # take the last horizon values from the series (this is the forecast for the validation series)
    validation_forecast = working_series_values[-horizon:]

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:

        # first reverse log1p using expm1
        validation_forecast = np.expm1(validation_forecast)

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast

    else:

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = np.expm1(validation_forecast)

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of simple NN with input size " +
                  str(n_steps_in) + " output size " + str(n_steps_out))

        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']
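Two helpers used throughout these examples, normalize_series and remove_outliers, are also external. Plausible sketches, assuming min-max scaling via scikit-learn and an IQR-based clipping rule (the specific outlier rule is an assumption), are:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_series(series):
    # fit a min-max scaler on the series and return it together with the
    # scaled series, preserving the original index
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(series.values.reshape(-1, 1))
    return scaler, pd.Series(scaled.flatten(), index=series.index)

def remove_outliers(series):
    # clip values outside 1.5 IQR of the quartiles (hypothetical rule)
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)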
Example #5
def train_model(city,
                air_code,
                model_name='model_a',
                train_mother=True,
                train_child=True,
                test=False,
                special_date=False,
                special_startday=None,
                special_endday=None,
                min_aq=-999,
                max_aq=1e9):
    # prepare the training data set
    #####################################################################
    meo_codes = [
        'temperature', 'pressure', 'humidity', 'wind_direction',
        'wind_speed/kph'
    ]
    aq_stations_file = './data/{}_aq_stations.csv'.format(city.lower())
    aq_data_root = './data/from_aq/'
    meo_data_root = './data/from_grid/'
    #####################################################################

    ###############################################################
    window = 1 * 24
    predict_step = 1 * 24
    predict_hours = 2 * 24
    normalized = 2
    file_norm = './data/norm_pars_{}_{}_{}.csv'.format(city.lower(), air_code,
                                                       model_name)
    folder_norm = os.path.join('./model/', model_name, city, air_code)
    ##############################################################

    if not os.path.isdir(folder_norm):
        os.makedirs(folder_norm)

    ########################################################################
    train_rate = 0.8
    ######################################################################

    x_stations_obs = {}
    x_stations_pre = {}
    y_stations = {}
    all_data = []
    total_samples = 0
    data_aq_station = pd.read_csv(aq_stations_file, usecols=[1])
    aq_stations = data_aq_station.values
    for station in aq_stations:
        meo_file = os.path.join(
            meo_data_root,
            'from_grid_{}_{}.csv'.format(city.lower(), station[0]))
        aq_file = os.path.join(
            aq_data_root, 'from_aq_{}_{}.csv'.format(city.lower(), station[0]))
        x_obs = []
        x_pre = []
        y = []
        data_aq = pd.read_csv(aq_file)
        data_aq = data_aq[air_code]
        data_aq = data_aq.values
        data_aq = data_aq.astype('float32')
        data_meo = pd.read_csv(meo_file)
        data_meo = data_meo[meo_codes]
        data_meo = data_meo.values
        data_meo = data_meo.astype('float32')
        len_hours = min(data_aq.shape[0], data_meo.shape[0])
        data_aq = data_aq.reshape((data_aq.shape[0], 1))
        data = np.concatenate(
            (data_aq[:len_hours, :], data_meo[:len_hours, :]), axis=1)
        all_data.append(data)
        for i in range(0, (len_hours - window - predict_hours), predict_step):
            if min(data[i:i + window + predict_hours, 0]) < 0 or min(
                    data[i:i + window + predict_hours, 0]) < min_aq or max(
                        data[i:i + window + predict_hours, 0]) > max_aq:
                continue
            day = i // 24
            if special_date:
                if day >= special_startday and day < special_endday:
                    y.append(data[i + window:i + window + predict_hours, 0])
                    x_obs.append(data[i:i + window, :])
                    x_pre.append(data[i + window:i + window + predict_hours,
                                      1:])

            else:
                y.append(data[i + window:i + window + predict_hours, 0])
                x_obs.append(data[i:i + window, :])
                x_pre.append(data[i + window:i + window + predict_hours, 1:])
        y = np.array(y)
        x_obs = np.array(x_obs)
        x_pre = np.array(x_pre)
        x_stations_obs[station[0]] = x_obs
        x_stations_pre[station[0]] = x_pre
        y_stations[station[0]] = y
        total_samples += y.shape[0]

    # stack data from all stations to compute normalization statistics
    data_for_statistic = np.concatenate(all_data)

    means = data_for_statistic.mean(axis=0)
    stds = data_for_statistic.std(axis=0)
    maxs = data_for_statistic.max(axis=0)
    mins = data_for_statistic.min(axis=0)
    with open(file_norm, 'w') as f:
        f.write(',')
        f.write(air_code + ',')
        for code in meo_codes:
            f.write(code + ',')
        f.write('\n')
        f.write('means:,')
        for mean in means:
            f.write(str(mean) + ',')
        f.write('\n')
        f.write('stds:,')
        for std in stds:
            f.write(str(std) + ',')
        f.write('\n')
        f.write('maxs:,')
        for max_ in maxs:
            f.write(str(max_) + ',')
        f.write('\n')
        f.write('mins:,')
        for min_ in mins:
            f.write(str(min_) + ',')
        f.write('\n')

    if mins[0] < 0:
        mins[0] = 0

    norm_y = utils.normalization(normalized, means[0], stds[0], maxs[0],
                                 mins[0])
    norm_x_obs = utils.normalization(normalized, means, stds, maxs, mins)
    norm_x_pre = utils.normalization(normalized, means[1:], stds[1:], maxs[1:],
                                     mins[1:])

    if train_mother:
        norm_y.save(os.path.join(folder_norm, 'norm_y.json'))
        norm_x_obs.save(os.path.join(folder_norm, 'norm_x_obs.json'))
        norm_x_pre.save(os.path.join(folder_norm, 'norm_x_pre.json'))

    if special_date:
        for station in aq_stations:
            key = station[0]
            if y_stations[key].shape[0] == 0:
                continue
            days = np.array(range(y_stations[key].shape[0]))
            np.random.shuffle(days)
            x_stations_obs[key] = x_stations_obs[key][days]
            x_stations_pre[key] = x_stations_pre[key][days]
            y_stations[key] = y_stations[key][days]

    i = 0
    for station in aq_stations:
        key = station[0]
        if y_stations[key].shape[0] == 0:
            continue
        train_row = round(train_rate * y_stations[key].shape[0])
        if i == 0:
            x_train_1 = x_stations_obs[key][:train_row, :, :]
            x_train_2 = x_stations_pre[key][:train_row, :, :]
            y_train = y_stations[key][:train_row, :]
            x_test_1 = x_stations_obs[key][train_row:, :, :]
            x_test_2 = x_stations_pre[key][train_row:, :, :]
            y_test = y_stations[key][train_row:, :]
        else:
            x_train_1 = np.concatenate(
                (x_train_1, x_stations_obs[key][:train_row, :, :]))
            x_train_2 = np.concatenate(
                (x_train_2, x_stations_pre[key][:train_row, :, :]))
            y_train = np.concatenate((y_train, y_stations[key][:train_row, :]))
            x_test_1 = np.concatenate(
                (x_test_1, x_stations_obs[key][train_row:, :, :]))
            x_test_2 = np.concatenate(
                (x_test_2, x_stations_pre[key][train_row:, :, :]))
            y_test = np.concatenate((y_test, y_stations[key][train_row:, :]))

        i += 1

    x_train_1 = norm_x_obs(x_train_1)
    x_train_2 = norm_x_pre(x_train_2)
    y_train = norm_y(y_train)
    x_test_1 = norm_x_obs(x_test_1)
    x_test_2 = norm_x_pre(x_test_2)
    y_test = norm_y(y_test)

    # training parameters
    ######################################################################
    lr = 1e-5
    batch_size = 512
    epochs = 2000

    dim_rnn = [256, 256, 512]
    dim_dense = [512, 256, 128, 64]
    drop = 0.2
    activations = ['relu', 'sigmoid']

    root_model = os.path.join('./model/', model_name, city, air_code)
    model_structure_file = './model/{}/{}.png'.format(model_name, model_name)
    #####################################################################

    if not os.path.isdir(root_model):
        os.mkdir(root_model)

    test_total_scores = {}

    #-----------------------------------------------------------------------------------------------------
    '''
    train a model with data in all stations
    '''
    if train_mother:

        input_shape_obs = (window, len(meo_codes) + 1)
        input_shape_pre = (predict_hours, len(meo_codes))
        output_shape = predict_hours

        optimizer = keras.optimizers.RMSprop(lr=lr)
        loss_func = my_loss.loss_smape_rmse
        model = models.model_f(input_shape_obs,
                               input_shape_pre,
                               output_shape,
                               opt=optimizer,
                               loss=loss_func,
                               dim_rnn=dim_rnn,
                               dim_dense=dim_dense,
                               drop=drop,
                               activations=activations)
        model.summary()
        plot_model(model,
                   to_file=model_structure_file,
                   show_shapes=True,
                   show_layer_names=True)

        print('x_train_obs shape:', x_train_1.shape)
        print('x_train_pre shape:', x_train_2.shape)
        print('train samples:', x_train_1.shape[0])
        print('test samples:', x_test_1.shape[0])

        hist = model.fit([x_train_1, x_train_2],
                         y_train,
                         batch_size=batch_size,
                     epochs=epochs,
                         verbose=1,
                         validation_data=([x_test_1, x_test_2], y_test))

        model.save(os.path.join(root_model, model_name + '.h5'))

        _predicted, _y, scores = test_model(root_model, model_name, model,
                                            [x_test_1, x_test_2], [y_test],
                                            norm_y)

        test_total_scores[model_name] = utils.smape(_y, _predicted)
        print("total_score:{}".format(utils.smape(_y, _predicted)))

        compare_predict_actual(
            _y, _predicted,
            os.path.join(root_model, '{}_test.png'.format(model_name)))

        K.clear_session()

    #------------------------------------------------------------------------------------------------------------

    #-----------------------------------------------------------------------------------------------------------------------------------
    '''
    train models for each station with the mother model
    '''

    if train_child:
        for station in aq_stations:
            s_model_name = model_name + '_' + station[0]
            key = station[0]
            print('*' * 5 + key + '*' * 5)

            train_row = round(train_rate * y_stations[key].shape[0])
            s_x_train_1 = x_stations_obs[key][:train_row, :, :]
            s_x_train_2 = x_stations_pre[key][:train_row, :, :]
            s_y_train = y_stations[key][:train_row, :]
            s_x_test_1 = x_stations_obs[key][train_row:, :, :]
            s_x_test_2 = x_stations_pre[key][train_row:, :, :]
            s_y_test = y_stations[key][train_row:, :]

            s_x_train_1 = norm_x_obs(s_x_train_1)
            s_x_train_2 = norm_x_pre(s_x_train_2)
            s_y_train = norm_y(s_y_train)
            s_x_test_1 = norm_x_obs(s_x_test_1)
            s_x_test_2 = norm_x_pre(s_x_test_2)
            s_y_test = norm_y(s_y_test)

            input_shape_obs = (window, len(meo_codes) + 1)
            input_shape_pre = (predict_hours, len(meo_codes))
            output_shape = predict_hours

            model = keras.models.load_model(
                os.path.join(root_model, model_name + '.h5'),
                custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
            reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                          factor=0.2,
                                          patience=5,
                                          cooldown=10,
                                          min_lr=1e-8)
            early_stopping = EarlyStopping(monitor='val_loss', patience=100)
            model.summary()

            print('x_train_obs shape:', s_x_train_1.shape)
            print('x_train_pre shape:', s_x_train_2.shape)
            print('train samples:', s_x_train_1.shape[0])
            print('test samples:', s_x_test_1.shape[0])

            hist = model.fit([s_x_train_1, s_x_train_2],
                             s_y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=1,
                             shuffle=True,
                             validation_data=([s_x_test_1,
                                               s_x_test_2], s_y_test),
                             callbacks=[reduce_lr, early_stopping])

            model.save(os.path.join(root_model, s_model_name + '.h5'))

            _predicted, _y, scores = test_model(root_model, s_model_name,
                                                model,
                                                [s_x_test_1, s_x_test_2],
                                                [s_y_test], norm_y)

            test_total_scores[s_model_name] = utils.smape(_y, _predicted)
            print("total_score:{}".format(utils.smape(_y, _predicted)))

            compare_predict_actual(
                _y, _predicted,
                os.path.join(root_model, '{}_test.png'.format(s_model_name)))

            K.clear_session()

    #---------------------------------------------------------------------------------------------------------------------------------------

    #****************************************************************************#
    '''
    save scores
    '''
    with open(os.path.join(root_model, 'scores.json'), 'w') as f:
        json.dump(test_total_scores, f)

    #***************************************************************************#

    if test:
        test_total_scores = {}
        model = keras.models.load_model(
            os.path.join(root_model, model_name + '.h5'),
            custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
        _predicted, _y, scores = test_model(root_model, model_name, model,
                                            [x_test_1, x_test_2], [y_test],
                                            norm_y)

        test_total_scores[model_name] = utils.smape(_y, _predicted)
        print("total_score:{}".format(utils.smape(_y, _predicted)))

        K.clear_session()

        for station in aq_stations:
            s_model_name = model_name + '_' + station[0]
            key = station[0]

            train_row = round(train_rate * y_stations[key].shape[0])
            s_x_train_1 = x_stations_obs[key][:train_row, :, :]
            s_x_train_2 = x_stations_pre[key][:train_row, :, :]
            s_y_train = y_stations[key][:train_row, :]
            s_x_test_1 = x_stations_obs[key][train_row:, :, :]
            s_x_test_2 = x_stations_pre[key][train_row:, :, :]
            s_y_test = y_stations[key][train_row:, :]

            s_x_train_1 = norm_x_obs(s_x_train_1)
            s_x_train_2 = norm_x_pre(s_x_train_2)
            s_y_train = norm_y(s_y_train)
            s_x_test_1 = norm_x_obs(s_x_test_1)
            s_x_test_2 = norm_x_pre(s_x_test_2)
            s_y_test = norm_y(s_y_test)

            model = keras.models.load_model(
                os.path.join(root_model, s_model_name + '.h5'),
                custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
            _predicted, _y, scores = test_model(root_model, s_model_name,
                                                model,
                                                [s_x_test_1, s_x_test_2],
                                                [s_y_test], norm_y)

            test_total_scores[s_model_name] = utils.smape(_y, _predicted)
            print("total_score:{}".format(utils.smape(_y, _predicted)))

            K.clear_session()

        with open(os.path.join(root_model, 'ztest_scores.json'), 'w') as f:
            json.dump(test_total_scores, f)
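my_loss.loss_smape_rmse is project-specific and not shown. A hedged sketch of such a combined SMAPE-plus-RMSE loss written with Keras backend ops (the equal weighting of the two terms is an assumption) could be:

from keras import backend as K

def loss_smape_rmse(y_true, y_pred):
    # SMAPE term on a 0-2 scale; K.epsilon() guards against division by zero
    smape = K.mean(2.0 * K.abs(y_pred - y_true) /
                   (K.abs(y_true) + K.abs(y_pred) + K.epsilon()))
    # RMSE term
    rmse = K.sqrt(K.mean(K.square(y_pred - y_true)))
    # equal weighting is an assumption; the original may balance the terms differently
    return smape + rmse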
Example #6
    def test_standard(self, predict_file, predict_time):
        '''
        Calculate scores based on the official evaluation.
        '''

        root = os.path.join(os.path.split(predict_file)[0], 'test')
        predict_file_name = os.path.splitext(os.path.split(predict_file)[1])[0]
        forecast_hours = self.request_paras['forecast']

        #live data
        bien = self.request_paras['bien']
        request_real_data_aq(bien['cities'], forecast_hours, root,
                             predict_time)
        utils.clip_real_data_aq(root,
                                self.request_paras['stations'],
                                now=predict_time,
                                backward_hours=forecast_hours,
                                interpolate=False)
        p_aq_codes = {
            'PM2.5': 'PM25_Concentration',
            'PM10': 'PM10_Concentration',
            'O3': 'O3_Concentration'
        }
        obs = {}
        for station_id in self.station_infos:
            station_info = self.station_infos[station_id]
            aq_file = os.path.join(root, 'bien', 'aq', station_id + '.csv')
            pd_aq = pd.read_csv(aq_file)
            obs_station = {}
            for p in station_info['pollutions']:
                aq = pd_aq[p_aq_codes[p]]
                obs_station[p] = aq
            obs[station_id] = obs_station

        # predicted air quality data
        pd_predict = pd.read_csv(predict_file,
                                 usecols=['test_id', 'PM2.5', 'PM10', 'O3'],
                                 index_col=['test_id'])
        pre = {}
        for station_id in self.station_infos:
            station_info = self.station_infos[station_id]
            station_ids = []
            for i in range(forecast_hours):
                station_ids.append('{}#{}'.format(station_id, i))
            pre_station = {}
            for p in station_info['pollutions']:
                # .ix is removed in modern pandas; .loc performs the label-based lookup
                pre_station[p] = pd_predict[p].loc[station_ids]
            pre[station_id] = pre_station

        #scores
        score = {}
        for station_id in self.station_infos:
            station_info = self.station_infos[station_id]

            obss = []
            for p in station_info['pollutions']:
                obss.append(obs[station_id][p].values)
            obss = np.array(obss)
            for i in range(obss.shape[1]):
                if np.isnan(obss[:, i]).any() or (obss[:, i] < 0).any():
                    obss[:, i] = np.nan
            i = 0
            for p in station_info['pollutions']:
                obs[station_id][p] = obss[i]
                i += 1

            score_station = {}
            for p in station_info['pollutions']:
                predicts = pre[station_id][p].values
                observes = obs[station_id][p]
                score_station[p] = utils.smape(observes, predicts)
                dic = {'obs': observes, 'pre': predicts}
                pd_data = pd.DataFrame(dic)
                pd_data.to_csv(
                    os.path.join(
                        root, predict_file_name +
                        '{}_{}_scores.csv'.format(station_id, p)))
            score[station_id] = score_station

        with open(os.path.join(root, predict_file_name + '_scores.csv'),
                  'w') as f:
            for station_id in self.station_infos:
                station_info = self.station_infos[station_id]
                f.write(station_id)
                f.write('\n')
                for p in station_info['pollutions']:
                    f.write(p + ',')
                    f.write(str(score[station_id][p]) + ',')
                f.write('\n')

        with open(os.path.join(root, predict_file_name + '_scorelist.csv'),
                  'w') as f:
            f.write('station_id,PM2.5,PM10,O3\n')
            for station_id in self.station_infos:
                station_info = self.station_infos[station_id]
                f.write(station_id)
                f.write(',')
                line = ''
                for p in station_info['pollutions']:
                    line += (str(score[station_id][p]) + ',')
                line = line[:-1]
                f.write(line)
                f.write('\n')

        return score
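Invalid observation hours are set to NaN before scoring above, so utils.smape presumably masks missing values; a NaN-aware variant (smape_ignore_nan is a hypothetical name) might look like:

import numpy as np

def smape_ignore_nan(actual, predicted):
    # drop positions where either value is missing before scoring
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    mask = ~np.isnan(actual) & ~np.isnan(predicted)
    actual, predicted = actual[mask], predicted[mask]
    denom = np.abs(actual) + np.abs(predicted)
    safe_denom = np.where(denom == 0, 1.0, denom)
    ratio = np.where(denom == 0, 0.0, 2.0 * np.abs(predicted - actual) / safe_denom)
    return 100.0 * np.mean(ratio)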
Example #7
def arima_forecast(series,
                   validation_series,
                   horizon,
                   order,
                   seasonal_order,
                   del_outliers=False,
                   normalize=False,
                   plot=False):
    """
    Creates an arima model with the provided order and seasonal order and assess performance of the model is on a
    validation series.

    :param series:
    :param validation_series:
    :param horizon:
    :param order:
    :param seasonal_order:
    :param del_outliers:
    :param normalize:
    :param plot:
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)

    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)

    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # create the model with the provided parameters and fit
    model = arima.ARIMA(order=order,
                        seasonal_order=seasonal_order,
                        suppress_warnings=True)

    model.fit(train_series)

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:

        # no log transform was applied here, so the forecast only needs denormalizing
        validation_forecast = f_autoarima

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast

    else:

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = f_autoarima

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of arima model with order " + str(order) +
                  " seasonal order " + str(seasonal_order))

        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']
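A hypothetical usage sketch for arima_forecast (the synthetic series and the (1, 1, 1) x (1, 1, 1, 7) orders are illustrative, not values from the original project):

import numpy as np
import pandas as pd

# synthetic daily series with weekly seasonality; the last 14 days are held out
index = pd.date_range('2019-01-01', periods=400, freq='D')
values = 100 + 10 * np.sin(2 * np.pi * np.arange(400) / 7) + np.random.randn(400)
data = pd.Series(values, index=index)

score, forecast = arima_forecast(data[:-14], data[-14:], horizon=14,
                                 order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
print('validation SMAPE:', score)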
Example #8
def auto_arima_forecast(series,
                        validation_series,
                        horizon,
                        del_outliers=False,
                        normalize=False,
                        plot=False):
    """
    Fits an auto arima model from the series to find the best parameters. Performance of the trained model is assessed
    on a validation series.

    :param series:
    :param validation_series:
    :param horizon:
    :param del_outliers:
    :param normalize:
    :param plot:
    :return: SMAPE for the validation series, the forecast validation series, order, seasonal_order
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)

    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)

    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # perform search for best parameters and fit
    model = auto_arima(train_series,
                       seasonal=True,
                       max_D=2,
                       m=7,
                       trace=True,
                       error_action='ignore',
                       suppress_warnings=True,
                       stepwise=True)

    order = model.get_params()['order']
    seasonal_order = model.get_params()['seasonal_order']

    # auto_arima already fits the model during the search, so no additional fit call is needed

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:

        # no log transform was applied here, so the forecast only needs denormalizing
        validation_forecast = f_autoarima

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast

    else:

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = f_autoarima

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of auto arima model")

        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']
                 ), forecast_dataframe['forecast'], order, seasonal_order
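Since this variant also returns the orders it finds, one natural pattern (a sketch, reusing the synthetic data series from the arima_forecast example above) is to run the search once and pass the result to arima_forecast:

# search for the best orders on one split
score, forecast, order, seasonal_order = auto_arima_forecast(
    data[:-14], data[-14:], horizon=14)
print('auto ARIMA SMAPE:', score, 'order:', order, 'seasonal:', seasonal_order)

# reuse the discovered orders with the plain ARIMA helper
score2, forecast2 = arima_forecast(data[:-14], data[-14:], horizon=14,
                                   order=order, seasonal_order=seasonal_order)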