# Example n. 1 (score: 0)
def qf_single_state_prediction(state, lookback, horizon, predictors):
    """
    RQF WITHOUT CLUSTER SERIES: fit one quantile-forest model per city of
    the state, using only the city's own lagged series as features.
    :param state: 2-letter code for state
    :param lookback: number of steps of history to use
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None
    """

    if state == "CE":
        s = 'Ceará'
    else:
        s = state
    cities = list(get_cities_from_state(s))

    for city in cities:
        # BUG FIX: the skip check used an absolute '/saved_models/...' path
        # while results are written under the relative 'saved_models/...'
        # tree, so finished cities were always re-run.
        if os.path.isfile('saved_models/quantile_forest_no_cluster/{}/qf_metrics_{}.pkl'.format(state, city)):
            print(city, 'done')
            continue
        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)

        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        # BUG FIX: dropna() returns a new frame; the result was discarded.
        data_lag = data_lag.dropna()
        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                # drop the trailing (d-1) rows, which have no future value
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X_data, data_lag[target],
                                                            train_size=0.7, test_size=0.3, shuffle=False)

        city_name = get_city_names([city, 0])[0][1]
        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error', 'explained_variance_score',
                                      'mean_squared_error', 'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            tgt = targets[d][:len(X_train)]   # in-sample target
            tgtt = targets[d][len(X_train):]  # out-of-sample target

            model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
            pred = model.predict(X_data[:len(targets[d])], quantile=50)

            # pad with NaN so every horizon column has the same length
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred

            pred_m = model.predict(X_test[(d - 1):])
            metrics[d] = calculate_metrics(pred_m, tgtt)

        metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest_no_cluster', state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train))
def cluster_viz(geocode, clusters):
    """Build a HoloViews layout for the cluster containing *geocode*:
    an incidence heatmap stacked over the overlaid incidence curves."""
    data, group = get_cluster_data(geocode=geocode, clusters=clusters,
                                   data_types=DATA_TYPES, cols=['casos'])

    city_names = dict(get_city_names(group))
    # long-format frame: one row per (week, city) incidence value
    df_hm = pd.melt(data.reset_index().rename(columns={'index': 'week'}),
                    id_vars=['week'], var_name='city', value_name='incidence')
    # 'casos_<geocode>' column labels -> human-readable city names
    df_hm['city'] = [city_names[int(re.sub('casos_', '', col))]
                     for col in df_hm.city]

    curve_opts = dict(line_width=10, line_alpha=0.4, tools=[])
    overlay_opts = dict(width=900, height=200, tools=[])
    hm_opts = dict(width=900, height=500, tools=[], logz=True, invert_yaxis=False, xrotation=90,
                   labelled=[], toolbar=None, xaxis=None)

    heatmap = hv.HeatMap(df_hm)
    heatmap.toolbar_location = None

    # multiply all curves together into one Overlay
    curves = [hv.Curve((data.index, data[col]), 'Time', 'Incidence')
              for col in data.columns]
    overlay = curves[0]
    for extra in curves[1:]:
        overlay = overlay * extra

    opts = {'HeatMap': {'plot': hm_opts}, 'Overlay': {'plot': overlay_opts},
            'Curve': {'plot': curve_opts,
                      'style': dict(color='blue', line_alpha=0.2)}}
    return (heatmap + overlay).opts(opts).cols(1)
# Example n. 3 (score: 0)
def qf_prediction(city, state, horizon, lookback):
    """
    Train quantile-forest models for *city* using its cluster series as
    predictors; one model and one metrics column per horizon step.
    :param city: geocode of the target city
    :param state: two-letter state code (locates the cluster pickle)
    :param horizon: number of weeks ahead to predict
    :param lookback: number of steps of history to use as predictors
    :return: (model, preds, preds25, preds975, X_train, targets, data_lag,
              feature column index)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters, data_types=DATA_TYPES, cols=PREDICTORS, doenca=DISEASE)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    data_lag = build_lagged_features(data, lookback)
    # BUG FIX: dropna() returns a new frame; the result was discarded.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            # drop the trailing (d-1) rows, which have no future value
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data, data_lag[target],
                                                        train_size=0.7, test_size=0.3, shuffle=False)

    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    preds25 = np.empty((len(data_lag), horizon))
    preds975 = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error', 'explained_variance_score',
                                  'mean_squared_error', 'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]   # in-sample target
        tgtt = targets[d][len(X_train):]  # out-of-sample target

        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        dump(model, 'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(state, city, d))
        pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
        pred = model.predict(X_data[:len(targets[d])], quantile=50)
        pred975 = model.predict(X_data[:len(targets[d])], quantile=97.5)

        # pad with NaN so every horizon column has the same length
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
            pred25 = list(pred25) + ([np.nan] * dif)
            pred975 = list(pred975) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        preds25[:, (d - 1)] = pred25
        preds975[:, (d - 1)] = pred975

        pred_m = model.predict(X_test[(d - 1):], quantile=50)
        metrics[d] = calculate_metrics(pred_m, tgtt)

    metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest', state, city))

    plot_prediction(preds, preds25, preds975, targets[1], city_name, len(X_train))

    return model, preds, preds25, preds975, X_train, targets, data_lag, X_data.columns
# Example n. 4 (score: 0)
def single_prediction(city,
                      state,
                      predictors,
                      predict_n,
                      look_back,
                      hidden,
                      epochs,
                      predict=False):
    """
    Fit an LSTM model to generate predictions for a city, Using its cluster as regressors.
    :param city: geocode of the target city
    :param state: State containing the city
    :param predictors: variables to use as model regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :param predict: If True, train on the full series (ratio 1); otherwise
        use a 70/30 train/test split.
    :return: tuple (predicted, indice, X_test, Y_test, Y_train, factor)
    """

    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=predictors)

    # convert the datetime index to plain dates for plotting
    indice = list(data.index)
    indice = [i.date() for i in indice]

    city_name = get_city_names([city, 0])[0][1]
    if predict:
        ratio = 1
    else:
        ratio = 0.7

    predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
        city,
        data,
        predict_n,
        look_back,
        hidden,
        epochs,
        ratio=ratio,
        load=False)
    plot_predicted_vs_data(
        predicted,
        np.concatenate((Y_train, Y_test), axis=0),
        indice[:],
        label="{}".format(city_name),
        pred_window=predict_n,
        factor=factor,
        split_point=len(Y_train),
    )

    return predicted, indice, X_test, Y_test, Y_train, factor
def create_cluster(state, cols, t):
    """Hierarchically cluster the cities of *state* by series distance,
    save the cluster assignments and a clustermap figure, and return the
    linkage matrix plus the (geocode, name) pairs of the cities."""
    cities_list = alocate_data(state)
    half_dists = distance(cities_list, cols)

    # mirror the triangular distance matrix before plotting
    full_matrix = half_dists + half_dists.T
    grid = sns.clustermap(full_matrix, cmap="vlag")
    grid.savefig("cluster_corr_{}.png".format(state), dpi=400)

    Z, clusters = hierarchical_clustering(half_dists, t=t)
    print(clusters)
    matrix_cluster(cities_list=cities_list, clusters=clusters)

    # persist the assignments for the prediction functions to pick up
    with open('clusters_{}.pkl'.format(state), 'wb') as fp:
        pickle.dump(clusters, fp)
    print("{} clusters saved".format(state))

    return Z, get_city_names(list(half_dists.index))
# Example n. 6 (score: 0)
def state_prediction(state,
                     predictors,
                     predict_n,
                     look_back,
                     hidden,
                     epochs,
                     predict=False):
    """Train and plot an LSTM forecast for every city of *state*, walking
    the saved clusters and skipping cities already predicted on disk."""
    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))

    for cluster in clusters:
        # one shared regressor matrix per cluster
        data, group = get_cluster_data(
            geocode=cluster[0],
            clusters=clusters,
            data_types=DATA_TYPES,
            cols=predictors,
        )
        for city in cluster:
            done_path = "../saved_models/LSTM/{}/predicted_lstm_{}.pkl".format(
                state, city)
            if os.path.exists(done_path):
                continue

            # plain dates for the x axis of the plot
            dates = [stamp.date() for stamp in data.index]

            city_name = get_city_names([city, 0])[0][1]
            ratio = 1 if predict else 0.7

            predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
                city, data, predict_n, look_back, hidden, epochs, ratio=ratio)
            plot_predicted_vs_data(
                predicted,
                np.concatenate((Y_train, Y_test), axis=0),
                dates,
                label=city_name,
                pred_window=predict_n,
                factor=factor,
                split_point=len(Y_train),
            )
            print("{} done".format(city))
    return None
# Example n. 7 (score: 0)
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Load a previously trained dengue quantile-forest model for *city* and
    produce median and 95% interval predictions on *doenca* data.
    :param city: geocode of the target city
    :param state: two-letter state code
    :param horizon: horizon (weeks) identifying the saved model to load
    :param lookback: number of steps of history to use as predictors
    :param doenca: disease code to fetch data for (default 'chik')
    :return: (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=PREDICTORS,
                                   doenca=doenca)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    data_lag = build_lagged_features(data, lookback)
    # BUG FIX: dropna() returns a new frame; the result was discarded.
    data_lag = data_lag.dropna()
    data_lag = data_lag['2016-01-01':]
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            # drop the trailing (d-1) rows, which have no future value
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)

    city_name = get_city_names([city, 0])[0][1]

    #  Load dengue model
    # BUG FIX: os.path.join takes path components as separate arguments,
    # not a list; the list form raised a TypeError.
    model = joblib.load(
        os.path.join(
            RESULT_PATH,
            '{}/{}_city_model_{}W.joblib'.format(state, city, horizon)
        ))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)

    return model, pred, pred25, pred975, X_data, targets, data_lag
# Example n. 8 (score: 0)
def qf_state_prediction(state, lookback, horizon, predictors):
    """
    RQF prediction based on cluster of cities
    :param state: two-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor variables
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0], clusters=clusters,
                                            data_types=DATA_TYPES, cols=predictors)
        for city in cluster:
            # BUG FIX: the skip check pointed at './saved_models/{state}/...'
            # while metrics are written under 'saved_models/quantile_forest/',
            # so finished cities were always re-run.
            if os.path.isfile(
                    'saved_models/quantile_forest/{}/qf_metrics_{}.pkl'.format(
                        state, city)):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]

            data_lag = build_lagged_features(data_full, lookback)
            # BUG FIX: dropna() returns a new frame; the result was discarded.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    # drop the trailing (d-1) rows, which have no future value
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(X_data, data_lag[target],
                                                                train_size=0.7, test_size=0.3, shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            preds25 = np.empty((len(data_lag), horizon))
            preds975 = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error', 'explained_variance_score',
                                          'mean_squared_error', 'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]   # in-sample target
                tgtt = targets[d][len(X_train):]  # out-of-sample target

                model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])], quantile=50)
                pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
                pred975 = model.predict(X_data[:len(targets[d])], quantile=97.5)

                # pad with NaN so every horizon column has the same length
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                    pred25 = list(pred25) + ([np.nan] * dif)
                    pred975 = list(pred975) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                preds25[:, (d - 1)] = pred25
                preds975[:, (d - 1)] = pred975

                pred_m = model.predict(X_test[(d - 1):])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest', state, city))
            # NOTE(review): only the model of the last horizon step survives
            # the loop and is saved here — confirm whether per-horizon models
            # should be persisted instead (cf. qf_prediction).
            dump(model, 'saved_models/quantile_forest/{}_{}_state_model.joblib'.format(state, city))
            plot_prediction(preds, preds25, preds975, targets[1], city_name, len(X_train))
# Example n. 9 (score: 0)
def cluster_prediction(geocode, state, predictors, predict_n, look_back,
                       hidden, epochs):
    """
    Fit an LSTM model to generate predictions for all cities from a cluster, Using its cluster as regressors.
    :param geocode: geocode of the reference city (defines the cluster)
    :param state: State containing the city
    :param predictors: variables used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :return: None
    """

    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))
    if os.path.exists('{}_cluster.csv'.format(geocode)):
        # BUG FIX: the cache filenames were never formatted with the geocode,
        # and pickle.load needs an open binary file object, not a path string.
        # TODO(review): existence is checked on '.csv' but data is read from
        # '.csv.gz' — confirm which file get_cluster_data(save=True) writes.
        data = pd.read_csv('{}_cluster.csv.gz'.format(geocode))
        with open('{}_cluster.pkl'.format(geocode), 'rb') as fp:
            cluster = pickle.load(fp)
    else:
        data, cluster = get_cluster_data(geocode=geocode,
                                         clusters=clusters,
                                         data_types=DATA_TYPES,
                                         cols=predictors,
                                         save=True)
    # plain dates for the x axis of the plots
    indice = list(data.index)
    indice = [i.date() for i in indice]

    fig, axs = P.subplots(nrows=2, ncols=2, figsize=(50, 45))

    # one subplot per city (at most 4 cities with the 2x2 grid)
    targets = zip(cluster, axs.flatten())
    for (city, ax) in targets:
        print(city)
        city_name = get_city_names([city, 0])[0][1]
        predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
            city, data, predict_n, look_back, hidden, epochs)

        ## plot
        Ydata = np.concatenate((Y_train, Y_test), axis=0)
        split_point = len(Y_train)
        df_predicted = pd.DataFrame(predicted).T
        ymax = max(predicted.max() * factor, Ydata.max() * factor)

        # mark the train/test boundary
        ax.vlines(indice[split_point], 0, ymax, "g", "dashdot", lw=2)
        ax.text(indice[split_point + 1], 0.6 * ymax,
                "Out of sample Predictions")
        for n in range(df_predicted.shape[1] - predict_n):
            ax.plot(indice[n:n + predict_n],
                    pd.DataFrame(Ydata.T)[n] * factor, "k-")
            ax.plot(indice[n:n + predict_n], df_predicted[n] * factor, "r-")
            ax.vlines(
                indice[n:n + predict_n],
                np.zeros(predict_n),
                df_predicted[n] * factor,
                "b",
                alpha=0.2,
            )

        ax.grid()
        ax.set_title("Predictions for {}".format(city_name), fontsize=13)
        ax.legend(["data", "predicted"])

    P.tight_layout()
    P.savefig("{}/cluster_{}.pdf".format(FIG_PATH,
                                         geocode))  # , bbox_inches='tight')

    return None
# Example n. 10 (score: 0)
def rf_state_prediction(state, lookback, horizon, predictors):
    """
    make predictions for all cities of a state using the cluster series
    :param state: State Symbol
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of steps  to predict
    :param predictors: list of predictors to use
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # BUG FIX: the skip check pointed at './saved_models/{state}/...'
            # while metrics are written under 'saved_models/random_forest/',
            # so finished cities were always re-run.
            if os.path.isfile(
                    'saved_models/random_forest/{}/rf_metrics_{}.pkl'.format(
                        state, city)):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]

            data_lag = build_lagged_features(data_full, lookback)
            # BUG FIX: dropna() returns a new frame; the result was discarded.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    # drop the trailing (d-1) rows, which have no future value
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]   # in-sample target
                tgtt = targets[d][len(X_train):]  # out-of-sample target

                model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])])

                # pad with NaN so every horizon column has the same length
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred

                pred_m = model.predict(X_test[(d - 1):])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
                'saved_models/random_forest', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
# Example n. 11 (score: 0)
def rf_prediction(city, state, horizon, lookback):
    """
    make predictions for a given city using the cluster series
    :param city: city geocode
    :param state: State symbol
    :param horizon: number of steps ahead to predict
    :param lookback: number steps of history to use as predictors
    :return: (preds, X_train, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=PREDICTORS)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    data_lag = build_lagged_features(data, lookback)
    # BUG FIX: dropna() returns a new frame; the result was discarded.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            # drop the trailing (d-1) rows, which have no future value
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)

    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]   # in-sample target
        tgtt = targets[d][len(X_train):]  # out-of-sample target

        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        pred = model.predict(X_data[:len(targets[d])])

        # pad with NaN so every horizon column has the same length
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred

        pred_m = model.predict(X_test[(d - 1):])
        metrics[d] = calculate_metrics(pred_m, tgtt)

    metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
        'saved_models/random_forest', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))

    return preds, X_train, targets, data_lag
# Example n. 12 (score: 0)
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    """
    LASSO (LassoLarsCV) prediction for a city using its cluster series.
    :param city: geocode of the target city
    :param state: two-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor variables
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=predictors)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    data_lag = build_lagged_features(data, lookback)
    # BUG FIX: dropna() returns a new frame; the result was discarded.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            # drop the trailing (d-1) rows, which have no future value
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)

    # an all-zero training series cannot be fit meaningfully
    if sum(y_train) == 0:
        print('aaaah', city)
        return None
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)

        tgt = targets[d][:len(X_train)]   # in-sample target
        tgtt = targets[d][len(X_train):]  # out-of-sample target
        try:
            model.fit(X_train, tgt)
            print(city, 'done')
        except ValueError:
            # give up on this city if the fit fails for any horizon
            print('-----------------------------------------------------')
            print(city, 'ERRO')
            print('-----------------------------------------------------')
            break
        pred = model.predict(X_data[:len(targets[d])])

        # pad with NaN so every horizon column has the same length
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[:(len(tgtt))])
        metrics[d] = calculate_metrics(pred_m, tgtt)

    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
# Example n. 13 (score: 0)
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    """
    LASSO WITHOUT CLUSTER SERIES: fit one LassoLarsCV model per city,
    using only the city's own (lagged) series as features.
    :param state: two-letter state code (used in the output paths)
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor variables
    :return: None
    """
    # NOTE(review): the city list is hard-coded to Ceará regardless of
    # *state* — confirm whether this was intentional.
    cities = list(get_cities_from_state('Ceará'))

    for city in cities:
        # BUG FIX: the skip check used an absolute, machine-specific home
        # path; use the same relative path the metrics are written to.
        if os.path.isfile(
                'saved_models/lasso_no_cluster/{}/lasso_metrics_{}.pkl'
                .format(state, city)):
            print(city, 'done')
            continue
        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)

        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        # BUG FIX: dropna() returns a new frame; the result was discarded.
        data_lag = data_lag.dropna()
        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                # drop the trailing (d-1) rows, which have no future value
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                            data_lag[target],
                                                            train_size=0.7,
                                                            test_size=0.3,
                                                            shuffle=False)

        city_name = get_city_names([city, 0])[0][1]
        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)

            tgt = targets[d][:len(X_train)]   # in-sample target
            tgtt = targets[d][len(X_train):]  # out-of-sample target
            try:
                model.fit(X_train, tgt)
            except ValueError:
                # give up on this city if the fit fails for any horizon
                print('-----------------------------------------------------')
                print(city, 'ERRO')
                print('-----------------------------------------------------')
                break
            pred = model.predict(X_data[:len(targets[d])])

            # pad with NaN so every horizon column has the same length
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred
            pred_m = model.predict(X_test[:(len(tgtt))])
            metrics[d] = calculate_metrics(pred_m, tgtt)

        # BUG FIX: the metrics pickle was written inside the horizon loop
        # (once per step); write it once per city, after all horizons.
        metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
            'saved_models/lasso_no_cluster', state, city))
        plot_prediction(preds,
                        targets[1],
                        city_name,
                        len(X_train),
                        path='lasso_no_cluster')
    return None
# Example n. 14 (score: 0)
def rgf_state_prediction(state, lookback, horizon, predictors):
    """
    Regularized Greedy Forest prediction for all cities of a state,
    using each city's cluster series as predictors.
    :param state: two-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor variables
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]

            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # BUG FIX: dropna() returns a new frame; the result was discarded.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    # drop the trailing (d-1) rows, which have no future value
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)

                tgt = targets[d][:len(X_train)]   # in-sample target
                tgtt = targets[d][len(X_train):]  # out-of-sample target
                try:
                    model.fit(X_train, tgt)
                except ValueError:
                    # give up on this city if the fit fails for any horizon
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO')
                    print(
                        '-----------------------------------------------------'
                    )
                    break
                pred = model.predict(X_data[:len(targets[d])])

                # pad with NaN so every horizon column has the same length
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
# Example n. 15 (score: 0)
 def test_get_city_names(self):
     """get_city_names returns (geocode, name) pairs in input order."""
     names = get_city_names([3304557, 4118204])
     self.assertEqual([(3304557, 'Rio de Janeiro'), (4118204, 'Paranaguá')],
                      names)
def single_prediction(city,
                      state,
                      predictors,
                      predict_n,
                      look_back,
                      hidden,
                      epochs,
                      predict=True,
                      doenca='chick',
                      cluster=True):
    """
    Generate predictions for a city from a previously trained LSTM model,
    using the city's cluster series as regressors.

    :param city: geocode of the target city
    :param state: state containing the city (used to locate saved files)
    :param predictors: predictor columns passed to get_cluster_data
    :param predict_n: how many weeks ahead to predict
    :param look_back: look-back time window length used by the model
    :param hidden: number of hidden units per LSTM layer (unused here; kept
        for signature compatibility with the training routines)
    :param epochs: number of epochs the saved model was trained for (part of
        the saved-model filename)
    :param predict: if True, use the whole series for prediction (ratio=1);
        otherwise keep a 70/30 train/test split
    :param doenca: disease code selecting the data set
    :param cluster: if True, the target is the city-specific
        'casos_est_<geocode>' column; otherwise the plain 'casos_est' column.
        NOTE: previously `cluster` was referenced but never defined in this
        scope (NameError); it is now an explicit keyword parameter.
    :return: tuple (predicted, indice, X_test, Y_test, Y_train, factor)
    """

    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=predictors,
                                   doenca=doenca)
    data = data['2016-01-01':]
    # x: prediction dates, shifted predict_n weeks past the data index.
    x = data.index.shift(predict_n, freq='W')
    x = [i.date() for i in x]
    indice = list(data.index)
    indice = [i.date() for i in indice]

    city_name = get_city_names([city, 0])[0][1]
    if predict:
        ratio = 1
    else:
        ratio = 0.7

    if cluster:
        target_col = list(data.columns).index("casos_est_{}".format(city))
    else:
        target_col = list(data.columns).index("casos_est")
    norm_data, max_features = normalize_data(data)
    # factor rescales normalized predictions back to case counts.
    factor = max_features[target_col]
    ## split test and train
    X_train, Y_train, X_test, Y_test = split_data(
        norm_data,
        look_back=look_back,
        ratio=ratio,
        predict_n=predict_n,
        Y_column=target_col,
    )

    model = load_model("../saved_models/LSTM/{}/lstm_{}_epochs_{}.h5".format(
        state, city, epochs))
    # 100 stochastic forward passes (MC sampling) stacked on a new axis so
    # percentiles across runs give a prediction interval.
    predicted = np.stack(
        [model.predict(X_train, batch_size=1, verbose=1) for i in range(100)],
        axis=2)

    df_predicted = pd.DataFrame(np.percentile(predicted, 50, axis=2))
    df_predicted25 = pd.DataFrame(np.percentile(predicted, 2.5, axis=2))
    df_predicted975 = pd.DataFrame(np.percentile(predicted, 97.5, axis=2))

    plot_prediction(pred=df_predicted,
                    pred25=df_predicted25,
                    pred975=df_predicted975,
                    x=x,
                    ydata=Y_train,
                    factor=factor,
                    horizon=predict_n,
                    title="{}".format(city_name),
                    doenca=doenca)

    return predicted, indice, X_test, Y_test, Y_train, factor
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Load a saved quantile-forest model for a city and plot its median
    prediction with a 95% interval.

    :param city: geocode of the target city
    :param state: state code (used to locate cluster and model files)
    :param horizon: number of weeks ahead the saved model predicts
    :param lookback: number of lagged steps used to build the features
    :param doenca: disease code selecting the data set
    :return: tuple (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=PREDICTORS,
                                   doenca=doenca)

    target = 'casos_est_{}'.format(city)
    # Case-count columns of every city in the cluster are targets, not
    # features, so they are dropped from X_data below.
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    data_lag = build_lagged_features(data, lookback)
    # dropna() returns a new frame; the original discarded the result,
    # leaving NaN rows from the lagging in place.
    data_lag = data_lag.dropna()
    data_lag = data_lag['2016-01-01':]

    # One target series per step ahead; for d > 1 the shift leaves the last
    # d-1 rows without a target, so they are trimmed.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)

    city_name = get_city_names([city, 0])[0][1]

    # Pre-trained quantile forest: median plus 2.5%/97.5% quantiles give a
    # 95% prediction interval.
    model = joblib.load(
        'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(
            state, city, horizon))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)

    plot_prediction(pred,
                    pred25,
                    pred975,
                    targets[1],
                    horizon,
                    city_name,
                    save=True,
                    doenca=doenca)

    return model, pred, pred25, pred975, X_data, targets, data_lag