def qf_single_state_prediction(state, lookback, horizon, predictors):
    """
    Random Quantile Forest predictions for every city of a state WITHOUT
    using the cluster series (each city is modelled from its own data only).

    :param state: 2-letter code for state
    :param lookback: number of steps of history to use
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None; metrics are pickled and predictions plotted per city
    """
    if state == "CE":
        s = 'Ceará'
    else:
        s = state
    cities = list(get_cities_from_state(s))
    for city in cities:
        # Fix: the old check used an absolute path ('/saved_models/...') that
        # never matched the relative path the metrics are written to below,
        # so already-finished cities were recomputed on every run.
        if os.path.isfile('saved_models/quantile_forest_no_cluster/{}/qf_metrics_{}.pkl'.format(state, city)):
            print(city, 'done')
            continue
        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)
        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        # Fix: dropna() returns a new frame; the old bare call was a no-op.
        data_lag = data_lag.dropna()
        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                # [:-0] would yield an empty series, hence the special case.
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
        X_data = data_lag.drop(target, axis=1)
        # shuffle=False keeps the temporal ordering of the series.
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, data_lag[target], train_size=0.7, test_size=0.3,
            shuffle=False)
        city_name = get_city_names([city, 0])[0][1]
        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error',
                                      'r2_score'))
        for d in range(1, horizon + 1):
            tgt = targets[d][:len(X_train)]    # in-sample target
            tgtt = targets[d][len(X_train):]   # out-of-sample target
            model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
            pred = model.predict(X_data[:len(targets[d])], quantile=50)
            # Pad with NaN so every horizon column has the same length.
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred
            pred_m = model.predict(X_test[(d - 1):])
            metrics[d] = calculate_metrics(pred_m, tgtt)
        metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format(
            'saved_models/quantile_forest_no_cluster', state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train))
def cluster_viz(geocode, clusters):
    """Build a holoviews layout visualizing the cluster containing *geocode*:
    an incidence heatmap on top and the overlaid incidence curves below.

    :param geocode: geocode of a city belonging to the cluster of interest
    :param clusters: cluster definitions produced by the clustering step
    :return: holoviews Layout (heatmap + curve overlay) in a single column
    """
    series, member_codes = get_cluster_data(geocode=geocode,
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=['casos'])
    names_by_code = dict(get_city_names(member_codes))

    # Long-format frame for the heatmap: one row per (week, city) pair,
    # with city columns mapped from 'casos_<geocode>' to the city name.
    long_df = series.reset_index().rename(columns={'index': 'week'})
    long_df = pd.melt(long_df, id_vars=['week'], var_name='city',
                      value_name='incidence')
    long_df['city'] = [int(re.sub('casos_', '', c)) for c in long_df.city]
    long_df['city'] = [names_by_code[c] for c in long_df.city]

    curve_opts = dict(line_width=10, line_alpha=0.4, tools=[])
    overlay_opts = dict(width=900, height=200, tools=[])
    hm_opts = dict(width=900, height=500, tools=[], logz=True,
                   invert_yaxis=False, xrotation=90, labelled=[],
                   toolbar=None, xaxis=None)

    heatmap = hv.HeatMap(long_df)
    heatmap.toolbar_location = None

    # One curve per cluster member, all combined into a single overlay.
    curves = [hv.Curve((series.index, series[col]), 'Time', 'Incidence')
              for col in series.columns]
    overlay = curves[0]
    for extra in curves[1:]:
        overlay = overlay * extra

    opts = {'HeatMap': {'plot': hm_opts},
            'Overlay': {'plot': overlay_opts},
            'Curve': {'plot': curve_opts,
                      'style': dict(color='blue', line_alpha=0.2)}}
    return (heatmap + overlay).opts(opts).cols(1)
def qf_prediction(city, state, horizon, lookback):
    """
    Train Random Quantile Forest models for one city, using its cluster
    series as regressors. Saves one model per forecast horizon plus a
    metrics table, and plots the median / 95% interval predictions.

    :param city: geocode of the target city
    :param state: 2-letter state code (used to locate the cluster file)
    :param horizon: number of weeks ahead to predict
    :param lookback: number of steps of history to use as predictors
    :return: (model, preds, preds25, preds975, X_train, targets, data_lag,
              feature column index)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=DISEASE)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # Fix: dropna() returns a new frame; the old bare call was a no-op.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            # [:-0] would yield an empty series, hence the special case.
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    # Drop the (unlagged) target columns of every cluster member so the
    # current value of the target does not leak into the features.
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # shuffle=False keeps the temporal ordering of the series.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, data_lag[target], train_size=0.7, test_size=0.3,
        shuffle=False)
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    preds25 = np.empty((len(data_lag), horizon))
    preds975 = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error',
                                  'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]    # in-sample target
        tgtt = targets[d][len(X_train):]   # out-of-sample target
        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        dump(model,
             'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(state, city, d))
        pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
        pred = model.predict(X_data[:len(targets[d])], quantile=50)
        pred975 = model.predict(X_data[:len(targets[d])], quantile=97.5)
        # Pad with NaN so every horizon column has the same length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
            pred25 = list(pred25) + ([np.nan] * dif)
            pred975 = list(pred975) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        preds25[:, (d - 1)] = pred25
        preds975[:, (d - 1)] = pred975
        pred_m = model.predict(X_test[(d - 1):], quantile=50)
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format(
        'saved_models/quantile_forest', state, city))
    plot_prediction(preds, preds25, preds975, targets[1], city_name,
                    len(X_train))
    return model, preds, preds25, preds975, X_train, targets, data_lag, X_data.columns
def single_prediction(city, state, predictors, predict_n, look_back, hidden, epochs, predict=False):
    """Fit an LSTM model for one city, using its cluster series as
    regressors, and plot the predictions against the observed data.

    :param city: geocode of the target city
    :param state: State containing the city
    :param predictors: columns used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :param predict: when True, train on the full series (no held-out split)
    :return: (predicted, indice, X_test, Y_test, Y_train, factor)
    """
    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors)
    indice = [ts.date() for ts in data.index]
    city_name = get_city_names([city, 0])[0][1]
    # ratio == 1 means the whole series is used for training.
    ratio = 1 if predict else 0.7
    predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
        city, data, predict_n, look_back, hidden, epochs, ratio=ratio,
        load=False)
    plot_predicted_vs_data(
        predicted,
        np.concatenate((Y_train, Y_test), axis=0),
        indice[:],
        label="{}".format(city_name),
        pred_window=predict_n,
        factor=factor,
        split_point=len(Y_train),
    )
    return predicted, indice, X_test, Y_test, Y_train, factor
def create_cluster(state, cols, t):
    """Hierarchically cluster the cities of *state* by distance over *cols*,
    save the cluster map figure and the pickled cluster assignments.

    :param state: state symbol
    :param cols: columns used to compute the inter-city distances
    :param t: threshold handed to the hierarchical clustering step
    :return: (Z, name_ind) — linkage matrix and (geocode, name) pairs
    """
    cities_list = alocate_data(state)
    dists = distance(cities_list, cols)
    # The distance matrix is triangular; symmetrize it for the plot.
    full_matrix = dists + dists.T
    heat = sns.clustermap(full_matrix, cmap="vlag")
    heat.savefig("cluster_corr_{}.png".format(state), dpi=400)
    Z, clusters = hierarchical_clustering(dists, t=t)
    print(clusters)
    matrix_cluster(cities_list=cities_list, clusters=clusters)
    with open('clusters_{}.pkl'.format(state), 'wb') as fp:
        pickle.dump(clusters, fp)
    print("{} clusters saved".format(state))
    name_ind = get_city_names(list(dists.index))
    return Z, name_ind
def state_prediction(state, predictors, predict_n, look_back, hidden, epochs, predict=False):
    """Train and evaluate the LSTM model for every city of *state*, cluster
    by cluster, skipping cities whose predictions were already saved.

    :param state: state symbol
    :param predictors: columns used as regressors
    :param predict_n: how many weeks ahead to predict
    :param look_back: look-back window length used by the model
    :param hidden: number of hidden layers in each LSTM unit
    :param epochs: number of training epochs
    :param predict: when True, train on the full series (no held-out split)
    :return: None
    """
    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))
    for cluster in clusters:
        data, group = get_cluster_data(
            geocode=cluster[0],
            clusters=clusters,
            data_types=DATA_TYPES,
            cols=predictors,
        )
        for city in cluster:
            done_marker = "../saved_models/LSTM/{}/predicted_lstm_{}.pkl".format(
                state, city)
            if os.path.exists(done_marker):
                continue  # already predicted on a previous run
            indice = [ts.date() for ts in data.index]
            city_name = get_city_names([city, 0])[0][1]
            ratio = 1 if predict else 0.7
            predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
                city, data, predict_n, look_back, hidden, epochs, ratio=ratio)
            plot_predicted_vs_data(
                predicted,
                np.concatenate((Y_train, Y_test), axis=0),
                indice[:],
                label=city_name,
                pred_window=predict_n,
                factor=factor,
                split_point=len(Y_train),
            )
            print("{} done".format(city))
    return None
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Load a previously trained Random Quantile Forest model for *city* and
    generate median and 95% interval predictions for *doenca*.

    NOTE(review): this re-definition shadows the earlier qf_prediction that
    trains the models; consider renaming one of them.

    :param city: geocode of the target city
    :param state: 2-letter state code
    :param horizon: horizon (weeks) the saved model was trained for
    :param lookback: number of steps of history used as predictors
    :param doenca: disease code used when fetching the cluster data
    :return: (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=doenca)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # Fix: dropna() returns a new frame; the old bare call was a no-op.
    data_lag = data_lag.dropna()
    data_lag = data_lag['2016-01-01':]
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    city_name = get_city_names([city, 0])[0][1]
    # Load dengue model.
    # Fix: os.path.join takes path components as separate positional
    # arguments, not a list — the old call raised TypeError.
    model = joblib.load(
        os.path.join(
            RESULT_PATH,
            '{}/{}_city_model_{}W.joblib'.format(state, city, horizon)))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)
    return model, pred, pred25, pred975, X_data, targets, data_lag
def qf_state_prediction(state, lookback, horizon, predictors):
    """
    RQF prediction based on cluster of cities.

    :param state: 2-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None; metrics and models are saved, predictions plotted
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Fix: the old check looked in './saved_models/{state}/...'
            # while the metrics are written to
            # 'saved_models/quantile_forest/...', so the skip never fired.
            if os.path.isfile(
                    'saved_models/quantile_forest/{}/qf_metrics_{}.pkl'.format(
                        state, city)):
                print('done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            data_lag = build_lagged_features(data_full, lookback)
            # Fix: dropna() returns a new frame; bare call was a no-op.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    # [:-0] would yield an empty series, hence this case.
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # shuffle=False keeps the temporal ordering of the series.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, data_lag[target], train_size=0.7, test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            preds25 = np.empty((len(data_lag), horizon))
            preds975 = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]    # in-sample target
                tgtt = targets[d][len(X_train):]   # out-of-sample target
                model = rolling_forecasts(X_train, target=tgt,
                                          horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])], quantile=50)
                pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
                pred975 = model.predict(X_data[:len(targets[d])],
                                        quantile=97.5)
                # Pad with NaN so every horizon column has the same length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                    pred25 = list(pred25) + ([np.nan] * dif)
                    pred975 = list(pred975) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                preds25[:, (d - 1)] = pred25
                preds975[:, (d - 1)] = pred975
                # Fix: evaluate the median quantile like the rest of this
                # function; the old call omitted quantile= here only.
                pred_m = model.predict(X_test[(d - 1):], quantile=50)
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format(
                'saved_models/quantile_forest', state, city))
            dump(model,
                 'saved_models/quantile_forest/{}_{}_state_model.joblib'.format(state, city))
            plot_prediction(preds, preds25, preds975, targets[1], city_name,
                            len(X_train))
def cluster_prediction(geocode, state, predictors, predict_n, look_back, hidden, epochs):
    """
    Fit an LSTM model to generate predictions for all cities from a cluster,
    using the cluster series as regressors, and save a grid of plots.

    :param geocode: geocode of the reference city (identifies the cluster)
    :param state: State containing the city
    :param predictors: columns used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :return: None
    """
    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))
    if os.path.exists('{}_cluster.csv'.format(geocode)):
        # Fix: the cached file names were never formatted with the geocode
        # ('{}_cluster.csv.gz' was passed literally), and pickle.load()
        # requires a file object, not a path string.
        data = pd.read_csv('{}_cluster.csv.gz'.format(geocode))
        with open('{}_cluster.pkl'.format(geocode), 'rb') as fp:
            cluster = pickle.load(fp)
    else:
        data, cluster = get_cluster_data(geocode=geocode, clusters=clusters,
                                         data_types=DATA_TYPES,
                                         cols=predictors, save=True)
    indice = [ts.date() for ts in data.index]
    # 2x2 grid: one panel per cluster city (extra panels stay empty).
    fig, axs = P.subplots(nrows=2, ncols=2, figsize=(50, 45))
    targets = zip(cluster, axs.flatten())
    for (city, ax) in targets:
        print(city)
        city_name = get_city_names([city, 0])[0][1]
        predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
            city, data, predict_n, look_back, hidden, epochs)
        ## plot
        Ydata = np.concatenate((Y_train, Y_test), axis=0)
        split_point = len(Y_train)
        df_predicted = pd.DataFrame(predicted).T
        ymax = max(predicted.max() * factor, Ydata.max() * factor)
        # Vertical line marking the train/test split.
        ax.vlines(indice[split_point], 0, ymax, "g", "dashdot", lw=2)
        ax.text(indice[split_point + 1], 0.6 * ymax,
                "Out of sample Predictions")
        for n in range(df_predicted.shape[1] - predict_n):
            ax.plot(indice[n:n + predict_n],
                    pd.DataFrame(Ydata.T)[n] * factor, "k-")
            ax.plot(indice[n:n + predict_n], df_predicted[n] * factor, "r-")
            ax.vlines(
                indice[n:n + predict_n],
                np.zeros(predict_n),
                df_predicted[n] * factor,
                "b",
                alpha=0.2,
            )
        ax.grid()
        ax.set_title("Predictions for {}".format(city_name), fontsize=13)
        ax.legend(["data", "predicted"])
    P.tight_layout()
    P.savefig("{}/cluster_{}.pdf".format(FIG_PATH, geocode))  # , bbox_inches='tight')
    # P.show()
    return None
def rf_state_prediction(state, lookback, horizon, predictors):
    """
    Make Random Forest predictions for all cities of a state using the
    cluster series.

    :param state: State Symbol
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of steps to predict
    :param predictors: list of predictors to use
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Fix: the old check looked in './saved_models/{state}/...'
            # while metrics are written to 'saved_models/random_forest/...',
            # so finished cities were recomputed every run.
            if os.path.isfile(
                    'saved_models/random_forest/{}/rf_metrics_{}.pkl'.format(
                        state, city)):
                print('done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            data_lag = build_lagged_features(data_full, lookback)
            # Fix: dropna() returns a new frame; bare call was a no-op.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    # [:-0] would yield an empty series, hence this case.
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # shuffle=False keeps the temporal ordering of the series.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, data_lag[target], train_size=0.7, test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]    # in-sample target
                tgtt = targets[d][len(X_train):]   # out-of-sample target
                model = rolling_forecasts(X_train, target=tgt,
                                          horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])])
                # Pad with NaN so every horizon column has the same length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[(d - 1):])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
                'saved_models/random_forest', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
def rf_prediction(city, state, horizon, lookback):
    """
    Make Random Forest predictions for a given city using the cluster series.

    :param city: city geocode
    :param state: State symbol
    :param horizon: number of steps ahead to predict
    :param lookback: number steps of history to use as predictors
    :return: (preds, X_train, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # Fix: dropna() returns a new frame; the old bare call was a no-op.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            # [:-0] would yield an empty series, hence the special case.
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # shuffle=False keeps the temporal ordering of the series.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, data_lag[target], train_size=0.7, test_size=0.3,
        shuffle=False)
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error',
                                  'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]    # in-sample target
        tgtt = targets[d][len(X_train):]   # out-of-sample target
        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        pred = model.predict(X_data[:len(targets[d])])
        # Pad with NaN so every horizon column has the same length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[(d - 1):])
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
        'saved_models/random_forest', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return preds, X_train, targets, data_lag
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    """
    LASSO (LassoLarsCV) predictions for one city, using its cluster series
    as regressors.

    :param city: geocode of the target city
    :param state: state symbol (locates the cluster file)
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None; metrics are pickled and predictions plotted
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # Fix: dropna() returns a new frame; the old bare call was a no-op.
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            # [:-0] would yield an empty series, hence the special case.
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # shuffle=False keeps the temporal ordering of the series.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, data_lag[target], train_size=0.7, test_size=0.3,
        shuffle=False)
    # An all-zero training target cannot be fitted meaningfully; skip.
    if sum(y_train) == 0:
        print('aaaah', city)
        return None
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error',
                                  'r2_score'))
    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)
        tgt = targets[d][:len(X_train)]    # in-sample target
        tgtt = targets[d][len(X_train):]   # out-of-sample target
        try:
            model.fit(X_train, tgt)
            print(city, 'done')
        except ValueError:
            print('-----------------------------------------------------')
            print(city, 'ERRO')
            print('-----------------------------------------------------')
            break
        pred = model.predict(X_data[:len(targets[d])])
        # Pad with NaN so every horizon column has the same length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[:(len(tgtt))])
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    """
    LASSO (LassoLarsCV) predictions for every city of a state WITHOUT using
    the cluster series (each city is modelled from its own data only).

    :param state: 2-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None
    """
    # Fix: the state was previously hard-coded to 'Ceará', ignoring the
    # `state` argument; map the 'CE' code like the other
    # *_single_state_prediction helpers do.
    if state == "CE":
        s = 'Ceará'
    else:
        s = state
    cities = list(get_cities_from_state(s))
    for city in cities:
        # Fix: check the same relative path the metrics are written to,
        # instead of a machine-specific absolute home path.
        if os.path.isfile(
                'saved_models/lasso_no_cluster/{}/lasso_metrics_{}.pkl'.format(
                    state, city)):
            print(city, 'done')
            continue
        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)
        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        # Fix: dropna() returns a new frame; the old bare call was a no-op.
        data_lag = data_lag.dropna()
        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                # [:-0] would yield an empty series, hence the special case.
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
        X_data = data_lag.drop(target, axis=1)
        # shuffle=False keeps the temporal ordering of the series.
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, data_lag[target], train_size=0.7, test_size=0.3,
            shuffle=False)
        city_name = get_city_names([city, 0])[0][1]
        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error',
                                      'r2_score'))
        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)
            tgt = targets[d][:len(X_train)]    # in-sample target
            tgtt = targets[d][len(X_train):]   # out-of-sample target
            try:
                model.fit(X_train, tgt)
            except ValueError:
                print('-----------------------------------------------------')
                print(city, 'ERRO')
                print('-----------------------------------------------------')
                break
            pred = model.predict(X_data[:len(targets[d])])
            # Pad with NaN so every horizon column has the same length.
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred
            pred_m = model.predict(X_test[:(len(tgtt))])
            metrics[d] = calculate_metrics(pred_m, tgtt)
        metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
            'saved_models/lasso_no_cluster', state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train),
                        path='lasso_no_cluster')
        # plt.show()
    return None
def rgf_state_prediction(state, lookback, horizon, predictors):
    """
    Regularized Greedy Forest predictions for all cities of a state, using
    the cluster series as regressors.

    :param state: 2-letter state code
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return: None; metrics are pickled and predictions plotted
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]
            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # Fix: dropna() returns a new frame; bare call was a no-op.
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    # [:-0] would yield an empty series, hence this case.
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # shuffle=False keeps the temporal ordering of the series.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, data_lag[target], train_size=0.7, test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)
                tgt = targets[d][:len(X_train)]    # in-sample target
                tgtt = targets[d][len(X_train):]   # out-of-sample target
                try:
                    model.fit(X_train, tgt)
                except ValueError:
                    print('-----------------------------------------------------')
                    print(city, 'ERRO')
                    print('-----------------------------------------------------')
                    break
                pred = model.predict(X_data[:len(targets[d])])
                # Pad with NaN so every horizon column has the same length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
def test_get_city_names(self):
    # get_city_names should map geocodes to (geocode, name) pairs,
    # preserving the order of the input list.
    expected = [(3304557, 'Rio de Janeiro'), (4118204, 'Paranaguá')]
    result = get_city_names([3304557, 4118204])
    self.assertEqual(expected, result)
def single_prediction(city, state, predictors, predict_n, look_back, hidden, epochs, predict=True, doenca='chick'):
    """
    Generate LSTM predictions for a city from a previously trained model,
    using its cluster as regressors.

    NOTE(review): this re-definition shadows the earlier single_prediction
    that trains the model; consider renaming one of them. The default
    doenca='chick' also differs from the 'chik' code used elsewhere in this
    module — confirm which spelling the data layer expects.

    :param city: geocode of the target city
    :param state: State containing the city
    :param predictors: columns used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs the saved model was trained for
    :param predict: Only generate predictions (use the full series)
    :param doenca: disease code used when fetching the cluster data
    :return: (predicted, indice, X_test, Y_test, Y_train, factor)
    """
    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors,
                                   doenca=doenca)
    data = data['2016-01-01':]
    # x: dates shifted predict_n weeks forward, used as the forecast axis.
    x = data.index.shift(predict_n, freq='W')
    x = [i.date() for i in x]
    indice = [i.date() for i in data.index]
    city_name = get_city_names([city, 0])[0][1]
    if predict:
        ratio = 1
    else:
        ratio = 0.7
    # Fix: the old code tested the undefined name `cluster` (NameError
    # unless a stray module-level variable existed); `group` is the cluster
    # membership returned above and is what the test was meant to use.
    if group:
        target_col = list(data.columns).index("casos_est_{}".format(city))
    else:
        target_col = list(data.columns).index("casos_est")
    norm_data, max_features = normalize_data(data)
    factor = max_features[target_col]
    ## split test and train
    X_train, Y_train, X_test, Y_test = split_data(
        norm_data,
        look_back=look_back,
        ratio=ratio,
        predict_n=predict_n,
        Y_column=target_col,
    )
    model = load_model("../saved_models/LSTM/{}/lstm_{}_epochs_{}.h5".format(
        state, city, epochs))
    # 100 forward passes stacked on a new axis, then the 2.5/50/97.5
    # percentiles form the predictive interval (presumably the model is
    # stochastic at inference, e.g. dropout — TODO confirm).
    predicted = np.stack(
        [model.predict(X_train, batch_size=1, verbose=1) for i in range(100)],
        axis=2)
    df_predicted = pd.DataFrame(np.percentile(predicted, 50, axis=2))
    df_predicted25 = pd.DataFrame(np.percentile(predicted, 2.5, axis=2))
    df_predicted975 = pd.DataFrame(np.percentile(predicted, 97.5, axis=2))
    plot_prediction(pred=df_predicted,
                    pred25=df_predicted25,
                    pred975=df_predicted975,
                    x=x,
                    ydata=Y_train,
                    factor=factor,
                    horizon=predict_n,
                    title="{}".format(city_name),
                    doenca=doenca)
    return predicted, indice, X_test, Y_test, Y_train, factor
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Load a saved Random Quantile Forest model for *city* and plot median and
    95% interval predictions for *doenca*.

    NOTE(review): third re-definition of qf_prediction in this module; the
    last one wins at import time — consider renaming.

    :param city: geocode of the target city
    :param state: 2-letter state code
    :param horizon: horizon (weeks) the saved model was trained for
    :param lookback: number of steps of history used as predictors
    :param doenca: disease code used when fetching the cluster data
    :return: (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=doenca)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # Fix: dropna() returns a new frame; the old bare call was a no-op.
    data_lag = data_lag.dropna()
    data_lag = data_lag['2016-01-01':]
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            # [:-0] would yield an empty series, hence the special case.
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    city_name = get_city_names([city, 0])[0][1]
    # Load the dengue-trained model for this horizon.
    # (Dead commented-out metrics code and the unused preds*/metrics
    # buffers from the old version were removed.)
    model = joblib.load(
        'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(
            state, city, horizon))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)
    plot_prediction(pred, pred25, pred975, targets[1], horizon, city_name,
                    save=True, doenca=doenca)
    return model, pred, pred25, pred975, X_data, targets, data_lag