def plot_fits(generator, prediction_times, sharex, sharey, draw_space, plot_obs=None, plot_uncertainty=False):
    """
    Plot the mean fit and draws from a model generator at the given prediction times.

    Args:
        generator: (curvefit.model_generator.ModelPipeline) fitted pipeline that has draws
        prediction_times: (np.array) times at which to plot predictions
        sharex: (bool) share the x axis across subplots
        sharey: (bool) share the y axis across subplots
        draw_space: (callable) which curvefit.functions space to plot the draws in
        plot_obs: (optional str) column of observations to plot
        plot_uncertainty: (optional bool) plot the 95% uncertainty intervals
    """
    fig, ax = plt.subplots(
        len(generator.groups), 1,
        figsize=(8, 4 * len(generator.groups)),
        sharex=sharex, sharey=sharey
    )
    # plt.subplots returns a bare Axes (not an array) when there is only one subplot
    if len(generator.groups) == 1:
        ax = [ax]
    for i, group in enumerate(generator.groups):
        # translate the draws and the mean prediction into the requested plotting space
        draws = generator.draws[group].copy()
        draws = data_translator(
            data=draws,
            input_space=generator.predict_space,
            output_space=draw_space
        )
        mean_fit = generator.mean_predictions[group].copy()
        mean_fit = data_translator(
            data=mean_fit,
            input_space=generator.predict_space,
            output_space=draw_space
        )
        mean = draws.mean(axis=0)

        # mean of the draws (red, dotted) and the mean prediction itself (black)
        ax[i].plot(prediction_times, mean, c='red', linestyle=':')
        ax[i].plot(prediction_times, mean_fit, c='black')

        if plot_uncertainty:
            # 95% interval from the empirical quantiles of the draws
            lower = np.quantile(draws, q=0.025, axis=0)
            upper = np.quantile(draws, q=0.975, axis=0)
            ax[i].plot(prediction_times, lower, c='red', linestyle=':')
            ax[i].plot(prediction_times, upper, c='red', linestyle=':')

        if plot_obs is not None:
            df_data = generator.all_data.loc[
                generator.all_data[generator.col_group] == group].copy()
            ax[i].scatter(df_data[generator.col_t], df_data[plot_obs])

        ax[i].set_title(f"{group} predictions")
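# A minimal usage sketch for plot_fits, kept as a comment so the module stays
# import-safe. It assumes `pipeline` is an already fitted
# curvefit.model_generator.ModelPipeline whose draws have been generated, and
# that `gaussian_pdf` is one of the curvefit space functions; those names and
# the import path are assumptions for illustration, not part of this module.
#
#     import numpy as np
#     from curvefit.core.functions import gaussian_pdf  # assumed import path
#
#     prediction_times = np.arange(0, 60)
#     plot_fits(
#         generator=pipeline,
#         prediction_times=prediction_times,
#         sharex=True,
#         sharey=False,
#         draw_space=gaussian_pdf,
#         plot_obs=pipeline.col_obs,   # assumed attribute holding the observation column name
#         plot_uncertainty=True,
#     )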
def summarize_result(self, print_summary=True):
    """
    Prints a table which characterizes fit quality. It has four columns:
        Location, RMSE ERF, RMSE DERF, RMSE LNR
    where
        - RMSE ERF: sum of squared residuals for the fit in ERF space,
        - RMSE DERF: sum of squared residuals for the fit in DERF space,
        - RMSE LNR: sum of squared residuals for the exponential fit in DERF
          space, which corresponds to the linear fit in ln(DERF) space.
    The table is sorted by -ln(RMSE DERF) + ln(RMSE LNR), so the fits where a
    simple exponential model works better than the CurveFit (meaning the fit
    went badly) come first.

    Args:
        print_summary: (bool) whether to print the table in addition to returning it

    Returns:
        (pd.DataFrame) the summary table.
    """
    location_list = []
    rmse_gaussian_cdf_list = []
    rmse_gaussian_pdf_list = []
    rmse_gaussian_pdf_linear_list = []
    for location, model in self.models.items():
        # residuals of the fit in ERF (cumulative) space
        gaussian_cdf_pred = model.fun(model.t, model.params[:, 0])
        rmse_gaussian_cdf = np.linalg.norm(gaussian_cdf_pred - model.obs) ** 2

        # residuals of the fit in DERF (daily) space
        gaussian_pdf_obs = data_translator(
            model.obs, self.basic_model_dict['fun'], 'gaussian_pdf')
        gaussian_pdf_pred = gaussian_pdf(model.t, model.params[:, 0])
        rmse_gaussian_pdf = np.linalg.norm(gaussian_pdf_obs - gaussian_pdf_pred) ** 2

        # residuals of the exponential (linear in log space) benchmark fit
        rmse_gaussian_pdf_linear = self.preconditioner._statistics[
            "linear_rmse"].get(location, 1e10)

        location_list.append(location)
        rmse_gaussian_cdf_list.append(rmse_gaussian_cdf)
        rmse_gaussian_pdf_list.append(rmse_gaussian_pdf)
        rmse_gaussian_pdf_linear_list.append(rmse_gaussian_pdf_linear)

    df_summary = pd.DataFrame({
        'Location': location_list,
        'RMSE ERF': rmse_gaussian_cdf_list,
        'RMSE DERF': rmse_gaussian_pdf_list,
        'RMSE LNR': rmse_gaussian_pdf_linear_list,
    })
    # Sort so that locations where the exponential benchmark beats the CurveFit
    # (large RMSE DERF relative to RMSE LNR) come first, as described above.
    sort_key = -np.log(df_summary['RMSE DERF']) + np.log(df_summary['RMSE LNR'])
    df_summary = df_summary.iloc[np.argsort(sort_key.values)].reset_index(drop=True)
    if print_summary:
        print(df_summary.to_string(index=False))
    return df_summary
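# Sketch of how the summary might be consumed downstream. `ap_model` stands in
# for an instance of whatever class owns summarize_result; the name is only
# illustrative.
#
#     df = ap_model.summarize_result(print_summary=False)
#     # Because of the sort order, the head of the table contains the locations
#     # where a plain exponential outperforms the CurveFit, i.e. the worst fits.
#     worst_fits = df.head(10)
#     print(worst_fits.to_string(index=False))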
def simulate(self, mp, num_simulations, prediction_times, group, epsilon=1e-2, theta=1):
    """
    Simulate noisy forecasts based on the mean and standard deviation of the
    residuals from predicting into the future.

    Args:
        mp: (curvefit.model_generator.ModelPipeline) model pipeline
        num_simulations: (int) number of simulations to create
        prediction_times: (np.array) times to create predictions at
        group: (str) the group to make the simulations for
        epsilon: (float) the floor for the standard deviation moving out into the future
        theta: (float) scaling of residuals relative to the prediction magnitude

    Returns:
        (np.ndarray) array of shape (num_simulations, len(prediction_times)),
        one simulated forecast per row
    """
    data = mp.all_data.loc[mp.all_data[mp.col_group] == group].copy()
    max_t = int(np.round(data[mp.col_t].max()))
    num_obs = data.loc[~data[mp.col_obs_compare].isnull()][mp.col_group].count()

    predictions = mp.mean_predictions[group]

    # only times beyond the last observation get simulated residual noise
    add_noise = prediction_times > max_t
    no_noise = prediction_times <= max_t

    forecast_out_times = prediction_times[add_noise] - max_t
    error = self.create_residual_samples(
        num_simulations=num_simulations,
        forecast_out_times=forecast_out_times,
        num_data=num_obs,
        epsilon=epsilon
    )
    no_error = np.zeros(shape=(num_simulations, sum(no_noise)))
    all_error = np.hstack([no_error, error])

    # perturb the mean prediction; theta controls how the noise scales with
    # the magnitude of the prediction
    noisy_forecast = predictions - (predictions ** theta) * all_error
    noisy_forecast = data_translator(
        data=noisy_forecast,
        input_space=mp.predict_space,
        output_space=mp.predict_space
    )
    return noisy_forecast
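# The noise model above is: forecast_sim = prediction - prediction**theta * residual_sample,
# so theta controls how the perturbation scales with the predicted magnitude.
# A tiny self-contained sketch of that scaling (all numbers are illustrative):
#
#     import numpy as np
#
#     predictions = np.array([10., 100., 1000.])
#     residual_samples = np.array([[0.05, -0.02, 0.01],     # simulation 1
#                                  [-0.03, 0.04, -0.02]])   # simulation 2
#     theta = 1
#     noisy = predictions - predictions ** theta * residual_samples
#     # each row is one simulated forecast; with theta=1 the perturbation is
#     # proportional to the predicted value, with theta=0 it is additive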
def test_data_translator_exp(data, input_space, output_space):
    result = utils.data_translator(data, input_space, output_space)
    assert np.allclose(data, result)
def test_data_translator_diff(data, input_space, output_space):
    result = utils.data_translator(data, input_space, output_space)
    if 'log' in input_space:
        # log-space data must be exponentiated before checking that the
        # differenced output cumulatively sums back to the input
        assert np.allclose(np.exp(data), np.cumsum(np.exp(result), axis=1))
    else:
        assert np.allclose(data, np.cumsum(result, axis=1))
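# The fixtures/parameters `data`, `input_space`, and `output_space` used by the
# two tests above are supplied elsewhere (e.g. conftest.py or parametrize
# decorators) and are not shown in this file. A minimal sketch of what the data
# fixture could look like; the shape and values are assumptions:
#
#     import numpy as np
#     import pytest
#
#     @pytest.fixture
#     def data():
#         # two groups by five time points, strictly positive so log spaces are defined
#         return np.arange(1.0, 11.0).reshape(2, 5)
#
#     # input_space / output_space would then be parametrized over the library's
#     # space names via @pytest.mark.parametrize on the test functions.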