def plot_simulated_histogram_variety_test(parameters: np.array) -> None:
    """Simulate review histograms for each (rho-, rho+) pair and bar-plot them.

    Args:
        parameters: array of shape (num_simulations, 2); column 0 is rho-,
            column 1 is rho+ for each simulation.

    Side effects: opens one matplotlib figure per parameter pair; returns nothing.
    """
    num_simulations = parameters.shape[0]
    simulator = simulator_class.DoubleRhoSimulator(
        {"review_prior": np.ones(5), "tendency_to_rate": 0.05}
    )
    simulator.simulation_parameters = {"rho": parameters}
    # Run one simulation per parameter row in parallel, with a progress bar.
    with tqdm_joblib(tqdm(desc="Simulations", total=num_simulations)):
        histograms = Parallel(n_jobs=mp.cpu_count())(
            delayed(simulator.simulate_review_histogram)(idx)
            for idx in range(num_simulations)
        )
    histograms = np.array(histograms)

    for idx in range(num_simulations):
        plt.figure()
        label = (
            r"$\rho_{-}=$"
            + f"{parameters[idx, 0]}, "
            + r"$\rho_{+}=$"
            + f"{parameters[idx, 1]}"
        )
        plt.bar(
            [1, 2, 3, 4, 5],
            histograms[idx, :],
            width=0.5,
            color=sns.xkcd_rgb["grey"],
            label=label,
        )
        axes = plt.gca()
        # De-clutter: hide the top/right frame lines.
        for side in ("top", "right"):
            axes.spines[side].set_visible(False)
        plt.xlabel("Rating")
        plt.ylabel("Number of ratings")
        plt.legend(loc="upper left", fontsize=20)
def plot_simulated_vs_actual_histogram_test(
    observed_histograms: np.array,
    posterior_samples: np.array,
    products_to_test: np.array,
    plot_histograms: bool = False,
    return_raw_simulations: bool = True,
) -> np.ndarray:
    """Simulate histograms from posterior samples and compare them to observations.

    Args:
        observed_histograms: per-product observed rating histograms; rows are
            indexed by product id (products_to_test selects rows from it).
        posterior_samples: shape (num_samples, num_products, 2) array of
            (rho-, rho+) posterior samples per product.
        products_to_test: product indices (rows of observed_histograms) to simulate.
        plot_histograms: if True, plot observed histogram vs the 95% HPD band
            of the simulated histograms for each tested product.
        return_raw_simulations: if True return raw simulations, otherwise the
            normalized simulated histograms of shape (num_samples, num_products, 5).

    Returns:
        Raw simulations or normalized simulated histograms (see above).
    """
    print(posterior_samples.shape)
    products_to_test = products_to_test.astype("int")
    simulated_histograms = np.zeros((posterior_samples.shape[0], len(products_to_test), 5))
    params = {"review_prior": np.ones(5), "tendency_to_rate": 0.05, "simulation_type": "histogram"}
    simulator = simulator_class.DoubleRhoSimulator(params)
    # Take posterior samples of the products we want to test and use them as
    # simulation parameters: one simulation per (product, posterior sample).
    parameters = np.swapaxes(posterior_samples[:, products_to_test, :], 0, 1).reshape((-1, 2))
    simulator.simulation_parameters = {"rho": parameters}
    with tqdm_joblib(tqdm(desc="Simulations", total=parameters.shape[0])):
        simulations = Parallel(n_jobs=mp.cpu_count())(
            delayed(simulator.simulate_review_histogram)(i) for i in range(parameters.shape[0])
        )
    simulations = np.array(simulations)
    # Fortran order undoes the product-major flattening done above.
    simulated_histograms[:, :, :] = simulations.reshape((-1, len(products_to_test), 5), order="F")
    simulated_histograms /= np.sum(simulated_histograms, axis=-1)[:, :, None]

    if plot_histograms:
        for i, product in enumerate(products_to_test):
            plt.figure()
            # BUG FIX: observed_histograms is indexed by product id, so use
            # products_to_test[i] rather than the loop counter i (the original
            # plotted the wrong rows whenever products_to_test != 0..k-1).
            observed = observed_histograms[product, :]
            plt.plot(
                np.arange(5) + 1,
                observed / np.sum(observed),
                linewidth=4.0,
                color="black",
            )
            # 95% HPD band of the simulated (normalized) histograms.
            hpd = arviz.hdi(simulated_histograms[:, i, :], hdi_prob=0.95)
            plt.fill_between(np.arange(5) + 1, hpd[:, 0], hpd[:, 1], color="black", alpha=0.4)
            plt.ylim([0, 1])

    if return_raw_simulations:
        return simulations
    else:
        return simulated_histograms
def simulate(
    self, num_simulations: int, num_reviews_per_simulation: Optional[np.ndarray] = None
) -> None:
    """Run `num_simulations` review-histogram simulations in parallel.

    Args:
        num_simulations: number of independent simulations to run.
        num_reviews_per_simulation: optional per-simulation review counts;
            must have length `num_simulations` when provided.

    Raises:
        ValueError: if `num_reviews_per_simulation` length does not match
            `num_simulations`. (Was an `assert`, which is silently stripped
            under `python -O`; validation must not depend on assertions.)

    Side effects: sets `self.simulation_parameters` and `self.simulations`.
    """
    if num_reviews_per_simulation is not None and len(num_reviews_per_simulation) != num_simulations:
        raise ValueError(
            f"{num_simulations} simulations to be done, "
            f"but {len(num_reviews_per_simulation)} review counts per simulation provided"
        )
    self.simulation_parameters = self.generate_simulation_parameters(num_simulations)
    with tqdm_joblib(tqdm(desc="Simulations", total=num_simulations)):
        simulations = Parallel(n_jobs=mp.cpu_count())(
            delayed(self.simulate_review_histogram)(i, num_reviews_per_simulation)
            for i in range(num_simulations)
        )
    self.simulations = np.array(simulations)
def plot_test_parameter_recovery(
    parameters: np.array,
    num_posterior_samples: int,
    simulator_type: str,
    simulation_type: str,
    plot_posteriors: bool = False,
    get_stats: bool = False,
    param_posterior_prob_band: Optional[float] = None,
) -> np.ndarray:
    """Simulate data from known parameters, re-infer them, and assess recovery.

    Args:
        parameters: shape (num_simulations, 2) array of true (rho-, rho+) values.
        num_posterior_samples: posterior samples to draw per simulation.
        simulator_type, simulation_type: forwarded to the loaded simulator/inference.
        plot_posteriors: if True, plot per-simulation posterior histograms.
        get_stats: if True, write recovery statistics to
            ARTIFACT_PATH / "stats_parameter_recovery.txt" and plot the
            posterior-probability-band distribution.
        param_posterior_prob_band: half-width of the band around the true
            parameter values; required when get_stats is True.

    Returns:
        Posterior samples of shape (num_posterior_samples,) + parameters.shape.
    """
    # Simulate review histograms using the provided parameters.
    params = {"review_prior": np.ones(5), "tendency_to_rate": 0.05, "simulation_type": simulation_type}
    simulator = simulator_class.DoubleRhoSimulator(params)
    simulator.simulation_parameters = {"rho": parameters}
    with tqdm_joblib(tqdm(desc="Simulations", total=parameters.shape[0])):
        simulations = Parallel(n_jobs=mp.cpu_count())(
            delayed(simulator.simulate_review_histogram)(i) for i in range(parameters.shape[0])
        )
    simulations = np.array(simulations)

    # The parameter prior doesn't matter here as it will be overridden by that
    # of the loaded inference object.
    parameter_prior = sbi.utils.BoxUniform(
        low=torch.tensor([0.0, 0.0]).type(torch.FloatTensor),
        high=torch.tensor([4.0, 4.0]).type(torch.FloatTensor),
    )
    inferrer = inference_class.HistogramInference(parameter_prior=parameter_prior)
    inferrer.load_simulator(dirname=ARTIFACT_PATH, simulator_type=simulator_type, simulation_type=simulation_type)
    inferrer.load_inference(dirname=ARTIFACT_PATH)
    posterior_samples = inferrer.get_posterior_samples(simulations, num_samples=num_posterior_samples)

    # Plot the posterior samples inferred for the simulated data.
    # We will plot up to 4 plots in one row of the panel.
    if plot_posteriors:
        if len(parameters) <= 4:
            fig, ax = plt.subplots(1, len(parameters), squeeze=False)
        else:
            fig, ax = plt.subplots((len(parameters) + 1) // 4, 4, squeeze=False)
        row_index = 0
        for i in range(len(parameters)):
            if len(parameters) > 4:
                row_index = i // 4
            ax[row_index, i % 4].hist(
                posterior_samples[:, i, 0], color="black", alpha=0.5, bins=10, label=r"$\rho_{-}$"
            )
            ax[row_index, i % 4].axvline(x=parameters[i, 0], linewidth=3.0, color="black", linestyle="--")
            ax[row_index, i % 4].hist(
                posterior_samples[:, i, 1], color="red", alpha=0.5, bins=10, label=r"$\rho_{+}$"
            )
            ax[row_index, i % 4].axvline(x=parameters[i, 1], linewidth=3.0, color="red", linestyle="--")
            ax[row_index, i % 4].set_xlim([0, 4])
            ax[row_index, i % 4].set_xticks([0, 1, 2, 3, 4])
            ax[row_index, i % 4].legend()
        # Add a big invisible axis to hold shared x/y labels for the panel.
        fig.add_subplot(111, frameon=False)
        plt.tick_params(labelcolor="none", top=False, bottom=False, left=False, right=False)
        plt.xlabel(r"$\rho_{-}, \rho_{+}$")
        plt.ylabel("Number of samples")

    # If asked, print how many of the provided parameters are recovered by the
    # inference engine, i.e. how often the supplied parameters lie within the
    # 95% HPD of the posterior.
    if get_stats:
        # BUG FIX: use a context manager so the stats file is closed even if an
        # assertion below fails (the original open()/close() leaked on error).
        with open(ARTIFACT_PATH / "stats_parameter_recovery.txt", "w") as f:
            assert (
                posterior_samples.shape == (num_posterior_samples,) + parameters.shape
            ), f"""
            Expected shape {(num_posterior_samples,) + parameters.shape} for array of posterior samples,
            but got {posterior_samples.shape} instead
            """
            # First get the HPD of each recovered posterior distribution.
            hpd = np.array(
                [arviz.hdi(posterior_samples[:, i, :], hdi_prob=0.95) for i in range(parameters.shape[0])]
            )
            assert hpd.shape == parameters.shape + (2,), f"Found shape {hpd.shape} for hpd"
            # See how many of the supplied rho- and rho+ are contained in these HPDs.
            contained_rho_0 = [
                hpd[i, 0, 0] < parameters[i, 0] < hpd[i, 0, 1] for i in range(parameters.shape[0])
            ]
            contained_rho_1 = [
                hpd[i, 1, 0] < parameters[i, 1] < hpd[i, 1, 1] for i in range(parameters.shape[0])
            ]
            # BUG FIX: the messages below had a stray '"' after the percentage.
            print(
                f"""
                rho- is recovered {np.sum(contained_rho_0)} times out of {parameters.shape[0]} =
                {100*(np.sum(contained_rho_0) / parameters.shape[0]):0.2f}%
                """,
                file=f,
            )
            print(
                f"""
                rho+ is recovered {np.sum(contained_rho_1)} times out of {parameters.shape[0]} =
                {100*(np.sum(contained_rho_1) / parameters.shape[0]):0.2f}%
                """,
                file=f,
            )
            print("=======================================================", file=f)
            # Now get the probability that the posterior distribution puts in a
            # band/region around the passed parameter values. For good parameter
            # recovery, this number should be high.
            assert (
                param_posterior_prob_band is not None
            ), """
            Posterior probability band around parameter values need to be passed if stats are needed
            """
            param_band_low = parameters - param_posterior_prob_band
            param_band_high = parameters + param_posterior_prob_band
            rho_0_probs = (posterior_samples[:, :, 0] >= param_band_low[None, :, 0]) * (
                posterior_samples[:, :, 0] <= param_band_high[None, :, 0]
            )
            rho_0_probs = np.mean(rho_0_probs, axis=0)
            rho_1_probs = (posterior_samples[:, :, 1] >= param_band_low[None, :, 1]) * (
                posterior_samples[:, :, 1] <= param_band_high[None, :, 1]
            )
            rho_1_probs = np.mean(rho_1_probs, axis=0)
            print(
                f"""
                In {100*np.mean(rho_0_probs>=0.5):0.2f}% of cases, the inferred posterior places more than 50%
                probability in a band of {2*param_posterior_prob_band} around the true value of rho-
                """,
                file=f,
            )
            print(
                f"""
                In {100*np.mean(rho_1_probs>=0.5):0.2f}% of cases, the inferred posterior places more than 50%
                probability in a band of {2*param_posterior_prob_band} around the true value of rho+
                """,
                file=f,
            )
        # Finally, plot the distribution of the posterior probability the
        # inference engine places in a band around the true values.
        # NOTE(review): rho_0_probs/rho_1_probs only exist when get_stats is
        # True, so this plotting has to stay inside this branch.
        plt.figure()
        plt.hist(rho_0_probs, alpha=0.5, label=r"$\rho_{-}$")
        plt.hist(rho_1_probs, alpha=0.5, label=r"$\rho_{+}$")
        plt.legend()
        plt.title(
            f"Posterior probability placed by inference engine in a band of {2*param_posterior_prob_band}"
            + f"\n around the true value of the parameters ({parameters.shape[0]} trials)",
            fontsize=24.0,
        )

    return posterior_samples