def test_invalid_model_spec_raises_error(self):
    observed_time_series = tf.ones([2])
    design_matrix = tf.eye(2)
    with self.assertRaisesRegexp(ValueError,
                                 'Weights prior must be a univariate normal'):
        gibbs_sampler.build_model_for_gibbs_fitting(
            observed_time_series,
            design_matrix=design_matrix,
            weights_prior=tfd.StudentT(df=10, loc=0., scale=1.),
            level_variance_prior=tfd.InverseGamma(0.01, 0.01),
            observation_noise_variance_prior=tfd.InverseGamma(0.01, 0.01))

    with self.assertRaisesRegexp(
            ValueError, 'Level variance prior must be an inverse gamma'):
        gibbs_sampler.build_model_for_gibbs_fitting(
            observed_time_series,
            design_matrix=design_matrix,
            weights_prior=tfd.Normal(loc=0., scale=1.),
            level_variance_prior=tfd.LogNormal(0., 3.),
            observation_noise_variance_prior=tfd.InverseGamma(0.01, 0.01))

    with self.assertRaisesRegexp(
            ValueError, 'noise variance prior must be an inverse gamma'):
        gibbs_sampler.build_model_for_gibbs_fitting(
            observed_time_series,
            design_matrix=design_matrix,
            weights_prior=tfd.Normal(loc=0., scale=1.),
            level_variance_prior=tfd.InverseGamma(0.01, 0.01),
            observation_noise_variance_prior=tfd.LogNormal(0., 3.))
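# For contrast, a minimal sketch of a model spec these checks accept, using
# the same builder arguments the test exercises: a univariate normal weights
# prior and inverse-gamma variance priors.
model = gibbs_sampler.build_model_for_gibbs_fitting(
    tf.ones([2]),
    design_matrix=tf.eye(2),
    weights_prior=tfd.Normal(loc=0., scale=1.),
    level_variance_prior=tfd.InverseGamma(0.01, 0.01),
    observation_noise_variance_prior=tfd.InverseGamma(0.01, 0.01))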
def fn(key1, key2, seed=None):
    return [
        tfd.Normal(0., 1.).sample([3, 2], seed=seed),
        {
            key1: tfd.Poisson([1., 2., 3., 4.]).sample(seed=seed + 1),
            key2: tfd.LogNormal(0., 1.).sample(seed=seed + 2)
        }
    ]
def create_prior(K, a_p=1, b_p=1, a_gamma=1, b_gamma=1, m_loc=0, g_loc=0.1,
                 m_sigma=3, s_sigma=2, m_nu=0, s_nu=1, m_skew=0, g_skew=0.1,
                 dtype=np.float64):
    return tfd.JointDistributionNamed(
        dict(
            p=tfd.Beta(dtype(a_p), dtype(b_p)),
            gamma_C=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
            gamma_T=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
            eta_C=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
            eta_T=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
            nu=tfd.Sample(tfd.LogNormal(dtype(m_nu), s_nu), sample_shape=K),
            sigma_sq=tfd.Sample(
                tfd.InverseGamma(dtype(m_sigma), dtype(s_sigma)),
                sample_shape=K),
            loc=lambda sigma_sq: tfd.Independent(
                tfd.Normal(dtype(m_loc), g_loc * tf.sqrt(sigma_sq)),
                reinterpreted_batch_ndims=1),
            skew=lambda sigma_sq: tfd.Independent(
                tfd.Normal(dtype(m_skew), g_skew * tf.sqrt(sigma_sq)),
                reinterpreted_batch_ndims=1),
        ))
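# A minimal usage sketch (not from the original source): draw once from the
# prior to inspect the structure of the joint sample. Assumes the usual
# imports (`tensorflow` as tf, TFP distributions as tfd, `numpy` as np).
prior = create_prior(K=3)
draw = prior.sample(seed=1)
for name, value in draw.items():
    print(name, value.shape)  # e.g. eta_C: (3,), sigma_sq: (3,), p: ()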
def create_dp_sb_gmm(nobs, K, dtype=np.float64):
    return tfd.JointDistributionNamed(
        dict(
            # Mixture means.
            mu=tfd.Independent(
                tfd.Normal(np.zeros(K, dtype), 3),
                reinterpreted_batch_ndims=1),
            # Mixture scales.
            sigma=tfd.Independent(
                tfd.LogNormal(loc=np.full(K, -2, dtype), scale=0.5),
                reinterpreted_batch_ndims=1),
            # Mixture weights (stick-breaking construction).
            alpha=tfd.Gamma(concentration=np.float64(1.0), rate=10.0),
            v=lambda alpha: tfd.Independent(
                # NOTE: Dave Moore suggests doing this instead, to ensure
                # that a batch dimension in alpha doesn't conflict with
                # the other parameters.
                tfd.Beta(np.ones(K - 1, dtype), alpha[..., tf.newaxis]),
                reinterpreted_batch_ndims=1),
            # Observations (likelihood).
            obs=lambda mu, sigma, v: tfd.Sample(
                tfd.MixtureSameFamily(
                    # The mixture assignments are marginalized over.
                    mixture_distribution=tfd.Categorical(probs=stickbreak(v)),
                    components_distribution=tfd.Normal(mu, sigma)),
                sample_shape=nobs)))
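# `stickbreak` is referenced above but not defined in this snippet. A minimal
# sketch of the standard stick-breaking transform it presumably implements
# (an assumption, not the original helper): map K - 1 Beta draws `v` to K
# mixture weights that sum to one along the last axis.
def stickbreak(v):
    batch_ndims = len(v.shape) - 1
    cumprod_one_minus_v = tf.math.cumprod(1 - v, axis=-1)
    one_v = tf.pad(v, [[0, 0]] * batch_ndims + [[0, 1]], constant_values=1)
    c_one = tf.pad(cumprod_one_minus_v,
                   [[0, 0]] * batch_ndims + [[1, 0]], constant_values=1)
    return one_v * c_one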
def regularizer(t):
    out = tfd.LogNormal(0., 1.).log_prob(
        1e-5 + tf.nn.softplus(c + t[Ellipsis, -1]))
    return -tf.reduce_sum(out) / num_updates
def _init_distribution(conditions, **kwargs):
    loc, scale = conditions["loc"], conditions["scale"]
    return tfd.LogNormal(loc=loc, scale=scale, **kwargs)
            for m, s in zip(theta["mus"], theta["log_sigmas"])
        ])
    },
    "log-normal": {
        "parameters": {
            "mean": {
                "support": [-inf, inf],
                "activation function": identity
            },
            "variance": {
                "support": [0, inf],
                "activation function": softplus
            }
        },
        "class": lambda theta: tensorflow_distributions.LogNormal(
            loc=theta["mean"], scale=tf.sqrt(theta["variance"]))
    },
    "exponentially_modified_gaussian": {
        "parameters": {
            "location": {
                "support": [-inf, inf],
                "activation function": identity
            },
            "scale": {
                "support": [0, inf],
                "activation function": softplus
            },
            "rate": {
                "support": [0, inf],
                "activation function": softplus
            }
def compute_LK(alpha, rho, X, jitter=1e-6):
    kernel = SqExpKernel(alpha, rho)
    K = kernel.matrix(X, X) + tf.eye(X.shape[0], dtype=dtype) * jitter
    return tf.linalg.cholesky(K)


def compute_f(alpha, rho, beta, eta):
    LK = compute_LK(alpha, rho, X)
    f = tf.linalg.matvec(LK, eta)  # LK * eta (matrix * vector).
    return f + beta[..., tf.newaxis]


# GP binary classification model.
gpc_model = tfd.JointDistributionNamed(
    dict(
        alpha=tfd.LogNormal(dtype(0), dtype(1)),
        rho=tfd.LogNormal(dtype(0), dtype(1)),
        beta=tfd.Normal(dtype(0), dtype(1)),
        eta=tfd.Sample(tfd.Normal(dtype(0), dtype(1)),
                       sample_shape=X.shape[0]),
        # NOTE: `Sample` and `Independent` resemble, respectively,
        # `filldist` and `arraydist` in Turing.
        obs=lambda alpha, rho, beta, eta: tfd.Independent(
            tfd.Bernoulli(logits=compute_f(alpha, rho, beta, eta)),
            reinterpreted_batch_ndims=1)))

### MODEL SET UP ###
# Sampling once is needed so the compiler knows the correct model
# parameter dimensions.
_ = gpc_model.sample()
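# A minimal inference sketch (an assumption, not part of the original source):
# pin the observed labels `y` (a hypothetical array of 0/1 labels for the rows
# of X) and evaluate the unnormalized posterior log density, as one would
# inside an MCMC transition kernel.
def target_log_prob_fn(alpha, rho, beta, eta):
    return gpc_model.log_prob(alpha=alpha, rho=rho, beta=beta, eta=eta, obs=y)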
def _base_dist(self, mu: TensorLike, sigma: TensorLike, *args, **kwargs):
    return tfd.LogNormal(loc=mu, scale=sigma, **kwargs)
plt.legend()

# Here we will use the squared exponential covariance function:
#
# $$
# \alpha^2 \cdot \exp\left\{-\frac{d^2}{2\rho^2}\right\}
# $$
#
# where $\alpha$ is the amplitude of the covariance, $\rho$ is the length
# scale, which controls how slowly information decays with distance (a larger
# $\rho$ means information about a point can be used for data far away), and
# $d$ is the distance.

# In[4]:

# Specify the GP model.
gp_model = tfd.JointDistributionNamed(
    dict(
        amplitude=tfd.LogNormal(dtype(0), dtype(0.1)),  # amplitude
        length_scale=tfd.LogNormal(dtype(0), dtype(1)),  # length scale
        v=tfd.LogNormal(dtype(0), dtype(1)),  # model sd
        obs=lambda length_scale, amplitude, v: tfd.GaussianProcess(
            kernel=tfp.math.psd_kernels.ExponentiatedQuadratic(
                amplitude, length_scale),
            index_points=X[..., np.newaxis],
            observation_noise_variance=v)))

# Run graph to make sure it works.
_ = gp_model.sample()

# Initial values.
initial_state = [
    1e-1 * tf.ones([], dtype=np.float64, name='amplitude'),
    1e-1 * tf.ones([], dtype=np.float64, name='length_scale'),
    1e-1 * tf.ones([], dtype=np.float64, name='v'),
]
hmm = tfd.HiddenMarkovModel(
    initial_distribution=tfd.Categorical(logits=initial_state_logits),
    transition_distribution=tfd.Categorical(probs=transition_probs),
    observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
    num_steps=len(observed_counts))

true_rates = [40, 3, 20, 50]
true_durations = [10, 20, 5, 35]

plt.plot(observed_counts)
# plt.show()

rate_prior = tfd.LogNormal(5, 5)


def log_prob():
    return (tf.reduce_sum(
        rate_prior.log_prob(tf.math.exp(trainable_log_rates))) +
            hmm.log_prob(observed_counts))


optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)


@tf.function(autograph=False)
def train_op():
    with tf.GradientTape() as tape:
        neg_log_prob = -log_prob()
    grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
    optimizer.apply_gradients([(grads, trainable_log_rates)])
    return neg_log_prob, tf.math.exp(trainable_log_rates)
qv_rho = tf.Variable(
    tf.random.normal([ncomponents - 1], dtype=np.float64) - 1, name='qv_rho')
qalpha_loc = tf.Variable(tf.random.normal([], dtype=np.float64),
                         name='qalpha_loc')
qalpha_rho = tf.Variable(tf.random.normal([], dtype=np.float64),
                         name='qalpha_rho')

# Create the variational distribution.
surrogate_posterior = tfd.JointDistributionNamed(
    dict(
        # qmu
        mu=tfd.Independent(tfd.Normal(qmu_loc, tf.nn.softplus(qmu_rho)),
                           reinterpreted_batch_ndims=1),
        # qsigma
        sigma=tfd.Independent(
            tfd.LogNormal(qsigma_loc, tf.nn.softplus(qsigma_rho)),
            reinterpreted_batch_ndims=1),
        # qv
        v=tfd.Independent(tfd.LogitNormal(qv_loc, tf.nn.softplus(qv_rho)),
                          reinterpreted_batch_ndims=1),
        # qalpha
        alpha=tfd.LogNormal(qalpha_loc, tf.nn.softplus(qalpha_rho))))

# In[12]:

# Run the optimizer.
# @tf.function(autograph=False, experimental_compile=True)  # Makes it slower?
def run_advi(optimizer, sample_size=1, num_steps=2000, seed=1):
    return tfp.vi.fit_surrogate_posterior(
        target_log_prob_fn=target_log_prob_fn,
        surrogate_posterior=surrogate_posterior,
        optimizer=optimizer,
        sample_size=sample_size,
        num_steps=num_steps,
        seed=seed)
def get_list_of_moment_map(fitting_area):

    def build_latent_state(num_states, max_num_states, daily_change_prob=0.05):
        # Give probability exp(-100) ~= 0 to states outside of the current
        # model.
        initial_state_logits = -100. * np.ones([max_num_states],
                                               dtype=np.float32)
        initial_state_logits[:num_states] = 0.
        initial_state_logits[0] = 1.
        # Build a transition matrix that transitions only within the current
        # `num_states` states.
        transition_probs = np.eye(max_num_states, dtype=np.float32)
        if num_states > 1:
            transition_probs[:num_states, :num_states] = (
                daily_change_prob / (num_states - 1))
            np.fill_diagonal(transition_probs[:num_states, :num_states],
                             1 - daily_change_prob)
        return initial_state_logits, transition_probs

    max_num_states = 10
    batch_initial_state_logits = []
    batch_transition_probs = []
    for num_states in range(1, max_num_states + 1):
        initial_state_logits, transition_probs = build_latent_state(
            num_states=num_states, max_num_states=max_num_states)
        batch_initial_state_logits.append(initial_state_logits)
        batch_transition_probs.append(transition_probs)
    batch_initial_state_logits = np.array(batch_initial_state_logits)
    batch_transition_probs = np.array(batch_transition_probs)

    trainable_log_rates = tf.Variable(
        (np.log(np.mean(fitting_area)) *
         np.ones([batch_initial_state_logits.shape[0], max_num_states]) +
         tf.random.normal([1, max_num_states])),
        name='log_rates')

    hmm = tfd.HiddenMarkovModel(
        initial_distribution=tfd.Categorical(
            logits=batch_initial_state_logits),
        transition_distribution=tfd.Categorical(probs=batch_transition_probs),
        observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
        num_steps=len(fitting_area))

    rate_prior = tfd.LogNormal(5, 5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    def log_prob():
        prior_lps = rate_prior.log_prob(tf.math.exp(trainable_log_rates))
        prior_lp = tf.stack([
            tf.reduce_sum(prior_lps[i, :i + 1]) for i in range(max_num_states)
        ])
        return prior_lp + hmm.log_prob(fitting_area)

    @tf.function(autograph=False)
    def train_op():
        with tf.GradientTape() as tape:
            neg_log_prob = -log_prob()
        grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
        optimizer.apply_gradients([(grads, trainable_log_rates)])
        return neg_log_prob, tf.math.exp(trainable_log_rates)

    for step in range(201):
        loss, rates = [t.numpy() for t in train_op()]
        if step % 20 == 0:
            print("step {}: loss {}".format(step, loss))

    posterior_probs = hmm.posterior_marginals(
        fitting_area).probs_parameter().numpy()
    most_probable_states = np.argmax(posterior_probs, axis=-1)

    fig = plt.figure(figsize=(14, 12))
    for i, learned_model_rates in enumerate(rates):
        ax = fig.add_subplot(4, 3, i + 1)
        ax.plot(learned_model_rates[most_probable_states[i]],
                c='green', lw=3, label='inferred rate')
        ax.plot(fitting_area, c='black', alpha=0.3, label='observed counts')
        ax.set_ylabel("latent rate")
        ax.set_xlabel("time")
        ax.set_title("{}-state model".format(i + 1))
        ax.legend(loc=4)
    plt.tight_layout()
    plt.show()

    fig = plt.figure(figsize=(14, 12))
    list_of_moment_map = []
    for number_of_states in range(max_num_states):
        moment_map = {}
        ax = fig.add_subplot(4, 3, number_of_states + 1)
        for state_no in range(max_num_states):
            moment_map[state_no] = []
        index = 0
        for state in most_probable_states[number_of_states]:
            moment_map[state].append(index)
            index += 1
        # moment_map = {k: v for k, v in moment_map.items() if len(v) > 0}
        frequency_count = [len(moment_map[x]) / index for x in moment_map]
        bar1 = ax.bar(range(len(moment_map)), frequency_count)
        # autolabel(bar1, most_probable_states[number_of_states])
        ax.set_ylim(0, 1.1)
        ax.set_xlabel("state id")
        ax.set_title("{}-state model".format(number_of_states + 1))
        list_of_moment_map.append(moment_map)
    plt.tight_layout()
    plt.savefig("rate_frequency.png")
    plt.clf()
    return list_of_moment_map
def latent_state_number_changing_curve(fitting_area,
                                       output_dir_prefix,
                                       log_dir_prefix,
                                       log_dir,
                                       fig_name=""):
    max_num_states = 10

    def build_latent_state(num_states, max_num_states, daily_change_prob=0.05):
        # Give probability exp(-100) ~= 0 to states outside of the current
        # model.
        initial_state_logits = -100. * np.ones([max_num_states],
                                               dtype=np.float32)
        initial_state_logits[:num_states] = 0.
        initial_state_logits[0] = 1.
        # Build a transition matrix that transitions only within the current
        # `num_states` states.
        transition_probs = np.eye(max_num_states, dtype=np.float32)
        if num_states > 1:
            transition_probs[:num_states, :num_states] = (
                daily_change_prob / (num_states - 1))
            np.fill_diagonal(transition_probs[:num_states, :num_states],
                             1 - daily_change_prob)
        return initial_state_logits, transition_probs

    # For each candidate model, build the initial state prior and transition
    # matrix.
    batch_initial_state_logits = []
    batch_transition_probs = []
    for num_states in range(1, max_num_states + 1):
        initial_state_logits, transition_probs = build_latent_state(
            num_states=num_states, max_num_states=max_num_states)
        batch_initial_state_logits.append(initial_state_logits)
        batch_transition_probs.append(transition_probs)
    batch_initial_state_logits = np.array(batch_initial_state_logits)
    batch_transition_probs = np.array(batch_transition_probs)

    trainable_log_rates = tf.Variable(
        (np.log(np.mean(fitting_area)) *
         np.ones([batch_initial_state_logits.shape[0], max_num_states]) +
         tf.random.normal([1, max_num_states])),
        name='log_rates')

    hmm = tfd.HiddenMarkovModel(
        initial_distribution=tfd.Categorical(
            logits=batch_initial_state_logits),
        transition_distribution=tfd.Categorical(probs=batch_transition_probs),
        observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
        num_steps=len(fitting_area))

    rate_prior = tfd.LogNormal(5, 5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    def log_prob():
        prior_lps = rate_prior.log_prob(tf.math.exp(trainable_log_rates))
        prior_lp = tf.stack([
            tf.reduce_sum(prior_lps[i, :i + 1]) for i in range(max_num_states)
        ])
        return prior_lp + hmm.log_prob(fitting_area)

    @tf.function(autograph=False)
    def train_op():
        with tf.GradientTape() as tape:
            neg_log_prob = -log_prob()
        grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
        optimizer.apply_gradients([(grads, trainable_log_rates)])
        return neg_log_prob, tf.math.exp(trainable_log_rates)

    for step in range(201):
        loss, rates = [t.numpy() for t in train_op()]
        if step % 20 == 0:
            print("step {}: loss {}".format(step, loss))

    num_states = np.arange(1, max_num_states + 1)
    fig = plt.figure(figsize=(8, 6))
    plt.plot(num_states, -loss, "b-", label="likelihood")
    plt.ylabel("marginal likelihood $\\tilde{p}(x)$")
    plt.xlabel("number of latent states")
    plt.legend()
    plt.twinx()
    plt.plot(num_states, np.gradient(-loss), "g--", label="gradient")
    plt.ylabel("Gradient of the likelihood")
    plt.title("Model selection on latent states")
    plt.legend()

    output_path = output_dir_prefix + log_dir.replace(log_dir_prefix,
                                                      "").replace("/", "_")
    mkdir_p(output_path)
    plt.savefig("{}/{}_likelihood_curve.pdf".format(output_path, fig_name),
                bbox_inches="tight")
    plt.savefig("{}/{}_likelihood_curve.png".format(output_path, fig_name),
                bbox_inches="tight")
    plt.clf()

    posterior_probs = hmm.posterior_marginals(
        fitting_area).probs_parameter().numpy()
    most_probable_states = np.argmax(posterior_probs, axis=-1)

    fig = plt.figure(figsize=(14, 12))
    for i, learned_model_rates in enumerate(rates):
        ax = fig.add_subplot(4, 3, i + 1)
        ax.plot(learned_model_rates[most_probable_states[i]],
                c='green', lw=3, label='inferred rate')
        ax.plot(fitting_area, c='black', alpha=0.3, label='observed counts')
        ax.set_ylabel("latent rate")
        ax.set_xlabel("time")
        ax.set_title("{}-state model".format(i + 1))
        ax.legend(loc=4)
    plt.tight_layout()
    plt.savefig("{}/{}_model_fitting_test.pdf".format(output_path, fig_name),
                bbox_inches="tight")
    plt.savefig("{}/{}_model_fitting_test.png".format(output_path, fig_name),
                bbox_inches="tight")
    plt.clf()
def HMM_on_one_file(log_dir):
    stdout_file, LOG_file, report_csv = get_log_and_std_files(log_dir)
    data_set = load_log_and_qps(LOG_file, report_csv)
    bucket_df = vectorize_by_compaction_output_level(data_set)
    bucket_df["qps"] = data_set.qps_df["interval_qps"]
    _ = bucket_df.plot(subplots=True)

    # memtable filling, flush only, L0 compaction (CPU busy),
    # crowded compaction (disk busy)
    num_states = 5

    # Nearly uniform initial distribution over states, with extra weight on
    # state 0, the possibility of starting in the flushing-limitation state.
    initial_state_logits = np.zeros([num_states], dtype=np.float32)
    initial_state_logits[0] = 1.0
    initial_distribution = tfd.Categorical(logits=initial_state_logits)

    daily_change_prob = 0.05
    transition_probs = daily_change_prob / (num_states - 1) * np.ones(
        [num_states, num_states], dtype=np.float32)
    np.fill_diagonal(transition_probs, 1 - daily_change_prob)
    transition_distribution = tfd.Categorical(probs=transition_probs)

    observed_counts = bucket_df["qps"].fillna(0).tolist()
    observed_counts = np.array(observed_counts).astype(np.float32)

    trainable_log_rates = tf.Variable(
        np.log(np.mean(observed_counts)) + tf.random.normal([num_states]),
        name='log_rates')

    hmm = tfd.HiddenMarkovModel(
        initial_distribution=initial_distribution,
        transition_distribution=transition_distribution,
        observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
        num_steps=len(observed_counts))

    rate_prior = tfd.LogNormal(5, 5)

    def log_prob():
        return (tf.reduce_sum(
            rate_prior.log_prob(tf.math.exp(trainable_log_rates))) +
                hmm.log_prob(observed_counts))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    @tf.function(autograph=False)
    def train_op():
        with tf.GradientTape() as tape:
            neg_log_prob = -log_prob()
        grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
        optimizer.apply_gradients([(grads, trainable_log_rates)])
        return neg_log_prob, tf.math.exp(trainable_log_rates)

    for step in range(201):
        loss, rates = [t.numpy() for t in train_op()]
        if step % 20 == 0:
            print("step {}: log prob {} rates {}".format(step, -loss, rates))

    posterior_dists = hmm.posterior_marginals(observed_counts)
    posterior_probs = posterior_dists.probs_parameter().numpy()
    most_probable_states = np.argmax(posterior_probs, axis=1)
    most_probable_rates = rates[most_probable_states]

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(most_probable_rates, c='green', lw=3, label='inferred rate')
    ax.plot(observed_counts, c='black', alpha=0.3, label='observed counts')
    ax.set_ylabel("latent rate")
    ax.set_xlabel("time")
    ax.set_title("Inferred latent rate over time")
    ax.legend(loc=4)

    output_path = "image/" + log_dir.replace("log_files/", "").replace(
        "/", "_")
    mkdir_p(output_path)
    plt.savefig("{}/state_guessing.pdf".format(output_path),
                bbox_inches="tight")