def general_model_body(self, r, f_param, vis_transformer_params, m_t, v_t, prev_ll,
                       the_number_of_rounds_under_acc_diff, prev_delta,
                       stable_r, stable_f_param, stable_vis_transformer_params,
                       stable_m_t, stable_v_t, stable_prev_ll,
                       stable_the_number_of_rounds_under_acc_diff, stable_prev_delta,
                       step, alpha, max_ll, extra_took_steps):
    """Dispatch one optimisation iteration of the training while-loop.

    Every 100th step runs ``self.recheck_model_body`` and every other step runs
    ``self.normal_loop_body``; both receive and return the same 20 loop-state
    tensors in the same order.  Afterwards the first five updated tensors are
    NaN-guarded and every output tensor gets its static shape re-pinned from
    the corresponding input (``tf.cond`` drops static shape information, and
    the enclosing ``tf.while_loop`` needs stable shape invariants).

    All 20 positional arguments are loop-state tensors; returns the 20 updated
    state tensors in the same order.
    """
    # Keep the incoming state as one ordered tuple so both branch calls and
    # the shape-restoring loop below can't drift out of sync with each other.
    old_state = (r, f_param, vis_transformer_params, m_t, v_t, prev_ll,
                 the_number_of_rounds_under_acc_diff, prev_delta,
                 stable_r, stable_f_param, stable_vis_transformer_params,
                 stable_m_t, stable_v_t, stable_prev_ll,
                 stable_the_number_of_rounds_under_acc_diff, stable_prev_delta,
                 step, alpha, max_ll, extra_took_steps)

    # Every 100 steps take the "recheck" branch, otherwise the normal update.
    new_state = list(tf.cond(
        tf.equal(step % 100, tf.constant(0, dtype=tf.int32)),
        lambda: self.recheck_model_body(*old_state),
        lambda: self.normal_loop_body(*old_state)))

    # NaN-guard the tensors the update math can corrupt (same five tensors and
    # the same log labels as before).
    for idx, label in enumerate(('r', 'trans_f_params', 'trans vis params', 'm', 'v')):
        new_state[idx] = check_nan(new_state[idx], label)

    # Restore the static shapes lost through tf.cond for every state tensor.
    for new_tensor, old_tensor in zip(new_state, old_state):
        new_tensor.set_shape(old_tensor.get_shape())

    return tuple(new_state)
def normal_loop_body(self, r, f_params, vis_transformer_params, m_t, v_t, prev_ll, the_number_of_rounds_under_acc_diff, prev_delta, stable_r, stable_f_params, stable_vis_transformer_params, stable_m_t, stable_v_t, stable_prev_ll, stable_the_number_of_rounds_under_acc_diff, stable_prev_delta, step, alpha, max_ll, extra_took_steps):
    """One Adam-style gradient step over a mini-batch of non-zero interactions
    (variant with a 5-element distance-function parameter vector ``f_params``).

    Acts as a ``tf.while_loop`` body: takes the 20 loop-state tensors and
    returns them updated in the same order.  The ``stable_*`` tensors plus
    ``prev_ll``, ``max_ll`` and ``extra_took_steps`` are passed through
    unchanged here (presumably maintained by the companion recheck body —
    confirm).

    NOTE(review): three methods named ``normal_loop_body`` with different
    signatures appear in this file — presumably they belong to different
    classes; if they share one class the later definition shadows this one.
    """
    # sub-sampling for mini batch: use all interactions when the batch size
    # equals the sample count, otherwise gather a random (with-replacement)
    # subset of rows.
    nzero_mb = tf.cond(
        tf.equal(self.non_zero_batch_size, self.non_zero_samples_num),
        lambda: self.insig_interactions,
        lambda: tf.gather(
            self.insig_interactions,
            tf.random_uniform([self.non_zero_batch_size], maxval=self.non_zero_samples_num, dtype=tf.int32)))
    # calculating derivations of nonzero batch
    # columns: [:, 0] and [:, 1] index into self.vis; [:, 2] is the observed
    # count x_ij; distance d_ij is the absolute index difference — presumably
    # a genomic-bin distance; confirm against the data loader.
    si = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 0]))
    sj = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 1]))
    xij = tf.cast(nzero_mb[:, 2], tf.float64)
    d_ij = tf.cast(tf.abs(nzero_mb[:, 0] - nzero_mb[:, 1]), tf.float64)
    ln_dij = tf.log(d_ij)
    # %%
    # Transform visibilities: raise to spower, then soft-max each against the
    # base value bv; tf_soft_max also returns the two components' shares.
    bv = vis_transformer_params[1]
    spower = vis_transformer_params[0]
    tsi = tf.pow(si, spower)
    tsj = tf.pow(sj, spower)
    tvi, tsi_share, bv_i_share = tf_soft_max(tsi, bv, True)
    tvj, tsj_share, bv_j_share = tf_soft_max(tsj, bv, True)
    # Distance-dependent factor: soft-max of the fitted curve against the
    # "free" (distance-independent) level exp(f_params[4]).
    f_var_ij = tf_cis_var_func(f_params, d_ij)
    f_free = tf.exp(f_params[4])
    f_d_ij, var_f_share, free_f_share = tf_soft_max(f_var_ij, f_free, True)
    # Expected count mu_ij and shared factor of the likelihood derivatives;
    # the digamma/log terms below look like a negative-binomial log-likelihood
    # derivative in r (with L2-style regularization) — confirm against the
    # model derivation.
    mu_ij = tvi * tvj * f_d_ij
    common_der = (xij - mu_ij) / (r + mu_ij)
    r_der = r * (
        -2.0 * self.init_regularization_factor * r +
        (tf.log(r) - tf.digamma(r - 1)) + tf.reduce_mean(
            tf.digamma(xij + r - 1) - common_der - tf.log(r + mu_ij)))
    # %%
    # NOTE(review): the division by (tsi_share + tsj_share) binds only to the
    # tsj term here; if the intent was ((A + B) / (sum)) the parentheses are
    # missing — confirm against the hand derivation before changing.
    vis_transformer_params_der = tf.stack([
        2.0 * tf.sqrt(spower) * r * tf_weighted_average(
            common_der * ((tsi_share * tsi * tf.log(si)) +
                          (tsj_share * tsj * tf.log(sj)) /
                          (tsi_share + tsj_share)),
            tsi_share + tsj_share),
        2.0 * tf.sqrt(bv) * r * tf_weighted_average(common_der, bv_i_share + bv_j_share)
    ])
    # Derivatives w.r.t. the distance-curve parameters; NaN/Inf-guarded
    # because the shares can vanish.
    f_var_common_der = common_der * f_var_ij / f_d_ij
    f_var_common_der = check_nan(f_var_common_der, 'f_var_common_der')
    ln_dij = check_nan(ln_dij, 'ln_dij')
    var_f_share = check_nan(var_f_share, 'var_f_share')
    var_f_share = check_inf(var_f_share, 'var_f_share')
    f_params = check_nan(f_params, 'f_params')  # rebinds the parameter tensor
    f_drev = tf.stack([
        f_params[0] * r * tf_weighted_average(
            f_var_common_der * tf.pow(ln_dij, 3), var_f_share),
        r * tf_weighted_average(f_var_common_der * tf.pow(ln_dij, 2), var_f_share),
        r * tf_weighted_average(f_var_common_der * ln_dij, var_f_share),
        r * tf_weighted_average(f_var_common_der, var_f_share)
    ])
    f_drev = check_nan(f_drev, 'f_drev')
    free_f_drev = r * tf_weighted_average(
        (common_der / f_d_ij), free_f_share)
    # %%
    # Full gradient vector, ordered: [r, 4 curve params, free level, 2 vis params].
    non_zero_g_t = tf.concat([
        tf.expand_dims(r_der, 0), f_drev,
        tf.expand_dims(free_f_drev, 0), vis_transformer_params_der
    ], axis=0)
    # ****************************************
    g_t = non_zero_g_t
    g_t = check_nan(g_t, 'g_t')
    # updating values: Adam-style bias-corrected first/second moments.
    b1 = tf.constant(self.beta1, dtype=tf.float64)
    b2 = tf.constant(self.beta2, dtype=tf.float64)
    r_t = tf.cast(step + 1, tf.float64)
    r_m = b1 * m_t + (1 - b1) * g_t
    r_v = b2 * v_t + (1 - b2) * tf.pow(g_t, 2)
    a_t = alpha * (tf.sqrt(1 - tf.pow(b2, r_t))) / (1 - tf.pow(b1, r_t))
    delta_vals = a_t * r_m / (tf.sqrt(r_v) + self.eps)
    delta_vals = check_nan(delta_vals, 'delta_vals')
    # r is updated in log space so it stays positive.
    r_p = tf.log(r) + delta_vals[0]
    new_r = tf.exp(r_p)
    # f_params[0] is kept negative: update log(-f_params[0]) then negate back.
    a3_p = tf.log(-1.0 * f_params[0]) + delta_vals[1]
    new_f_params = tf.stack([
        -1.0 * tf.exp(a3_p), f_params[1] + delta_vals[2],
        f_params[2] + delta_vals[3], f_params[3] + delta_vals[4],
        f_params[4] + delta_vals[5]
    ])
    # %%
    # Vis params updated in sqrt space so they stay non-negative.
    new_vis_transformer_params = tf.pow(
        tf.sqrt(vis_transformer_params) + delta_vals[6:6 + self.n_vis_t_params], 2)
    # Count consecutive rounds where every update was below the accuracy
    # threshold; reset to 0 otherwise (convergence counter).
    new_the_number_of_rounds_under_acc_diff = tf.cond(
        tf.less(tf.reduce_max(tf.abs(delta_vals)), self.acc_diff_limit),
        lambda: the_number_of_rounds_under_acc_diff + 1, lambda: tf.constant(0))
    # checking flips: shrink the learning rate x0.1 when the max elementwise
    # product of current and previous deltas is negative, i.e. every
    # component's update changed sign.
    new_alpha = tf.cond(
        tf.less(tf.reduce_max(delta_vals * prev_delta),
                tf.constant(0.0, dtype=tf.float64)), lambda: 0.1 * alpha,
        lambda: alpha)
    new_step = step + tf.constant(1, dtype=tf.int32)
    return new_r, new_f_params, new_vis_transformer_params, r_m, r_v, prev_ll, \
        new_the_number_of_rounds_under_acc_diff, delta_vals, \
        stable_r, stable_f_params, stable_vis_transformer_params, stable_m_t, stable_v_t, stable_prev_ll,\
        stable_the_number_of_rounds_under_acc_diff, stable_prev_delta, new_step, new_alpha, max_ll, extra_took_steps
def normal_loop_body(self, r, dist_params, b1_v_params, b2_v_params, m_t, v_t, prev_ll, the_number_of_rounds_under_acc_diff, prev_delta, stable_r, stable_dist_params, stable_b1_v_params, stable_b2_v_params, stable_m_t, stable_v_t, stable_prev_ll, stable_the_number_of_rounds_under_acc_diff, stable_prev_delta, step, alpha, max_ll, extra_took_steps):
    """One Adam-style gradient step over a mini-batch of training interactions
    (variant with separate visibility-parameter pairs ``b1_v_params`` /
    ``b2_v_params`` for the two interaction sides).

    Acts as a ``tf.while_loop`` body: takes the 22 loop-state tensors and
    returns them updated in the same order.  ``self.equal_v_params`` is read
    at graph-construction time (a Python ``if``, not ``tf.cond``), so the
    tied/untied choice is fixed when the graph is built.  The ``stable_*``
    tensors plus ``prev_ll``, ``max_ll`` and ``extra_took_steps`` are passed
    through unchanged here (presumably maintained by the recheck body —
    confirm).

    NOTE(review): several methods named ``normal_loop_body`` with different
    signatures appear in this file — presumably different classes; confirm.
    """
    # sub-sampling for mini batch: all rows, or a random (with-replacement)
    # subset of self.training_ints.
    nzero_mb = tf.cond(
        tf.equal(self.non_zero_batch_size, self.non_zero_samples_num),
        lambda: self.training_ints,
        lambda: tf.gather(
            self.training_ints,
            tf.random_uniform([self.non_zero_batch_size], maxval=self.non_zero_samples_num, dtype=tf.int32)))
    # abbreviate: d_Y_d_X = dY/dX, l = LogL, b = base, v = var as softmax(v, b), sh as share
    # calculating derivations of nonzero batch
    # columns: [:, 0]/[:, 1] index self.vis; [:, 2] is the observed count.
    si = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 0]))
    sj = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 1]))
    xij = tf.cast(nzero_mb[:, 2], tf.float64)
    d_ij = tf.cast(tf.abs(nzero_mb[:, 0] - nzero_mb[:, 1]), tf.float64)
    ln_dij = tf.log(d_ij)
    # %%
    # Each side's visibility uses its own [power, base] pair.
    tsi = tf.pow(si, b1_v_params[0])
    tsj = tf.pow(sj, b2_v_params[0])
    tvi, tsi_share, bv_i_share = tf_soft_max(tsi, b1_v_params[1], True)
    tvj, tsj_share, bv_j_share = tf_soft_max(tsj, b2_v_params[1], True)
    # Distance factor: soft-max of the fitted curve vs the free level
    # exp(dist_params[4]).
    f_var_ij = tf_cis_var_func(dist_params, d_ij)
    f_free = tf.exp(dist_params[4])
    f_d_ij, var_f_share, free_f_share = tf_soft_max(f_var_ij, f_free, True)
    mu_ij = tvi * tvj * f_d_ij
    # MAIN PARTS DERIVATION
    # Shared derivative factor; the digamma/log structure below resembles a
    # negative-binomial log-likelihood derivative in r — confirm against the
    # model derivation.
    common_der = (xij - mu_ij) / (r + mu_ij)
    d_r = r * (
        -2.0 * self.init_regularization_factor * r +
        (tf.log(r) - tf.digamma(r - 1)) + tf.reduce_mean(
            tf.digamma(xij + r - 1) - common_der - tf.log(r + mu_ij)))
    # MU PARTS
    # VIS PARAMS
    # %%
    #tf.maximum(w_f_sum,
    if self.equal_v_params:
        # Tied case: one derivative pair used for both sides.
        # NOTE(review): the tf.maximum(...) division binds only to the tsj
        # term; if the intent was ((A + B) / max(sum, 1)) the parentheses are
        # missing — confirm against the hand derivation before changing.
        b1_v_params_der = tf.stack([
            2.0 * tf.sqrt(b1_v_params[0]) * r * tf_weighted_average(
                common_der * ((tsi_share * tsi * tf.log(si)) +
                              (tsj_share * tsj * tf.log(sj)) / tf.maximum(
                                  (tsi_share + tsj_share),
                                  tf.constant(1.0, dtype=tf.float64))),
                tsi_share + tsj_share),
            2.0 * tf.sqrt(b1_v_params[1]) * r *
            tf_weighted_average(common_der, bv_i_share + bv_j_share)
        ])
        b2_v_params_der = b1_v_params_der
    else:
        # Untied case: independent derivatives per side.
        b1_v_params_der = tf.stack([
            2.0 * tf.sqrt(b1_v_params[0]) * r *
            tf_weighted_average(common_der * (tsi * tf.log(si)), tsi_share),
            2.0 * tf.sqrt(b1_v_params[1]) * r *
            tf_weighted_average(common_der, bv_i_share)
        ])
        b2_v_params_der = tf.stack([
            2.0 * tf.sqrt(b2_v_params[0]) * r *
            tf_weighted_average(common_der * (tsj * tf.log(sj)), tsj_share),
            2.0 * tf.sqrt(b2_v_params[1]) * r *
            tf_weighted_average(common_der, bv_j_share)
        ])
    #b1_v_params_der = tf.Print(b1_v_params_der, [b1_v_params_der, d_mu_ij_mult_v_mu_ij, sh_v_mu_ij, (sh_b_vi + sh_b_vj)], '>>>>', summarize=30)
    # FUNCTION PARAMS
    f_var_common_der = common_der * f_var_ij / f_d_ij
    # checking nans
    f_var_common_der = check_nan(f_var_common_der, 'f_var_common_der')
    ln_dij = check_nan(ln_dij, 'ln_dij')
    var_f_share = check_nan(var_f_share, 'var_f_share')
    var_f_share = check_inf(var_f_share, 'var_f_share')
    f_params = check_nan(dist_params, 'dist_params')
    f_drev = tf.stack([
        f_params[0] * r * tf_weighted_average(
            f_var_common_der * tf.pow(ln_dij, 3), var_f_share),
        r * tf_weighted_average(f_var_common_der * tf.pow(ln_dij, 2), var_f_share),
        r * tf_weighted_average(f_var_common_der * ln_dij, var_f_share),
        r * tf_weighted_average(f_var_common_der, var_f_share)
    ])
    f_drev = check_nan(f_drev, 'f_drev')
    free_f_drev = r * tf_weighted_average(
        (common_der / f_d_ij), free_f_share)
    # %%
    # Gradient vector, ordered: [r, 4 curve params, free level,
    # 2 side-1 vis params, 2 side-2 vis params] — matches the delta_vals
    # slicing below.
    non_zero_g_t = tf.concat([
        tf.expand_dims(d_r, 0), f_drev,
        tf.expand_dims(free_f_drev, 0), b1_v_params_der, b2_v_params_der
    ], axis=0)
    # ****************************************
    g_t = non_zero_g_t
    g_t = check_nan(g_t, 'g_t')
    # updating values: Adam-style bias-corrected first/second moments.
    b1 = tf.constant(self.beta1, dtype=tf.float64)
    b2 = tf.constant(self.beta2, dtype=tf.float64)
    r_t = tf.cast(step + 1, tf.float64)
    r_m = b1 * m_t + (1 - b1) * g_t
    r_v = b2 * v_t + (1 - b2) * tf.pow(g_t, 2)
    a_t = alpha * (tf.sqrt(1 - tf.pow(b2, r_t))) / (1 - tf.pow(b1, r_t))
    delta_vals = a_t * r_m / (tf.sqrt(r_v) +
                              self.eps)
    delta_vals = check_nan(delta_vals, 'delta_vals')
    # r updated in log space so it stays positive.
    r_p = tf.log(r) + delta_vals[0]
    new_r = tf.exp(r_p)
    # dist_params[0] is kept negative: update log(-dist_params[0]), negate back.
    a3_p = tf.log(-1.0 * dist_params[0]) + delta_vals[1]
    new_dist_params = tf.stack([
        -1.0 * tf.exp(a3_p), dist_params[1] + delta_vals[2],
        dist_params[2] + delta_vals[3], dist_params[3] + delta_vals[4],
        dist_params[4] + delta_vals[5]
    ])
    # %%
    # Vis params updated in sqrt space so they stay non-negative.
    new_b1_v_params = tf.pow(tf.sqrt(b1_v_params) + delta_vals[6:6 + 2], 2)
    new_b2_v_params = tf.pow(tf.sqrt(b2_v_params) + delta_vals[8:8 + 2], 2)
    # Convergence counter: consecutive rounds with all updates under the limit.
    new_the_number_of_rounds_under_acc_diff = tf.cond(
        tf.less(tf.reduce_max(tf.abs(delta_vals)), self.acc_diff_limit),
        lambda: the_number_of_rounds_under_acc_diff + 1, lambda: tf.constant(0))
    # checking flips: shrink alpha x0.1 when every component's update flipped
    # sign vs the previous step (max elementwise product negative).
    new_alpha = tf.cond(
        tf.less(tf.reduce_max(delta_vals * prev_delta),
                tf.constant(0.0, dtype=tf.float64)), lambda: 0.1 * alpha,
        lambda: alpha)
    new_step = step + tf.constant(1, dtype=tf.int32)
    return new_r, new_dist_params, new_b1_v_params, new_b2_v_params, r_m, r_v, prev_ll, \
        new_the_number_of_rounds_under_acc_diff, delta_vals, \
        stable_r, stable_dist_params, stable_b1_v_params, stable_b2_v_params, \
        stable_m_t, stable_v_t, stable_prev_ll, stable_the_number_of_rounds_under_acc_diff, \
        stable_prev_delta, new_step, new_alpha, max_ll, extra_took_steps
def normal_loop_body(self, r, f_param, vis_transformer_params, m_t, v_t, prev_ll, the_number_of_rounds_under_acc_diff, prev_delta, stable_r, stable_f_param, stable_vis_transformer_params, stable_m_t, stable_v_t, stable_prev_ll, stable_the_number_of_rounds_under_acc_diff, stable_prev_delta, step, alpha, max_ll, extra_took_steps):
    """One Adam-style gradient step over a mini-batch of non-zero interactions
    (simplified variant: a single scalar ``f_param`` instead of a
    distance-dependent parameter vector).

    Acts as a ``tf.while_loop`` body: takes the 20 loop-state tensors and
    returns them updated in the same order; ``stable_*`` tensors plus
    ``prev_ll``, ``max_ll`` and ``extra_took_steps`` pass through unchanged.

    NOTE(review): several methods named ``normal_loop_body`` with different
    signatures appear in this file — presumably different classes; confirm.
    """
    # sub-sampling for mini batch: all interactions, or a random
    # (with-replacement) subset of rows.
    nzero_mb = tf.cond(tf.equal(self.non_zero_batch_size, self.non_zero_samples_num),
                       lambda: self.insig_interactions,
                       lambda: tf.gather(self.insig_interactions,
                                         tf.random_uniform([self.non_zero_batch_size], maxval=self.non_zero_samples_num, dtype=tf.int32)))
    # calculating derivations of nonzero batch
    # columns: [:, 0]/[:, 1] index self.vis; [:, 2] is the observed count.
    si = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 0]))
    sj = tf.gather(self.vis, tf.squeeze(nzero_mb[:, 1]))
    # %%
    # Transform visibilities: power transform, then soft-max against base bv.
    bv = vis_transformer_params[1]
    spower = vis_transformer_params[0]
    tsi = tf.pow(si, spower)
    tsj = tf.pow(sj, spower)
    tvi, tsi_share, bv_i_share = tf_soft_max(tsi, bv, True)
    tvj, tsj_share, bv_j_share = tf_soft_max(tsj, bv, True)
    xij = tf.cast(nzero_mb[:, 2], tf.float64)
    # Expected count; here f_param enters as exp(f_param), i.e. log scale.
    # NOTE(review): the update below uses tf.log(f_param) + delta — i.e. it
    # treats f_param as positive-space, which is inconsistent with the
    # exp(f_param) forward use here; confirm which scale f_param lives on.
    mu_ij = tvi * tvj * tf.exp(f_param)
    mu_ij = check_nan(mu_ij, 'mu ij')
    xij = check_nan(xij, 'x ij')
    # Shared derivative factor; digamma/log terms resemble a negative-binomial
    # log-likelihood derivative in r — confirm against the model derivation.
    common_der = (xij - mu_ij) / (r + mu_ij)
    r_der = r * (-2.0 * self.init_regularization_factor * r +
                 (tf.log(r) - tf.digamma(r - 1)) +
                 tf.reduce_mean(tf.digamma(xij + r - 1) - common_der - tf.log(r + mu_ij)))
    # %%
    # NOTE(review): the division by (tsi_share + tsj_share) binds only to the
    # tsj term; if the intent was ((A + B) / (sum)) the parentheses are
    # missing — confirm against the hand derivation before changing.
    vis_transformer_params_der = tf.stack([
        2.0 * tf.sqrt(spower) * r * tf_weighted_average(
            common_der * ((tsi_share * tsi * tf.log(si)) +
                          (tsj_share * tsj * tf.log(sj)) / (tsi_share + tsj_share)),
            tsi_share + tsj_share
        ),
        2.0 * tf.sqrt(bv) * r * tf_weighted_average(
            common_der, bv_i_share + bv_j_share
        )
    ])
    common_der = check_nan(common_der, 'common der')
    r = check_nan(r, 'r via update')
    # separating samples that have higher f_func (close distance) than free f (far distance)
    free_f_drev = r * tf.reduce_mean(common_der)
    # %%
    # Gradient vector, ordered: [r, f_param, 2 vis params] — matches the
    # delta_vals indexing below.
    non_zero_g_t = tf.stack([r_der, free_f_drev, vis_transformer_params_der[0], vis_transformer_params_der[1]])
    # ****************************************
    g_t = non_zero_g_t
    # updating values: Adam-style bias-corrected first/second moments.
    b1 = tf.constant(self.beta1, dtype=tf.float64)
    b2 = tf.constant(self.beta2, dtype=tf.float64)
    r_t = tf.cast(step + 1, tf.float64)
    r_m = b1 * m_t + (1 - b1) * g_t
    r_v = b2 * v_t + (1 - b2) * tf.pow(g_t, 2)
    a_t = alpha * (tf.sqrt(1 - tf.pow(b2, r_t))) / (1 - tf.pow(b1, r_t))
    delta_vals = a_t * r_m / (tf.sqrt(r_v) + self.eps)
    # r and f_param updated in log space so they stay positive; vis params in
    # sqrt space so they stay non-negative.
    r_p = tf.log(r) + delta_vals[0]
    new_r = tf.exp(r_p)
    new_f_param = tf.exp(tf.log(f_param) + delta_vals[1])
    new_vis_transformer_params = tf.pow(tf.sqrt(vis_transformer_params) + delta_vals[2:2 + self.n_vis_t_params], 2)
    # Convergence counter: consecutive rounds with all updates under the limit.
    new_the_number_of_rounds_under_acc_diff = tf.cond(
        tf.less(tf.reduce_max(tf.abs(delta_vals)), self.acc_diff_limit),
        lambda: the_number_of_rounds_under_acc_diff + 1,
        lambda: tf.constant(0))
    # checking flips: shrink alpha x0.1 when every component's update flipped
    # sign vs the previous step (max elementwise product negative).
    new_alpha = tf.cond(
        tf.less(tf.reduce_max(delta_vals * prev_delta), tf.constant(0.0, dtype=tf.float64)),
        lambda: 0.1 * alpha,
        lambda: alpha
    )
    new_step = step + 1
    # reshaping: re-pin static shapes (lost via tf.cond) for the while_loop
    # shape invariants.
    new_r.set_shape(r.get_shape())
    new_f_param.set_shape(f_param.get_shape())
    new_vis_transformer_params.set_shape(vis_transformer_params.get_shape())
    r_m.set_shape(m_t.get_shape())
    r_v.set_shape(v_t.get_shape())
    new_step.set_shape(step.get_shape())
    new_the_number_of_rounds_under_acc_diff.set_shape(the_number_of_rounds_under_acc_diff.get_shape())
    delta_vals.set_shape(prev_delta.get_shape())
    return new_r, new_f_param, new_vis_transformer_params, r_m, r_v, prev_ll, new_the_number_of_rounds_under_acc_diff, delta_vals,\
        stable_r, stable_f_param, stable_vis_transformer_params, stable_m_t, stable_v_t, stable_prev_ll,\
        stable_the_number_of_rounds_under_acc_diff, stable_prev_delta, new_step, new_alpha, max_ll, extra_took_steps