def _e_step(self, users, items, user_feat, target, pie, rate):
    """E-step of the EM algorithm: estimates the response values w_ijt.

    Points with a non-zero target keep a response of 1. Only the points
    whose target is zero are re-estimated, from the mixing weight and the
    Poisson probability of the observed target under the current rate.

     Args
    ------
        1. users:       <(D, ) int>     user ids (not used here)
        2. items:       <(D, ) int>     item ids (not used here)
        3. user_feat:   <(D, f) float>  user features values (only its row count is used)
        4. target:      <(D, ) int>     target rates
        5. pie:         <(D, ) float>   estimated mixing weights
        6. rate:        <(D, ) float>   estimated rate parameter

     Returns
    ---------
        1. w_ijt:   <(D, ) float>   estimated response values.
    """
    point = tm.get_point('_e_step')

    zero_idx = np.where(target == 0)[0]
    pie_at_zero = pie[zero_idx]

    # Probability of each observation under the rate (Poisson) process.
    rate_prob = np.exp(objectives.pois_log_prob(target, rate))
    weighted_rate_prob = pie_at_zero * rate_prob[zero_idx]

    # Everything starts at 1; only the zero-target entries get overwritten.
    w_ijt = np.ones(user_feat.shape[0])
    w_ijt[zero_idx] = weighted_rate_prob / (weighted_rate_prob + 1 - pie_at_zero)

    point.collect()
    return w_ijt
def eta_likelihood(self, users, items, user_feat, w_ijt):
    """Computes the mean logistic log-likelihood conditioned on eta.

    The sigmoid values come from ``self.sigmoid_func``. Values that are
    exactly 0 or exactly 1 are nudged inside the open interval (0, 1) so
    that both ``log(sig)`` and ``log(1 - sig)`` stay finite.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values
        4. w_ijt:       <(D, ) float>   target response values

     Returns
    ---------
        1. ll:  <float> mean log-likelihood over all the points
    """
    point = tm.get_point('eta_logistic_likelihood')
    sig = self.sigmoid_func(users, items, user_feat)

    # For robustness, making sure no value is exactly 1 or exactly 0.
    # Subtracting 1E-24 from 1.0 is a no-op in floating point (the spacing of
    # floats just below 1 is ~1.1E-16 for float64), which left log(1 - sig)
    # at -inf. Use the largest representable value below 1 instead.
    # NOTE(review): assumes sig is a floating-point ndarray -- confirm
    # against sigmoid_func.
    below_one = np.nextafter(sig.dtype.type(1), sig.dtype.type(0))
    sig[sig == 1] = below_one
    sig[sig == 0] = 1E-24

    ll = np.mean(w_ijt * np.log(sig) + (1 - w_ijt) * (np.log(1 - sig)))
    point.collect()
    return ll
def get_est_lambda(self, users, items, features):
    """Estimates the lambda parameters from the current beta values.

    For every (user i, item j) pair given by the users and items vectors,
    the linear term is computed by ``gd_commons.mul_feat_coeff`` from the
    corresponding features and the current beta values, then exponentiated.

     Args
    ------
        1. users:       <(N, ) ndarray of type int>     user ids
        2. items:       <(N, ) ndarray of type int>     item ids
        3. features:    <(N, d) ndarray of type float>  features values.

     Returns
    ---------
        1. est_lamb:    <(N, ) ndarray of type float>   estimated lambdas.

     Raises
    --------
        1. AssertionError if users, items and features differ in their number of rows.
    """
    num_rows = users.shape[0]
    if items.shape[0] != num_rows or features.shape[0] != num_rows:
        raise AssertionError(
            'Numbers of users, items and features have to be the same.')

    point = tm.get_point('get_est_lambda')
    est_lamb = np.exp(
        gd_commons.mul_feat_coeff(users, items, features,
                                  self.beta_0, self.beta_u, self.beta_i,
                                  num_proc=self.num_proc))
    point.collect()
    return est_lamb
def _pois_reg_data_log_like(self, target, users, items, user_feat, weights=None):
    """Computes the average (optionally weighted) data log likelihood.

     Args
    ------
        1. target:      <(D, ) int>     target rates
        2. users:       <(D, ) int>     user ids
        3. items:       <(D, ) int>     item ids
        4. user_feat:   <(D, f) float>  data-driven (non-intercept) and user const features
        5. weights:     <(D, ) float>   points weights for the weighted regression case

     Returns
    ---------
        1. <float>  average data log likelihood.
    """
    point = tm.get_point('pois_reg_data_log_like')

    lambdas = self.get_est_lambda(users, items, user_feat)
    point_ll = objectives.pois_log_prob(target, lambdas)

    # Weighted regression: scale every point's log likelihood in place.
    if weights is not None:
        point_ll *= weights

    point.collect()
    return np.mean(point_ll)
def grad_for_user(users, d_pois_reg_user, d_user_prior):
    """Computes the gradient of the user coefficients (including the user intercept).

    The actual work is delegated to ``fm.grad_for_user``, which receives
    pre-allocated buffers for the per-user counts and for the gradient.

     Args
    ------
        1. users:           <(D, ) int>     user ids
        2. d_pois_reg_user: <(D, f) float>  derivative of user features
        3. d_user_prior:    <(N, f) float>  derivative of the user coefficient prior

     Returns
    ---------
        1. grad:    <(N, f) float>  gradient for each user
    """
    # The buffers are allocated here rather than inside the cython code,
    # which avoids malloc/free over there.
    counts_buf = np.zeros(d_user_prior.shape[0])
    grad_buf = np.zeros(d_user_prior.shape)

    point = tm.get_point('grad_for_user')
    fm.grad_for_user(users, d_pois_reg_user, d_user_prior, counts_buf, grad_buf)
    point.collect()

    return grad_buf
def grad_for_item(items, d_pois_reg_item, d_item_prior):
    """Computes the gradient for the item intercept.

    The actual work is delegated to ``fm.grad_for_item``, which receives
    pre-allocated buffers for the per-item counts and for the gradient.
    Both buffers have one entry per row of d_item_prior.

     Args
    ------
        1. items:           <(D, ) int>     item ids
        2. d_pois_reg_item: <(D, ) float>   derivative of item intercept
        3. d_item_prior:    <(M, ) float>   derivative of the item coefficient prior

     Returns
    ---------
        1. grad:    <(M, ) float>   gradient for each item
    """
    # The buffers are allocated here rather than inside the cython code,
    # which avoids malloc/free over there.
    num_items = d_item_prior.shape[0]
    counts_buf = np.zeros(num_items)
    grad_buf = np.zeros(num_items)

    point = tm.get_point('grad_for_item')
    fm.grad_for_item(items, d_pois_reg_item, d_item_prior, counts_buf, grad_buf)
    point.collect()

    return grad_buf
def _mle(self, target, users, items, features, weights=None):
    """Computes the average (optionally weighted) Poisson log likelihood.

    Each point contributes target * log(lambda) - lambda - log(target!),
    where lambda comes from ``self.get_est_lambda``. Each stage is timed
    under its own time point.

     Args
    ------
        1. target:      target rates
        2. users:       user ids
        3. items:       item ids
        4. features:    features values
        5. weights:     optional per-point weights multiplied into the log likelihood

     Returns
    ---------
        1. <float>  mean of the per-point log likelihood values.
    """
    timer = tm.get_point('beta_mle_est_lambda')
    lam = self.get_est_lambda(users, items, features)
    timer.collect()

    timer = tm.get_point('beta_mle_log_factorial')
    log_fact = helpers.log_factorial(target)
    timer.collect()

    timer = tm.get_point('beta_mle_curr_ll_numpy')
    point_ll = target * np.log(lam) - lam - log_fact
    if weights is not None:
        # Scaling in place, as part of the timed numpy section.
        point_ll *= weights
    timer.collect()

    return np.mean(point_ll)
def _beta_derivative_vals(self, users, items, user_feat, target):
    """Computes the per-point derivative values for every element of the feature table.

    This is not where the gradients are computed. Only the element-wise
    derivative of the Poisson regression is produced here, including the
    two intercept columns, together with the derivative of the priors.
    The separation exists because of the fixed-effects regression (one
    effect for the population, one per individual and one per item): it is
    easier to derive every point with matrix operations first and to
    aggregate the different gradients afterwards.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values.
        4. target:      <(D, ) int>     target rates

     Returns
    ---------
        1. d_pois_reg:  <(D, f + 2) float>  derivative of ALL features
        2. d_0_prior:   <float>             derivative of the global intercept prior
        3. d_i_prior:   <(M, ) float>       derivative of the item intercept prior
        4. d_u_prior:   <(N, f) float>      derivative of the user beta (including intercept)
    """
    point = tm.get_point('pois_regression_deriv')

    # The linear term; its exponent is needed by the derivative.
    linear_term = gd_commons.mul_feat_coeff(users, items, user_feat,
                                            self.beta_0, self.beta_u, self.beta_i)

    # Two leading columns of ones stand in for the two 'non-user' intercept
    # features, so everything can be done with matrix operations.
    intercept_cols = np.ones([user_feat.shape[0], 2])
    full_feat = np.hstack([intercept_cols, user_feat])

    # Column vectors that broadcast across the feature columns.
    rate_col = np.atleast_2d(np.exp(linear_term)).T
    target_col = np.atleast_2d(target).T

    # The Poisson regression derivative: features * target - features * rate.
    from_rate = full_feat * rate_col
    from_target = full_feat * target_col
    d_pois_reg = from_target - from_rate

    # The derivative of the priors.
    lamb = self.gd_lamb
    d_0_prior = lamb * (self.beta_0 - self.beta_0_prior)
    d_i_prior = lamb * (self.beta_i - self.beta_i_prior)
    d_u_prior = lamb * (self.beta_u - self.beta_u_prior)

    point.collect()
    return d_pois_reg, d_0_prior, d_i_prior, d_u_prior
def fast_sample(num_points, batch_size):
    """Draws a sample of batch_size indexes out of num_points.

    The draw itself is done by ``sampler.get_sample`` and is timed under a
    time point whose name carries both sizes.

     Args
    ------
        1. num_points:  <int>   number of points to choose from.
        2. batch_size:  <int>   number of points to sample.

     Returns
    ---------
        1. samp:    <(batch_size, ) int>    indexes of selected points
    """
    point_name = 'fast_sample_%d_%d' % (num_points, batch_size)
    point = tm.get_point(point_name)
    chosen = sampler.get_sample(num_points, batch_size)
    point.collect()
    return chosen
def get_est_lambda(self, users, items, user_feat):
    """Estimates the lambda parameters from the current beta values.

    For every (user i, item j) pair given by the users and items vectors,
    the linear term is computed by ``gd_commons.mul_feat_coeff`` from the
    corresponding features and the current beta values, then exponentiated.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values

     Returns
    ---------
        1. est_lamb:    <(D, ) ndarray of type float>   estimated lambdas.
    """
    point = tm.get_point('get_est_lambda')
    est_lamb = np.exp(
        gd_commons.mul_feat_coeff(users, items, user_feat,
                                  self.beta_0, self.beta_u, self.beta_i))
    point.collect()
    return est_lamb
def learn_eta(self, users, items, user_feat, w_ijt):
    """Learns the eta coefficients with mini-batch gradient steps using ADAM.

    Every iteration samples a mini-batch, computes the derivative values
    with ``self._eta_derivative_vals`` and updates ``self.eta_0`` and
    ``self.eta_u``. Every ``self.gd_ll_iters`` iterations (once at least
    ``self.min_gd_iter`` iterations have run) the likelihood over all the
    data is evaluated. Learning stops when it changes by at most
    ``self.gd_tol`` or when it has gone down ``self.gd_num_dec`` times.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values
        4. w_ijt:       <(D, ) float>   target response values
    """
    self._initialize_eta(user_feat.shape[1])

    # ADAM initial values. These are created ONCE, before the loop, so the
    # same state dicts are handed to get_adam_update on every iteration.
    # Re-creating them inside the loop reset the moment estimates and the
    # step counter each iteration.
    # NOTE(review): assumes get_adam_update updates the dict it is given in
    # place, as the '_learn_beta' usage suggests -- confirm.
    adam_vals_u = {
        'mean': np.zeros(self.eta_u.shape),
        'var': np.zeros(self.eta_u.shape),
        't': 0
    }
    adam_vals_0 = {'mean': 0, 'var': 0, 't': 0}

    # Number of times the likelihood went down. Used to prevent overfitting and parameter explosion.
    num_down = 0
    prev_ll = curr_ll = -np.inf
    reached_conv = False

    for i in range(1, self.gd_max_iter + 1):
        # Sampling a mini-batch
        samp = gd_commons.fast_sample(user_feat.shape[0], self.gd_batch_size)

        # Taking this time point after the sample.
        eta_sgd_point = tm.get_point('eta_sgd_iter')

        d_features, d_0_prior, d_u_prior = self._eta_derivative_vals(
            users[samp], items[samp], user_feat[samp], w_ijt[samp])

        # Column 0 holds the global intercept, the rest are the user columns.
        g_grad = gd_commons.grad_for_global(d_features[:, 0], d_0_prior)
        u_grad = gd_commons.grad_for_user(users[samp], d_features[:, 1:], d_u_prior)

        # These operations are safe because if the user or item were not in the sample the grad for them will be
        # zero.
        self.eta_0 += gd_commons.get_adam_update(self.gd_step_size, g_grad, adam_vals_0)
        self.eta_u += gd_commons.get_adam_update(self.gd_step_size, u_grad, adam_vals_u)
        eta_sgd_point.collect()

        # Checking for convergence - using only the data likelihood.
        if i >= self.min_gd_iter and i % self.gd_ll_iters == 0:
            curr_ll = self.eta_likelihood(users, items, user_feat, w_ijt)

            if curr_ll < prev_ll:
                num_down += 1

            log.info(
                'ZipRegression.learn_eta: Data log like after %d iterations [%.5f --> %.5f]'
                % (i, prev_ll, curr_ll))

            if np.abs(curr_ll - prev_ll) <= self.gd_tol or num_down >= self.gd_num_dec:
                log.info(
                    'ZipRegression.learn_eta: Reached convergance after %d iterations'
                    % i)
                reached_conv = True
                break

            prev_ll = curr_ll

    if not reached_conv:
        log.info(
            'ZipRegression.learn_eta: Did not reach convergance after %d iterations'
            % self.gd_max_iter)

    log.info('ZipRegression.learn_eta: Train data log like %.3f' % curr_ll)
def _learn_beta(self, users, items, user_feat, target, weights=None):
    """Learns all the beta coefficients with mini-batch gradient steps using ADAM.

    Every iteration samples a mini-batch, derives the per-point values,
    aggregates them into the global / item / user gradients and applies
    the ADAM updates. Every ``self.gd_ll_iters`` iterations (after more
    than ``self.min_gd_iter`` iterations) the data log likelihood is
    evaluated. Learning stops when it changes by at most ``self.gd_tol``
    or when it has gone down ``self.gd_num_dec`` times.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values
        4. target:      <(D, ) int>     target rates
        5. weights:     <(D, ) float>   points weights for the weighted regression case

     Raises
    --------
        1. ValueError if the data log likelihood becomes nan or infinite
           (coefficients went out of hand).
    """
    self._initialize_beta(user_feat.shape[1])

    # ADAM state: one per coefficient group, shared across all iterations.
    adam_vals_u = dict(mean=np.zeros(self.beta_u.shape),
                       var=np.zeros(self.beta_u.shape),
                       t=0)
    adam_vals_i = dict(mean=np.zeros(self.beta_i.shape),
                       var=np.zeros(self.beta_i.shape),
                       t=0)
    adam_vals_0 = dict(mean=0, var=0, t=0)

    # How many times the likelihood decreased; guards against overfitting
    # and parameter explosion.
    num_down = 0
    prev_ll = curr_ll = -np.inf
    num_points = user_feat.shape[0]

    # Main loop. The 'else' clause runs only when the loop was never broken
    # out of, i.e. when convergence was not reached.
    for i in range(1, self.gd_num_iter + 1):
        batch = gd_commons.fast_sample(num_points, self.gd_batch_size)

        # The time point is taken after the sampling on purpose.
        point = tm.get_point('pois_reg_sgd_iter')

        # Per-point derivative values first; the gradients come next.
        d_pois_reg, d_0_prior, d_i_prior, d_u_prior = self._beta_derivative_vals(
            users[batch], items[batch], user_feat[batch], target[batch])

        if weights is not None:
            # Weighted regression: each point's derivative is scaled by its weight.
            d_pois_reg *= np.atleast_2d(weights[batch]).T

        # Column 0 is the global intercept, column 1 the item intercept and
        # the remaining columns belong to the user coefficients.
        g_grad = gd_commons.grad_for_global(d_pois_reg[:, 0], d_0_prior)
        i_grad = gd_commons.grad_for_item(items[batch], d_pois_reg[:, 1], d_i_prior)
        u_grad = gd_commons.grad_for_user(users[batch], d_pois_reg[:, 2:], d_u_prior)

        # Safe for users / items outside the sample: their gradient is zero.
        self.beta_0 += gd_commons.get_adam_update(self.gd_step_size, g_grad, adam_vals_0)
        self.beta_i += gd_commons.get_adam_update(self.gd_step_size, i_grad, adam_vals_i)
        self.beta_u += gd_commons.get_adam_update(self.gd_step_size, u_grad, adam_vals_u)
        point.collect()

        # Convergence is only checked on selected iterations.
        if i <= self.min_gd_iter or i % self.gd_ll_iters != 0:
            continue

        # Using only the data likelihood for the convergence check.
        curr_ll = self._pois_reg_data_log_like(target, users, items, user_feat, weights)
        if curr_ll < prev_ll:
            num_down += 1

        if np.isnan(curr_ll) or np.isinf(curr_ll):
            raise ValueError(
                'Pois_Reg: Coefficient values went out of hand -- adjust regularizer value.'
            )

        log.info('Pois_Reg data log like: [%.3f --> %.3f]' % (prev_ll, curr_ll))

        if np.abs(curr_ll - prev_ll) <= self.gd_tol or num_down >= self.gd_num_dec:
            log.info('Pois_Reg: Reached convergance after %d iterations' % i)
            break

        prev_ll = curr_ll
    else:
        log.error(
            'Pois_Reg: Did not reach convergence after %d iterations' %
            self.gd_num_iter)

    log.info('Pois_Reg: Train log like %.3f' % curr_ll)
def _learn_beta(self, users, items, features, target, weights=None):
    """Learns all the beta coefficients with mini-batch gradient steps.

    Coefficients that are still None are drawn from a normal distribution
    (mean 0, std 0.1). Every iteration samples a mini-batch (weighted when
    ``self.gd_weights_sample`` is set), derives the per-point values,
    aggregates the global / item / user gradients and applies either ADAM
    updates (``self.gd_adam``) or plain gradient steps. Every
    ``self.gd_ll_iters`` iterations the log likelihood over all the data is
    evaluated and compared with ``self.prev_ll``; learning stops once the
    change is at most ``self.gd_tol``.

     Args
    ------
        1. users:       user ids
        2. items:       item ids
        3. features:    features values (one row per point)
        4. target:      target rates
        5. weights:     optional per-point weights

     Raises
    --------
        1. ValueError if the log likelihood becomes nan or infinite.
    """
    # If any of the parameters wasn't initialized
    if self.beta_u is None:
        self.beta_u = np.random.normal(0, 0.1, [self.N, features.shape[1]])
    if self.beta_i is None:
        self.beta_i = np.random.normal(0, 0.1, self.M)
    if self.beta_0 is None:
        self.beta_0 = np.random.normal(0, 0.1, 1)[0]

    if self.gd_adam:
        # ADAM state: one per coefficient group, shared across all iterations.
        adam_vals_u = {
            'mean': np.zeros(self.beta_u.shape),
            'var': np.zeros(self.beta_u.shape),
            't': 0
        }
        adam_vals_i = {
            'mean': np.zeros(self.beta_i.shape),
            'var': np.zeros(self.beta_i.shape),
            't': 0
        }
        adam_vals_0 = {'mean': 0, 'var': 0, 't': 0}

    # Defined up front so the final log line works even when the likelihood
    # was never evaluated (e.g. gd_num_iter < gd_ll_iters); previously that
    # case raised an UnboundLocalError.
    curr_ll = -np.inf
    reached_conv = False

    for i in range(1, self.gd_num_iter + 1):
        beta_iter_point = tm.get_point('beta_sgd_iter')

        point = tm.get_point('beta_sgd_samp')
        if self.gd_weights_sample:
            samp = gd_commons.fast_sample_with_weights(weights)
        else:
            samp = gd_commons.fast_sample(features.shape[0], self.gd_batch_size)
        point.collect()

        point = tm.get_point('beta_derivative_vals')
        d_mle, d_g_prior, d_i_prior, d_u_prior = \
            self._beta_derivative_vals(users[samp], items[samp], features[samp], target[samp])
        point.collect()

        # TODO: Discuss the most proper way to combine the weights and the prior/regularization with Padhraic
        if weights is not None and not self.gd_weights_sample:
            # If it's weight sample no need to modify the mle with the weights
            d_mle *= np.atleast_2d(weights[samp]).T

        # Updating the gradient. Column 0 is the global intercept, column 1
        # the item intercept and the rest belong to the user coefficients.
        g_grad = gd_commons.grad_for_global(d_mle[:, 0], d_g_prior)
        i_grad = gd_commons.grad_for_item(items[samp], d_mle[:, 1], d_i_prior)
        u_grad = gd_commons.grad_for_user(users[samp], d_mle[:, 2:], d_u_prior)

        # Step size, divided by the decay counter when decay is enabled.
        a = self.gd_step_size / self.decay if self.gd_decay else self.gd_step_size

        # These operations are safe because if the user or item were not in the sample the grad for them will be
        # zero.
        point = tm.get_point('beta_grad_updates')
        if self.gd_adam:
            self.beta_0 += gd_commons.get_AdaM_update(a, g_grad, adam_vals_0)
            self.beta_i += gd_commons.get_AdaM_update(a, i_grad, adam_vals_i)
            self.beta_u += gd_commons.get_AdaM_update(a, u_grad, adam_vals_u)
        else:
            self.beta_0 += g_grad * a
            self.beta_i += i_grad * a
            self.beta_u += u_grad * a
        point.collect()
        beta_iter_point.collect()

        if i % self.gd_ll_iters == 0:
            point = tm.get_point('beta_mle')
            curr_ll = self._mle(target, users, items, features, weights)
            point.collect()

            if np.isnan(curr_ll) or np.isinf(curr_ll):
                raise ValueError(
                    'Coefficient values went out of hand -- adjust lambda and/or step size'
                )

            log.info('BETA GD MLE: [%.3f --> %.3f]' % (self.prev_ll, curr_ll))

            if np.abs(curr_ll - self.prev_ll) <= self.gd_tol:
                log.info('BETA GD: Reached convergance after %d iterations' % i)
                reached_conv = True
                self.prev_ll = curr_ll
                break
            else:
                self.prev_ll = curr_ll

        # NOTE(review): the nesting of this statement could not be recovered
        # from the source layout; it is assumed to advance the decay counter
        # once per iteration -- confirm.
        self.decay += 1

    if not reached_conv:
        log.error(
            'BETA GD: Did not reach convergance after %d iterations' %
            self.gd_num_iter)

    log.info('BETA GD: Train log like %.3f' % curr_ll)