def load_model(path):
    """Restores a trained ZipRegression model from the .npy files saved under a directory.

     Args
    ------
        1. path:    <string>    directory that holds the saved 'zip_*.npy' files

     Returns
    ---------
        1. <ZipRegression>  model built from the loaded coefficients, flagged as trained
    """
    model_name = 'zip'
    log.info('ZipRegression.load_model: Loading model %s from path %s' %
             (model_name, path))

    def _load(name_template):
        # All the files share the model name as a prefix.
        return flu.np_load(join(path, name_template % model_name))

    # The two global intercepts are reduced to their first element, whatever shape they were saved in.
    eta_0 = np.atleast_1d(_load('%s_eta_0.npy'))[0]
    eta_u = _load('%s_eta_u.npy')
    beta_0 = np.atleast_1d(_load('%s_beta_0.npy'))[0]
    beta_i = _load('%s_beta_i.npy')
    beta_u = _load('%s_beta_u.npy')

    # N and M are taken from the first dimension of the user and item coefficients.
    model = ZipRegression(N=beta_u.shape[0], M=beta_i.shape[0],
                          eta_0=eta_0, eta_u=eta_u,
                          beta_0=beta_0, beta_i=beta_i, beta_u=beta_u)
    model.trained_users = _load('%s_trained_users.npy')
    model.trained = True
    return model
def test_log_prob(self, users, items, data_feat, target, return_vals=False): """Evaluates Log-Likelihood accuracy on test data. Args ------ 1. users: <(D, ) int> user ids 2. items: <(D, ) int> item ids 3. data_feat: <(D, f) float> data-driven (non-intercept) features 4. target: <(D, ) int> target rates 5. return_vals: <bool> if True returns all the values instead of the mean (default = False) Returns --------- 1. <float> average log-likelihood (if return_vals is False) 2 <(D, ) float> log likelihood for each point (if return_vals is True) 3. -np.inf if model is not trained. """ if not self.trained: return -np.inf # At optimization time - it is very likely that some users don't have train data (because they're not active # yet). This makes sure that I'm not testing on them. test_users = np.unique(users) trained_user_mask = np.where( np.in1d(users, self.trained_users, assume_unique=False)) log.info('Trained on %d out of %d test users' % (self.trained_users.shape[0], test_users.shape[0])) user_feat = np.hstack([np.ones([data_feat.shape[0], 1]), data_feat]) lambda_est = self.pos_model.get_est_lambda( users[trained_user_mask], items[trained_user_mask], user_feat[trained_user_mask]) pis = self.sigmoid_func(users[trained_user_mask], items[trained_user_mask], user_feat[trained_user_mask]) vals = self.data_log_like(target[trained_user_mask], lambda_est, pis) if return_vals: return vals else: return np.mean(vals)
def load_model(path, num_proc):
    """Builds a PoissonRegression from the 'pos_beta_*.npy' coefficient files saved under a directory.

     Args
    ------
        1. path:        <string>    directory that holds the saved coefficient files
        2. num_proc:    <int>       forwarded as-is to the PoissonRegression constructor

     Returns
    ---------
        1. <PoissonRegression>  model constructed from the loaded coefficients
    """
    log.info('Loading ZIP model from path %s with %d num proc' %
             (path, num_proc))

    def _load(file_name):
        return flu.np_load(join(path, file_name))

    # The global intercept is reduced to its first element, whatever shape it was saved in.
    beta_0 = np.atleast_1d(_load('pos_beta_0.npy'))[0]
    beta_i = _load('pos_beta_i.npy')
    beta_u = _load('pos_beta_u.npy')

    # TODO(MOSHE): Why do I need num_proc here??? And how do I deal with no N and M?
    # TODO(MOSHE): Specifically M because N can be taken from the coefficients
    return PoissonRegression(beta_0=beta_0,
                             beta_i=beta_i,
                             beta_u=beta_u,
                             num_proc=num_proc)
def test_abs_error(self, users, items, data_feat, target): if not self.trained: return np.inf # At optimization time - it is very likely that some users don't have train data (because they're not active # yet). This makes sure that I'm not testing on them. test_users = np.unique(users) trained_user_mask = np.where( np.in1d(users, self.trained_users, assume_unique=False)) log.info('Trained on %d out of %d test users' % (self.trained_users.shape[0], test_users.shape[0])) lambda_est = self.predict(users[trained_user_mask], items[trained_user_mask], data_feat[trained_user_mask]) return np.mean(np.abs(lambda_est - target[trained_user_mask]))
def np_save(path, file_name, data):
    """
    Wrapper for np.save that also creates the dir if doesn't exist

     INPUT:
    -------
        1. path:        <string>    dir path
        2. file_name:   <string>    file name ('.npy' is appended when missing, as np.save does)
        3. data:        <ndarray>   numpy array
    """
    log.info('Saving file %s/%s' % (path, file_name))
    make_dir(path)

    # np.save silently appends '.npy' to names that lack it. Resolving the final name up front
    # guarantees that chmod is applied to the file that was actually written.
    file_path = join(path, file_name)
    if not file_path.endswith('.npy'):
        file_path += '.npy'

    start = time.time()
    np.save(file_path, data)
    # rwx for owner and group, nothing for others. The 0o prefix is valid on both Python 2.6+ and 3.
    os.chmod(file_path, 0o770)
    log.info('Saving took %d seconds' % (time.time() - start))
def np_load(file_path):
    """
    Wrapper for np.load that also logs how long the loading took.

     INPUT:
    -------
        1. file_path:   <string>    file path

     OUTPUT:
    --------
        1. data:    <?>     whatever np.load returns for the file

     RAISE:
    -------
        1. IOError (propagated from np.load)
    """
    log.info('Loading %s' % file_path)

    began = time.time()
    loaded = np.load(file_path)
    elapsed = time.time() - began

    log.info('Loading took %d seconds' % elapsed)
    return loaded
def _em(self, users, items, data_feat, target):
    """Runs the EM algorithm to learn both eta and beta.

    After one initial E/M round, E and M steps are alternated for up to self.em_num_iter iterations.
    Once past self.min_em_iter, every self.em_ll_iters-th iteration the mean data log-likelihood is
    computed, and the loop stops when it changed by less than self.em_tol since the previous check.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. data_feat:   <(D, f) float>  data-driven (non-intercept) features
        4. target:      <(D, ) int>     target rates
    """
    prev_ll = curr_ll = -np.inf
    reached_conv = False

    # Adding the user intercept constant. In my code, the exposure process has different constants than the
    # rate process, so I only pass the data_feat to the methods and deal with the constants separately.
    # The reason I keep the user const in the user_feat is to avoid starting the counts from 1 in the cython code.
    user_feat = np.hstack([np.ones([data_feat.shape[0], 1]), data_feat])

    # Initializing eta and beta
    self._initialize_eta(user_feat.shape[1])
    self.pos_model._initialize_beta(user_feat.shape[1])

    pie = self.sigmoid_func(users, items, user_feat)
    rate = self.pos_model.get_est_lambda(users, items, user_feat)

    # Starting with an ESTEP after initializing eta and beta.
    w_ijt = self._e_step(users, items, user_feat, target, pie, rate)

    # M STEP
    pie, rate = self._m_step(users, items, user_feat, target, w_ijt, rate)

    # range (not xrange) keeps this runnable on Python 3 and matches the other training loops.
    for em_i in range(self.em_num_iter):
        w_ijt = self._e_step(users, items, user_feat, target, pie, rate)
        pie, rate = self._m_step(users, items, user_feat, target, w_ijt, rate)

        # ZIP probability
        if em_i > self.min_em_iter and em_i % self.em_ll_iters == 0:
            curr_ll = np.mean(self.data_log_like(target, rate, pie))
            log.info(
                'ZipRegression._em: Data LL at iteration %d [%.5f --> %.5f]'
                % (em_i, prev_ll, curr_ll))

            if np.abs(prev_ll - curr_ll) < self.em_tol:
                log.info('ZipRegression._em: Reached convergence')
                reached_conv = True
                break

            prev_ll = curr_ll

    if not reached_conv:
        log.error(
            'ZipRegression._em: Did not reach convergance after %d iterations'
            % self.em_num_iter)

    # Stays at -inf when no likelihood check was performed.
    log.info('ZipRegression._em: Train data log like %.5f' % curr_ll)
def test_f1(self, users, items, data_feat, target, return_vals=False): """Evaluates F1 accuracy on test data. Args ------ 1. users: <(D, ) int> user ids 2. items: <(D, ) int> item ids 3. data_feat: <(D, f) float> data-driven (non-intercept) features 4. target: <(D, ) int> target rates 5. return_vals: <bool> if True returns all the values instead of the mean (default = False) Returns --------- 1. <float> average f1 (if return_vals is False) 2 <(D, ) float> f1 for each point (if return_vals is True) 3. np.inf if model is not trained. """ if not self.trained: return np.inf # At optimization time - it is very likely that some users don't have train data (because they're not active # yet). This makes sure that I'm not testing on them. test_users = np.unique(users) trained_user_mask = np.where( np.in1d(users, self.trained_users, assume_unique=False)) log.info('Trained on %d out of %d test users' % (self.trained_users.shape[0], test_users.shape[0])) zip_exp = self.predict(users[trained_user_mask], items[trained_user_mask], data_feat[trained_user_mask]) vals = objectives.f_measure(target[trained_user_mask], zip_exp) if return_vals: return vals else: return np.mean(vals)
def start_sampling(self, num_points, batch_size):
    """Spawns the sampling processes for the (num_points, batch_size) pair.

    A queue of size self.q_size is created and self.num_proc processes running self._async_sampler
    are started with it.

     Args
    ------
        1. num_points:  <int>   number of elements to choose from
        2. batch_size:  <int>   number of choices
    """
    log.info('AsyncSampler.start_sampling: Starting a sampler for [%d %d]' %
             (num_points, batch_size))

    key = (num_points, batch_size)
    queue = Queue(self.q_size)
    workers = []

    # Keeping references to the queue and the (still empty) worker list, so they can be freed in
    # the "destructor". The list is filled in place below.
    self.samplers[key] = queue
    self.proc_pools[key] = workers

    for _ in range(self.num_proc):
        worker = Process(target=self._async_sampler,
                         args=(queue, num_points, batch_size))
        # Terminating the worker when the interpreter exits.
        atexit.register(worker.terminate)
        workers.append(worker)
        worker.start()
def learn_eta(self, users, items, user_feat, w_ijt):
    """Learns the eta coefficients using mini-batch stochastic gradient steps with ADAM.

    eta is initialized through self._initialize_eta and then self.eta_0 / self.eta_u are updated for up
    to self.gd_max_iter iterations. From iteration self.min_gd_iter on, every self.gd_ll_iters-th
    iteration the likelihood is evaluated on all the data. The loop stops when it changed by at most
    self.gd_tol, or when it went down self.gd_num_dec times.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values
        4. w_ijt:       <(D, ) int>     target response values
    """
    self._initialize_eta(user_feat.shape[1])

    # ADAM initial values. These are created once, before the loop: re-creating them on every iteration
    # would throw away the accumulated moment estimates and the step counter, so each update would be
    # computed from a single mini-batch gradient only.
    adam_vals_u = {
        'mean': np.zeros(self.eta_u.shape),
        'var': np.zeros(self.eta_u.shape),
        't': 0
    }
    adam_vals_0 = {'mean': 0, 'var': 0, 't': 0}

    # Number of times the likelihood went down. Used to prevent overfitting and parameter explosion.
    num_down = 0
    prev_ll = curr_ll = -np.inf
    reached_conv = False

    for i in range(1, self.gd_max_iter + 1):
        # Sampling a mini-batch
        samp = gd_commons.fast_sample(user_feat.shape[0], self.gd_batch_size)
        eta_sgd_point = tm.get_point('eta_sgd_iter')  # Taking this time point after the sample.

        d_features, d_0_prior, d_u_prior = self._eta_derivative_vals(
            users[samp], items[samp], user_feat[samp], w_ijt[samp])

        # Column 0 feeds the global coefficient, the remaining columns feed the per-user ones.
        g_grad = gd_commons.grad_for_global(d_features[:, 0], d_0_prior)
        u_grad = gd_commons.grad_for_user(users[samp], d_features[:, 1:], d_u_prior)

        # These operations are safe because if the user or item were not in the sample the grad for them will be
        # zero.
        self.eta_0 += gd_commons.get_adam_update(self.gd_step_size, g_grad, adam_vals_0)
        self.eta_u += gd_commons.get_adam_update(self.gd_step_size, u_grad, adam_vals_u)
        eta_sgd_point.collect()

        # Checking for convergence - using only the data likelihood.
        if i >= self.min_gd_iter and i % self.gd_ll_iters == 0:
            curr_ll = self.eta_likelihood(users, items, user_feat, w_ijt)

            if curr_ll < prev_ll:
                num_down += 1

            log.info(
                'ZipRegression.learn_eta: Data log like after %d iterations [%.5f --> %.5f]'
                % (i, prev_ll, curr_ll))

            if np.abs(curr_ll - prev_ll) <= self.gd_tol or num_down >= self.gd_num_dec:
                log.info(
                    'ZipRegression.learn_eta: Reached convergance after %d iterations'
                    % i)
                reached_conv = True
                break

            prev_ll = curr_ll

    if not reached_conv:
        log.info(
            'ZipRegression.learn_eta: Did not reach convergance after %d iterations'
            % self.gd_max_iter)

    # Stays at -inf when no likelihood check was performed.
    log.info('ZipRegression.learn_eta: Train data log like %.3f' % curr_ll)
def _learn_beta(self, users, items, user_feat, target, weights=None):
    """Fits all the beta coefficients with mini-batch stochastic gradient steps and ADAM updates.

    The loop runs for up to self.gd_num_iter iterations. After self.min_gd_iter, every
    self.gd_ll_iters-th iteration the data log-likelihood is evaluated on all the points. The loop
    stops when it changed by at most self.gd_tol, or when it went down self.gd_num_dec times.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. user_feat:   <(D, f) float>  user features values
        4. target:      <(D, ) int>     target rates
        5. weights:     <(D, ) float>   points weights for the weighted regression case

     Raises
    --------
        1. ValueError if the data log-likelihood became nan or infinite (coefficients went out of hand).
    """
    self._initialize_beta(user_feat.shape[1])

    def _zero_state(coefs):
        # ADAM accumulators shaped like the coefficients they belong to.
        return {
            'mean': np.zeros(coefs.shape),
            'var': np.zeros(coefs.shape),
            't': 0
        }

    # ADAM initial values
    state_u = _zero_state(self.beta_u)
    state_i = _zero_state(self.beta_i)
    state_0 = {'mean': 0, 'var': 0, 't': 0}

    # Counting how many times the likelihood decreased. Used to prevent overfitting and parameter explosion.
    drops = 0
    prev_ll = curr_ll = -np.inf
    converged = False

    # Gradient descent main loop
    for step in range(1, self.gd_num_iter + 1):
        # Drawing the mini-batch; the timer starts only after the sample was taken.
        batch = gd_commons.fast_sample(user_feat.shape[0], self.gd_batch_size)
        timer = tm.get_point('pois_reg_sgd_iter')

        # Derivative values first, the gradients are assembled afterwards.
        d_pois_reg, d_0_prior, d_i_prior, d_u_prior = self._beta_derivative_vals(
            users[batch], items[batch], user_feat[batch], target[batch])

        if weights is not None:
            # Weighted regression - each point's derivatives are scaled by its weight.
            d_pois_reg *= np.atleast_2d(weights[batch]).T

        # Column 0 -> global coefficient, column 1 -> item coefficients, the rest -> user coefficients.
        grad_0 = gd_commons.grad_for_global(d_pois_reg[:, 0], d_0_prior)
        grad_i = gd_commons.grad_for_item(items[batch], d_pois_reg[:, 1], d_i_prior)
        grad_u = gd_commons.grad_for_user(users[batch], d_pois_reg[:, 2:], d_u_prior)

        # Updating everything is safe: users or items outside of the batch get a zero gradient.
        self.beta_0 += gd_commons.get_adam_update(self.gd_step_size, grad_0, state_0)
        self.beta_i += gd_commons.get_adam_update(self.gd_step_size, grad_i, state_i)
        self.beta_u += gd_commons.get_adam_update(self.gd_step_size, grad_u, state_u)
        timer.collect()

        # Convergence is checked on the data likelihood only, and not on every iteration.
        if not (step > self.min_gd_iter and step % self.gd_ll_iters == 0):
            continue

        curr_ll = self._pois_reg_data_log_like(target, users, items, user_feat, weights)
        if curr_ll < prev_ll:
            drops += 1

        if np.isnan(curr_ll) or np.isinf(curr_ll):
            raise ValueError(
                'Pois_Reg: Coefficient values went out of hand -- adjust regularizer value.'
            )

        log.info('Pois_Reg data log like: [%.3f --> %.3f]' % (prev_ll, curr_ll))

        small_change = np.abs(curr_ll - prev_ll) <= self.gd_tol
        if small_change or drops >= self.gd_num_dec:
            log.info('Pois_Reg: Reached convergance after %d iterations' % step)
            converged = True
            break

        prev_ll = curr_ll

    if not converged:
        log.error('Pois_Reg: Did not reach convergence after %d iterations' %
                  self.gd_num_iter)

    log.info('Pois_Reg: Train log like %.3f' % curr_ll)
def _learn_beta(self, users, items, features, target, weights=None):
    """Learns the beta coefficients using mini-batch stochastic gradient steps.

    Coefficients that are still None are drawn from a normal distribution; existing ones are kept and
    updated further. Depending on the flags on self, the update uses ADAM (self.gd_adam), a step size
    divided by self.decay (self.gd_decay) and a weights-driven sample (self.gd_weights_sample). Every
    self.gd_ll_iters-th iteration the likelihood is evaluated through self._mle, and the loop stops
    when it changed by at most self.gd_tol compared to self.prev_ll.

     Args
    ------
        1. users:       <(D, ) int>     user ids
        2. items:       <(D, ) int>     item ids
        3. features:    <(D, f) float>  features values
        4. target:      <(D, ) int>     target rates
        5. weights:     <(D, ) float>   points weights (used for sampling or for scaling the derivatives)

     Raises
    --------
        1. ValueError if the likelihood became nan or infinite.
    """
    # If any of the parameters wasn't initialized
    if self.beta_u is None:
        self.beta_u = np.random.normal(0, 0.1, [self.N, features.shape[1]])
    if self.beta_i is None:
        self.beta_i = np.random.normal(0, 0.1, self.M)
    if self.beta_0 is None:
        self.beta_0 = np.random.normal(0, 0.1, 1)[0]

    if self.gd_adam:
        adam_vals_u = {
            'mean': np.zeros(self.beta_u.shape),
            'var': np.zeros(self.beta_u.shape),
            't': 0
        }
        adam_vals_i = {
            'mean': np.zeros(self.beta_i.shape),
            'var': np.zeros(self.beta_i.shape),
            't': 0
        }
        adam_vals_0 = {'mean': 0, 'var': 0, 't': 0}

    reached_conv = False
    # Defined up front so the final log line cannot hit an unbound name when the loop ends before
    # the first likelihood check (gd_num_iter < gd_ll_iters).
    curr_ll = -np.inf

    for i in range(1, self.gd_num_iter + 1):
        beta_iter_point = tm.get_point('beta_sgd_iter')

        point = tm.get_point('beta_sgd_samp')
        if self.gd_weights_sample:
            samp = gd_commons.fast_sample_with_weights(weights)
        else:
            samp = gd_commons.fast_sample(features.shape[0], self.gd_batch_size)
        point.collect()

        point = tm.get_point('beta_derivative_vals')
        d_mle, d_g_prior, d_i_prior, d_u_prior = \
            self._beta_derivative_vals(users[samp], items[samp], features[samp], target[samp])
        point.collect()

        # TODO: Discuss the most proper way to combine the weights and the prior/regularization with Padhraic
        if weights is not None and not self.gd_weights_sample:
            # If it's weight sample no need to modify the mle with the weights
            d_mle *= np.atleast_2d(weights[samp]).T

        # Updating the gradient
        g_grad = gd_commons.grad_for_global(d_mle[:, 0], d_g_prior)
        i_grad = gd_commons.grad_for_item(items[samp], d_mle[:, 1], d_i_prior)
        u_grad = gd_commons.grad_for_user(users[samp], d_mle[:, 2:], d_u_prior)

        a = self.gd_step_size / self.decay if self.gd_decay else self.gd_step_size

        # These operations are safe because if the user or item were not in the sample the grad for them will be
        # zero.
        point = tm.get_point('beta_grad_updates')
        if self.gd_adam:
            self.beta_0 += gd_commons.get_AdaM_update(a, g_grad, adam_vals_0)
            self.beta_i += gd_commons.get_AdaM_update(a, i_grad, adam_vals_i)
            self.beta_u += gd_commons.get_AdaM_update(a, u_grad, adam_vals_u)
        else:
            self.beta_0 += g_grad * a
            self.beta_i += i_grad * a
            self.beta_u += u_grad * a
        point.collect()
        beta_iter_point.collect()

        if i % self.gd_ll_iters == 0:
            point = tm.get_point('beta_mle')
            curr_ll = self._mle(target, users, items, features, weights)
            point.collect()

            if np.isnan(curr_ll) or np.isinf(curr_ll):
                raise ValueError(
                    'Coefficient values went out of hand -- adjust lambda and/or step size'
                )

            log.info('BETA GD MLE: [%.3f --> %.3f]' % (self.prev_ll, curr_ll))
            if np.abs(curr_ll - self.prev_ll) <= self.gd_tol:
                log.info('BETA GD: Reached convergance after %d iterations' % i)
                reached_conv = True
                self.prev_ll = curr_ll
                break
            else:
                self.prev_ll = curr_ll
                # NOTE(review): the flattened source does not show whether the decay counter advances
                # once per likelihood check (as written here) or once per iteration -- confirm.
                self.decay += 1

    if not reached_conv:
        log.error('BETA GD: Did not reach convergance after %d iterations' %
                  self.gd_num_iter)

    log.info('BETA GD: Train log like %.3f' % curr_ll)
def log_summary():
    """Logs the time measurements summary and then resets the measurements."""
    summary = get_summary()
    report = '\n\n***** TIME MEASUREMENTS *****\n\n%s\n\n' % summary
    log.info(report)
    reset_tm()