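# Imports and helpers this module relies on. The original file does not show
# them, so the definitions below are a minimal sketch under assumptions:
# `mmh` is taken to be MurmurHash (the `mmh3` package provides a compatible
# `hash(key, seed=...)`), and `SEED`, `sigmoid`, `logloss`, and `GetList` are
# assumed to live in the project's utilities; plain stand-ins are given here
# so the module is self-contained.
import numpy as np
from mmh3 import hash as mmh  # assumption: MurmurHash3 via the mmh3 package

SEED = 42  # hypothetical hash seed; any fixed integer works


def sigmoid(x):
    # logistic function; the caller bounds the input, so no overflow
    return 1. / (1. + np.exp(-x))


def logloss(y_true, y_prob):
    # mean cross-entropy loss of predicted probabilities
    eps = 1e-15
    y_true = np.asarray(y_true)
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_prob) +
                    (1 - y_true) * np.log(1 - y_prob))


class GetList(list):
    # list with a dict-like `get` so that hashed (list) storage and dict
    # storage can share the same access code
    def get(self, key, default=None):
        try:
            return self[key]
        except IndexError:
            return default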
class OGDLR(object):
    """Online Gradient Descent Logistic Regression

    Model adapted from Sam Hocevar with an interface similar to
    scikit-learn's. Only supports single-class prediction right now.
    The model supports per-coordinate learning rate schedules.
    """

    def __init__(self, lambda1=0., lambda2=0., alpha=0.02, beta=1.,
                 ndims=None, alr_schedule='gradient', callback=None,
                 callback_period=10000, verbose=False):
        """Parameters
        ----------
        * lambda1 (float, default: 0.): L1 regularization factor
        * lambda2 (float, default: 0.): L2 regularization factor
        * alpha (float, default: 0.02): scales learning rate up
        * beta (float, default: 1.): scales learning rate down
        * ndims (int, default: None): max number of dimensions (use
          array); if None, no max (use dict)
        * alr_schedule (string, {'gradient' (default), 'count',
          'constant'}): adaptive learning rate schedule; either decrease
          with the gradient, or with the count, or remain constant
        * callback (object, default: None): optional callback; its
          `plot` method is invoked periodically during fitting
        * callback_period (int, default: 10000): period of the callback
        * verbose: not much here yet

        Notes on parameter choice:
        * alpha must be greater than 0.
        * "The optimal value of alpha can vary a fair bit..."
        * "beta=1 is usually good enough; this simply ensures that early
          learning rates are not too high" -- McMahan et al.
        """
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.alpha = alpha
        self.beta = beta
        self.ndims = ndims
        self.alr_schedule = alr_schedule
        self.callback = callback
        self.callback_period = callback_period
        self.verbose = verbose

    def _initialize_dicts(self):
        # weights and number of iterations (more or less) for the
        # adaptive learning rate
        self.w = {}
        self.num = {}

    def _initialize_lists(self):
        # weights and number of iterations (more or less) for the
        # adaptive learning rate
        self.w = GetList([0.] * self.ndims)
        self.num = GetList([0.] * self.ndims)

    def _initialize_cols(self, X, cols):
        # column names
        m = X.shape[1]
        if cols is not None:
            if len(set(cols)) != len(cols):
                raise ValueError("Columns contain duplicate names.")
            self.cols = cols  # names of the columns
        else:
            # generate generic column names
            s1 = "col{0:0%dd}" % len(str(m))
            self.cols = [s1.format(i) for i in range(m)]

    def _initialize(self, X, cols):
        """Initialize some required attributes on first call."""
        if self.ndims is None:
            self._initialize_dicts()
        else:
            self._initialize_lists()
        self._initialize_cols(X, cols)
        # Validation for each single iteration. Could be changed to
        # collections.deque if speed and size are an issue here.
        self.valid_history = []
        self._is_initialized = True

    def _get_x(self, xt):
        # 'BIAS' is the bias term. The other keys are created as the
        # column name joined with the value itself. This should ensure
        # unique keys.
        if self.ndims is None:
            x = ['BIAS'] + ['__'.join((col, str(val)))
                            for col, val in zip(self.cols, xt)]
        else:
            x = [mmh('BIAS', seed=SEED) % self.ndims]
            x += [mmh(key + '__' + str(val), seed=SEED) % self.ndims
                  for key, val in zip(self.cols, xt)]
        return x

    def _get_w(self, xt):
        wt = [self.w.get(xi, 0.) for xi in xt]
        return wt

    def _get_p(self, xt, wt=None):
        if wt is None:
            wt = self._get_w(xt)
        wTx = sum(wt)
        # bounded sigmoid
        wTx = max(min(wTx, 20.), -20.)
        return sigmoid(wTx)

    def _get_num(self, xt):
        numt = [self.num.get(xi, 0.) for xi in xt]
        return numt
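    # How the adaptive learning rate plays out: `_update` below scales
    # each step by alpha / (beta + sqrt(num_i)), where num_i is
    # maintained per coordinate via `_get_delta_num`. With 'gradient',
    # num_i accumulates squared gradients (AdaGrad-style); with 'count',
    # it counts updates; with 'constant', it stays at 0. A rough sketch
    # with illustrative values only:
    #
    #   >>> alpha, beta = 0.02, 1.
    #   >>> alpha / (beta + 0. ** 0.5)   # first update of a coordinate
    #   0.02
    #   >>> alpha / (beta + 9. ** 0.5)   # after num_i has grown to 9
    #   0.005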
    def _get_delta_num(self, grad):
        if self.alr_schedule == 'gradient':
            return grad * grad
        elif self.alr_schedule == 'count':
            return 1
        elif self.alr_schedule == 'constant':
            return 0
        else:
            raise ValueError("Do not know learning rate schedule %s"
                             % self.alr_schedule)

    def _get_grads(self, yt, pt, wt):
        # Get the gradient as a function of true value and predicted
        # probability, as well as the regularization terms. The gradient
        # is the derivative of the cost function with respect to each
        # wi.
        y_err = pt - yt
        costs = self._get_regularization(wt)
        grads = [y_err + cost for cost in costs]
        return grads

    def _get_regularization(self, wt):
        # Get the cost from L1 and L2 regularization. Currently, the
        # bias is also regularized, but that should not matter much.
        costs = self.lambda1 * np.sign(wt)  # L1
        costs += 2 * self.lambda2 * np.array(wt)  # L2
        return costs

    def _get_skip_sample(self, class_weight, y):
        if class_weight == 'auto':
            class_weight = np.mean(y) / (1 - np.mean(y))
        if class_weight != 1.:
            rand = np.random.random(len(y))
            skip_sample = rand > class_weight
            skip_sample[np.array(y) == 1] = False  # don't skip 1 labels
            if self.verbose:
                print("Using {:0.3f}% of negative samples".format(
                    100 * class_weight))
        else:
            skip_sample = [False] * len(y)
        return skip_sample

    def _call_back(self, t):
        if (self.callback is not None and
                t != 0 and
                t % self.callback_period == 0):
            self.callback.plot(self)

    def _update(self, wt, xt, gradt, sample_weight):
        # note: wt is not used here but is still passed so that the
        # interface for FTRL proximal (which requires wt) can stay the
        # same
        numt = self._get_num(xt)
        for xi, numi, gradi in zip(xt, numt, gradt):
            delta_w = gradi * self.alpha / (self.beta + np.sqrt(numi))
            delta_w /= sample_weight
            self.w[xi] = self.w.get(xi, 0.) - delta_w
            delta_num = self._get_delta_num(gradi)
            self.num[xi] = numi + delta_num
        return self

    def _update_valid(self, yt, pt):
        self.valid_history.append((yt, pt))

    def fit(self, X, y, cols=None, class_weight=1.):
        """Fit the OGDLR model. Can also be used as if it were
        'partial_fit', i.e. calling 'fit' more than once continues with
        the learned weights.

        Parameters
        ----------
        X : numpy.array, shape = [n_samples, n_features]
            Training data

        y : numpy.array, shape = [n_samples]
            Target values

        cols : list of strings, shape = [n_features] (default: None)
            The names of the columns used for training. They are used
            as the first part of the key. If None is given, generic,
            zero-padded names are generated for the columns ('col0',
            'col1', etc.).

        class_weight : float or 'auto' (optional, default=1.0)
            Subsample the negative class. Assumes that classes are
            {0, 1} and that the negative (0) class is the more common
            one. If these assumptions are not met, don't change this
            parameter.

        Returns
        -------
        self
        """
        if not hasattr(self, '_is_initialized'):
            self._initialize(X, cols)

        # skip samples if a class weight is given
        skip_sample = self._get_skip_sample(class_weight, y)

        n = X.shape[0]
        for t in range(n):
            # if classes are weighted differently
            if skip_sample[t]:
                continue
            else:
                sample_weight = 1. if y[t] == 1 else class_weight
            yt = y[t]
            xt = self._get_x(X[t])
            wt = self._get_w(xt)
            pt = self._get_p(xt, wt)
            gradt = self._get_grads(yt, pt, wt)
            self._update(wt, xt, gradt, sample_weight)
            self._update_valid(yt, pt)
            self._call_back(t)
        return self
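    # A minimal usage sketch (hypothetical categorical data; since
    # `fit` returns self, it can be chained):
    #
    #   >>> X = np.array([['a', 'x'], ['b', 'y'], ['a', 'y']])
    #   >>> y = np.array([1, 0, 1])
    #   >>> clf = OGDLR().fit(X, y, cols=['f1', 'f2'])
    #   >>> probs = clf.predict_proba(X)  # probabilities for class 1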
    def predict_proba(self, X):
        """Predict the probability of class 1.

        Parameters
        ----------
        X : numpy.array, shape = [n_samples, n_features]
            Samples

        Returns
        -------
        y_prob : array, shape = [n_samples]
            Predicted probability for class 1
        """
        X = [self._get_x(xt) for xt in X]
        pt = [self._get_p(xt) for xt in X]
        y_prob = np.array(pt)
        return y_prob

    def predict(self, X):
        """Predict the class label for samples in X.

        Parameters
        ----------
        X : numpy.array, shape = [n_samples, n_features]
            Samples

        Returns
        -------
        y_pred : array, shape = [n_samples]
            Predicted class label per sample.
        """
        pt = self.predict_proba(X)
        y_pred = (pt > 0.5).astype(int)
        return y_pred

    def _cost_function(self, xt, wt, y):
        pt = self._get_p(xt, wt)
        ll = logloss([y], [pt])
        l1 = self.lambda1 * np.abs(wt)
        l2 = self.lambda2 * (np.array(wt) ** 2)
        J = ll + l1 + l2
        return J

    def numerical_grad(self, x, y, epsilon=1e-9):
        """Calculate the analytic gradient and the gradient determined
        numerically; they should be very close. Use this function to
        verify that the gradient is determined correctly. The fit
        method needs to be called once before this method may be
        invoked.

        Parameters
        ----------
        x : list of strings
            The keys; just a single row.

        y : int
            The target to be predicted.

        epsilon : float (default: 1e-9)
            The shift applied to the weights to determine the numerical
            gradient. A small but not too small value such as the
            default should do the job.

        Returns
        -------
        grad : list of floats
            The gradients as determined by this class. For
            cross-entropy, each is simply the prediction minus the true
            value, plus the regularization terms.

        grad_num : list of floats
            The gradients as determined numerically.
        """
        # analytic
        xt = self._get_x(x)
        wt = self._get_w(xt)
        pt = self._get_p(xt, wt)
        grad = self._get_grads(y, pt, wt)

        # numeric: vary each wi by +/- epsilon (central difference)
        grad_num = []
        for i in range(len(wt)):
            wt_pe, wt_me = wt[:], wt[:]
            wt_pe[i] += epsilon
            wt_me[i] -= epsilon
            cost_pe = self._cost_function(xt, wt_pe, y)[i]
            cost_me = self._cost_function(xt, wt_me, y)[i]
            grad_num.append((cost_pe - cost_me) / 2 / epsilon)
        return grad, grad_num

    def keys(self):
        """Return the keys saved by the model.

        This only works if the model uses the dictionary method for
        storing its keys and values. Otherwise, it returns None.

        Returns
        -------
        keys : None or list of strings
            The keys of the model if they can be retrieved, else None.
        """
        if self.ndims is None:
            return list(self.w.keys())
        else:
            return None

    def weights(self):
        """Return the weights saved by the model.

        The weights are not necessarily the same as in the 'w'
        attribute.

        Returns
        -------
        weights : None or list of floats
            The weights of the model. If the model uses a dictionary
            for storing the weights, they are returned in the same
            order as the keys. If the model uses hashing to a list to
            store the weights, returns None.
        """
        if self.ndims is None:
            weights = self._get_w(self.keys())
            return weights
        else:
            return None
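# A small self-check on synthetic data: a sketch only, with made-up feature
# values. It verifies that the analytic gradient from `_get_grads` matches
# the numerical one from `numerical_grad`; a larger epsilon than the default
# is passed to stay well clear of floating point cancellation.
if __name__ == '__main__':
    X = np.array([['a', 'x'], ['b', 'y'], ['a', 'y'], ['b', 'x']])
    y = np.array([1, 0, 1, 0])
    clf = OGDLR(alr_schedule='count')
    clf.fit(X, y)
    grad, grad_num = clf.numerical_grad(X[0], y[0], epsilon=1e-6)
    print(np.allclose(grad, grad_num, atol=1e-4))  # expect True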