import warnings

import numpy as np
import scipy


def __iter__(self):
    p = np.size(self.wrt)
    # v is an auxiliary trace fed into the Hessian-vector product; eta holds
    # one adaptive step size per parameter.
    v = 0.0001 * np.random.randn(p)
    eta = self.eta0 * np.ones(p)

    for i, (args, kwargs) in enumerate(self.args):
        gradient = self.fprime(self.wrt, *args, **kwargs)
        if not is_nonzerofinite(gradient):
            warnings.warn('gradient is either zero, nan or inf')
            break
        Hp = self.f_Hp(self.wrt, v, *args, **kwargs)
        # Adapt the step sizes multiplicatively, update the trace, then take
        # a gradient step.
        eta = eta * np.maximum(0.5, 1 + self.mu * v * gradient)
        v *= self.lmbd
        v += eta * (gradient - self.lmbd * Hp)
        self.wrt -= eta * gradient

        yield {
            'n_iter': i,
            'args': args,
            'kwargs': kwargs,
            'gradient': gradient,
            'v': v,
            'eta': eta,
        }
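# The iterators in this listing guard against degenerate arrays with
# is_nonzerofinite, whose definition is not part of this listing. A minimal
# sketch of what the name implies it checks:
def is_nonzerofinite(arr):
    """Return True iff `arr` contains no nan/inf and is not all zero."""
    return bool(np.isfinite(arr).all() and (arr != 0).any())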
def __iter__(self):
    args, kwargs = next(self.args)
    grad = self.fprime(self.wrt, *args, **kwargs)
    grad_m1 = scipy.zeros(grad.shape)
    loss = self.f(self.wrt, *args, **kwargs)
    loss_m1 = 0

    for i, (next_args, next_kwargs) in enumerate(self.args):
        if i == 0:
            direction, info = -grad, {}
        else:
            direction, info = self.find_direction(grad_m1, grad, direction)

        if not is_nonzerofinite(direction):
            self.logfunc(
                {'message': 'direction is invalid -- need to bail out.'})
            break

        # Line search minimization. Pick the initial step length from the
        # previous decrease in the loss.
        initialization = 2 * (loss - loss_m1) / scipy.dot(grad, direction)
        initialization = min(1, initialization)
        step_length = self.line_search.search(
            direction, initialization, args, kwargs)
        self.wrt += step_length * direction

        # If we don't bail out here, we will enter regions of numerical
        # instability.
        if (abs(grad) < self.epsilon).all():
            self.logfunc(
                {'message': 'converged - gradient smaller than epsilon'})
            break

        # Prepare everything for the next loop.
        args, kwargs = next_args, next_kwargs
        grad_m1[:], grad[:] = grad, self.line_search.grad
        loss_m1, loss = loss, self.line_search.val

        info.update({
            'loss': loss,
            'step_length': step_length,
            'n_iter': i,
            'args': args,
            'gradient': grad,
            'gradient_m1': grad_m1,
            'kwargs': kwargs,
        })
        yield info
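# find_direction above is defined on the class elsewhere. For a nonlinear
# conjugate gradient iterator it would typically be a beta-based update; a
# minimal sketch using the Polak-Ribiere+ rule with the same
# (grad_m1, grad, direction_m1) call signature -- the rule actually used by
# this class may differ:
def find_direction(self, grad_m1, grad, direction_m1):
    grad_diff = grad - grad_m1
    beta = np.inner(grad, grad_diff) / np.inner(grad_m1, grad_m1)
    # Restart with steepest descent whenever beta turns negative.
    beta = max(0., beta)
    direction = -grad + beta * direction_m1
    return direction, {'beta': beta}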
def __iter__(self):
    args, kwargs = next(self.args)
    grad = self.fprime(self.wrt, *args, **kwargs)
    grad_m1 = np.zeros(grad.shape)
    loss = self.f(self.wrt, *args, **kwargs)
    loss_m1 = 0

    for i, (next_args, next_kwargs) in enumerate(self.args):
        if i == 0:
            direction, info = -grad, {}
        else:
            direction, info = self.find_direction(grad_m1, grad, direction)

        if not is_nonzerofinite(direction):
            warnings.warn('direction is either zero, nan or inf')
            break

        # Line search minimization.
        initialization = 2 * (loss - loss_m1) / np.dot(grad, direction)
        initialization = min(1, initialization)
        step_length = self.line_search.search(
            direction, initialization, args, kwargs)
        self.wrt += step_length * direction

        # If we don't bail out here, we will enter regions of numerical
        # instability.
        if (abs(grad) < self.min_grad).all():
            warnings.warn('gradient is too small')
            break

        # Prepare everything for the next loop.
        args, kwargs = next_args, next_kwargs
        grad_m1[:], grad[:] = grad, self.line_search.grad
        loss_m1, loss = loss, self.line_search.val

        info.update({
            'n_iter': i,
            'args': args,
            'kwargs': kwargs,
            'loss': loss,
            'gradient': grad,
            'gradient_m1': grad_m1,
            'step_length': step_length,
        })
        yield info
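# The line_search collaborators used by these iterators expose a
# .search(direction, initialization, args, kwargs) method and cache the loss
# and gradient at the accepted point in .val and .grad. A minimal
# backtracking sketch with that interface; the constants and the acceptance
# rule here are illustrative, not the real implementation:
class BacktrackLineSearch(object):

    def __init__(self, wrt, f, fprime):
        self.wrt = wrt
        self.f = f
        self.fprime = fprime
        self.val = None
        self.grad = None

    def search(self, direction, initialization, args, kwargs):
        t = initialization if initialization else 1.
        loss0 = self.f(self.wrt, *args, **kwargs)
        for _ in range(30):
            candidate = self.wrt + t * direction
            val = self.f(candidate, *args, **kwargs)
            if val < loss0:
                # Accept: cache loss and gradient at the new point.
                self.val = val
                self.grad = self.fprime(candidate, *args, **kwargs)
                return t
            t *= 0.5
        # No decrease found; report a zero step so callers can bail out.
        self.val = loss0
        self.grad = self.fprime(self.wrt, *args, **kwargs)
        return 0.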
def __iter__(self):
    args, kwargs = next(self.args)
    grad = self.fprime(self.wrt, *args, **kwargs)
    grad_m1 = scipy.zeros(grad.shape)

    if self.inv_hessian is None:
        self.inv_hessian = scipy.eye(grad.shape[0])

    for i, (next_args, next_kwargs) in enumerate(self.args):
        if i == 0:
            direction, info = -grad, {}
        else:
            direction, info = self.find_direction(
                grad_m1, grad, step, self.inv_hessian)

        if not is_nonzerofinite(direction):
            self.logfunc(
                {'message': 'direction is invalid -- need to bail out.'})
            break

        step_length = self.line_search.search(
            direction, None, args, kwargs)

        if step_length != 0:
            step = step_length * direction
            self.wrt += step
        else:
            self.logfunc(
                {'message': 'step length is 0 -- need to bail out.'})
            break

        # Prepare everything for the next loop.
        args, kwargs = next_args, next_kwargs
        # TODO: not all line searches have .grad!
        grad_m1[:], grad[:] = grad, self.line_search.grad

        info.update({
            'step_length': step_length,
            'n_iter': i,
            'args': args,
            'kwargs': kwargs,
        })
        yield info
def __iter__(self):
    args, kwargs = next(self.args)
    grad = self.fprime(self.wrt, *args, **kwargs)
    grad_m1 = scipy.zeros(grad.shape)

    if self.inv_hessian is None:
        self.inv_hessian = scipy.eye(grad.shape[0])

    for i, (next_args, next_kwargs) in enumerate(self.args):
        if i == 0:
            direction, info = -grad, {}
        else:
            direction, info = self.find_direction(
                grad_m1, grad, step, self.inv_hessian)

        if not is_nonzerofinite(direction):
            # TODO: inform the user here.
            break

        step_length = self.line_search.search(direction, None, args, kwargs)
        if step_length != 0:
            step = step_length * direction
            self.wrt += step
        else:
            self.logfunc(
                {'message': 'step length is 0 -- need to bail out.'})
            break

        # Prepare everything for the next loop.
        args, kwargs = next_args, next_kwargs
        # TODO: not all line searches have .grad!
        grad_m1[:], grad[:] = grad, self.line_search.grad

        info.update({
            'step_length': step_length,
            'n_iter': i,
            'args': args,
            'kwargs': kwargs,
        })
        yield info
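# For the BFGS iterators above, find_direction(grad_m1, grad, step,
# inv_hessian) presumably applies the standard BFGS update to the inverse
# Hessian approximation and returns a quasi-Newton direction. A minimal
# sketch under that assumption, updating inv_hessian in place:
def find_direction(self, grad_m1, grad, step, inv_hessian):
    grad_diff = grad - grad_m1                      # y
    ys = np.inner(grad_diff, step)                  # y^T s
    if ys > 1E-10:
        # Curvature condition holds: H <- V H V^T + rho s s^T,
        # with rho = 1 / y^T s and V = I - rho s y^T.
        rho = 1. / ys
        I = np.eye(step.shape[0])
        V = I - rho * np.outer(step, grad_diff)
        inv_hessian[:] = (V.dot(inv_hessian).dot(V.T)
                          + rho * np.outer(step, step))
    return -inv_hessian.dot(grad), {}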
def __iter__(self):
    p = np.size(self.wrt)
    v = 0.0001 * np.random.randn(p)
    eta = self.eta0 * np.ones(p)

    for i, (args, kwargs) in enumerate(self.args):
        gradient = self.fprime(self.wrt, *args, **kwargs)
        if not is_nonzerofinite(gradient):
            self.logfunc(
                {'message': 'gradient is invalid -- need to bail out.'})
            break
        Hp = self.f_Hp(self.wrt, v, *args, **kwargs)
        eta = eta * np.maximum(0.5, 1 + self.mu * v * gradient)
        v *= self.lmbd
        v += eta * (gradient - self.lmbd * Hp)
        self.wrt -= eta * gradient

        yield {'gradient': gradient, 'v': v, 'eta': eta}
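# The iterators above need a Hessian-vector product f_Hp(wrt, v, ...). If no
# exact product (e.g. via an R-operator) is available, it can be approximated
# by central differences of the gradient. A sketch of that fallback; eps is a
# choice made here, not taken from the original:
def f_Hp(self, wrt, v, *args, **kwargs):
    eps = 1E-6
    # H v ~= (f'(w + eps v) - f'(w - eps v)) / (2 eps)
    return (self.fprime(wrt + eps * v, *args, **kwargs)
            - self.fprime(wrt - eps * v, *args, **kwargs)) / (2 * eps)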
def __iter__(self):
    args, kwargs = next(self.args)
    grad = self.fprime(self.wrt, *args, **kwargs)
    grad_m1 = scipy.zeros(grad.shape)

    factor_shape = self.n_factors, self.wrt.shape[0]
    grad_diffs = scipy.zeros(factor_shape)
    steps = scipy.zeros(factor_shape)
    hessian_diag = self.initial_hessian_diag
    step_length = None

    step = scipy.empty(grad.shape)
    grad_diff = scipy.empty(grad.shape)

    # We need to keep track of the order in which statistics from different
    # iterations are saved.
    #
    # Why?
    #
    # Each iteration, we save statistics such as the difference between
    # gradients and the actual steps taken. These are later combined into an
    # approximation of the Hessian; we call them factors. Since we don't
    # want to create a new matrix of factors each iteration, we instead keep
    # track externally of which row of the matrix corresponds to which
    # iteration. `idxs` is a list that maps its i'th element to the
    # corresponding row of the arrays; thus, idxs[i] contains the row index
    # for the (n_factors - i)'th iteration prior to the current one.
    idxs = []

    for i, (next_args, next_kwargs) in enumerate(self.args):
        if i == 0:
            direction = -grad
            info = {}
        else:
            sTgd = scipy.inner(step, grad_diff)
            if sTgd > 1E-10:
                # Don't do an update if this value is too small.

                # Determine the index for the current update.
                if not idxs:
                    # First iteration.
                    this_idx = 0
                elif len(idxs) < self.n_factors:
                    # We are not "full" yet; append the next index.
                    this_idx = idxs[-1] + 1
                else:
                    # We are full and recycle the oldest index.
                    this_idx = idxs.pop(0)
                idxs.append(this_idx)

                grad_diffs[this_idx] = grad_diff
                steps[this_idx] = step
                hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)

            direction, info = self.find_direction(
                grad_diffs, steps, -grad, hessian_diag, idxs)

        if not is_nonzerofinite(direction):
            warnings.warn('search direction is either 0, nan or inf')
            break

        step_length = self.line_search.search(
            direction, None, args, kwargs)
        step[:] = step_length * direction
        if step_length != 0:
            self.wrt += step
        else:
            warnings.warn('step length is 0')

        # Prepare everything for the next loop.
        args, kwargs = next_args, next_kwargs
        # TODO: not all line searches have .grad!
        grad_m1[:], grad[:] = grad, self.line_search.grad
        grad_diff = grad - grad_m1

        info.update({
            'step_length': step_length,
            'n_iter': i,
            'args': args,
            'kwargs': kwargs,
            'loss': self.line_search.val,
            'gradient': grad,
            'gradient_m1': grad_m1,
        })
        yield info
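# find_direction for the limited-memory iterator above combines the stored
# steps and gradient differences, presumably via the standard L-BFGS two-loop
# recursion. A minimal sketch with the (grad_diffs, steps, -grad,
# hessian_diag, idxs) call signature; since the recursion is linear in its
# seed vector, feeding it -grad directly yields the descent direction:
def find_direction(self, grad_diffs, steps, neg_grad, hessian_diag, idxs):
    z = neg_grad.copy()
    alphas = {}
    # First loop: walk the factors from newest to oldest.
    for idx in reversed(idxs):
        rho = 1. / np.inner(grad_diffs[idx], steps[idx])
        alphas[idx] = rho * np.inner(steps[idx], z)
        z -= alphas[idx] * grad_diffs[idx]
    # Scale by the diagonal guess for the initial Hessian.
    z *= hessian_diag
    # Second loop: walk the factors from oldest to newest.
    for idx in idxs:
        rho = 1. / np.inner(grad_diffs[idx], steps[idx])
        beta = rho * np.inner(grad_diffs[idx], z)
        z += (alphas[idx] - beta) * steps[idx]
    return z, {}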