def test_differentiate_univariate_vector_function(self):
    """Check value and derivative of a 4-component function of one variable.

    The same function (logistic, log base 2, square, 1 - x) is written
    twice: once taking the variable directly and once taking an indexable
    vector (differentiated with ``scalar=False``).  Both forms are
    evaluated at x = 2 and compared with hand-derived values and
    derivatives.
    """
    def scalar_form(x):
        f1 = x.logistic()
        f2 = x.log(base=2)
        f3 = x**2
        f4 = 1 - x
        return np.array([f1, f2, f3, f4])

    def vector_form(x):
        f1 = x[0].logistic()
        f2 = x[0].log(base=2)
        f3 = x[0]**2
        f4 = 1 - x[0]
        return np.array([f1, f2, f3, f4])

    point = np.array([2])
    val_expected = np.array([1 / (1 + np.exp(-2)), np.log2(2), 4, -1])
    # One row per output component, one column for the single input.
    derivative_expected = np.array([
        [np.exp(-2) / (1 + np.exp(-2))**2],  # logistic'(2)
        [1 / (np.log(2) * 2)],               # d/dx log2(x) at 2
        [2 * 2],                             # d/dx x**2 at 2
        [-1],                                # d/dx (1 - x)
    ])

    # First call uses differentiate's default mode, second the vector mode.
    for func, kwargs in ((scalar_form, {}), (vector_form, {"scalar": False})):
        val, der = differentiate(func, point, **kwargs)
        np.testing.assert_array_almost_equal(val, val_expected)
        np.testing.assert_array_almost_equal(der, derivative_expected)
def test_differentiate_univariate_scalar_function(self):
    """Check value and derivative of a scalar function of one variable.

    f(x) = 3x^2 + 4/x + sin(x^2) is differentiated at x = 2, first written
    against the variable itself and then against an indexable vector
    (``scalar=False``).  Both results are compared with the closed-form
    derivative 6x - 4/x^2 + 2x*cos(x^2).
    """
    def scalar_form(x):
        return 3 * x**2 + 4 / x + np.sin(x**2)

    def vector_form(x):
        return 3 * x[0]**2 + 4 / x[0] + np.sin(x[0]**2)

    x = 2
    derivative_expected = np.array(
        [6 * x - 4 / x**2 + np.cos(x**2) * 2 * x])

    # Variable passed directly to the function.
    val, der = differentiate(scalar_form, np.array([2]))
    self.assertAlmostEqual(val, scalar_form(x))
    np.testing.assert_array_almost_equal(der, derivative_expected)

    # Variable passed as a one-element vector.
    val, der = differentiate(vector_form, np.array([x]), scalar=False)
    self.assertAlmostEqual(val, vector_form(np.array([x])))
    np.testing.assert_array_almost_equal(der, derivative_expected)
def test_differentiate_multivariate_vector_function(self):
    """Check value and Jacobian of a 4-component function of three variables.

    The function is written once with three separate arguments and once
    with a single indexable vector (``scalar=False``).  Both forms are
    differentiated at (x, y, z) = (2, 3, 1) and compared with the same
    hand-derived 4x3 matrix of partial derivatives.
    """
    def scalar_form(x, y, z):
        f1 = 3 * x**2 + 4 * z / x + np.log(x + y + z)
        f2 = x * y * z
        f3 = 1 / (x - z)
        f4 = y
        return np.array([f1, f2, f3, f4])

    def vector_form(x):
        f1 = 3 * x[0]**2 + 4 * x[2] / x[0] + np.log(x[0] + x[1] + x[2])
        f2 = x[0] * x[1] * x[2]
        f3 = 1 / (x[0] - x[2])
        f4 = x[1]
        return np.array([f1, f2, f3, f4])

    x, y, z = 2, 3, 1
    x_arr = np.array([x, y, z])

    # Rows follow f1..f4, columns follow d/dx, d/dy, d/dz.
    total = x + y + z
    derivative_expected = np.array([
        [
            6 * x - 4 * z / x**2 + 1 / total,
            1 / total,
            (5 * x + 4 * z + 4 * y) / (x * total),
        ],
        [y * z, x * z, x * y],
        [-1 / (x - z)**2, 0, 1 / (x - z)**2],
        [0, 1, 0],
    ])

    val, der = differentiate(scalar_form, x_arr)
    np.testing.assert_array_almost_equal(val, scalar_form(x, y, z))
    np.testing.assert_array_almost_equal(der, derivative_expected)

    val, der = differentiate(vector_form, x_arr, scalar=False)
    np.testing.assert_array_almost_equal(val, vector_form(x_arr))
    np.testing.assert_array_almost_equal(der, derivative_expected)
def test_differentiate_multivariate_scalar_function(self):
    """Check value and gradient of a scalar function of three variables.

    f(x, y, z) = 3x^2 + 4z/x + log(x + y + z) is differentiated at
    (2, 3, 1), once written with three separate arguments and once with a
    single indexable vector (``scalar=False``).  Both gradients are compared
    with the hand-derived partial derivatives.
    """
    def scalar_form(x, y, z):
        return 3 * x**2 + 4 * z / x + np.log(x + y + z)

    def vector_form(x):
        return 3 * x[0]**2 + 4 * x[2] / x[0] + np.log(x[0] + x[1] + x[2])

    x, y, z = 2, 3, 1
    x_arr = np.array([x, y, z])

    # Partial derivatives in the order d/dx, d/dy, d/dz.
    total = x + y + z
    derivative_expected = np.array([
        6 * x - 4 * z / x**2 + 1 / total,
        1 / total,
        (5 * x + 4 * z + 4 * y) / (x * total),
    ])

    val, der = differentiate(scalar_form, x_arr)
    self.assertAlmostEqual(val, scalar_form(x, y, z))
    np.testing.assert_array_almost_equal(der, derivative_expected)

    val, der = differentiate(vector_form, x_arr, scalar=False)
    self.assertAlmostEqual(val, vector_form(x_arr))
    np.testing.assert_array_almost_equal(der, derivative_expected)
def bfgs_optimize(self, num_iterations=1000, learning_rate=0.01, tolerance=None):
    """
    method that performs Quasi-Newton optimization of the objective function
    with BFGS updates of an approximate inverse Hessian

    The inverse-Hessian approximation starts as the identity matrix. Each
    iteration steps by ``-learning_rate * H @ gradient``, re-evaluates the
    objective and its gradient with ``differentiate``, appends the new
    objective value to ``self.val_history`` and then refreshes ``H`` with
    the BFGS formula.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int that scales every quasi-Newton step;
        Default is 0.01
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    EXAMPLES
    =========

    # multivariate function with scalars as input
    >>> import numpy as np
    >>> f = lambda x, y: x**2 + y**2
    >>> op = Optimizer(f, np.array([1, -1]))
    >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
    (4.82773951620493e-92, array([ 1.55366333e-46, -1.55366333e-46]))

    # multivariate function with a vector as input
    >>> import numpy as np
    >>> f = lambda x: x[0]**2 + x[1]**2
    >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
    >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
    (4.82773951620493e-92, array([ 1.55366333e-46, -1.55366333e-46]))

    # univariate function with scalar as input
    >>> import numpy as np
    >>> f = lambda x: x**2
    >>> op = Optimizer(f, np.array([1]))
    >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
    (2.4138697581024885e-92, array([1.55366333e-46]))
    """
    num_variables = len(self.variable_initialization)
    cur_variable_values = self.variable_initialization
    cur_inv_hessian = np.eye(num_variables)
    # Loop-invariant: built once instead of on every iteration.
    identity = np.eye(num_variables)

    val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    for i in range(num_iterations):
        delta_var = -learning_rate * cur_inv_hessian @ der
        cur_variable_values = cur_variable_values + delta_var
        val, der2 = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)

        # Column vectors: y is the change in gradient, s the step taken.
        y = (der2 - der).reshape(-1, 1)
        s = delta_var.reshape(-1, 1)
        denominator = (y.T @ s).item()

        # BFGS inverse-Hessian update:
        #   H <- (I - s y^T / y^T s) H (I - y s^T / y^T s) + s s^T / y^T s
        # A zero curvature term (e.g. a zero step at a stationary point)
        # would divide by zero and fill H with NaN, so H is kept as is.
        if denominator != 0:
            t1 = identity - s @ y.T / denominator
            t2 = identity - y @ s.T / denominator
            t3 = s @ s.T / denominator
            cur_inv_hessian = t1 @ cur_inv_hessian @ t2 + t3

        der = der2
        self._print_updates(i, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values
def adam_optimize(self, num_iterations=1000, learning_rate=0.001, beta1=0.9,
                  beta2=0.999, epsilon=1e-8, tolerance=None):
    """
    method that performs Adaptive Moment Estimation (adam) optimization of
    the objective function

    Two exponential moving averages are maintained: one of the gradient and
    one of the element-wise squared gradient. Both are bias-corrected by
    ``1 - beta**t`` (t = 1 on the first iteration) before being combined
    into the step ``learning_rate * m_hat / (sqrt(v_hat) + epsilon)``, which
    is subtracted from the current point. Each new objective value is
    appended to ``self.val_history``.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int specifying the step size; Default is 0.001
    - beta1: decay rate of the gradient average; must satisfy
        0 <= beta1 < 1; Default is 0.9
    - beta2: decay rate of the squared-gradient average; must satisfy
        0 <= beta2 < 1; Default is 0.999
    - epsilon: a float added to the square root in the denominator of the
        step; Default is 1e-8
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    RAISES
    =======
    - ValueError: if beta1 or beta2 lies outside [0, 1)

    EXAMPLES
    =========

    # multivariate function with scalars as input
    >>> import numpy as np
    >>> f = lambda x, y: x**3 + y**2
    >>> op = Optimizer(f, np.array([1, -1]))
    >>> op.adam_optimize(learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8)
    (6.03886825409073e-06, array([1.82103595e-02, 1.81385270e-21]))

    # multivariate function with a vector as input
    >>> import numpy as np
    >>> f = lambda x: x[0]**2 + x[1]**2
    >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
    >>> op.adam_optimize(learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8)
    (7.701661519998926e-49, array([-6.20550623e-25, 6.20550623e-25]))

    # univariate function with scalar as input
    >>> import numpy as np
    >>> f = lambda x: x**2
    >>> op = Optimizer(f, np.array([1]))
    >>> op.adam_optimize(learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8)
    (3.850830759999463e-49, array([-6.20550623e-25]))
    """
    betas_in_range = 0 <= beta1 < 1 and 0 <= beta2 < 1
    if not betas_in_range:
        raise ValueError("The value of beta (sample weight) should be between 0 and 1 (excluding 1).")

    cur_variable_values = self.variable_initialization
    val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    first_moment = 0
    second_moment = 0
    for step in range(num_iterations):
        timestep = step + 1  # bias correction uses a 1-based step count

        # Moving averages of the gradient and of its element-wise square.
        first_moment = beta1 * first_moment + (1 - beta1) * gradient
        second_moment = beta2 * second_moment + (1 - beta2) * gradient**2

        # Bias-corrected estimates of both moments.
        first_unbiased = first_moment / (1 - np.power(beta1, timestep))
        second_unbiased = second_moment / (1 - np.power(beta2, timestep))

        # Step taken on the variables.
        delta_var = learning_rate * first_unbiased / (np.sqrt(second_unbiased) + epsilon)
        cur_variable_values = cur_variable_values - delta_var

        val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)
        self._print_updates(step, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values
def rmsprop_optimize(self, num_iterations=1000, learning_rate=0.001,
                     epsilon=1e-7, beta=0.9, tolerance=None):
    """
    Method that performs RMSProp gradient descent optimization of the
    objective function.

    An exponential moving average of the element-wise squared gradient is
    maintained; every step is ``learning_rate * gradient`` divided by the
    square root of (that average + epsilon). Each new objective value is
    appended to ``self.val_history``.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int specifying the step size; Default is 0.001
    - epsilon: a float added to the running average inside the square
        root; Default is 1e-7
    - beta: a float in [0, 1], the weight given to the previous running
        average (1 - beta goes to the new squared gradient); Default is 0.9
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    RAISES
    =======
    - ValueError: if beta lies outside [0, 1]

    EXAMPLES
    =========

    # Univariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x: x**4 - x
    >>> op = Optimizer(g, np.array([1]))
    >>> op.rmsprop_optimize(num_iterations=1000, learning_rate=0.01)
    (-0.4724703937105774, array([0.62996052]))

    # Multivariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x, y: x**2 + y**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]))
    >>> op.rmsprop_optimize(num_iterations=10000, learning_rate=0.01)
    (12.00004995, array([ 0.0049975, -0.0049975]))

    # Multivariate objective function with vector inputs.
    >>> import numpy as np
    >>> g = lambda x: x[0]**2 + 2*x[1]**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
    >>> op.rmsprop_optimize(num_iterations=10000, learning_rate=0.01)
    (12.0000749625, array([ 0.0049975 , -0.00499937]))
    """
    beta_in_range = 0 <= beta <= 1
    if not beta_in_range:
        raise ValueError("The value of beta (sample weight) should be between 0 and 1.")

    cur_variable_values = self.variable_initialization
    val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    mean_sq_gradient = 0
    for step in range(num_iterations):
        # Blend the previous average with the newest squared gradient.
        mean_sq_gradient = (beta * mean_sq_gradient) + ((1 - beta) * gradient**2)
        scale = np.sqrt(mean_sq_gradient + epsilon)
        delta_var = (learning_rate * gradient) / scale
        cur_variable_values = cur_variable_values - delta_var

        val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)
        self._print_updates(step, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values
def adagrad_optimize(self, num_iterations=1000, learning_rate=0.01,
                     epsilon=1e-7, tolerance=None):
    """
    Method that performs adaptive gradient descent (Adagrad) optimization
    of the objective function.

    The element-wise squared gradients of all iterations so far are summed;
    every step is ``learning_rate * gradient`` divided by the square root of
    (that sum + epsilon). Each new objective value is appended to
    ``self.val_history``.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int specifying the step size; Default is 0.01
    - epsilon: a float added to the cumulative sum inside the square root;
        Default is 1e-7
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    EXAMPLES
    =========

    # Univariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x: x**4 - x
    >>> op = Optimizer(g, np.array([1]))
    >>> op.adagrad_optimize(num_iterations=1000, learning_rate=0.01)
    (-0.4705616040471904, array([0.65786042]))

    # Multivariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x, y: x**2 + y**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]))
    >>> op.adagrad_optimize(num_iterations=10000, learning_rate=0.01)
    (12.000013226920059, array([8.13318093e-08, 3.63688329e-03]))

    # Multivariate objective function with vector inputs.
    >>> import numpy as np
    >>> g = lambda x: x[0]**2 + 2*x[1]**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
    >>> op.adagrad_optimize(num_iterations=10000, learning_rate=0.01)
    (12.000026453839908, array([8.13318093e-08, 3.63688327e-03]))
    """
    cur_variable_values = self.variable_initialization
    val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    sum_sq_gradient = 0
    for step in range(num_iterations):
        # Accumulate the squared gradient of this iteration.
        sum_sq_gradient = sum_sq_gradient + (gradient**2)
        scale = np.sqrt(sum_sq_gradient + epsilon)
        delta_var = (learning_rate * gradient) / scale
        cur_variable_values = cur_variable_values - delta_var

        val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)
        self._print_updates(step, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values
def momentum_optimize(self, num_iterations=1000, learning_rate=0.01,
                      beta=0.9, tolerance=None):
    """
    Method that performs momentum gradient descent optimization of the
    objective function.

    A momentum term, the exponential moving average of the gradients seen
    so far, is maintained; every step is ``learning_rate * momentum`` and is
    subtracted from the current point. Each new objective value is appended
    to ``self.val_history``.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int specifying the step size; Default is 0.01
    - beta: a float in [0, 1], the weight given to the previous momentum
        (1 - beta goes to the new gradient); Default is 0.9
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    RAISES
    =======
    - ValueError: if beta lies outside [0, 1]

    EXAMPLES
    =========

    # Univariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x: x**4 - x
    >>> op = Optimizer(g, np.array([1]))
    >>> op.momentum_optimize(num_iterations=1000, learning_rate=0.01)
    (-0.4724703937105774, array([0.62996052]))

    # Multivariate objective function with scalar inputs.
    >>> import numpy as np
    >>> g = lambda x, y: x**3 + 2*y**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]))
    >>> op.momentum_optimize(num_iterations=10000, learning_rate=0.01)
    (12.000000035335317, array([ 3.28147927e-003, -1.79857502e-230]))

    # Multivariate objective function with vector inputs.
    >>> import numpy as np
    >>> g = lambda x: x[0]**3 + 2*x[1]**2 + 12
    >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
    >>> op.momentum_optimize(num_iterations=1000, learning_rate=0.01)
    (12.00002667493136, array([2.98791178e-02, 1.51990528e-23]))
    """
    beta_in_range = 0 <= beta <= 1
    if not beta_in_range:
        raise ValueError("The value of beta (sample weight) should be between 0 and 1.")

    cur_variable_values = self.variable_initialization
    val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    momentum = 0
    for step in range(num_iterations):
        # Blend the previous momentum with the newest gradient.
        momentum = (beta * momentum) + ((1 - beta) * gradient)
        delta_var = learning_rate * momentum
        cur_variable_values = cur_variable_values - delta_var

        val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)
        self._print_updates(step, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values
def gd_optimize(self, num_iterations=1000, learning_rate=0.01, tolerance=None):
    """
    method that performs plain gradient descent optimization of the
    objective function

    Every iteration subtracts ``learning_rate * gradient`` from the current
    point, re-evaluates the objective and its gradient with
    ``differentiate`` and appends the new objective value to
    ``self.val_history``.

    INPUTS
    =======
    - num_iterations: an int specifying the maximum number of iterations;
        Default is 1000
    - learning_rate: a float/int specifying the step size; Default is 0.01
    - tolerance: value forwarded, together with the latest update step, to
        self._tolerance_check; the loop stops as soon as that check is
        truthy (documented as: L2 norm of the update step smaller than
        tolerance); Default is None (documented as: no tolerance check)

    RETURNS
    ========
    - val: the objective value at the last point that was evaluated
    - cur_variable_values: the input values at that last point
        (documented as a 1D array of floats with the same size as the
        number of inputs to the objective function)

    EXAMPLES
    =========

    # multivariate function with scalars as input
    >>> import numpy as np
    >>> f = lambda x, y: x**2 + y**2
    >>> op = Optimizer(f, np.array([1, -1]))
    >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
    (3.026941164608489e-194, array([ 1.23023192e-97, -1.23023192e-97]))

    # multivariate function with a vector as input
    >>> import numpy as np
    >>> f = lambda x: x[0]**2 + x[1]**2
    >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
    >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
    (3.026941164608489e-194, array([ 1.23023192e-97, -1.23023192e-97]))

    # univariate function with scalar as input
    >>> import numpy as np
    >>> f = lambda x: x**2
    >>> op = Optimizer(f, np.array([1]))
    >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
    (1.5134705823042444e-194, array([1.23023192e-97]))
    """
    cur_variable_values = self.variable_initialization
    val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
    self.val_history = [val]

    for step in range(num_iterations):
        # Fixed-size step along the negative gradient.
        delta_var = learning_rate * gradient
        cur_variable_values = cur_variable_values - delta_var

        val, gradient = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history.append(val)
        self._print_updates(step, val)
        if self._tolerance_check(tolerance, delta_var):
            break

    return val, cur_variable_values