def init_param_updates(self, layer, parameter): step = self.variables.step epsilon = self.epsilon parameter_shape = parameter.get_value().shape prev_mean_squred_grad = theano.shared( name="{}/prev-mean-squred-grad".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) prev_mean_squred_dx = theano.shared( name="{}/prev-mean-squred-dx".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) gradient = T.grad(self.variables.error_func, wrt=parameter) mean_squred_grad = ( self.decay * prev_mean_squred_grad + (1 - self.decay) * gradient ** 2 ) parameter_delta = gradient * ( T.sqrt(prev_mean_squred_dx + epsilon) / T.sqrt(mean_squred_grad + epsilon) ) mean_squred_dx = ( self.decay * prev_mean_squred_dx + (1 - self.decay) * parameter_delta ** 2 ) return [ (prev_mean_squred_grad, mean_squred_grad), (prev_mean_squred_dx, mean_squred_dx), (parameter, parameter - step * parameter_delta), ]
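# A minimal NumPy sketch of the Adadelta update rule used above, for tracing the
# math without Theano. The helper name and default hyperparameters here are
# illustrative assumptions, not part of NeuPy.
import numpy as np

def adadelta_step(param, grad, mean_sq_grad, mean_sq_dx,
                  step=1.0, decay=0.95, epsilon=1e-5):
    # Decaying average of squared gradients.
    mean_sq_grad = decay * mean_sq_grad + (1 - decay) * grad ** 2
    # Rescale the gradient by RMS(previous deltas) / RMS(gradients).
    delta = grad * (np.sqrt(mean_sq_dx + epsilon) /
                    np.sqrt(mean_sq_grad + epsilon))
    # Decaying average of squared parameter deltas.
    mean_sq_dx = decay * mean_sq_dx + (1 - decay) * delta ** 2
    return param - step * delta, mean_sq_grad, mean_sq_dx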
def init_param_updates(self, layer, parameter): step = self.variables.step parameter_shape = T.shape(parameter).eval() prev_delta = theano.shared( name="{}/prev-delta".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) prev_gradient = theano.shared( name="{}/prev-grad".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) gradient = T.grad(self.variables.error_func, wrt=parameter) grad_delta = T.abs_(prev_gradient - gradient) parameter_delta = ifelse( T.eq(self.variables.epoch, 1), gradient, T.clip( T.abs_(prev_delta) * gradient / grad_delta, -self.upper_bound, self.upper_bound ) ) return [ (parameter, parameter - step * parameter_delta), (prev_gradient, gradient), (prev_delta, parameter_delta), ]
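# NumPy sketch of the Quickprop delta computed above: the raw gradient is used on
# the first epoch, afterwards the previous delta is rescaled by the ratio between
# the gradient and its change and clipped to the upper bound. The function name
# and the default bound are illustrative assumptions.
import numpy as np

def quickprop_delta(grad, prev_grad, prev_delta, epoch, upper_bound=1.0):
    if epoch == 1:
        return grad
    grad_delta = np.abs(prev_grad - grad)
    return np.clip(np.abs(prev_delta) * grad / grad_delta,
                   -upper_bound, upper_bound)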
def quadratic_minimizer(x_a, y_a, y_prime_a, x_b, y_b, bound_size_ratio=0.1): """ Finds the minimizer for a quadratic polynomial that goes through the points (x_a, y_a), (x_b, y_b) with derivative at x_a of y_prime_a. Parameters ---------- x_a : float or theano variable Left point ``a`` in the ``x`` axis. y_a : float or theano variable Output from function ``y`` at point ``a``. y_prime_a : float or theano variable Output from function ``y'`` (``y`` derivative) at point ``a``. x_b : float or theano variable Right point ``b`` in the ``x`` axis. y_b : float or theano variable Output from function ``y`` at point ``b``. bound_size_ratio : float Value controls acceptable bounds for the interpolation. If the value is too close to one of the points, the interpolation result will be ignored. The bigger the ratio, the more likely the interpolation is to be rejected. Value needs to be between ``0`` and ``1``. Defaults to ``0.1``. Returns ------- object Theano variable that after evaluation is equal to the point ``x`` which is the minimizer of the quadratic function. """ if not 0 <= bound_size_ratio < 1: raise ValueError("Value ``bound_size_ratio`` needs to be a float " "between 0 and 1, got {}".format(bound_size_ratio)) # The main formula works for the region [0, a], so we need to # shift the function to the left side and put point ``a`` # at the ``0`` position. x_range = x_b - x_a coef = (y_b - y_a - y_prime_a * x_range) / (x_range ** 2) minimizer = -y_prime_a / (asfloat(2) * coef) + x_a bound_size_ratio = asfloat(bound_size_ratio) return T.switch( sequential_or( # Handle bad cases T.eq(x_range, zero), coef <= zero, T.gt(minimizer, x_b - bound_size_ratio * x_range), T.lt(minimizer, x_a + bound_size_ratio * x_range), ), x_a + asfloat(0.5) * x_range, # Since we shifted the function to the left, we need to shift # the result to the right to make it correct for # the specified region. That's why we are adding ``x_a`` # at the end. -y_prime_a / (asfloat(2) * coef) + x_a )
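# A quick plain-float sanity check of the interpolation formula above: for
# y(x) = (x - 1)^2 sampled at a = 0 and b = 3, the interpolated minimizer should
# land exactly at x = 1. Purely illustrative and independent of Theano.
x_a, x_b = 0.0, 3.0
y = lambda x: (x - 1.0) ** 2
y_a, y_b = y(x_a), y(x_b)
y_prime_a = 2.0 * (x_a - 1.0)  # derivative of y at x_a
x_range = x_b - x_a
coef = (y_b - y_a - y_prime_a * x_range) / x_range ** 2
minimizer = -y_prime_a / (2.0 * coef) + x_a
assert abs(minimizer - 1.0) < 1e-12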
def test_upscale_layer(self): input_value = np.array([ [1, 2, 3, 4], [5, 6, 7, 8], ]).reshape((1, 1, 2, 4)) expected_output = np.array([ [1, 1, 2, 2, 3, 3, 4, 4], [1, 1, 2, 2, 3, 3, 4, 4], [1, 1, 2, 2, 3, 3, 4, 4], [5, 5, 6, 6, 7, 7, 8, 8], [5, 5, 6, 6, 7, 7, 8, 8], [5, 5, 6, 6, 7, 7, 8, 8], ]).reshape((1, 1, 6, 8)) upscale_layer = layers.Upscale((3, 2)) connection = layers.Input((1, 2, 4)) > upscale_layer x = T.tensor4('x') actual_output = upscale_layer.output(x) actual_output = actual_output.eval({x: asfloat(input_value)}) np.testing.assert_array_almost_equal( asfloat(expected_output), actual_output )
def init_param_updates(self, layer, parameter): epoch = self.variables.epoch step = self.variables.step beta1 = self.beta1 beta2 = self.beta2 parameter_shape = T.shape(parameter).eval() prev_first_moment = theano.shared( name="{}/prev-first-moment".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) prev_weighted_inf_norm = theano.shared( name="{}/prev-weighted-inf-norm".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) gradient = T.grad(self.variables.error_func, wrt=parameter) first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm, T.abs_(gradient)) parameter_delta = ( (1 / (1 - beta1 ** epoch)) * (first_moment / (weighted_inf_norm + self.epsilon)) ) return [ (prev_first_moment, first_moment), (prev_weighted_inf_norm, weighted_inf_norm), (parameter, parameter - step * parameter_delta), ]
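# NumPy sketch of the Adamax step above: an exponential moving average of the
# gradient combined with an exponentially weighted infinity norm, plus the bias
# correction factor 1 / (1 - beta1 ** epoch). Defaults are illustrative.
import numpy as np

def adamax_step(param, grad, first_moment, inf_norm, epoch,
                step=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    first_moment = beta1 * first_moment + (1 - beta1) * grad
    inf_norm = np.maximum(beta2 * inf_norm, np.abs(grad))
    delta = (first_moment / (inf_norm + epsilon)) / (1 - beta1 ** epoch)
    return param - step * delta, first_moment, inf_norm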
def test_mixture_of_experts(self): dataset = datasets.load_diabetes() data, target = asfloat(dataset.data), asfloat(dataset.target) insize, outsize = data.shape[1], 1 input_scaler = preprocessing.MinMaxScaler((-1, 1)) output_scaler = preprocessing.MinMaxScaler() x_train, x_test, y_train, y_test = cross_validation.train_test_split( input_scaler.fit_transform(data), output_scaler.fit_transform(target.reshape(-1, 1)), train_size=0.8 ) n_epochs = 10 scaled_y_test = output_scaler.inverse_transform(y_test) scaled_y_test = scaled_y_test.reshape((y_test.size, 1)) # -------------- Train single GradientDescent -------------- # bpnet = algorithms.GradientDescent( (insize, 20, outsize), step=0.1, verbose=False ) bpnet.train(x_train, y_train, epochs=n_epochs) network_output = bpnet.predict(x_test) network_error = rmsle(output_scaler.inverse_transform(network_output), scaled_y_test) # -------------- Train ensemble -------------- # moe = algorithms.MixtureOfExperts( networks=[ algorithms.Momentum( (insize, 20, outsize), step=0.1, batch_size=1, verbose=False ), algorithms.Momentum( (insize, 20, outsize), step=0.1, batch_size=1, verbose=False ), ], gating_network=algorithms.Momentum( layers.Softmax(insize) > layers.Output(2), step=0.1, verbose=False ) ) moe.train(x_train, y_train, epochs=n_epochs) ensemble_output = moe.predict(x_test) ensemble_error = rmsle( output_scaler.inverse_transform(ensemble_output), scaled_y_test ) self.assertGreater(network_error, ensemble_error)
def init_variables(self): super(ConjugateGradient, self).init_variables() n_parameters = count_parameters(self.connection) self.variables.update( prev_delta=theano.shared(name="conj-grad/prev-delta", value=asfloat(np.zeros(n_parameters))), prev_gradient=theano.shared(name="conj-grad/prev-gradient", value=asfloat(np.zeros(n_parameters))), )
def test_batch_norm_as_shared_variable(self): gamma = theano.shared(value=asfloat(np.ones(2))) beta = theano.shared(value=asfloat(2 * np.ones(2))) batch_norm = layers.BatchNorm(gamma=gamma, beta=beta) layers.Input(10) > batch_norm self.assertIs(gamma, batch_norm.gamma) self.assertIs(beta, batch_norm.beta)
def test_concatenate_basic(self): concat_layer = layers.Concatenate(axis=1) x1 = T.tensor4() x2 = T.tensor4() y = theano.function([x1, x2], concat_layer.output(x1, x2)) x1_tensor4 = asfloat(np.random.random((1, 2, 3, 4))) x2_tensor4 = asfloat(np.random.random((1, 8, 3, 4))) output = y(x1_tensor4, x2_tensor4) self.assertEqual((1, 10, 3, 4), output.shape)
def test_elementwise_basic(self): elem_layer = layers.Elementwise(merge_function=T.add) x1 = T.matrix() x2 = T.matrix() y = theano.function([x1, x2], elem_layer.output(x1, x2)) x1_matrix = asfloat(np.random.random((10, 2))) x2_matrix = asfloat(np.random.random((10, 2))) expected_output = x1_matrix + x2_matrix actual_output = y(x1_matrix, x2_matrix) np.testing.assert_array_almost_equal(expected_output, actual_output)
def init_layers(self): super(Adamax, self).init_layers() for layer in self.layers: for parameter in layer.parameters: parameter_shape = T.shape(parameter).eval() parameter.prev_first_moment = theano.shared( name="prev_first_moment_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), ) parameter.prev_weighted_inf_norm = theano.shared( name="prev_weighted_inf_norm_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), )
def init_layers(self): super(Quickprop, self).init_layers() for layer in self.layers: for parameter in layer.parameters: parameter_shape = T.shape(parameter).eval() parameter.prev_delta = theano.shared( name="prev_delta_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), ) parameter.prev_gradient = theano.shared( name="prev_grad_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), )
def init_layers(self): super(Adadelta, self).init_layers() for layer in self.layers: for parameter in layer.parameters: parameter_shape = T.shape(parameter).eval() parameter.prev_mean_squred_grad = theano.shared( name="prev_mean_squred_grad_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), ) parameter.prev_mean_squred_dx = theano.shared( name="prev_mean_squred_dx_" + parameter.name, value=asfloat(np.zeros(parameter_shape)), )
def test_jacobian_for_levenberg_marquardt(self): w1 = theano.shared(name='w1', value=asfloat(np.array([[1]]))) b1 = theano.shared(name='b1', value=asfloat(np.array([0]))) w2 = theano.shared(name='w2', value=asfloat(np.array([[2]]))) b2 = theano.shared(name='b2', value=asfloat(np.array([1]))) x = T.matrix('x') y = T.matrix('y') output = ((x.dot(w1.T) + b1) ** 2).dot(w2.T) + b2 error_func = T.mean((y - output), axis=1) x_train = asfloat(np.array([[1, 2, 3]]).T) y_train = asfloat(np.array([[1, 2, 3]]).T) output_expected = asfloat(np.array([[3, 9, 19]]).T) np.testing.assert_array_almost_equal( output.eval({x: x_train}), output_expected ) jacobian_expected = asfloat(np.array([ [-4, -4, -1, -1], [-16, -8, -4, -1], [-36, -12, -9, -1], ])) jacobian_actual = compute_jacobian(error_func, [w1, b1, w2, b2]) np.testing.assert_array_almost_equal( jacobian_expected, jacobian_actual.eval({x: x_train, y: y_train}) )
def test_categorical_hinge_without_one_hot_encoding(self): targets = asfloat(np.array([2, 0])) predictions = asfloat(np.array([ [0.1, 0.2, 0.7], [0.0, 0.9, 0.1], ])) expected = asfloat(np.array([0.5, 1.9]).mean()) prediction_var = T.matrix() target_var = T.vector() error_output = errors.categorical_hinge(target_var, prediction_var) actual = error_output.eval({prediction_var: predictions, target_var: targets}) self.assertAlmostEqual(expected, actual)
def golden_search(f, maxstep=50, maxiter=1024, tol=1e-5): """ Identify best step for function in specific direction. Parameters ---------- f : func maxstep : float Defaults to ``50``. maxiter : int Defaults to ``1024``. tol : float Defaults to ``1e-5``. Returns ------- float Identified optimal step. """ golden_ratio = asfloat((math.sqrt(5) - 1) / 2) def interval_reduction(a, b, c, d, tol): fc = f(c) fd = f(d) a, b, c, d = ifelse( T.lt(fc, fd), [a, d, d - golden_ratio * (d - a), c], [c, b, d, c + golden_ratio * (b - c)] ) stoprule = theano.scan_module.until( T.lt(T.abs_(c - d), tol) ) return [a, b, c, d], stoprule a = T.constant(asfloat(0)) b = maxstep c = b - golden_ratio * (b - a) d = a + golden_ratio * (b - a) (a, b, c, d), _ = theano.scan( interval_reduction, outputs_info=[a, b, c, d], non_sequences=[asfloat(tol)], n_steps=maxiter ) return (a[-1] + b[-1]) / 2
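# The same golden-section interval reduction in pure Python, which can be easier
# to follow than the theano.scan version above. The function name is an
# illustrative assumption, not part of the library.
import math

def golden_search_py(f, maxstep=50.0, maxiter=1024, tol=1e-5):
    golden_ratio = (math.sqrt(5) - 1) / 2
    a, b = 0.0, maxstep
    c = b - golden_ratio * (b - a)
    d = a + golden_ratio * (b - a)
    for _ in range(maxiter):
        if abs(c - d) < tol:
            break
        if f(c) < f(d):
            a, b, c, d = a, d, d - golden_ratio * (d - a), c
        else:
            a, b, c, d = c, b, d, c + golden_ratio * (b - c)
    return (a + b) / 2

# Example: golden_search_py(lambda x: (x - 2.0) ** 2) returns a value close to 2.0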
def initialize(self): super(BatchNorm, self).initialize() input_shape = as_tuple(None, self.input_shape) ndim = len(input_shape) if self.axes is None: # If ndim == 4 then axes = (0, 2, 3) # If ndim == 2 then axes = (0,) self.axes = tuple(axis for axis in range(ndim) if axis != 1) if any(axis >= ndim for axis in self.axes): raise ValueError("Cannot apply batch normalization on the axis " "that doesn't exist.") opposite_axes = find_opposite_axes(self.axes, ndim) parameter_shape = [input_shape[axis] for axis in opposite_axes] if any(parameter is None for parameter in parameter_shape): unknown_dim_index = parameter_shape.index(None) raise ValueError("Cannot apply batch normalization on the axis " "with unknown size over the dimension #{} " "(0-based indices).".format(unknown_dim_index)) self.running_mean = theano.shared( name='running_mean_{}'.format(self.layer_id), value=asfloat(np.zeros(parameter_shape)) ) self.running_inv_std = theano.shared( name='running_inv_std_{}'.format(self.layer_id), value=asfloat(np.ones(parameter_shape)) ) if isinstance(self.gamma, number_type): self.gamma = np.ones(parameter_shape) * self.gamma if isinstance(self.beta, number_type): self.beta = np.ones(parameter_shape) * self.beta self.gamma = theano.shared( name='gamma_{}'.format(self.layer_id), value=asfloat(self.gamma), ) self.beta = theano.shared( name='beta_{}'.format(self.layer_id), value=asfloat(self.beta), ) self.parameters = [self.gamma, self.beta]
def test_rmsle(self): actual = np.e ** (np.array([1, 2, 3, 4])) - 1 predicted = np.e ** (np.array([4, 3, 2, 1])) - 1 self.assertEqual( asfloat(np.sqrt(5)), estimators.rmsle(actual, predicted) )
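# Why sqrt(5) is the expected value above: RMSLE works with log(x + 1), so the
# transformed targets and predictions are [1, 2, 3, 4] and [4, 3, 2, 1], the
# differences are [-3, -1, 1, 3] and sqrt(mean([9, 1, 1, 9])) == sqrt(5).
# A standalone NumPy re-derivation, assuming the usual RMSLE definition:
import numpy as np
actual = np.e ** np.array([1., 2., 3., 4.]) - 1
predicted = np.e ** np.array([4., 3., 2., 1.]) - 1
rmsle = np.sqrt(np.mean((np.log1p(actual) - np.log1p(predicted)) ** 2))
assert np.isclose(rmsle, np.sqrt(5))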
def test_rmse(self): actual = np.array([0, 1, 2, 3]) predicted = np.array([3, 2, 1, 0]) self.assertEqual( asfloat(np.sqrt(5)), estimators.rmse(actual, predicted) )
def test_save_link_to_assigned_connections(self): # Tree structure: # # Sigmoid(10) # / # Input(10) - Sigmoid(5) # \ # Softmax(20) # input_layer = layers.Input(10) minimized = input_layer > layers.Sigmoid(5) reconstructed = minimized > layers.Sigmoid(10) classifier = minimized > layers.Softmax(20) x = T.matrix() y_minimized = theano.function([x], minimized.output(x)) y_reconstructed = theano.function([x], reconstructed.output(x)) y_classifier = theano.function([x], classifier.output(x)) x_matrix = asfloat(np.random.random((3, 10))) minimized_output = y_minimized(x_matrix) self.assertEqual((3, 5), minimized_output.shape) reconstructed_output = y_reconstructed(x_matrix) self.assertEqual((3, 10), reconstructed_output.shape) classifier_output = y_classifier(x_matrix) self.assertEqual((3, 20), classifier_output.shape)
def test_dict_based_inputs_into_connection(self): # Tree structure: # # Input(10) - Sigmoid(5) - Sigmoid(10) # input_layer = layers.Input(10) hidden_layer = layers.Sigmoid(5) output_layer = layers.Sigmoid(10) minimized = input_layer > hidden_layer reconstructed = minimized > output_layer x = T.matrix() y_minimized = theano.function([x], minimized.output(x)) x_matrix = asfloat(np.random.random((3, 10))) minimized_output = y_minimized(x_matrix) self.assertEqual((3, 5), minimized_output.shape) h_output = T.matrix() y_reconstructed = theano.function( [h_output], reconstructed.output({output_layer: h_output}) ) reconstructed_output = y_reconstructed(minimized_output) self.assertEqual((3, 10), reconstructed_output.shape)
def init_prev_delta(self, parameter): parameter_shape = T.shape(parameter).eval() self.prev_delta = theano.shared( name="{}/prev-delta".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) return self.prev_delta
def test_parallel_layer(self): input_layer = layers.Input((3, 8, 8)) parallel_layer = layers.join( [[ layers.Convolution((11, 5, 5)), ], [ layers.Convolution((10, 3, 3)), layers.Convolution((5, 3, 3)), ]], layers.Concatenate(), ) output_layer = layers.MaxPooling((2, 2)) conn = layers.join(input_layer, parallel_layer) output_connection = layers.join(conn, output_layer) x = T.tensor4() y = theano.function([x], conn.output(x)) x_tensor4 = asfloat(np.random.random((10, 3, 8, 8))) output = y(x_tensor4) self.assertEqual(output.shape, (10, 11 + 5, 4, 4)) output_function = theano.function([x], output_connection.output(x)) final_output = output_function(x_tensor4) self.assertEqual(final_output.shape, (10, 11 + 5, 2, 2))
def init_variables(self): super(LeakStepAdaptation, self).init_variables() n_parameters = count_parameters(self) self.variables.leak_average = theano.shared( value=asfloat(np.zeros(n_parameters)), name='leak_average' )
def create_shared_parameter(value, name, shape): """ Creates NN parameter as Theano shared variable. Parameters ---------- value : array-like, Theano variable, scalar or Initializer Default value for the parameter. name : str Shared variable name. shape : tuple Parameter's shape. Returns ------- Theano shared variable. """ if isinstance(value, (T.sharedvar.SharedVariable, T.Variable)): return value if isinstance(value, init.Initializer): value = value.sample(shape) return theano.shared(value=asfloat(value), name=name, borrow=True)
def init_variables(self): super(LeakStepAdaptation, self).init_variables() n_parameters = count_parameters(self.connection) self.variables.leak_average = theano.shared( name='leak-step-adapt/leak-average', value=asfloat(np.zeros(n_parameters)), )
def test_connection_output(self): input_value = asfloat(np.random.random((10, 2))) connection = layers.Input(2) > layers.Relu(10) > layers.Relu(1) output_value = connection.output(input_value).eval() self.assertEqual(output_value.shape, (10, 1))
def output(self, input_value): if not self.input_shape: raise LayerConnectionError("Layer `{}` doesn't have defined " "input shape. Probably it doesn't " "have an input layer.".format(self)) half = self.n // 2 squared_value = input_value ** 2 n_samples = input_value.shape[0] channel = input_value.shape[1] height = input_value.shape[2] width = input_value.shape[3] zero = asfloat(0) extra_channels = T.alloc(zero, n_samples, channel + 2 * half, height, width) squared_value = T.set_subtensor( extra_channels[:, half:half + channel, :, :], squared_value ) scale = self.k for i in range(self.n): scale += self.alpha * squared_value[:, i:i + channel, :, :] scale = scale ** self.beta return input_value / scale
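# NumPy sketch of the same cross-channel local response normalization: square the
# input, zero-pad along the channel axis, sum a sliding window of n squared
# channels and divide by (k + alpha * sum) ** beta. The default hyperparameter
# values here are illustrative assumptions.
import numpy as np

def local_response_norm(x, n=5, k=2.0, alpha=1e-4, beta=0.75):
    # x has shape (samples, channels, height, width)
    half = n // 2
    samples, channels, height, width = x.shape
    padded = np.zeros((samples, channels + 2 * half, height, width))
    padded[:, half:half + channels] = x ** 2
    scale = k
    for i in range(n):
        scale = scale + alpha * padded[:, i:i + channels]
    return x / scale ** beta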
def test_elementwise_in_connections(self): input_layer = layers.Input(2) hidden_layer_1 = layers.Relu(1, weight=init.Constant(1), bias=init.Constant(0)) hidden_layer_2 = layers.Relu(1, weight=init.Constant(2), bias=init.Constant(0)) elem_layer = layers.Elementwise(merge_function=T.add) connection = layers.join(input_layer, hidden_layer_1, elem_layer) connection = layers.join(input_layer, hidden_layer_2, elem_layer) connection.initialize() self.assertEqual(elem_layer.output_shape, (1,)) x = T.matrix() y = theano.function([x], connection.output(x)) test_input = asfloat(np.array([ [0, 1], [-1, -1], ])) actual_output = y(test_input) expected_output = np.array([ [3], [0], ]) np.testing.assert_array_almost_equal(expected_output, actual_output)
def create_shared_parameter(value, name, shape, init_method, bounds): """ Creates NN parameter as Theano shared variable. Parameters ---------- value : array-like, theano shared variable or None Default value for the parameter. If value is equal to ``None``, the parameter will be created based on the ``init_method`` value. name : str Shared variable name. shape : tuple Parameter shape. init_method : str Weight initialization procedure name. bounds : tuple Parameter specific to the chosen ``init_method`` argument. Returns ------- Theano shared variable. """ if isinstance(value, T.sharedvar.TensorSharedVariable): return value if value is None: value = generate_weight(shape, bounds, init_method) return theano.shared(value=asfloat(value), name=name, borrow=True)
def test_select_network_branch(self): network = layers.join(layers.Input(10, name='input-1'), [[ layers.Relu(1, name='relu-1'), ], [ layers.Relu(2, name='relu-2'), ]]) self.assertEqual(network.input_shape, (10, )) self.assertEqual(network.output_shape, [(1, ), (2, )]) self.assertEqual(len(network), 3) relu_1_network = network.end('relu-1') self.assertEqual(relu_1_network.input_shape, (10, )) self.assertEqual(relu_1_network.output_shape, (1, )) self.assertEqual(len(relu_1_network), 2) x_test = asfloat(np.ones((7, 10))) y_predicted = self.eval(relu_1_network.output(x_test)) self.assertEqual(y_predicted.shape, (7, 1)) relu_2_network = network.end('relu-2') self.assertEqual(relu_2_network.input_shape, (10, )) self.assertEqual(relu_2_network.output_shape, (2, )) self.assertEqual(len(relu_2_network), 2)
def activation_function(self, input_value): alpha = asfloat(self.alpha) return T.nnet.elu(input_value, alpha)
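# What T.nnet.elu computes above, written out in NumPy:
# f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise.
import numpy as np

def elu(x, alpha=1.0):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1))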
def save_dict(network): """ Save network into the dictionary. Parameters ---------- network : network, list of layer or network Returns ------- dict Saved parameters and information about network in dictionary using specific format. Learn more about the NeuPy's storage format in the official documentation. Examples -------- >>> from neupy import layers, storage >>> >>> network = layers.Input(10) >> layers.Softmax(3) >>> layers_data = storage.save_dict(network) >>> >>> layers_data.keys() ['layers', 'graph', 'metadata'] """ network = extract_network(network) network.create_variables() session = tf_utils.tensorflow_session() tf_utils.initialize_uninitialized_variables() data = { 'metadata': { 'language': 'python', 'library': 'neupy', 'version': neupy.__version__, 'created': strftime("%a, %d %b %Y %H:%M:%S %Z", gmtime()), }, # Make it as a list in order to save the right order # of paramters, otherwise it can be convert to the dictionary. 'graph': network.layer_names_only(), 'layers': [], } for layer in network: parameters = {} configs = {} for attrname, parameter in layer.variables.items(): parameters[attrname] = { 'value': asfloat(session.run(parameter)), 'trainable': parameter.trainable, } for option_name in layer.options: if option_name not in parameters: configs[option_name] = getattr(layer, option_name) data['layers'].append({ 'class_name': layer.__class__.__name__, 'name': layer.name, 'parameters': parameters, 'configs': configs, }) return data
def test_rmsle(self): actual = np.e**(np.array([1, 2, 3, 4])) - 1 predicted = np.e**(np.array([4, 3, 2, 1])) - 1 self.assertEqual(asfloat(np.sqrt(5)), estimators.rmsle(actual, predicted))
def test_binary_crossentropy(self): predicted = asfloat(np.array([0.1, 0.9, 0.2, 0.5])) actual = asfloat(np.array([0, 1, 0, 1])) error = errors.binary_crossentropy(actual, predicted) self.assertAlmostEqual(0.28, self.eval(error), places=2)
def init_variables(self): super(LevenbergMarquardt, self).init_variables() self.variables.update( mu=theano.shared(name='mu', value=asfloat(self.mu)), last_error=theano.shared(name='last_error', value=np.nan), )
def step_decay(initial_value, reduction_freq, start_iter=0, name='step'): """ The algorithm monotonically decreases the learning step after each iteration. .. math:: \\alpha_{t + 1} = \\frac{\\alpha_{0}}{1 + \\frac{t}{m}} where :math:`\\alpha` is a step, :math:`t` is an iteration number and :math:`m` is a ``reduction_freq`` parameter. .. code-block:: python step = initial_value / (1 + current_iteration / reduction_freq) Notes ----- Step will be reduced faster when you have smaller training batches. Parameters ---------- initial_value : float Initial value for the learning rate. It's the learning rate returned during the first iteration. reduction_freq : int Parameter controls the step reduction frequency. The larger the value, the slower the step parameter decreases. For instance, if ``reduction_freq=100`` and ``step=0.12`` then after ``100`` iterations ``step`` is going to be equal to ``0.06`` (which is ``0.12 / 2``), after ``200`` iterations ``step`` is going to be equal to ``0.04`` (which is ``0.12 / 3``) and so on. start_iter : int Start iteration. It has to be equal to ``0`` when the network has just started training. Defaults to ``0``. name : str Learning rate's variable name. Defaults to ``step``. Examples -------- >>> from neupy import algorithms >>> from neupy.layers import * >>> >>> optimizer = algorithms.Momentum( ... Input(5) >> Relu(10) >> Sigmoid(1), ... step=algorithms.step_decay( ... initial_value=0.1, ... reduction_freq=100, ... ) ... ) """ step, iteration = init_variables(initial_value, start_iter, name) reduction_freq = asfloat(reduction_freq) step_update = initial_value / (1 + iteration / reduction_freq) updated_step = step.assign(step_update) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, updated_step) with tf.control_dependencies([updated_step]): next_iteration = iteration.assign(iteration + 1) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, next_iteration) return step
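# The schedule above in plain Python: with initial_value=0.12 and
# reduction_freq=100 the step becomes 0.06 after 100 iterations and 0.04 after
# 200, exactly as the docstring describes. The helper name is illustrative.
def decayed_step(initial_value, reduction_freq, iteration):
    return initial_value / (1.0 + float(iteration) / reduction_freq)

assert abs(decayed_step(0.12, 100, 100) - 0.06) < 1e-12
assert abs(decayed_step(0.12, 100, 200) - 0.04) < 1e-12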
def test_functions(self): Case = namedtuple("Case", "func X answer") testcases = [ Case(func=cg.fletcher_reeves, X=( asfloat(np.array([1.35, 0.3])), asfloat(np.array([0.11, -0.5])), asfloat(np.array([0, 0])), ), answer=0.137), Case(func=cg.polak_ribiere, X=( asfloat(np.array([1., -0.5])), asfloat(np.array([1.2, -0.45])), asfloat(np.array([0, 0])), ), answer=0.174), Case(func=cg.hentenes_stiefel, X=( asfloat(np.array([1., -0.5])), asfloat(np.array([1.2, -0.45])), asfloat(np.array([0.2, 0.05])), ), answer=5.118), Case(func=cg.liu_storey, X=( asfloat(np.array([1., -0.5])), asfloat(np.array([1.2, -0.45])), asfloat(np.array([0.2, 0.05])), ), answer=-1.243), Case(func=cg.dai_yuan, X=( asfloat(np.array([1., -0.5])), asfloat(np.array([1.2, -0.45])), asfloat(np.array([0.2, 0.05])), ), answer=38.647), ] for testcase in testcases: result = self.eval(testcase.func(*testcase.X)) self.assertAlmostEqual(result, testcase.answer, places=1)
def free_energy(visible_sample): wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias visible_bias_term = T.dot(visible_sample, self.visible_bias) hidden_term = T.log(asfloat(1) + T.exp(wx_b)).sum(axis=1) return -visible_bias_term - hidden_term
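# The same RBM free energy transcribed into NumPy, assuming visible_sample is a
# (samples, n_visible) array and weight/biases are plain arrays:
# F(v) = -v . b_visible - sum_j log(1 + exp((v W + b_hidden)_j))
import numpy as np

def free_energy_np(visible_sample, weight, hidden_bias, visible_bias):
    wx_b = visible_sample.dot(weight) + hidden_bias
    return -visible_sample.dot(visible_bias) - np.log1p(np.exp(wx_b)).sum(axis=1)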
def target_function(network, x, y): weight = network.layers[1].weight new_weight = np.array([[x], [y]]) weight.set_value(asfloat(new_weight)) return network.prediction_error(input_data, target_data)
plt.figure(figsize=(10, 10)) plt.suptitle('RBM components', size=16) for index, image in enumerate(weight.T, start=1): plt.subplot(10, 10, index) plt.imshow(image.reshape((28, 28)), cmap=plt.cm.gray) plt.xticks([]) plt.yticks([]) plt.show() utils.reproducible() X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True) X = asfloat(X > 130) rbm = algorithms.RBM( n_visible=784, n_hidden=100, step=0.01, batch_size=20, verbose=True, shuffle_data=True, ) rbm.train(X, X, epochs=10) plot_rbm_components(rbm)
def scorer(network, X, y): y = asfloat(y) result = asfloat(network.predict(X)) return self.eval(errors.rmsle(result[:, 0], y))
def test_functions(self): Case = namedtuple("Case", "func input_data answer") testcases = [ Case(func=cg.fletcher_reeves, input_data=( np.array([1.35, 0.3]), np.array([0.11, -0.5]), np.array([0, 0]), ), answer=0.137), Case(func=cg.polak_ribiere, input_data=( np.array([1., -0.5]), np.array([1.2, -0.45]), np.array([0, 0]), ), answer=0.174), Case(func=cg.hentenes_stiefel, input_data=( np.array([1., -0.5]), np.array([1.2, -0.45]), np.array([0.2, 0.05]), ), answer=5.118), Case(func=cg.conjugate_descent, input_data=( np.array([1., -0.5]), np.array([1.2, -0.45]), np.array([0.2, 0.05]), ), answer=-7.323), Case(func=cg.liu_storey, input_data=( np.array([1., -0.5]), np.array([1.2, -0.45]), np.array([0.2, 0.05]), ), answer=1.243), Case(func=cg.dai_yuan, input_data=( np.array([1., -0.5]), np.array([1.2, -0.45]), np.array([0.2, 0.05]), ), answer=38.647), ] for testcase in testcases: input_data = asfloat(np.array(testcase.input_data)) variables = T.vectors(3) # For functions some input variables can be optional and we # ignore them during the computation. This solution cause errors # related to the Theano computational graph, because we # do not use all defined variables. That's why we need # simple hack that fix this issue and do not add changes to # the output result. hack = asfloat(0) * variables[-1][0] output_func = theano.function(variables, testcase.func(*variables) + hack) result = output_func(*input_data) self.assertAlmostEqual(result, testcase.answer, places=1)
def quadratic_minimizer(x_a, y_a, y_prime_a, x_b, y_b, bound_size_ratio=0.1): """ Finds the minimizer for a quadratic polynomial that goes through the points (x_a, y_a), (x_b, y_b) with derivative at x_a of y_prime_a. Parameters ---------- x_a : float or tensorflow variable Left point ``a`` in the ``x`` axis. y_a : float or tensorflow variable Output from function ``y`` at point ``a``. y_prime_a : float or tensorflow variable Output from function ``y'`` (``y`` derivative) at point ``a``. x_b : float or tensorflow variable Right point ``b`` in the ``x`` axis. y_b : float or tensorflow variable Output from function ``y`` at point ``b``. bound_size_ratio : float Value controls acceptable bounds for the interpolation. If the value is too close to one of the points, the interpolation result will be ignored. The bigger the ratio, the more likely the interpolation is to be rejected. Value needs to be between ``0`` and ``1``. Defaults to ``0.1``. Returns ------- object Tensorflow variable that after evaluation is equal to the point ``x`` which is the minimizer of the quadratic function. """ if not 0 <= bound_size_ratio < 1: raise ValueError("Value ``bound_size_ratio`` needs to be a float " "between 0 and 1, got {}".format(bound_size_ratio)) # The main formula works for the region [0, a], so we need to # shift the function to the left side and put point ``a`` # at the ``0`` position. x_range = x_b - x_a coef = (y_b - y_a - y_prime_a * x_range) / (x_range ** asfloat(2)) minimizer = -y_prime_a / (asfloat(2) * coef) + x_a bound_size_ratio = asfloat(bound_size_ratio) return tf.where( sequential_or( # Handle bad cases tf.equal(x_range, 0), coef <= 0, tf.is_nan(minimizer), tf.greater(minimizer, x_b - bound_size_ratio * x_range), tf.less(minimizer, x_a + bound_size_ratio * x_range), ), x_a + asfloat(0.5) * x_range, # Since we shifted the function to the left, we need to shift # the result to the right to make it correct for # the specified region. That's why we are adding ``x_a`` # at the end. -y_prime_a / (asfloat(2) * coef) + x_a )
def cubic_minimizer(x_a, y_a, y_prime_a, x_b, y_b, x_c, y_c, bound_size_ratio=0.2): """ Finds the minimizer for a cubic polynomial that goes through the points (x_a, y_a), (x_b, y_b), and (x_c, y_c) with derivative at ``x_a`` of y_prime_a. Parameters ---------- x_a : float or tensorflow variable First point ``a`` in the ``x`` axis. y_a : float or tensorflow variable Output from function ``y`` at point ``a``. y_prime_a : float or tensorflow variable Output from function ``y'`` (``y`` derivative) at point ``a``. x_b : float or tensorflow variable Second point ``b`` in the ``x`` axis. y_b : float or tensorflow variable Output from function ``y`` at point ``b``. x_c : float or tensorflow variable Third point ``c`` in the ``x`` axis. y_c : float or tensorflow variable Output from function ``y`` at point ``c``. bound_size_ratio : float Value controls acceptable bounds for the interpolation. If the value is too close to one of the points, the interpolation result will be ignored. The bigger the ratio, the more likely the interpolation is to be rejected. Value needs to be between ``0`` and ``1``. Defaults to ``0.2``. Returns ------- object Tensorflow variable that after evaluation is equal to the point ``x`` which is a minimizer for the cubic function. """ if not 0 <= bound_size_ratio < 1: raise ValueError("The `bound_size_ratio` value should be a float " "number between 0 and 1, got {}" "".format(bound_size_ratio)) bound_size_ratio = asfloat(bound_size_ratio) from_a2b_dist = x_b - x_a from_a2c_dist = x_c - x_a denominator = ( (from_a2b_dist * from_a2c_dist) ** asfloat(2) * (from_a2b_dist - from_a2c_dist) ) tau_ab = y_b - y_a - y_prime_a * from_a2b_dist tau_ac = y_c - y_a - y_prime_a * from_a2c_dist alpha = ( from_a2c_dist ** asfloat(2) * tau_ab - from_a2b_dist ** asfloat(2) * tau_ac ) / denominator beta = ( from_a2b_dist ** asfloat(3) * tau_ac - from_a2c_dist ** asfloat(3) * tau_ab ) / denominator radical = beta ** asfloat(2) - asfloat(3) * alpha * y_prime_a minimizer = x_a + (-beta + tf.sqrt(radical)) / (asfloat(3) * alpha) return tf.where( sequential_or( # Handle bad cases radical < 0, tf.equal(x_a, x_b), tf.equal(x_a, x_c), tf.equal(x_b, x_c), tf.equal(alpha, 0), tf.is_nan(minimizer), tf.greater(minimizer, x_b - bound_size_ratio * from_a2b_dist), tf.less(minimizer, x_a + bound_size_ratio * from_a2b_dist), ), quadratic_minimizer(x_a, y_a, y_prime_a, x_b, y_b), minimizer, )
def zoom(x_low, x_high, y_low, y_high, y_deriv_low, f, f_deriv, y0, y_deriv_0, c1, c2, maxiter=10): """ Notes ----- Part of the optimization algorithm in `scalar_search_wolfe2`. Parameters ---------- x_low : float Step size x_high : float Step size y_low : float Value of f at x_low y_high : float Value of f at x_high y_deriv_low : float Value of derivative at x_low f : callable f(x) Generates computational graph f_deriv : callable f'(x) Generates computational graph y0 : float Value of f for ``x = 0`` y_deriv_0 : float Value of the derivative for ``x = 0`` c1 : float Parameter for Armijo condition rule. c2 : float Parameter for curvature condition rule. maxiter : int Maximum number of iterations. Defaults to ``10``. """ def zoom_itertion_step(_, x_low, y_low, y_deriv_low, x_high, y_high, x_recent, y_recent, x_star): x_new = cubic_minimizer( x_low, y_low, y_deriv_low, x_high, y_high, x_recent, y_recent) y_new = f(x_new) y_deriv_new = f_deriv(x_new) continue_searching_condition = sequential_or( y_new > (y0 + c1 * x_new * y_deriv_0), y_new >= y_low, tf.abs(y_deriv_new) > (-c2 * y_deriv_0), ) condition1 = tf.logical_or( y_new > (y0 + c1 * x_new * y_deriv_0), y_new >= y_low ) condition2 = y_deriv_new * (x_high - x_low) >= 0 x_recent = tf.where( tf.logical_or(condition1, condition2), x_high, x_low) y_recent = tf.where( tf.logical_or(condition1, condition2), y_high, y_low) x_high = tf.where( condition1, x_new, tf.where(condition2, x_low, x_high)) y_high = tf.where( condition1, y_new, tf.where(condition2, y_low, y_high)) x_low = tf.where(condition1, x_low, x_new) y_low = tf.where(condition1, y_low, y_new) y_deriv_low = tf.where(condition1, y_deriv_low, y_deriv_new) x_star = x_new return [ continue_searching_condition, x_low, y_low, y_deriv_low, x_high, y_high, y_recent, x_recent, x_star ] zero = tf.constant(asfloat(0)) x_recent = zero y_recent = y0 outs = tf.while_loop( cond=lambda condition, *args: condition, body=zoom_itertion_step, loop_vars=[ True, x_low, y_low, y_deriv_low, x_high, y_high, x_recent, y_recent, zero, ], back_prop=False, maximum_iterations=maxiter, ) return outs[-1]
def test_mae(self): predicted = asfloat(np.array([1, 2, 3])) target = asfloat(np.array([3, 2, 1])) actual = errors.mae(target, predicted) self.assertAlmostEqual(self.eval(actual), 4 / 3., places=3)
def test_smallest_positive_number(self): epsilon = smallest_positive_number() self.assertNotEqual(0, asfloat(1) - (asfloat(1) - asfloat(epsilon))) self.assertEqual(0, asfloat(1) - (asfloat(1) - asfloat(epsilon / 10)))
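# The property checked above is the defining property of machine epsilon.
# Assuming float32 arithmetic (floatX='float32'), the same check passes with
# NumPy's epsilon for that type:
import numpy as np
eps = np.finfo(np.float32).eps
one = np.float32(1)
assert one - (one - eps) != 0
assert one - (one - np.float32(eps / 10)) == 0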
def test_rmse(self): actual = np.array([0, 1, 2, 3]) predicted = np.array([3, 2, 1, 0]) self.assertEqual(asfloat(np.sqrt(5)), estimators.rmse(actual, predicted))
def output(self, value): if not self.training_state: return 2 * asfloat(value < 0) - 1 return value
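# The expression above encodes the sign test as {-1, +1}: negative inputs map to
# +1 and non-negative inputs map to -1. A small NumPy illustration:
import numpy as np
value = np.array([-2.0, -0.5, 0.0, 3.0])
print(2 * (value < 0).astype(float) - 1)  # [ 1.  1. -1. -1.]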
def line_search(f, f_deriv, maxiter=20, c1=1e-4, c2=0.9): """ Find ``x`` that satisfies strong Wolfe conditions. ``x > 0`` is assumed to be a descent direction. Parameters ---------- f : callable f(x) Objective scalar function. f_deriv : callable f'(x) Objective function derivative. maxiter : int Maximum number of iterations. Defaults ``20``. c1 : float Parameter for Armijo condition rule. Defaults ``1e-4``. c2 : float Parameter for curvature condition rule. Defaults ``0.9``. Returns ------- Variable Value ``x`` that satisfies strong Wolfe conditions and minimize function ``f``. Notes ----- Uses the line search algorithm to enforce strong Wolfe conditions. See Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60. For the zoom phase it uses an algorithm by [...]. """ if not 0 < c1 < 1: raise ValueError("c1 should be a float between 0 and 1") if not 0 < c2 < 1: raise ValueError("c2 should be a float between 0 and 1") if c2 < c1: raise ValueError("c2 needs to be greater than c1") if maxiter <= 0: raise ValueError("maxiter needs to be greater than 0") c1, c2 = asfloat(c1), asfloat(c2) def search_iteration_step(condition, x_previous, x_current, y_previous, y_current, y_deriv_previous, iteration, x_star): y_deriv_current = f_deriv(x_current) x_new = x_current * asfloat(2) y_new = f(x_new) condition1 = tf.logical_or( y_current > (y0 + c1 * x_current * y_deriv_0), tf.logical_and( y_current >= y_previous, tf.not_equal(iteration, 1), ) ) condition2 = tf.abs(y_deriv_current) <= -c2 * y_deriv_0 condition3 = y_deriv_current >= 0 x_star = tf.where( condition1, zoom( x_previous, x_current, y_previous, y_current, y_deriv_previous, f, f_deriv, y0, y_deriv_0, c1, c2 ), tf.where( condition2, x_current, tf.where( condition3, zoom( x_current, x_previous, y_current, y_previous, y_deriv_current, f, f_deriv, y0, y_deriv_0, c1, c2 ), x_new, ), ), ) y_deriv_previous_new = tf.where( condition1, y_deriv_previous, y_deriv_current ) is_any_condition_satisfied = sequential_or( condition1, condition2, condition3) y_current_new = tf.where( is_any_condition_satisfied, y_current, y_new ) continue_searching_condition = tf.logical_and( tf.not_equal(x_new, 0), tf.logical_not(is_any_condition_satisfied), ) return [ continue_searching_condition, x_current, x_new, y_current, y_current_new, y_deriv_previous_new, iteration + 1, x_star ] one = tf.constant(asfloat(1)) zero = tf.constant(asfloat(0)) x0, x1 = zero, one y0, y1 = f(x0), f(x1) y_deriv_0 = f_deriv(x0) outs = tf.while_loop( cond=lambda condition, *args: condition, body=search_iteration_step, loop_vars=[True, x0, x1, y0, y1, y_deriv_0, 1, zero], back_prop=False, maximum_iterations=maxiter, ) return outs[-1]
def init_methods(self): def free_energy(visible_sample): with tf.name_scope('free-energy'): wx = tf.matmul(visible_sample, self.weight) wx_b = wx + self.hidden_bias visible_bias_term = dot(visible_sample, self.visible_bias) # We can get infinity when wx_b is a relatively large number # (maybe 100). Taking exponent makes it even larger and # for with float32 it can convert it to infinity. But because # number is so large we don't care about +1 value before taking # logarithms and therefore we can just pick value as it is # since our operation won't change anything. hidden_terms = tf.where( # exp(30) is such a big number that +1 won't # make any difference in the outcome. tf.greater(wx_b, 30), wx_b, tf.log1p(tf.exp(wx_b)), ) hidden_term = tf.reduce_sum(hidden_terms, axis=1) return -(visible_bias_term + hidden_term) def visible_to_hidden(visible_sample): with tf.name_scope('visible-to-hidden'): wx = tf.matmul(visible_sample, self.weight) wx_b = wx + self.hidden_bias return tf.nn.sigmoid(wx_b) def hidden_to_visible(hidden_sample): with tf.name_scope('hidden-to-visible'): wx = tf.matmul(hidden_sample, self.weight, transpose_b=True) wx_b = wx + self.visible_bias return tf.nn.sigmoid(wx_b) def sample_hidden_from_visible(visible_sample): with tf.name_scope('sample-hidden-to-visible'): hidden_prob = visible_to_hidden(visible_sample) hidden_sample = random_binomial(hidden_prob) return hidden_sample def sample_visible_from_hidden(hidden_sample): with tf.name_scope('sample-visible-to-hidden'): visible_prob = hidden_to_visible(hidden_sample) visible_sample = random_binomial(visible_prob) return visible_sample network_input = self.variables.network_input network_hidden_input = self.variables.network_hidden_input input_shape = tf.shape(network_input) n_samples = input_shape[0] weight = self.weight h_bias = self.hidden_bias v_bias = self.visible_bias h_samples = self.variables.h_samples step = asfloat(self.step) with tf.name_scope('positive-values'): # We have to use `cond` instead of `where`, because # different if-else cases might have different shapes # and it triggers exception in tensorflow. v_pos = tf.cond( tf.equal(n_samples, self.batch_size), lambda: network_input, lambda: random_sample(network_input, self.batch_size)) h_pos = visible_to_hidden(v_pos) with tf.name_scope('negative-values'): v_neg = sample_visible_from_hidden(h_samples) h_neg = visible_to_hidden(v_neg) with tf.name_scope('weight-update'): weight_update = ( tf.matmul(v_pos, h_pos, transpose_a=True) - tf.matmul(v_neg, h_neg, transpose_a=True)) / asfloat(n_samples) with tf.name_scope('hidden-bias-update'): h_bias_update = tf.reduce_mean(h_pos - h_neg, axis=0) with tf.name_scope('visible-bias-update'): v_bias_update = tf.reduce_mean(v_pos - v_neg, axis=0) with tf.name_scope('flipped-input-features'): # Each row will have random feature marked with number 1 # Other values will be equal to 0 possible_feature_corruptions = tf.eye(self.n_visible) corrupted_features = random_sample(possible_feature_corruptions, n_samples) rounded_input = tf.round(network_input) # If we scale input values from [0, 1] range to [-1, 1] # than it will be easier to flip feature values with simple # multiplication. 
scaled_rounded_input = 2 * rounded_input - 1 scaled_flipped_rounded_input = ( # for corrupted_features we convert 0 to 1 and 1 to -1 # in this way after multiplication we will flip all # signs where -1 in the transformed corrupted_features (-2 * corrupted_features + 1) * scaled_rounded_input) # Scale it back to the [0, 1] range flipped_rounded_input = (scaled_flipped_rounded_input + 1) / 2 with tf.name_scope('pseudo-likelihood-loss'): # Stochastic pseudo-likelihood error = tf.reduce_mean(self.n_visible * tf.log_sigmoid( free_energy(flipped_rounded_input) - free_energy(rounded_input))) with tf.name_scope('gibbs-sampling'): gibbs_sampling = sample_visible_from_hidden( sample_hidden_from_visible(network_input)) initialize_uninitialized_variables() self.methods.update(train_epoch=function( [network_input], error, name='rbm/train-epoch', updates=[ (weight, weight + step * weight_update), (h_bias, h_bias + step * h_bias_update), (v_bias, v_bias + step * v_bias_update), (h_samples, random_binomial(p=h_neg)), ]), prediction_error=function( [network_input], error, name='rbm/prediction-error', ), diff1=function( [network_input], free_energy(flipped_rounded_input), name='rbm/diff1-error', ), diff2=function( [network_input], free_energy(rounded_input), name='rbm/diff2-error', ), visible_to_hidden=function( [network_input], visible_to_hidden(network_input), name='rbm/visible-to-hidden', ), hidden_to_visible=function( [network_hidden_input], hidden_to_visible(network_hidden_input), name='rbm/hidden-to-visible', ), gibbs_sampling=function( [network_input], gibbs_sampling, name='rbm/gibbs-sampling', ))
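# NumPy illustration of the flipping trick above: mapping binary features to
# {-1, +1}, multiplying by the transformed corruption mask, and mapping back
# flips exactly the features marked with 1. Standalone, illustrative example.
import numpy as np
rounded_input = np.array([[1., 0., 1., 1.]])
corrupted_features = np.array([[0., 0., 1., 0.]])  # flip only the third feature
scaled = 2 * rounded_input - 1
flipped = ((-2 * corrupted_features + 1) * scaled + 1) / 2
print(flipped)  # [[1. 0. 0. 1.]]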
def search_iteration_step(condition, x_previous, x_current, y_previous, y_current, y_deriv_previous, iteration, x_star): y_deriv_current = f_deriv(x_current) x_new = x_current * asfloat(2) y_new = f(x_new) condition1 = tf.logical_or( y_current > (y0 + c1 * x_current * y_deriv_0), tf.logical_and( y_current >= y_previous, tf.not_equal(iteration, 1), ) ) condition2 = tf.abs(y_deriv_current) <= -c2 * y_deriv_0 condition3 = y_deriv_current >= 0 x_star = tf.where( condition1, zoom( x_previous, x_current, y_previous, y_current, y_deriv_previous, f, f_deriv, y0, y_deriv_0, c1, c2 ), tf.where( condition2, x_current, tf.where( condition3, zoom( x_current, x_previous, y_current, y_previous, y_deriv_current, f, f_deriv, y0, y_deriv_0, c1, c2 ), x_new, ), ), ) y_deriv_previous_new = tf.where( condition1, y_deriv_previous, y_deriv_current ) is_any_condition_satisfied = sequential_or( condition1, condition2, condition3) y_current_new = tf.where( is_any_condition_satisfied, y_current, y_new ) continue_searching_condition = tf.logical_and( tf.not_equal(x_new, 0), tf.logical_not(is_any_condition_satisfied), ) return [ continue_searching_condition, x_current, x_new, y_current, y_current_new, y_deriv_previous_new, iteration + 1, x_star ]
def test_mixture_of_experts(self): dataset = datasets.load_diabetes() data, target = asfloat(dataset.data), asfloat(dataset.target) insize, outsize = data.shape[1], 1 input_scaler = preprocessing.MinMaxScaler((-1, 1)) output_scaler = preprocessing.MinMaxScaler() x_train, x_test, y_train, y_test = cross_validation.train_test_split( input_scaler.fit_transform(data), output_scaler.fit_transform(target.reshape(-1, 1)), train_size=0.8 ) n_epochs = 10 scaled_y_test = output_scaler.inverse_transform(y_test) scaled_y_test = scaled_y_test.reshape((y_test.size, 1)) # -------------- Train single GradientDescent -------------- # bpnet = algorithms.GradientDescent( (insize, 20, outsize), step=0.1, verbose=False ) bpnet.train(x_train, y_train, epochs=n_epochs) network_output = bpnet.predict(x_test) network_error = rmsle(output_scaler.inverse_transform(network_output), scaled_y_test) # -------------- Train ensemble -------------- # moe = algorithms.MixtureOfExperts( networks=[ algorithms.Momentum( (insize, 20, outsize), step=0.1, batch_size=1, verbose=False ), algorithms.Momentum( (insize, 20, outsize), step=0.1, batch_size=1, verbose=False ), ], gating_network=algorithms.Momentum( layers.Input(insize) > layers.Softmax(2), step=0.1, verbose=False ) ) moe.train(x_train, y_train, epochs=n_epochs) ensemble_output = moe.predict(x_test) ensemble_error = rmsle( output_scaler.inverse_transform(ensemble_output), scaled_y_test ) self.assertGreater(network_error, ensemble_error)
def init_methods(self): def free_energy(visible_sample): wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias visible_bias_term = T.dot(visible_sample, self.visible_bias) hidden_term = T.log(asfloat(1) + T.exp(wx_b)).sum(axis=1) return -visible_bias_term - hidden_term def visible_to_hidden(visible_sample): wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias return T.nnet.sigmoid(wx_b) def hidden_to_visible(hidden_sample): wx_b = T.dot(hidden_sample, self.weight.T) + self.visible_bias return T.nnet.sigmoid(wx_b) def sample_hidden_from_visible(visible_sample): theano_random = self.theano_random hidden_prob = visible_to_hidden(visible_sample) hidden_sample = theano_random.binomial(n=1, p=hidden_prob, dtype=theano.config.floatX) return hidden_sample def sample_visible_from_hidden(hidden_sample): theano_random = self.theano_random visible_prob = hidden_to_visible(hidden_sample) visible_sample = theano_random.binomial(n=1, p=visible_prob, dtype=theano.config.floatX) return visible_sample network_input = self.variables.network_input n_samples = asfloat(network_input.shape[0]) theano_random = self.theano_random weight = self.weight h_bias = self.hidden_bias v_bias = self.visible_bias h_samples = self.variables.h_samples step = asfloat(self.step) sample_indeces = theano_random.random_integers( low=0, high=n_samples - 1, size=(self.batch_size, )) v_pos = ifelse( T.eq(n_samples, self.batch_size), network_input, # In case if final batch has less number of # samples then expected network_input[sample_indeces]) h_pos = visible_to_hidden(v_pos) v_neg = sample_visible_from_hidden(h_samples) h_neg = visible_to_hidden(v_neg) weight_update = v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg) h_bias_update = (h_pos - h_neg).mean(axis=0) v_bias_update = (v_pos - v_neg).mean(axis=0) # Stochastic pseudo-likelihood feature_index_to_flip = theano_random.random_integers( low=0, high=self.n_visible - 1, ) rounded_input = T.round(network_input) rounded_input = network_input rounded_input_flip = T.set_subtensor( rounded_input[:, feature_index_to_flip], 1 - rounded_input[:, feature_index_to_flip]) error = T.mean(self.n_visible * T.log( T.nnet.sigmoid( free_energy(rounded_input_flip) - free_energy(rounded_input)))) self.methods.update(train_epoch=theano.function( [network_input], error, name='algo:rbm/func:train-epoch', updates=[ (weight, weight + step * weight_update / n_samples), (h_bias, h_bias + step * h_bias_update), (v_bias, v_bias + step * v_bias_update), (h_samples, asint(theano_random.binomial(n=1, p=h_neg))), ]), prediction_error=theano.function( [network_input], error, name='algo:rbm/func:prediction-error', ), visible_to_hidden=theano.function( [network_input], visible_to_hidden(network_input), name='algo:rbm/func:visible-to-hidden', ), hidden_to_visible=theano.function( [network_input], hidden_to_visible(network_input), name='algo:rbm/func:hidden-to-visible', ), gibbs_sampling=theano.function( [network_input], sample_visible_from_hidden( sample_hidden_from_visible(network_input)), name='algo:rbm/func:gibbs-sampling', ))
""" Main source code from Pylearn2 library: https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/\ optimization/linesearch.py """ import theano import theano.tensor as T from theano.ifelse import ifelse from neupy.utils import asfloat one = T.constant(asfloat(1)) zero = T.constant(asfloat(0)) theano_true = T.constant(1) theano_false = T.constant(0) def sequential_or(*conditions): """ Use ``or`` operator between all conditions. Function is just a syntax sugar that make long Theano logical conditions looks less ugly. Parameters ---------- *conditions Conditions that returns ``True`` or ``False`` """ first_condition, other_conditions = conditions[0], conditions[1:]
def random_weight(shape): initializer = init.Normal() weight = initializer.sample(shape) return tf.Variable(asfloat(weight), dtype=tf.float32)
def line_search(f, f_deriv, maxiter=20, c1=1e-4, c2=0.9): """ Find ``x`` that satisfies strong Wolfe conditions. ``x > 0`` is assumed to be a descent direction. Parameters ---------- f : callable f(x) Objective scalar function. f_deriv : callable f'(x) Objective function derivative (can be None) maxiter : int Maximum number of iterations. c1 : float Parameter for Armijo condition rule. c2 : float Parameter for curvature condition rule. Returns ------- Theano object Value ``x`` that satisfies strong Wolfe conditions and minimize function ``f``. Notes ----- Uses the line search algorithm to enforce strong Wolfe conditions. See Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60. For the zoom phase it uses an algorithm by [...]. """ if not 0 < c1 < 1: raise ValueError("c1 should be a float between 0 and 1") if not 0 < c2 < 1: raise ValueError("c2 should be a float between 0 and 1") if c2 < c1: raise ValueError("c2 needs to be greater than c1") if maxiter <= 0: raise ValueError("maxiter needs to be greater than 0") c1, c2 = asfloat(c1), asfloat(c2) def search_iteration_step(x_previous, x_current, y_previous, y_current, y_deriv_previous, is_first_iteration, x_star): y_deriv_current = f_deriv(x_current) x_new = x_current * asfloat(2) y_new = f(x_new) condition1 = T.or_( y_current > (y0 + c1 * x_current * y_deriv_0), T.and_(y_current >= y_previous, T.bitwise_not(is_first_iteration))) condition2 = T.abs_(y_deriv_current) <= -c2 * y_deriv_0 condition3 = y_deriv_current >= zero x_star = ifelse( condition1, zoom(x_previous, x_current, y_previous, y_current, y_deriv_previous, f, f_deriv, y0, y_deriv_0, c1, c2), ifelse( condition2, x_current, ifelse( condition3, zoom(x_current, x_previous, y_current, y_previous, y_deriv_current, f, f_deriv, y0, y_deriv_0, c1, c2), x_new, ), ), ) y_deriv_previous_new = ifelse(condition1, y_deriv_previous, y_deriv_current) is_any_condition_satisfied = sequential_or(condition1, condition2, condition3) y_current_new = ifelse(is_any_condition_satisfied, y_current, y_new) return ([ x_current, x_new, y_current, y_current_new, y_deriv_previous_new, theano_false, x_star ], theano.scan_module.scan_utils.until( sequential_or( T.eq(x_new, zero), is_any_condition_satisfied, ))) x0, x1 = zero, one y0, y1 = f(x0), f(x1) y_deriv_0 = f_deriv(x0) c1 = T.as_tensor_variable(c1) c2 = T.as_tensor_variable(c2) outs, _ = theano.scan( search_iteration_step, outputs_info=[x0, x1, y0, y1, y_deriv_0, theano_true, zero], n_steps=maxiter) x_star = outs[-1][-1] return x_star
def init_variables(self): super(LeakStepAdaptation, self).init_variables() n_parameters = count_parameters(self) self.variables.leak_average = theano.shared(value=asfloat( np.zeros(n_parameters)), name='leak_average')
import copy from functools import partial import numpy as np from neupy import algorithms, init, layers from neupy.layers import Input, Sigmoid from neupy.utils import asfloat from helpers import compare_networks from base import BaseTestCase simple_x_train = asfloat( np.array([ [0.1, 0.1, 0.2], [0.2, 0.3, 0.4], [0.1, 0.7, 0.2], ])) simple_y_train = asfloat(np.array([ [0.2, 0.2], [0.3, 0.3], [0.5, 0.5], ])) class RPROPTestCase(BaseTestCase): def setUp(self): super(RPROPTestCase, self).setUp() self.network = Input(3) > Sigmoid(10) > Sigmoid(2) def test_rprop(self):