def test_deriv_missing_connection(N):
    """
    Taking the derivative of an expression with respect to a variable not
    used to compute the expression should raise an exception.
    """
    x = ng.variable([N])
    y = ng.variable([N])
    z = ng.variable([N])

    with pytest.raises(ValueError):
        ng.deriv(x + y, z)
def test_batchnorm_bprop(input_placeholder, bn_params):
    layer = BatchNorm(**bn_params)
    fprop = layer(input_placeholder)

    # Derivatives to check
    bprop_vars = [input_placeholder, layer.gamma, layer.beta]

    delta_placeholder = ng.placeholder(fprop.axes)
    bprops = [ng.deriv(fprop, var, delta_placeholder) for var in bprop_vars]

    with ExecutorFactory() as ex:
        # Create derivative executor
        bprop_function = ex.executor(bprops, input_placeholder, delta_placeholder)

        # Generate data
        x = rng.uniform(0, 1, input_placeholder.axes)
        delta = rng.uniform(-.1, .1, delta_placeholder.axes)

        # Compute reference bprop
        dx_ref, dgamma_ref, dbeta_ref = BatchNormReference(x, **bn_params).bprop(delta)

        # Compute ngraph bprop
        dx, dgamma, dbeta = bprop_function(x, delta)

        ng.testing.assert_allclose(dx, dx_ref, rtol=rtol, atol=atol)
        ng.testing.assert_allclose(dgamma, dgamma_ref, rtol=rtol, atol=atol)
        ng.testing.assert_allclose(dbeta, dbeta_ref, rtol=rtol, atol=atol)
def __call__(self, cost_func, variables=None, subgraph=None, warning=False):
    """
    Arguments:
        cost_func (Op): The cost function to optimize
        variables (list of variables): List of variables to optimize
        subgraph (SubGraph): A subgraph instance containing all variables to optimize
        warning (bool): If True, displays a warning message if any of the specified
                        variables do not participate in the batch cost computation

    .. Note::
        If subgraph is provided, the variables to optimize will be taken from it.
        Otherwise, they can be provided explicitly by passing a list as `variables`.
        If neither `subgraph` nor `variables` is provided, the variables to optimize
        will be all trainable variables on which `cost` depends.
    """
    all_updates = []
    batch_cost = ng.sum(cost_func, out_axes=())
    if cost_func.axes.batch_axis() is None:
        batch_size = 1
    else:
        batch_size = cost_func.axes.batch_axis().length

    # determine variables to optimize
    if subgraph is not None:
        if variables is not None:
            raise ValueError("variables and subgraph cannot both be specified.")
        variables = list(subgraph.variables.values())

    if variables is None:
        variables = batch_cost.variables()
    elif variables is not None and warning is True:
        all_variables = batch_cost.variables()
        selected_variables = all_variables & set(variables)
        if len(selected_variables) < len(variables):
            logger.warning("not all selected variables participate in cost computation")

    # gradients
    grads = [ng.deriv(batch_cost, v) / batch_size for v in variables]
    scale_factor = clip_gradient_norm(grads, self.gradient_clip_norm)

    # updates
    for variable, grad in zip(variables, grads):
        updates = self.variable_update(variable, grad, scale_factor, self.weight_clip_value)
        all_updates.append(updates)
    updates = ng.doall(all_updates)
    # grads = ng.doall(grads)
    # clips = ng.doall([ng.assign(variable,
    #                             clip_weight_value(variable, self.weight_clip_value))
    #                   for variable in variables])
    # return ng.sequential([grads, updates, clips, 0])
    # return ng.sequential([grads, updates, 0])
    return ng.sequential([updates, 0])
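# A minimal, hedged usage sketch for the optimizer __call__ above. The optimizer class
# name (GradientDescentMomentum) and the toy cost are illustrative assumptions, not taken
# from this file; only the call pattern (build the update op, then execute it together
# with the cost) mirrors the method and tests above.
def sketch_optimizer_call_usage():
    N = ng.make_axis(length=4, name='N')
    x = ng.placeholder([N])
    w = ng.variable([N], initial_value=np.ones(N.length))

    # scalar cost depending on the trainable variable w
    cost = ng.sum((x - w) * (x - w), out_axes=())

    optimizer = GradientDescentMomentum(learning_rate=0.1)  # assumed optimizer class
    updates = optimizer(cost)  # optimize all trainable variables that cost depends on
    # or restrict to an explicit list:
    # updates = optimizer(cost, variables=[w], warning=True)

    with ExecutorFactory() as ex:
        step = ex.executor([cost, updates], x)
        step(np.zeros(N.length))  # one gradient step on w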
def test_concatenate(concatenate_variables):
    x_list, np_list, pos = concatenate_variables

    with ExecutorFactory() as ex:
        v = ng.concat_along_axis(x_list, x_list[0].axes[pos])
        d = ng.deriv(v, x_list[0],
                     error=ng.constant(np.ones(v.axes.lengths), axes=v.axes))

        f = ex.executor([v, d])
        e_v, e_d = f()

        np_v = np.concatenate(np_list, axis=pos)
        ng.testing.assert_allclose(e_v.copy(), np_v)
        ng.testing.assert_allclose(e_d.copy(), np.ones(x_list[0].axes.lengths))
def test_idempotent_axes_c():
    """
    Test axes transformations with autodiff, case c, with broadcast,
    slice, cast and dim-shuffle
    """
    with ExecutorFactory() as ex:
        axes = ng.make_axes([ng.make_axis(3), ng.make_axis(1)])
        result_axes = [ng.make_axis(length=axis.length) for axis in axes]

        # variable
        w = ng.variable(axes, initial_value=np.ones((3, 1)))

        # broadcast l / r, introducing dummy length 1 axes
        l = ng.broadcast(w, axes)
        r = ng.broadcast(w, axes)

        # slice
        axes_slice = [slice(None, None, None), slice(None, None, None)]
        l_sliced = ng.tensor_slice(l, axes_slice)
        r_sliced = ng.tensor_slice(r, axes_slice)

        # cast r
        r_sliced_casted = ng.cast_axes(r_sliced, axes)

        # perform add
        result = ng.add(l_sliced, r_sliced_casted)

        # cast / dimshuffle
        result = ng.cast_axes(result, result_axes)
        result = ng.axes_with_order(result, result_axes)

        # cost and grad
        cost = ng.sum(result, reduction_axes=result.axes)
        grad = ng.deriv(cost, w)

        grad_comp = ex.executor(grad)
        cost_comp = ex.executor(cost)

        cost_comp_ng = cost_comp()
        grad_comp_ng = grad_comp()
        grad_comp_np = np.ones((3, 1)) * 2.

        assert cost_comp_ng == 6.0
        assert np.array_equal(grad_comp_ng, grad_comp_np)
def test_logreg():
    # xs: (C, N), ys: (N,)
    xs = np.array([[0.52, 0.88, 0.52, 0.74],
                   [1.12, -1.08, 0.06, -2.49],
                   [0.77, 0.15, -1.3, 1.39]])
    ys = np.array([1, 1, 0, 1])
    max_iter = 10
    alpha = 0.1
    thetas = np.array([0., 0., 0.])

    np_logreg = NumpyLogreg(xs, ys, thetas)

    C, N = ng.make_axis(length=3), ng.make_axis(length=4)

    # input tensors
    xs_v = ng.placeholder((C, N))
    ys_v = ng.placeholder([N])
    alpha_v = ng.placeholder(())
    thetas_var = ng.variable([C], initial_value=thetas)

    # define ops
    ys_pred = ng.sigmoid(ng.dot(thetas_var, xs_v))
    log_likelihoods = ng.log(ys_pred) * ys_v + ng.log(1 - ys_pred) * (1 - ys_v)
    loss = -ng.sum(log_likelihoods, reduction_axes=[N])
    grad_comp = ng.deriv(loss, thetas_var)
    weight_update = ng.sequential([ng.assign(thetas_var, thetas_var - alpha_v * grad_comp),
                                   thetas_var])

    # transformer
    with ExecutorFactory() as ex:
        train_eval_func = ex.executor([grad_comp, loss, weight_update], xs_v, ys_v, alpha_v)

        # evaluate
        for i in range(max_iter):
            grad_np, loss_np, thetas_np = np_logreg.optimize(alpha)
            grad_ng, loss_ng, thetas_ng = train_eval_func(xs, ys, alpha)

            ng.testing.assert_allclose(loss_np, loss_ng, rtol=1e-05, atol=1e-05,
                                       transformer_overwrite=False)
            ng.testing.assert_allclose(grad_np, grad_ng, rtol=1e-05, atol=1e-05,
                                       transformer_overwrite=False)
            ng.testing.assert_allclose(thetas_np, thetas_ng, rtol=1e-05, atol=1e-05,
                                       transformer_overwrite=False)
def test_specific_slice_deriv():
    with ExecutorFactory() as ex:
        A = ng.make_axis(name='A', length=3)
        B = ng.make_axis(name='B', length=4)
        np_shape = (A.length, B.length)
        x_np = np.empty(np_shape, dtype=np.float32)
        for i in range(A.length):
            for j in range(B.length):
                x_np[i, j] = 10 * i + j
        x_ng = ng.persistent_tensor([A, B], initial_value=x_np)

        # The derivative of a single-element slice should be one at that
        # element's position and zero everywhere else.
        for i in range(A.length):
            for j in range(B.length):
                x_slice = ng.tensor_slice(x_ng, (i, j))
                dslice_dx = ng.deriv(x_slice, x_ng)
                dslice_dx_fun = ex.executor(dslice_dx)
                dslice_dx_val = dslice_dx_fun()
                dslice_dx_np = np.zeros_like(x_np)
                dslice_dx_np[i, j] = 1
                ng.testing.assert_allclose(dslice_dx_val, dslice_dx_np)
def minimize(self, cost, variables):
    """
    Minimize cost by returning update Ops.

    Arguments:
        cost: The cost Op to be minimized
        variables: TODO

    Returns:
        A doall op containing setitems to variable ops.
    """
    assert cost is not None
    assert variables is not None

    return ng.doall([ng.assign(variable,
                               variable - self.compute_lr_op * ng.deriv(cost, variable))
                     for variable in variables])
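# A hedged usage sketch for minimize() above. The `optimizer` argument is assumed to be
# an object exposing minimize() exactly as defined above, with `compute_lr_op` already
# built as a scalar learning-rate op; the toy cost, axis, and variable are illustrative.
def sketch_minimize_usage(optimizer):
    D = ng.make_axis(length=3, name='D')
    x = ng.placeholder([D])
    w = ng.variable([D], initial_value=np.zeros(D.length))
    cost = ng.sum((x - w) * (x - w), out_axes=())

    updates = optimizer.minimize(cost, [w])  # doall of ng.assign ops, one per variable

    with ExecutorFactory() as ex:
        step = ex.executor([cost, updates], x)
        step(np.ones(D.length))  # one update: w <- w - lr * d(cost)/dw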
def test_dropout_bprop_single_comp(nin, batch_size, keep):
    # set inputs
    N = ng.make_axis(batch_size, name='N')
    F = ng.make_axis(nin, name='F')

    mul_factor = ng.placeholder(())

    inp = ng.placeholder([F, N])
    layer = Dropout(keep=keep)
    fprop = layer(inp * mul_factor)
    out_graph = ng.sum(fprop, out_axes=())
    bprop = ng.deriv(out_graph, mul_factor)

    # create data
    x = np.random.uniform(size=(nin, batch_size))

    # evaluate
    with ExecutorFactory() as ex:
        comp = ex.executor([fprop, bprop, layer.mask], inp, mul_factor)
        fout, bout, mask = comp(x, 2)

        # Calculate derivative by hand and compare
        ng.testing.assert_allclose(bout, (x * mask[:, None]).sum(), rtol=1e-6)
def test_idempotent_axes_b():
    """
    Test axes transformations with autodiff, case b, with broadcast applied
    to the same tensor
    """
    with ExecutorFactory() as ex:
        axes = ng.make_axes([ng.make_axis(3), ng.make_axis(1)])

        w = ng.variable(axes, initial_value=np.ones((3, 1)))
        l = ng.broadcast(w, axes)
        r = ng.broadcast(w, axes)
        result = ng.add(l, r)
        result = ng.cast_axes(result, axes)
        cost = ng.sum(result, reduction_axes=axes)
        grad = ng.deriv(cost, w)

        grad_comp = ex.executor(grad)
        cost_comp = ex.executor(cost)

        assert cost_comp() == 6.0
        assert np.array_equal(grad_comp(), np.ones((3, 1)) * 2.)
def test_idempotent_axes_a():
    """
    Test axes transformations with autodiff, case a, reference test
    """
    with ExecutorFactory() as ex:
        axes = ng.make_axes([ng.make_axis(3), ng.make_axis(1)])

        w = ng.variable(axes, initial_value=np.ones((3, 1)))
        result = w + w
        result = ng.cast_axes(result, axes)
        cost = ng.sum(result, reduction_axes=axes)
        grad = ng.deriv(cost, w)

        grad_comp = ex.executor(grad)
        cost_comp = ex.executor(cost)
        cost_comp_val = cost_comp()
        grad_comp_val = grad_comp()
        grad_comp_np = np.ones((3, 1)) * 2.

        assert cost_comp_val == 6.0
        assert np.array_equal(grad_comp_val, grad_comp_np)
def test_rnn_deriv_ref(sequence_length, input_size, hidden_size, batch_size,
                       return_sequence, weight_initializer, bias_initializer, init_state):

    assert batch_size == 1, "the recurrent reference implementation only supports batch size 1"
    assert return_sequence is True, "the reference rnn only supports sequences for deriv"

    # Get input placeholder and numpy array
    input_placeholder, input_value = make_placeholder(input_size, sequence_length, batch_size)

    # Construct network weights and initial state, if desired
    W_in, W_rec, b, init_state, init_state_value = make_weights(input_placeholder, hidden_size,
                                                                weight_initializer,
                                                                bias_initializer,
                                                                init_state)

    # Compute reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size, return_sequence=return_sequence)
    rnn_ref.set_weights(W_in, W_rec, b.reshape(rnn_ref.bh.shape))

    # Prepare deltas for gradient check
    output_shape = (hidden_size, sequence_length, batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    dW_in, dW_rec, db = rnn_ref.lossFun(input_value.transpose([1, 0, 2]),
                                        deltas.copy().transpose([1, 0, 2]),
                                        init_states=init_state_value)[:3]

    # Generate ngraph RNN
    rnn_ng = RNNCell(hidden_size, init=W_in, init_h2h=W_rec, activation=Tanh(),
                     reset_cells=True)

    # fprop ngraph RNN
    num_steps = input_placeholder.axes.recurrent_axis().length
    init_states = {'h': init_state} if init_state is not None else init_state
    out_ng = unroll(rnn_ng, num_steps, input_placeholder, init_states=init_states,
                    return_sequence=return_sequence)

    deltas_constant = ng.constant(deltas, axes=out_ng.axes)
    params = [(rnn_ng.i2h.linear.W, W_in),
              (rnn_ng.h2h.W, W_rec),
              (rnn_ng.i2h.bias.W, b)]

    with ExecutorFactory() as ex:
        # Create derivative computations and execute
        param_updates = list()
        for px, _ in params:
            update = ng.deriv(out_ng, px, error=deltas_constant)
            if init_state is not None:
                param_updates.append(ex.executor(update, input_placeholder, init_state))
            else:
                param_updates.append(ex.executor(update, input_placeholder))

        for update_fun, ref_val in zip(param_updates, [dW_in, dW_rec, db]):
            if init_state is not None:
                grad_neon = update_fun(input_value, init_state_value)
            else:
                grad_neon = update_fun(input_value)

            ng.testing.assert_allclose(grad_neon, ref_val.squeeze(),
                                       rtol=bprop_rtol, atol=bprop_atol)
def derivative(self, f, px, *parameters):
    """
    Full derivative of f wrt placeholder px

    Arguments:
        f: TODO
        px: TODO
        parameters: TODO

    Returns:

    """
    fshape = f.axes.lengths
    xshape = px.axes.lengths

    # print "============="
    # for op in Op.ordered_ops([dfdx]):
    #     print '-----'
    #     print op, op.axes
    #     print op.args
    #     print '------'
    # print "============="

    if len(fshape) == 0:
        return self.transformer.computation(ng.deriv(f, px), px, *parameters)
    else:
        initial_adjoint = ng.placeholder(f.axes).named('adj')
        adjoint = np.zeros(fshape, dtype=f.dtype)
        dfdx = ng.deriv(f, px, error=initial_adjoint)
        comp = self.transformer.computation(dfdx, initial_adjoint, px, *parameters)

        def helper(x, *args):
            # Build the full Jacobian one output element at a time by seeding a
            # unit adjoint at each position of f and backpropagating.
            dfdxshape = list(fshape)
            dfdxshape.extend(xshape)
            npdfdx = np.empty(dfdxshape, dtype=x.dtype)

            dindex = [0 for _ in fshape]
            dindex.extend([slice(None) for _ in xshape])

            idxiter = np.nditer(adjoint, flags=['multi_index'], op_flags=['readwrite'])
            for dfdxiter in idxiter:
                dfdxiter[...] = 1
                df = comp(adjoint, x, *args)
                if is_flex_transformer(comp.transformer):
                    reset_flex_entries(comp)
                # import pytest; pytest.set_trace()
                # with open("code_sum.py", "w") as f:
                #     f.write(comp.transformer.code.code)
                dindex[0:len(fshape)] = idxiter.multi_index
                npdfdx[tuple(dindex)] = df
                dfdxiter[...] = 0

            return npdfdx

        return helper
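# Hedged usage sketch for derivative() above: compare the symbolic full Jacobian against a
# finite-difference estimate. The companion method name `numeric_derivative` (and its
# step-size argument) is assumed to exist on the same executor factory; the toy function
# f = x * x is illustrative, not taken from this file.
def sketch_derivative_check():
    N = ng.make_axis(length=5, name='N')
    x = ng.placeholder([N])
    f = x * x  # elementwise square, so the Jacobian is diag(2 * x)

    with ExecutorFactory() as ex:
        sym_jac_fun = ex.derivative(f, x)                # symbolic Jacobian via ng.deriv
        num_jac_fun = ex.numeric_derivative(f, x, 1e-3)  # finite-difference Jacobian (assumed helper)

        x_val = np.random.rand(N.length).astype(np.float32)
        ng.testing.assert_allclose(sym_jac_fun(x_val), num_jac_fun(x_val), rtol=1e-2)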
def test_recurrent_batchnorm_bprop(RNN, recurrent_input, output_size, bn_params):
    """Compare bprop of a gated RNN with batch norm to numpy batch norm followed by an RNN without"""

    helper = RNNHelper(recurrent_input, output_size, RNN, bn_params)

    # Get rnn + batch norm bprop graph
    fprop = helper.rnn(recurrent_input)
    bprop_vars = [recurrent_input, helper.gamma, helper.beta]

    # Get bprop graph
    delta_placeholder = ng.placeholder(fprop.axes)
    bprops = [ng.deriv(fprop, var, delta_placeholder) for var in bprop_vars]

    # Get reference graphs
    reference_fprop = helper.reference_rnn(helper.reference_input)

    # Handle the case where we have gates in the RNN object
    bprop_vars = [helper.reference_input]
    if helper.has_gates:
        bprop_vars.append(helper.get_ancestor_op(reference_fprop))

    reference_delta_placeholder = ng.placeholder(reference_fprop.axes)
    reference_bprop = [ng.deriv(reference_fprop, var, reference_delta_placeholder)
                       for var in bprop_vars]

    # Begin execution
    with ExecutorFactory() as ex:
        bprop_function = ex.executor(bprops, recurrent_input, delta_placeholder)
        reference_function = ex.executor(reference_bprop, helper.reference_input,
                                         reference_delta_placeholder)

        # Create data
        input_value = rng.uniform(0, 1, recurrent_input.axes)
        delta = rng.uniform(-.1, .1, fprop.axes)

        # Compute reference weighted input
        weighted_input = np.dot(helper.W_in, input_value.swapaxes(0, 1))

        # Set the reduction axes used for reference
        bn_params['axis'] = (1, 2)

        # Get reference batch normed input
        batch_norm_reference = BatchNormReference(weighted_input, **bn_params)
        normed_input = batch_norm_reference.fprop[0]

        # Reference backprop through RNN
        reference_result = reference_function(normed_input, delta)
        # This is because of a HETR bug where return collections aren't handled properly
        if isinstance(reference_result, tuple):
            rnn_delta = reference_result[0]
        else:
            rnn_delta = reference_result

        # Reference backprop through BN
        dx_ref, dgamma_ref, dbeta_ref = batch_norm_reference.bprop(rnn_delta)

        # Backprop through reference batch norm for a single gate
        if helper.has_gates:
            rnn_gate_delta = reference_result[1]
            _, dgamma_ref, dbeta_ref = batch_norm_reference.bprop(rnn_gate_delta)

        # Backprop through weighted input
        dx_ref = np.dot(helper.W_in.T, dx_ref.swapaxes(0, 1))

        # Compute ngraph bprop
        dx, dgamma, dbeta = bprop_function(input_value, delta)

        ng.testing.assert_allclose(dx, dx_ref, rtol=rtol, atol=recurrent_atol)
        ng.testing.assert_allclose(dgamma, dgamma_ref, rtol=rtol, atol=recurrent_atol)
        ng.testing.assert_allclose(dbeta, dbeta_ref, rtol=rtol, atol=recurrent_atol)
def test_seq2seq_deriv_ref(batch_size, sequence_length_enc, sequence_length_dec,
                           input_size, hidden_size,
                           weight_initializer, bias_initializer):

    # TODO: are these assumptions true?
    assert batch_size == 1, "the seq2seq reference implementation only supports batch size 1"

    # Get input placeholders and numpy arrays
    input_placeholder_enc, input_value_enc = \
        make_placeholder(input_size, sequence_length_enc, batch_size)
    input_placeholder_dec, input_value_dec = \
        make_placeholder(input_size, sequence_length_dec, batch_size)

    # Construct encoder weights
    W_in_enc, W_rec_enc, b_enc, _, _ = make_weights(input_placeholder_enc, hidden_size,
                                                    weight_initializer, bias_initializer,
                                                    init_state=False)

    # Construct decoder weights
    W_in_dec, W_rec_dec, b_dec, _, _ = make_weights(input_placeholder_dec, hidden_size,
                                                    weight_initializer, bias_initializer,
                                                    init_state=False)

    # Reference numpy seq2seq
    seq2seq_ref = RefSeq2Seq(input_size, hidden_size, decoder_return_sequence=True)
    seq2seq_ref.set_weights(W_in_enc, W_rec_enc, b_enc.reshape(seq2seq_ref.bh_enc.shape),
                            W_in_dec, W_rec_dec, b_dec.reshape(seq2seq_ref.bh_dec.shape))

    # Prepare deltas for gradient check
    output_shape = (hidden_size, sequence_length_dec, batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    dW_in_enc, dW_rec_enc, db_enc, dW_in_dec, dW_rec_dec, db_dec, encoding_ref, hs_return_dec = \
        seq2seq_ref.lossFun(input_value_enc.transpose([1, 0, 2]),
                            input_value_dec.transpose([1, 0, 2]),
                            deltas.copy().transpose([1, 0, 2]))

    # Generate ngraph Seq2Seq
    rnn_enc_ng = Recurrent(hidden_size, init=W_in_enc, init_inner=W_rec_enc,
                           activation=Tanh(), reset_cells=True, return_sequence=False)
    rnn_dec_ng = Recurrent(hidden_size, init=W_in_dec, init_inner=W_rec_dec,
                           activation=Tanh(), reset_cells=True, return_sequence=True)

    # ngraph fprop graph
    encoding_ng = rnn_enc_ng(input_placeholder_enc, init_state=None)
    output_ng = rnn_dec_ng(input_placeholder_dec, init_state=encoding_ng)

    deltas_constant = ng.constant(deltas, axes=output_ng.axes)

    params = [(rnn_dec_ng.b, db_dec),
              (rnn_dec_ng.W_input, dW_in_dec),
              (rnn_dec_ng.W_recur, dW_rec_dec),
              (rnn_enc_ng.b, db_enc),
              (rnn_enc_ng.W_input, dW_in_enc),
              (rnn_enc_ng.W_recur, dW_rec_enc)]

    with ExecutorFactory() as ex:
        # fprop computations
        fprop_fun = ex.executor([encoding_ng, output_ng],
                                input_placeholder_enc, input_placeholder_dec)

        # gradient computations
        update_funs = []
        for px, _ in params:
            update = ng.deriv(output_ng, px, error=deltas_constant)
            update_funs.append(ex.executor(update, input_placeholder_enc, input_placeholder_dec))

        # check forward pass
        encoding, output = fprop_fun(input_value_enc, input_value_dec)
        ng.testing.assert_allclose(encoding, encoding_ref,
                                   rtol=1e-5, atol=1e-5, transformer_overwrite=False)
        ng.testing.assert_allclose(np.squeeze(output), np.squeeze(hs_return_dec),
                                   rtol=1e-5, atol=1e-5, transformer_overwrite=False)

        # check gradient computations
        for update_fun, (_, deriv_ref_val) in zip(update_funs, params):
            grad_neon = update_fun(input_value_enc, input_value_dec)
            ng.testing.assert_allclose(grad_neon, deriv_ref_val.squeeze(), rtol=1e-5, atol=1e-4)