def extract_slot_triplets(self) -> Mapping[str, sparse.Triplets]:
    """Return every optimizer slot decoded as sparse triplets, keyed by name.

    Each slot's numpy buffer is wrapped in a SparseRepresentation built from
    the weights' metainfo, then decoded into (row, col, value) triplets using
    the weights' spec and matmul options.  The debug name marks the triplets
    as belonging to a slot rather than to the weights themselves.
    """
    slot_triplets = {}
    for name, slot in self.get_slot_var_dict().items():
        representation = sparse.SparseRepresentation(
            self.weights.get_metainfo(), slot.np_variable)
        slot_triplets[name] = sparse.triplets_from_representation(
            self.weights.spec,
            representation,
            self.weights.matmul_options,
            debug_name=name + "(slot)")
    return slot_triplets
def main(args):
    """Build, run and cross-check a sparse and a dense transformer decoder.

    Runs forward + gradient for a DynsparseTransformer on one IPU, saves its
    weights densely to a temporary checkpoint, restores those weights into a
    DenseTransformer, runs that model on the same random activations, and
    asserts that output activations, input gradients and per-layer weight
    gradients all agree within dtype-dependent tolerances.

    Args:
        args: parsed command-line namespace; reads random_seed, batch_size,
            source_sequence_length, hidden_length, dtype, profile and
            (indirectly via the transformer classes) the model configuration.

    Returns:
        Tuple of (sparse_result, dense_result) fetch dictionaries.

    Raises:
        AssertionError: if any sparse/dense result pair differs beyond
            tolerance (from np.testing.assert_allclose).
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)
    random_seed = args.random_seed
    # The dense copy of the sparse weights is handed over via this checkpoint.
    checkpoint_path = os.path.join(tempfile.mkdtemp(), "model.ckpt")

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=random_seed)
    activations_np = random_gen.uniform(-0.1,
                                        0.1,
                                        size=(args.batch_size,
                                              args.source_sequence_length,
                                              args.hidden_length))

    # Configure the IPU
    cfg = ipu.utils.create_ipu_config(profiling=args.profile,
                                      report_directory="./report/")
    cfg = ipu.utils.auto_select_ipus(cfg, 1)
    ipu.utils.configure_ipu_system(cfg)

    # Build IPU graphs
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weight placeholders are created inside sparse_transformer
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad,
                                     sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(
                sparse_decoder, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

    # sparse-decoder
    with tf.Session(graph=sparse_decoder_graph) as sess:
        # initialize weights
        sess.run(tf.global_variables_initializer())
        # Save the sparse weights to checkpoint as dense
        sparse_transformer.checkpointAsDense(checkpoint_path)
        # run sparse decoder
        sparse_result = sess.run(sparse_decoder_fetches,
                                 feed_dict={inputs_ph: activations_np})

    # Create a dense transformer and initialize the weights to the values that
    # the sparse model was initialized with originally
    dense_decoder_graph = tf.Graph()
    dense_transformer = DenseTransformer(args)
    with dense_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weights will get streamed from checkpoint
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            dense_decoder_fetches = partial(dense_transformer_fwd_and_grad,
                                            dense_transformer)
            dense_graph = ipu.ipu_compiler.compile(dense_decoder_fetches,
                                                   [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()
        with tf.device("cpu"):
            # We will only load the trainable variables, not momentum etc.
            loader = tf.train.Saver(tf.trainable_variables())

    # dense-decoder
    with tf.Session(graph=dense_decoder_graph) as sess:
        # Initialize momentums which are not part of the checkpoint
        sess.run(tf.global_variables_initializer())
        # Restore saved trainable variables
        loader.restore(sess, checkpoint_path)
        dense_result = sess.run(dense_graph,
                                feed_dict={inputs_ph: activations_np})

    # TEST: fp16 needs far looser tolerances than the fp32 defaults.
    rtol = 1e-05
    atol = 1e-05
    if args.dtype == tf.float16:
        rtol = 1e-04
        atol = 1e-02

    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"],
                               dense_result["output_activation"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Output activations do not match.")
    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"],
                               dense_result["input_grad"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Grads wrt. inputs do not match")
    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(
            sparse_grad_w,
            dense_grad,
            atol=atol,
            rtol=rtol,
            err_msg=f"Dense grads for layer {name} do not match")
        # Compare the sparse grads: decode the padded nz-values gradient back
        # into block coordinates (i, j) plus values.
        sparse_grad_padded = sparse_result[name + "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(
            sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data,
            sparse_layer.weights.matmul_options)
        # Convert dense grads to blocks so they can be indexed with the
        # block coordinates (i, j) recovered above.
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx, ny = dense_grad.shape[0] // block_size, dense_grad.shape[
            1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.lib.stride_tricks.as_strided(
            dense_grad, (nx, ny, block_size, block_size), strides)
        blocked_dense_grad = np.squeeze(
            np.copy(blocked_dense_grad
                    ))  # this will squeeze out the special case block size 1
        np.testing.assert_allclose(
            sparse_grad,
            blocked_dense_grad[i, j],
            atol=atol,
            rtol=rtol,
            err_msg=f"Sparse grads for layer {name} do not match")
    print("All results match.")
    return sparse_result, dense_result
rhs: masked_rhs }) sess.run(sparse_data_update_op, feed_dict=fc.feed_dict()) sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run( sparse_fetches, feed_dict={ lhs: lhs_values, compute_dense_grad_w: True }) # Check all the results: # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: sparse_data = sparse.SparseRepresentation(fc.data.metainfo_state, sparse_weight_grad[0]) triplets = sparse.triplets_from_representation(fc.spec, sparse_data) reference_grad_nzvalues = sparse.values_at_indices(triplets[0], triplets[1], reference_weight_grad[0]) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: _, _, values = sparse.triplets_from_dense(reference_weight_grad[0]) sparse_data = sparse.representation_from_triplets(fc.spec, *triplets) reference_grad_nzvalues = sparse_data.nz_values # Need to set tolerances for fp32 as numpy is set for doubles by default: rtol = 1e-05 atol = 1e-06 if not np.allclose(
lhs: lhs_values, rhs: masked_rhs }) sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run( sparse_fetches, feed_dict={ lhs: lhs_values, compute_dense_grad_w: True }) # Check all the results: # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: sparse_data = sparse.SparseRepresentation(fc.weights.get_metainfo(), sparse_weight_grad[0]) triplets = sparse.triplets_from_representation(fc.weights.spec, sparse_data, fc.weights.matmul_options) if args.block_size == 1: reference_grad_nzvalues = sparse.values_at_indices( triplets[0], triplets[1], reference_weight_grad) else: reference_grad_nzvalues = sparse.blocks_at_indices( triplets[0], triplets[1], args.block_size, reference_weight_grad) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: dense_data = sparse.representation_from_triplets(fc.weights.spec, triplets[0], triplets[1], reference_grad_nzvalues, fc.weights.matmul_options)
def extract_momentum_triplets(self):
    """Decode the sparse momentum buffer into (row, col, value) triplets.

    Wraps the raw momentum values together with the current metainfo state
    in a SparseRepresentation and converts it via the layer's matmul spec.
    """
    return sparse.triplets_from_representation(
        self.spec,
        sparse.SparseRepresentation(self.data.metainfo_state,
                                    self.sparse_momentum))
# Check the projection dding result: if not np.allclose(projections, reference_projections, rtol=rtol, atol=atol, equal_nan=True): print( f"Max abs error: {np.max(np.abs(projections-reference_projections))}" ) raise RuntimeError("Sparse and reference projections do not match.") # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: matmul_spec = embedding.projection.weights.spec matmul_opts = embedding.projection.weights.matmul_options sparse_data = sparse.SparseRepresentation( embedding.projection.weights.get_metainfo(), tied_grad_w[0]) triplets = sparse.triplets_from_representation(matmul_spec, sparse_data, matmul_opts) # Reference grad is transposed with respect to popsparse one (third Jacobian is the reduction gradient wrt. weights): ref_grad_reduced = np.transpose(reference_grads_w) if args.block_size == 1: reference_grad_nzvalues = sparse.values_at_indices( triplets[0], triplets[1], ref_grad_reduced) else: reference_grad_nzvalues = sparse.blocks_at_indices( triplets[0], triplets[1], args.block_size, ref_grad_reduced) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: dense_data = sparse.representation_from_triplets(matmul_spec, triplets[0], triplets[1], reference_grad_nzvalues,