def sample():
    x = sym.Variable("x")
    y = sym.Variable("y")
    z1 = sym.elemwise_add(x, sym.sqrt(y))
    z2 = sym.log(x)
    gradient = graph_util.gradients([z1, z2], [x, y])
    print(gradient)
def nnvm_conv():
    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.conv2d(x, y, channels=3, kernel_size=3)
    grad = graph_util.gradients([z], [x, y])
    print(grad)
    print(grad[0].debug_str())
def test_gradient():
    x = sym.Variable("x")
    y = sym.Variable("y")
    z1 = sym.elemwise_add(x, sym.sqrt(y))
    z2 = sym.log(x)
    gradient = graph_util.gradients([z1, z2], [x, y])
    assert len(gradient) == 2

    g1 = sym.Variable("g1")
    g2 = sym.Variable("g2")
    grad_ys = [g1, g2]
    gradient = graph_util.gradients(sym.Group([z1, z2]),
                                    sym.Group([x, y]),
                                    grad_ys=grad_ys)
    g_graph = graph.create(sym.Group(gradient)).ir()
    assert len(gradient) == 2
    assert "g1" in g_graph
    assert "g2" in g_graph
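
# A hedged companion to test_gradient: the same expressions can also be
# validated end-to-end with check_function (defined below). The shapes and the
# in_range for y are assumptions made for this sketch; keeping y away from 0
# avoids the singularity of d/dy sqrt(y) = 1 / (2 * sqrt(y)).
def example_check_sqrt_add():
    x = sym.Variable("x", shape=(2, 3))
    y = sym.Variable("y", shape=(2, 3))
    check_function(
        sym.elemwise_add(x, sym.sqrt(y)),
        forward=lambda x, y: x + np.sqrt(y),
        backward=lambda x, y, head_grads: {'x': head_grads,
                                           'y': head_grads / (2 * np.sqrt(y))},
        in_range={'y': (0.1, 10.0)})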
def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
                   shape=None, dtype=None, in_range=None, values=None,
                   exclude_targets=None, only_targets=None,
                   additional_params=None,
                   numerical_grads=None, numerical_grads_params=None,
                   atol=1e-5, rtol=1e-5, quiet=False):
    """Compute the function and/or its gradients on a random input and raise
    an exception if the result doesn't match the reference implementation.

    Parameters
    ----------
    symbol : nnvm.Symbol
        A symbol representing the output.

    forward : Callable[..., List[numpy.ndarray]], optional
        A reference implementation to compare with.

    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
        A reference implementation of gradients. Should accept head_grads in addition to the
        normal inputs; head_grads is a list of gradients of some scalar wrt the outputs, or a
        single gradient if there is only one output. Should return either a dict mapping input
        variable names to the respective gradients, or a list of gradients wrt variables from
        grad_input_vars in exactly the same order (in alphabetical order by default).

    grad_input_vars : List[nnvm.Symbol or str], optional
        A list of variables with respect to which the gradients will be computed.
        None (default) means that all input variables will be used, in alphabetical order.

    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
        A dict mapping input variable names to shapes, or just a single shape.
        By default shapes will be inferred from variables' attributes (see the Examples).
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[nnvm.Symbol or str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes (see the Examples).
        If dtypes cannot be inferred for some variables then float32 will be used as a fallback.
        Note that this parameter takes precedence over variables' attributes.

    in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional
        A dict mapping input variable names to ranges, or just a single range (the same for
        all variables). Input values will be generated from uniform distributions on these
        ranges. `head_grads` can also be assigned a range this way.

    values : Dict[nnvm.Symbol or str, numpy.ndarray], optional
        A dict explicitly providing values for some variables instead of random generation.

    exclude_targets : Set[str], optional
        Skip compiling and running anything for these targets.

    only_targets : Set[str], optional
        Test only for those targets from `ctx_list()` that are also in this set.

    additional_params : dict, optional
        A dict of additional parameters which will be passed to forward and backward.

    numerical_grads : bool or 'if_possible', optional
        Whether to additionally check against numerically computed gradients. If 'if_possible'
        or None is passed (which is the default) then it will try to create a gradient
        computation graph and then check gradients numerically only if this graph can be
        created (i.e. if there are some operations with unimplemented gradients, it will just
        issue a warning). Checking against numerical gradients is done via the
        `check_numerical_grads` function.

    numerical_grads_params : dict, optional
        Additional parameters for `check_numerical_grads`.

    atol : float, optional
        Absolute tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.

    rtol : float, optional
        Relative tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.

    quiet : bool, optional
        Don't dump additional information to stdout on failure.

    Examples
    --------
    .. code-block:: python

        x = sym.Variable("x", shape=(1, 2))
        y = sym.Variable("y", shape=(1, 2))

        # check the function and its gradients both numerically and using a reference function
        check_function(x + 2*y,
                       lambda x, y: x + 2*y,
                       lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads})

        # just check gradients numerically
        check_function(x + 2*y, numerical_grads=True)

        # just check the forward computation
        check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False)

        # specifying dtype
        check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64')

        # dtypes can also be specified during variable creation with dtype codes
        x = sym.Variable("x", dtype=0)
        check_function(x + 1, shape=(2, 2), numerical_grads=True)
    """
    # validate and preprocess the input params
    if numerical_grads is None and forward is None and backward is None:
        raise ValueError("No reference function was passed to check_function. If you only want to "
                         "check gradients numerically, pass numerical_grads=True explicitly.")

    if numerical_grads is None:
        numerical_grads = 'if_possible'

    if numerical_grads not in [False, True, 'if_possible']:
        raise ValueError("numerical_grads must be a bool or 'if_possible', not {}"
                         .format(numerical_grads))

    if additional_params is None:
        additional_params = {}

    input_vars = symbol.list_input_variables()
    input_dict = {x.attr('name'): x for x in input_vars}

    if grad_input_vars is None:
        grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name'))
    else:
        grad_input_vars = [input_dict[x] if isinstance(x, str) else x for x in grad_input_vars]

    in_range = _dict_var_to_dict_str(in_range)
    values = _dict_var_to_dict_str(values)

    out_len = len(symbol.list_output_names())

    # Infer the output shapes and dtypes, and preprocess the shape and dtype params
    forward_graph, shape, dtype, out_shapes, out_dtypes = \
        infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype,
                            fallback_dtype='float32')

    if not all(out_shapes) or not all(out_dtypes):
        if not quiet:
            print(forward_graph.ir(join_node_attrs=['shape', 'dtype']))
        raise ValueError("Could not infer shapes or dtypes for outputs.\n"
                         "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes))

    backward_graph = None

    # If we want gradients, we have to recreate the graph, but now with gradient computations.
    # Note that here we need out_shapes for defining the shape of head grads, so we have to
    # create the graph twice.
    if backward is not None or numerical_grads:
        try:
            head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i),
                                                       shape=out_shapes[i],
                                                       dtype=DTYPE_TO_TCODE[out_dtypes[i]])
                                  for i in range(out_len)]
            grad_symbols = graph_util.gradients([symbol], grad_input_vars,
                                                grad_ys=head_grads_symbols)
            # Sometimes grads do not depend on head_grads, so head_grads does not appear
            # in the variable list; adding it manually prevents this, making things a bit easier
            backward_graph = \
                nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols + head_grads_symbols))

            backward_graph, shape, dtype, out_shapes, out_dtypes = \
                infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype,
                                    fallback_dtype='float32')
        except nnvm._base.NNVMError as err:
            if backward is None and numerical_grads == "if_possible":
                logging.warning("Won't check gradients because: %s",
                                str(err).split('\n', 1)[0])
                numerical_grads = False
                backward_graph = None
            else:
                raise

    main_graph = backward_graph if backward_graph is not None else forward_graph

    # Generate random data for inputs (including head_grads)
    np_inputs = {}
    for x in main_graph.symbol.list_input_variables():
        x_name = x.attr('name')
        x_shape = shape[x_name]
        x_dtype = dtype[x_name]
        if values is not None and x_name in values:
            np_inputs[x_name] = values[x_name].astype(x_dtype)
            continue
        low = -1.0
        high = 1.0
        if in_range is not None:
            if isinstance(in_range, dict):
                if x_name in in_range:
                    low = in_range[x_name][0]
                    high = in_range[x_name][1]
            else:
                low = in_range[0]
                high = in_range[1]
        np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype)

    np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs
                                    if not k.startswith('head_grads_')}

    nothing_was_done = True

    # Compute and compare the results
    for target, ctx in ctx_list():
        if exclude_targets is not None:
            if target in exclude_targets or str(target) in exclude_targets:
                logging.info("Skipping target = %s, ctx = %s", target, ctx)
                continue
        if only_targets is not None:
            if target not in only_targets and str(target) not in only_targets:
                logging.info("Skipping target = %s, ctx = %s", target, ctx)
                continue

        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)

        debug_stage = None

        try:
            nnvm_res = None

            debug_stage = "compiling"
            main_function = graph_to_function(main_graph, target, ctx)

            # nnvm_res contains the output and gradients (if they are needed)
            debug_stage = "running"
            nnvm_res = main_function(**np_inputs)

            try:
                logging.debug("checking to_relay conversion")
                inputs = np_inputs_without_head_grads.copy()
                func, inputs = to_relay(main_graph, shape, dtype, params=inputs)
                with relay.build_config(opt_level=3):
                    graph, lib, params = relay.build(func, target=target)
                m = graph_runtime.create(graph, lib, ctx)
                m.set_input(**inputs)
                m.set_input(**params)
                m.run()
                for i in range(out_len):
                    relay_out = m.get_output(i).asnumpy()
                    tvm.testing.assert_allclose(nnvm_res[i], relay_out, atol=atol, rtol=rtol)
            except NotImplementedError as err:
                # the NNVM operator is not supported yet
                logging.warning(err)

            if backward_graph is not None:
                grad_var_names = [x.attr('name') for x in grad_input_vars]
                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}

            if forward is not None:
                nothing_was_done = False
                debug_stage = "checking forward computation"
                logging.debug(debug_stage)

                params = {}
                params.update(np_inputs_without_head_grads)
                params.update(additional_params)
                numpy_res = forward(**params)

                if isinstance(numpy_res, tuple):
                    numpy_res = list(numpy_res)

                if not isinstance(numpy_res, list):
                    numpy_res = [numpy_res]

                if len(numpy_res) != out_len:
                    raise ValueError("Forward function returned {} values, but "
                                     "the nnvm graph returns {} values"
                                     .format(len(numpy_res), out_len))

                for i in range(out_len):
                    tvm.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)

            if backward is not None:
                nothing_was_done = False
                debug_stage = "checking gradients"
                logging.debug(debug_stage)

                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]

                if out_len == 1:
                    np_head_grads = np_head_grads[0]

                params = {'head_grads': np_head_grads}
                params.update(np_inputs_without_head_grads)
                params.update(additional_params)

                numpy_grads = backward(**params)

                if not isinstance(numpy_grads, dict):
                    if isinstance(numpy_grads, tuple):
                        numpy_grads = list(numpy_grads)
                    if not isinstance(numpy_grads, list):
                        numpy_grads = [numpy_grads]
                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
                    if len(numpy_grads) != len(grad_var_names):
                        raise ValueError("The backward function returns a list of gradients which "
                                         "does not contain gradients for these variables: {}"
                                         .format(set(grad_var_names) - set(numpy_grads)))

                for x_name in numpy_grads:
                    tvm.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
                                                atol=atol, rtol=rtol)

            if numerical_grads:
                nothing_was_done = False
                debug_stage = "checking gradients numerically"
                logging.debug(debug_stage)

                forward_function = graph_to_function(forward_graph, target, ctx)

                # Since the result may be non-scalar, we have to put another operation on the top,
                # so we just multiply by the randomly generated head_grads and then sum everything.
                # This way we can reuse the gradient values which have already been computed.
                def scalar_function(**kwargs):
                    res = forward_function(**kwargs)
                    return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel())
                                   for i in range(out_len)])

                if numerical_grads_params is None:
                    numerical_grads_params = {}

                check_numerical_grads(
                    scalar_function,
                    input_values=np_inputs_without_head_grads,
                    grad_values=nnvm_grads,
                    **numerical_grads_params)

        except:
            if not quiet:
                print("\ncheck_function failed while {}, here is the main graph"
                      .format(debug_stage))
                print(main_graph.ir(join_node_attrs=['shape', 'dtype']))
                if nnvm_res is not None:
                    print("Generated inputs:")
                    print(np_inputs)
                    print()
            raise

    if nothing_was_done:
        logging.warning("Nothing was done in check_function. Check ctx_list().")
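
# A minimal standalone sketch of the check_numerical_grads call used above:
# compare hand-written gradients of the scalar function f(x, y) = sum(x * y)
# against numerical differentiation. The argument names follow the call in
# check_function; everything else here is an assumption for illustration.
def example_check_numerical_grads():
    x_val = np.random.uniform(size=(3, 4)).astype('float32')
    y_val = np.random.uniform(size=(3, 4)).astype('float32')
    check_numerical_grads(
        lambda x, y: np.sum(x * y),            # scalar-valued function under test
        input_values={'x': x_val, 'y': y_val},
        grad_values={'x': y_val, 'y': x_val})  # analytical grads of sum(x * y)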
def test_cnn_gradients():
    # input data
    h = 128
    w = 128
    data_shape = (1000, 3, h, w)
    data = sym.Variable('data', shape=data_shape, dtype=0)

    # conv2d
    num_channels = 64
    kernel_size = 32
    conv_w_shape = (num_channels, 3, kernel_size, kernel_size)
    conv_b_shape = (num_channels,)
    conv_w = sym.Variable('conv_w', shape=conv_w_shape)
    conv_b = sym.Variable('conv_b', shape=conv_b_shape)
    conv1 = sym.conv2d(data=data, weight=conv_w, bias=conv_b,
                       channels=num_channels,
                       kernel_size=(kernel_size, kernel_size),
                       name='conv1')

    # relu1
    relu1 = sym.relu(data=conv1, name='relu1')

    # max pooling
    max_pooling1 = sym.max_pool2d(data=relu1, pool_size=(2, 2), name='max_pooling1')

    # flatten
    flatten1 = sym.flatten(data=max_pooling1)

    # shape after flatten
    flatten_out_shape = (h - kernel_size) * (w - kernel_size) * num_channels

    # dense1
    dense1_hidden_units = 100
    dense1 = sym.dense(data=flatten1, name='dense1', units=dense1_hidden_units)

    # relu2
    relu2 = sym.relu(data=dense1, name='relu2')

    # dense2
    dense2_hidden_units = 10
    dense2 = sym.dense(data=relu2, name='dense2', units=dense2_hidden_units)

    # softmax
    mlp = sym.softmax(data=dense2, name='softmax')

    # fake non-sparse label
    label = sym.full_like(mlp, fill_value=1)

    # cross entropy loss
    ce_loss = sym.sum(sym.elemwise_mul(sym.log_softmax(dense2), label),
                      axis=1, keepdims=True, name="ce_loss")

    # input variables:
    # print grad_g.symbol.list_input_names()
    # >> ['data', 'conv_w', 'conv_b',
    #     'dense1_weight', 'dense1_bias',
    #     'dense2_weight', 'dense2_bias']

    # output gradient variables:
    # print grad_g.symbol.list_output_names()
    # >> ['conv1_grad_data', 'conv1_grad_weight', 'conv1_grad_bias',
    #     'dense1_grad_weight', 'dense1_grad_bias',
    #     'dense2_grad_weight', 'dense2_grad_bias']
    grad_g = graph_util.get_gradient_graph(ce_loss, ce_loss.list_input_variables())

    # infer shape
    in_shapes, out_shapes = graph_util.infer_shape(grad_g)

    # forward graph shape
    assert in_shapes == [list(data_shape), list(conv_w_shape), list(conv_b_shape),
                         [dense1_hidden_units, flatten_out_shape], [dense1_hidden_units],
                         [dense2_hidden_units, dense1_hidden_units], [dense2_hidden_units]]

    # the shape of each input grad should equal the shape of the corresponding input
    assert in_shapes == out_shapes

    # output grads w.r.t. input variables
    grads = graph_util.gradients(ce_loss, ce_loss.list_input_variables())

    # the number of gradients should equal the number of input variables
    assert len(grads) == len(ce_loss.list_input_variables())

    # infer type
    in_dtypes, out_dtypes = graph_util.infer_dtype(grad_g)
    assert out_dtypes == ['float32', 'float32', 'float32', 'float32',
                          'float32', 'float32', 'float32']
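
# Worked shape arithmetic for test_cnn_gradients above (a sketch; the numbers
# assume conv2d with no padding and stride 1, and max_pool2d with a default
# stride of (1, 1), which is what makes the asserted shapes come out):
#   conv1:        128 - 32 + 1 = 97      -> (1000, 64, 97, 97)
#   max_pooling1: 97 - 2 + 1   = 96      -> (1000, 64, 96, 96)
#   flatten1:     64 * 96 * 96 = 589824  -> (1000, 589824)
# hence flatten_out_shape == (h - kernel_size) * (w - kernel_size) * num_channels.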
def nnvm_bn():
    x = sym.Variable("x")
    z = sym.batch_norm(x)
    grad = graph_util.gradients([z], [x])
    print(grad)