def _GRUBlockCellGrad(op, *grad): r"""Gradient for GRUBlockCell. Args: op: Op for which the gradient is defined. *grad: Gradients of the optimization function wrt output for the Op. Returns: d_x: Gradients wrt to x d_h: Gradients wrt to h d_w_ru: Gradients wrt to w_ru d_w_c: Gradients wrt to w_c d_b_ru: Gradients wrt to b_ru d_b_c: Gradients wrt to b_c Mathematics behind the Gradients below: ``` d_c_bar = d_h \circ (1-u) \circ (1-c \circ c) d_u_bar = d_h \circ (h-c) \circ u \circ (1-u) d_r_bar_u_bar = [d_r_bar d_u_bar] [d_x_component_1 d_h_prev_component_1] = d_r_bar_u_bar * w_ru^T [d_x_component_2 d_h_prevr] = d_c_bar * w_c^T d_x = d_x_component_1 + d_x_component_2 d_h_prev = d_h_prev_component_1 + d_h_prevr \circ r + u ``` Below calculation is performed in the python wrapper for the Gradients (not in the gradient kernel.) ``` d_w_ru = x_h_prevr^T * d_c_bar d_w_c = x_h_prev^T * d_r_bar_u_bar d_b_ru = sum of d_r_bar_u_bar along axis = 0 d_b_c = sum of d_c_bar along axis = 0 ``` """ x, h_prev, w_ru, w_c, b_ru, b_c = op.inputs r, u, c, _ = op.outputs _, _, _, d_h = grad d_x, d_h_prev, d_c_bar, d_r_bar_u_bar = gen_gru_ops.gru_block_cell_grad( x, h_prev, w_ru, w_c, b_ru, b_c, r, u, c, d_h) x_h_prev = array_ops.concat([x, h_prev], 1) d_w_ru = math_ops.matmul(x_h_prev, d_r_bar_u_bar, transpose_a=True) d_b_ru = nn_ops.bias_add_grad(d_r_bar_u_bar) x_h_prevr = array_ops.concat([x, h_prev * r], 1) d_w_c = math_ops.matmul(x_h_prevr, d_c_bar, transpose_a=True) d_b_c = nn_ops.bias_add_grad(d_c_bar) return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c
def _GRUBlockCellGrad(op, *grad): r"""Gradient for GRUBlockCell. Args: op: Op for which the gradient is defined. *grad: Gradients of the optimization function wrt output for the Op. Returns: d_x: Gradients wrt to x d_h: Gradients wrt to h d_w_ru: Gradients wrt to w_ru d_w_c: Gradients wrt to w_c d_b_ru: Gradients wrt to b_ru d_b_c: Gradients wrt to b_c Mathematics behind the Gradients below: ``` d_c_bar = d_h \circ (1-u) \circ (1-c \circ c) d_u_bar = d_h \circ (h-c) \circ u \circ (1-u) d_r_bar_u_bar = [d_r_bar d_u_bar] [d_x_component_1 d_h_prev_component_1] = d_r_bar_u_bar * w_ru^T [d_x_component_2 d_h_prevr] = d_c_bar * w_c^T d_x = d_x_component_1 + d_x_component_2 d_h_prev = d_h_prev_component_1 + d_h_prevr \circ r + u ``` Below calculation is performed in the python wrapper for the Gradients (not in the gradient kernel.) ``` d_w_ru = x_h_prevr^T * d_c_bar d_w_c = x_h_prev^T * d_r_bar_u_bar d_b_ru = sum of d_r_bar_u_bar along axis = 0 d_b_c = sum of d_c_bar along axis = 0 ``` """ x, h_prev, w_ru, w_c, b_ru, b_c = op.inputs r, u, c, _ = op.outputs _, _, _, d_h = grad d_x, d_h_prev, d_c_bar, d_r_bar_u_bar = gen_gru_ops.gru_block_cell_grad( x, h_prev, w_ru, w_c, b_ru, b_c, r, u, c, d_h) x_h_prev = array_ops.concat([x, h_prev], 1) d_w_ru = math_ops.matmul(x_h_prev, d_r_bar_u_bar, transpose_a=True) d_b_ru = nn_ops.bias_add_grad(d_r_bar_u_bar) x_h_prevr = array_ops.concat([x, h_prev * r], 1) d_w_c = math_ops.matmul(x_h_prevr, d_c_bar, transpose_a=True) d_b_c = nn_ops.bias_add_grad(d_c_bar) return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c