def lstm_unit(hidden_t_prev, cell_t_prev, gates, seq_lengths, timestep,
              forget_bias=0.0, drop_states=False):
    '''
    Implements one LSTM unit, for one time step (NumPy reference).

    Shapes, with N = gates.shape[1] and D = cell_t_prev.shape[2]:
        hidden_t_prev: broadcast against (N, D); expected (1, N, D)
        cell_t_prev: reshaped to (N, D)
        gates: reshaped to (N, 4, D); the four slots are read in the
            order input, forget, output, candidate
        seq_lengths: reshaped to (N, 1)

    forget_bias is added to the forget-gate pre-activation.

    Rows where timestep >= seq_lengths are not updated: they keep the
    previous hidden/cell values, scaled by (1 - drop_states), i.e. zeroed
    when drop_states is True.

    Returns the pair (hidden_t, cell_t), each reshaped to (1, N, D).
    '''
    hidden_dim = cell_t_prev.shape[2]
    gate_dim = gates.shape[2]
    batch = gates.shape[1]
    flat = (batch, hidden_dim)

    # Expand the scalar timestep and the per-row lengths to full (N, D)
    # grids so the validity mask can be applied elementwise.
    step_grid = (timestep * np.ones(shape=flat)).astype(np.int32)
    assert step_grid.shape == flat
    seq_lengths = (
        np.ones(shape=flat) * seq_lengths.reshape(batch, 1)
    ).astype(np.int32)
    assert seq_lengths.shape == flat
    assert gate_dim == 4 * hidden_dim

    # Resize to avoid broadcasting inconsistencies with NumPy
    stacked = gates.reshape(batch, 4, hidden_dim)
    prev_cell = cell_t_prev.reshape(flat)
    raw_in, raw_forget, raw_out, raw_cand = (
        stacked[:, slot, :].reshape(flat) for slot in range(4)
    )

    in_gate = sigmoid(raw_in)
    forget_gate = sigmoid(raw_forget + forget_bias)
    out_gate = sigmoid(raw_out)
    candidate = tanh(raw_cand)

    # 1 where the row is still inside its sequence, 0 otherwise.
    live = (step_grid < seq_lengths).astype(np.int32)
    assert live.shape == flat
    frozen = 1 - live
    carry = 1 - drop_states

    updated_cell = (forget_gate * prev_cell) + (in_gate * candidate)
    cell_t = updated_cell * live + frozen * prev_cell * carry
    assert cell_t.shape == flat

    updated_hidden = out_gate * tanh(cell_t)
    hidden_t = updated_hidden * live + hidden_t_prev * frozen * carry

    hidden_t = hidden_t.reshape(1, batch, hidden_dim)
    cell_t = cell_t.reshape(1, batch, hidden_dim)
    return hidden_t, cell_t
def gru_reference(input, hidden_input,
                  reset_gate_w, reset_gate_b,
                  update_gate_w, update_gate_b,
                  output_gate_w, output_gate_b,
                  seq_lengths, drop_states=False,
                  linear_before_reset=False):
    '''
    Runs a GRU over a whole sequence, one gru_unit call per time step.

    Shapes, with T, N, G = input.shape[0:3] and D = last dim of hidden_input:
        input: (T, N, G); each step is reshaped to (N, 3, D), and the
            three slots are read in the order reset, update, output
        hidden_input: initial state, assigned into an (N, D) slot
        *_gate_w / *_gate_b: applied as np.dot(h, w.T) + b

    linear_before_reset selects where the reset gate is applied for the
    output gate: True multiplies the already-projected hidden state by
    sigmoid(reset_gate); False multiplies the hidden state first and
    projects afterwards.

    seq_lengths and drop_states are forwarded to gru_unit unchanged.

    Prints the dimensions once and the gate pre-activations at every step.

    Returns (hidden states after each step, shape (T, N, D);
             final hidden state reshaped to (1, N, D)).
    '''
    hidden_dim = hidden_input.shape[-1]
    num_steps = input.shape[0]
    batch = input.shape[1]
    gate_dim = input.shape[2]
    print("Dimensions: T= ", num_steps, " N= ", batch,
          " G= ", gate_dim, " D= ", hidden_dim)

    def project(state, weight, bias):
        # Affine map of the recurrent state for a single gate.
        return np.dot(state, weight.T) + bias

    # Slot 0 holds the initial state; slot t + 1 holds the state after step t.
    states = np.zeros(shape=(num_steps + 1, batch, hidden_dim))
    states[0, :, :] = hidden_input

    for t in range(num_steps):
        frame = input[t].reshape(1, batch, gate_dim)
        prev_state = states[t].reshape(1, batch, hidden_dim)

        # Split input contributions for three gates.
        frame = frame.reshape(batch, 3, hidden_dim)
        input_reset, input_update, input_output = (
            frame[:, slot, :].reshape(batch, hidden_dim) for slot in range(3)
        )

        reset_gate = project(prev_state, reset_gate_w, reset_gate_b)
        reset_gate = reset_gate + input_reset
        update_gate = project(prev_state, update_gate_w, update_gate_b)
        update_gate = update_gate + input_update

        if linear_before_reset:
            with_linear = project(prev_state, output_gate_w, output_gate_b)
            output_gate = sigmoid(reset_gate) * with_linear
        else:
            with_reset = prev_state * sigmoid(reset_gate)
            output_gate = project(with_reset, output_gate_w, output_gate_b)
        output_gate = output_gate + input_output

        gates_out_t = np.concatenate(
            (reset_gate, update_gate, output_gate),
            axis=2,
        )
        print(reset_gate, update_gate, output_gate, gates_out_t, sep="\n")

        (next_state, ) = gru_unit(
            prev_state,
            gates_out_t,
            seq_lengths,
            t,
            drop_states=drop_states
        )
        states[t + 1] = next_state

    final_state = states[-1].reshape(1, batch, hidden_dim)
    return (states[1:], final_state)
def gru_reference(input, hidden_input,
                  reset_gate_w, reset_gate_b,
                  update_gate_w, update_gate_b,
                  output_gate_w, output_gate_b,
                  seq_lengths, drop_states=False,
                  linear_before_reset=False):
    '''
    Runs a GRU over a whole sequence, one gru_unit call per time step.

    Shapes, with T, N, G = input.shape[0:3] and D = last dim of hidden_input:
        input: (T, N, G); each step is reshaped to (N, 3, D), and the
            three slots are read in the order reset, update, output
        hidden_input: initial state, assigned into an (N, D) slot
        *_gate_w / *_gate_b: applied as np.dot(h, w.T) + b

    linear_before_reset selects where the reset gate is applied for the
    output gate: True multiplies the already-projected hidden state by
    sigmoid(reset_gate); False multiplies the hidden state first and
    projects afterwards.

    seq_lengths and drop_states are forwarded to gru_unit unchanged.

    Prints the dimensions once and the gate pre-activations at every step.

    Returns (hidden states after each step, shape (T, N, D);
             final hidden state reshaped to (1, N, D)).
    '''
    hidden_dim = hidden_input.shape[-1]
    num_steps = input.shape[0]
    batch = input.shape[1]
    gate_dim = input.shape[2]
    print("Dimensions: T= ", num_steps, " N= ", batch,
          " G= ", gate_dim, " D= ", hidden_dim)

    def project(state, weight, bias):
        # Affine map of the recurrent state for a single gate.
        return np.dot(state, weight.T) + bias

    # Slot 0 holds the initial state; slot t + 1 holds the state after step t.
    states = np.zeros(shape=(num_steps + 1, batch, hidden_dim))
    states[0, :, :] = hidden_input

    for t in range(num_steps):
        frame = input[t].reshape(1, batch, gate_dim)
        prev_state = states[t].reshape(1, batch, hidden_dim)

        # Split input contributions for three gates.
        frame = frame.reshape(batch, 3, hidden_dim)
        input_reset, input_update, input_output = (
            frame[:, slot, :].reshape(batch, hidden_dim) for slot in range(3)
        )

        reset_gate = project(prev_state, reset_gate_w, reset_gate_b)
        reset_gate = reset_gate + input_reset
        update_gate = project(prev_state, update_gate_w, update_gate_b)
        update_gate = update_gate + input_update

        if linear_before_reset:
            with_linear = project(prev_state, output_gate_w, output_gate_b)
            output_gate = sigmoid(reset_gate) * with_linear
        else:
            with_reset = prev_state * sigmoid(reset_gate)
            output_gate = project(with_reset, output_gate_w, output_gate_b)
        output_gate = output_gate + input_output

        gates_out_t = np.concatenate(
            (reset_gate, update_gate, output_gate),
            axis=2,
        )
        print(reset_gate, update_gate, output_gate, gates_out_t, sep="\n")

        (next_state, ) = gru_unit(
            prev_state,
            gates_out_t,
            seq_lengths,
            t,
            drop_states=drop_states
        )
        states[t + 1] = next_state

    final_state = states[-1].reshape(1, batch, hidden_dim)
    return (states[1:], final_state)
def gru_unit(*args, **kwargs):
    '''
    Implements one GRU unit, for one time step

    Positional arguments:
        (hidden_t_prev, gates_out_t, seq_lengths, timestep) when the
        sequence_lengths keyword is truthy (the default), otherwise
        (hidden_t_prev, gates_out_t, timestep).

    Keyword arguments:
        drop_states (default False): previous state of rows past their
            sequence length is scaled by (1 - drop_states).
        sequence_lengths (default True): whether seq_lengths is supplied;
            when falsy every row is treated as valid.

    Shapes:
        hidden_t_prev.shape = (1, N, D)
        gates_out_t.shape   = (1, N, G), with G == 3 * D
        seq_lengths.shape   = (N,)

    Returns a 1-tuple holding the new hidden state, shape (1, N, D).
    '''
    drop_states = kwargs.get('drop_states', False)
    sequence_lengths = kwargs.get('sequence_lengths', True)

    if sequence_lengths:
        hidden_t_prev, gates_out_t, seq_lengths, timestep = args
    else:
        hidden_t_prev, gates_out_t, timestep = args

    batch = hidden_t_prev.shape[1]
    hidden_dim = hidden_t_prev.shape[2]
    gate_dim = gates_out_t.shape[2]
    flat = (batch, hidden_dim)

    step_grid = (timestep * np.ones(shape=flat)).astype(np.int32)
    assert step_grid.shape == flat
    assert gate_dim == 3 * hidden_dim

    # Calculate reset, update, and output gates separately
    # because output gate depends on reset gate.
    stacked = gates_out_t.reshape(batch, 3, hidden_dim)
    raw_reset, raw_update, raw_output = (
        stacked[:, slot, :].reshape(flat) for slot in range(3)
    )

    # Calculate gate outputs.
    # The reset activation is evaluated but not read by the update below.
    reset_gate_t = sigmoid(raw_reset)
    update_gate_t = sigmoid(raw_update)
    output_gate_t = tanh(raw_output)

    if sequence_lengths:
        seq_lengths = (
            np.ones(shape=flat) * seq_lengths.reshape(batch, 1)
        ).astype(np.int32)
        assert seq_lengths.shape == flat
        # 1 where the row is still inside its sequence, 0 otherwise.
        live = (step_grid < seq_lengths).astype(np.int32)
    else:
        live = np.ones(shape=flat)
    assert live.shape == flat

    blended = (
        update_gate_t * hidden_t_prev + (1 - update_gate_t) * output_gate_t
    )
    hidden_t = blended * live + hidden_t_prev * (1 - live) * (1 - drop_states)

    return (hidden_t.reshape(1, batch, hidden_dim), )
def gru_unit(*args, **kwargs):
    '''
    Implements one GRU unit, for one time step

    Positional arguments:
        (hidden_t_prev, gates_out_t, seq_lengths, timestep) when the
        sequence_lengths keyword is truthy (the default), otherwise
        (hidden_t_prev, gates_out_t, timestep).

    Keyword arguments:
        drop_states (default False): previous state of rows past their
            sequence length is scaled by (1 - drop_states).
        sequence_lengths (default True): whether seq_lengths is supplied;
            when falsy every row is treated as valid.

    Shapes:
        hidden_t_prev.shape = (1, N, D)
        gates_out_t.shape   = (1, N, G), with G == 3 * D
        seq_lengths.shape   = (N,)

    Returns a 1-tuple holding the new hidden state, shape (1, N, D).
    '''
    drop_states = kwargs.get('drop_states', False)
    sequence_lengths = kwargs.get('sequence_lengths', True)

    if sequence_lengths:
        hidden_t_prev, gates_out_t, seq_lengths, timestep = args
    else:
        hidden_t_prev, gates_out_t, timestep = args

    batch = hidden_t_prev.shape[1]
    hidden_dim = hidden_t_prev.shape[2]
    gate_dim = gates_out_t.shape[2]
    flat = (batch, hidden_dim)

    step_grid = (timestep * np.ones(shape=flat)).astype(np.int32)
    assert step_grid.shape == flat
    assert gate_dim == 3 * hidden_dim

    # Calculate reset, update, and output gates separately
    # because output gate depends on reset gate.
    stacked = gates_out_t.reshape(batch, 3, hidden_dim)
    raw_reset, raw_update, raw_output = (
        stacked[:, slot, :].reshape(flat) for slot in range(3)
    )

    # Calculate gate outputs.
    # The reset activation is evaluated but not read by the update below.
    reset_gate_t = sigmoid(raw_reset)
    update_gate_t = sigmoid(raw_update)
    output_gate_t = tanh(raw_output)

    if sequence_lengths:
        seq_lengths = (
            np.ones(shape=flat) * seq_lengths.reshape(batch, 1)
        ).astype(np.int32)
        assert seq_lengths.shape == flat
        # 1 where the row is still inside its sequence, 0 otherwise.
        live = (step_grid < seq_lengths).astype(np.int32)
    else:
        live = np.ones(shape=flat)
    assert live.shape == flat

    blended = (
        update_gate_t * hidden_t_prev + (1 - update_gate_t) * output_gate_t
    )
    hidden_t = blended * live + hidden_t_prev * (1 - live) * (1 - drop_states)

    return (hidden_t.reshape(1, batch, hidden_dim), )