def lstm(dh, dc, sv, x):
    # projected contribution from input(s), hidden, and bias
    proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

    it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
    ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

    it = sigmoid(it_proj)  # input gate(t)
    ft = sigmoid(ft_proj)  # forget-me-not gate(t)
    ot = sigmoid(ot_proj)  # output gate(t)

    # the following is the reading gate
    proj3rg = sigmoid(times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
    v = proj3rg * sv

    cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

    # need to do stabilization here?
    # update memory cell
    c = it * cx_t + ft * dc + tanh(times(v, Wfc))

    h = ot * tanh(c)

    return (h, c, v)
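For reference, the update computed by lstm() can be summarized as follows (this summary is read directly off the code above, not taken from an external source). Here $x_t$ is the input, $h_{t-1}$ and $c_{t-1}$ the previous hidden and cell state, $s_v$ the extra context vector sv, $\odot$ elementwise multiplication, and the cell returns $(h_t, c_t, v_t)$:

$$
\begin{aligned}
[\,i_t;\; f_t;\; o_t\,] &= \sigma\big(b + x_t W + h_{t-1} H + s_v H_{sv}\big) \\
r_t &= \sigma\big(x_t W_{rg} + h_{t-1} H_{rg} + s_v H_{svrg} + b_{rg}\big) \qquad \text{(reading gate)} \\
v_t &= r_t \odot s_v \\
\tilde{c}_t &= \tanh\big(x_t W_{cx} + h_{t-1} H_{cx}\big) \\
c_t &= i_t \odot \tilde{c}_t + f_t \odot c_{t-1} + \tanh\big(v_t W_{fc}\big) \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$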
def gru_cell(shape, init=glorot_uniform(), name=''):  # (x, s)
    """ GRU cell function """
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2  # (dim, dim): shape of the square hidden-to-hidden matrices Wz/Wr/Wh

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')  # the single GRU state s

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'), name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'), name='r')
    h = tanh(times(x, Uh, name='x*Uh') + times(element_times(Sn_1, r, name='Sprev*r'), Wh), name='h')
    s = plus(element_times((1 - z), h, name='(1-z)*h'), element_times(z, Sn_1, name='z*SPrev'), name=name)

    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
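In equation form, gru_cell() implements the standard GRU update; the notation below follows the parameter names in the code, with $s_{t-1}$ the previous state and $\odot$ elementwise multiplication:

$$
\begin{aligned}
z_t &= \sigma\big(x_t U_z + s_{t-1} W_z\big) \\
r_t &= \sigma\big(x_t U_r + s_{t-1} W_r\big) \\
\tilde{h}_t &= \tanh\big(x_t U_h + (s_{t-1} \odot r_t)\, W_h\big) \\
s_t &= (1 - z_t) \odot \tilde{h}_t + z_t \odot s_{t-1}
\end{aligned}
$$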
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform, init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False):  # (x, (h, c))

    use_peepholes             = use_peepholes             if _is_given(use_peepholes)             else _current_default_options.use_peepholes
    enable_self_stabilization = enable_self_stabilization if _is_given(enable_self_stabilization) else _current_default_options.enable_self_stabilization

    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        # otherwise we'd need to fix slicing and Param initializers
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")

    stack_axis = -1  # stacking along the fastest-changing dimension, to match BS

    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked, init=init_bias, name='b')                              # bias
    W  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='W')                              # input
    A  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='A') if has_aux else None         # aux input (optional)
    H  = Parameter(shape +     cell_shape_stacked, init=init,      name='H')                              # hidden-to-hidden
    Ci = Parameter(            cell_shape,         init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden (applied elementwise)
    Cf = Parameter(            cell_shape,         init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden (applied elementwise)
    Co = Parameter(            cell_shape,         init=init,      name='Co') if use_peepholes else None  # cell-to-hidden (applied elementwise)

    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None  # final projection

    Sdh = Stabilizer() if enable_self_stabilization else identity
    Sdc = Stabilizer() if enable_self_stabilization else identity
    Sct = Stabilizer() if enable_self_stabilization else identity
    Sht = Stabilizer() if enable_self_stabilization else identity

    def create_hc_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return (Placeholder(shape=shape, name='hPh'), Placeholder(shape=cell_shape, name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: the input does not get a stabilizer here; the user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    # ('aux' is an auxiliary input that only exists when has_aux is True, which is hard-coded off above)
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj  = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ft_proj  = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
    ot_proj  = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it  = sigmoid(peep(it_proj, dcs, Ci))     # input gate(t)
    bit = it * tanh(bit_proj)                 # applied to tanh of input network

    ft  = sigmoid(peep(ft_proj, dcs, Cf))     # forget-me-not gate(t)
    bft = ft * dc                             # applied to cell(t-1)

    ct = bft + bit                            # c(t) is the sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)                        # applied to tanh(cell(t))

    c = ct                                    # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    # note that this function will only exist on the object returned here, but not on any cloned version of it
    apply_x_h_c.create_placeholder = create_hc_placeholder
    # return Block(apply_x_h_c, 'LSTM')  # BUGBUG: fails with "RuntimeError: A Function instance with
    #                                    # more than one output cannot be implicitly converted to a Variable"
    return apply_x_h_c
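Read directly off the code above, the step computed by this block is summarized below. $S_{dh}, S_{dc}, S_{ct}, S_{ht}$ are the optional self-stabilizers (identity when enable_self_stabilization is False), $C_i, C_f, C_o$ the optional peephole weights (these terms drop out when use_peepholes is False), $W_{mr}$ the optional output projection, and the superscripts on $p_t$ denote the four slices of the stacked projection along stack_axis; the auxiliary-input term is omitted since has_aux is hard-coded to False:

$$
\begin{aligned}
p_t &= b + x_t W + S_{dh}(h_{t-1})\, H \\
i_t &= \sigma\big(p_t^{(i)} + C_i \odot S_{dc}(c_{t-1})\big) \\
f_t &= \sigma\big(p_t^{(f)} + C_f \odot S_{dc}(c_{t-1})\big) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tanh\big(p_t^{(\tilde{c})}\big) \\
o_t &= \sigma\big(p_t^{(o)} + C_o \odot S_{ct}(c_t)\big) \\
h_t &= \begin{cases} S_{ht}\big(o_t \odot \tanh(c_t)\big)\, W_{mr} & \text{with projection (cell\_shape given)} \\ o_t \odot \tanh(c_t) & \text{otherwise} \end{cases}
\end{aligned}
$$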