def _step(self, y_tm1, y_m, s_tm1, h, x_m):
    # attention
    pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)[None, :, :]
    pctx__ = T.tanh(pctx__)
    e = T.dot(pctx__, self.U_att) + self.b_att
    e = T.exp(e.reshape((e.shape[0], e.shape[1])))
    e = e / e.sum(0, keepdims=True)
    e = e * x_m
    c = (h * e[:, :, None]).sum(0)

    z = T.nnet.sigmoid(T.dot(y_tm1, self.W_z) + self.b_z + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
    r = T.nnet.sigmoid(T.dot(y_tm1, self.W_r) + self.b_r + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
    hh_t = T.tanh(T.dot(y_tm1, self.W_h) + self.b_h + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
    s_t = z * s_tm1 + (1 - z) * hh_t
    s_t = (1. - y_m)[:, None] * s_tm1 + y_m[:, None] * s_t

    logit = T.tanh(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl) + T.dot(c, self.W_cl))

    return T.cast(s_t, dtype=theano.config.floatX), T.cast(logit, dtype=theano.config.floatX)
def _step(x_, h_, c_, pred_, prob_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(h_[it], self.U[it])
        preact += T.dot(x_, self.W[it]) + self.b[it]

        i = T.nnet.sigmoid(_slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(_slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(_slice(preact, 2, self.n_dim))
        c = T.tanh(_slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)

        x_ = h

    q = T.dot(h, self.L) + self.b0
    prob = T.nnet.softmax(q)
    pred = T.argmax(prob, axis=1)

    return T.stack(h_a).squeeze(), T.stack(c_a).squeeze(), pred, prob
def step(self, X, previous_hidden, previous_state):
    if self.use_input_peep:
        input_gate = T.nnet.sigmoid(T.dot(X, self.Wi) + T.dot(previous_hidden, self.Ui) + T.dot(previous_state, self.Pi) + self.bi)
    else:
        input_gate = T.nnet.sigmoid(T.dot(X, self.Wi) + T.dot(previous_hidden, self.Ui) + self.bi)
    candidate_state = T.tanh(T.dot(X, self.Wg) + T.dot(previous_hidden, self.Ug) + self.bg)

    if self.use_forget_gate:
        if self.use_forget_peep:
            forget_gate = T.nnet.sigmoid(T.dot(X, self.Wf) + T.dot(previous_hidden, self.Uf) + T.dot(previous_state, self.Pf) + self.bf)
        else:
            forget_gate = T.nnet.sigmoid(T.dot(X, self.Wf) + T.dot(previous_hidden, self.Uf) + self.bf)
        state = candidate_state * input_gate + previous_state * forget_gate
    else:
        state = candidate_state * input_gate + previous_state * 0

    if self.use_output_peep:
        output_gate = T.nnet.sigmoid(T.dot(X, self.Wo) + T.dot(previous_hidden, self.Uo) + T.dot(previous_state, self.Po) + self.bo)
    else:
        output_gate = T.nnet.sigmoid(T.dot(X, self.Wo) + T.dot(previous_hidden, self.Uo) + self.bo)

    if self.use_tanh_output:
        output = output_gate * T.tanh(state)
    else:
        output = output_gate * state

    return output, state
def __call__(self, x, h, prev_cell):
    z = x.dot(self.W_x) + h.dot(self.U_h) + self.b

    def _get_unit(matrix, unit, dim):
        slice_num = self.units[unit]
        # assume all slices have the same dimension
        return matrix[:, slice_num * dim: (slice_num + 1) * dim]

    # input gate
    i = T.nnet.sigmoid(_get_unit(z, 'i', self.unit_size))
    # candidate memory cell
    candidate = T.tanh(_get_unit(z, 'c', self.unit_size))
    # forget gate
    f = T.nnet.sigmoid(_get_unit(z, 'f', self.unit_size))
    # output gate (note it doesn't involve the current memory cell)
    o = T.nnet.sigmoid(_get_unit(z, 'o', self.unit_size))

    next_cell = i * candidate + f * prev_cell
    h = o * T.tanh(next_cell)

    return [next_cell, h]
def generate(self, h_, c_, x_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(x_, self.W[it])
        preact += T.dot(h_[it], self.U[it]) + self.b[it]

        i = T.nnet.sigmoid(self.slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(self.slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(self.slice(preact, 2, self.n_dim))
        c = T.tanh(self.slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)

        x_ = h

    q = T.dot(h, self.L) + self.b0
    # mask = T.concatenate([T.alloc(np_floatX(1.), q.shape[0] - 1), T.alloc(np_floatX(0.), 1)])
    prob = T.nnet.softmax(q / 1)
    return prob, T.stack(h_a).squeeze(), T.stack(c_a)[0].squeeze()
def dev_loss(self, dev_types, dev_lams, ss_ratio, y):
    su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1))
    un_mask = T.eq(y, 0).reshape((y.shape[0], 1))
    ss_mask = su_mask + un_mask
    var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask)
    tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2))
    norm_fun = lambda x1, x2: var_fun(
        (x1 / T.sqrt(T.sum(x1**2.0, axis=1, keepdims=1) + 1e-6)),
        (x2 / T.sqrt(T.sum(x2**2.0, axis=1, keepdims=1) + 1e-6)))
    sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2))
    cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy(
        T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0]

    L = 0.0
    for i in xrange(self.layer_count):
        if (i < (self.layer_count - 1)):
            x1 = self.layers[i].output
            x2 = self.drop_nets[0][i].output
        else:
            x1 = self.layers[i].linear_output
            x2 = self.drop_nets[0][i].linear_output
        if (dev_types[i] == 1):
            L = L + (dev_lams[i] * norm_fun(x1, x2))
        elif (dev_types[i] == 2):
            L = L + (dev_lams[i] * tanh_fun(x1, x2))
        elif (dev_types[i] == 3):
            L = L + (dev_lams[i] * sigm_fun(x1, x2))
        elif (dev_types[i] == 4):
            L = L + (dev_lams[i] * cent_fun(x1, x2))
        else:
            L = L + (dev_lams[i] * var_fun(x1, x2))
    return L
def tanh_actfun(x, scale=None):
    """Compute rescaled tanh activation for x."""
    if scale is None:
        x_tanh = T.tanh(x)
    else:
        x_tanh = scale * T.tanh(constFX(1 / scale) * x)
    return x_tanh
def step(self, i_t, x_t, z_t, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    if self.recurrent_transform:
        state_vars = other_args[:len(self.recurrent_transform.state_vars)]
        self.recurrent_transform.set_sorted_state_vars(state_vars)
        z_r, r_updates = self.recurrent_transform.step(y_p)
        z_t += z_r
        for v in self.recurrent_transform.get_sorted_state_vars():
            other_outputs += [r_updates[v]]
    z_t += T.dot(y_p, self.W_re)
    partition = z_t.shape[1] // 4  # number of units
    forgetgate = T.nnet.sigmoid(z_t[:, :partition])
    propgate = T.nnet.sigmoid(z_t[:, partition:2*partition])
    diffgate = T.nnet.sigmoid(z_t[:, 2*partition:3*partition])
    input = T.tanh(z_t[:, 3*partition:4*partition])

    # c(t) = (1 - FG(t)) * IN(t) + FG(t) * c(t-1)
    c_t = (1 - forgetgate) * input + forgetgate * c_p
    # y(t) = tanh( PG(t) * c(t) + DG(t) * (c(t) - c(t-1)) )
    # HINT: the additional nonlinearity may not have a significant effect
    y_t = T.tanh(propgate * c_t + diffgate * (c_t - c_p))

    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
def recurrence(x_t, c_tm1, h_tm1):
    i_t = T.nnet.sigmoid(T.dot(x_t, self.w_xi) + T.dot(h_tm1, self.w_hi) + self.b_i)  # + T.dot(c_tm1, self.w_ci)
    f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + T.dot(h_tm1, self.w_hf) + self.b_f)  # + T.dot(c_tm1, self.w_cf)
    c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.w_xc) + T.dot(h_tm1, self.w_hc) + self.b_c)
    o_t = T.nnet.sigmoid(T.dot(x_t, self.w_xo) + T.dot(h_tm1, self.w_ho) + self.b_o)  # + T.dot(c_t, self.w_co)
    h_t = o_t * T.tanh(c_t)
    return [c_t, h_t]
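# Hedged usage sketch (an assumption, not part of the original source): one plausible way to
# unroll the `recurrence` step above over a whole sequence with theano.scan. `x_seq` is a
# (n_steps, n_in) matrix, `n_hidden` is the cell size, and the shared weights referenced
# inside `recurrence` are assumed to already exist in its enclosing scope.
import theano
import theano.tensor as T

x_seq = T.matrix('x_seq')
c0 = T.zeros((n_hidden,), dtype=theano.config.floatX)
h0 = T.zeros((n_hidden,), dtype=theano.config.floatX)
[c_seq, h_seq], _ = theano.scan(fn=recurrence,      # (x_t, c_tm1, h_tm1) -> [c_t, h_t]
                                sequences=x_seq,
                                outputs_info=[c0, h0])
# h_seq now holds the hidden state at every time step.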
def _step(m_, x_, h_, c_):
    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_

    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

    if has_input_gate:
        if has_forget_gate:
            c = f * c_ + i * c
        else:
            c = c_ + i * c
    else:
        if has_forget_gate:
            c = f * c_ + c
        else:
            c = c_ + c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    if has_output_gate:
        h = o * tensor.tanh(c)
    else:
        h = tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    return h, c
def step(self, i_t, x_t, z_t, att_p, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    # att_p = theano.printing.Print('att in lstms', attrs=['__str__'])(att_p)
    if self.recurrent_transform:
        state_vars = other_args[:len(self.recurrent_transform.state_vars)]
        self.recurrent_transform.set_sorted_state_vars(state_vars)
        z_r, r_updates = self.recurrent_transform.step(y_p)
        z_t += z_r
        for v in self.recurrent_transform.get_sorted_state_vars():
            other_outputs += [r_updates[v]]
    maxatt = att_p.repeat(z_t.shape[1]).reshape((z_t.shape[0], z_t.shape[1]))  # .dimshuffle(1, 0)
    # maxatt = theano.printing.Print('maxatt', attrs=['__str__', 'shape'])(maxatt)
    z_t = T.switch(maxatt > 0, z_t, z_t + T.dot(y_p, self.W_re))
    # z_t += T.dot(y_p, self.W_re)
    # z_t = theano.printing.Print('z_t lstms', attrs=['shape'])(z_t)

    partition = z_t.shape[1] // 4
    ingate = T.nnet.sigmoid(z_t[:, :partition])
    forgetgate = ((T.nnet.sigmoid(z_t[:, partition:2*partition])).T * (1. - att_p)).T
    outgate = T.nnet.sigmoid(z_t[:, 2*partition:3*partition])
    input = T.tanh(z_t[:, 3*partition:4*partition])

    # c_t = ((forgetgate * c_p + ingate * input).T * (1. - T.max(att_p, axis=-1))).T
    c_t = forgetgate * c_p + ingate * input
    y_t = outgate * T.tanh(c_t)

    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
def step_fn(current_input_to_state, prev_c, prev_h):
    # all args have shape (batch size, output_dim, height)

    # TODO consider learning this padding
    prev_h_padded = T.zeros((batch_size, output_dim, 1 + height), dtype=theano.config.floatX)
    prev_h_padded = T.inc_subtensor(prev_h_padded[:, :, 1:], prev_h)

    state_to_state = lib.ops.conv1d.Conv1D(
        name + '.StateToState',
        output_dim,
        4 * output_dim,
        2,
        prev_h_padded,
        biases=False
    )

    gates = current_input_to_state + state_to_state

    o_f_i = T.nnet.sigmoid(gates[:, :3*output_dim, :])
    o = o_f_i[:, 0*output_dim:1*output_dim, :]
    f = o_f_i[:, 1*output_dim:2*output_dim, :]
    i = o_f_i[:, 2*output_dim:3*output_dim, :]
    g = T.tanh(gates[:, 3*output_dim:4*output_dim, :])

    new_c = (f * prev_c) + (i * g)
    new_h = o * T.tanh(new_c)

    return (new_c, new_h)
def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
    '''
    Inner function encapsulating a propagation step.
    This is how we calculated the hidden state in a simple RNN. No longer!
    s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))
    '''
    # Word embedding layer
    x_e = E[:, x_t]

    # GRU Layer 1
    z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
    r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
    c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
    s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

    # GRU Layer 2
    z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
    r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
    c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
    s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

    # Final output calculation
    # Theano's softmax returns a matrix with one row, we only need the row
    o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

    return [o_t, s_t1, s_t2]
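# Hedged usage sketch (an assumption, not part of the original source): unrolling the two-layer
# GRU step above over a vector of word indices `x` with theano.scan. E, U, W, b, V, c are the
# shared parameters the step function closes over, and `hidden_dim` is the GRU state size.
import theano
import theano.tensor as T

x = T.ivector('x')
[o, s1, s2], updates = theano.scan(
    forward_prop_step,
    sequences=x,
    outputs_info=[None,
                  dict(initial=T.zeros(hidden_dim)),
                  dict(initial=T.zeros(hidden_dim))],
    truncate_gradient=-1)
prediction = T.argmax(o, axis=1)   # most likely word at each position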
def fprop_step_mask(self, state_below, mask, state_before, U):
    """
    Scan function for the case using masks.

    Parameters
    ----------
    state_below : TheanoTensor
        todo
    """
    g_on = state_below + tensor.dot(state_before[:, :self.dim], U)
    i_on = tensor.nnet.sigmoid(g_on[:, :self.dim])
    f_on = tensor.nnet.sigmoid(g_on[:, self.dim:2*self.dim])
    o_on = tensor.nnet.sigmoid(g_on[:, 2*self.dim:3*self.dim])

    z = tensor.set_subtensor(state_before[:, self.dim:],
                             f_on * state_before[:, self.dim:] +
                             i_on * tensor.tanh(g_on[:, 3*self.dim:]))
    z = tensor.set_subtensor(z[:, :self.dim],
                             o_on * tensor.tanh(z[:, self.dim:]))

    # Only update the state for non-masked data, otherwise
    # just carry on the previous state until the end
    z = mask[:, None] * z + (1 - mask[:, None]) * state_before

    return z
def step(x_t, m, h_tm1, c_tm1, ctx_t, att, pctx_):
    projected_state = T.dot(h_tm1, Wd_att)
    pctx_ = T.tanh(pctx_ + projected_state[None, :, :])
    new_att = T.dot(pctx_, U_att) + c_att
    new_att = new_att.reshape([new_att.shape[0], new_att.shape[1]])
    new_att = T.exp(new_att) * context_mask
    new_att = new_att / new_att.sum(axis=0, keepdims=True)
    # Current context
    ctx_t = (context * new_att[:, :, None]).sum(axis=0)

    preactivation = T.dot(h_tm1, U)
    preactivation += x_t
    preactivation += T.dot(ctx_t, Wc)

    i_t = T.nnet.sigmoid(_slice(preactivation, 0, hidden_size))
    f_t = T.nnet.sigmoid(_slice(preactivation, 1, hidden_size))
    o_t = T.nnet.sigmoid(_slice(preactivation, 2, hidden_size))
    c_t = T.tanh(_slice(preactivation, 3, hidden_size))

    c_t = f_t * c_tm1 + i_t * c_t
    c_t = m[:, None] * c_t + (1. - m)[:, None] * c_tm1

    h_t = o_t * T.tanh(c_t)
    h_t = m[:, None] * h_t + (1. - m)[:, None] * h_tm1

    return (h_t, c_t, ctx_t, new_att.T, projected_state,
            i_t, f_t, o_t, preactivation)
def step(x_t, m_t, att_i_t, h_tm1, ctx_tm1, att_w_tm1, proj_hid_att,
         conc_hidden, U, W, W_cth, W_ctc, Ws_att, Wp_att, bp_att, Wc_att,
         Urz, hidden_mask):
    att_s = tensor.dot(h_tm1, Ws_att)
    att = proj_hid_att + att_s[None, :, :]
    att += att_i_t
    att = tensor.tanh(att)
    att_w_t = tensor.dot(att, Wp_att) + bp_att
    att_w_t = att_w_t.reshape((att_w_t.shape[0], att_w_t.shape[1]))  # ?
    att_w_t_max = (att_w_t * hidden_mask).max(axis=0, keepdims=True)
    att_w_t = tensor.exp(att_w_t - att_w_t_max)
    att_w_t = hidden_mask * att_w_t
    att_w_t = att_w_t / att_w_t.sum(axis=0, keepdims=True)
    ctx_t = (conc_hidden * att_w_t[:, :, None]).sum(axis=0)

    projected_state = tensor.dot(h_tm1, Urz)
    projected_state += tensor.dot(ctx_t, W_cth)

    r = tensor.nnet.sigmoid(_slice(x_t, 0) + _slice(projected_state, 0))
    z = tensor.nnet.sigmoid(_slice(x_t, 1) + _slice(projected_state, 1))
    candidate_h_t = tensor.tanh(_slice(x_t, 2) + r * tensor.dot(h_tm1, U) +
                                tensor.dot(ctx_t, W_ctc))

    h_ti = z * h_tm1 + (1. - z) * candidate_h_t
    h_t = m_t[:, None] * h_ti + (1 - m_t)[:, None] * h_tm1
    return h_t, ctx_t, att_w_t.T
def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
    h_t = h_tm1
    c_t = c_tm1
    # select the input vector to use for this edge (source)
    x_t_i = x[input_idx_t, :]
    # zero out the input unless this is a leaf node
    x_t_0 = T.switch(T.eq(T.sum(edge_mask_t), 0), x_t_i, x_t_i * 0)
    # concatenate with the input edge vector
    x_t_edge = T.concatenate([x_t_0, edge_t])

    # compute attention weights, using a manual softmax
    attention_scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1)))  # (1, n_edges)
    # find the max of the unmasked values
    max_score = T.max(attention_scores + edge_mask_t * 10000.0) - 10000.0
    # exponentiate the differences, masking first to avoid inf, and then to keep only relevant scores
    exp_scores = T.exp((attention_scores - max_score) * edge_mask_t) * edge_mask_t
    # take the sum, and add one if the mask is all zeros to avoid an inf
    exp_scores_sum = T.sum(exp_scores) + T.switch(T.eq(T.sum(edge_mask_t), 0), 1.0, 0.0)
    # normalize to compute the weights
    weighted_mask = exp_scores / exp_scores_sum

    i_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_i) + T.sum(T.dot(self.W_h_i.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_i)
    f_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_f) + T.sum(T.dot(self.W_h_f.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_f)
    o_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_o) + T.sum(T.dot(self.W_h_o.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_o)
    u_t = T.tanh(T.dot(x_t_edge, self.W_x_u) + T.sum(T.dot(self.W_h_u.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_u)

    c_temp = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0)
    h_temp = o_t * T.tanh(c_temp)

    h_t = T.set_subtensor(h_t[:, counter_t], h_temp)
    c_t = T.set_subtensor(c_t[:, counter_t], c_temp)

    return h_t, c_t
def _step_slice(m_, x_, xx_, xc_, h_, ctx_, alpha_, pctx_, cc_,
                U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
    # attention
    pstate_ = tensor.dot(h_, Wd_att)
    pctx__ = pctx_ + pstate_[None, :, :]
    pctx__ += xc_
    pctx__ = tensor.tanh(pctx__)
    alpha = tensor.dot(pctx__, U_att) + c_tt
    alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
    alpha = tensor.exp(alpha)
    if context_mask:
        alpha = alpha * context_mask
    alpha = alpha / alpha.sum(0, keepdims=True)
    ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

    preact = tensor.dot(h_, U)
    preact += x_
    preact += tensor.dot(ctx_, Wc)
    preact = tensor.nnet.sigmoid(preact)

    r = _slice(preact, 0, dim)
    u = _slice(preact, 1, dim)

    preactx = tensor.dot(h_, Ux)
    preactx *= r
    preactx += xx_
    preactx += tensor.dot(ctx_, Wcx)

    h = tensor.tanh(preactx)

    h = u * h_ + (1. - u) * h
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    return h, ctx_, alpha.T  # , pstate_, preact, preactx, r, u
def _step(self, m_, x_, h_, c_):
    i_preact = (index_dot(x_, self.W_i) +
                T.dot(h_, self.U_i) + self.b_i)
    i = T.nnet.sigmoid(i_preact)

    f_preact = (index_dot(x_, self.W_f) +
                T.dot(h_, self.U_f) + self.b_f)
    f = T.nnet.sigmoid(f_preact)

    o_preact = (index_dot(x_, self.W_o) +
                T.dot(h_, self.U_o) + self.b_o)
    o = T.nnet.sigmoid(o_preact)

    c_preact = (index_dot(x_, self.W_c) +
                T.dot(h_, self.U_c) + self.b_c)
    c = T.tanh(c_preact)

    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * T.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    return h, c
def _step(c, c_m, hidden, c_matrix):
    node_idx = c[:, 0]
    left_child_idx = c[:, 1]
    right_child_idx = c[:, 2]
    all_samples = T.arange(n_samples)

    recursive = (
        T.dot(hidden[left_child_idx, all_samples, :], self.W) +
        T.dot(hidden[right_child_idx, all_samples, :], self.U) +
        self.b
    )
    i = T.nnet.sigmoid(_slice(recursive, 0, self.dim_proj))
    f1 = T.nnet.sigmoid(_slice(recursive, 1, self.dim_proj))
    f2 = T.nnet.sigmoid(_slice(recursive, 2, self.dim_proj))
    o = T.nnet.sigmoid(_slice(recursive, 3, self.dim_proj))
    c_prime = T.tanh(_slice(recursive, 4, self.dim_proj))

    new_c = (
        i * c_prime +
        f1 * c_matrix[left_child_idx, all_samples, :] +
        f2 * c_matrix[right_child_idx, all_samples, :]
    )
    new_c_masked = (c_m[:, None] * new_c +
                    (1.0 - c_m[:, None]) * c_matrix[node_idx, all_samples, :])

    new_h = o * T.tanh(new_c_masked)
    new_h_masked = (c_m[:, None] * new_h +
                    (1.0 - c_m[:, None]) * hidden[node_idx, all_samples, :])

    return (
        T.set_subtensor(hidden[node_idx, all_samples], new_h_masked),
        T.set_subtensor(c_matrix[node_idx, all_samples], new_c_masked),
    )
def recurrence(sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec,
               mu_z_t, sigma_z_t, mu_x_tm1, sigma_x_tm1, v):
    if v is not None:
        # error input
        v_hat = v - (mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible))))
        r_t = T.concatenate([v, v_hat], axis=1)
    else:
        # error input
        v_hat = mu_x_tm1 - (mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible))))
        r_t = T.concatenate([mu_x_tm1, v_hat], axis=1)

    # v_enc = [r_t, h_tm1_dec]
    v_enc = T.concatenate([r_t, h_tm1_dec], axis=1)

    # Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
    i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) + T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
    f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) + T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
    c_t_enc = (f_t_enc * c_tm1_enc) + (i_t_enc * T.tanh(T.dot(v_enc, Wvc_enc) + T.dot(h_tm1_enc, Whc_enc) + bc_enc))
    o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) + T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
    h_t_enc = o_t_enc * T.tanh(c_t_enc)

    # Get z_t
    mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z) + b_mu_z
    sigma_z_t = sigma_b + T.nnet.softplus(T.dot(h_t_enc, Wh_enc_sig_z) + b_sig_z)
    # sample = theano_rng.normal(size=mew_t.shape, avg=0, std=1, dtype=theano.config.floatX)
    z_t = mu_z_t + (sigma_z_t * (sample_z_t.reshape((batch_size, n_z))))

    # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t)
    i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) + T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
    f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) + T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t, Wzf_dec))
    c_t_dec = (f_t_dec * c_tm1_dec) + (i_t_dec * T.tanh(T.dot(z_t, Wzc_dec) + T.dot(h_tm1_dec, Whc_dec) + bc_dec))
    o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) + T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
    h_t_dec = o_t_dec * T.tanh(c_t_dec)

    # Get w_t
    mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
    sigma_x_t = sigma_b + T.nnet.softplus(T.dot(h_t_dec, Wh_dec_sig_x) + b_sig_x)

    return [h_t_enc, h_t_dec, c_t_enc, c_t_dec, mu_z_t, sigma_z_t, mu_x_t, sigma_x_t]
def step(x, prev_h, prev_c):
    input_gate = T.nnet.sigmoid(
        T.dot(x, P.W_input_in) +
        T.dot(prev_h, P.W_hidden_in) +
        T.dot(prev_c, P.W_cell_in) +
        P.b_in
    )
    forget_gate = T.nnet.sigmoid(
        T.dot(x, P.W_input_forget) +
        T.dot(prev_h, P.W_hidden_forget) +
        T.dot(prev_c, P.W_cell_forget) +
        P.b_forget
    )
    curr_c = forget_gate * prev_c + input_gate * T.tanh(
        T.dot(x, P.W_input_cell) +
        T.dot(prev_h, P.W_hidden_cell) +
        P.b_cell
    )
    output_gate = T.nnet.sigmoid(
        T.dot(x, P.W_input_output) +
        T.dot(prev_h, P.W_hidden_output) +
        T.dot(curr_c, P.W_cell_output) +
        P.b_output
    )
    curr_h = output_gate * T.tanh(curr_c)
    return curr_h, curr_c
def _step(x_, xb_, h_, c_, hb_, cb_):
    preact = T.dot(h_, tparams[_p(prefix, 'U')])
    preact += T.dot(x_, tparams[_p(prefix, 'W')])
    preact += tparams[_p(prefix, 'b')]

    i = T.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = T.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = T.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = T.tanh(_slice(preact, 3, options['dim_proj']))

    c = f * c_ + i * c
    h = o * T.tanh(c)

    preactb = T.dot(hb_, tparams[_p(prefix, 'Ub')])
    preactb += T.dot(xb_, tparams[_p(prefix, 'Wb')])
    preactb += tparams[_p(prefix, 'bb')]

    ib = T.nnet.sigmoid(_slice(preactb, 0, options['dim_proj']))
    fb = T.nnet.sigmoid(_slice(preactb, 1, options['dim_proj']))
    ob = T.nnet.sigmoid(_slice(preactb, 2, options['dim_proj']))
    cb = T.tanh(_slice(preactb, 3, options['dim_proj']))

    cb = fb * cb_ + ib * cb
    hb = ob * T.tanh(cb)

    # take the reverse of hb and concatenate with h before feeding into logistic regression
    hhb = T.concatenate([h, hb[::-1]])

    # a single frame prediction given h - the posterior probability
    one_pred = T.nnet.softmax(T.dot(hhb, tparams['U']) + tparams['b'])

    return h, c, hb, cb, one_pred
def get_ht_ct(self, xWxi_t, xWxf_t, xWxc_t, xWxo_t, h_t1, c_t1):
    i_t = T.nnet.sigmoid(xWxi_t + h_t1.dot(self.Whi) + c_t1.dot(self.Wci) + self.bi)
    f_t = T.nnet.sigmoid(xWxf_t + h_t1.dot(self.Whf) + c_t1.dot(self.Wcf) + self.bf)
    c_t = f_t * c_t1 + i_t * T.tanh(xWxc_t + h_t1.dot(self.Whc) + self.bc)
    o_t = T.nnet.sigmoid(xWxo_t + h_t1.dot(self.Who) + c_t.dot(self.Wco) + self.bo)
    h_t = o_t * T.tanh(c_t)
    return h_t, c_t
def forward_step(x_t, prev_state, prev_content, prev_state_2, prev_content_2):
    input_gate = T.nnet.hard_sigmoid(
        T.dot(self.U_input, x_t) + T.dot(self.W_input, prev_state) + self.bias_input)
    forget_gate = T.nnet.hard_sigmoid(
        T.dot(self.U_forget, x_t) + T.dot(self.W_forget, prev_state) + self.bias_forget)
    output_gate = T.nnet.hard_sigmoid(
        T.dot(self.U_output, x_t) + T.dot(self.W_output, prev_state) + self.bias_output)
    stabilized_input = T.tanh(T.dot(self.U, x_t) + T.dot(self.W, prev_state) + self.bias)

    c = forget_gate * prev_content + input_gate * stabilized_input
    s1 = output_gate * T.tanh(c)

    input_gate_2 = T.nnet.hard_sigmoid(
        T.dot(self.U_input_2, s1) + T.dot(self.W_input_2, prev_state_2) + self.bias_input_2)
    forget_gate_2 = T.nnet.hard_sigmoid(
        T.dot(self.U_forget_2, s1) + T.dot(self.W_forget_2, prev_state_2) + self.bias_forget_2)
    output_gate_2 = T.nnet.hard_sigmoid(
        T.dot(self.U_output_2, s1) + T.dot(self.W_output_2, prev_state_2) + self.bias_output_2)
    stabilized_input_2 = T.tanh(T.dot(self.U_2, s1) + T.dot(self.W_2, prev_state_2) + self.bias_2)

    c2 = forget_gate_2 * prev_content_2 + input_gate_2 * stabilized_input_2
    s2 = output_gate_2 * T.tanh(c2)

    o = T.nnet.sigmoid(T.dot(self.O_w, s2) + self.O_bias)

    return [o, s1, c, s2, c2, input_gate, forget_gate, output_gate]
def convolutional_model(X, w_1, w_2, w_3, w_4, w_5, w_6, p_1, p_2, p_3, p_4, p_5):
    l1 = dropout(T.tanh(max_pool_2d(T.maximum(conv2d(X, w_1, border_mode='full'), 0.),
                                    (2, 2), ignore_border=True) +
                        b_1.dimshuffle('x', 0, 'x', 'x')), p_1)
    l2 = dropout(T.tanh(max_pool_2d(T.maximum(conv2d(l1, w_2), 0.),
                                    (2, 2), ignore_border=True) +
                        b_2.dimshuffle('x', 0, 'x', 'x')), p_2)
    # flatten to switch back to 1d layers
    l3 = dropout(T.flatten(T.tanh(max_pool_2d(T.maximum(conv2d(l2, w_3), 0.),
                                              (2, 2), ignore_border=True) +
                                  b_3.dimshuffle('x', 0, 'x', 'x')),
                           outdim=2), p_3)
    l4 = dropout(T.maximum(T.dot(l3, w_4), 0.), p_4)
    l5 = dropout(T.maximum(T.dot(l4, w_5), 0.), p_5)
    return T.dot(l5, w_6)
def _step(self, y_tm1, s_tm1, h):
    # attention
    pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)
    # pctx__ += T.dot(y_t, self.W_yc)
    e = T.exp(T.tanh(pctx__))
    # e = T.dot(pctx__, self.U_z)
    e = e / e.sum(0, keepdims=True)
    c = T.dot(e.T, h)
    # c = (h * e[:, :, None]).sum(0)

    # update/reset gates: sigmoid keeps them in [0, 1] (tanh here would let the
    # interpolation weights go negative)
    z = T.nnet.sigmoid(T.dot(y_tm1, self.W_z) + self.b_z + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
    r = T.nnet.sigmoid(T.dot(y_tm1, self.W_r) + self.b_r + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
    hh_t = T.tanh(T.dot(y_tm1, self.W_h) + self.b_h + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
    s_t = z * s_tm1 + (1 - z) * hh_t

    logit = T.tanh(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl) + T.dot(c, self.W_cl))

    return T.cast(s_t, dtype=theano.config.floatX), logit
def theano_setup(self):
    # The matrices Wb and Wc were originally tied.
    # Because of that, I decided to keep Wb and Wc with
    # the same shape (instead of being transposed) to
    # avoid disturbing the code as much as possible.
    Wb = T.dmatrix('Wb')
    Wc = T.dmatrix('Wc')
    b = T.dvector('b')
    c = T.dvector('c')
    s = T.dscalar('s')
    x = T.dmatrix('x')

    h_act = T.dot(x, Wc) + c
    if self.act_func[0] == 'tanh':
        h = T.tanh(h_act)
    elif self.act_func[0] == 'sigmoid':
        h = T.nnet.sigmoid(h_act)
    elif self.act_func[0] == 'id':
        # bad idea
        h = h_act
    else:
        raise ValueError("Invalid act_func[0]")

    r_act = T.dot(h, Wb.T) + b
    if self.act_func[1] == 'tanh':
        r = s * T.tanh(r_act)
    elif self.act_func[1] == 'sigmoid':
        r = s * T.nnet.sigmoid(r_act)
    elif self.act_func[1] == 'id':
        r = s * r_act
    else:
        raise ValueError("Invalid act_func[1]")

    # Another variable to be able to call a function
    # with a noisy x and compare it to a reference x.
    y = T.dmatrix('y')

    loss = ((r - y)**2)
    sum_loss = T.sum(loss)

    # theano_encode_decode : vectorial function in argument X.
    # theano_loss : vectorial function in argument X.
    # theano_gradients : returns triplet of gradients, each of
    #                    which involves all the data X summed,
    #                    so it's not a "vectorial" function.
    self.theano_encode_decode = function([Wb, Wc, b, c, s, x], r)
    self.theano_loss = function([Wb, Wc, b, c, s, x, y], loss)
    self.theano_gradients = function([Wb, Wc, b, c, s, x, y],
                                     [T.grad(sum_loss, Wb), T.grad(sum_loss, Wc),
                                      T.grad(sum_loss, b), T.grad(sum_loss, c),
                                      T.grad(sum_loss, s)])

    # other useful theano functions for the experiments that involve
    # adding noise to the hidden states
    self.theano_encode = function([Wc, c, x], h)
    self.theano_decode = function([Wb, b, s, h], r)
def step(x, prev_cell, prev_hidden):
    transformed_x = T.dot(x, P[name_W_input])
    x_i = transformed_x[0 * hidden_size: 1 * hidden_size]
    x_f = transformed_x[1 * hidden_size: 2 * hidden_size]
    x_c = transformed_x[2 * hidden_size: 3 * hidden_size]
    x_o = transformed_x[3 * hidden_size: 4 * hidden_size]

    transformed_hid = T.dot(prev_hidden, P[name_W_hidden])
    h_i = transformed_hid[0 * hidden_size: 1 * hidden_size]
    h_f = transformed_hid[1 * hidden_size: 2 * hidden_size]
    h_c = transformed_hid[2 * hidden_size: 3 * hidden_size]
    h_o = transformed_hid[3 * hidden_size: 4 * hidden_size]

    transformed_cell = T.dot(prev_cell, V_if)
    c_i = transformed_cell[0 * hidden_size: 1 * hidden_size]
    c_f = transformed_cell[1 * hidden_size: 2 * hidden_size]

    in_lin = x_i + h_i + b_i + c_i
    forget_lin = x_f + h_f + b_f + c_f
    cell_lin = x_c + h_c + b_c

    in_gate = T.nnet.sigmoid(in_lin)
    forget_gate = T.nnet.sigmoid(forget_lin)
    cell_updates = T.tanh(cell_lin)

    cell = forget_gate * prev_cell + in_gate * cell_updates

    out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
    out_gate = T.nnet.sigmoid(out_lin)

    hid = out_gate * T.tanh(cell)
    return cell, hid
def b_step_lstm(x_t, h_tm1, c_tm1):
    i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi_b) + T.dot(h_tm1, self.W_hi_b) + self.b_i_b)
    f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf_b) + T.dot(h_tm1, self.W_hf_b) + self.b_f_b)
    c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc_b) + T.dot(h_tm1, self.W_hc_b) + self.b_c_b)
    o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo_b) + T.dot(h_tm1, self.W_ho_b) + self.b_o_b)
    h_t = o_t * T.tanh(c_t)
    return [h_t, c_t]
import theano.sandbox.rng_mrg as RNG_MRG

from utils import data_tools as data
from recurrent_gsn import generative_stochastic_network
import utils.logger as log
from utils.image_tiler import tile_raster_images
from utils.utils import cast32, logit, trunc, get_shared_weights, get_shared_bias, \
    salt_and_pepper, make_time_units_string

# Default values to use for SEN parameters
defaults = {
    # gsn parameters
    "gsn_layers": 3,  # number of hidden layers to use
    "walkbacks": 5,  # number of walkbacks (generally 2*layers) - need enough to have info from top layer propagate to visible layer
    "hidden_size": 1500,
    "hidden_activation": lambda x: T.tanh(x),
    "visible_activation": lambda x: T.nnet.sigmoid(x),
    "input_sampling": True,
    "MRG": RNG_MRG.MRG_RandomStreams(1),
    # recurrent parameters
    "recurrent_hidden_size": 1500,
    "recurrent_hidden_activation": lambda x: T.tanh(x),
    # sen parameters
    # training parameters
    "load_params": False,
    "cost_function": lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y)),
    "n_epoch": 1000,
    "gsn_batch_size": 100,
    "batch_size": 200,
    "save_frequency": 10,
def build_model(tparams, options):
    # description string: #words x #samples
    x = tensor.matrix('x', dtype='float32')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    init_memory = None

    # word embedding (target)
    import scipy.io
    from sklearn.decomposition import PCA
    matlab_data = scipy.io.loadmat('corr_5.mat')
    correlations = matlab_data['corr_5']
    pca = PCA(n_components=options['dim_word'])
    pca.fit(correlations)
    correlations_reduced = pca.transform(correlations)
    n_clusters, dim_reduced = correlations_reduced.shape
    Wemb = numpy.zeros((n_clusters + 1, dim_reduced), dtype=numpy.float32)
    Wemb[1:, :] = numpy.array(correlations_reduced, dtype=numpy.float32)
    Wemb_tensor = tensor.constant(Wemb, dtype=numpy.float32)

    emb = Wemb_tensor[y.flatten()].reshape(
        [n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder
    proj = get_layer(options['decoder'])[1](tparams, emb, options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=x.T,
                                            context_mask=x_mask.T,
                                            one_step=False,
                                            init_state=None,
                                            init_memory=init_memory)
    proj_h = proj[0]
    if options['decoder'].startswith('lstm'):
        ctxs = proj[2]
        alphas = proj[3]
    else:
        ctxs = proj[1]
        alphas = proj[2]

    proj_h = dropout_layer(proj_h)  # dropout here

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
                                    prefix='ff_logit_lstm', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
                                   prefix='ff_logit_ctx', activ='linear')
    logit = tensor.tanh(logit_lstm + logit_ctx)
    logit = dropout_layer(logit)  # dropout here
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx] + 1e-8)
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
    cost = cost.mean()

    return x, x_mask, y, y_mask, alphas, cost
def tanh(x):
    return tensor.tanh(x)
def recurrent_fn(u_t, h_tm1, W_hh, W_ux, W_hy, b):
    x_t = TT.dot(W_ux, u_t)
    h_t = TT.tanh(TT.dot(W_hh, h_tm1) + x_t + b)
    y_t = TT.dot(W_hy, h_t)
    return h_t, y_t
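# Hedged usage sketch (an assumption, not part of the original source): unrolling
# recurrent_fn with theano.scan, passing the weights as non_sequences so they show up
# as the explicit W_hh, W_ux, W_hy, b arguments of the step function. The shared
# variables W_hh, W_ux, W_hy and b are assumed to have been created elsewhere.
import theano
import theano.tensor as TT

u = TT.matrix('u')      # (n_steps, n_in) input sequence, one row per time step
h0 = TT.vector('h0')    # initial hidden state
(h_seq, y_seq), _ = theano.scan(recurrent_fn,
                                sequences=u,
                                outputs_info=[h0, None],
                                non_sequences=[W_hh, W_ux, W_hy, b])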
def tanh(self, X):
    return T.tanh(X)
def model(self, activation, RecurrentUnit):
    self.f = activation

    # embedding layer parameters
    we = init_weight(self.V, self.D)

    # hidden layer parameters
    self.hidden_layers = []
    Mi = self.D
    for Mo in self.hidden_layer_sizes:
        ru = RecurrentUnit(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    # attention layer parameters
    wa = init_weight(Mi, Mi)
    ba = np.zeros(Mi)
    ua = init_weight(Mi, 1)
    self.Wa = theano.shared(wa)
    self.Ba = theano.shared(ba)
    self.Ua = theano.shared(ua)

    # output layer parameters
    wo = init_weight(Mi, self.O)
    bo = np.zeros(self.O)

    # shared variables
    self.We = theano.shared(we, name="Embedding weights")
    self.Wo = theano.shared(wo, name="Output weight")
    self.Bo = theano.shared(bo, name="Output Bias")
    self.params = [self.We, self.Wa, self.Ba, self.Ua, self.Wo, self.Bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    # input variables
    thx = T.ivector('X')
    thy = T.ivector('Y')
    thStartPoints = T.ivector('start_points')
    thEndPoints = T.ivector('end_points')

    # embedding layer computation
    Z = self.We[thx]  # size = [? x D]

    # rnn layer computation
    for ru in self.hidden_layers:
        Z = ru.output(Z, thStartPoints)  # size = [? x H]

    # attention layer computation
    u = T.tanh(Z.dot(self.Wa) + self.Ba)  # size = [? x H]
    alpha = T.nnet.softmax(u.dot(self.Ua))  # size = [? x 1] ( [? x H].dot([H x 1]) )
    c = T.repeat(alpha, Z.shape[1], axis=1) * Z  # size = [H] ( [? x H]*[? x H] )

    # output layer computation
    py = T.nnet.softmax(c.dot(self.Wo) + self.Bo)  # size = [O] ( [H].dot([H x O]) )
    py_x = py[thEndPoints, :]

    prediction = T.argmax(py_x, axis=1)
    self.predict_op = theano.function(
        inputs=[thx, thStartPoints, thEndPoints],
        outputs=prediction,
        allow_input_downcast=True
    )
    return thx, thy, thStartPoints, thEndPoints, py_x, prediction
def __init__(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, args=None, logger=None): # Output logger self.logger = logger self.outdir = args.get("output_path", defaults["output_path"]) if self.outdir[-1] != '/': self.outdir = self.outdir + '/' # Input data self.train_X = train_X self.train_Y = train_Y self.valid_X = valid_X self.valid_Y = valid_Y self.test_X = test_X self.test_Y = test_Y # variables from the dataset that are used for initialization and image reconstruction if train_X is None: self.N_input = args.get("input_size") if args.get("input_size") is None: raise AssertionError( "Please either specify input_size in the arguments or provide an example train_X for input dimensionality." ) else: self.N_input = train_X.eval().shape[1] self.root_N_input = numpy.sqrt(self.N_input) self.is_image = args.get('is_image', defaults['is_image']) if self.is_image: self.image_width = args.get('width', self.root_N_input) self.image_height = args.get('height', self.root_N_input) ####################################### # Network and training specifications # ####################################### self.gsn_layers = args.get( 'gsn_layers', defaults['gsn_layers']) # number hidden layers self.walkbacks = args.get('walkbacks', defaults['walkbacks']) # number of walkbacks self.learning_rate = theano.shared( cast32(args.get('learning_rate', defaults['learning_rate']))) # learning rate self.init_learn_rate = cast32( args.get('learning_rate', defaults['learning_rate'])) self.momentum = theano.shared( cast32(args.get('momentum', defaults['momentum']))) # momentum term self.annealing = cast32(args.get( 'annealing', defaults['annealing'])) # exponential annealing coefficient self.noise_annealing = cast32( args.get('noise_annealing', defaults['noise_annealing']) ) # exponential noise annealing coefficient self.batch_size = args.get('batch_size', defaults['batch_size']) self.gsn_batch_size = args.get('gsn_batch_size', defaults['gsn_batch_size']) self.n_epoch = args.get('n_epoch', defaults['n_epoch']) self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold']) self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length']) self.save_frequency = args.get('save_frequency', defaults['save_frequency']) self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"]) self.hidden_add_noise_sigma = theano.shared( cast32( args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"]))) self.input_salt_and_pepper = theano.shared( cast32( args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"]))) self.input_sampling = args.get('input_sampling', defaults["input_sampling"]) self.vis_init = args.get('vis_init', defaults['vis_init']) self.load_params = args.get('load_params', defaults['load_params']) self.hessian_free = args.get('hessian_free', defaults['hessian_free']) self.layer_sizes = [self.N_input] + [ args.get('hidden_size', defaults['hidden_size']) ] * self.gsn_layers # layer sizes, from h0 to hK (h0 is the visible layer) self.recurrent_hidden_size = args.get( 'recurrent_hidden_size', defaults['recurrent_hidden_size']) self.top_layer_sizes = [self.recurrent_hidden_size] + [ args.get('hidden_size', defaults['hidden_size']) ] * self.gsn_layers # layer sizes, from h0 to hK (h0 is the visible layer) self.f_recon = None self.f_noise = None # Activation functions! 
# For the GSN: if args.get('hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for GSN hiddens') self.hidden_activation = args.get('hidden_activation') elif args.get('hidden_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for GSN hiddens') self.hidden_activation = T.nnet.sigmoid elif args.get('hidden_act') == 'rectifier': log.maybeLog(self.logger, 'Using rectifier activation for GSN hiddens') self.hidden_activation = lambda x: T.maximum(cast32(0), x) elif args.get('hidden_act') == 'tanh': log.maybeLog( self.logger, 'Using hyperbolic tangent activation for GSN hiddens') self.hidden_activation = lambda x: T.tanh(x) elif args.get('hidden_act') is not None: log.maybeLog( self.logger, "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens" .format(args.get('hidden_act'))) raise NotImplementedError( "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens" .format(args.get('hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for GSN hiddens") self.hidden_activation = defaults['hidden_activation'] # For the RNN: if args.get('recurrent_hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for RNN hiddens') self.recurrent_hidden_activation = args.get( 'recurrent_hidden_activation') elif args.get('recurrent_hidden_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for RNN hiddens') self.recurrent_hidden_activation = T.nnet.sigmoid elif args.get('recurrent_hidden_act') == 'rectifier': log.maybeLog(self.logger, 'Using rectifier activation for RNN hiddens') self.recurrent_hidden_activation = lambda x: T.maximum( cast32(0), x) elif args.get('recurrent_hidden_act') == 'tanh': log.maybeLog( self.logger, 'Using hyperbolic tangent activation for RNN hiddens') self.recurrent_hidden_activation = lambda x: T.tanh(x) elif args.get('recurrent_hidden_act') is not None: log.maybeLog( self.logger, "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens" .format(args.get('hidden_act'))) raise NotImplementedError( "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens" .format(args.get('hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for RNN hiddens") self.recurrent_hidden_activation = defaults[ 'recurrent_hidden_activation'] # Visible layer activation if args.get('visible_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for visible layer') self.visible_activation = args.get('visible_activation') elif args.get('visible_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for visible layer') self.visible_activation = T.nnet.sigmoid elif args.get('visible_act') == 'softmax': log.maybeLog(self.logger, 'Using softmax activation for visible layer') self.visible_activation = T.nnet.softmax elif args.get('visible_act') is not None: log.maybeLog( self.logger, "Did not recognize visible activation {0!s}, please use sigmoid or softmax" .format(args.get('visible_act'))) raise NotImplementedError( "Did not recognize visible activation {0!s}, please use sigmoid or softmax" .format(args.get('visible_act'))) else: log.maybeLog(self.logger, 'Using default activation for visible layer') self.visible_activation = defaults['visible_activation'] # Cost function! 
if args.get('cost_function') is not None: log.maybeLog(self.logger, '\nUsing specified cost function for GSN training\n') self.cost_function = args.get('cost_function') elif args.get('cost_funct') == 'binary_crossentropy': log.maybeLog(self.logger, '\nUsing binary cross-entropy cost!\n') self.cost_function = lambda x, y: T.mean( T.nnet.binary_crossentropy(x, y)) elif args.get('cost_funct') == 'square': log.maybeLog(self.logger, "\nUsing square error cost!\n") #cost_function = lambda x,y: T.log(T.mean(T.sqr(x-y))) self.cost_function = lambda x, y: T.log(T.sum(T.pow((x - y), 2))) elif args.get('cost_funct') is not None: log.maybeLog( self.logger, "\nDid not recognize cost function {0!s}, please use binary_crossentropy or square\n" .format(args.get('cost_funct'))) raise NotImplementedError( "Did not recognize cost function {0!s}, please use binary_crossentropy or square" .format(args.get('cost_funct'))) else: log.maybeLog(self.logger, '\nUsing default cost function for GSN training\n') self.cost_function = defaults['cost_function'] ############################ # Theano variables and RNG # ############################ self.X = T.fmatrix('X') #single (batch) for training gsn self.Xs = T.fmatrix('Xs') #sequence for training rnn self.MRG = RNG_MRG.MRG_RandomStreams(1) ############### # Parameters! # ############### #visible gsn self.weights_list = [ get_shared_weights(self.layer_sizes[i], self.layer_sizes[i + 1], name="W_{0!s}_{1!s}".format(i, i + 1)) for i in range(self.gsn_layers) ] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) self.bias_list = [ get_shared_bias(self.layer_sizes[i], name='b_' + str(i)) for i in range(self.gsn_layers + 1) ] # initialize each layer to 0's. #recurrent self.recurrent_to_gsn_weights_list = [ get_shared_weights(self.recurrent_hidden_size, self.layer_sizes[layer], name="W_u_h{0!s}".format(layer)) for layer in range(self.gsn_layers + 1) if layer % 2 != 0 ] self.W_u_u = get_shared_weights(self.recurrent_hidden_size, self.recurrent_hidden_size, name="W_u_u") self.W_ins_u = get_shared_weights(args.get('hidden_size', defaults['hidden_size']), self.recurrent_hidden_size, name="W_ins_u") self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size, name='b_u') #top layer gsn self.top_weights_list = [ get_shared_weights(self.top_layer_sizes[i], self.top_layer_sizes[i + 1], name="Wtop_{0!s}_{1!s}".format(i, i + 1)) for i in range(self.gsn_layers) ] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) self.top_bias_list = [ get_shared_bias(self.top_layer_sizes[i], name='btop_' + str(i)) for i in range(self.gsn_layers + 1) ] # initialize each layer to 0's. 
#lists for use with gradients self.gsn_params = self.weights_list + self.bias_list self.u_params = [self.W_u_u, self.W_ins_u, self.recurrent_bias] self.top_params = self.top_weights_list + self.top_bias_list self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params + self.top_params ################################################### # load initial parameters # ################################################### if self.load_params: params_to_load = 'gsn_params.pkl' log.maybeLog(self.logger, "\nLoading existing GSN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[:len(self.weights_list)], self.weights_list) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[len(self.weights_list):], self.bias_list) ] params_to_load = 'rnn_params.pkl' log.maybeLog(self.logger, "\nLoading existing RNN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[:len(self.recurrent_to_gsn_weights_list)], self.recurrent_to_gsn_weights_list) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[len(self.recurrent_to_gsn_weights_list ):len(self.recurrent_to_gsn_weights_list ) + 1], self.W_u_u) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[len(self.recurrent_to_gsn_weights_list) + 1:len(self.recurrent_to_gsn_weights_list) + 2], self.W_ins_u) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip( loaded_params[len(self.recurrent_to_gsn_weights_list) + 2:], self.recurrent_bias) ] params_to_load = 'top_gsn_params.pkl' log.maybeLog(self.logger, "\nLoading existing top level GSN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.top_weights_list)], self.top_weights_list) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.top_weights_list):], self.top_bias_list) ] self.gsn_args = { 'weights_list': self.weights_list, 'bias_list': self.bias_list, 'hidden_activation': self.hidden_activation, 'visible_activation': self.visible_activation, 'cost_function': self.cost_function, 'layers': self.gsn_layers, 'walkbacks': self.walkbacks, 'hidden_size': args.get('hidden_size', defaults['hidden_size']), 'learning_rate': args.get('learning_rate', defaults['learning_rate']), 'momentum': args.get('momentum', defaults['momentum']), 'annealing': self.annealing, 'noise_annealing': self.noise_annealing, 'batch_size': self.gsn_batch_size, 'n_epoch': self.n_epoch, 'early_stop_threshold': self.early_stop_threshold, 'early_stop_length': self.early_stop_length, 'save_frequency': self.save_frequency, 'noiseless_h1': self.noiseless_h1, 'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']), 'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']), 'input_sampling': self.input_sampling, 'vis_init': self.vis_init, 'output_path': self.outdir + 'gsn/', 'is_image': self.is_image, 'input_size': self.N_input } self.top_gsn_args = { 'weights_list': self.top_weights_list, 'bias_list': self.top_bias_list, 'hidden_activation': self.hidden_activation, 'visible_activation': self.recurrent_hidden_activation, 'cost_function': self.cost_function, 'layers': self.gsn_layers, 'walkbacks': self.walkbacks, 'hidden_size': args.get('hidden_size', defaults['hidden_size']), 
'learning_rate': args.get('learning_rate', defaults['learning_rate']), 'momentum': args.get('momentum', defaults['momentum']), 'annealing': self.annealing, 'noise_annealing': self.noise_annealing, 'batch_size': self.gsn_batch_size, 'n_epoch': self.n_epoch, 'early_stop_threshold': self.early_stop_threshold, 'early_stop_length': self.early_stop_length, 'save_frequency': self.save_frequency, 'noiseless_h1': self.noiseless_h1, 'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']), 'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']), 'input_sampling': self.input_sampling, 'vis_init': self.vis_init, 'output_path': self.outdir + 'top_gsn/', 'is_image': False, 'input_size': self.recurrent_hidden_size } ############ # Sampling # ############ # the input to the sampling function X_sample = T.fmatrix("X_sampling") self.network_state_input = [X_sample] + [ T.fmatrix("H_sampling_" + str(i + 1)) for i in range(self.gsn_layers) ] # "Output" state of the network (noisy) # initialized with input, then we apply updates self.network_state_output = [X_sample] + self.network_state_input[1:] visible_pX_chain = [] # ONE update log.maybeLog(self.logger, "Performing one walkback in network state sampling.") generative_stochastic_network.update_layers( self.network_state_output, self.weights_list, self.bias_list, visible_pX_chain, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.logger) ############################################## # Build the graphs for the SEN # ############################################## # If `x_t` is given, deterministic recurrence to compute the u_t. Otherwise, first generate def recurrent_step(x_t, u_tm1, add_noise): # Make current guess for hiddens based on U for i in range(self.gsn_layers): if i % 2 == 0: log.maybeLog( self.logger, "Using {0!s} and {1!s}".format( self.recurrent_to_gsn_weights_list[(i + 1) / 2], self.bias_list[i + 1])) h_t = T.concatenate([ self.hidden_activation(self.bias_list[i + 1] + T.dot( u_tm1, self.recurrent_to_gsn_weights_list[(i + 1) / 2])) for i in range(self.gsn_layers) if i % 2 == 0 ], axis=0) # Make a GSN to update U _, hs = generative_stochastic_network.build_gsn( x_t, self.weights_list, self.bias_list, add_noise, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.logger) htop_t = hs[-1] ins_t = htop_t ua_t = T.dot(ins_t, self.W_ins_u) + T.dot( u_tm1, self.W_u_u) + self.recurrent_bias u_t = self.recurrent_hidden_activation(ua_t) return [ua_t, u_t, h_t] log.maybeLog(self.logger, "\nCreating recurrent step scan.") # For training, the deterministic recurrence is used to compute all the # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained # in batches using those parameters. 
u0 = T.zeros((self.recurrent_hidden_size, )) # initial value for the RNN hidden units (ua, u, h_t), updates_recurrent = theano.scan( fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) log.maybeLog(self.logger, "Now for reconstruction sample without noise") (_, _, h_t_recon), updates_recurrent_recon = theano.scan( fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) # put together the hiddens list h_list = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer % 2 != 0: h_list.append(T.zeros_like(T.dot(h_list[-1], w))) else: h_list.append( (h_t.T[(layer / 2) * self.hidden_size:(layer / 2 + 1) * self.hidden_size]).T) h_list_recon = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer % 2 != 0: h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w))) else: h_list_recon.append( (h_t_recon.T[(layer / 2) * self.hidden_size:(layer / 2 + 1) * self.hidden_size]).T) #with noise _, cost, show_cost = generative_stochastic_network.build_gsn_given_hiddens( self.Xs, h_list, self.weights_list, self.bias_list, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger) #without noise for reconstruction x_sample_recon, _, _ = generative_stochastic_network.build_gsn_given_hiddens( self.Xs, h_list_recon, self.weights_list, self.bias_list, False, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger) updates_train = updates_recurrent updates_cost = updates_recurrent ############# # COSTS # ############# log.maybeLog(self.logger, '\nCost w.r.t p(X|...) at every step in the graph') start_functions_time = time.time() # if we are not using Hessian-free training create the normal sgd functions if not self.hessian_free: gradient = T.grad(cost, self.params) gradient_buffer = [ theano.shared( numpy.zeros(param.get_value().shape, dtype='float32')) for param in self.params ] m_gradient = [ self.momentum * gb + (cast32(1) - self.momentum) * g for (gb, g) in zip(gradient_buffer, gradient) ] param_updates = [(param, param - self.learning_rate * mg) for (param, mg) in zip(self.params, m_gradient)] gradient_buffer_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(param_updates + gradient_buffer_updates) updates_train.update(updates) log.maybeLog(self.logger, "rnn-gsn learn...") self.f_learn = theano.function(inputs=[self.Xs], updates=updates_train, outputs=show_cost, on_unused_input='warn', name='rnngsn_f_learn') log.maybeLog(self.logger, "rnn-gsn cost...") self.f_cost = theano.function(inputs=[self.Xs], updates=updates_cost, outputs=show_cost, on_unused_input='warn', name='rnngsn_f_cost') log.maybeLog(self.logger, "Training/cost functions done.") # Denoise some numbers : show number, noisy number, predicted number, reconstructed number log.maybeLog( self.logger, "Creating graph for noisy reconstruction function at checkpoints during training." 
) self.f_recon = theano.function(inputs=[self.Xs], outputs=x_sample_recon[-1], updates=updates_recurrent_recon, name='rnngsn_f_recon') # a function to add salt and pepper noise self.f_noise = theano.function(inputs=[self.X], outputs=salt_and_pepper( self.X, self.input_salt_and_pepper), name='rnngsn_f_noise') # Sampling functions log.maybeLog(self.logger, "Creating sampling function...") if self.gsn_layers == 1: self.f_sample = theano.function( inputs=[X_sample], outputs=visible_pX_chain[-1], name='rnngsn_f_sample_single_layer') else: # WHY IS THERE A WARNING???? # because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn self.f_sample = theano.function(inputs=self.network_state_input, outputs=self.network_state_output + visible_pX_chain, on_unused_input='warn', name='rnngsn_f_sample') log.maybeLog(self.logger, "Done compiling all functions.") compilation_time = time.time() - start_functions_time # Show the compile time with appropriate easy-to-read units. log.maybeLog( self.logger, "Total compilation time took " + make_time_units_string(compilation_time) + ".\n\n")
import numpy
import theano
import theano.tensor as T

# `examples`, `features`, and the `l2` weight penalty are not defined in the original
# snippet; the values and definition below are assumptions so it runs standalone.
examples, features = 1000, 100  # assumed toy dataset sizes

def l2(w):
    # assumed standard squared-L2 penalty
    return (w ** 2).sum()

hidden = 10
D = (numpy.random.randn(examples, features),
     numpy.random.randint(size=examples, low=0, high=2))
training_steps = 1000

x = T.dmatrix("x")
y = T.dvector("y")
w1 = theano.shared(numpy.random.randn(features, hidden), name="w1")
b1 = theano.shared(numpy.zeros(hidden), name="b1")
w2 = theano.shared(numpy.random.randn(hidden), name="w2")
b2 = theano.shared(0., name="b2")

p1 = T.tanh(T.dot(x, w1) + b1)
p2 = T.tanh(T.dot(p1, w2) + b2)
prediction = p2 > 0.5
error = T.nnet.binary_crossentropy(p2, y)
loss = error.mean() + 0.01 * (l2(w1) + l2(w2))
gw1, gb1, gw2, gb2 = T.grad(loss, [w1, b1, w2, b2])

train = theano.function(inputs=[x, y],
                        outputs=[p2, error],
                        updates=((w1, w1 - 0.1 * gw1), (b1, b1 - 0.1 * gb1),
                                 (w2, w2 - 0.1 * gw2), (b2, b2 - 0.1 * gb2)))
predict = theano.function(inputs=[x], outputs=[prediction])
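# Hedged usage sketch (an assumption, not part of the original source): a plain
# gradient-descent loop driving the compiled `train` and `predict` functions above
# on the random dataset D. Targets are cast to float64 to match the dvector `y`.
for step in range(training_steps):
    pred, err = train(D[0], D[1].astype('float64'))
predictions = predict(D[0])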
def __init__(self, name, config):
    super().__init__(name)
    self.config = config

    pprint(config)
    sys.stdout.flush()

    self.add(Embeddings(
        'src_char_embeddings',
        len(config['src_encoder'].sub_encoder),
        config['src_char_embedding_dims'],
        dropout=config['char_embeddings_dropout']))

    self.add(Embeddings(
        'src_embeddings',
        len(config['src_encoder']),
        config['src_embedding_dims'],
        dropout=config['embeddings_dropout']))

    self.add(Embeddings(
        'trg_embeddings',
        len(config['trg_encoder']),
        config['trg_embedding_dims']))

    self.add(Linear(
        'hidden',
        config['decoder_state_dims'],
        config['trg_embedding_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))

    self.add(Linear(
        'emission',
        config['trg_embedding_dims'],
        len(config['trg_encoder']),
        w=self.trg_embeddings._w.T))

    self.add(Linear(
        'proj_h0',
        config['encoder_state_dims'],
        config['decoder_state_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))

    self.add(Linear(
        'proj_c0',
        config['encoder_state_dims'],
        config['decoder_state_dims'],
        dropout=config['dropout'],
        layernorm=config['layernorm']))

    # The total loss is
    #   lambda_o*xent(target sentence) + lambda_a*xent(alignment)
    self.lambda_o = theano.shared(
        np.array(1.0, dtype=theano.config.floatX))
    self.lambda_a = theano.shared(
        np.array(config['alignment_loss'], dtype=theano.config.floatX))

    for prefix, backwards in (('fwd', False), ('back', True)):
        self.add(LSTMSequence(
            prefix + '_char_encoder', backwards,
            config['src_char_embedding_dims'] + (
                (config['src_embedding_dims'] // 2) if backwards else 0),
            config['src_embedding_dims'] // 2,
            layernorm=config['encoder_layernorm'],
            dropout=config['recurrent_dropout'],
            trainable_initial=True, offset=0))

    for prefix, backwards in (('fwd', False), ('back', True)):
        self.add(LSTMSequence(
            prefix + '_encoder', backwards,
            config['src_embedding_dims'] + (
                config['encoder_state_dims'] if backwards else 0),
            config['encoder_state_dims'],
            layernorm=config['encoder_layernorm'],
            dropout=config['recurrent_dropout'],
            trainable_initial=True, offset=0))

    self.add(LSTMSequence(
        'decoder', False,
        config['trg_embedding_dims'], config['decoder_state_dims'],
        layernorm=config['decoder_layernorm'],
        dropout=config['recurrent_dropout'],
        attention_dims=config['attention_dims'],
        attended_dims=2 * config['encoder_state_dims'],
        trainable_initial=False, offset=-1))

    h_t = T.matrix('h_t')
    self.predict_fun = function(
        [h_t],
        T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))

    inputs = T.lmatrix('inputs')
    inputs_mask = T.bmatrix('inputs_mask')
    chars = T.lmatrix('chars')
    chars_mask = T.bmatrix('chars_mask')
    outputs = T.lmatrix('outputs')
    outputs_mask = T.bmatrix('outputs_mask')
    attention = T.tensor3('attention')

    self.x = [inputs, inputs_mask, chars, chars_mask]
    self.y = [outputs, outputs_mask, attention]

    self.encode_fun = function(self.x, self.encode(*self.x))
    self.xent_fun = function(self.x + self.y, self.xent(*(self.x + self.y)))
def forward_prop_step(x_t, s_t_prev, U, V, W):
    s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
    o_t = T.nnet.softmax(V.dot(s_t))
    # need to take [0] as nnet.softmax returns a 2D tensor
    return [o_t[0], s_t]
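For context, here is a minimal sketch (not from the original source) of how such a step function is typically driven with theano.scan over a sequence of word indices. The sizes and initialization are illustrative assumptions, and the step function above is assumed to be in scope.

import numpy as np
import theano
import theano.tensor as T

# Illustrative sizes; U: (hidden, vocab), W: (hidden, hidden), V: (vocab, hidden),
# matching how the step function above indexes and multiplies them.
vocab, hidden = 50, 16
floatX = theano.config.floatX
U = theano.shared(np.random.uniform(-0.1, 0.1, (hidden, vocab)).astype(floatX))
W = theano.shared(np.random.uniform(-0.1, 0.1, (hidden, hidden)).astype(floatX))
V = theano.shared(np.random.uniform(-0.1, 0.1, (vocab, hidden)).astype(floatX))

x = T.ivector('x')  # sequence of word indices
[o, s], _ = theano.scan(forward_prop_step,
                        sequences=x,
                        outputs_info=[None, T.zeros(hidden, dtype=floatX)],
                        non_sequences=[U, V, W])
forward = theano.function([x], o)
print(forward(np.array([1, 2, 3], dtype='int32')).shape)  # (3, vocab)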
def tanh(x): """ Tanh activation function """ return tensor.tanh(x)
def lstm_train(n_in=7, n_hidden=10, n_i=10, n_c=10, n_o=10, n_f=10, n_y=7,
               nb_epochs=300, nb_train_examples=1000):
    '''
    # number of input layer dims for the embedded Reber grammar (7-bit vector)
    n_in = 7
    # number of hidden layer units per gate
    n_hidden = 10
    n_i = 10
    n_c = 10
    n_o = 10
    n_f = 10
    # number of output layer dims (7-bit vector)
    n_y = 7
    '''
    # Initialize the weights.
    # The input and output gates may either open or close, but the forget gate
    # should start out open (so that nothing is forgotten at the start of training).
    # We try to achieve this through a suitable initialization of the biases.
    W_xi = theano.shared(ortho_weights(n_in, n_i))
    W_hi = theano.shared(ortho_weights(n_hidden, n_i))
    W_ci = theano.shared(ortho_weights(n_c, n_i))
    b_i = theano.shared(np.cast[config.floatX](np.random.uniform(
        -0.5, 0.5, size=n_i)))  # the input gate bias can be random
    W_xf = theano.shared(ortho_weights(n_in, n_f))
    W_hf = theano.shared(ortho_weights(n_hidden, n_f))
    W_cf = theano.shared(ortho_weights(n_c, n_f))
    b_f = theano.shared(np.cast[config.floatX](np.random.uniform(
        0, 1, size=n_f)))  # the forget gate should start out open (sigmoid output reliably above 0.5)
    W_xc = theano.shared(ortho_weights(n_in, n_c))
    W_hc = theano.shared(ortho_weights(n_hidden, n_c))
    b_c = theano.shared(np.zeros(
        n_c, dtype=config.floatX))  # the memory cell bias starts at 0 (neither open nor closed)
    W_xo = theano.shared(ortho_weights(n_in, n_o))
    W_ho = theano.shared(ortho_weights(n_hidden, n_o))
    W_co = theano.shared(ortho_weights(n_c, n_o))
    b_o = theano.shared(np.cast[config.floatX](np.random.uniform(
        -0.5, 0.5, size=n_o)))  # the output gate bias can be random
    W_hy = theano.shared(ortho_weights(n_hidden, n_y))
    b_y = theano.shared(np.zeros(n_y, dtype=config.floatX))  # the classification layer bias starts at 0
    c0 = theano.shared(np.zeros(n_c, dtype=config.floatX))  # initial memory cell state
    h0 = T.tanh(c0)  # initial ct_prime
    params = [W_xi, W_hi, W_ci, b_i, W_xf, W_hf, W_cf, b_f, W_xc, W_hc, b_c,
              W_xo, W_ho, W_co, b_o, W_hy, b_y, c0]
    # symbolic variable for the input vectors over time
    v = T.matrix(dtype=config.floatX)
    # symbolic variable for the targets (training labels)
    target = T.matrix(dtype=config.floatX)
    # recurrence
    [h_vals, _, y_vals], _ = theano.scan(
        fn=one_lstm_step,
        #sequences = dict(input=v, taps=[0]),
        sequences=v,
        outputs_info=[h0, c0, None],
        non_sequences=[W_xi, W_hi, W_ci, b_i, W_xf, W_hf, W_cf, b_f,
                       W_xc, W_hc, b_c, W_xo, W_ho, W_co, b_o, W_hy, b_y])
    # cost: cross-entropy over the 7 output bits
    cost = -T.mean(target * T.log(y_vals) + (1. - target) * T.log(1. - y_vals))
    # shared variable for the learning rate
    lr = np.cast[config.floatX](.1)
    learning_rate = theano.shared(lr)
    # gradient of the cost w.r.t. each parameter
    #gparams = T.grad(cost, params)
    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    # parameter updates: simple SGD
    updates = []
    for param, gparam in zip(params, gparams):
        updates.append((param, param - gparam * learning_rate))
    # generate the training data
    train_data = reber_grammer.get_n_embedded_examples(nb_train_examples)
    print 'train data length: ', len(train_data)
    # lstm
    learn_rnn_fn = theano.function(inputs=[v, target], outputs=cost, updates=updates)
    train_errors = np.ndarray(nb_epochs)

    def train_rnn(train_data):
        for x in range(nb_epochs):
            error = 0.
            for j in range(len(train_data)):
                # pick one example at random from train_data
                index = np.random.randint(0, len(train_data))
                # input vector i, target vector o
                i, o = train_data[index]
                #print 'train vector: ', i
                #print 'train target: ', o
                train_cost = learn_rnn_fn(i, o)
                error += train_cost
            # print the error for each epoch
            print "epochs %i : %f" % (x, error)
            train_errors[x] = error

    train_rnn(train_data)
    plt.plot(np.arange(nb_epochs), train_errors, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.ylim(0., 50)
    plt.show()
    print params
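The scan above refers to a one_lstm_step (as well as ortho_weights and reber_grammer) that is not shown in this excerpt. One plausible peephole-LSTM definition, consistent with the parameter order passed via non_sequences and with the [h, c, y] output order expected by outputs_info, is sketched below; the exact original may differ.

import theano.tensor as T

def one_lstm_step(x_t, h_tm1, c_tm1,
                  W_xi, W_hi, W_ci, b_i,
                  W_xf, W_hf, W_cf, b_f,
                  W_xc, W_hc, b_c,
                  W_xo, W_ho, W_co, b_o,
                  W_hy, b_y):
    # input, forget and output gates with peephole connections (the W_c* matrices)
    i_t = T.nnet.sigmoid(T.dot(x_t, W_xi) + T.dot(h_tm1, W_hi) + T.dot(c_tm1, W_ci) + b_i)
    f_t = T.nnet.sigmoid(T.dot(x_t, W_xf) + T.dot(h_tm1, W_hf) + T.dot(c_tm1, W_cf) + b_f)
    c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, W_xc) + T.dot(h_tm1, W_hc) + b_c)
    o_t = T.nnet.sigmoid(T.dot(x_t, W_xo) + T.dot(h_tm1, W_ho) + T.dot(c_t, W_co) + b_o)
    h_t = o_t * T.tanh(c_t)
    # per-bit sigmoid output, matching the binary cross-entropy cost above
    y_t = T.nnet.sigmoid(T.dot(h_t, W_hy) + b_y)
    return [h_t, c_t, y_t]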
def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None): """ Create an Gated Recurrent Unit layer with shared variable internal parameters. :type layer_def: Element, xml containing configu for Conv layer :type inputs: list of inputs [input,gate_input,prev_output] :param inputs[0]: input, the input which is a theano.matrix, x_t :param inputs[1]: previous state, h_{t-1}, same shape as this layer :type rs: a random state """ #inputs = [input,gate_input,previous_output] layer_name = layer_def.attrib["name"] assert (len(inputs) == 2) assert (len(inputs_shape) == 2) self.input = inputs[0] self.prev_h = inputs[1] n_in, _ = inputs_shape[0] n_prev_h, bsz = inputs_shape[1] assert (bsz == inputs_shape[0][1]) # clone the num_units if clone_from == None: self.num_units = int(layer_def.find("numunits").text) else: self.num_units = clone_from.num_units assert (n_prev_h == self.num_units) #create the weight matrices rng = np.random.RandomState(seed=int(time.time())) # initialize weights with random weights if clone_from != None: #weight matrices for x_t, the input self.W_z = clone_from.W_z self.W_r = clone_from.W_r self.W = clone_from.W #weight matrices for h_{t-1} self.U_z = clone_from.U_z self.U_r = clone_from.U_r self.U = clone_from.U else: #W_{}: is a matrix of size num_units x n_in W_bound = np.sqrt(6. / (self.num_units + n_in)) #W_o W_values = np.asarray(rng.normal(loc=0., scale=W_bound, size=(self.num_units, n_in)), dtype=theano.config.floatX) self.W_z = theano.shared(value=W_values, name=layer_name + '-Wz', borrow=False) # num_units x n_in #W_f W_values = np.asarray(rng.normal(loc=0., scale=W_bound, size=(self.num_units, n_in)), dtype=theano.config.floatX) self.W_r = theano.shared(value=W_values, name=layer_name + '-Wr', borrow=False) # num_units x n_in #W_i W_values = np.asarray(rng.normal(loc=0., scale=W_bound, size=(self.num_units, n_in)), dtype=theano.config.floatX) self.W = theano.shared(value=W_values, name=layer_name + '-W', borrow=False) # num_units x n_in #U_{}: is a matrix of size num_units x num_units U_bound = np.sqrt(6. / (self.num_units + self.num_units)) #U_o U_values = np.asarray(rng.normal(loc=0., scale=U_bound, size=(self.num_units, self.num_units)), dtype=theano.config.floatX) self.U_z = theano.shared(value=U_values, name=layer_name + '-Uz', borrow=False) #num_units x num_units #U_f U_values = np.asarray(rng.normal(loc=0., scale=U_bound, size=(self.num_units, self.num_units)), dtype=theano.config.floatX) self.U_r = theano.shared(value=U_values, name=layer_name + '-Ur', borrow=False) #num_units x num_units #U_i U_values = np.asarray(rng.normal(loc=0., scale=U_bound, size=(self.num_units, self.num_units)), dtype=theano.config.floatX) self.U = theano.shared(value=U_values, name=layer_name + '-U', borrow=False) #num_units x num_units #calculate the gate values # num_units x bsz #num_units x bsz self.zgate = T.nnet.sigmoid( T.dot(self.W_z, self.input) + T.dot(self.U_z, self.prev_h)) #update gate # num_units x bsz #num_units x bsz self.rgate = T.nnet.sigmoid( T.dot(self.W_r, self.input) + T.dot(self.U_r, self.prev_h)) #reset gate # num_units x bsz #num_units x bsz self.tilde_h = T.tanh( T.dot(self.W, self.input) + T.dot(self.U, (self.rgate * self.prev_h))) #new memory content #output is a dictionary #only if there is a mem output tag, then provide this output self.output = dict() self.output_shape = dict() #the default output self.output[layer_name] = ( 1. 
- self.zgate) * self.prev_h + self.zgate * self.tilde_h self.output_shape[layer_name] = [self.num_units, bsz] self.inputs_shape = inputs_shape # parameters of the model if clone_from == None: self.params = [ self.W_z, self.W_r, self.W, self.U_z, self.U_r, self.U ] else: self.params = []
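For readability, the update this layer computes can be restated as a standalone step function. This is only a sketch of the same equations, using the layer's column-major (num_units x batch) convention; it is not part of the original code.

import theano.tensor as T

def gru_step(x_t, h_tm1, W_z, W_r, W, U_z, U_r, U):
    z = T.nnet.sigmoid(T.dot(W_z, x_t) + T.dot(U_z, h_tm1))  # update gate
    r = T.nnet.sigmoid(T.dot(W_r, x_t) + T.dot(U_r, h_tm1))  # reset gate
    h_tilde = T.tanh(T.dot(W, x_t) + T.dot(U, r * h_tm1))    # candidate state
    return (1. - z) * h_tm1 + z * h_tilde                    # new hidden state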
def build_model(self, **kwargs): self.opt_ret.clear() use_noise = kwargs.pop('use_noise', theano.shared(np.float32(1.))) trng = kwargs.pop('trng', RandomStreams(self.O['seed'])) dropout_param = None if self.O['use_dropout'][0]: dropout_param = [use_noise, trng, self.O['use_dropout'][1]] x, x_mask, y, y_mask = self.get_input() xr, xr_mask = self.reverse_input(x, x_mask) n_timestep, n_timestep_tgt, n_samples = self.input_dimensions(x, y) # Word embedding for forward rnn (source) emb = self.embedding(x, n_timestep, n_samples) proj_f = self._encoder(emb, 'encoder', mask=x_mask, dropout_param=dropout_param) # Word embedding for backward rnn (source) embr = self.embedding(xr, n_timestep, n_samples) proj_r = self._encoder(embr, 'encoder', mask=xr_mask, dropout_param=dropout_param) # Context will be the concatenation of forward and backward RNNs ctx = concatenate( [proj_f[0], proj_r[0][::-1], proj_f[1], proj_r[1][::-1]], axis=proj_f[0].ndim - 1) # Mean of the context across time, which will be used to initialize decoder LSTM. This is the original code ctx_mean = self.get_context_mean(ctx, x_mask) # Initial decoder state initial_decoder_h = self.fully_connect(ctx_mean, 'initDecoder', T.tanh) # Word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. The first target will be all zeros and we will # not condition on the last output. emb = self.embedding(y, n_timestep_tgt, n_samples, 'Wemb_dec') emb_shifted = T.zeros_like(emb) emb_shifted = T.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted hidden_from_last_layer, ctx_from_1st_layer = self._decoder( emb, y_mask, ctx, x_mask, initial_decoder_h, prefix='decoder', one_step=False, dropout_param=dropout_param, ) # As suggested in Page 14 of the NMT + Attention model paper, let us implement the equation above section A.2.3 fc_hidden = self.fully_connect(hidden_from_last_layer, prefix='fc_compress_lastHiddenState', activ='linear') fc_emb = self.fully_connect(emb, prefix='fc_compress_emb', activ='linear') fc_ctx = self.fully_connect(ctx_from_1st_layer, prefix='fc_compress_ctx', activ='linear') fc_sum = T.tanh(fc_hidden + fc_emb + fc_ctx) # According to Baidu's paper, dropout is only used in LSTM. So I drop the following two lines out (v-yixia) # if self.O['use_dropout'][0]: # fc_sum = self.dropout(fc_sum, use_noise, trng, self.O['use_dropout'][1]) softmax_output = self.fully_connect(fc_sum, prefix='fc_to_softmax', activ='linear') softmax_output_shp = softmax_output.shape probs = T.nnet.softmax( softmax_output.reshape([ softmax_output_shp[0] * softmax_output_shp[1], softmax_output_shp[2] ])) cost = self.get_cost(y, y_mask, probs) return x, x_mask, y, y_mask, cost
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=10, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/' test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_noMT_epoch4.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_labels, word2id = load_il9_NI_test( word2id, maxSentLen) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) ''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) test_labels = np.asarray(test_labels, dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + '100k-ENG-multicca.300.ENG.txt', emb_root + '100k-SWA-multicca.d300.SWA.txt', emb_root + '100k-IL9-multicca.d300.IL9.txt' ], 300) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, 
input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + 
HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + 
os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): """ Allocate a LeNetConvPoolLayer with shared variable internal parameters. :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dtensor4 :param input: symbolic image tensor, of shape image_shape :type filter_shape: tuple or list of length 4 :param filter_shape: (number of filters, num input feature maps, filter height, filter width) :type image_shape: tuple or list of length 4 :param image_shape: (batch size, num input feature maps, image height, image width) :type poolsize: tuple or list of length 2 :param poolsize: the downsampling (pooling) factor (#rows, #cols) """ assert image_shape[1] == filter_shape[1] self.input = input # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit fan_in = numpy.prod(filter_shape[1:]) # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) # initialize weights with random weights W_bound = numpy.sqrt(6. / (fan_in + fan_out)) self.W = theano.shared( numpy.asarray( rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), dtype=theano.config.floatX ), name = 'W', borrow=True ) # the bias is a 1D tensor -- one bias per output feature map b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True) # convolve input feature maps with filters conv_out = conv.conv2d( input=input, filters=self.W, filter_shape=filter_shape, image_shape=image_shape ) # downsample each feature map individually, using maxpooling pooled_out = downsample.max_pool_2d( input=conv_out, ds=poolsize, ignore_border=True ) # add the bias term. Since the bias is a vector (1D array), we first # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will # thus be broadcasted across mini-batches and feature map # width & height self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) # store parameters of this layer self.params = [self.W, self.b] # keep track of model input self.input = input
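A usage sketch for the layer above, assuming the class is in scope and an older Theano where theano.tensor.nnet.conv and theano.tensor.signal.downsample still exist; the shapes are illustrative.

import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(1234)
x = T.tensor4('x')  # (batch, channels, height, width)
layer0 = LeNetConvPoolLayer(rng, input=x,
                            image_shape=(20, 1, 28, 28),
                            filter_shape=(8, 1, 5, 5),
                            poolsize=(2, 2))
f = theano.function([x], layer0.output)
batch = rng.randn(20, 1, 28, 28).astype(theano.config.floatX)
print(f(batch).shape)  # (20, 8, 12, 12): 24x24 after the 5x5 conv, 12x12 after 2x2 pooling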
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2), non_linear="tanh"): """ Allocate a LeNetConvPoolLayer with shared variable internal parameters. :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dtensor4 :param input: symbolic image tensor, of shape image_shape :type filter_shape: tuple or list of length 4 :param filter_shape: (number of filters, num input feature maps, filter height,filter width) :type image_shape: tuple or list of length 4 :param image_shape: (batch size, num input feature maps, image height, image width) :type poolsize: tuple or list of length 2 :param poolsize: the downsampling (pooling) factor (#rows,#cols) """ assert image_shape[1] == filter_shape[1] self.input = input self.filter_shape = filter_shape self.image_shape = image_shape self.poolsize = poolsize self.non_linear = non_linear # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit fan_in = numpy.prod(filter_shape[1:]) # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) # initialize weights with random weights if self.non_linear == "none" or self.non_linear == "relu": self.W = theano.shared(numpy.asarray(rng.uniform( low=-0.01, high=0.01, size=filter_shape), dtype=theano.config.floatX), borrow=True, name="W_conv") else: W_bound = numpy.sqrt(6. / (fan_in + fan_out)) self.W = theano.shared(numpy.asarray(rng.uniform( low=-W_bound, high=W_bound, size=filter_shape), dtype=theano.config.floatX), borrow=True, name="W_conv") b_values = numpy.zeros((filter_shape[0], ), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True, name="b_conv") # convolve input feature maps with filters conv_out = conv.conv2d(input=input, filters=self.W, filter_shape=self.filter_shape, image_shape=self.image_shape) if self.non_linear == "tanh": conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) elif self.non_linear == "relu": conv_out_tanh = ReLU(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) else: pooled_out = downsample.max_pool_2d(input=conv_out, ds=self.poolsize, ignore_border=True) self.output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x') self.params = [self.W, self.b]
def tanh(x): return T.tanh(x)
def apply(self, input_): return tensor.tanh(input_)
def scrn(X, h, R, P, A, B, b, t):
    # c_t = T.dot(X, B) * (1 - T.nnet.sigmoid(C)) + h[:, :n_c] * T.nnet.sigmoid(C)
    c_t = T.dot(X, B) * 0.05 + h[:, :n_c] * 0.95
    h_t = T.tanh(T.dot(X, A) + T.dot(c_t, P) + T.dot(h[:, n_c:], R) + b)
    return concatenate([c_t, h_t], axis=1)
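A sketch of how the shapes fit together and how this step might be driven with theano.scan. The sizes and the shared_uniform helper are assumptions, scrn is assumed to be in scope, and n_c here must match the module-level n_c the function reads.

import numpy as np
import theano
import theano.tensor as T

n_in, n_c, n_h = 30, 10, 40  # input dim, slow "context" units, fast hidden units

def shared_uniform(*shape):
    return theano.shared(np.random.uniform(-0.1, 0.1, shape).astype(theano.config.floatX))

B = shared_uniform(n_in, n_c)  # input   -> context
A = shared_uniform(n_in, n_h)  # input   -> hidden
P = shared_uniform(n_c, n_h)   # context -> hidden
R = shared_uniform(n_h, n_h)   # hidden  -> hidden
b = theano.shared(np.zeros(n_h, dtype=theano.config.floatX))

X = T.tensor3('X')                      # (time, batch, n_in)
h0 = T.zeros((X.shape[1], n_c + n_h))   # state carries [context, hidden] concatenated
h_seq, _ = theano.scan(lambda x_t, h_tm1: scrn(x_t, h_tm1, R, P, A, B, b, None),
                       sequences=X, outputs_info=h0)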
def build_model(tparams, options): opt_ret = dict() trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') x_mask = tensor.matrix('x_mask', dtype='float32') y = tensor.matrix('y', dtype='int64') y_mask = tensor.matrix('y_mask', dtype='float32') # for the backward rnn, we just need to invert x and x_mask xr = x[::-1] xr_mask = x_mask[::-1] n_timesteps = x.shape[0] n_timesteps_trg = y.shape[0] n_samples = x.shape[1] # word embedding for forward rnn (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder', mask=x_mask) # word embedding for backward rnn (source) embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r', mask=xr_mask) # context will be the concatenation of forward and backward rnns ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) # mean of the context (across time) will be used to initialize decoder rnn ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] # or you can use the last state of forward + backward encoder rnns # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) # initial decoder state init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') # word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. The first target will be all zeros and we will # not condition on the last output. emb = tparams['Wemb'][y.flatten()] emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted # decoder - pass through the decoder conditional gru with attention proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=y_mask, context=ctx, context_mask=x_mask, one_step=False, init_state=init_state) # hidden states of the decoder gru proj_h = proj[0] # weighted averages of context, generated by attention module ctxs = proj[1] # weights (alignment matrix) opt_ret['dec_alphas'] = proj[2] # compute word probabilities logit_lstm = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) if options['use_dropout']: logit = dropout_layer(logit, use_noise, trng) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape probs = tensor.nnet.softmax( logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])) # cost y_flat = y.flatten() y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat cost = -tensor.log(probs.flatten()[y_flat_idx]) cost = cost.reshape([y.shape[0], y.shape[1]]) cost = (cost * y_mask).sum(0) return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
def Tanh(x): y = T.tanh(x) return (y)
def forward_prop_step(x_t, s_t_prev, U, V, W):
    # compute output of the hidden layer
    s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
    # compute output of the softmax layer
    o_t = T.nnet.softmax(V.dot(s_t))
    return [o_t[0], s_t]
def build_sampler(tparams, options, trng, use_noise): x = tensor.matrix('x', dtype='int64') xr = x[::-1] n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source), forward and backward emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) # encoder proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r') # concatenate forward and backward rnn hidden states ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) # get the input for decoder rnn initializer mlp ctx_mean = ctx.mean(0) # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print('Building f_init...') outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print('Done') # x: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') # if it's the first word, emb should be all zero and it is indicated by -1 emb = tensor.switch(y[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]), tparams['Wemb'][y]) # apply one step of conditional gru with attention proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state) # get the next hidden state next_state = proj[0] # get the weighted averages of context for this target word y ctxs = proj[1] logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) if options['use_dropout']: logit = dropout_layer(logit, use_noise, trng) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') # compute the softmax probability next_probs = tensor.nnet.softmax(logit) # sample from softmax distribution to get the sample next_sample = trng.multinomial(pvals=next_probs).argmax(1) # compile a function to do the whole thing above, next word probability, # sampled word for the next target, next hidden state to be used print('Building f_next..') inps = [y, ctx, init_state] outs = [next_probs, next_sample, next_state] f_next = theano.function(inps, outs, name='f_next', profile=profile) print('Done') return f_init, f_next
def build_model(tparams, options): opt_ret = dict() trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') x_mask = tensor.matrix('x_mask', dtype='float32') y = tensor.matrix('y', dtype='int64') y_mask = tensor.matrix('y_mask', dtype='float32') n_timesteps = x.shape[0] n_timesteps_trg = y.shape[0] n_samples = x.shape[1] # word embedding (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) # pass through encoder gru, recurrence here proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder', mask=x_mask) # last hidden state of encoder rnn will be used to initialize decoder rnn ctx = proj[0][-1] ctx_mean = ctx # initial decoder state init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') # word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. The first target will be all zeros and we will # not condition on the last output. emb = tparams['Wemb_dec'][y.flatten()] emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted # decoder - pass through the decoder gru, recurrence here proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=y_mask, context=ctx, one_step=False, init_state=init_state) # hidden states of the decoder gru proj_h = proj # we will condition on the last state of the encoder only ctxs = ctx[None, :, :] # compute word probabilities logit_lstm = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape probs = tensor.nnet.softmax( logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])) # cost y_flat = y.flatten() y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat cost = -tensor.log(probs.flatten()[y_flat_idx]) cost = cost.reshape([y.shape[0], y.shape[1]]) cost = (cost * y_mask).sum(0) return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
def __init__(self, We, params): lstm_layers_num = 1 en_hidden_size = We.shape[1] self.eta = params.eta self.num_labels = params.num_labels self.en_hidden_size = en_hidden_size self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = params.lstm_layers_num self._train = None self._utter = None self.params = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] self.hos = [] self.Cos = [] encoderInputs = tensor.imatrix() decoderInputs, decoderTarget = tensor.imatrices(2) encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4) self.lookuptable = theano.shared(We) #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared(name="Linear", value=init_xavier_uniform( self.de_hidden_size, self.num_labels), borrow=True) self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform( 2 * en_hidden_size, self.de_hidden_size), borrow=True) self.hidden_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0., dtype=theano.config.floatX), borrow=True) self.params += [ self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias ] #concatenate #(max_sent_size, batch_size, hidden_size) state_below = self.lookuptable[encoderInputs.flatten()].reshape( (encoderInputs.shape[0], encoderInputs.shape[1], self.en_hidden_size)) for _ in range(self.lstm_layers_num): enclstm_f = LSTM(self.en_hidden_size) enclstm_b = LSTM(self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask) hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask) hs = tensor.concatenate([hs_f, hs_b], axis=2) Cs = tensor.concatenate([Cs_f, Cs_b], axis=2) self.hos += tensor.tanh( tensor.dot(hs[-1], self.hidden_decode) + self.hidden_bias), self.Cos += tensor.tanh( tensor.dot(Cs[-1], self.hidden_decode) + self.hidden_bias), state_below = hs state_below = self.de_lookuptable[decoderInputs.flatten()].reshape( (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co) decoder_lstm_outputs = state_below ei, di, dt = tensor.imatrices(3) #place holders em, dm, tf, di0 = tensor.fmatrices(4) ##################################################### ##################################################### linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) softmax_outputs, updates = theano.scan( fn=lambda x: tensor.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y]) costs, _ = theano.scan( fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask]) loss = costs.sum() / decoderMask.sum() updates = lasagne.updates.adam(loss, self.params, self.eta) #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9) ################################################### #### using the ground truth when training ################################################## self._train = theano.function(inputs=[ei, em, di, dm, dt], outputs=[loss, 
softmax_outputs], updates=updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt }) ######################################################################### ### For schedule sampling ######################################################################### ###### always use privous predict as next input def _step2(state_, hs_, Cs_): hs, Cs = [], [] token_idxs = tensor.cast(state_.argmax(axis=-1), "int32") msk_ = tensor.fill( (tensor.zeros_like(token_idxs, dtype="float32")), 1) msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, encoderInputs.shape[1], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable( Cs) newpred = tensor.dot(state_below0, self.linear).reshape( (encoderInputs.shape[1], self.num_labels)) state_below = tensor.nnet.softmax(newpred) return state_below, hs, Cs hs0, Cs0 = tensor.as_tensor_variable( self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, outputs_info=[decoderInputs0, hs0, Cs0], n_steps=encoderInputs.shape[0]) train_predict = train_outputs[0] train_costs, _ = theano.scan( fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask]) train_loss = train_costs.sum() / decoderMask.sum() train_updates = lasagne.updates.adam(train_loss, self.params, self.eta) #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) self._train2 = theano.function( inputs=[ei, em, di0, dm, dt], outputs=[train_loss, train_predict], updates=train_updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0, decoderMask: dm, decoderTarget: dt } #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf} ) listof_token_idx = train_predict.argmax(axis=-1) self._utter = theano.function(inputs=[ei, em, di0], outputs=listof_token_idx, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0 })
def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes): input_variable = X_sym # layer_sizes consists of input size, all hidden sizes, and output size hidden_sizes = layer_sizes[1:-1] # set these to stop pep8 vim plugin from complaining input_size = None output_size = None for n in range(len(hidden_sizes)): if (n - 1) < 0: input_size = layer_sizes[0] else: if self.bidirectional: # Accomodate for concatenated hiddens input_size = 2 * output_size else: input_size = output_size hidden_size = hidden_sizes[n] if (n + 1) != len(hidden_sizes): output_size = hidden_sizes[n + 1] else: output_size = layer_sizes[-1] forward_hidden, forward_params = self.recurrent_function( input_size, hidden_size, output_size, input_variable, X_mask, self.random_state) if self.bidirectional: backward_hidden, backward_params = self.recurrent_function( input_size, hidden_size, output_size, input_variable[::-1], X_mask[::-1], self.random_state) params = forward_params + backward_params input_variable = concatenate( [forward_hidden, backward_hidden[::-1]], axis=forward_hidden.ndim - 1) else: params = forward_params input_variable = forward_hidden if self.bidirectional: # Accomodate for concatenated hiddens sz = 2 * hidden_sizes[-1] else: sz = hidden_sizes[-1] if self.cost == "softmax": # easy mode output, output_params = build_linear_layer(sz, output_size, input_variable, self.random_state) params = params + output_params shp = output.shape output = output.reshape([shp[0] * shp[1], shp[2]]) y_hat_sym = T.nnet.softmax(output) y_sym_reshaped = y_sym.reshape([shp[0] * shp[1], shp[2]]) cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1)) elif self.cost == "encdec": # hardmode context = input_variable context_mean = context[0] init_state, state_params = build_tanh_layer(sz, hidden_sizes[-1], context_mean, self.random_state) init_memory, memory_params = build_tanh_layer(sz, hidden_sizes[-1], context_mean, self.random_state) # partial sampler setup self._encode = theano.function([X_sym, X_mask], [init_state, init_memory, context]) init_state_sampler = T.matrix() init_memory_sampler = T.matrix() y_sw_sampler = T.tensor3() y_sw_mask = T.alloc(1., y_sw_sampler.shape[0], 1) # need this style of init to reuse params for sampler and actual # training. This makes this part quite nasty - dictionary # for initialization and params is making more and more sense. # conditional params will be reused below conditional_params = init_recurrent_conditional_lstm_layer( output_size, hidden_sizes[-1], sz, self.random_state) rval, _p = build_recurrent_conditional_lstm_layer_from_params( conditional_params, y_sw_sampler, y_sw_mask, context, X_mask, init_state_sampler, init_memory_sampler, self.random_state, one_step=True) next_state, next_memory, sampler_contexts, _ = rval #end sampler parts... 
for now params = params + state_params + memory_params shifted_labels = T.zeros_like(y_sym) shifted_labels = T.set_subtensor(shifted_labels[1:], y_sym[:-1]) y_sym = shifted_labels rval, _p = build_recurrent_conditional_lstm_layer_from_params( conditional_params, shifted_labels, y_mask, context, X_mask, init_state, init_memory, self.random_state) projected_hidden, _, contexts, attention = rval params = params + conditional_params # once again, need to use same params for sample gen lh_params = init_linear_layer(hidden_sizes[-1], output_size, self.random_state) logit_hidden, _ = build_linear_layer_from_params(lh_params, projected_hidden) params = params + lh_params lo_params = init_linear_layer(output_size, output_size, self.random_state) logit_out, _ = build_linear_layer_from_params(lo_params, y_sym) params = params + lo_params lc_params = init_linear_layer(sz, output_size, self.random_state) logit_contexts, _ = build_linear_layer_from_params(lc_params, contexts) params = params + lc_params logit = T.tanh(logit_hidden + logit_out + logit_contexts) output_params = init_linear_layer(output_size, output_size, self.random_state) output, _ = build_linear_layer_from_params(output_params, logit) params = params + output_params shp = output.shape output = output.reshape([shp[0] * shp[1], shp[2]]) y_hat_sym = T.nnet.softmax(output) # Need to apply mask so that cost isn't punished y_sym_reshaped = (y_sym * y_mask.dimshuffle(0, 1, 'x')).reshape( [shp[0] * shp[1], shp[2]]) y_sym_reshaped = y_sym.reshape([shp[0] * shp[1], shp[2]]) cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1)) # Finish sampler logit_sampler_hidden, _ = build_linear_layer_from_params(lh_params, next_state) logit_sampler_out, _ = build_linear_layer_from_params(lo_params, y_sw_sampler) logit_sampler_contexts, _ = build_linear_layer_from_params( lc_params, sampler_contexts) logit_sampler = T.tanh(logit_sampler_hidden + logit_sampler_out + logit_sampler_contexts) output_sampler, _ = build_linear_layer_from_params(output_params, logit_sampler) shp = output_sampler.shape output_sampler = output_sampler.reshape([shp[0] * shp[1], shp[2]]) y_hat_sampler = T.nnet.softmax(output_sampler) self._sampler_step = theano.function( [y_sw_sampler, context, X_mask, init_state_sampler, init_memory_sampler], [y_hat_sampler, next_state, next_memory]) else: raise ValueError("Value of %s not a valid cost!" % self.cost) self.params_ = params if self.learning_alg == "sgd": updates = self.get_clip_sgd_updates( X_sym, y_sym, params, cost, self.learning_rate, self.momentum) elif self.learning_alg == "rmsprop": updates = self.get_clip_rmsprop_updates( X_sym, y_sym, params, cost, self.learning_rate, self.momentum) elif self.learning_alg == "sfg": updates = self.get_sfg_updates( X_sym, y_sym, params, cost, self.learning_rate, self.momentum) else: raise ValueError("Value of %s not a valid learning_alg!" 
% self.learning_alg) if self.cost == "softmax": self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, updates=updates, on_unused_input="ignore") self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, on_unused_input="ignore") self.predict_function = theano.function( inputs=[X_sym, X_mask], outputs=y_hat_sym, on_unused_input="ignore") else: self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, updates=updates, on_unused_input="warn") self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, on_unused_input="warn") self.predict_function = theano.function( inputs=[X_sym, X_mask, y_sym, y_mask], outputs=y_hat_sym)
def build_sampler(tparams, options, trng, use_noise): x = tensor.matrix('x', dtype='int64') n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) # encoder proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') ctx = proj[0][-1] ctx_mean = ctx init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print 'Building f_init...', outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print 'Done' # y: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') # if it's the first word, emb should be all zero emb = tensor.switch(y[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), tparams['Wemb_dec'][y]) # apply one step of gru layer proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state) next_state = proj ctxs = ctx # compute the output probability dist and sample logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') next_probs = tensor.nnet.softmax(logit) next_sample = trng.multinomial(pvals=next_probs).argmax(1) # next word probability print 'Building f_next..', inps = [y, ctx, init_state] outs = [next_probs, next_sample, next_state] f_next = theano.function(inps, outs, name='f_next', profile=profile) print 'Done' return f_init, f_next
def build_tanh_layer_from_params(params, input_variable): W, b = params output_variable = T.tanh(T.dot(input_variable, W) + b) return output_variable, params
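A usage sketch, assuming the helper above is in scope; the initializer here is hypothetical and only illustrates the [W, b] parameter layout the helper expects.

import numpy as np
import theano
import theano.tensor as T

def init_tanh_layer(n_in, n_out, random_state):
    # hypothetical companion initializer, not part of the original code
    W = theano.shared(random_state.uniform(-0.05, 0.05, (n_in, n_out)).astype(theano.config.floatX))
    b = theano.shared(np.zeros(n_out, dtype=theano.config.floatX))
    return [W, b]

X = T.matrix('X')
params = init_tanh_layer(10, 5, np.random.RandomState(0))
out, params = build_tanh_layer_from_params(params, X)
f = theano.function([X], out)
print(f(np.ones((3, 10), dtype=theano.config.floatX)).shape)  # (3, 5)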
def __init__(self, rng, input, input_u, input_p, mask, n_wordin, n_usrin, n_prdin, n_out, name, prefix=None): self.input = input self.inputu = input_u self.inputp = input_p if prefix is None: W_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_wordin + n_out)), high=numpy.sqrt(6. / (n_wordin + n_out)), size=(n_wordin, n_out) ), dtype=numpy.float32 ) W = theano.shared(value=W_values, name='W', borrow=True) ''' v_values = numpy.zeros((n_out,), dtype=theano.config.floatX) v = theano.shared(value=v_values, name='v', borrow=True) ''' v_values = numpy.asarray( rng.normal(scale=0.1, size=(n_out,)), dtype=numpy.float32 ) v = theano.shared(value=v_values, name='v', borrow=True) Wu_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_usrin + n_out)), high=numpy.sqrt(6. / (n_usrin + n_out)), size=(n_usrin, n_out) ), dtype=numpy.float32 ) Wu = theano.shared(value=Wu_values, name='Wu', borrow=True) Wp_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_prdin + n_out)), high=numpy.sqrt(6. / (n_prdin + n_out)), size=(n_prdin, n_out) ), dtype=numpy.float32 ) Wp = theano.shared(value=Wp_values, name='Wp', borrow=True) b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) else: f = file(prefix + name + '.save', 'rb') W = pickle.load(f) v = pickle.load(f) Wu = pickle.load(f) Wp = pickle.load(f) b = pickle.load(f) f.close() self.W = W self.v = v self.Wu = Wu self.Wp = Wp self.b = b attenu = T.dot(input_u, self.Wu) attenp = T.dot(input_p, self.Wp) atten = T.tanh(T.dot(input, self.W)+ attenu + attenp +b) atten = T.sum(atten * v, axis=2, acc_dtype='float32') atten = softmask(atten.dimshuffle(1,0), mask.dimshuffle(1,0)).dimshuffle(1, 0) output = atten.dimshuffle(0, 1, 'x') * input self.output = T.sum(output, axis=0, acc_dtype='float32') self.params = [self.W, self.v,self.Wu,self.Wp,self.b] self.name=name self.atten = atten self.mask = mask