def ada_grad(cost, params, emb=None, sub_emb=None, w=None, lr=0.1, eps=1.):
    updates = OrderedDict()

    """update sub-tensor of embeddings"""
    if emb is not None:  # `if emb:` would fail: Theano variables do not support boolean tests
        p = emb
        g = T.grad(cost, sub_emb)
        r = build_shared_zeros(p.get_value(True).shape)
        r_sub = r[w]
        r_sub_t = r_sub + T.sqr(g)
        r_t = T.set_subtensor(r_sub, r_sub_t)
        p_t = T.inc_subtensor(sub_emb, -(lr / (T.sqrt(r_sub_t) + eps)) * g)
        updates[r] = r_t
        updates[p] = p_t

    """update parameters"""
    grads0 = T.grad(cost, params[0])
    for p, g in zip(params[0], grads0):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t

    """update parameters"""
    grads1 = T.grad(cost, params[1])
    for p, g in zip(params[1], grads1):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t

    return updates
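# A minimal sketch (illustrative; `emb`, `idx`, and `delta` are made-up names,
# not from this code) of the sub-tensor pattern used in ada_grad above:
# indexing a shared matrix gives a sub-tensor, and T.inc_subtensor /
# T.set_subtensor build an expression for the whole matrix with only the
# selected rows changed, so only the embedding rows seen in a batch are touched.
import numpy as np
import theano
import theano.tensor as T

emb = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX), name='emb')
idx = T.ivector('idx')     # rows used in the current batch
delta = T.matrix('delta')  # update for exactly those rows

sub = emb[idx]                          # sub-tensor of the shared matrix
new_emb = T.inc_subtensor(sub, delta)   # full matrix with the selected rows incremented

f = theano.function([idx, delta], [], updates=[(emb, new_emb)])
f(np.array([1, 3], dtype='int32'), np.ones((2, 3), dtype=theano.config.floatX))
# rows 1 and 3 of emb are now ones; all other rows remain zero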
def adam(cost, params, emb, x, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    """update sub-tensor of embeddings (Adam-style version, currently disabled)"""
    # p = build_shared_zeros(emb.get_value(True).shape)
    # p_sub = p[w]
    # i_p = build_shared_zeros(emb.get_value(True).shape)
    # i_p_sub = i_p[w]
    # updates[i_p] = T.inc_subtensor(i_p_sub, 1.)
    # g = T.grad(cost, x)
    # v = build_shared_zeros(emb.get_value(True).shape)
    # r = build_shared_zeros(emb.get_value(True).shape)
    # v_sub = v[w]
    # r_sub = r[w]
    # v_t = ((1. - b1) * g) + (b1 ** (i_t - i_p_sub) * v_sub)
    # r_t = ((1. - b2) * T.sqr(g)) + (b2 ** (i_t - i_p_sub) * r_sub)
    # r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
    # v_hat = v_t / (1 - b1 ** i_t)
    # p_t = p_sub - r_hat * v_hat
    # updates[v] = T.set_subtensor(v_sub, v_t)
    # updates[r] = T.set_subtensor(r_sub, r_t)
    # updates[p] = T.set_subtensor(p_sub, p_t)

    """update sub-tensor of embeddings"""
    # plain gradient step on the embedding rows actually used in the batch
    lr_emb = theano.shared(np.float32(0.1))
    updates[emb] = T.inc_subtensor(x, -lr_emb * T.grad(cost, x))

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)
        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)
        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v_t / (1 - b1 ** i_t)  # bias-correct the updated first moment v_t, not the old v
        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
def __init__(self, parameters, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    super(AdamOptimizer, self).__init__(parameters)  # TODO: really?
    self.t = theano.shared(np.float32(1))
    self.alpha = alpha
    self.beta1 = beta1
    self.beta2 = beta2
    self.eps = eps
    self.m = [build_shared_zeros(p.shape.eval()) for p in self.parameters]
    self.v = [build_shared_zeros(p.shape.eval()) for p in self.parameters]
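# The update rule that pairs with the state initialised above is not shown
# here; the sketch below is one way such a method could look (the name
# `get_updates` and the `cost` argument are assumptions, not this class's
# confirmed API), following the standard Adam recurrences.
def get_updates(self, cost):
    updates = OrderedDict()
    grads = T.grad(cost, self.parameters)
    t = self.t
    for p, g, m, v in zip(self.parameters, grads, self.m, self.v):
        m_t = self.beta1 * m + (1. - self.beta1) * g          # first moment estimate
        v_t = self.beta2 * v + (1. - self.beta2) * T.sqr(g)   # second moment estimate
        m_hat = m_t / (1. - self.beta1 ** t)                  # bias corrections
        v_hat = v_t / (1. - self.beta2 ** t)
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p - self.alpha * m_hat / (T.sqrt(v_hat) + self.eps)
    updates[self.t] = t + 1.
    return updates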
def ada_delta(grads, params, b=0.999, eps=1e-8):
    updates = OrderedDict()
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)  # running average of squared gradients
        v = build_shared_zeros(p.get_value(True).shape)  # most recent parameter delta
        s = build_shared_zeros(p.get_value(True).shape)  # running average of squared deltas
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r_t) + eps) * g  # scale by the freshly accumulated r_t, as in the AdaDelta paper
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
def __init__(self, n_i, n_h, activation=tanh):
    self.activation = activation
    self.c0 = build_shared_zeros(n_h)
    self.h0 = self.activation(self.c0)
    self.W = theano.shared(sample_weights(n_i, n_h))

    """input gate parameters"""
    self.W_xi = theano.shared(sample_weights(n_h, n_h))
    self.W_hi = theano.shared(sample_weights(n_h, n_h))
    self.W_ci = theano.shared(sample_weights(n_h))

    """forget gate parameters"""
    self.W_xf = theano.shared(sample_weights(n_h, n_h))
    self.W_hf = theano.shared(sample_weights(n_h, n_h))
    self.W_cf = theano.shared(sample_weights(n_h))

    """cell parameters"""
    self.W_xc = theano.shared(sample_weights(n_h, n_h))
    self.W_hc = theano.shared(sample_weights(n_h, n_h))

    """output gate parameters"""
    self.W_xo = theano.shared(sample_weights(n_h, n_h))
    self.W_ho = theano.shared(sample_weights(n_h, n_h))
    self.W_co = theano.shared(sample_weights(n_h))

    self.params = [self.W,
                   self.W_xi, self.W_hi, self.W_ci,
                   self.W_xf, self.W_hf, self.W_cf,
                   self.W_xc, self.W_hc,
                   self.W_xo, self.W_ho, self.W_co]
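# Only the parameters are defined above; a peephole-style LSTM step consistent
# with those names and shapes might look like the sketch below (the method
# names `step` and `forward` are assumptions, not this layer's confirmed API).
# `x_t` is a single input vector of size n_i, projected to n_h by self.W; the
# W_c* vectors enter element-wise as peephole connections to the cell state.
def step(self, x_t, h_tm1, c_tm1):
    u_t = T.dot(x_t, self.W)
    i_t = T.nnet.sigmoid(T.dot(u_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + c_tm1 * self.W_ci)
    f_t = T.nnet.sigmoid(T.dot(u_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + c_tm1 * self.W_cf)
    c_t = f_t * c_tm1 + i_t * self.activation(T.dot(u_t, self.W_xc) + T.dot(h_tm1, self.W_hc))
    o_t = T.nnet.sigmoid(T.dot(u_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + c_t * self.W_co)
    h_t = o_t * self.activation(c_t)
    return h_t, c_t

def forward(self, x):
    # x: (n_steps, n_i); scan the step function over time
    [h, c], _ = theano.scan(fn=self.step, sequences=x, outputs_info=[self.h0, self.c0])
    return h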
def ada_delta(cost, params, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r_t) + eps) * g  # use the freshly accumulated r_t, as in the AdaDelta paper
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t

    return updates
def ada_grad(grads, params, lr=0.1, eps=1.):
    updates = OrderedDict()
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
def set_layer(self, init_emb, n_vocab, dim_emb, n_posit, dim_posit, fix):
    self.word_emb = self.create_word_emb(init_emb, n_vocab, dim_emb)
    self.posit_emb = self.create_posit_emb(n_posit, dim_posit)

    if fix:
        self.params.extend([self.posit_emb])
    else:
        self.params.extend([self.word_emb, self.posit_emb])

    # row 0 of self.E is an all-zero padding vector prepended to the word embeddings
    pad = build_shared_zeros((1, dim_emb))
    self.E = T.concatenate([pad, self.word_emb], 0)
def ada_grad(cost, params, emb, x, w, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)
    r = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    r_sub_t = r_sub + T.sqr(g)
    r_t = T.set_subtensor(r_sub, r_sub_t)
    p_t = T.inc_subtensor(x, -(lr / (T.sqrt(r_sub_t) + eps)) * g)
    updates[r] = r_t
    updates[p] = p_t

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t

    return updates
def ada_delta(cost, params, emb, x, w, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)
    r = build_shared_zeros(p.get_value(True).shape)
    v = build_shared_zeros(p.get_value(True).shape)
    s = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    v_sub = v[w]
    s_sub = s[w]
    r_sub_t = b * r_sub + (1 - b) * T.sqr(g)
    v_sub_t = (T.sqrt(s_sub) + eps) / (T.sqrt(r_sub_t) + eps) * g  # use the freshly accumulated r_sub_t, as in the AdaDelta paper
    s_sub_t = b * s_sub + (1 - b) * T.sqr(v_sub_t)
    updates[r] = T.set_subtensor(r_sub, r_sub_t)
    updates[v] = T.set_subtensor(v_sub, v_sub_t)
    updates[s] = T.set_subtensor(s_sub, s_sub_t)
    updates[p] = T.inc_subtensor(x, -v_sub_t)

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r_t) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t

    return updates
def ada_grad(cost, params, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update parameters"""
    for p, g in zip(params, grads):
        g = grad_clipping(g, 10.)
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t

    return updates
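# `grad_clipping` is not defined in this file; a minimal element-wise version
# could look like the sketch below (an assumption -- the project's helper may
# instead rescale the whole gradient by its norm).
def grad_clipping(g, threshold):
    # clamp every component of the gradient into [-threshold, threshold]
    return T.clip(g, -threshold, threshold)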
def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)
        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)
        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v_t / (1 - b1 ** i_t)  # bias-correct the updated first moment v_t, not the old v
        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
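# Minimal usage sketch (illustrative; the toy model below -- `x`, `y`, `W` and
# the squared-error cost -- is made up here, not taken from this code): every
# optimizer in this file returns an OrderedDict mapping shared variables to
# their updated expressions, which is exactly the `updates` argument expected
# by theano.function.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.vector('y')
W = theano.shared(np.zeros(8, dtype=theano.config.floatX), name='W')

cost = T.mean(T.sqr(T.dot(x, W) - y))
params = [W]
grads = T.grad(cost, params)

train = theano.function(inputs=[x, y],
                        outputs=cost,
                        updates=adam(params, grads, lr=0.001))
# each call train(batch_x, batch_y) applies one Adam step to W and returns the cost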
def __init__(self, n_i=32, n_h=32, activation=tanh):
    self.activation = activation
    self.h0 = build_shared_zeros(n_h)
    self.W = theano.shared(sample_weights(n_i, n_h))

    """reset gate parameters"""
    self.W_xr = theano.shared(sample_weights(n_h, n_h))
    self.W_hr = theano.shared(sample_weights(n_h, n_h))

    """update gate parameters"""
    self.W_xz = theano.shared(sample_weights(n_h, n_h))
    self.W_hz = theano.shared(sample_weights(n_h, n_h))

    """candidate hidden state parameters"""
    self.W_xh = theano.shared(sample_weights(n_h, n_h))
    self.W_hh = theano.shared(sample_weights(n_h, n_h))

    self.params = [self.W,
                   self.W_xr, self.W_hr,
                   self.W_xz, self.W_hz,
                   self.W_xh, self.W_hh]
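# The recurrence that pairs with the parameters above is not shown here; a
# standard GRU step consistent with these names and shapes might look like the
# sketch below (the method names `step` and `forward` are assumptions). `x_t`
# is one input vector of size n_i, projected to n_h by self.W, followed by the
# usual reset-gate / update-gate / candidate equations.
def step(self, x_t, h_tm1):
    u_t = T.dot(x_t, self.W)
    r_t = T.nnet.sigmoid(T.dot(u_t, self.W_xr) + T.dot(h_tm1, self.W_hr))  # reset gate
    z_t = T.nnet.sigmoid(T.dot(u_t, self.W_xz) + T.dot(h_tm1, self.W_hz))  # update gate
    h_hat = self.activation(T.dot(u_t, self.W_xh) + T.dot(r_t * h_tm1, self.W_hh))
    return (1. - z_t) * h_tm1 + z_t * h_hat

def forward(self, x):
    # x: (n_steps, n_i); scan the step function over time, starting from h0
    h, _ = theano.scan(fn=self.step, sequences=x, outputs_info=self.h0)
    return h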
def __init__(self, input_dim, hidden_dim, activation=T.tanh):
    self.c_0 = build_shared_zeros(hidden_dim)
    self.h_0 = activation(self.c_0)
    self.activation = activation

    self.W = theano.shared(get_uniform_weight(input_dim, hidden_dim))

    """input gate parameters"""
    self.W_i = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.U_i = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.V_i = theano.shared(get_uniform_weight(hidden_dim))

    """forget gate parameters"""
    self.W_f = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.U_f = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.V_f = theano.shared(get_uniform_weight(hidden_dim))

    """cell parameters"""
    self.W_c = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.U_c = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))

    """output gate parameters"""
    self.W_o = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.U_o = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
    self.V_o = theano.shared(get_uniform_weight(hidden_dim))

    self.parameters = [self.W,
                       self.W_f, self.U_f, self.V_f,
                       self.W_i, self.U_i, self.V_i,
                       self.W_c, self.U_c,
                       self.W_o, self.U_o, self.V_o]
def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size,
             w_emb_dim, w_hidden_dim, c_emb_dim, c_hidden_dim, output_dim,
             vocab_c_size, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
    self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
    self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
    self.y = y
    self.batch_size = batch_size
    self.n_words = n_words
    self.lr = lr

    n_phi = (w_emb_dim + c_hidden_dim) * window
    max_len_char = T.cast(self.c.shape[2], 'int32')

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

    self.pad = build_shared_zeros((1, c_emb_dim))
    self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
    self.emb_c = T.concatenate([self.pad, self.e_c], 0)

    self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
    self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
    self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(w_hidden_dim))
    self.b_c = theano.shared(sample_weights(c_hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.e_c, self.W_in, self.W_c, self.W_out,
                   self.b_in, self.b_c, self.b_y]

    """ look up embedding """
    self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
    self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
    self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

    """ convolution """
    self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window, max_len_char, -1)),
                             self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
    self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

    """ forward """
    self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ predict """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ loss """
    self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
    self.nll = -T.sum(self.log_p)
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        # pass the flattened word ids (x_v): x_emb was built from them, so the
        # AdaGrad accumulator sub-tensor r[w] then matches the shape of grad(cost, x_emb)
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x_v, self.lr)
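# Sketch of how a model like the one above is typically wired up and compiled
# (illustrative only -- `Model` is a placeholder class name, and the symbolic
# input types, dimensions, and hyper-parameters below are assumptions, not
# taken from this code).
import theano
import theano.tensor as T

x = T.imatrix('x')    # (batch_size * n_words, window) word ids
c = T.itensor4('c')   # (batch_size * n_words, window, max_len_char, window) char ids
y = T.ivector('y')    # (batch_size * n_words,) gold labels

model = Model(x=x, c=c, y=y, n_words=20, batch_size=8, lr=0.01, init_emb=None,
              vocab_w_size=10000, w_emb_dim=50, w_hidden_dim=300, c_emb_dim=10,
              c_hidden_dim=50, output_dim=45, vocab_c_size=100, window=5,
              opt='adagrad')

# the graph exposes the cost, the updates dict, and the predictions, so the
# train/predict functions compile directly from the attributes set in __init__
train = theano.function(inputs=[x, c, y],
                        outputs=[model.nll, model.result],
                        updates=model.updates)
predict = theano.function(inputs=[x, c], outputs=model.y_pred)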