def convolution_max(self, h, n_prds):
    """
    :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    """
    # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
    m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))

    # 1D: batch_size, 2D: n_prds * n_words, 3D: n_h
    m = m.dimshuffle((1, 2, 0, 3))
    m = m.reshape((m.shape[0], m.shape[1] * m.shape[2], m.shape[3]))

    # 1D: batch_size, 2D: n_h
    h_m = T.max(relu(T.dot(m, self.W_m)), axis=1)
    h_m = h_m.reshape((h_m.shape[0], -1)).dimshuffle((0, 'x', 'x', 1))

    # 1D: batch_size, 2D: n_prds, 3D: n_h
    h_m = T.repeat(h_m, n_prds, 1)
    # 1D: batch_size, 2D: n_prds, 3D: n_words, 4D: n_h
    h_m = T.repeat(h_m, h.shape[0], 2)
    # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
    h_m = h_m.dimshuffle((2, 0, 1, 3))
    # 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    h_m = h_m.reshape((h_m.shape[0], h_m.shape[1] * h_m.shape[2], h_m.shape[3]))

    return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))

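# A minimal NumPy sketch (illustration only, not part of the model) that
# mirrors the shape bookkeeping in convolution_max above, assuming
# n_words=5, batch_size=2, n_prds=3, n_h=4 and omitting the W_m projection:
# pooling over all n_prds * n_words positions leaves one n_h vector per
# batch element, which is then broadcast back over the grid.
import numpy as np

n_words, batch_size, n_prds, n_h = 5, 2, 3, 4
h = np.random.randn(n_words, batch_size * n_prds, n_h)

m = h.reshape(n_words, batch_size, n_prds, n_h)
m = m.transpose(1, 2, 0, 3).reshape(batch_size, n_prds * n_words, n_h)
h_m = np.maximum(m, 0.).max(axis=1)  # relu, then max over prds * words
assert h_m.shape == (batch_size, n_h)
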
def gru_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = GRU(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            # h0: 1D: Batch, 2D: n_h
            h0 = T.zeros((batch, n_h), dtype=theano.config.floatX)
        else:
            layer = GRU(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D: n_h
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]

        xr = T.dot(layer_input, layer.W_xr)
        xz = T.dot(layer_input, layer.W_xz)
        xh = T.dot(layer_input, layer.W_xh)

        h, _ = theano.scan(fn=layer.forward,
                           sequences=[xr, xz, xh],
                           outputs_info=[h0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    # layers alternate direction, so flip back if the last one ran reversed
    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit

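# layer.forward above is the per-timestep GRU step consumed by theano.scan;
# it is not shown in this snippet. A hedged sketch of its assumed form, where
# xr_t, xz_t, xh_t are the precomputed input projections and W_hr, W_hz, W_hh
# are hypothetical recurrent weight names:
def forward(self, xr_t, xz_t, xh_t, h_tm1):
    r_t = sigmoid(xr_t + T.dot(h_tm1, self.W_hr))      # reset gate
    z_t = sigmoid(xz_t + T.dot(h_tm1, self.W_hz))      # update gate
    h_hat_t = tanh(xh_t + T.dot(r_t * h_tm1, self.W_hh))
    return (1. - z_t) * h_tm1 + z_t * h_hat_t
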
def convolution2(self, h, n_prds):
    """
    :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    """
    # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
    m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))
    # 1D: n_words, 2D: batch_size, 3D: n_h
    h_m = self.pooling(relu(T.dot(m, self.W_m)), axis=2)
    # 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    h_m = T.repeat(h_m, n_prds, 1)
    return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))

def forward(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))
    # 1D: batch, 2D: n_agents
    u = T.dot(M, self.w)
    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(u)
    alpha = alpha.dimshuffle((0, 1, 'x'))
    # 1D: batch, 2D: dim_h
    r = T.sum(A * alpha, axis=1)
    # 1D: batch, 2D: dim_h
    h = relu(T.dot(r, self.W2_r))
    return h

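# A small NumPy sketch of the attention pooling used in forward above, with
# randomly initialized stand-ins for W1_c, W1_h, w and W2_r (hypothetical
# values; the real ones are trained shared variables):
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

batch, n_agents, dim_h = 2, 4, 8
A = np.random.randn(batch, n_agents, dim_h)
a_res = np.random.randn(batch, dim_h)
W1_c = np.random.randn(dim_h, dim_h)
W1_h = np.random.randn(dim_h, dim_h)
w = np.random.randn(dim_h)
W2_r = np.random.randn(dim_h, dim_h)

M = np.tanh(A.dot(W1_c) + a_res.dot(W1_h)[:, None, :])  # batch x n_agents x dim_h
alpha = softmax(M.dot(w))                                # batch x n_agents
r = (A * alpha[:, :, None]).sum(axis=1)                  # batch x dim_h
h = np.maximum(r.dot(W2_r), 0.)                          # relu output
assert h.shape == (batch, dim_h)
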
def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implements forward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    A_prev -- activations from the previous layer
    W -- weight matrix
    b -- bias vector
    activation -- the activation used in this layer, 'sigmoid' or 'relu'

    Returns:
    A -- the output activation
    cache -- a python tuple containing 'linear_cache' and 'activation_cache'
    '''
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == 'relu':
        A, activation_cache = nn_utils.relu(Z)
    elif activation == 'sigmoid':
        A, activation_cache = nn_utils.sigmoid(Z)

    cache = (linear_cache, activation_cache)
    return A, cache

def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implements the forward propagation for the Linear->Activation layer.

    Arguments:
    A_prev -- activations from the previous layer (or input data)
    W -- weight matrix
    b -- bias vector
    activation -- activation used in this layer, 'relu' or 'sigmoid'

    Returns:
    A -- the output of the activation function
    cache -- tuple containing 'linear_cache' and 'activation_cache',
             stored for computing the backward pass efficiently
    '''
    # call linear_forward to get Z and the linear cache
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)  # sigmoid defined in nn_utils
    elif activation == 'relu':
        A, activation_cache = relu(Z)     # relu defined in nn_utils

    # sanity-check the shape of A
    assert A.shape == (W.shape[0], A_prev.shape[1])

    cache = (linear_cache, activation_cache)
    return A, cache

def forward(self, x, h):
    """
    :param x: 1D: n_words, 2D: batch, 3D: dim_h
    :param h: 1D: n_words, 2D: batch, 3D: dim_h
    :return: 1D: n_words, 2D: batch, 3D: n_labels
    """
    return relu(T.dot(T.concatenate([x, h], 2), self.W))

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = nn_utils.sigmoid(Z)
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = nn_utils.relu(Z)

    assert A.shape == (W.shape[0], A_prev.shape[1])

    cache = (linear_cache, activation_cache)
    return A, cache

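# The linear_activation_forward variants above assume a linear_forward helper
# and nn_utils activations that return both the activation and a cache. A
# minimal sketch of these assumed helpers (hypothetical implementations,
# following the convention that each returns an (output, cache) pair):
import numpy as np

def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b
    return Z, (A_prev, W, b)            # cache inputs for the backward pass

def relu(Z):
    return np.maximum(0, Z), Z          # cache Z for relu_backward

def sigmoid(Z):
    return 1. / (1. + np.exp(-Z)), Z    # cache Z for sigmoid_backward
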
def lstm_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = LSTM(n_i=n_fin, n_h=n_h)
            # x: 1D: Batch, 2D: n_words, 3D: n_fin
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            h0 = layer.h0 * T.ones((batch, n_h))  # h0: 1D: Batch, 2D: n_h
            c0 = layer.c0 * T.ones((batch, n_h))  # c0: 1D: Batch, 2D: n_h
        else:
            layer = LSTM(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D: n_h
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]
            c0 = c[-1]

        # layer_input: 1D: n_words, 2D: Batch, 3D: n_fin
        xi = T.dot(layer_input, layer.W_xi)
        xf = T.dot(layer_input, layer.W_xf)
        xc = T.dot(layer_input, layer.W_xc)
        xo = T.dot(layer_input, layer.W_xo)

        [h, c], _ = theano.scan(fn=layer.forward,
                                sequences=[xi, xf, xc, xo],
                                outputs_info=[h0, c0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    # layers alternate direction, so flip back if the last one ran reversed
    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit

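# layer.forward above is the per-timestep LSTM step run by theano.scan; it is
# defined elsewhere. A hedged sketch of its assumed form, where xi_t..xo_t are
# the precomputed input projections and W_hi, W_hf, W_hc, W_ho are
# hypothetical recurrent weight names (no peephole connections):
def forward(self, xi_t, xf_t, xc_t, xo_t, h_tm1, c_tm1):
    i_t = sigmoid(xi_t + T.dot(h_tm1, self.W_hi))                   # input gate
    f_t = sigmoid(xf_t + T.dot(h_tm1, self.W_hf))                   # forget gate
    c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, self.W_hc))  # cell state
    o_t = sigmoid(xo_t + T.dot(h_tm1, self.W_ho))                   # output gate
    h_t = o_t * tanh(c_t)
    return h_t, c_t
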
def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim,
             hidden_dim, output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.name = name
    self.x = x
    self.y = y
    self.lr = lr
    self.input = [self.x, self.y, self.lr]

    n_words = x.shape[0]

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

    self.W_in = theano.shared(sample_weights(hidden_dim, 1, window, emb_dim))
    self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(hidden_dim, 1))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

    """ pad """
    self.zero = theano.shared(np.zeros(shape=(1, 1, window / 2, emb_dim),
                                       dtype=theano.config.floatX))

    """ look up embedding """
    self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb

    """ convolution """
    self.x_in = self.conv(self.x_emb)

    """ feed-forward computation """
    self.h = relu(self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2]))
                  + T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ prediction """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ cost function """
    self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb,
                                self.x, self.lr)

def convolution(self, h, n_prds):
    """
    :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    """
    # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
    m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))
    # 1D: batch_size, 2D: n_prds, 3D: n_words, 4D: n_h
    m = m.dimshuffle((1, 2, 0, 3))
    # n_words = m.shape[2]

    # 1D: batch_size, 2D: n_prds * n_words, 3D: n_h
    m1 = m.reshape((m.shape[0], m.shape[1] * m.shape[2], m.shape[3]))
    # 1D: batch_size, 2D: n_words, 3D: n_prds, 4D: n_h
    m3 = m.dimshuffle((0, 2, 1, 3))

    # 1D: batch_size, 2D: n_h
    h_m1 = T.max(relu(T.dot(m1, self.W_m1)), axis=1)
    h_m1 = h_m1.dimshuffle((0, 'x', 'x', 1))
    # h_m1 = T.repeat(h_m1, n_prds, 1)
    # h_m1 = T.repeat(h_m1, n_words, 2)

    # 1D: batch_size, 2D: n_prds, 3D: n_h
    h_m2 = T.max(relu(T.dot(m, self.W_m2)), axis=2)
    h_m2 = h_m2.dimshuffle((0, 1, 'x', 2))
    # h_m2 = T.repeat(h_m2, n_words, 2)

    # 1D: batch_size, 2D: n_words, 3D: n_h
    h_m3 = T.max(relu(T.dot(m3, self.W_m3)), axis=2)
    h_m3 = h_m3.dimshuffle((0, 'x', 1, 2))
    # h_m3 = T.repeat(h_m3, n_prds, 1)

    # 1D: batch_size, 2D: n_prds, 3D: n_words, 4D: n_h (broadcast sum)
    h_m = h_m1 + h_m2 + h_m3
    # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
    h_m = h_m.dimshuffle((2, 0, 1, 3))
    # 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
    h_m = h_m.reshape((h_m.shape[0], h_m.shape[1] * h_m.shape[2], h_m.shape[3]))

    return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))

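# A quick NumPy check (illustrative only) of the broadcast sum in convolution
# above: the three pooled maps with singleton axes add up to a full
# (batch_size, n_prds, n_words, n_h) grid without any explicit T.repeat.
import numpy as np

batch_size, n_prds, n_words, n_h = 2, 3, 5, 4
h_m1 = np.zeros((batch_size, 1, 1, n_h))        # pooled over prds and words
h_m2 = np.zeros((batch_size, n_prds, 1, n_h))   # pooled over words
h_m3 = np.zeros((batch_size, 1, n_words, n_h))  # pooled over prds
assert (h_m1 + h_m2 + h_m3).shape == (batch_size, n_prds, n_words, n_h)
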
def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size,
             emb_dim, hidden_dim, output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
    self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
    self.y = y
    self.batch_size = batch_size
    self.n_words = n_words
    self.lr = lr

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

    self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
    self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))
    self.b_in = theano.shared(sample_weights(hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))
    self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

    """ look up embedding """
    self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim

    """ forward """
    self.h = relu(T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim * window)),
                        self.W_in) + self.b_in)
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ predict """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ loss """
    self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
    self.nll = -T.sum(self.log_p)
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb,
                                self.x, self.lr)

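# An illustrative NumPy run of the window-classifier forward pass above, with
# made-up sizes and random weights standing in for the trained parameters: the
# per-token window embeddings are concatenated into one row, then passed
# through a relu hidden layer.
import numpy as np

n_words, batch_size, window, emb_dim, hidden_dim = 7, 2, 5, 10, 32
emb = np.random.randn(1000, emb_dim)
x = np.random.randint(0, 1000, (n_words * batch_size, window))

x_emb = emb[x.flatten()]  # (n_words * batch_size * window, emb_dim)
x_cat = x_emb.reshape(batch_size * n_words, emb_dim * window)
W_in = np.random.randn(emb_dim * window, hidden_dim)
h = np.maximum(x_cat.dot(W_in), 0.)  # relu hidden layer
assert h.shape == (batch_size * n_words, hidden_dim)
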
def grid_propagate(self, h):
    """
    :param h: 1D: batch, 2D: n_prds, 3D: n_words, 4D: dim_h
    :return: 1D: batch, 2D: n_prds, 3D: n_words, 4D: dim_h
    """
    h0_lr = T.zeros((h.shape[0], h.shape[1], h.shape[3]), dtype=theano.config.floatX)
    h0_ud = T.zeros((h.shape[0], h.shape[2], h.shape[3]), dtype=theano.config.floatX)

    for i in xrange(0, self.depth):
        h_lr = self.forward_lr(self.layers[i * 3], h, h0_lr)
        h_ud = self.forward_ud(self.layers[(i * 3) + 1], h, h0_ud)
        h = h + relu(self.dot(self.layers[(i * 3) + 2], h_lr, h_ud))
        h = self.flip(h)

    # an odd number of flips leaves h reversed; flip it back
    if (self.depth % 2) == 1:
        h = self.flip(h)

    return h

def layers(x, window, dim_emb, dim_hidden, n_layers, activation=tanh):
    params = []
    zero = T.zeros((1, dim_emb * window), dtype=theano.config.floatX)

    def zero_pad_gate(matrix):
        return T.neq(T.sum(T.eq(matrix, zero), 1, keepdims=True), dim_emb * window)

    for i in xrange(n_layers):
        if i == 0:
            W = theano.shared(sample_weights(dim_emb * window, dim_hidden))
            # h = zero_pad_gate(x) * relu(T.dot(x, W))
            h = relu(T.dot(x, W))
        else:
            W = theano.shared(sample_weights(dim_hidden, dim_hidden))
            h = activation(T.dot(h, W))
        params.append(W)

    return h, params

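# A hedged usage sketch for layers above. It assumes relu, tanh and
# sample_weights are in scope (as elsewhere in this file) and that x holds one
# row of concatenated window embeddings per token; the sizes are made up.
import theano
import theano.tensor as T

x = T.matrix('x')  # 1D: n_samples, 2D: dim_emb * window
h, params = layers(x, window=5, dim_emb=50, dim_hidden=100, n_layers=2)
forward = theano.function(inputs=[x], outputs=h)
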
def forward_one_layer(A_previous, W, b, activation_function):
    """
    Forwarding only 1 layer ahead
    W.shape = (l, l-1)
    Z, A.shape = (l, m); A_previous.shape = (l-1, m)
    Returns A and [cache = A_prev, W, b, Z]
    """
    Z = np.dot(W, A_previous) + b  # W.shape: (l, l-1) / A_previous.shape: (l-1, m)

    A = None
    if activation_function == "sigmoid":
        A = nn_utils.sigmoid(Z)
    elif activation_function == "relu":
        A = nn_utils.relu(Z)

    cache = A_previous, W, b, Z
    return A, cache

def forward_bi(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents
    M = T.sum(T.dot(A, self.W1_c) * a_res.dimshuffle(0, 'x', 1), axis=2)
    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(M)
    alpha = alpha.dimshuffle((0, 1, 'x'))
    # 1D: batch, 2D: dim_h
    r = T.sum(A * alpha, axis=1)
    # 1D: batch, 2D: dim_h
    h = relu(T.dot(r, self.W2_r))
    return h

def forward_second_order(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))
    # 1D: batch, 2D: n_agents
    M_a = T.dot(M, self.w)
    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(M_a)
    alpha = alpha.dimshuffle((0, 1, 'x'))
    # 1D: batch, 2D: dim_h
    r_a = T.sum(A * alpha, axis=1)

    # 1D: 1, 2D: n_agents, 3D: dim_h
    w = self.w.dimshuffle(('x', 'x', 0))
    w = T.repeat(w, M_a.shape[1], axis=1)
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M_a = M_a.dimshuffle((0, 1, 'x'))
    M_a = T.repeat(M_a, M.shape[2], axis=2)

    # subtract the first-order attention direction from M
    M_b = M - T.sum(M_a * w) / self.w.norm(2)
    beta = T.nnet.softmax(T.dot(M_b, self.w_b))
    beta = beta.dimshuffle((0, 1, 'x'))
    # 1D: batch, 2D: dim_h
    r_b = T.sum(A * beta, axis=1)

    # 1D: batch, 2D: dim_h
    h = relu(T.dot(T.concatenate([r_a, r_b], axis=1), self.W2_r))
    return h

def forward_one_layer(self, A_previous, W, b, activation_function):
    """
    Forwarding only 1 layer ahead
    W.shape = (nl, nl-1)
    Z, A.shape = (nl, m); A_previous.shape = (nl-1, m)
    Returns A and [cache = A_prev, W, b, Z]
    """
    # debug: inspect the shapes being multiplied
    print("--------------------")
    print(A_previous.shape)
    print(W.shape)
    print("--------------------")

    Z = np.dot(W, A_previous) + b

    if activation_function == "sigmoid":
        A = nn_utils.sigmoid(Z)
    elif activation_function == "relu":
        A = nn_utils.relu(Z)

    cache = A_previous, W, b, Z
    return A, cache

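# Note: unlike the linear_activation_forward variants earlier, both
# forward_one_layer functions above expect nn_utils activations that return
# only the activation value, not an (output, cache) pair; roughly (a sketch):
import numpy as np

def relu(Z):
    return np.maximum(0, Z)

def sigmoid(Z):
    return 1. / (1. + np.exp(-Z))
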
def forward_double(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))
    # 1D: batch, 2D: n_agents
    M_a = T.dot(M, self.w)
    # 1D: batch, 2D: dim_h
    M_b = T.max(M, axis=1)
    M_b = T.dot(M_b, self.W_m)

    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(M_a)
    alpha = alpha.dimshuffle((0, 1, 'x'))
    # 1D: batch, 2D: 1, 3D: dim_h
    beta = T.nnet.softmax(M_b)
    beta = beta.dimshuffle((0, 'x', 1))

    # 1D: batch, 2D: n_agents, 3D: dim_h
    # gamma = - (T.log(alpha) + T.log(beta))
    gamma = alpha * beta

    # 1D: batch, 2D: dim_h
    r = T.sum(A * gamma, axis=1)
    # 1D: batch, 2D: dim_h
    h = relu(T.dot(r, self.W2_r))
    return h

def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size,
             w_emb_dim, w_hidden_dim, c_emb_dim, c_hidden_dim, output_dim,
             vocab_c_size, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
    self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
    self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
    self.y = y
    self.batch_size = batch_size
    self.n_words = n_words
    self.lr = lr

    n_phi = (w_emb_dim + c_hidden_dim) * window
    max_len_char = T.cast(self.c.shape[2], 'int32')

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

    self.pad = build_shared_zeros((1, c_emb_dim))
    self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
    self.emb_c = T.concatenate([self.pad, self.e_c], 0)

    self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
    self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
    self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(w_hidden_dim))
    self.b_c = theano.shared(sample_weights(c_hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.e_c, self.W_in, self.W_c, self.W_out,
                   self.b_in, self.b_c, self.b_y]

    """ look up embedding """
    self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
    self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
    self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

    """ convolution """
    # 1D: n_words, 2D: window, 3D: n_h_c
    self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window,
                                                 max_len_char, -1)),
                             self.W_c) + self.b_c, 2)
    self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

    """ forward """
    self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)),
                        self.W_in) + self.b_in)
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ predict """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ loss """
    self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
    self.nll = -T.sum(self.log_p)
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb,
                                self.x, self.lr)

def dot(self, x):
    return relu(T.dot(x, self.W))

def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab,
             dim_w, dim_d, dim_h, L2_reg):
    """
    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
    :param y     : 1D: batch
    """
    self.input = [x_span, x_word, x_ctx, x_dist, y]

    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.y = y

    dim_x = dim_w * (2 + 4 + 20) + 1
    batch = y.shape[0]

    """ Params """
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab, dim_w))
    else:
        self.emb = theano.shared(init_emb)

    self.W_d = theano.shared(sample_weights(dim_d))
    self.W_i = theano.shared(sample_weights(dim_x, dim_h * 3))
    self.W_h = theano.shared(sample_weights(dim_h * 3, dim_h))
    self.W_o = theano.shared(sample_weights(dim_h))
    self.params = [self.W_d, self.W_i, self.W_h, self.W_o]

    """ Input Layer """
    x_s = self.emb[x_span]  # 1D: batch, 2D: limit * 2, 3D: dim_w
    x_w = self.emb[x_word]  # 1D: batch, 2D: 4, 3D: dim_w
    x_c = self.emb[x_ctx]   # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
    x_d = self.W_d[x_dist]  # 1D: batch
    x_s_avg = T.concatenate([T.mean(x_s[:, :x_s.shape[1] / 2], 1),
                             T.mean(x_s[:, x_s.shape[1] / 2:], 1)], 1)
    x = T.concatenate([x_s_avg,
                       x_w.reshape((batch, -1)),
                       x_c.reshape((batch, -1)),
                       x_d.reshape((batch, 1))], 1)

    """ Intermediate Layers """
    h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
    h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

    """ Output Layer """
    p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

    """ Predicts """
    self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5,
                                                0.6, 0.7, 0.8, 0.9],
                                               dtype=theano.config.floatX))
    self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
    self.y_hat_index = T.argmax(p_y)
    self.p_y_hat = p_y[self.y_hat_index]

    """ Cost Function """
    self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(1. - p_y))  # TODO: ranking criterion
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    """ Update """
    self.grad = T.grad(self.cost, self.params)
    self.updates = adam(self.params, self.grad)

    """ Check Results """
    self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
    self.total_p = T.sum(self.y_hat, 0)
    self.total_r = T.sum(y, keepdims=True)
    self.correct = T.sum(self.result, 0)
    self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))

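# binary_predict is defined elsewhere in this class. A hedged sketch of what
# it is assumed to compute, given self.thresholds above: one 0/1 decision per
# threshold, by broadcasting p_y against the nine threshold values.
def binary_predict(self, p_y):
    # p_y: 1D: batch -> return: 1D: batch, 2D: 9 (thresholds)
    return T.cast(p_y.dimshuffle(0, 'x') > self.thresholds.dimshuffle('x', 0),
                  theano.config.floatX)
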
def __init__(self, name, w, c, b, y, lr, init_w_emb, vocab_w_size,
             vocab_c_size, w_emb_dim, c_emb_dim, w_hidden_dim, c_hidden_dim,
             output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.name = name
    self.w = w
    self.c = c
    self.b = b
    self.y = y
    self.lr = lr
    self.input = [self.w, self.c, self.b, self.y, self.lr]

    n_phi = w_emb_dim + c_emb_dim * window
    n_words = w.shape[0]

    """ params """
    if init_w_emb is not None:
        self.emb = theano.shared(init_w_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

    self.emb_c = theano.shared(sample_norm_dist(vocab_c_size, c_emb_dim))
    self.W_in = theano.shared(sample_weights(w_hidden_dim, 1, window, n_phi))
    self.W_c = theano.shared(sample_weights(c_hidden_dim, 1, window, c_emb_dim))
    self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(w_hidden_dim, 1))
    self.b_c = theano.shared(sample_weights(c_hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    """ pad """
    self.zero = theano.shared(np.zeros(shape=(1, 1, window / 2, n_phi),
                                       dtype=theano.config.floatX))
    self.zero_c = theano.shared(np.zeros(shape=(1, 1, window / 2, c_emb_dim),
                                         dtype=theano.config.floatX))

    self.params = [self.emb_c, self.W_in, self.W_c, self.W_out,
                   self.b_in, self.b_c, self.b_y]

    """ look up embedding """
    x_emb = self.emb[self.w]    # x_emb: 1D: n_words, 2D: w_emb_dim
    c_emb = self.emb_c[self.c]  # c_emb: 1D: n_chars, 2D: c_emb_dim

    """ create feature """
    c_phi = self.create_char_feature(self.b, c_emb, self.zero_c) + self.b_c  # 1D: n_words, 2D: c_hidden_dim(50)
    x_phi = T.concatenate([x_emb, c_phi], axis=1)  # 1D: n_words, 2D: w_emb_dim(100) + c_hidden_dim(50)

    """ convolution """
    x_padded = T.concatenate([self.zero,
                              x_phi.reshape((1, 1, x_phi.shape[0], x_phi.shape[1])),
                              self.zero], axis=2)  # x_padded: 1D: n_words + n_pad, 2D: n_phi
    x_in = conv2d(input=x_padded, filters=self.W_in)  # 1D: 1, 2D: w_hidden_dim(300), 3D: n_words, 4D: 1

    """ feed-forward computation """
    h = relu(x_in.reshape((x_in.shape[1], x_in.shape[2]))
             + T.repeat(self.b_in, T.cast(x_in.shape[2], 'int32'), 1)).T
    self.o = T.dot(h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ prediction """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ cost function """
    self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, x_emb,
                                self.w, self.lr)

def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y, init_emb,
             n_vocab, dim_w, dim_d, dim_h, L2_reg):
    """
    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch, 2D: 2; elem=[sent dist, ment dist]
    :param x_slen: 1D: batch, 2D: 3; elem=[m_span_len, a_span_len, head_match]
    :param y     : 1D: batch
    """
    self.input = [x_span, x_word, x_ctx, x_dist, y]

    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.x_slen = x_slen
    self.y = y

    dim_x = dim_w * (10 + 4 + 4 + 2 + 3)
    batch = y.shape[0]

    """ Params """
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab, dim_w))
    else:
        self.emb = theano.shared(init_emb)

    self.W_d = theano.shared(sample_weights(dim_d, dim_w))
    self.W_l = theano.shared(sample_weights(7, dim_w))
    self.W_i = theano.shared(sample_weights(dim_x, dim_h))
    self.W_h = theano.shared(sample_weights(dim_h, dim_h))
    self.W_o = theano.shared(sample_weights(dim_h))
    self.params = [self.W_d, self.W_l, self.W_i, self.W_h, self.W_o]

    """ Input Layer """
    x_vec = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
    x_in = self.emb[x_vec]  # 1D: batch * (limit * 2 + 4 + 20), 2D: dim_w
    x_d = self.W_d[x_dist]  # 1D: batch, 2D: 2, 3D: dim_w
    x_l = self.W_l[x_slen]  # 1D: batch, 2D: 3, 3D: dim_w
    x = T.concatenate([x_in.reshape((batch, -1)),
                       x_d.reshape((batch, -1)),
                       x_l.reshape((batch, -1))], 1)

    """ Intermediate Layers """
    h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
    h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

    """ Output Layer """
    p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

    """ Cost Function """
    self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(1. - p_y))  # TODO: ranking criterion
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    """ Update """
    self.updates = sgd(self.cost, self.params, self.emb, x_in)

    """ Predicts """
    self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5,
                                                0.6, 0.7, 0.8, 0.9],
                                               dtype=theano.config.floatX))
    self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
    self.y_hat_index = T.argmax(p_y)
    self.p_y_hat = p_y[self.y_hat_index]

    """ Check Results """
    self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
    self.total_p = T.sum(self.y_hat, 0)
    self.total_r = T.sum(y, keepdims=True)
    self.correct = T.sum(self.result, 0)
    self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
