def ready(self, args, train):
    """Build the Theano graph for a single-layer LSTM language model.

    args  : dict-like config with keys "dropout", "hidden_dim", "activation".
    train : iterable of tokens; its set of distinct items becomes the vocab.

    Side effects: defines symbolic inputs (idxs, idys, init_state), the
    layer stack, p_y_given_x, nll, params and num_params on self.
    """
    # len * batch
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    # initial hidden state fed into the LSTM (batch * 2*n_d, per LSTM API — TODO confirm)
    self.init_state = T.matrix(dtype=theano.config.floatX)

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)

    self.n_d = args["hidden_dim"]

    embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                     vocab=set(w for w in train))
    self.n_V = embedding_layer.n_V

    say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

    activation = get_activation_by_name(args["activation"])

    rnn_layer = LSTM(n_in=self.n_d, n_out=self.n_d, activation=activation)

    # softmax over the vocabulary
    output_layer = Layer(
        n_in=self.n_d,
        n_out=self.n_V,
        activation=T.nnet.softmax,
    )

    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())

    # len * batch * n_d
    x = apply_dropout(x_flat, self.dropout)
    x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

    # len * batch * (n_d+n_d)  -- return_c=True concatenates cell and hidden states
    h = rnn_layer.forward_all(x, self.init_state, return_c=True)
    # last time step, kept so the caller can carry state across batches
    self.last_state = h[-1]
    # keep only the hidden-state half of the concatenation
    h = h[:, :, self.n_d:]
    h = apply_dropout(h, self.dropout)

    self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

    idys = self.idys.ravel()
    # per-token negative log-likelihood (manual indexing form)
    self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
    #self.nll = T.nnet.categorical_crossentropy(
    #        self.p_y_given_x,
    #        idys
    #    )

    self.layers = [embedding_layer, rnn_layer, output_layer]
    #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
    self.params = embedding_layer.params + rnn_layer.params + output_layer.params
    self.num_params = sum(
        len(x.get_value(borrow=True).ravel())
        for l in self.layers for x in l.params)
    say("# of params in total: {}\n".format(self.num_params))
def ready(self): args = self.args index = self.index = T.lscalar() x = self.x = T.fmatrix() y = self.y = T.ivector() dropout = self.dropout = theano.shared( np.float64(args.dropout).astype("float32")) n_d = args.hidden_dim layers = self.layers = [] for i in xrange(args.depth): l = Layer(n_in=28 * 28 if i == 0 else n_d, n_out=n_d, activation=ReLU) layers.append(l) output_layer = self.output_layer = Layer(n_in=n_d, n_out=10, activation=softmax) h = x for l in layers: h = l.forward(h) h = apply_dropout(h, dropout) self.h_final = h # batch * 10 probs = self.probs = output_layer.forward(h) # batch preds = self.preds = T.argmax(probs, axis=1) err = self.err = T.mean(T.cast(T.neq(preds, y), dtype="float32")) # loss = self.loss = -T.mean(T.log(probs[T.arange(y.shape[0]), y])) #loss = self.loss = T.mean( T.nnet.categorical_crossentropy( # probs, # y # )) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost += T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost self.cost = loss + l2_cost print "cost.dtype", self.cost.dtype
def build_model(self):
    """Build the graph that maps meta-embeddings onto each source embedding.

    For every source ("golden") embedding space, a linear layer projects the
    (dropped-out) meta-embedding of each batch token to that space; the loss
    is the mask-weighted mean squared distance, summed over spaces, plus an
    L2 penalty. Accumulates everything into self.all_loss.
    """
    args = self.args
    # per-space loss weights; presumably aligned with self.embs[1:] — TODO confirm
    weights = self.weights
    meta_emb = self.meta_emb = self.embs[0]
    golden_embs = self.embs[1:]
    n_m_d = meta_emb.n_d
    dropout = self.dropout = theano.shared(
        np.float64(args.dropout_rate).astype(theano.config.floatX))
    # batch of token ids and a per-space mask row for each golden embedding
    batch_ids = self.batch_ids = T.ivector('batch_d_char')
    batch_masks = self.batch_masks = T.fmatrix('batch_d_char_mask')
    layers = self.layers = [meta_emb]
    slices_embs = meta_emb.forward(batch_ids.ravel())
    slices_embs = slices_embs.reshape((batch_ids.shape[0], n_m_d))
    prev_output = apply_dropout(slices_embs, dropout, v2=True)
    self.all_loss = 0.0
    for i in range(len(weights)):
        mask, weight, golden_emb = batch_masks[i], weights[i], golden_embs[
            i]
        n_o_d = golden_emb.n_d
        # one linear projection per target embedding space
        layer = Layer(n_m_d, n_o_d, linear)
        layers.append(layer)
        mapped_output = layer.forward(prev_output)
        slices_embs = golden_emb.forward(batch_ids.ravel())
        slices_embs = slices_embs.reshape((batch_ids.shape[0], n_o_d))
        # masked mean squared error in this space; 1e-8 guards an all-zero mask
        self.all_loss += weight * T.sum(
            T.sum((mapped_output - slices_embs) *
                  (mapped_output - slices_embs),
                  axis=1) * mask) / (1e-8 + T.sum(mask))
    for i, l in enumerate(layers[1:]):
        say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))
    # L2 regularization over all parameters (including the meta embedding)
    self.l2_sqr = None
    self.params = []
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)
    self.all_loss += self.l2_sqr
    n_params = sum(
        len(x.get_value(borrow=True).ravel()) for x in self.params)
    say("total # parameters: {}\n".format(n_params))
def get_recon_loss(self, idxs, sent_output):
    """Masked mean squared reconstruction loss over non-padding tokens.

    idxs        : token-id tensor of shape len(sent) * len(doc) * batch.
    sent_output : sentence-encoder states, len(sent) * (len(doc)*batch) * n_d.
    Returns the average squared distance between the predicted and true
    reconstruction vectors, counting only non-padding positions.
    """
    n_sent, n_doc_batch, hid_dim = sent_output.shape

    # (len(sent)*len(doc)*batch)*n_e
    flat_ids = idxs.ravel()
    target = self.embedding_layer.recon_forward(flat_ids)

    dropped = apply_dropout(sent_output, self.dropout)
    predicted = self.recon_layer.forward(
        dropped.reshape((n_sent * n_doc_batch, hid_dim)))

    # (len(sent)*len(doc)*batch); 1.0 for real tokens, 0.0 for padding
    keep = T.cast(T.neq(flat_ids, self.padding_id), theano.config.floatX)

    per_token = T.sum((target - predicted) ** 2, axis=1) * keep
    return T.sum(per_token) / T.sum(keep)
def ready(self):
    """Build the question-retrieval graph: encode titles and bodies, average
    them into one vector per question, and define a max-margin ranking loss.

    Defines on self: idts/idbs/idps inputs, layer stack, ht/hb, h_final,
    scores (test-time cosine scores), loss and cost.
    """
    args = self.args
    # optional per-word weights indexed by word id; None disables weighting
    weights = self.weights

    # len(title) * batch
    idts = self.idts = T.imatrix()

    # len(body) * batch
    idbs = self.idbs = T.imatrix()

    # num pairs * 3, or num queries * candidate size
    idps = self.idps = T.imatrix()

    dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                    theano.config.floatX))
    dropout_op = self.dropout_op = Dropout(self.dropout)
    embedding_layer = self.embedding_layer

    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    # NOTE(review): an unrecognized args.layer leaves LayerType unbound
    # and raises NameError below — verify upstream arg validation.
    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = [ ]
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        else:
            feature_layer = LayerType(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order,
                    mode = args.mode,
                    has_outgate = args.outgate
                )
        layers.append(feature_layer)

    # feature computation starts here

    # (len*batch)*n_e
    xt = embedding_layer.forward(idts.ravel())
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
        xt = xt * xt_w

    # len*batch*n_e
    xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
    xt = apply_dropout(xt, dropout)

    # (len*batch)*n_e
    xb = embedding_layer.forward(idbs.ravel())
    if weights is not None:
        xb_w = weights[idbs.ravel()].dimshuffle((0,'x'))
        xb = xb * xb_w

    # len*batch*n_e
    xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
    xb = apply_dropout(xb, dropout)

    prev_ht = self.xt = xt
    prev_hb = self.xb = xb
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        hb = layers[i].forward_all(prev_hb)
        prev_ht = ht
        prev_hb = hb

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        hb = self.normalize_3d(hb)
        say("h_title dtype: {}\n".format(ht.dtype))

    self.ht = ht
    self.hb = hb

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        hb = self.average_without_padding(hb, idbs)
    else:
        ht = ht[-1]
        hb = hb[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d ; question vector = mean of title and body vectors
    h_final = (ht+hb)*0.5
    h_final = apply_dropout(h_final, dropout)
    h_final = self.normalize_2d(h_final)
    self.h_final = h_final
    # NOTE(review): formats ht rather than h_final — likely a copy-paste
    # slip; harmless since both share one dtype, but confirm intent.
    say("h_final dtype: {}\n".format(ht.dtype))

    # For testing:
    # first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])

    # For training:
    # rows of idps: [query, positive, negatives...] indices into the batch
    xp = h_final[idps.ravel()]
    xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))

    # num query * n_d
    query_vecs = xp[:,0,:]

    # num query
    pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)

    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)

    # num query ; only the hardest negative contributes to the hinge
    neg_scores = T.max(neg_scores, axis=1)

    # hinge loss with margin 1.0
    diff = neg_scores - pos_scores + 1.0
    loss = T.mean( (diff>0)*diff )
    self.loss = loss

    params = [ ]
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))

    # L2 regularization: sum of parameter 2-norms (not squared norms)
    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.cost = self.loss + l2_reg
def ready(self):
    """Build an encoder-decoder (seq2seq) graph with per-depth layer pairs.

    idxs holds the source sequence; idys the target, split into decoder
    inputs (idys[:-1]) and gold next-tokens (idys[1:]). Defines on self:
    p_y_given_x, nll, mask, scores/scores2 (similarity over final encoder
    states), loss and cost.
    """
    args = self.args
    # optional per-word weights indexed by word id; None disables weighting
    weights = self.weights

    # len(source) * batch
    idxs = self.idxs = T.imatrix()

    # len(target) * batch
    idys = self.idys = T.imatrix()
    # teacher-forcing split: inputs are shifted one step behind the golds
    idts = idys[:-1]
    idgs = idys[1:]

    dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                    theano.config.floatX))

    embedding_layer = self.embedding_layer

    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d
    n_V = self.n_V = embedding_layer.n_V

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = [ ]
    # two layers per depth level: even index = encoder, odd = decoder
    # NOTE(review): `i/2` is Python-2 integer division (this file uses
    # print statements), so only the first pair takes n_e inputs.
    for i in range(depth*2):
        if LayerType != RCNN:
            feature_layer = LayerType(
                    n_in = n_e if i/2 == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        else:
            feature_layer = LayerType(
                    n_in = n_e if i/2 == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order,
                    mode = args.mode,
                    has_outgate = args.outgate
                )
        layers.append(feature_layer)

    self.output_layer = output_layer = Layer(
            n_in = n_d,
            n_out = n_V,
            activation = T.nnet.softmax,
        )

    # feature computation starts here

    # (len*batch)*n_e
    xs_flat = embedding_layer.forward(idxs.ravel())
    xs_flat = apply_dropout(xs_flat, dropout)
    if weights is not None:
        xs_w = weights[idxs.ravel()].dimshuffle((0,'x'))
        xs_flat = xs_flat * xs_w

    # len*batch*n_e
    xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

    # (len*batch)*n_e
    xt_flat = embedding_layer.forward(idts.ravel())
    xt_flat = apply_dropout(xt_flat, dropout)
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
        xt_flat = xt_flat * xt_w

    # len*batch*n_e
    xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

    prev_hs = xs
    prev_ht = xt
    for i in range(depth):
        # len*batch*n_d ; decoder is initialized with the encoder's last state
        hs = layers[i*2].forward_all(prev_hs, return_c=True)
        ht = layers[i*2+1].forward_all(prev_ht, hs[-1])
        # keep only the hidden-state half of the (c, h) concatenation
        hs = hs[:,:,-n_d:]
        ht = ht[:,:,-n_d:]
        prev_hs = hs
        prev_ht = ht
        # NOTE(review): the two assignments above are immediately overwritten;
        # indentation here is reconstructed — confirm the dropout lines
        # belong inside the loop rather than after it.
        prev_hs = apply_dropout(hs, dropout)
        prev_ht = apply_dropout(ht, dropout)

    # (len*batch) * n_V
    self.p_y_given_x = output_layer.forward(prev_ht.reshape(
                            (xt_flat.shape[0], n_d)
                        ))

    # similarity of each batch item to the first, over final encoder states
    h_final = hs[-1]
    self.scores2 = -(h_final[1:]-h_final[0]).norm(2,axis=1)
    h_final = self.normalize_2d(h_final)
    self.scores = T.dot(h_final[1:], h_final[0])

    # (len*batch)
    nll = T.nnet.categorical_crossentropy(
                        self.p_y_given_x,
                        idgs.ravel()
                    )
    nll = nll.reshape(idgs.shape)
    self.nll = nll
    # ignore padding positions in the target
    self.mask = mask = T.cast(T.neq(idgs, self.padding_id),
                              theano.config.floatX)
    nll = T.sum(nll*mask, axis=0)

    #layers.append(embedding_layer)
    layers.append(output_layer)
    params = [ ]
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))

    # L2 regularization: sum of parameter 2-norms (not squared norms)
    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.loss = T.mean(nll)
    self.cost = self.loss + l2_reg
def ready(self):
    """Build the text-classification graph: stacked feature extractors
    (LSTM / StrCNN / RCNN), optional mean-pooling per layer, concatenated
    features, and a bias-free softmax output layer.

    Defines on self: x/y inputs, layers, p_y_given_x, pred, nll_loss,
    l2_sqr and params.
    """
    args = self.args
    embedding_layer = self.embedding_layer
    self.n_hidden = args.hidden_dim
    self.n_in = embedding_layer.n_d
    dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX)
        )

    # x is length * batch_size
    # y is batch_size
    self.x = T.imatrix('x')
    self.y = T.ivector('y')

    x = self.x
    y = self.y
    n_hidden = self.n_hidden
    n_in = self.n_in

    # fetch word embeddings
    # (len * batch_size) * n_in
    slices = embedding_layer.forward(x.ravel())
    self.slices = slices

    # 3-d tensor, len * batch_size * n_in
    slices = slices.reshape( (x.shape[0], x.shape[1], n_in) )

    # stacking the feature extraction layers
    pooling = args.pooling
    depth = args.depth
    layers = self.layers = [ ]
    prev_output = slices
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    size = 0
    softmax_inputs = [ ]
    activation = get_activation_by_name(args.act)
    for i in range(depth):
        if args.layer.lower() == "lstm":
            layer = LSTM(
                    n_in = n_hidden if i > 0 else n_in,
                    n_out = n_hidden
                )
        elif args.layer.lower() == "strcnn":
            layer = StrCNN(
                    n_in = n_hidden if i > 0 else n_in,
                    n_out = n_hidden,
                    activation = activation,
                    decay = args.decay,
                    order = args.order
                )
        elif args.layer.lower() == "rcnn":
            layer = RCNN(
                    n_in = n_hidden if i > 0 else n_in,
                    n_out = n_hidden,
                    activation = activation,
                    order = args.order,
                    mode = args.mode
                )
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output)
        if pooling:
            softmax_inputs.append(T.sum(prev_output, axis=0)) # summing over columns
        else:
            softmax_inputs.append(prev_output[-1])
        prev_output = apply_dropout(prev_output, dropout)
        size += n_hidden

    # final feature representation is the concatenation of all extraction layers
    if pooling:
        # divide by length to turn the per-layer sums into means
        softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
    else:
        softmax_input = T.concatenate(softmax_inputs, axis=1)
    softmax_input = apply_dropout(softmax_input, dropout, v2=True)

    # feed the feature repr. to the softmax output layer
    layers.append( Layer(
            n_in = size,
            n_out = self.nclasses,
            activation = softmax,
            has_bias = False
    ) )

    for l,i in zip(layers, range(len(layers))):
        say("layer {}: n_in={}\tn_out={}\n".format(
            i, l.n_in, l.n_out
        ))

    # unnormalized score of y given x
    self.p_y_given_x = layers[-1].forward(softmax_input)
    self.pred = T.argmax(self.p_y_given_x, axis=1)

    self.nll_loss = T.mean( T.nnet.categorical_crossentropy(
                                self.p_y_given_x,
                                y
                        ))

    # adding regularizations
    self.l2_sqr = None
    self.params = [ ]
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)

    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in self.params)
    say("total # parameters: {}\n".format(nparams))
def ready(self, args, train):
    """Build a multi-layer KernelNN language model with tied input/output
    embeddings.

    args  : dict-like config ("depth", "dropout", "rnn_dropout",
            "hidden_dim", "activation", "highway").
    train : iterable of tokens; its set of distinct items becomes the vocab.

    Defines on self: idxs/idys/init_state inputs, last_state, p_y_given_x,
    nll, params and num_params.
    """
    # len * batch
    depth = args["depth"]
    self.args = args
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    # two state matrices (c, h) per layer
    self.init_state = [
        T.matrix(dtype=theano.config.floatX) for i in xrange(depth * 2)
    ]
    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)
    # separate dropout rate applied inside the recurrent layers
    rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(
        theano.config.floatX)
    self.rnn_dropout = theano.shared(rnn_dropout_prob)
    self.n_d = args["hidden_dim"]
    embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                     vocab=set(w for w in train))
    self.n_V = embedding_layer.n_V
    say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))
    activation = get_activation_by_name(args["activation"])
    layers = self.layers = []
    for i in xrange(depth):
        rnn_layer = KernelNN(n_in=self.n_d,
                             n_out=self.n_d,
                             activation=activation,
                             highway=args["highway"],
                             dropout=self.rnn_dropout)
        layers.append(rnn_layer)
    output_layer = Layer(
        n_in=self.n_d,
        n_out=self.n_V,
        activation=T.nnet.softmax,
    )
    # weight tying: softmax weights are the transposed input embeddings
    output_layer.W = embedding_layer.embeddings.T
    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())
    # len * batch * n_d
    x = apply_dropout(x_flat, self.dropout)
    #x = x_flat
    x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))
    # len * batch * (n_d+n_d)
    self.last_state = []
    prev_h = x
    for i in xrange(depth):
        hidden = self.init_state[i * 2:i * 2 + 2]
        c, h = layers[i].forward_all(prev_h, hidden, return_c=True)
        # keep final (c, h) per layer so callers can carry state over
        self.last_state += [c[-1], h[-1]]
        prev_h = h
        # NOTE(review): reconstructed indentation — confirm this dropout
        # is applied per layer rather than once after the loop.
        prev_h = apply_dropout(prev_h, self.dropout)
    self.p_y_given_x = output_layer.forward(prev_h.reshape(x_flat.shape))
    idys = self.idys.ravel()
    self.nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idys)
    # embeddings appear once (shared with output W); only output bias added
    self.params = [x for l in layers for x in l.params]
    self.params += [embedding_layer.embeddings, output_layer.b]
    self.num_params = sum(
        len(x.get_value(borrow=True).ravel()) for x in self.params)
    say("# of params in total: {}\n".format(self.num_params))
    layers += [embedding_layer, output_layer]
def ready_one_domain(self, idxs, idys, dom_ids, gold_rels, \
        cnn_layer, rel_hid_layer, rel_out_layer, trans_layer, \
        dom_hid_layer, dom_out_layer, lab_hid_layer, lab_out_layer):
    """Build the per-domain graph: sentence CNN encoding, relevance-weighted
    document embedding, then domain / adversarial / label / reconstruction
    losses.

    idxs      : token ids, len(sent) * len(doc) * batch.
    idys      : gold label ids per document (batch,).
    dom_ids   : gold domain ids per document (batch,).
    gold_rels : per-sentence relevance supervision (REL_PAD = padding,
                REL_UNK = unlabeled).
    Remaining args are the shared layers used for this domain.
    Returns (lab_loss, rel_loss, dom_loss, adv_loss, lab_prob, recon_loss).
    """
    dropout = self.dropout
    embedding_layer = self.embedding_layer
    n_d = self.n_d
    n_e = self.n_e
    len_sentence, len_document, batch = idxs.shape
    # fixed weight on the domain loss
    dw = theano.shared(np.float64(0.4).astype(theano.config.floatX))
    # (len(sent)*len(doc)*batch)*n_e
    sent_input_flat = embedding_layer.forward(idxs.ravel())
    sent_input_flat = apply_dropout(sent_input_flat, dropout)
    # len(sent)*(len(doc)*batch)*n_e
    sent_input = sent_input_flat.reshape((len_sentence, len_document*batch, n_e))
    # len(sent) * (len(doc)*batch) * n_d
    sent_output = cnn_layer.forward_all(sent_input)
    # reconstruction loss
    recon_loss = self.get_recon_loss(idxs, sent_output)
    # max pooling, (len(doc)*batch) * n_d
    sent_embedding = T.max(sent_output, axis=0)
    sent_embedding = apply_dropout(sent_embedding, dropout)
    # relevance score, (len(doc)*batch) * 2
    rel_score = rel_out_layer.forward(rel_hid_layer.forward(sent_embedding)).ravel()
    # relevance loss
    gold_rels = gold_rels.ravel()
    # rel_mask drops padding sentences; gold_rel_mask additionally drops
    # unlabeled ones so only supervised sentences contribute
    rel_mask = T.cast(T.neq(gold_rels, REL_PAD), theano.config.floatX)
    gold_rel_mask = rel_mask * T.cast(T.neq(gold_rels, REL_UNK), theano.config.floatX)
    rel_loss = T.sum((gold_rels - rel_score) ** 2 * gold_rel_mask)
    # normalized by batch size, not by the number of supervised sentences
    n_rel_loss = batch
    rel_loss = rel_loss / n_rel_loss
    # document embedding via weighted combination, batch * n_d
    rel_score = (rel_score * rel_mask).reshape((len_document, batch))
    weighted_sent_embedding = sent_embedding.reshape(
        (len_document, batch, n_d)) * rel_score.dimshuffle((0, 1, 'x'))
    # 1e-8 term keeps the denominator nonzero for all-zero scores
    n = T.sum(rel_score, axis=0) + 1e-8 * T.sum(rel_mask, axis=0)
    orig_doc_embedding = T.sum(weighted_sent_embedding, axis=0) / n.dimshuffle((0, 'x'))
    # transform document embedding, batch * n_d
    doc_embedding = trans_layer.forward(orig_doc_embedding)
    # domain prediction. batch * 2
    # (dom_out_layer output is treated as log-probabilities below — TODO confirm)
    dom_logprob = dom_out_layer.forward(
        apply_dropout(dom_hid_layer.forward(doc_embedding), dropout))
    # domain loss
    dom_loss = -dom_logprob[T.arange(batch), dom_ids]
    dom_loss = dw * T.mean(dom_loss)
    # domain adv loss: encourages domain-invariant document embeddings
    adv_loss = self.rho * (-dom_loss)
    # label prediction. batch * n_c
    lab_logprob = lab_out_layer.forward(
        apply_dropout(lab_hid_layer.forward(doc_embedding), dropout))
    lab_prob = T.exp(lab_logprob)
    # label loss
    lab_loss = -lab_logprob[T.arange(batch), idys]
    lab_loss = T.mean(lab_loss)
    return lab_loss, rel_loss, dom_loss, adv_loss, lab_prob, recon_loss
def ready(self):
    """Build the text-classification graph (reformatted duplicate of the
    variant above): stacked LSTM / StrCNN / RCNN extractors, optional
    mean-pooling, concatenated features, bias-free softmax output.

    Defines on self: x/y inputs, layers, p_y_given_x, pred, nll_loss,
    l2_sqr and params.
    """
    args = self.args
    embedding_layer = self.embedding_layer
    self.n_hidden = args.hidden_dim
    self.n_in = embedding_layer.n_d
    dropout = self.dropout = theano.shared(
        np.float64(args.dropout_rate).astype(theano.config.floatX))

    # x is length * batch_size
    # y is batch_size
    self.x = T.imatrix('x')
    self.y = T.ivector('y')

    x = self.x
    y = self.y
    n_hidden = self.n_hidden
    n_in = self.n_in

    # fetch word embeddings
    # (len * batch_size) * n_in
    slices = embedding_layer.forward(x.ravel())
    self.slices = slices

    # 3-d tensor, len * batch_size * n_in
    slices = slices.reshape((x.shape[0], x.shape[1], n_in))

    # stacking the feature extraction layers
    pooling = args.pooling
    depth = args.depth
    layers = self.layers = []
    prev_output = slices
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    size = 0
    softmax_inputs = []
    activation = get_activation_by_name(args.act)
    for i in range(depth):
        if args.layer.lower() == "lstm":
            layer = LSTM(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden)
        elif args.layer.lower() == "strcnn":
            layer = StrCNN(n_in=n_hidden if i > 0 else n_in,
                           n_out=n_hidden,
                           activation=activation,
                           decay=args.decay,
                           order=args.order)
        elif args.layer.lower() == "rcnn":
            layer = RCNN(n_in=n_hidden if i > 0 else n_in,
                         n_out=n_hidden,
                         activation=activation,
                         order=args.order,
                         mode=args.mode)
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output)
        if pooling:
            softmax_inputs.append(T.sum(prev_output, axis=0))  # summing over columns
        else:
            softmax_inputs.append(prev_output[-1])
        prev_output = apply_dropout(prev_output, dropout)
        size += n_hidden

    # final feature representation is the concatenation of all extraction layers
    if pooling:
        # divide by length to turn the per-layer sums into means
        softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
    else:
        softmax_input = T.concatenate(softmax_inputs, axis=1)
    softmax_input = apply_dropout(softmax_input, dropout, v2=True)

    # feed the feature repr. to the softmax output layer
    layers.append(
        Layer(n_in=size,
              n_out=self.nclasses,
              activation=softmax,
              has_bias=False))

    for l, i in zip(layers, range(len(layers))):
        say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

    # unnormalized score of y given x
    self.p_y_given_x = layers[-1].forward(softmax_input)
    self.pred = T.argmax(self.p_y_given_x, axis=1)

    self.nll_loss = T.mean(
        T.nnet.categorical_crossentropy(self.p_y_given_x, y))

    # adding regularizations
    self.l2_sqr = None
    self.params = []
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)

    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in self.params)
    say("total # parameters: {}\n".format(nparams))
def ready(self):
    """Build the rationale generator (independent-sampling variant): a
    bidirectional RNN over words, a sigmoid layer producing per-word keep
    probabilities, Bernoulli sampling of z, and the REINFORCE-style cost
    combining the encoder loss with sparsity/coherence penalties.

    Shares x, z and dropout with self.encoder. Defines on self: z_pred,
    logpz, probs, loss_vec, sparsity_cost, obj, cost and cost_e.
    """
    encoder = self.encoder
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = encoder.dropout

    # len*batch
    x = self.x = encoder.x
    z = self.z = encoder.z

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = [ ]
    layer_type = args.layer.lower()
    for i in xrange(2):
        if layer_type == "rcnn":
            l = RCNN(
                    n_in = n_e,# if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
        elif layer_type == "lstm":
            l = LSTM(
                    n_in = n_e,# if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        layers.append(l)

    # len * batch
    #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
    masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,"x"))

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    flipped_embs = embs[::-1]

    # len*bacth*n_d ; forward and backward passes concatenated
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = 1,
            activation = sigmoid
        )

    # len*batch*1 ; per-word probability of keeping the word
    probs = output_layer.forward(h_final)

    # len*batch
    probs2 = probs.reshape(x.shape)
    self.MRG_rng = MRG_RandomStreams()
    z_pred = self.z_pred = T.cast(self.MRG_rng.binomial(size=probs2.shape,
                                            p=probs2), "int8")

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    self.z_pred = theano.gradient.disconnected_grad(z_pred)

    z2 = z.dimshuffle((0,1,"x"))
    # log-probability of the sampled z under the current policy, masked
    logpz = - T.nnet.binary_crossentropy(probs, z2) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch ; number of selected words and number of selection transitions
    zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

    loss_mat = encoder.loss_mat
    if args.aspect < 0:
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:,args.aspect]
    self.loss_vec = loss_vec

    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                         T.mean(zdiff) * coherent_factor
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # REINFORCE surrogate: per-example cost times total log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)

    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg

    cost = self.cost = cost_logpz * 10 + l2_cost
    print "cost.dtype", cost.dtype

    self.cost_e = loss * 10 + encoder.l2_cost
def ready(self):
    """Build the rationale generator (dependent-sampling variant): a
    bidirectional RNN over words feeds a ZLayer that samples the binary
    rationale z sequentially; defines the log-probability of z plus the
    sparsity statistics used by the encoder's cost.

    Defines on self: x input, word_embs, z_pred, sample_updates, logpz,
    probs, zsum, zdiff, params and l2_cost.
    """
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    layer_type = args.layer.lower()
    for i in xrange(2):
        if layer_type == "rcnn":
            l = RCNN(n_in=n_e,
                     n_out=n_d,
                     activation=activation,
                     order=args.order)
        elif layer_type == "lstm":
            l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
        layers.append(l)

    # len * batch
    masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    # exposed so the encoder can reuse the same (dropped-out) embeddings
    self.word_embs = embs

    flipped_embs = embs[::-1]

    # len*bacth*n_d ; forward and backward passes concatenated
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = ZLayer(
        n_in=size,
        n_hidden=args.hidden_dimension2,
        activation=activation)

    # sample z given text (i.e. x)
    z_pred, sample_updates = output_layer.sample_all(h_final)

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    self.sample_updates = sample_updates
    print "z_pred", z_pred.ndim

    # probability of the sampled z under the current policy
    probs = output_layer.forward_all(h_final, z_pred)
    print "probs", probs.ndim

    # masked log-probability of z, per position
    logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch ; number of selected words and number of selection transitions
    z = z_pred
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0,
                       dtype=theano.config.floatX)

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
def ready(self):
    """Build the rationale generator (independent-sampling, Python-3-print
    variant): bidirectional RNN, sigmoid keep-probabilities, Bernoulli
    sampling of z, and the REINFORCE-style cost combining the encoder loss
    with sparsity/coherence penalties.

    Shares x, z and dropout with self.encoder. Defines on self: z_pred,
    logpz, probs, loss_vec, sparsity_cost, obj, cost and cost_e.
    """
    encoder = self.encoder
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = encoder.dropout

    # len*batch
    x = self.x = encoder.x
    z = self.z = encoder.z

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    layer_type = args.layer.lower()
    for i in range(2):
        if layer_type == "rcnn":
            l = RCNN(
                n_in=n_e,  # if i == 0 else n_d,
                n_out=n_d,
                activation=activation,
                order=args.order)
        elif layer_type == "lstm":
            l = LSTM(
                n_in=n_e,  # if i == 0 else n_d,
                n_out=n_d,
                activation=activation)
        layers.append(l)

    # len * batch
    #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
    masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x"))

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    flipped_embs = embs[::-1]

    # len*bacth*n_d ; forward and backward passes concatenated
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = Layer(n_in=size,
                                             n_out=1,
                                             activation=sigmoid)

    # len*batch*1 ; per-word probability of keeping the word
    probs = output_layer.forward(h_final)

    # len*batch
    probs2 = probs.reshape(x.shape)
    self.MRG_rng = MRG_RandomStreams()
    z_pred = self.z_pred = T.cast(
        self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    self.z_pred = theano.gradient.disconnected_grad(z_pred)

    z2 = z.dimshuffle((0, 1, "x"))
    # log-probability of the sampled z under the current policy, masked
    logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch ; number of selected words and number of selection transitions
    zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    zdiff_pre = (z[1:] - z[:-1]) * 1.0
    zdiff = T.sum(abs(zdiff_pre), axis=0, dtype=theano.config.floatX)

    loss_mat = encoder.loss_mat
    if args.aspect < 0:
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:, args.aspect]
    self.loss_vec = loss_vec

    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                         T.mean(zdiff) * coherent_factor
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # REINFORCE surrogate: per-example cost times total log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg

    cost = self.cost = cost_logpz * 10 + l2_cost
    print("cost.dtype", cost.dtype)

    self.cost_e = loss * 10 + encoder.l2_cost
def ready(self):
    """Build the rationale encoder: runs ExtRCNN/ExtLSTM layers over the
    generator's word embeddings gated by the sampled rationale z, pools to
    one vector per document, and defines the squared-error loss plus the
    generator cost (cost_g) and encoder cost (cost_e).

    Defines on self: y input, preds, loss_mat, loss_vec, pred_diff,
    sparsity_cost, obj, l2_cost, cost_g and cost_e.
    """
    generator = self.generator
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = generator.dropout

    # len*batch
    x = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0,1,"x"))

    # batch*nclasses
    y = self.y = T.fmatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = [ ]
    depth = args.depth
    layer_type = args.layer.lower()
    for i in xrange(depth):
        if layer_type == "rcnn":
            l = ExtRCNN(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
        elif layer_type == "lstm":
            l = ExtLSTM(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        layers.append(l)

    # len * batch * 1 ; non-padding AND selected by the rationale
    masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z,
                   theano.config.floatX)
    # batch * 1 ; 1e-8 guards against division by an empty rationale
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8

    # len*batch*n_e ; reuse the generator's (dropped-out) embeddings
    embs = generator.word_embs

    pooling = args.pooling
    lst_states = [ ]
    h_prev = embs
    for l in layers:
        # len*batch*n_d
        h_next = l.forward_all(h_prev, z)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum/cnt_non_padding) # mean pooling
        else:
            lst_states.append(h_next[-1]) # last state
        h_prev = apply_dropout(h_next, dropout)

    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = self.nclasses,
            activation = sigmoid
        )

    # batch * nclasses
    preds = self.preds = output_layer.forward(h_final)

    # batch ; per-class squared error against the targets
    loss_mat = self.loss_mat = (preds-y)**2

    pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

    if args.aspect < 0:
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:,args.aspect]
    self.loss_vec = loss_vec

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz

    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                         T.mean(zdiff) * coherent_factor
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # REINFORCE surrogate: per-example cost times total log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)

    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    self.cost_g = cost_logpz * 10 + generator.l2_cost
    self.cost_e = loss * 10 + l2_cost
def ready(self):
    """Build the rationale-generator graph.

    Encodes the token matrix ``self.x`` with two recurrent layers (one on
    the reversed sequence), feeds the concatenated states to a ``ZLayer``
    that samples a binary rationale mask ``z_pred`` per token, and exposes
    ``logpz`` / ``zsum`` / ``zdiff`` for the REINFORCE objective.
    """
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX)
        )

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    # two layers: forward-direction and (applied to flipped input) backward
    layers = self.layers = [ ]
    layer_type = args.layer.lower()
    for i in xrange(2):
        if layer_type == "rcnn":
            l = RCNN(
                    n_in = n_e,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
        elif layer_type == "lstm":
            l = LSTM(
                    n_in = n_e,
                    n_out = n_d,
                    activation = activation
                )
        layers.append(l)

    # len * batch -- 1.0 for real tokens, 0.0 for padding
    masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    self.word_embs = embs

    flipped_embs = embs[::-1]

    # len*bacth*n_d
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    # bidirectional: un-flip the backward states before concatenating
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = ZLayer(
            n_in = size,
            n_hidden = args.hidden_dimension2,
            activation = activation
        )

    # sample z given text (i.e. x)
    z_pred, sample_updates = output_layer.sample_all(h_final)

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    self.sample_updates = sample_updates
    print "z_pred", z_pred.ndim

    # probability of each sampled z under the generator (for log p(z|x))
    probs = output_layer.forward_all(h_final, z_pred)
    print "probs", probs.ndim

    logpz = - T.nnet.binary_crossentropy(probs, z_pred) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch
    z = z_pred
    # zsum: number of selected tokens; zdiff: number of 0/1 transitions
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
def ready(self):
    """Build the question-retrieval encoder over the generator's rationale.

    Encodes the full text (``layers``) and the rationale-masked text
    (``extlayers``, parameters copied from ``layers``) into normalized
    vectors, then defines: cosine ranking scores for evaluation, a
    max-margin hinge loss over ``triples`` for the encoder, and a
    cosine-distance + sparsity cost for the generator (``cost_g``).
    """
    generator = self.generator
    args = self.args
    weights = self.weights

    dropout = generator.dropout

    # len(text) * batch
    idts = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0, 1, "x"))

    # batch * 2
    pairs = self.pairs = T.imatrix()

    # num pairs * 3, or num queries * candidate size
    triples = self.triples = T.imatrix()

    embedding_layer = self.embedding_layer

    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
        LayerType2 = ExtRCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
        LayerType2 = ExtLSTM
    #elif args.layer.lower() == "gru":
    #    LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation)
        else:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation,
                                      order=args.order,
                                      mode=args.mode,
                                      has_outgate=args.outgate)
        layers.append(feature_layer)

    # Ext* layers share weights with the plain layers via copy_params,
    # but additionally take the rationale mask z.
    extlayers = self.extlayers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                       n_out=n_d,
                                       activation=activation)
        else:
            feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                       n_out=n_d,
                                       activation=activation,
                                       order=args.order,
                                       mode=args.mode,
                                       has_outgate=args.outgate)
        feature_layer.copy_params(layers[i])
        extlayers.append(feature_layer)

    # feature computation starts here
    xt = generator.word_embs

    # encode full text into representation
    prev_ht = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        prev_ht = ht

    # encode selected text into representation
    # NOTE(review): this re-assigns self.xt to the same value as above --
    # looks like a copy-paste remnant; harmless but confirm.
    prev_htz = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        htz = extlayers[i].forward_all(prev_htz, z)
        prev_htz = htz

    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        htz = self.normalize_3d(htz)
        say("h_title dtype: {}\n".format(ht.dtype))

    self.ht = ht
    self.htz = htz

    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        htz = self.average_without_padding(htz, idts, z)
    else:
        ht = ht[-1]
        htz = htz[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))

    # batch * d
    h_final = apply_dropout(ht, dropout)
    h_final = self.normalize_2d(h_final)
    hz_final = apply_dropout(htz, dropout)
    hz_final = self.normalize_2d(hz_final)
    self.h_final = h_final
    self.hz_final = hz_final

    # NOTE(review): message says "dtype" but formats ht.shape -- debug
    # leftover; the printed value is a symbolic shape, not a dtype.
    say("h_final dtype: {}\n".format(ht.shape))

    # For testing:
    #   first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])
    self.scores_z = T.dot(hz_final[1:], hz_final[0])

    # For training encoder:
    xp = h_final[triples.ravel()]
    xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:, 0, :]
    # num query -- similarity to the positive candidate (column 1)
    pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
    # num query * candidate size -- similarities to negatives (columns 2+)
    neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :], axis=2)
    # num query -- hardest negative
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    # margin-1 hinge loss
    hinge_loss = T.mean((diff > 0) * diff)

    # For training generator
    # batch -- vectors are L2-normalized, so dot product is cosine similarity
    self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
    pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:, 1]], axis=1)
    alpha = args.alpha
    loss_vec = self_cosine_distance * alpha + pair_cosine_distance * (1 - alpha)
    #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz

    sfactor = args.sparsity
    cfactor = args.sparsity * args.coherent
    scost_vec = zsum * sfactor + zdiff * cfactor

    # batch
    cost_vec = loss_vec + scost_vec
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
    self.obj = loss + sparsity_cost

    params = []
    for l in self.layers:
        params += l.params
    self.params = params

    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)))

    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = T.sum(p**2)  #p.norm(2)
        else:
            l2_reg = l2_reg + T.sum(p**2)  #p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.l2_cost = l2_reg

    beta = args.beta
    self.cost_g = cost_logpz + generator.l2_cost
    self.cost_e = hinge_loss + loss * beta + l2_reg
def ready(self):
    """Build a classifier graph that consumes a *given* rationale mask.

    Unlike the generator-coupled encoder, the binary mask ``z`` is a plain
    input (``T.bmatrix``) here.  Stacks Ext{RCNN,LSTM} layers over the
    masked embeddings, pools, and defines a squared-error training cost.
    """
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX)
        )

    # len*batch
    x = self.x = T.imatrix()
    z = self.z = T.bmatrix()
    z = z.dimshuffle((0,1,"x"))

    # batch*nclasses
    y = self.y = T.fmatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = [ ]
    depth = args.depth
    layer_type = args.layer.lower()
    for i in xrange(depth):
        if layer_type == "rcnn":
            l = ExtRCNN(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
        elif layer_type == "lstm":
            l = ExtLSTM(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        layers.append(l)

    # len * batch * 1 -- non-padding AND selected-by-z
    masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
    # batch * 1 -- epsilon guards against all-zero masks
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    pooling = args.pooling
    lst_states = [ ]
    h_prev = embs
    for l in layers:
        # len*batch*n_d
        h_next = l.forward_all(h_prev, z)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum/cnt_non_padding) # mean pooling
        else:
            lst_states.append(h_next[-1]) # last state
        h_prev = apply_dropout(h_next, dropout)

    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = self.nclasses,
            activation = sigmoid
        )

    # batch * nclasses
    preds = self.preds = output_layer.forward(h_final)

    # batch * nclasses -- squared error against [0,1] targets
    loss_mat = self.loss_mat = (preds-y)**2
    loss = self.loss = T.mean(loss_mat)
    pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    # *10 task-loss scaling mirrors the encoder variants in this file
    cost = self.cost = loss * 10 + l2_cost
def ready(self):
    """Build a hierarchical document classifier (word -> sentence levels).

    Level 1 runs an LSTM/CNN over words of each sentence; level 2 reshapes
    sentence vectors into (s_num, docs, dim) and runs a second LSTM/CNN.
    At each level the representation is either user-attention-pooled
    (``IterAttentionLayer``), mean-pooled, or taken from the last step.
    Defines ``p_y_given_x``, ``pred``, ``nll_loss`` and ``l2_sqr``.
    """
    args = self.args
    embedding_layer = self.embedding_layer
    user_embedding_layer = self.user_embedding_layer
    self.n_hidden = args.hidden_dim
    self.n_in = embedding_layer.n_d
    dropout = self.dropout = theano.shared(
        np.float64(args.dropout_rate).astype(theano.config.floatX)
    )

    # x is length * batch_size
    # y is batch_size
    self.x = T.imatrix('x')
    self.w_masks = T.fmatrix('mask')
    self.w_lens = T.fvector('lens')
    self.s_ml = T.iscalar('sent_maxlen')
    self.s_num = T.iscalar('sent_num')
    self.y = T.ivector('y')
    self.usr = T.ivector('users')

    x = self.x
    y = self.y
    usr = self.usr
    w_masks = self.w_masks
    w_lens = self.w_lens
    s_ml = self.s_ml
    s_num = self.s_num
    n_hidden = self.n_hidden
    n_emb = n_in = self.n_in

    layers = self.layers = []

    slicesu = user_embedding_layer.forward(usr)
    slices = embedding_layer.forward(x.ravel())
    self.slices = slices  # important for updating word embeddings

    # 3-d tensor, len * batch_size * n_in
    slices = slices.reshape((x.shape[0], x.shape[1], n_in))

    pooling = args.pooling
    prev_output = slices
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    size = 0

    # bidirectional layers produce a doubled feature dimension
    n_hidden_t = n_hidden
    if args.direction == "bi":
        n_hidden_t = 2 * n_hidden

    softmax_inputs = []
    activation = get_activation_by_name(args.act)

    # ---- word-level extraction layer ----
    if args.layer.lower() == "lstm":
        layer = LSTM(n_in=n_in,
                     n_out=n_hidden_t,
                     direction=args.direction
                     )
    elif args.layer.lower() == "cnn":
        layer = CNN(n_in=n_in,
                    n_out=n_hidden_t,
                    activation=activation,
                    order=args.order
                    )
    else:
        raise Exception("unknown layer type: {}".format(args.layer))

    layers.append(layer)
    prev_output = layer.forward_all(prev_output, masks=w_masks)
    prev_output = apply_dropout(prev_output, dropout)

    # final feature representation is the concatenation of all extraction layers
    if args.user_atten:
        layer = IterAttentionLayer(
            n_in=n_emb,
            n_out=n_hidden_t
        )
        layers.append(layer)
        # user_atten_base: attention without user embeddings as queries
        if args.user_atten_base:
            slicesu = None
        softmax_input = layers[-1].multi_hop_forward(
            prev_output, user_embs=slicesu, isWord=True, masks=w_masks)
    else:
        if pooling:
            # length-normalized mean over words
            softmax_input = T.sum(prev_output, axis=0) / w_lens.dimshuffle(0, 'x')
        else:
            # last valid timestep per sentence (w_lens - 1)
            ind = T.cast(w_lens - T.ones_like(w_lens), 'int32')
            softmax_input = prev_output[T.arange(ind.shape[0]), ind]

    softmax_input = apply_dropout(softmax_input, dropout, v2=True)

    # ---- sentence-level extraction layer ----
    n_in = n_hidden_t
    size = 0
    softmax_inputs = []
    [sentlen, emblen] = T.shape(softmax_input)
    # regroup sentence vectors into (docs, s_num, dim) then to
    # (s_num, docs, dim) for the recurrent layer
    prev_output = softmax_input.reshape(
        (sentlen / s_num, s_num, emblen)).dimshuffle(1, 0, 2)

    if args.layer.lower() == "lstm":
        layer = LSTM(n_in=n_in,
                     n_out=n_hidden_t,
                     direction=args.direction
                     )
    elif args.layer.lower() == "cnn":
        layer = CNN(n_in=n_in,
                    n_out=n_hidden_t,
                    activation=activation,
                    order=args.order,
                    )
    else:
        raise Exception("unknown layer type: {}".format(args.layer))

    layers.append(layer)
    prev_output = layer.forward_all(prev_output)
    prev_output = apply_dropout(prev_output, dropout)

    if args.user_atten:
        layer = IterAttentionLayer(
            n_in=n_emb,
            n_out=n_hidden_t
        )
        layers.append(layer)

        if args.user_atten_base:
            slicesu = None
        softmax_input = layers[-1].multi_hop_forward(
            prev_output, user_embs=slicesu, isWord=False)
    else:
        if pooling:
            softmax_input = T.sum(prev_output, axis=0) / \
                T.cast(s_num, 'float32')
        else:
            softmax_input = prev_output[-1]

    softmax_input = apply_dropout(softmax_input, dropout, v2=True)

    size = n_hidden_t
    # final softmax classification layer
    layers.append(Layer(
        n_in=size,
        n_out=self.nclasses,
        activation=softmax,
        has_bias=False
    ))

    # report layer shapes; skip layer 0 when embeddings are fixed
    if not args.fix_emb:
        for l, i in zip(layers, range(len(layers))):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))
    else:
        for l, i in zip(layers[1:], range(len(layers[1:]))):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

    # unnormalized score of y given x
    self.p_y_given_x = layers[-1].forward(softmax_input)
    self.pred = T.argmax(self.p_y_given_x, axis=1)

    self.nll_loss = T.mean(T.nnet.categorical_crossentropy(
        self.p_y_given_x,
        y
    ))

    # adding regularizations
    self.l2_sqr = None
    self.params = []
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)

    nparams = sum(len(x.get_value(borrow=True).ravel())
                  for x in self.params)
    say("total # parameters: {}\n".format(nparams))
def ready(self):
    """Build a graph-based sequence tagger with a CRF output layer.

    Combines word embeddings with a char-level LSTM, optionally runs
    graph-NN layers (``GKNNMultiHeadGate``) over dependency up/down arcs,
    scores tags with a linear layer, and trains a linear-chain CRF
    (forward algorithm + Viterbi decoding).

    Returns:
        (f_train, f_eval): compiled Theano functions; ``f_train`` takes all
        inputs incl. ``tag_ids`` and returns ``[cost, nll_loss]``;
        ``f_eval`` omits ``tag_ids`` and returns the Viterbi tag sequence.
    """
    args = self.args
    w_emb_layer = self.w_emb_layer
    c_emb_layer = self.c_emb_layer
    r_emb_layers = self.r_emb_layers
    r_matrix_layers = self.r_matrix_layers

    char_dim = self.char_dim = args.char_dim
    char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim
    word_dim = self.word_dim = args.word_dim
    word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX)
        )

    # one sentence per call: word_ids is a vector, char_ids a matrix
    word_ids = self.word_ids = T.ivector('word_ids')
    char_ids = self.char_ids = T.imatrix('char_ids')
    char_lens = self.char_lens = T.fvector('char_lens')
    char_masks = self.char_masks = T.imatrix('char_masks')
    up_ids = self.up_ids = T.imatrix('up_ids')
    up_rels = self.up_rels = T.imatrix('up_rels')
    up_id_masks = self.up_id_masks = T.imatrix('up_id_masks')
    down_ids = self.down_ids = T.imatrix('down_ids')
    down_rels = self.down_rels = T.imatrix('down_rels')
    down_id_masks = self.down_id_masks = T.imatrix('down_id_masks')
    tag_ids = self.tag_ids = T.ivector('tag_ids')

    layers = self.layers = [w_emb_layer, c_emb_layer]
    layers.extend(r_emb_layers)
    layers.extend(r_matrix_layers)

    # input order matters: f_eval slices off the trailing tag_ids
    inputs = self.inputs = []
    inputs.append(self.word_ids)
    inputs.append(self.char_ids)
    inputs.append(self.char_lens)
    inputs.append(self.char_masks)
    inputs.append(self.up_ids)
    inputs.append(self.up_rels)
    inputs.append(self.up_id_masks)
    inputs.append(self.down_ids)
    inputs.append(self.down_rels)
    inputs.append(self.down_id_masks)
    inputs.append(self.tag_ids)

    wslices = w_emb_layer.forward(word_ids)
    cslices = c_emb_layer.forward(char_ids.ravel())
    cslices = cslices.reshape((char_ids.shape[0], char_ids.shape[1], char_dim))
    cslices = cslices.dimshuffle(1, 0, 2)

    # relation embeddings/matrices for up- and down- dependency arcs
    bv_ur_slicess = []
    bv_dr_slicess = []
    b_ur_slicess = []
    b_dr_slicess = []
    bv_ur_matrixss = []
    bv_dr_matrixss = []
    b_ur_matrixss = []
    b_dr_matrixss = []
    for r_matrix_layer in r_matrix_layers:
        bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel())
        bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel())
        b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel())
        b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel())
        bv_ur_matrixss.append(bv_ur_matrixs.reshape(
            (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
        bv_dr_matrixss.append(bv_dr_matrixs.reshape(
            (down_rels.shape[0], down_rels.shape[1], word_dim, word_dim)))
        b_ur_matrixss.append(b_ur_matrixs.reshape(
            (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
        b_dr_matrixss.append(b_dr_matrixs.reshape(
            (down_rels.shape[0], down_rels.shape[1], word_dim, word_dim)))

    for r_emb_layer in r_emb_layers:
        bv_ur_slices = r_emb_layer.forward(up_rels.ravel())
        bv_dr_slices = r_emb_layer.forward(down_rels.ravel())
        b_ur_slices = r_emb_layer.forward2(up_rels.ravel())
        b_dr_slices = r_emb_layer.forward2(down_rels.ravel())
        bv_ur_slicess.append(bv_ur_slices.reshape(
            (up_rels.shape[0], up_rels.shape[1], word_dim)))
        bv_dr_slicess.append(bv_dr_slices.reshape(
            (down_rels.shape[0], down_rels.shape[1], word_dim)))
        b_ur_slicess.append(b_ur_slices.reshape(
            (up_rels.shape[0], up_rels.shape[1], word_dim)))
        b_dr_slicess.append(b_dr_slices.reshape(
            (down_rels.shape[0], down_rels.shape[1], word_dim)))

    char_masks = char_masks.dimshuffle(1, 0)

    prev_output = wslices
    prev_size = word_dim

    if char_dim:
        # char-level LSTM; mean-pool its outputs over characters
        layers.append(LSTM(
            n_in = char_dim,
            n_out = char_lstm_dim,
            direction = 'bi' if args.char_bidirect else 'si'
        ))
        prev_output_2 = cslices
        # NOTE(review): this dropout result is discarded -- the next line
        # feeds the raw cslices to the LSTM. Possibly unintended; confirm.
        prev_output_2 = apply_dropout(prev_output_2, dropout, v2 = True)
        prev_output_2 = layers[-1].forward_all(cslices, char_masks)
        prev_output_2 = T.sum(prev_output_2, axis = 0)
        prev_output_2 = prev_output_2 / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x')

        prev_size += char_lstm_dim
        prev_output = T.concatenate([prev_output, prev_output_2], axis = 1)

    prev_output = apply_dropout(prev_output, dropout)

    if args.conv != 0:
        # stacked gated graph layers over the dependency structure;
        # only the first relation-embedding set (index 0) is used here
        for i in range(args.clayer):
            layers.append(GKNNMultiHeadGate(
                n_in = prev_size,
                n_out = prev_size,
                n_head = args.head
            ))
            prev_output = layers[-1].forward_all(
                prev_output, up_ids, up_id_masks, bv_ur_slicess[0],
                down_ids, down_id_masks, bv_dr_slicess[0])
            prev_output = apply_dropout(prev_output, dropout)

    #prev_size *= 2
    #layers.append(LSTM(
    #    n_in = prev_size,
    #    n_out = word_lstm_dim,
    #    direction = 'bi' if args.word_bidirect else 'si'
    #))
    #prev_output = prev_output.dimshuffle(0, 'x', 1)
    #prev_output = layers[-1].forward_all(prev_output)
    #prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[-1]))
    #prev_size = word_lstm_dim

    # linear tag-scoring layer (no bias; CRF handles calibration)
    layers.append(Layer(
        n_in = prev_size,
        n_out = args.classes,
        activation = linear, #ReLU,
        has_bias = False
    ))

    n_tags = args.classes
    s_len = char_ids.shape[0]
    tags_scores = layers[-1].forward(prev_output)
    # +2 for the artificial begin/end tags used by the CRF
    transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

    small = -1000
    b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
    e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
    observations = T.concatenate(
        [tags_scores, small * T.ones((s_len, 2))],
        axis=1
    )
    observations = T.concatenate(
        [b_s, observations, e_s],
        axis=0
    )

    # score of the gold path: emissions ...
    real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

    b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
    e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
    padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

    pre_ids = T.arange(s_len + 1)
    s_ids = T.arange(s_len + 1) + 1

    # ... plus transitions along the gold path
    real_path_score += transitions[
        padded_tags_ids[pre_ids],
        padded_tags_ids[s_ids]
    ].sum()

    all_paths_scores = CRFForward(observations, transitions)
    # CRF negative log-likelihood
    self.nll_loss = nll_loss = - (real_path_score - all_paths_scores)
    preds = CRFForward(observations, transitions, viterbi = True,
                       return_alpha = False, return_best_sequence=True)
    # strip the begin/end positions from the decoded sequence
    self.pred = preds[1:-1]

    self.l2_sqr = None
    params = self.params = [transitions]
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)

    #for l, i in zip(layers[3:], range(len(layers[3:]))):
    for l, i in zip(layers[2+len(r_emb_layers)+len(r_matrix_layers):],
                    range(len(layers[2+len(r_emb_layers)+len(r_matrix_layers):]))):
        say("layer {}: n_in={}\tn_out={}\n".format(
            i, l.n_in, l.n_out
        ))

    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                  for x in self.params)
    say("total # parameters: {}\n".format(nparams))

    cost = self.nll_loss + self.l2_sqr

    lr_method_name = args.learning
    lr_method_parameters = {}
    lr_method_parameters['lr'] = args.learning_rate
    updates = Optimization(clip=5.0).get_updates(
        lr_method_name, cost, params, **lr_method_parameters)

    f_train = theano.function(
        inputs = self.inputs,
        outputs = [cost, nll_loss],
        updates = updates,
        allow_input_downcast = True
    )

    f_eval = theano.function(
        inputs = self.inputs[:-1],
        outputs = self.pred,
        allow_input_downcast = True
    )

    return f_train, f_eval
def ready(self):
    """Build a CNN-based rationale generator with independent sampling.

    A single CNN layer maps embeddings to hidden states; a sigmoid output
    layer produces per-token selection probabilities, from which a binary
    mask ``z_pred`` is drawn with independent Bernoulli samples
    (``MRG_RandomStreams.binomial``).  Exposes ``logpz``/``zsum``/``zdiff``.
    """
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX)
        )

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = [ ]
    layer_type = args.layer.lower()
    # single CNN layer regardless of args.layer (layer_type kept for parity)
    for i in xrange(1):
        l = CNN(
                n_in = n_e,
                n_out = n_d,
                activation = activation,
                order = args.order
            )
        layers.append(l)

    # len * batch -> len * batch * 1 padding mask
    masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,'x'))

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    self.word_embs = embs

    # len*bacth*n_d
    h1 = layers[0].forward_all(embs)
    h_final = h1
    size = n_d
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = 1,
            activation = sigmoid
        )

    # len*batch*1 -- per-token selection probability
    probs = output_layer.forward(h_final)

    # len*batch
    self.MRG_rng = MRG_RandomStreams()
    z_pred_dim3 = self.MRG_rng.binomial(size=probs.shape, p=probs, dtype="int8")
    z_pred = z_pred_dim3.reshape(x.shape)

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)

    print "z_pred", z_pred.ndim

    #logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3) * masks
    logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3)
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch
    z = z_pred
    # zsum: number of selected tokens; zdiff: number of 0/1 transitions
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
def ready(self):
    """Build an encoder-decoder (seq2seq) language-model graph.

    Even-indexed layers encode the source ``idxs``; odd-indexed layers
    decode the shifted target (``idts`` = idys[:-1]) conditioned on the
    encoder's final state, predicting ``idgs`` = idys[1:].  Also exposes
    retrieval scores over the encoder state and a masked NLL cost.
    """
    args = self.args
    weights = self.weights

    # len(source) * batch
    idxs = self.idxs = T.imatrix()
    # len(target) * batch
    idys = self.idys = T.imatrix()
    # decoder input (all but last) and gold output (all but first)
    idts = idys[:-1]
    idgs = idys[1:]

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

    embedding_layer = self.embedding_layer

    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d
    n_V = self.n_V = embedding_layer.n_V

    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU

    depth = self.depth = args.depth
    layers = self.layers = []
    # 2 layers per depth level: even = encoder, odd = decoder.
    # NOTE: `i / 2 == 0` is Python-2 integer division -- true only for the
    # first encoder/decoder pair, which reads embeddings (n_e) as input.
    for i in range(depth * 2):
        if LayerType != RCNN:
            feature_layer = LayerType(n_in=n_e if i / 2 == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation)
        else:
            feature_layer = LayerType(n_in=n_e if i / 2 == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation,
                                      order=args.order,
                                      mode=args.mode,
                                      has_outgate=args.outgate)
        layers.append(feature_layer)

    self.output_layer = output_layer = Layer(
        n_in=n_d,
        n_out=n_V,
        activation=T.nnet.softmax,
    )

    # feature computation starts here

    # (len*batch)*n_e -- source embeddings, optionally re-weighted per word
    xs_flat = embedding_layer.forward(idxs.ravel())
    xs_flat = apply_dropout(xs_flat, dropout)
    if weights is not None:
        xs_w = weights[idxs.ravel()].dimshuffle((0, 'x'))
        xs_flat = xs_flat * xs_w
    # len*batch*n_e
    xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

    # (len*batch)*n_e -- target-side embeddings
    xt_flat = embedding_layer.forward(idts.ravel())
    xt_flat = apply_dropout(xt_flat, dropout)
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
        xt_flat = xt_flat * xt_w
    # len*batch*n_e
    xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

    prev_hs = xs
    prev_ht = xt
    for i in range(depth):
        # len*batch*n_d; encoder keeps cell state (return_c) so the
        # decoder can be initialized from the encoder's last state hs[-1]
        hs = layers[i * 2].forward_all(prev_hs, return_c=True)
        ht = layers[i * 2 + 1].forward_all(prev_ht, hs[-1])
        # keep only the hidden part (last n_d features)
        hs = hs[:, :, -n_d:]
        ht = ht[:, :, -n_d:]
        prev_hs = hs
        prev_ht = ht
        prev_hs = apply_dropout(hs, dropout)
        prev_ht = apply_dropout(ht, dropout)

    self.p_y_given_x = output_layer.forward(
        prev_ht.reshape((xt_flat.shape[0], n_d)))

    # encoder's final state; used for retrieval-style scoring
    h_final = hs[-1]
    self.scores2 = -(h_final[1:] - h_final[0]).norm(2, axis=1)
    h_final = self.normalize_2d(h_final)
    self.scores = T.dot(h_final[1:], h_final[0])

    # (len*batch)
    nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idgs.ravel())
    nll = nll.reshape(idgs.shape)
    self.nll = nll
    # mask out padding positions in the target
    self.mask = mask = T.cast(T.neq(idgs, self.padding_id), theano.config.floatX)
    nll = T.sum(nll * mask, axis=0)

    #layers.append(embedding_layer)
    layers.append(output_layer)
    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)))

    # NOTE: this block regularizes with the L2 *norm* (p.norm(2)), not the
    # squared norm used elsewhere in this file.
    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.loss = T.mean(nll)
    self.cost = self.loss + l2_reg
def ready(self):
    """Build a simplified rationale-generator graph (feed-forward variant).

    Produces per-token selection probabilities from a bidirectional pair of
    feature layers plus a sigmoid output layer, then samples the binary
    mask ``z_pred`` with independent Bernoulli draws.
    """
    global total_generate_time
    #say("in generator ready: \n")
    #start_generate_time = time.time()
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    layer_type = args.layer.lower()
    for i in xrange(2):
        if layer_type == "rcnn":
            l = RCNN(n_in=n_e, n_out=n_d, activation=activation,
                     order=args.order)
        elif layer_type == "lstm":
            l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
        # NOTE(review): this unconditionally replaces the RCNN/LSTM built
        # above with a plain feed-forward Layer. Together with the
        # elementwise .forward() calls below this looks deliberate for this
        # variant, but confirm it is not a debugging leftover.
        l = Layer(n_in=n_e, n_out=n_d, activation=sigmoid)
        layers.append(l)

    # len * batch
    #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
    masks = T.cast(T.neq(x, padding_id), theano.config.floatX).dimshuffle(
        (0, 1, "x"))

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    self.word_embs = embs

    flipped_embs = embs[::-1]

    # len*bacth*n_d -- elementwise forward (layers are plain Layers here)
    h1 = layers[0].forward(embs)
    h2 = layers[1].forward(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2
    #size = n_e

    output_layer = self.output_layer = Layer(n_in=size, n_out=1,
                                             activation=sigmoid)

    # len*batch*1 -- per-token selection probability
    probs = output_layer.forward(h_final)
    #probs = output_layer.forward(embs)
    #probs1 = probs.reshape(x.shape)
    #probs_rev = output_layer.forward(flipped_embs)
    #probs1_rev = probs.reshape(x.shape)
    #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2)

    # len*batch
    probs2 = probs.reshape(x.shape)
    # seedable RNG so rationale sampling can be reproduced
    if self.args.seed is not None:
        self.MRG_rng = MRG_RandomStreams(self.args.seed)
    else:
        self.MRG_rng = MRG_RandomStreams()
    z_pred = self.z_pred = T.cast(
        self.MRG_rng.binomial(size=probs2.shape, p=probs2),
        theano.config.floatX)  #"int8")

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    #
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    #self.sample_updates = sample_updates
    print "z_pred", z_pred.ndim

    z2 = z_pred.dimshuffle((0, 1, "x"))
    logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch
    z = z_pred
    # zsum: number of selected tokens; zdiff: number of 0/1 transitions
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0,
                       dtype=theano.config.floatX)

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                  for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
def ready(self):
    """Build a classifier graph that consumes a *given* rationale mask.

    PEP8-formatted variant of the classifier above: the binary mask ``z``
    is a plain ``T.bmatrix`` input; Ext{RCNN,LSTM} layers encode the
    masked embeddings, states are pooled, and the training cost is the
    mean squared error (scaled by 10) plus L2 regularization.
    """
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # len*batch
    x = self.x = T.imatrix()
    z = self.z = T.bmatrix()
    z = z.dimshuffle((0, 1, "x"))

    # batch*nclasses
    y = self.y = T.fmatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    depth = args.depth
    layer_type = args.layer.lower()
    for i in range(depth):
        if layer_type == "rcnn":
            l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                        n_out=n_d,
                        activation=activation,
                        order=args.order)
        elif layer_type == "lstm":
            l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                        n_out=n_d,
                        activation=activation)
        layers.append(l)

    # len * batch * 1 -- non-padding AND selected-by-z
    masks = T.cast(
        T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
        theano.config.floatX)
    # batch * 1 -- epsilon guards against all-zero masks
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    pooling = args.pooling
    lst_states = []
    h_prev = embs
    for l in layers:
        # len*batch*n_d
        h_next = l.forward_all(h_prev, z)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
        else:
            lst_states.append(h_next[-1])  # last state
        h_prev = apply_dropout(h_next, dropout)

    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(n_in=size,
                                             n_out=self.nclasses,
                                             activation=sigmoid)

    # batch * nclasses
    preds = self.preds = output_layer.forward(h_final)

    # batch * nclasses -- squared error against [0,1] targets
    loss_mat = self.loss_mat = (preds - y)**2
    loss = self.loss = T.mean(loss_mat)
    pred_diff = self.pred_diff = T.mean(
        T.max(preds, axis=1) - T.min(preds, axis=1))

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                  for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    cost = self.cost = loss * 10 + l2_cost
def ready(self): args = self.args index = self.index = T.lscalar() x = self.x = T.fmatrix() y = self.y = T.ivector() dropout = self.dropout = theano.shared(np.float64(args.dropout).astype( "float32")) n_d = args.hidden_dim layers = self.layers = [ ] for i in xrange(args.depth): l = Layer( n_in = 28*28 if i == 0 else n_d, n_out = n_d, activation = ReLU ) layers.append(l) output_layer = self.output_layer = Layer( n_in = n_d, n_out = 10, activation = softmax ) h = x for l in layers: h = l.forward(h) h = apply_dropout(h, dropout) self.h_final = h # batch * 10 probs = self.probs = output_layer.forward(h) # batch preds = self.preds = T.argmax(probs, axis=1) err = self.err = T.mean(T.cast(T.neq(preds, y), dtype="float32")) # loss = self.loss = -T.mean( T.log(probs[T.arange(y.shape[0]), y]) ) #loss = self.loss = T.mean( T.nnet.categorical_crossentropy( # probs, # y # )) params = self.params = [ ] for l in layers + [ output_layer ]: for p in l.params: params.append(p) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost += T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost self.cost = loss + l2_cost print "cost.dtype", self.cost.dtype
def ready(self):
    """Build the encoder graph for the rationale model (CNN variant).

    Embeds the input, replaces unselected words (z == 0) with the <unk>
    vector, runs a stack of CNN layers with pooling, and produces a
    sigmoid multi-label output. Also builds the generator's REINFORCE
    cost (`cost_g`) and the encoder cost (`cost_e`).
    """
    generator = self.generator
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]
    unk_id = embedding_layer.vocab_map["<unk>"]
    unk_vec = embedding_layer.embeddings[unk_id]
    dropout = generator.dropout
    # len*batch word ids and the generator's binary selection mask
    x = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0,1,"x"))
    # batch*nclasses gold labels
    y = self.y = T.fmatrix()
    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)
    layers = self.layers = [ ]
    depth = args.depth
    # NOTE(review): layer_type is computed but unused here — this
    # variant always builds CNN layers; verify against other encoders.
    layer_type = args.layer.lower()
    for i in xrange(depth):
        l = CNN(
                n_in = n_e if i == 0 else n_d,
                n_out = n_d,
                activation = activation,
                order = args.order
            )
        layers.append(l)
    # len * batch * 1 — 1 only where the token is non-padding AND selected
    masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z,
                   theano.config.floatX)
    # batch * 1 — epsilon guards against division by zero below
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8
    # len*batch*n_e — unselected words are swapped for the <unk> embedding
    embs = generator.word_embs*z + unk_vec.dimshuffle(('x','x',0))*(1-z)
    pooling = args.pooling
    lst_states = [ ]
    h_prev = embs
    for l in layers:
        # len*batch*n_d
        h_next = l.forward_all(h_prev)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum/cnt_non_padding) # mean pooling
        else:
            lst_states.append(T.max(h_next, axis=0))
        h_prev = apply_dropout(h_next, dropout)
    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth) — concatenate all layers' states
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)
    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = self.nclasses,
            activation = sigmoid
        )
    # batch * nclasses — independent per-label probabilities
    p_y_given_x = self.p_y_given_x = output_layer.forward(h_final)
    preds = self.preds = p_y_given_x > 0.5
    print preds, preds.dtype
    print self.nclasses
    # batch * nclasses per-label cross-entropy
    loss_mat = T.nnet.binary_crossentropy(p_y_given_x, y)
    if args.aspect < 0:
        # train on all aspects jointly
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        # train on a single designated aspect column
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:,args.aspect]
    self.loss_vec = loss_vec
    # precision/recall counters (preds is boolean, y is float)
    self.true_pos = T.sum(preds*y)
    self.tot_pos = T.sum(preds)
    self.tot_true = T.sum(y)
    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz
    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    # regularizers on the rationale: few selected words, contiguous spans
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                         T.mean(zdiff) * coherent_factor
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # REINFORCE-style surrogate: per-sample cost times log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)
    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    if not args.fix_emb:
        params += embedding_layer.params
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in params)
    say("total # parameters: {}\n".format(nparams))
    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
    # generator is trained on the surrogate; encoder on the task loss
    self.cost_g = cost_logpz + generator.l2_cost
    self.cost_e = loss + l2_cost
def ready(self):
    """Build the multi-aspect iterative-attention rating model.

    Pipeline: word embeddings -> word-level LSTM -> word-level iterative
    attention (one query per aspect) -> sentence-level LSTM ->
    sentence-level iterative attention -> one softmax classifier per
    aspect over `args.score_scale` rating classes.
    """
    args = self.args
    embedding_layer = self.embedding_layer
    num_aspects = self.num_aspects
    self.n_emb = embedding_layer.n_d
    dropout = self.dropout = theano.shared(
        np.float64(args.dropout_rate).astype(theano.config.floatX)
    )
    # symbolic inputs: word ids, word masks/lengths, sentence geometry,
    # overall label y, per-aspect labels ay (+ mask), one-hot aay
    self.x = T.imatrix('x')
    self.w_masks = T.fmatrix('mask')
    self.w_lens = T.fvector('sent_len')
    self.s_maxlen = T.iscalar('sent_max_len')
    self.s_num = T.iscalar('sent_num')
    self.y = T.ivector('y')
    self.ay = T.imatrix('ay')
    self.ay_mask = T.fmatrix('ay_mask')
    self.aay = T.itensor3('aay')
    x = self.x
    query = self.query
    w_masks = self.w_masks
    w_lens = self.w_lens
    # NOTE(review): s_ml, y and ay are bound but never used below — verify
    s_ml = self.s_maxlen
    s_num = self.s_num
    n_emb = self.n_emb
    y = self.y
    ay = self.ay
    ay_mask = self.ay_mask
    aay = self.aay
    layers = self.layers = [embedding_layer]
    # look up word embeddings, restore (rows, cols, n_emb) layout
    slices = embedding_layer.forward(x.ravel())
    self.slices = slices = slices.reshape(
        (x.shape[0], x.shape[1], n_emb)
    )
    slices_query = embedding_layer.forward(query.flatten(), is_node = False)
    slices_query = slices_query.reshape(
        (query.shape[0], query.shape[1], n_emb))
    layers.append(Query_Repr_Layer(slices_query))
    slices_query_tmp = slices_query = layers[-1].forward()
    # word-level LSTM over the dropped-out embeddings
    layer = LSTM(n_in = n_emb, n_out = n_emb)
    layers.append(layer)
    prev_output = slices
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    prev_output = layers[-1].forward_all(prev_output, w_masks)
    # project the query representation through a tanh layer
    layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh)
    layers.append(layer)
    self.slices_query = slices_query = layers[-1].forward(slices_query)
    # NOTE(review): maskss/w_lenss are built (masks tiled per aspect) but
    # never consumed afterwards — likely dead code; verify before removal.
    maskss = []
    w_lenss = []
    for i in range(num_aspects):
        maskss.append(w_masks)
        w_lenss.append(w_lens)
    maskss = T.concatenate(maskss, axis = 1)
    w_lenss = T.concatenate(w_lenss)
    # word-level iterative attention, `hop_word` hops, per aspect
    layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb)
    layers.append(layer)
    prev_output = layers[-1].forward(prev_output, slices_query,
                                     is_word = True, hop = args.hop_word,
                                     masks = w_masks,
                                     aspect_num = num_aspects)
    prev_output = prev_output.reshape((prev_output.shape[0] *
                                       prev_output.shape[1],
                                       prev_output.shape[2]))
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    # regroup to (aspects, docs, sentences, feat); `/` is Python-2 floor
    # division on the symbolic shape here
    prev_output = prev_output.reshape((num_aspects,
                                       prev_output.shape[0] /
                                       (num_aspects * s_num),
                                       s_num, prev_output.shape[1]))
    prev_output = prev_output.dimshuffle(2, 0, 1, 3)
    prev_output = prev_output.reshape((prev_output.shape[0],
                                       prev_output.shape[1] *
                                       prev_output.shape[2],
                                       prev_output.shape[3]))
    # sentence-level LSTM over the concatenated attention hops
    layer = LSTM(n_in = n_emb * args.hop_word, n_out = n_emb)
    layers.append(layer)
    prev_output = layers[-1].forward_all(prev_output)
    #layers.append(Query_Repr_Layer(slices_query))
    #slices_query = layers[-1].forward()
    layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh)
    layers.append(layer)
    # original author flagged this line as a bug: it re-projects the
    # *pre-projection* query (slices_query_tmp), not the current one
    slices_query = layers[-1].forward(slices_query_tmp) # bug
    # sentence-level iterative attention, `hop_sent` hops
    layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb)
    layers.append(layer)
    prev_output = layers[-1].forward(prev_output, slices_query,
                                     is_word = False, hop = args.hop_sent,
                                     aspect_num = num_aspects)
    prev_output = prev_output.reshape((prev_output.shape[0] *
                                       prev_output.shape[1],
                                       prev_output.shape[2]))
    prev_output = apply_dropout(prev_output, dropout, v2=True)
    prev_output = prev_output.reshape((num_aspects,
                                       prev_output.shape[0] / num_aspects,
                                       prev_output.shape[1]))
    softmax_inputs = []
    for i in range(num_aspects):
        softmax_inputs.append(prev_output[i])
    size = n_emb * args.hop_sent
    # one softmax classifier per aspect; masked NLL per aspect
    p_y_given_a = []
    pred_ay = []
    nll_loss_ay = []
    for i in range(num_aspects):
        layers.append(Layer(n_in = size, n_out = args.score_scale,
                            activation = softmax, has_bias = False,))
        p_y_given_a.append(layers[-1].forward(softmax_inputs[i]))
        nll_loss_ay.append(
            T.mean(T.sum(
                -T.log(p_y_given_a[-1]) * aay[:, i, :] *
                ay_mask[:, i].dimshuffle(0, 'x'))))
        pred_ay.append(T.argmax(p_y_given_a[-1], axis = 1))
    self.p_y_given_a = p_y_given_a
    self.nll_loss_ay = T.sum(nll_loss_ay)
    # batch * num_aspects predicted ratings
    self.pred_ay = T.stack(pred_ay).dimshuffle(1, 0)
    # NOTE(review): mixed slice bases (layers[4:] vs layers[3:]) — the
    # printed indices may be off by one; verify intent
    for l,i in zip(layers[4:], range(len(layers[3:]))):
        say("layer {}: n_in={}\tn_out={}\n".format(
            i, l.n_in, l.n_out
        ))
    # accumulate the L2 penalty over all parameters
    self.l2_sqr = None
    self.params = [ ]
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                    for x in self.params)
    say("total # parameters: {}\n".format(nparams))
def ready(self):
    """Build the encoder graph for the rationale model (RCNN/LSTM variant).

    Runs masked recurrent layers (ExtRCNN/ExtLSTM consume the selection
    mask z directly), pools into a fixed-size representation, and uses a
    squared-error loss against real-valued targets y. Builds the
    generator cost (`cost_g`, REINFORCE surrogate) and the encoder cost
    (`cost_e`), both scaled by 10.
    """
    global total_encode_time
    #say("in encoder ready: \n")
    #start_encode_time = time.time()
    generator = self.generator
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]
    dropout = generator.dropout
    # len*batch word ids and the generator's binary selection mask
    x = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0, 1, "x"))
    # batch*nclasses real-valued targets
    y = self.y = T.fmatrix()
    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)
    layers = self.layers = []
    depth = args.depth
    layer_type = args.layer.lower()
    for i in xrange(depth):
        if layer_type == "rcnn":
            l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                        n_out=n_d,
                        activation=activation,
                        order=args.order)
        elif layer_type == "lstm":
            l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                        n_out=n_d,
                        activation=activation)
        layers.append(l)
    # len * batch * 1 — non-padding AND selected tokens
    masks = T.cast(
        T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
        theano.config.floatX)
    # batch * 1 — epsilon avoids division by zero in mean pooling
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8
    # len*batch*n_e
    embs = generator.word_embs
    pooling = args.pooling
    lst_states = []
    h_prev = embs
    for l in layers:
        # len*batch*n_d — Ext layers apply the mask z internally
        h_next = l.forward_all(h_prev, z)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
        else:
            lst_states.append(h_next[-1])  # last state
        h_prev = apply_dropout(h_next, dropout)
    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)
    output_layer = self.output_layer = Layer(n_in=size,
                                             n_out=self.nclasses,
                                             activation=sigmoid)
    # batch * nclasses predicted scores
    preds = self.preds = output_layer.forward(h_final)
    # batch * nclasses squared error per aspect
    loss_mat = self.loss_mat = (preds - y)**2
    # diagnostic: spread between the highest and lowest predicted aspect
    pred_diff = self.pred_diff = T.mean(
        T.max(preds, axis=1) - T.min(preds, axis=1))
    if args.aspect < 0:
        # train on all aspects jointly
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        # train on a single designated aspect column
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:, args.aspect]
    self.loss_vec = loss_vec
    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz
    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    # rationale regularizers: few selected words, contiguous spans
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
        T.mean(zdiff) * coherent_factor
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # REINFORCE-style surrogate: per-sample cost times log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)
    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                  for x in params)
    say("total # parameters: {}\n".format(nparams))
    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
    # both costs are scaled by 10 in this variant
    self.cost_g = cost_logpz * 10 + generator.l2_cost
    self.cost_e = loss * 10 + l2_cost
def ready(self, args, train):
    """Build the LSTM language-model graph.

    Embeds token ids, runs a single LSTM over the sequence, and predicts
    the next token with a softmax layer; `self.nll` holds the per-token
    negative log-likelihood.

    :param args: dict-like config with "dropout", "hidden_dim",
        "activation" keys.
    :param train: iterable of tokens used to build the vocabulary.
    """
    # len * batch input ids and target (next-token) ids
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    # initial recurrent state, carried across minibatches by the caller
    self.init_state = T.matrix(dtype=theano.config.floatX)
    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)
    self.n_d = args["hidden_dim"]
    embedding_layer = EmbeddingLayer(
            n_d = self.n_d,
            vocab = set(w for w in train)
        )
    self.n_V = embedding_layer.n_V
    say("Vocab size: {}\tHidden dim: {}\n".format(
            self.n_V, self.n_d
        ))
    activation = get_activation_by_name(args["activation"])
    rnn_layer = LSTM(
             n_in = self.n_d,
             n_out = self.n_d,
             activation = activation
        )
    output_layer = Layer(
            n_in = self.n_d,
            n_out = self.n_V,
            activation = T.nnet.softmax,
        )
    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())
    # len * batch * n_d (after dropout + reshape)
    x = apply_dropout(x_flat, self.dropout)
    x = x.reshape( (self.idxs.shape[0], self.idxs.shape[1], self.n_d) )
    # len * batch * (n_d+n_d) — return_c=True concatenates cell state
    # and hidden state along the last axis
    h = rnn_layer.forward_all(x, self.init_state, return_c=True)
    # final timestep's state, exposed so callers can continue sequences
    self.last_state = h[-1]
    # keep only the hidden-state half for prediction
    h = h[:,:,self.n_d:]
    h = apply_dropout(h, self.dropout)
    self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))
    idys = self.idys.ravel()
    # per-token negative log-likelihood of the gold next token
    self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
    #self.nll = T.nnet.categorical_crossentropy(
    #        self.p_y_given_x,
    #        idys
    #    )
    self.layers = [ embedding_layer, rnn_layer, output_layer ]
    #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
    self.params = embedding_layer.params + rnn_layer.params + output_layer.params
    self.num_params = sum(len(x.get_value(borrow=True).ravel())
                            for l in self.layers for x in l.params)
    say("# of params in total: {}\n".format(self.num_params))
def ready(self):
    """Build the question-retrieval encoder graph.

    Encodes question titles and bodies with a stack of RCNN/LSTM/GRU
    layers, averages the two into one vector per question, and trains
    with a max-margin hinge loss over (query, positive, negatives)
    triples given by `idps`.
    """
    args = self.args
    # optional per-word IDF-style weights; None disables reweighting
    weights = self.weights
    # len(title) * batch
    idts = self.idts = T.imatrix()
    # len(body) * batch
    idbs = self.idbs = T.imatrix()
    # num pairs * 3, or num queries * candidate size
    idps = self.idps = T.imatrix()
    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))
    dropout_op = self.dropout_op = Dropout(self.dropout)
    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d
    if args.layer.lower() == "rcnn":
        LayerType = RCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
    elif args.layer.lower() == "gru":
        LayerType = GRU
    depth = self.depth = args.depth
    layers = self.layers = []
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation)
        else:
            # RCNN takes extra hyper-parameters
            feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                      n_out=n_d,
                                      activation=activation,
                                      order=args.order,
                                      mode=args.mode,
                                      has_outgate=args.outgate)
        layers.append(feature_layer)
    # feature computation starts here
    # (len*batch)*n_e — title embeddings, optionally word-weighted
    xt = embedding_layer.forward(idts.ravel())
    if weights is not None:
        xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
        xt = xt * xt_w
    # len*batch*n_e
    xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
    xt = apply_dropout(xt, dropout)
    # (len*batch)*n_e — body embeddings, same treatment
    xb = embedding_layer.forward(idbs.ravel())
    if weights is not None:
        xb_w = weights[idbs.ravel()].dimshuffle((0, 'x'))
        xb = xb * xb_w
    # len*batch*n_e
    xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
    xb = apply_dropout(xb, dropout)
    # run title and body through the same shared layer stack
    prev_ht = self.xt = xt
    prev_hb = self.xb = xb
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        hb = layers[i].forward_all(prev_hb)
        prev_ht = ht
        prev_hb = hb
    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        hb = self.normalize_3d(hb)
    say("h_title dtype: {}\n".format(ht.dtype))
    self.ht = ht
    self.hb = hb
    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        hb = self.average_without_padding(hb, idbs)
    else:
        ht = ht[-1]
        hb = hb[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))
    # batch * d — question vector = mean of title and body vectors
    h_final = (ht + hb) * 0.5
    h_final = apply_dropout(h_final, dropout)
    h_final = self.normalize_2d(h_final)
    self.h_final = h_final
    say("h_final dtype: {}\n".format(ht.dtype))
    # For testing:
    #   first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])
    # For training: gather (query, pos, neg...) rows per triple
    xp = h_final[idps.ravel()]
    xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:, 0, :]
    # num query — similarity to the positive candidate
    pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
    # num query * candidate size — similarity to each negative
    neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                       axis=2)
    # num query — hardest negative only
    neg_scores = T.max(neg_scores, axis=1)
    # hinge loss with margin 1.0
    diff = neg_scores - pos_scores + 1.0
    loss = T.mean((diff > 0) * diff)
    self.loss = loss
    params = []
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)))
    # L2-norm (not squared) regularization in this variant
    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = p.norm(2)
        else:
            l2_reg = l2_reg + p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.cost = self.loss + l2_reg
def ready(self):
    """Build the rationale question-retrieval graph.

    Encodes each question twice — the full text through `layers`, and
    the generator-selected text through mask-aware `extlayers` sharing
    the same parameters — then ties them together: a hinge loss trains
    the encoder, and cosine distances between the two encodings (plus
    sparsity/coherence terms) drive the generator via REINFORCE.
    """
    generator = self.generator
    args = self.args
    weights = self.weights
    dropout = generator.dropout
    # len(text) * batch word ids and the generator's selection mask
    idts = generator.x
    z = generator.z_pred
    z = z.dimshuffle((0,1,"x"))
    # batch * 2 — (self, paired-question) index pairs
    pairs = self.pairs = T.imatrix()
    # num pairs * 3, or num queries * candidate size
    triples = self.triples = T.imatrix()
    embedding_layer = self.embedding_layer
    activation = get_activation_by_name(args.activation)
    n_d = self.n_d = args.hidden_dim
    n_e = self.n_e = embedding_layer.n_d
    if args.layer.lower() == "rcnn":
        LayerType = RCNN
        LayerType2 = ExtRCNN
    elif args.layer.lower() == "lstm":
        LayerType = LSTM
        LayerType2 = ExtLSTM
    #elif args.layer.lower() == "gru":
    #    LayerType = GRU
    depth = self.depth = args.depth
    # plain layers encode the full text
    layers = self.layers = [ ]
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        else:
            feature_layer = LayerType(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order,
                    mode = args.mode,
                    has_outgate = args.outgate
                )
        layers.append(feature_layer)
    # Ext layers consume the mask z; weights are shared with `layers`
    # via copy_params
    extlayers = self.extlayers = [ ]
    for i in range(depth):
        if LayerType != RCNN:
            feature_layer = LayerType2(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        else:
            feature_layer = LayerType2(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order,
                    mode = args.mode,
                    has_outgate = args.outgate
                )
        feature_layer.copy_params(layers[i])
        extlayers.append(feature_layer)
    # feature computation starts here
    xt = generator.word_embs
    # encode full text into representation
    prev_ht = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        ht = layers[i].forward_all(prev_ht)
        prev_ht = ht
    # encode selected text into representation
    # NOTE(review): re-assigns self.xt with the same value — possibly a
    # separate attribute (e.g. self.xtz) was intended; verify
    prev_htz = self.xt = xt
    for i in range(depth):
        # len*batch*n_d
        htz = extlayers[i].forward_all(prev_htz, z)
        prev_htz = htz
    # normalize vectors
    if args.normalize:
        ht = self.normalize_3d(ht)
        htz = self.normalize_3d(htz)
    say("h_title dtype: {}\n".format(ht.dtype))
    self.ht = ht
    self.htz = htz
    # average over length, ignore paddings
    # batch * d
    if args.average:
        ht = self.average_without_padding(ht, idts)
        htz = self.average_without_padding(htz, idts, z)
    else:
        ht = ht[-1]
        htz = htz[-1]
    say("h_avg_title dtype: {}\n".format(ht.dtype))
    # batch * d — dropped-out, unit-normalized encodings
    h_final = apply_dropout(ht, dropout)
    h_final = self.normalize_2d(h_final)
    hz_final = apply_dropout(htz, dropout)
    hz_final = self.normalize_2d(hz_final)
    self.h_final = h_final
    self.hz_final = hz_final
    say("h_final dtype: {}\n".format(ht.dtype))
    # For testing:
    #   first one in batch is query, the rest are candidate questions
    self.scores = T.dot(h_final[1:], h_final[0])
    self.scores_z = T.dot(hz_final[1:], hz_final[0])
    # For training encoder: max-margin hinge over (query, pos, negs)
    xp = h_final[triples.ravel()]
    xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
    # num query * n_d
    query_vecs = xp[:,0,:]
    # num query
    pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
    # num query * candidate size
    neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
    # num query — hardest negative
    neg_scores = T.max(neg_scores, axis=1)
    diff = neg_scores - pos_scores + 1.0
    hinge_loss = T.mean( (diff>0)*diff )
    # For training generator
    # batch — cosine distance of masked encoding to (a) its own full
    # encoding and (b) the paired question's full encoding
    self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
    pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:,1]],
                                       axis=1)
    alpha = args.alpha
    loss_vec = self_cosine_distance*alpha + pair_cosine_distance*(1-alpha)
    #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8
    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz
    # rationale regularizers: few selected words, contiguous spans
    sfactor = args.sparsity
    cfactor = args.sparsity * args.coherent
    scost_vec = zsum*sfactor + zdiff*cfactor
    # batch
    cost_vec = loss_vec + scost_vec
    # REINFORCE-style surrogate: per-sample cost times log-prob of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
    self.obj = loss + sparsity_cost
    params = [ ]
    for l in self.layers:
        params += l.params
    self.params = params
    say("num of parameters: {}\n".format(
        sum(len(x.get_value(borrow=True).ravel()) for x in params)
    ))
    l2_reg = None
    for p in params:
        if l2_reg is None:
            l2_reg = T.sum(p**2) #p.norm(2)
        else:
            l2_reg = l2_reg + T.sum(p**2) #p.norm(2)
    l2_reg = l2_reg * args.l2_reg
    self.l2_cost = l2_reg
    beta = args.beta
    # generator trained on surrogate; encoder on hinge + weighted loss
    self.cost_g = cost_logpz + generator.l2_cost
    self.cost_e = hinge_loss + loss*beta + l2_reg
    print "cost dtype", self.cost_g.dtype, self.cost_e.dtype