def __init__(self, n_in, n_out, fc_in, fc_out, sample=False):
    super(Sampler, self).__init__(n_in=n_in, n_out=n_out)

    if sample:
        self.MRG_rng = MRG_RandomStreams()

    self.fc_layer = Layer(n_in=fc_in,
                          n_out=fc_out,
                          activation=get_activation_by_name('relu'),
                          has_bias=True)

    self.fc_layer_final = Layer(n_in=fc_out,
                                n_out=1,
                                activation=get_activation_by_name('sigmoid'),
                                has_bias=True,
                                clip_inp=True)
def ready(self):
    embedding_layer = self.embedding_layer
    args = self.args
    self.padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # inp_len x batch
    x = self.x = T.imatrix('x')
    fw_mask = self.fw_mask = T.imatrix('fw')
    chunk_sizes = self.chunk_sizes = T.imatrix('sizes')
    self.bm = T.imatrix('bm')
    self.posit_x = T.imatrix('pos')

    # reverse-direction chunk mask: fw_mask shifted down one step,
    # with a row of ones prepended for the first position
    rv_mask = T.concatenate(
        [T.ones((1, fw_mask.shape[1])), fw_mask[:-1]], axis=0)

    self.z_totals = T.sum(T.neq(self.x, self.padding_id),
                          axis=0,
                          dtype=theano.config.floatX)

    self.layers = []
    self.params = []

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    self.pad_mask = T.cast(T.neq(x, self.padding_id), 'int32')
    self.chunk_mask = T.cast(T.neq(chunk_sizes, 0), 'int32')

    embs = embedding_layer.forward(x.ravel())
    self.word_embs = embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    self.embs = apply_dropout(embs, dropout)

    if args.generator_encoding == 'cnn':
        h_final, size = self.cnn_encoding(chunk_sizes, rv_mask, n_e, n_d / 2)
    else:
        h_final, size = self.lstm_encoding(fw_mask, rv_mask, n_e, n_d, activation)

    self.size = size
    self.h_final = apply_dropout(h_final, dropout)
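# Illustrative sketch (not part of the model): how `rv_mask` above is derived
# from the forward chunk mask, in plain numpy. The toy `fw_mask` is made up;
# in the model these are symbolic int matrices of shape (inp_len, batch).
# Assuming fw_mask marks the last word of each chunk, the one-step shift plus
# the prepended row of ones marks each chunk's first word for the reverse pass.
import numpy as np

fw_mask = np.array([[0], [1], [0], [1]], dtype='int32')
rv_mask = np.concatenate([np.ones((1, fw_mask.shape[1]), dtype='int32'),
                          fw_mask[:-1]], axis=0)
print(rv_mask.ravel())   # [1 0 1 0]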
def ready(self):
    embedding_layer = self.embedding_layer
    args = self.args
    self.padding_id = padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # inp_len x batch
    x = self.x = T.imatrix('x')

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    for i in xrange(2):
        if args.layer == 'lstm':
            l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
        else:
            l = RCNN(n_in=n_e, n_out=n_d, activation=activation, order=args.order)
        layers.append(l)

    self.masks = masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

    embs = embedding_layer.forward(x.ravel())
    self.word_embs = embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    flipped_embs = embs[::-1]
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)
    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    self.h_final = h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    output_layer = self.output_layer = ZLayer(
        n_in=size,
        n_hidden=args.hidden_dimension2,
        activation=activation,
        layer='rcnn',
    )

    # sample a binary selection z for every word position
    z_pred, sample_updates = output_layer.sample_all(h_final)
    self.non_sampled_zpred, _ = output_layer.sample_all_pretrain(h_final)

    # the gradient is approximated by sampling z, so the sample itself
    # is cut out of the gradient propagation path
    z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    self.sample_updates = sample_updates

    probs = output_layer.forward_all(h_final, z_pred)

    logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape) * masks

    # batch
    z = z_pred
    # zsum: number of selected words; zdiff: number of 0/1 transitions
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX)

    params = self.params = []
    for l in layers + [output_layer] + [embedding_layer]:
        for p in l.params:
            params.append(p)

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p ** 2)
        else:
            l2_cost = l2_cost + T.sum(p ** 2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
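# Illustrative sketch (not part of the model): the quantities computed from a
# sampled binary selection z above, in plain numpy. logpz is the negative
# binary cross-entropy between the ZLayer probabilities and the sample.
# Toy values; shapes mirror (inp_len, batch) with batch = 1.
import numpy as np

z     = np.array([[0.], [1.], [1.], [0.], [1.]])   # sampled selection
probs = np.array([[.2], [.8], [.7], [.4], [.9]])   # p(z_t = 1 | x) from the ZLayer

zsum  = z.sum(axis=0)                               # words selected per example
zdiff = np.abs(z[1:] - z[:-1]).sum(axis=0)          # how often the selection switches
logpz = (z * np.log(probs) + (1 - z) * np.log(1 - probs)).sum(axis=0)

print(zsum, zdiff, logpz)   # [3.] [3.] [~ -1.42]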
def ready(self):
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    layers = []
    params = self.params = []

    # hl_inp_len x (batch * n)
    y = self.y = T.imatrix('y')
    # (batch * n) x n_classes
    gold_standard_entities = self.gold_standard_entities = T.ivector('gs')
    loss_mask = self.loss_mask = T.ivector('loss_mask')

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    mask_y = T.cast(T.neq(y, padding_id), theano.config.floatX).dimshuffle((0, 1, 'x'))

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d

    embs_y = embedding_layer.forward(y.ravel())
    embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))
    flipped_embs_y = embs_y[::-1]
    flipped_mask_y = mask_y[::-1]

    rnn_fw = MaskedLSTM(n_in=n_e, n_out=n_d)
    rnn_rv = MaskedLSTM(n_in=n_e, n_out=n_d)

    h_f_y = rnn_fw.forward_all_hl(embs_y, mask_y)
    h_r_y = rnn_rv.forward_all_hl(flipped_embs_y, flipped_mask_y)

    # 1 x (batch * n) x n_d -> (batch * n) x (2 * n_d) x 1
    h_concat_y = T.concatenate([h_f_y, h_r_y], axis=2).dimshuffle((1, 2, 0))

    layers.append(rnn_fw)
    layers.append(rnn_rv)

    if not args.qa_hl_only:
        self.x = x = T.imatrix('x')
        mask_x = T.cast(T.neq(x, padding_id),
                        theano.config.floatX).dimshuffle((0, 1, 'x'))
        tiled_x_mask = T.tile(mask_x, (args.n, 1)).dimshuffle((1, 0, 2))

        embs = embedding_layer.forward(x.ravel())
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        self.embs = embs = apply_dropout(embs, dropout)

        flipped_embs_x = embs[::-1]
        flipped_mask_x = mask_x[::-1]

        h_f_x = rnn_fw.forward_all_doc(embs, mask_x)
        h_r_x = rnn_rv.forward_all_doc(flipped_embs_x, flipped_mask_x)
        h_concat_x = T.concatenate([h_f_x, h_r_x[::-1]], axis=2)

        softmax_mask = T.zeros_like(tiled_x_mask) - 1e8
        self.softmax_mask = softmax_mask = softmax_mask * (tiled_x_mask - 1)

        # inp_len x batch x n_d -> inp_len x batch x (2 * n_d)
        # (batch * n) x inp_len x (2 * n_d)
        gen_h_final = T.tile(h_concat_x, (args.n, 1)).dimshuffle((1, 0, 2))

        if args.bilinear:
            bilinear_l = Bilinear(n_d, x.shape[1], args.n)
            inp_dot_hl = bilinear_l.forward(gen_h_final, h_concat_y)
            layers.append(bilinear_l)
        else:
            # (batch * n) x inp_len x 1
            inp_dot_hl = T.batched_dot(gen_h_final, h_concat_y)

        h_size = n_d * 2

        inp_dot_hl = inp_dot_hl - softmax_mask
        inp_dot_hl = inp_dot_hl.ravel()

        # (batch * n) x inp_len
        self.alpha = alpha = T.nnet.softmax(
            inp_dot_hl.reshape((args.n * x.shape[1], x.shape[0])))

        # (batch * n) x n_d * 2
        o = T.batched_dot(alpha, gen_h_final)

        output_size = h_size * 4
        h_concat_y = h_concat_y.reshape((o.shape[0], o.shape[1]))
        self.o = o = T.concatenate(
            [o, h_concat_y, T.abs_(o - h_concat_y), o * h_concat_y], axis=1)
    else:
        h_concat_y = h_concat_y.reshape((y.shape[1], n_d * 2))
        self.o = o = h_concat_y
        output_size = n_d * 2

    fc7 = Layer(n_in=output_size,
                n_out=512,
                activation=get_activation_by_name('relu'),
                has_bias=True)
    fc7_out = fc7.forward(o)

    output_layer = Layer(n_in=512,
                         n_out=self.nclasses,
                         activation=softmax,
                         has_bias=True)

    layers.append(fc7)
    layers.append(output_layer)

    preds = output_layer.forward(fc7_out)
    self.preds_clipped = preds_clipped = T.clip(preds, 1e-7, 1.0 - 1e-7)

    cross_entropy = T.nnet.categorical_crossentropy(
        preds_clipped, gold_standard_entities) * loss_mask
    loss = self.loss = T.mean(cross_entropy)

    for l in layers + [embedding_layer]:
        for p in l.params:
            params.append(p)

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p ** 2)
        else:
            l2_cost = l2_cost + T.sum(p ** 2)
    l2_cost = l2_cost * args.l2_reg

    self.l2_cost = l2_cost
    self.cost_e = loss + l2_cost
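# Illustrative sketch (not part of the model): the masked attention used above,
# in plain numpy. Scores at padded positions get ~1e8 subtracted before the
# softmax, so their attention weight is numerically zero. Toy numbers; in the
# model the scores come from batched_dot(gen_h_final, h_concat_y) or Bilinear.
import numpy as np

scores = np.array([1.0, 2.0, 0.5, 3.0])   # one query over 4 document positions
mask   = np.array([1,   1,   1,   0  ])   # last position is padding
masked = scores - 1e8 * (1 - mask)        # same effect as `inp_dot_hl - softmax_mask`

alpha = np.exp(masked - masked.max())
alpha /= alpha.sum()
print(alpha)   # padded position gets ~0 weight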
def ready(self):
    generator = self.generator
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    layers = []
    params = self.params = []

    # hl_inp_len x (batch * n)
    y = self.y = T.imatrix('y')
    # (batch * n) x n_classes
    gold_standard_entities = self.gold_standard_entities = T.ivector('gs')
    # inp_len x batch
    bm = self.bm = generator.bm
    loss_mask = self.loss_mask = T.ivector('loss_mask')

    # inp_len x batch
    x = generator.x
    z = generator.z_pred

    mask_y = T.cast(T.neq(y, padding_id), theano.config.floatX).dimshuffle((0, 1, 'x'))

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d

    embs_y = embedding_layer.forward(y.ravel())
    embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))
    flipped_embs_y = embs_y[::-1]
    flipped_mask_y = mask_y[::-1]

    rnn_fw = MaskedLSTM(n_in=n_e, n_out=n_d)
    rnn_rv = MaskedLSTM(n_in=n_e, n_out=n_d)

    h_f_y = rnn_fw.forward_all_hl(embs_y, mask_y)
    h_r_y = rnn_rv.forward_all_hl(flipped_embs_y, flipped_mask_y)

    layers.append(rnn_fw)
    layers.append(rnn_rv)

    mask_x = T.cast(T.neq(x, padding_id) * z,
                    theano.config.floatX).dimshuffle((0, 1, 'x'))
    tiled_x_mask = T.tile(mask_x, (args.n, 1)).dimshuffle((1, 0, 2))

    if args.use_generator_h:
        h_concat_x = self.generator.word_level_h
        if args.generator_encoding == 'cnn':
            layers.extend(self.generator.layers[:4])
        else:
            layers.extend(self.generator.layers[:2])
    else:
        embs_x = generator.word_embs
        flipped_embs_x = embs_x[::-1]
        flipped_mask_x = mask_x[::-1]

        h_f_x = rnn_fw.forward_all_doc(embs_x, mask_x)
        h_r_x = rnn_rv.forward_all_doc(flipped_embs_x, flipped_mask_x)
        h_concat_x = T.concatenate([h_f_x, h_r_x[::-1]], axis=2)

    softmax_mask = T.zeros_like(tiled_x_mask) - 1e8
    self.softmax_mask = softmax_mask = softmax_mask * (tiled_x_mask - 1)

    # 1 x (batch * n) x n_d -> (batch * n) x (2 * n_d) x 1
    h_concat_y = T.concatenate([h_f_y, h_r_y], axis=2).dimshuffle((1, 2, 0))

    # inp_len x batch x n_d -> inp_len x batch x (2 * n_d)
    # (batch * n) x inp_len x (2 * n_d)
    gen_h_final = T.tile(h_concat_x, (args.n, 1)).dimshuffle((1, 0, 2))

    if args.bilinear:
        bilinear_l = Bilinear(n_d, x.shape[1], args.n)
        inp_dot_hl = bilinear_l.forward(gen_h_final, h_concat_y)
        layers.append(bilinear_l)
    else:
        # (batch * n) x inp_len x 1
        inp_dot_hl = T.batched_dot(gen_h_final, h_concat_y)

    h_size = n_d * 2

    inp_dot_hl = inp_dot_hl - softmax_mask
    inp_dot_hl = inp_dot_hl.ravel()

    # (batch * n) x inp_len
    self.alpha = alpha = T.nnet.softmax(
        inp_dot_hl.reshape((args.n * x.shape[1], x.shape[0])))

    # (batch * n) x n_d * 2
    o = T.batched_dot(alpha, gen_h_final)

    output_size = h_size * 4
    h_concat_y = h_concat_y.reshape((o.shape[0], o.shape[1]))
    self.o = o = T.concatenate(
        [o, h_concat_y, T.abs_(o - h_concat_y), o * h_concat_y], axis=1)

    fc7 = Layer(n_in=output_size,
                n_out=512,
                activation=get_activation_by_name('relu'),
                has_bias=True)
    fc7_out = fc7.forward(o)

    output_layer = Layer(n_in=512,
                         n_out=self.nclasses,
                         activation=softmax,
                         has_bias=True)

    layers.append(fc7)
    layers.append(output_layer)

    preds = output_layer.forward(fc7_out)
    self.preds_clipped = preds_clipped = T.clip(preds, 1e-7, 1.0 - 1e-7)

    cross_entropy = T.nnet.categorical_crossentropy(
        preds_clipped, gold_standard_entities) * loss_mask
    loss_mat = cross_entropy.reshape((x.shape[1], args.n))

    word_ol = z * bm
    total_z_word_overlap_per_sample = T.sum(word_ol, axis=0)
    total_overlap_per_sample = T.sum(bm, axis=0) + args.bigram_smoothing
    self.word_overlap_loss = word_overlap_loss = \
        total_z_word_overlap_per_sample / total_overlap_per_sample

    self.loss_vec = loss_vec = T.mean(loss_mat, axis=1)

    logpz = generator.logpz
    loss = self.loss = T.mean(cross_entropy)

    z_totals = T.sum(T.neq(x, padding_id), axis=0, dtype=theano.config.floatX)
    self.zsum = zsum = T.abs_(generator.zsum / z_totals - args.z_perc)
    self.zdiff = zdiff = generator.zdiff / z_totals

    self.cost_vec = cost_vec = loss_vec + args.coeff_adequacy * (
        1 - word_overlap_loss) + args.coeff_z * (2 * zsum + zdiff)

    self.logpz = logpz = T.sum(logpz, axis=0)
    self.cost_logpz = cost_logpz = T.mean(cost_vec * logpz)
    self.obj = T.mean(cost_vec)

    for l in layers + [embedding_layer]:
        for p in l.params:
            params.append(p)

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p ** 2)
        else:
            l2_cost = l2_cost + T.sum(p ** 2)
    l2_cost = l2_cost * args.l2_reg

    self.l2_cost = l2_cost
    self.cost_g = cost_logpz * args.coeff_cost_scale + generator.l2_cost
    self.cost_e = loss + l2_cost
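# Illustrative sketch (not part of the model): the generator cost above is a
# score-function (REINFORCE-style) surrogate. Because z is sampled and
# disconnected from the gradient path, the generator is trained through
# mean(cost_vec * sum_t log p(z_t)); low-cost samples push their log-probability
# up, high-cost samples push it down. Toy numbers below, batch of 3.
import numpy as np

cost_vec = np.array([0.9, 0.2, 1.5])       # per-sample cost (loss + regularizers)
logpz    = np.array([-3.1, -2.4, -4.0])    # sum over time of log p(z_t), per sample

cost_logpz = np.mean(cost_vec * logpz)     # scalar surrogate whose gradient w.r.t.
print(cost_logpz)                          # the generator params is the estimate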
def ready(self):
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # len*batch
    x = self.x = T.imatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    layer_type = args.layer.lower()
    for i in xrange(2):
        if layer_type == "rcnn":
            l = RCNN(n_in=n_e, n_out=n_d, activation=activation, order=args.order)
        elif layer_type == "lstm":
            l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
        layers.append(l)

    # len * batch
    self.masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)
    self.word_embs = embs

    flipped_embs = embs[::-1]

    # len*batch*n_d
    h1 = layers[0].forward_all(embs)
    h2 = layers[1].forward_all(flipped_embs)

    h_final = T.concatenate([h1, h2[::-1]], axis=2)
    h_final = apply_dropout(h_final, dropout)
    size = n_d * 2

    h1_sent = h1[args.sentence_length - 1::args.sentence_length]
    h2_sent = h2[args.sentence_length - 1::args.sentence_length]
    # h_final_sent = T.concatenate([h1_sent, h2_sent[::-1]], axis=2)
    # h_final_sent = apply_dropout(h_final_sent, dropout)

    output_layer = self.output_layer = ZLayer(
        n_in=size,
        n_hidden=args.hidden_dimension2,
        activation=activation)

    # sample z given text (i.e. x)
    z_pred, sample_updates = output_layer.sample_all(h_final)

    # we are computing approximated gradient by sampling z;
    # so should mark sampled z not part of the gradient propagation path
    # z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
    self.sample_updates = sample_updates
    print "z_pred", z_pred.ndim

    probs_word = output_layer.forward_all(h_final, z_pred)

    # SENTENCE LEVEL
    # output_layer_sent = self.output_layer_sent = ZLayer(
    #     n_in=size,
    #     n_hidden=args.hidden_dimension2,
    #     activation=activation
    # )
    #
    # z_pred_sent, sample_updates_sent = output_layer_sent.sample_all(h_final_sent)
    #
    # z_pred_sent = self.z_pred_sent = theano.gradient.disconnected_grad(z_pred_sent)
    # self.sample_updates_sent = sample_updates_sent
    #
    # probs_sent = output_layer_sent.forward_all(h_final_sent, z_pred_sent)
    #
    # z_pred_sent = T.repeat(z_pred_sent, args.sentence_length, axis=0)

    self.z_pred_combined = z_pred
    # probs_sent = T.repeat(probs_sent, args.sentence_length, axis=0)
    probs = probs_word

    logpz = -T.nnet.binary_crossentropy(probs, self.z_pred_combined) * self.masks
    logpz = self.logpz = logpz.reshape(x.shape)
    probs = self.probs = probs.reshape(x.shape)

    # batch
    z = self.z_pred_combined
    self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
    self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX)

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p ** 2)
        else:
            l2_cost = l2_cost + T.sum(p ** 2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost
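# Illustrative sketch (not part of the model): the `h1_sent`/`h2_sent` slicing
# above keeps one hidden state per fixed-length sentence, namely the state at
# each sentence's last token. Toy example with sentence_length = 3 and 9 tokens.
import numpy as np

sentence_length = 3
h1 = np.arange(9)                                    # stand-in for time-major hidden states
h1_sent = h1[sentence_length - 1::sentence_length]
print(h1_sent)                                       # [2 5 8] -> last position of each sentence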
def ready(self):
    generator = self.generator
    embedding_layer = self.embedding_layer
    embedding_layer_y = self.embedding_layer_y
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = generator.dropout

    # len*batch
    y = self.y = T.imatrix()
    y_mask = T.cast(T.neq(y, padding_id), theano.config.floatX)
    bv = self.bv = T.imatrix()

    z = generator.z_pred_combined
    z = z.dimshuffle((0, 1, "x"))
    y_mask = y_mask.dimshuffle((0, 1, "x"))

    # batch*nclasses
    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    # (len*batch)*n_e
    embs = generator.word_embs
    # (gs_len*batch)*n_e
    embs_y = embedding_layer_y.forward(y.ravel())
    embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))

    l = ExtRCNN(n_in=n_e, n_out=n_d, activation=activation, order=args.order)

    h_prev = embs
    h_prev_y = embs_y

    # len*batch*n_d
    h_next_y = l.forward_all_2(h_prev_y, y_mask)
    h_next_y = theano.gradient.disconnected_grad(h_next_y)
    h_next = l.forward_all(h_prev, z)

    h_next = h_next[::args.sentence_length]
    h_final_y = h_next_y[::args.sentence_length_hl]

    h_final = apply_dropout(h_next, dropout)

    h_final_y = h_final_y.dimshuffle(1, 0, 2)  # 15 x 4 x 200
    h_final = h_final.dimshuffle(1, 0, 2)      # 15 x 10 x 200

    h_final_y_r = (h_final_y ** 2).sum(2, keepdims=True)                     # 15 x 4 x 1
    h_final_r = (h_final ** 2).sum(2, keepdims=True).dimshuffle(0, 2, 1)     # 15 x 1 x 10

    batched_dot = T.batched_dot(h_final_y, h_final.dimshuffle(0, 2, 1))      # 15 x 4 x 10

    # (15 x 4 x 1 + 15 x 1 x 10) + (15 x 4 x 10)
    squared_euclidean_distances = h_final_y_r + h_final_r - 2 * batched_dot

    similarity = T.sqrt(squared_euclidean_distances).dimshuffle(1, 0, 2)     # 4 x 15 x 10

    loss_mat = self.loss_mat = T.min(similarity, axis=2, keepdims=True)      # 4 x 15 x 1
    self.loss_vec = loss_vec = T.mean(loss_mat, axis=0)

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz

    padded = T.shape_padaxis(T.zeros_like(bv[0]), axis=1).dimshuffle((1, 0))
    component_2 = T.concatenate([bv[1:], padded], axis=0)
    # component_2 = T.stack([shifted_bv, bv], axis=2)

    self.bigram_overlap = component_2 * bv
    intersection = T.sum(self.bigram_overlap)
    jac = (intersection + args.jaccard_smoothing) / (T.sum(bv) + args.jaccard_smoothing)
    jac = 1 - jac

    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    self.sparsity_cost = T.mean(zsum) * args.sparsity + \
        T.mean(zdiff) * coherent_factor

    samp = zsum * args.sparsity + zdiff * coherent_factor

    cost_vec = samp + loss_vec + jac
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec) + jac

    self.encoder_params = l.params

    params = self.params = []
    for p in l.params:
        params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) for x in params)
    say("total # parameters: {}\n".format(nparams))

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p ** 2)
        else:
            l2_cost = l2_cost + T.sum(p ** 2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    self.cost_g = cost_logpz * 10 + generator.l2_cost
    self.cost_e = loss * 10 + l2_cost
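# Illustrative sketch (not part of the model): the pairwise distance above uses
# the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, computed per batch
# element with batched_dot. Plain-numpy check on made-up shapes (batch of 2,
# 4 highlight states vs 10 document states, dimension 5):
import numpy as np

rng = np.random.RandomState(0)
hy = rng.randn(2, 4, 5)    # like h_final_y after dimshuffle: batch x n_hl x n_d
hx = rng.randn(2, 10, 5)   # like h_final after dimshuffle:   batch x n_doc x n_d

hy_r = (hy ** 2).sum(2, keepdims=True)                      # batch x 4 x 1
hx_r = (hx ** 2).sum(2, keepdims=True).transpose(0, 2, 1)   # batch x 1 x 10
dots = np.einsum('bij,bkj->bik', hy, hx)                    # batch x 4 x 10

sq_dist = hy_r + hx_r - 2 * dots
ref = ((hy[:, :, None, :] - hx[:, None, :, :]) ** 2).sum(-1)
print(np.allclose(sq_dist, ref))   # True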