class EncoderDecoder(object):
    """Attention-based encoder-decoder translation model (Keras backend `K`).

    The model is assembled from a source lookup table, a bidirectional
    encoder, a target lookup table, a decoder and an output logistic layer.
    When ``with_reconstruction`` is set, an inverse decoder and an inverse
    logistic layer are added; they predict the source from the decoder
    hidden states.

    Every layer is appended to ``self.layers`` and its parameters are
    collected, in the same order, into ``self.params``.
    """

    def __init__(self, **kwargs):
        """Pop the hyper-parameters from ``kwargs`` and build the layers.

        All keys are required; a missing one raises ``KeyError``. Required
        keys: nembed_src, nembed_trg, nhids_src, nhids_trg, src_vocab_size,
        trg_vocab_size, method, dropout, maxout_part, saveto, clip_c, mkl,
        with_attention, with_coverage, coverage_dim, coverage_type,
        max_fertility, with_context_gate, with_reconstruction,
        reconstruction_weight.
        """
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.mkl = kwargs.pop('mkl')
        self.with_attention = kwargs.pop('with_attention')
        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        # Compare by value: identity (`is`) only matches interned strings and
        # silently fails for values read from a file or the command line.
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1
        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.src_vocab_size, self.n_in_src,
                                     name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.n_in_src, self.n_hids_src,
                                            self.table_src, self.mkl,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        self.table_trg = LookupTable(self.trg_vocab_size, self.n_in_trg,
                                     name='table_trg')
        self.layers.append(self.table_trg)

        # The context fed to the decoder has 2 * n_hids_src units.
        self.decoder = Decoder(self.mkl,
                               self.n_in_trg,
                               self.n_hids_trg,
                               2 * self.n_hids_src,
                               with_attention=self.with_attention,
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               with_context_gate=self.with_context_gate,
                               maxout_part=self.maxout_part,
                               name='rnn_decoder')
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')
        self.reconstruction_weight = kwargs.pop('reconstruction_weight')
        if self.with_reconstruction:
            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(
                self.n_in_src,
                2 * self.n_hids_src,
                self.n_hids_trg,
                with_attention=self.with_attention,
                maxout_part=self.maxout_part,
                name='rnn_inverse_decoder')
            self.layers.append(self.inverse_decoder)

            self.inverse_logistic_layer = LogisticRegression(
                self.n_in_src, self.src_vocab_size, name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer_with_data_parallel(self, src, src_mask, trg, trg_mask,
                                         ite, devices,
                                         l1_reg_weight=1e-6,
                                         l2_reg_weight=1e-6,
                                         softmax_output_num_sampled=100000):
        """Build ``self.train_fn`` with one model replica per device.

        ``src``, ``src_mask``, ``trg`` and ``trg_mask`` are lists holding one
        tensor per entry of ``devices``. The compiled function takes the
        concatenation of those four lists as inputs and returns the averaged
        loss followed by the per-device losses. ``ite`` is not used here.

        Requires the TensorFlow backend.
        """
        assert K._BACKEND == 'tensorflow'

        src_mask_3d = [K.expand_dims(mask) for mask in src_mask]
        trg_mask_3d = [K.expand_dims(mask) for mask in trg_mask]

        num_devices = len(devices)
        loss_list = []
        grads_list = []

        # TODO: group the devices by hosts, first calculate the averaged
        # gradients for each host
        for i, device in enumerate(devices):
            # NOTE(review): the source was flattened; the gradient call is
            # assumed to sit inside the device scope -- confirm.
            with tf.device(device):
                loss = self.calc_loss(
                    src[i], src_mask_3d[i], trg[i], trg_mask_3d[i],
                    l1_reg_weight=l1_reg_weight,
                    l2_reg_weight=l2_reg_weight,
                    softmax_output_num_sampled=softmax_output_num_sampled)
                loss_list.append(loss)
                grads = K.gradients(loss, self.params)
                grads_list.append(grads)

        avg_loss = sum(loss_list) / num_devices
        # use customized version of gradient to enable
        # colocate_gradients_with_ops, to ensure the gradients are computed
        # by the same device that does the forward computation
        grads = avg_grads(grads_list)
        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)

        inps = src + src_mask + trg + trg_mask
        self.train_fn = K.function(inps, [avg_loss] + loss_list,
                                   updates=updates)

    def calc_loss(self, src, src_mask_3d, trg, trg_mask_3d,
                  l1_reg_weight=1e-6, l2_reg_weight=1e-6,
                  softmax_output_num_sampled=100000):
        """Return the regularised training cost for one batch.

        The cost is the readout loss, plus the weighted reconstruction loss
        when ``with_reconstruction`` is set, plus L1 and L2 penalties over
        ``self.params``. Masks must already carry the trailing axis added by
        ``K.expand_dims``.
        """
        annotations = self.encoder.apply(src, src_mask_3d)
        # mean pooling over the first axis, ignoring masked positions
        init_context = (K.sum(annotations * src_mask_3d, axis=0)
                        / K.sum(src_mask_3d, axis=0))

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(
            K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
            [1, 0, 2])

        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask_3d,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask_3d)

        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        cost = calc_loss_from_readout(
            readout=readout,
            targets=trg,
            targets_mask=trg_mask_3d,
            logisticRegressionLayer=self.logistic_layer,
            softmax_output_num_sampled=softmax_output_num_sampled)

        if self.with_reconstruction:
            # the decoder hidden states play the role of the annotations
            inverse_init_context = (K.sum(hiddens * trg_mask_3d, axis=0)
                                    / K.sum(trg_mask_3d, axis=0))
            src_emb = self.table_src.apply(src)
            src_emb_shifted = K.permute_dimensions(
                K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])),
                [1, 0, 2])
            inverse_hiddens, inverse_readout, inverse_alignment = \
                self.inverse_decoder.run_pipeline(
                    state_below=src_emb_shifted,
                    mask_below=src_mask_3d,
                    init_context=inverse_init_context,
                    c=hiddens,
                    c_mask=trg_mask_3d)

            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = self.inverse_logistic_layer.get_logits(
                inverse_readout)
            inverse_logits_flat = K.reshape(
                inverse_logits,
                shape=(-1, self.inverse_logistic_layer.n_out))
            reconstruction_cost = get_category_cross_entropy_from_flat_logits(
                inverse_logits_flat, src, src_mask_3d)
            cost += reconstruction_cost * self.reconstruction_weight

        L1 = sum([K.sum(K.abs(param)) for param in self.params])
        L2 = sum([K.sum(K.square(param)) for param in self.params])
        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight
        cost += params_regular

        return cost

    def build_trainer_with_model_parallel(self, src, src_mask, trg, trg_mask,
                                          ite, ps_device, devices,
                                          l1_reg_weight=1e-6,
                                          l2_reg_weight=1e-6):
        """Build ``self.train_fn`` with the output layer split over devices.

        The compiled function takes ``[src, src_mask, trg, trg_mask]`` and
        returns ``[loss]``. ``ite`` is not used here. Requires the
        TensorFlow backend.
        """
        assert K._BACKEND == 'tensorflow'

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)

        # compute loss and grads
        loss = self.calc_loss_with_model_parallel(
            src, src_mask_3d, trg, trg_mask_3d,
            ps_device=ps_device,
            devices=devices,
            l1_reg_weight=l1_reg_weight,
            l2_reg_weight=l2_reg_weight)

        grads = tf.gradients(loss, self.params,
                             colocate_gradients_with_ops=True)
        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)

        inps = [src, src_mask, trg, trg_mask]
        self.train_fn = K.function(inps, [loss], updates=updates)

    def calc_loss_with_model_parallel(self, src, src_mask_3d, trg,
                                      trg_mask_3d, ps_device, devices,
                                      l1_reg_weight=1e-6,
                                      l2_reg_weight=1e-6):
        """Return the regularised cost, computing logits on several devices.

        The recurrent part is pinned to ``devices[0]``; the logits come from
        ``get_logits_with_multiple_devices(readout, ps_device, devices)``.

        NOTE(review): the ``tf.device`` scope boundaries below were
        reconstructed from a flattened source -- confirm the intended device
        placement.
        """
        assert K._BACKEND == 'tensorflow'

        with tf.device(devices[0]):
            annotations = self.encoder.apply(src, src_mask_3d)
            init_context = (K.sum(annotations * src_mask_3d, axis=0)
                            / K.sum(src_mask_3d, axis=0))

            trg_emb = self.table_trg.apply(trg)
            # shift_right assumes a 3D tensor, and time steps is dimension one
            trg_emb_shifted = K.permute_dimensions(
                K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                [1, 0, 2])

            hiddens, readout, alignment = self.decoder.run_pipeline(
                state_below=trg_emb_shifted,
                mask_below=trg_mask_3d,
                init_context=init_context,
                c=annotations,
                c_mask=src_mask_3d)

            if self.dropout > 0.:
                logger.info('Apply dropout with p = {}'.format(self.dropout))
                readout = Dropout(readout, self.dropout)

        logits = self.logistic_layer.get_logits_with_multiple_devices(
            readout, ps_device, devices)

        with tf.device(devices[0]):
            logits_flat = K.reshape(logits,
                                    shape=(-1, self.logistic_layer.n_out))
            cost = get_category_cross_entropy_from_flat_logits(
                logits_flat, trg, trg_mask_3d)

        if self.with_reconstruction:
            with tf.device(devices[0]):
                inverse_init_context = (K.sum(hiddens * trg_mask_3d, axis=0)
                                        / K.sum(trg_mask_3d, axis=0))
                src_emb = self.table_src.apply(src)
                src_emb_shifted = K.permute_dimensions(
                    K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])),
                    [1, 0, 2])
                inverse_hiddens, inverse_readout, inverse_alignment = \
                    self.inverse_decoder.run_pipeline(
                        state_below=src_emb_shifted,
                        mask_below=src_mask_3d,
                        init_context=inverse_init_context,
                        c=hiddens,
                        c_mask=trg_mask_3d)

            with tf.device(devices[0]):
                if self.dropout > 0.:
                    inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = \
                self.inverse_logistic_layer.get_logits_with_multiple_devices(
                    inverse_readout, ps_device, devices)

            with tf.device(devices[0]):
                inverse_logits_flat = K.reshape(
                    inverse_logits,
                    shape=(-1, self.inverse_logistic_layer.n_out))
                reconstruction_cost = \
                    get_category_cross_entropy_from_flat_logits(
                        inverse_logits_flat, src, src_mask_3d)

            with tf.device(devices[0]):
                cost += reconstruction_cost * self.reconstruction_weight

        L1 = sum([K.sum(K.abs(param)) for param in self.params])
        L2 = sum([K.sum(K.square(param)) for param in self.params])
        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight
        cost += params_regular

        return cost

    def build_trainer(self, src, src_mask, trg, trg_mask, ite,
                      l1_reg_weight=1e-6, l2_reg_weight=1e-6,
                      softmax_output_num_sampled=100000):
        """Build the single-device ``self.train_fn``.

        Stores the readout loss in ``self.cost`` and the parameter norms in
        ``self.L1`` / ``self.L2``. The compiled function takes
        ``[src, src_mask, trg, trg_mask]`` and returns ``[train_cost]``.
        ``ite`` is not used here. This trainer does not add the
        reconstruction cost, even when ``with_reconstruction`` is set.
        """
        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)

        annotations = self.encoder.apply(src, src_mask_3d)
        # mean pooling over the first axis, ignoring masked positions
        init_context = (K.sum(annotations * src_mask_3d, axis=0)
                        / K.sum(src_mask_3d, axis=0))

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(
            K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
            [1, 0, 2])

        hiddens, readout, _ = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask_3d,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask_3d)

        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        self.cost = calc_loss_from_readout(
            readout=readout,
            targets=trg,
            targets_mask=trg_mask_3d,
            logisticRegressionLayer=self.logistic_layer,
            softmax_output_num_sampled=softmax_output_num_sampled)

        self.L1 = sum([K.sum(K.abs(param)) for param in self.params])
        self.L2 = sum([K.sum(K.square(param)) for param in self.params])
        params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

        # train cost
        train_cost = self.cost + params_regular

        # gradients, with gradient clipping applied
        grads = K.gradients(train_cost, self.params)
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        inps = [src, src_mask, trg, trg_mask]
        self.train_fn = K.function(inps, [train_cost], updates=updates,
                                   name='train_func')

    def build_sampler(self):
        """Compile the functions used for step-by-step decoding.

        Always sets ``compile_init_and_context`` and
        ``compile_next_state_and_probs``. Sets ``compile_fertility`` when
        attention and coverage are on and the coverage type is
        ``'linguistic'``. With reconstruction it also sets
        ``compile_inverse_init_and_context`` and
        ``compile_inverse_next_state_and_probs``.
        """
        # time steps, nb_samples
        x = K.placeholder((None, None), dtype='int32')

        c = self.encoder.apply(x, None)
        # mean pooling over the first axis
        init_context = K.mean(c, axis=0)
        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        logger.info('Building compile_init_state_and_context function ...')
        self.compile_init_and_context = K.function([x], outs)
        logger.info('Done')

        # From here on the context is a fresh input of the one-step function.
        if self.with_attention:
            c = K.placeholder((None, None, None))
            init_context = K.mean(c, axis=0)
        else:
            init_context = K.placeholder((None, None))

        # nb_samples
        y = K.placeholder((None,), dtype='int32')
        # nb_samples, state_dim
        cur_state = K.placeholder((None, None))
        # if it is the first word, emb should be all zero, and it is
        # indicated by -1
        trg_emb = lookup_table(self.table_trg.W, y, name='trg_emb')

        if self.with_attention and self.with_coverage:
            cov_before = K.placeholder(shape=(None, None, None))
            # value comparison, not identity (see __init__)
            if self.coverage_type == 'linguistic':
                logger.info('Building compile_fertility ...')
                fertility = self.decoder._get_fertility(c)
                self.compile_fertility = K.function([c], [fertility])
                logger.info('Done')
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            one_step=True,
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the
            # source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # compute the softmax probability
        next_probs = get_probs_from_logits(
            self.logistic_layer.get_logits(readout))

        # sample from softmax distribution to get the sample
        # TODO: batch_size * nb_classes
        next_sample = K.argmax(K.random_multinomial(pvals=next_probs))

        # compile function
        logger.info('Building compile_next_state_and_probs function ...')
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)
        outs = [next_probs, next_state, next_sample]
        if self.with_attention:
            outs.append(alignment)
            # cov / cov_before only exist when attention is enabled
            if self.with_coverage:
                inps.append(cov_before)
                outs.append(cov)
        self.compile_next_state_and_probs = K.function(inps, outs)
        logger.info('Done')

        # for reconstruction
        if self.with_reconstruction:
            if self.with_attention:
                # time steps, nb_samples, context_dim
                inverse_c = K.placeholder((None, None, None))
                # mean pooling
                inverse_init_context = K.mean(inverse_c, axis=0)
                init_inps = [inverse_c]
            else:
                inverse_init_context = K.placeholder((None, None))
                # `inverse_c` does not exist in this branch; the context
                # placeholder itself is the function input.
                init_inps = [inverse_init_context]

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            logger.info(
                'Building compile_inverse_init_state_and_context function ...')
            self.compile_inverse_init_and_context = K.function(init_inps,
                                                               outs)
            logger.info('Done')

            # nb_samples
            src = K.placeholder(shape=(None,), dtype='int32')
            # nb_samples, state_dim
            inverse_cur_state = K.placeholder(shape=(None, None))
            # time_steps, nb_samples
            trg_mask = K.placeholder(shape=(None, None))
            # to 3D mask
            trg_mask_3d = K.expand_dims(trg_mask)
            # if it is the first word, emb should be all zero, and it is
            # indicated by -1
            src_emb = lookup_table(self.table_src.W, src, name='src_emb')

            # apply one step
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                init_context=(None if self.with_attention
                              else inverse_init_context),
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask_3d,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs = inverse_results[1]
                inverse_alignment = inverse_results[2]
            else:
                # Without attention the inverse decoder reads its own pooled
                # context. The forward `init_context` is not an input of this
                # function and belongs to the forward decoder.
                inverse_ctxs = inverse_init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

            # compute the softmax probability
            inverse_next_probs = get_probs_from_logits(
                self.inverse_logistic_layer.get_logits(inverse_readout))

            # sample from softmax distribution to get the sample
            inverse_next_sample = K.argmax(
                K.random_multinomial(pvals=inverse_next_probs))

            # compile function
            logger.info(
                'Building compile_inverse_next_state_and_probs function ...')
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [inverse_next_probs, inverse_next_state,
                    inverse_next_sample]
            if self.with_attention:
                outs.append(inverse_alignment)
            self.compile_inverse_next_state_and_probs = K.function(inps, outs)
            logger.info('Done')

    def save(self, path=None):
        """Write all parameters to ``path`` (default ``self.path``) as npz.

        Arrays are keyed by their normalised parameter name. The archive is
        written through a binary file handle, so it lands at exactly
        ``path``: ``numpy.savez`` appends ``.npz`` only when it is given a
        file name, and ``load`` reads from the unmodified path.
        """
        if path is None:
            path = self.path

        # parameter will have different name under tensorflow and theano
        val = {self.norm_para_name(value.name): K.get_value(value)
               for value in self.params}
        logger.info("save the model {}".format(path))
        with open(path, "wb") as filenpz:
            numpy.savez(filenpz, **val)

    def norm_para_name(self, name):
        """Return ``name`` up to, but excluding, its first ``':'``.

        Example: ``'LR_W:0'`` becomes ``'LR_W'``; a name without a colon is
        returned unchanged.
        """
        return name.partition(':')[0]

    def hot_fix_parameter_names(self, params):
        """Return a new dict with every key of ``params`` normalised.

        ``params`` only needs ``keys()`` and item access; the values are
        passed through untouched.
        """
        return {self.norm_para_name(k): params[k] for k in params.keys()}

    def load(self, path=None):
        """Load parameters from the npz file at ``path`` (default ``self.path``).

        Parameters absent from the file keep their current value. A missing
        file is logged and ignored.

        Raises:
            Exception: if a stored array's shape differs from the model's.
        """
        if path is None:
            path = self.path

        if not os.path.isfile(path):
            logger.warning(
                "file {} does not exist, ignoring load".format(path))
            return

        logger.info("load params {}".format(path))
        val = numpy.load(path)
        val = self.hot_fix_parameter_names(val)

        for param in self.params:
            param_name = self.norm_para_name(param.name)
            shape = K.get_value(param).shape
            logger.info('Loading {} with shape {}'.format(param_name, shape))
            if param_name not in val:
                logger.info('Adding new param {} with shape {}'.format(
                    param_name, shape))
                continue
            if shape != val[param_name].shape:
                logger.info(
                    "Error: model param != load param shape {} != {}".format(
                        shape, val[param_name].shape))
                raise Exception("loading params shape mismatch")
            K.set_value(param, val[param_name])
class EncoderDecoder(object):
    """Theano encoder-decoder with a two-level history encoder.

    On top of the bidirectional sentence encoder and the decoder, two extra
    encoders (``encoder_hist_1`` and ``encoder_hist_2``) are applied to the
    history input; the last output of the second one is passed to the
    sentence encoder and to the decoder. When ``with_reconstruction`` is
    set, an inverse decoder and an inverse logistic layer are added.

    Every layer is appended to ``self.layers`` and its parameters are
    collected, in the same order, into ``self.params``.
    """

    def __init__(self, rng, **kwargs):
        """Pop the hyper-parameters from ``kwargs`` and build the layers.

        ``rng`` is handed to every layer, and its ``randint`` seeds the
        random streams. A missing key raises ``KeyError``;
        ``reconstruction_weight`` is only read when ``with_reconstruction``
        is true.
        """
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.rng = rng
        # randint gets an integer bound; a float bound is not guaranteed to
        # be accepted by every numpy release.
        self.trng = RandomStreams(rng.randint(int(1e5)))

        self.with_attention = kwargs.pop('with_attention')

        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        # Compare by value: identity (`is`) only matches interned strings and
        # silently fails for values read from a file or the command line.
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng, self.src_vocab_size,
                                     self.n_in_src, name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng, self.n_in_src,
                                            self.n_hids_src, self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        # history encoders: the first reads n_in_src-sized inputs, the
        # second reads the n_hids_src-sized outputs of the first
        self.encoder_hist_1 = Encoder(self.rng, self.n_in_src,
                                      self.n_hids_src, self.table_src,
                                      name='rnn_encoder_hist_1')
        self.layers.append(self.encoder_hist_1)

        self.encoder_hist_2 = Encoder(self.rng, self.n_hids_src,
                                      self.n_hids_src, self.table_src,
                                      name='rnn_encoder_hist_2')
        self.layers.append(self.encoder_hist_2)

        self.table_trg = LookupTable(self.rng, self.trg_vocab_size,
                                     self.n_in_trg, name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.rng,
                               self.n_in_trg,
                               self.n_hids_trg,
                               2 * self.n_hids_src,
                               self.n_hids_src,
                               with_attention=self.with_attention,
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               with_context_gate=self.with_context_gate,
                               maxout_part=self.maxout_part,
                               name='rnn_decoder')
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')
        if self.with_reconstruction:
            self.reconstruction_weight = kwargs.pop('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(
                self.rng,
                self.n_in_src,
                2 * self.n_hids_src,
                self.n_hids_trg,
                with_attention=self.with_attention,
                maxout_part=self.maxout_part,
                name='rnn_inverse_decoder')
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(int(1e5)))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng, self.n_in_src, self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg,
                      trg_mask, ite):
        """Build ``self.train_fn`` from the symbolic batch inputs.

        Stores the cost in ``self.cost`` (plus ``self.reconstruction_cost``
        when reconstruction is enabled) and the parameter norms in
        ``self.L1`` / ``self.L2``. The compiled function takes
        ``[src, src_mask, src_hist, src_hist_mask, trg, trg_mask]`` and
        returns ``[train_cost]``. ``ite`` is not used here.
        """
        # Shape flow of the history input, as described by the original
        # authors:
        # hist = (batch_size, sent_num, sent_len) --.T-->
        # hist = (sent_len, sent_num, batch_size) --lookup table-->
        # (sent_len, sent_num, batch_size, word_emb) --reshape-->
        # (sent_len, sent_num*batch_size, word_emb) --word-level rnn-->
        # (sent_len, sent_num*batch_size, hidden_size) --reshape-->
        # (sent_len, sent_num, batch_size, hidden_size) --[-1]-->
        # (sent_num, batch_size, hidden_size) --sent-level rnn-->
        # (sent_num, batch_size, hidden_size) --[-1]-->
        # (batch_size, hidden_size) = cross-sent context vector
        annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
        annotations_1 = annotations_1[-1]  # get last hidden states
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]  # get last hidden states

        annotations = self.encoder.apply(src, src_mask, annotations_3)
        # mean pooling over the first axis, ignoring masked positions
        init_context = ((annotations * src_mask[:, :, None]).sum(0)
                        / src_mask.sum(0)[:, None])
        # append the history vector along the last axis
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        trg_emb = self.table_trg.apply(trg)
        # shift the embeddings one step along the first axis; step 0 is zero
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])

        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask,
            hist=annotations_3)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = (self.logistic_layer.cost(p_y_given_x, trg, trg_mask)
                     / trg.shape[1])

        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = ((hiddens * trg_mask[:, :, None]).sum(0)
                                    / trg_mask.sum(0)[:, None])

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_hiddens, inverse_readout, inverse_alignment = \
                self.inverse_decoder.run_pipeline(
                    state_below=src_emb_shifted,
                    mask_below=src_mask,
                    init_context=inverse_init_context,
                    c=hiddens,
                    c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = (
                self.inverse_logistic_layer.cost(p_x_given_y, src, src_mask)
                / src.shape[1])
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param ** 2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6

        # train cost
        train_cost = self.cost + params_regular

        # gradients, with gradient clipping applied
        grads = T.grad(train_cost, self.params)
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]
        self.train_fn = theano.function(inps, [train_cost], updates=updates,
                                        name='train_function')

    def build_sampler(self):
        """Compile the Theano functions used for step-by-step decoding.

        Always sets ``compile_init_and_context`` and
        ``compile_next_state_and_probs``. Sets ``compile_fertility`` when
        attention and coverage are on and the coverage type is
        ``'linguistic'``. With reconstruction it also sets
        ``compile_inverse_init_and_context`` and
        ``compile_inverse_next_state_and_probs``.
        """
        # history input and its mask
        x_hist = T.ltensor3()
        x_hist_mask = T.tensor3()
        annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask)
        annotations_1 = annotations_1[-1]
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None, annotations_3)
        # mean pooling
        init_context = c.mean(0)
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c, annotations_3]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        print('Building compile_init_state_and_context function ...')
        self.compile_init_and_context = theano.function(
            [x, x_hist, x_hist_mask], outs, name='compile_init_and_context')
        print('Done')

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is
        # indicated by -1
        trg_emb = T.switch(y[:, None] < 0,
                           T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        if self.with_attention and self.with_coverage:
            cov_before = T.tensor3()
            # value comparison, not identity (see __init__)
            if self.coverage_type == 'linguistic':
                print('Building compile_fertility ...')
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print('Done')
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            hist=annotations_3,
            one_step=True,
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the
            # source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print('Building compile_next_state_and_probs function ...')
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)
        inps.append(annotations_3)

        outs = [next_probs, next_state, next_sample]
        if self.with_attention:
            outs.append(alignment)
            # cov / cov_before only exist when attention is enabled
            if self.with_coverage:
                inps.append(cov_before)
                if self.coverage_type == 'linguistic':
                    inps.append(fertility)
                outs.append(cov)

        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs')
        print('Done')

        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            print('Building compile_inverse_init_state_and_context function ...')
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print('Done')

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is
            # indicated by -1
            src_emb = T.switch(src[:, None] < 0,
                               T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                init_context=(None if self.with_attention
                              else inverse_init_context),
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs = inverse_results[1]
                inverse_alignment = inverse_results[2]
            else:
                # Without attention the inverse decoder reads its own pooled
                # context. The forward `init_context` depends on inputs that
                # are not part of this function and belongs to the forward
                # decoder.
                inverse_ctxs = inverse_init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print('Building compile_inverse_next_state_and_probs function ...')
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [inverse_next_probs, inverse_next_state,
                    inverse_next_sample]
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print('Done')

    def save(self, path=None):
        """Write all parameters to ``path`` (default ``self.path``) as npz.

        Arrays are keyed by parameter name. The archive is written through
        a binary file handle, so it lands at exactly ``path``:
        ``numpy.savez`` appends ``.npz`` only when it is given a file name,
        and ``load`` reads from the unmodified path.
        """
        if path is None:
            path = self.path

        val = {value.name: value.get_value() for value in self.params}
        logger.info("save the model {}".format(path))
        with open(path, "wb") as filenpz:
            numpy.savez(filenpz, **val)

    def load(self, path=None):
        """Load parameters from the npz file at ``path`` (default ``self.path``).

        Parameters absent from the file keep their current value. If the
        file does not exist, an error is logged and the current parameters
        are written out with ``self.save()``.

        Raises:
            Exception: if a stored array's shape differs from the model's.
        """
        if path is None:
            path = self.path

        if not os.path.isfile(path):
            logger.error("file {} does not exist".format(path))
            self.save()
            return

        logger.info("load params {}".format(path))
        val = numpy.load(path)
        for param in self.params:
            shape = param.get_value(borrow=True).shape
            logger.info('Loading {} with shape {}'.format(param.name, shape))
            if param.name not in val.keys():
                logger.info('Adding new param {} with shape {}'.format(
                    param.name, shape))
                continue
            stored = val[param.name]
            if shape != stored.shape:
                logger.info(
                    "Error: model param != load param shape {} != {}".format(
                        shape, stored.shape))
                raise Exception("loading params shape mismatch")
            param.set_value(stored, borrow=True)
class EncoderDecoder(object):
    """Encoder-decoder translation model with optional reconstruction.

    The model is assembled from a source lookup table, a bidirectional
    encoder, a target lookup table, a decoder and a softmax output layer.
    Coverage, context gate and layer normalisation are switched on through
    keyword options that are forwarded to the decoder.  When
    ``with_reconstruction`` is set, a second ("inverse") decoder and output
    layer are added that predict the source words from the decoder states.
    """

    def __init__(self, rng, **kwargs):
        """Create all layers and collect their parameters in ``self.params``.

        Args:
            rng: random number generator; it is handed to every layer and
                its ``randint`` seeds the ``RandomStreams`` objects
                (presumably a ``numpy.random.RandomState`` - confirm with
                the caller).
            **kwargs: model options, read with ``kwargs.get`` so a missing
                key silently becomes ``None`` (``with_layernorm`` defaults
                to ``False``).  Keys read here: ``nembed_src``,
                ``nembed_trg``, ``nhids_src``, ``nhids_trg``,
                ``src_vocab_size``, ``trg_vocab_size``, ``method``,
                ``dropout``, ``maxout_part``, ``saveto``, ``clip_c``,
                ``with_coverage``, ``coverage_dim``, ``coverage_type``,
                ``max_fertility``, ``with_context_gate``,
                ``with_layernorm``, ``with_reconstruction`` and
                ``reconstruction_weight``.
        """
        self.n_in_src = kwargs.get('nembed_src')
        self.n_in_trg = kwargs.get('nembed_trg')
        self.n_hids_src = kwargs.get('nhids_src')
        self.n_hids_trg = kwargs.get('nhids_trg')
        self.src_vocab_size = kwargs.get('src_vocab_size')
        self.trg_vocab_size = kwargs.get('trg_vocab_size')
        self.method = kwargs.get('method')
        self.dropout = kwargs.get('dropout')
        self.maxout_part = kwargs.get('maxout_part')
        self.path = kwargs.get('saveto')
        self.clip_c = kwargs.get('clip_c')
        self.rng = rng
        # NOTE: the order of the rng.randint calls and layer constructions
        # below determines the random initialisation; do not reorder.
        self.trng = RandomStreams(rng.randint(1e5))

        # added by Zhaopeng Tu, 2016-04-29
        self.with_coverage = kwargs.get('with_coverage')
        self.coverage_dim = kwargs.get('coverage_dim')
        self.coverage_type = kwargs.get('coverage_type')
        self.max_fertility = kwargs.get('max_fertility')
        # Compare by value: `is` on a string literal only works by accident
        # of interning and fails for strings built at runtime.
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.get('with_context_gate')

        # added by Zhaopeng Tu, 2017-11-29
        self.with_layernorm = kwargs.get('with_layernorm', False)

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng, self.src_vocab_size,
                                     self.n_in_src, name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng, self.n_in_src,
                                            self.n_hids_src, self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        self.table_trg = LookupTable(self.rng, self.trg_vocab_size,
                                     self.n_in_trg, name='table_trg')
        self.layers.append(self.table_trg)

        # The decoder context size is 2 * n_hids_src because the encoder is
        # bidirectional.
        self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg,
                               2 * self.n_hids_src,
                               maxout_part=self.maxout_part,
                               name='rnn_decoder',
                               # added by Zhaopeng Tu, 2016-04-29
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               # added by Zhaopeng Tu, 2016-05-30
                               with_context_gate=self.with_context_gate,
                               with_layernorm=self.with_layernorm)
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.get('with_reconstruction')
        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.get('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = Decoder(self.rng, self.n_in_src,
                                           2 * self.n_hids_src,
                                           self.n_hids_trg,
                                           maxout_part=self.maxout_part,
                                           name='rnn_inverse_decoder',
                                           with_layernorm=self.with_layernorm)
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng, self.n_in_src, self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, trg, trg_mask,
                      l1_reg_weight=1e-6, l2_reg_weight=1e-6):
        """Build the training graph and compile ``self.train_fn``.

        The cost is the output layer's cost divided by ``trg.shape[1]``,
        plus (with reconstruction) the weighted reconstruction cost, plus
        L1/L2 penalties over all parameters.  Gradients are clipped with
        ``grad_clip``.  With layer normalisation the optimiser is ``adam``
        (which also sets ``self.update_fn``); otherwise ``adadelta``.

        Args:
            src: symbolic source word indices.
            src_mask: symbolic source mask; indexed as a 2-D tensor
                (presumably time x batch - confirm with the caller).
            trg: symbolic target word indices.
            trg_mask: symbolic target mask, same layout as ``src_mask``.
            l1_reg_weight: weight of the L1 penalty (default ``1e-6``, the
                value that used to be hard-coded).
            l2_reg_weight: weight of the L2 penalty (default ``1e-6``).
        """
        annotations = self.encoder.apply(src, src_mask)
        # mean pooling of the annotations over axis 0, ignoring masked steps
        init_context = ((annotations * src_mask[:, :, None]).sum(0)
                        / src_mask.sum(0)[:, None])

        # Shift the target embeddings by one step so position t is fed the
        # embedding of position t-1 (zeros at the first position).
        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])

        results = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                            mask_below=trg_mask,
                                            init_context=init_context,
                                            c=annotations,
                                            c_mask=src_mask)
        hiddens, ctxs, readout, alignment = results[:4]

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)
        self.cost = (self.logistic_layer.cost(p_y_given_x, trg, trg_mask)
                     / trg.shape[1])

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = ((hiddens * trg_mask[:, :, None]).sum(0)
                                    / trg_mask.sum(0)[:, None])

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_results = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)
            (inverse_hiddens, inverse_ctxs, inverse_readout,
             inverse_alignment) = inverse_results[:4]

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)
            self.reconstruction_cost = (
                self.inverse_logistic_layer.cost(p_x_given_y, src, src_mask)
                / src.shape[1])
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param ** 2) for param in self.params)
        params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

        # train cost
        train_cost = self.cost + params_regular

        # gradients, with gradient clipping applied
        grads = T.grad(train_cost, self.params)
        grads = grad_clip(grads, self.clip_c)

        # train function
        inps = [src, src_mask, trg, trg_mask]
        outs = [train_cost]

        if self.with_layernorm:
            lr = T.scalar(name='lr')
            print('Building optimizers...')
            self.train_fn, self.update_fn = adam(lr, self.params, grads,
                                                 inps, outs)
        else:
            updates = adadelta(self.params, grads)
            # mode=theano.Mode(linker='vm') for ifelse: unless linker='vm'
            # or linker='cvm' is used, ifelse computes both branches and
            # takes the same computation time as switch.
            self.train_fn = theano.function(inps, outs, updates=updates,
                                            name='train_function',
                                            mode=theano.Mode(linker='vm'))

    def build_sampler(self):
        """Compile the one-step functions used for sampling / decoding.

        Sets the following attributes:
            compile_init_and_context: ``[x] -> [init_state, c]``.
            compile_fertility: ``[c] -> [fertility]``; only with linguistic
                coverage.
            compile_next_state_and_probs: one decoder step; inputs are
                ``[y, cur_state, c]`` (plus ``cov_before`` and, for
                linguistic coverage, ``fertility``), outputs are
                ``[next_probs, next_state, next_sample, alignment]`` (plus
                ``cov`` with coverage).
            compile_inverse_init_and_context and
            compile_inverse_next_state_and_probs: the equivalents for the
                inverse decoder; only with reconstruction.
        """
        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None)
        # mean pooling
        init_context = c.mean(0)
        init_state = self.decoder.create_init_state(init_context)

        # compile function
        print('Building compile_init_state_and_context function ...')
        self.compile_init_and_context = theano.function(
            [x], [init_state, c], name='compile_init_and_context')
        print('Done')

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is
        # indicated by -1
        trg_emb = T.switch(y[:, None] < 0,
                           T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        cov_before = None
        fertility = None
        if self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print('Building compile_fertility ...')
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print('Done')

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        results = self.decoder.apply(state_below=trg_emb,
                                     init_state=cur_state,
                                     c=c,
                                     one_step=True,
                                     # added by Zhaopeng Tu, 2016-04-27
                                     cov_before=cov_before,
                                     fertility=fertility)
        next_state, ctxs, alignment = results[:3]
        if self.with_coverage:
            # the updated coverage follows the first three results
            cov = results[3]

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print('Building compile_next_state_and_probs function ...')
        inps = [y, cur_state, c]
        outs = [next_probs, next_state, next_sample, alignment]

        # added by Zhaopeng Tu, 2016-04-29
        if self.with_coverage:
            inps.append(cov_before)
            if self.coverage_type == 'linguistic':
                inps.append(fertility)
            outs.append(cov)

        # mode=theano.Mode(linker='vm') for ifelse: unless linker='vm' or
        # linker='cvm' is used, ifelse computes both branches and takes the
        # same computation time as switch.
        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs',
            mode=theano.Mode(linker='vm'))
        print('Done')

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)
            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]

            # compile function
            print('Building compile_inverse_init_state_and_context '
                  'function ...')
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print('Done')

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is
            # indicated by -1
            src_emb = T.switch(src[:, None] < 0,
                               T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                c=inverse_c,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state, inverse_ctxs, inverse_alignment = \
                inverse_results[:3]

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            # get_probs is used as a single-tensor call everywhere else in
            # this class (including on this same layer in build_trainer),
            # so its result is not unpacked here.
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print('Building compile_inverse_next_state_and_probs '
                  'function ...')
            inps = [src, trg_mask, inverse_cur_state, inverse_c]
            outs = [inverse_next_probs, inverse_next_state,
                    inverse_next_sample, inverse_alignment]

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print('Done')

    def save(self, path=None):
        """Write every parameter to an ``.npz`` archive, keyed by name.

        Args:
            path: destination file; defaults to ``self.path``.  The archive
                is written to exactly this path (no ``.npz`` suffix is
                appended), so ``load`` can read it back with the same path.
        """
        if path is None:
            path = self.path

        val = {param.name: param.get_value() for param in self.params}
        logger.info("save the model {}".format(path))
        # Hand numpy the open binary handle: given a file name instead,
        # savez appends '.npz' when it is missing and the data would land in
        # a different file than the one `load` checks for.
        with open(path, "wb") as filenpz:
            numpy.savez(filenpz, **val)

    def load(self, path=None):
        """Load parameter values by name from an archive written by ``save``.

        Parameters missing from the archive keep their current value.  If
        the file does not exist, an error is logged and the current
        parameters are saved instead.

        Args:
            path: archive to read; defaults to ``self.path``.

        Raises:
            Exception: a stored array's shape differs from the shape of the
                parameter with the same name.
        """
        if path is None:
            path = self.path

        if not os.path.isfile(path):
            logger.error("file {} does not exist".format(path))
            # NOTE(review): this saves to self.path even when an explicit
            # `path` was given - confirm that this is intended.
            self.save()
            return

        logger.info("load params {}".format(path))
        val = numpy.load(path)
        try:
            stored_names = set(val.files)
            for param in self.params:
                current_shape = param.get_value(borrow=True).shape
                logger.info('Loading {} with shape {}'.format(
                    param.name, current_shape))
                if param.name not in stored_names:
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name, current_shape))
                    continue
                # Each lookup re-reads the array from the archive, so fetch
                # it once and reuse it.
                stored_value = val[param.name]
                if current_shape != stored_value.shape:
                    logger.info(
                        "Error: model param != load param shape {} != {}"
                        .format(current_shape, stored_value.shape))
                    raise Exception("loading params shape mismatch")
                param.set_value(stored_value, borrow=True)
        finally:
            val.close()