def apply_func_on_depth(obj, func, depth, permeable_types=(list, tuple, dict)):
    """Apply ``func`` to whatever sits ``depth`` container levels inside ``obj``.

    The structure is descended one level per unit of ``depth`` for as long as
    the current object is an instance of ``permeable_types``. ``func`` is
    applied as soon as ``depth`` hits 0 or a non-permeable object is reached.
    A negative ``depth`` never hits 0, so descent only stops at non-permeable
    objects.

    Args:
        obj: arbitrarily nested structure of lists, tuples and dicts.
        func: callable applied to the objects found at the target depth.
        depth: number of container levels to descend before applying ``func``.
        permeable_types: types that may be descended into.

    Returns:
        A structure mirroring ``obj``: lists stay lists, ``LSTMStateTuple``
        instances are rebuilt from their first two processed elements, any
        other tuple becomes a plain tuple, and dicts become plain dicts with
        the same keys. A permeable object that is none of these is passed to
        ``func`` directly.
    """
    if depth == 0 or not isinstance(obj, permeable_types):
        return func(obj)

    def descend(child):
        return apply_func_on_depth(
            child, func, depth - 1, permeable_types=permeable_types)

    if isinstance(obj, (list, tuple)):
        mapped = [descend(item) for item in obj]
        # Checked before the generic tuple case so the state tuple keeps its
        # c/h fields instead of being flattened into a plain tuple.
        if isinstance(obj, LSTMStateTuple):
            return LSTMStateTuple(c=mapped[0], h=mapped[1])
        if isinstance(obj, tuple):
            return tuple(mapped)
        return mapped

    if isinstance(obj, dict):
        return {name: descend(item) for name, item in obj.items()}

    return func(obj)
def deep_zip(objects, depth, permeable_types=(list, tuple, dict)):
    """Zip several identically structured objects ``depth`` levels deep.

    The structure of ``objects[0]`` drives the descent; the other objects are
    indexed with the same positions / keys. Where descent stops (``depth`` is
    0 or the first object is not permeable) the objects are returned together
    as one tuple.

    Args:
        objects: indexable sequence of structures to combine.
        depth: number of container levels to descend before zipping.
        permeable_types: types that may be descended into.

    Returns:
        A structure shaped like ``objects[0]`` whose entries at the target
        depth are tuples holding the corresponding entry of every object.
        Lists stay lists, ``LSTMStateTuple`` is rebuilt from the first two
        zipped elements, other tuples become plain tuples, dicts become plain
        dicts keyed like ``objects[0]``.
    """
    # depth is tested first so that objects[0] is not touched when depth == 0.
    if depth == 0:
        return tuple(objects)
    head = objects[0]
    if not isinstance(head, permeable_types):
        return tuple(objects)

    if isinstance(head, (list, tuple)):
        merged = [
            deep_zip(group, depth - 1, permeable_types=permeable_types)
            for group in zip(*objects)
        ]
        if isinstance(head, LSTMStateTuple):
            return LSTMStateTuple(c=merged[0], h=merged[1])
        if isinstance(head, tuple):
            return tuple(merged)
        return merged

    if isinstance(head, dict):
        return {
            key: deep_zip([member[key] for member in objects],
                          depth - 1,
                          permeable_types=permeable_types)
            for key in head.keys()
        }

    return tuple(objects)
def _add_encoder(self, encoder_inputs, seq_len):
    """Run a bidirectional LSTM encoder and merge both directions by addition.

    Two ``LSTMCell`` instances of ``self.hidden_size`` units (initialised with
    ``self.rand_uni_init``) are created under the ``'encoder'`` variable scope
    and run through ``bidirectional_dynamic_rnn``.

    Args:
        encoder_inputs: input tensor handed to the RNN.
        seq_len: per-example sequence lengths handed to the RNN.

    Returns:
        ``(encoder_outputs, encoder_states)`` where the outputs are the
        element-wise sum of the forward and backward outputs and the state is
        an ``LSTMStateTuple`` whose ``c`` and ``h`` are the sums of the
        forward and backward final ``c`` and ``h``.
    """
    with tf.variable_scope('encoder'):
        forward_cell = tf.contrib.rnn.LSTMCell(
            self.hidden_size,
            initializer=self.rand_uni_init,
            state_is_tuple=True)
        backward_cell = tf.contrib.rnn.LSTMCell(
            self.hidden_size,
            initializer=self.rand_uni_init,
            state_is_tuple=True)

        rnn_outputs, final_states = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            encoder_inputs,
            sequence_length=seq_len,
            swap_memory=True,
            dtype=tf.float32)

        fw_outputs, bw_outputs = rnn_outputs
        fw_cell_state, fw_hidden_state = final_states[0]
        bw_cell_state, bw_hidden_state = final_states[1]

        # Directions are summed (not concatenated), so the merged state keeps
        # the size of a single direction.
        summed_c = fw_cell_state + bw_cell_state
        summed_h = fw_hidden_state + bw_hidden_state
        encoder_states = LSTMStateTuple(c=summed_c, h=summed_h)
        encoder_outputs = fw_outputs + bw_outputs
    return encoder_outputs, encoder_states
def call(self, inputs):
    """Encode the static features into an initial state and run the RNN.

    Args:
        inputs: 6-tuple ``(demo, times, values, measurements, dt, lengths)``.

    Returns:
        The result of ``self.output_layer`` applied to the RNN output.
    """
    demo, times, values, measurements, dt, lengths = inputs

    # The encoded static features are cut in two equal halves along the last
    # axis: the first becomes the initial c, the second the initial h.
    demo_encoded = self.demo_encoder(demo)
    initial_c, initial_h = tf.split(demo_encoded, 2, axis=-1)
    initial_state = LSTMStateTuple(initial_c, initial_h)

    # Values, the (float-cast) measurement indicators and dt are fed to the
    # RNN as one feature vector.
    measurements_as_float = tf.cast(measurements, tf.float32)
    values = tf.concat((values, measurements_as_float, dt), axis=-1)

    squeezed_lengths = tf.squeeze(lengths, axis=-1)
    mask = tf.sequence_mask(squeezed_lengths, name='mask')

    rnn_input = PhasedLSTMInput(times=times, x=values)
    out = self.rnn(rnn_input, mask=mask, initial_state=initial_state)
    return self.output_layer(out)
def call(self, inputs, state):
    """Run one step of the chunked LSTM with cumulative master gates.

    A single linear map of ``[inputs, h]`` produces ``2 * n_chunk`` master
    gate logits followed by ``4 * num_units`` regular gate pre-activations.
    The master gates are ``self.cumsum`` of a softmax over the chunk axis and
    are broadcast over the units of each chunk.

    Args:
        inputs: input tensor for this step.
        state: an ``LSTMStateTuple`` ``(c, h)`` when ``state_is_tuple`` is
            set, otherwise ``c`` and ``h`` concatenated along axis 1.

    Returns:
        ``(new_h, new_state)`` with ``new_state`` in the same format as
        ``state``.
    """
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)

    n_chunk = self.n_chunk
    chunked_shape = [-1, n_chunk, self.chunk_size]

    # Parameters of all gates are produced by one multiply for efficiency.
    projected = _linear([inputs, h],
                        4 * self._num_units + 2 * n_chunk, True)

    # Master forget / input gates: one value per chunk, expanded so they
    # broadcast against the (batch, chunk, chunk_size) layout used below.
    master_forget = projected[:, :n_chunk]
    master_forget = self.cumsum(tf.nn.softmax(master_forget, axis=-1))
    master_forget = tf.expand_dims(master_forget, 2)

    master_input = projected[:, n_chunk:2 * n_chunk]
    master_input = self.cumsum(tf.nn.softmax(master_input, axis=-1), 'left')
    master_input = tf.expand_dims(master_input, 2)

    # The remaining columns hold the four regular gates, chunk by chunk.
    gates = tf.reshape(projected[:, 2 * n_chunk:],
                       [-1, n_chunk * 4, self.chunk_size])
    forget_gate = tf.nn.sigmoid(gates[:, :n_chunk])
    input_gate = tf.nn.sigmoid(gates[:, n_chunk:2 * n_chunk])
    output_gate = tf.nn.sigmoid(gates[:, 2 * n_chunk:3 * n_chunk])
    candidate = tf.tanh(gates[:, 3 * n_chunk:])

    # Where both master gates are open the regular LSTM update applies; the
    # remainder of each master gate writes the candidate / keeps the old cell.
    overlap = master_forget * master_input
    previous_c = tf.reshape(c, chunked_shape)
    new_c = (overlap * (forget_gate * previous_c + input_gate * candidate)
             + (master_input - overlap) * candidate
             + (master_forget - overlap) * previous_c)
    new_h = tf.tanh(new_c) * output_gate

    flat_shape = [-1, self._num_units]
    new_c = tf.reshape(new_c, flat_shape)
    new_h = tf.reshape(new_h, flat_shape)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def tuplify_pass(obj, outer_struct=None, outer_key=None):
    """Replace tagged dicts inside a nested structure by named tuples, in place.

    A dict containing the key ``'name__'`` is converted into a named tuple
    whose fields are the remaining keys: an ``LSTMStateTuple`` when the tag is
    ``"LSTMStateTuple"``, otherwise a freshly created ``namedtuple`` type
    named after the tag with its fields in sorted order. The converted value
    is written back into ``outer_struct[outer_key]``. Untagged dicts and lists
    are walked recursively. The values of a tagged dict are used as they are
    and are not walked.

    Args:
        obj: the object to process.
        outer_struct: the dict or list that holds ``obj`` (``None`` for the
            top-level call).
        outer_key: key or index of ``obj`` inside ``outer_struct``.

    Raises:
        TypeError: if a tagged dict is passed without a container, because
            there is nothing it could be replaced in.
    """
    if isinstance(obj, dict):
        if 'name__' in obj:
            if outer_struct is None:
                raise TypeError(
                    "a tagged dict can only be tuplified inside a container")
            type_name = obj['name__']
            fields = {key: value for key, value in obj.items()
                      if key != 'name__'}
            if type_name == "LSTMStateTuple":
                converted = LSTMStateTuple(**fields)
            else:
                converted = namedtuple(type_name, sorted(fields))(**fields)
            outer_struct[outer_key] = converted
        else:
            # Only values of existing keys are replaced, so the dict size is
            # stable while iterating.
            for key, value in obj.items():
                tuplify_pass(value, obj, key)
    elif isinstance(obj, list):
        for index, item in enumerate(obj):
            tuplify_pass(item, obj, index)
    # Any other object is a leaf that already sits in its container; there is
    # nothing to write back.
def call(self, inputs, state):
    """Run one LSTM step, optionally rescaling the output by a learned factor.

    The step follows the usual fused-kernel LSTM equations. When
    ``self.pruning`` is set, the new hidden output is multiplied element-wise
    by ``z_scale``: during training this is ``mu + eps * exp(logD / 2)`` with
    ``eps`` drawn from a standard normal per batch row, otherwise it is ``mu``
    (tiled over the batch) multiplied by ``self.get_mask()``.

    Args:
        inputs: input tensor for this step.
        state: an ``LSTMStateTuple`` ``(c, h)`` when ``state_is_tuple`` is
            set, otherwise ``c`` and ``h`` concatenated along axis 1.

    Returns:
        ``(new_h, new_state)`` with ``new_state`` in the same format as
        ``state``.
    """
    sigmoid = math_ops.sigmoid
    add = math_ops.add
    multiply = math_ops.multiply
    split_axis = constant_op.constant(1, dtype=dtypes.int32)

    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(
            value=state, num_or_size_splits=2, axis=split_axis)

    joined = array_ops.concat([inputs, h], 1)
    gate_inputs = math_ops.matmul(joined, self._kernel)
    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=gate_inputs, num_or_size_splits=4, axis=split_axis)

    forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype)
    kept = multiply(c, sigmoid(add(f, forget_bias_tensor)))
    written = multiply(sigmoid(i), self._activation(j))
    new_c = add(kept, written)
    new_h = multiply(self._activation(new_c), sigmoid(o))

    # vib
    if self.pruning:
        std = tf.exp(self._logD * 0.5)
        dim = tf.shape(self._logD)[0]

        def sampled_scale():
            # Reparameterised sample: mean plus noise scaled by std.
            return tf.reshape(self._mu, shape=[1, -1]) + tf.random.normal(
                shape=[self.batch_size, dim]) * tf.reshape(std, shape=[1, -1])

        def masked_mean_scale():
            # Adding zeros tiles the mean over the batch before masking.
            tiled_mean = tf.reshape(self._mu, shape=[1, -1]) + tf.zeros(
                shape=[self.batch_size, dim])
            return tiled_mean * self.get_mask()

        z_scale = tf.cond(self.is_training, sampled_scale, masked_mean_scale)
        new_h = new_h * z_scale

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def add_model(self):
    """Build the embedding, encoder and decoder parts of the seq2seq graph.

    The method reads its inputs from attributes (``self.enc_inp``,
    ``self.dec_inp``, their lengths, ``self.dec_out``, ``self.vocab``, the
    hyper-parameters in ``self.p``) and stores every tensor it creates on
    ``self``. Depending on ``self.p.mode`` it builds either the training
    decoder and ``self.loss`` (``'train'``) or a greedy / beam-search
    inference decoder (``'decode'``). It returns nothing.
    """
    with tf.variable_scope("embed_lookup"):
        #modify initializer here to add glove/word2vec
        # One pretrained vector per vocabulary entry except '<unk>'.
        embedding = getGlove([wrd for wrd in self.vocab if wrd != '<unk>'],
                             'wiki_300')
        _wrd_embed = tf.get_variable(
            'embed_matrix', [len(self.vocab) - 1, self.p.embed_dim],
            initializer=tf.constant_initializer(embedding),
            regularizer=self.regularizer)
        # A constant all-zero row is appended as the last row of the matrix.
        # NOTE(review): presumably the token left out above is the one mapped
        # to the last index - confirm against the vocabulary construction.
        wrd_pad = tf.Variable(tf.zeros([1, self.p.embed_dim]),
                              trainable=False)
        self.embed_matrix = tf.concat([_wrd_embed, wrd_pad], axis=0)

    #Embed the source and target sentences. Elmo can be added here
    self.enc_inp_embed = tf.nn.embedding_lookup(self.embed_matrix,
                                                self.enc_inp)
    self.dec_inp_embed = tf.nn.embedding_lookup(self.embed_matrix,
                                                self.dec_inp)

    self.logger.info("Building encoder")
    with tf.variable_scope('encoder'):
        self.enc_cell = self.build_enc_cell()
        self.enc_outputs, self.enc_last_state = tf.nn.dynamic_rnn(
            cell=self.enc_cell,
            inputs=self.enc_inp_embed,
            sequence_length=self.enc_inp_len,
            dtype=self.p.dtype,
            time_major=False,
            scope='enc_rnn')

        #DED part. Also used for get_hidden
        # The same cell and scope encode the decoder-side sentence as well.
        self.enc_outputs_para, self.enc_last_state_para = tf.nn.dynamic_rnn(
            cell=self.enc_cell,
            inputs=self.dec_inp_embed,
            sequence_length=self.dec_inp_len,
            dtype=self.p.dtype,
            time_major=False,
            scope='enc_rnn')

        if self.p.use_bidir:
            # The bidirectional encoder overwrites the outputs / states set
            # by the unidirectional runs above.
            self.fw_cell, self.bw_cell = self.build_bi_enc_cell()
            self.enc_outputs, self.enc_last_state_fw, self.enc_last_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=self.fw_cell,
                cells_bw=self.bw_cell,
                inputs=self.enc_inp_embed,
                sequence_length=self.enc_inp_len,
                dtype=self.p.dtype,
                time_major=False,
                scope='bi_enc_rnn')

            enc_last_state_fw = [state for state in self.enc_last_state_fw]
            enc_last_state_bw = [state for state in self.enc_last_state_bw]

            # Per layer, concatenate forward and backward c and h along the
            # last axis, doubling the state width.
            enc_last_state = []
            for st, _ in enumerate(enc_last_state_fw):
                enc_last_state.append(
                    LSTMStateTuple(
                        tf.concat([
                            enc_last_state_fw[st].c, enc_last_state_bw[st].c
                        ],
                                  axis=-1),
                        tf.concat([
                            enc_last_state_fw[st].h, enc_last_state_bw[st].h
                        ],
                                  axis=-1)))
            self.enc_last_state = tuple(enc_last_state)

            # Same bidirectional encoder, applied to the decoder-side input.
            self.enc_outputs_para, self.enc_last_state_fw_para, self.enc_last_state_bw_para = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=self.fw_cell,
                cells_bw=self.bw_cell,
                inputs=self.dec_inp_embed,
                sequence_length=self.dec_inp_len,
                dtype=self.p.dtype,
                time_major=False,
                scope='bi_enc_rnn')

            enc_last_state_fw_para = [
                state for state in self.enc_last_state_fw_para
            ]
            enc_last_state_bw_para = [
                state for state in self.enc_last_state_bw_para
            ]

            enc_last_state = []
            for st, _ in enumerate(enc_last_state_fw_para):
                enc_last_state.append(
                    LSTMStateTuple(
                        tf.concat([
                            enc_last_state_fw_para[st].c,
                            enc_last_state_bw_para[st].c
                        ],
                                  axis=-1),
                        tf.concat([
                            enc_last_state_fw_para[st].h,
                            enc_last_state_bw_para[st].h
                        ],
                                  axis=-1)))
            self.enc_last_state_para = tuple(enc_last_state)

    if self.p.use_gan:
        self.transformation = self.build_generator(
            self.p.hidden_size *
            2 if self.p.use_bidir else self.p.hidden_size)
        # Only the h of the last layer is shifted, by 10x the generator
        # output; c and all other layers are left untouched.
        enc_last_state = [state for state in self.enc_last_state]
        enc_last_state[-1] = LSTMStateTuple(
            self.enc_last_state[-1].c,
            self.enc_last_state[-1].h + 10 * self.transformation)
        self.enc_last_state = tuple(enc_last_state)

    # Decoder width follows the (possibly doubled) encoder state width.
    self.dec_cell, self.dec_initial_state = self.build_dec_cell(
        self.p.hidden_size * 2 if self.p.use_bidir else self.p.hidden_size)
    self.input_layer = Dense(self.p.hidden_size *
                             2 if self.p.use_bidir else self.p.hidden_size,
                             name="input_projection")
    self.output_layer = Dense(len(self.vocab), name="output_projection")

    if self.p.mode == 'train':
        self.logger.info("Building training decoder")
        self.dec_inp_embed = self.input_layer(
            self.dec_inp_embed
        )  #decoder inputs dim should match encoder outputs dim
        training_helper = seq2seq.TrainingHelper(
            inputs=self.dec_inp_embed,
            sequence_length=self.dec_inp_len,
            time_major=False,
            name='training_helper')
        training_decoder = seq2seq.BasicDecoder(
            cell=self.dec_cell,
            helper=training_helper,
            initial_state=self.dec_initial_state,
            output_layer=self.output_layer)
        self.max_decoder_length = tf.reduce_max(self.dec_inp_len)
        # res = self.debug([self.dec_inp_embed]); pdb.set_trace()
        (self.dec_outputs_train, self.dec_last_state_train,
         self.dec_outputs_length_train) = (seq2seq.dynamic_decode(
             decoder=training_decoder,
             output_time_major=False,
             impute_finished=True,
             maximum_iterations=self.max_decoder_length))

        #since output layer is passed to decoder, logits = output
        self.dec_logits_train = self.dec_outputs_train.rnn_output
        self.dec_pred_train = tf.argmax(self.dec_logits_train,
                                        axis=-1,
                                        name='decoder_pred_train')
        # Weights that zero out the positions beyond each target length.
        masks = tf.sequence_mask(lengths=self.dec_inp_len,
                                 maxlen=tf.shape(self.dec_inp)[1],
                                 dtype=self.p.dtype,
                                 name='masks')
        self.loss = seq2seq.sequence_loss(logits=self.dec_logits_train,
                                          targets=self.dec_out,
                                          weights=masks,
                                          average_across_timesteps=True,
                                          average_across_batch=True)
        tf.summary.scalar('loss', self.loss)

    elif self.p.mode == 'decode':
        self.logger.info("building decoder for inference")
        # One '<sos>' id per batch element.
        start_tokens = tf.ones([self.p.batch_size], tf.int32) * tf.cast(
            self.vocab_table.lookup(tf.constant('<sos>')), tf.int32)
        self.start_tokens = start_tokens
        # pdb.set_trace()
        end_token = tf.cast(self.vocab_table.lookup(tf.constant('<eos>')),
                            tf.int32)

        def embed_and_input_proj(inputs):
            # Embedding lookup followed by the same input projection that the
            # training branch applies to the decoder inputs.
            return self.input_layer(
                tf.nn.embedding_lookup(self.embed_matrix, inputs))

        if not self.p.use_beam_search:
            self.logger.info("Building greedy decoder")
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=embed_and_input_proj)
            inference_decoder = seq2seq.BasicDecoder(
                cell=self.dec_cell,
                helper=decoding_helper,
                initial_state=self.dec_initial_state,
                output_layer=self.output_layer)
        else:
            self.logger.info("Building beam search decoder")
            inference_decoder = beam_search_decoder.BeamSearchDecoder(
                cell=self.dec_cell,
                embedding=embed_and_input_proj,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.dec_initial_state,
                beam_width=self.p.beam_width,
                output_layer=self.output_layer)

        (self.dec_out_decode, self.dec_last_state_decode,
         self.dec_out_length_decode) = (seq2seq.dynamic_decode(
             inference_decoder,
             output_time_major=False,
             maximum_iterations=self.p.max_decode_step))

        if not self.p.use_beam_search:
            #batchsize X seq_len X 1
            self.dec_pred_decode = tf.expand_dims(
                self.dec_out_decode.sample_id, -1)
        else:
            #batch_size X seq_len X beam_width
            self.dec_pred_decode = self.dec_out_decode.predicted_ids
encoder_fw_final_state, encoder_bw_final_state) = tf.nn.bidirectional_dynamic_rnn( cell_fw=encoder_cell, cell_bw=encoder_cell, inputs=encoder_inputs_embedded, sequence_length=encoder_inputs_length, dtype=tf.float32, time_major=True) # 融合双向 LSTM 的状态 encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), axis=2) encoder_final_state_c = tf.concat( (encoder_fw_final_state.c, encoder_bw_final_state.c), axis=1) encoder_final_state_h = tf.concat( (encoder_fw_final_state.h, encoder_bw_final_state.h), axis=1) encoder_final_state = LSTMStateTuple(c=encoder_final_state_c, h=encoder_final_state_h) # decoder decoder_cell = LSTMCell(decoder_hidden_units) encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs)) decoder_lengths = encoder_inputs_length + 3 """ Decoder will contain manually specified by us transition step: output(t) -> output projection(t) -> prediction(t) (argmax) -> input embedding(t+1) -> input(t+1) """ assert EOS == 1 and PAD == 0 eos_time_slice = tf.ones((batch_size, ), dtype=tf.int32, name='EOS') pad_time_slice = tf.zeros((batch_size, ), dtype=tf.int32, name='PAD')
def state_size(self):
    """Return the size of the recurrent state.

    Returns:
        ``LSTMStateTuple(num_units, num_units)`` when the state is kept as a
        tuple, otherwise the integer ``2 * num_units`` (c and h concatenated).
    """
    if not self._state_is_tuple:
        return 2 * self._num_units
    return LSTMStateTuple(self._num_units, self._num_units)
def speller(encoder_outputs, encoder_state, decoder_inputs, source_sequence_length, target_sequence_length, mode, hparams):
    """Build the decoder: a stack of LSTM cells decoded with tf.contrib.seq2seq.

    Depending on ``mode`` the decoder is driven by teacher forcing (optionally
    with scheduled sampling), by beam search (``PREDICT`` with
    ``hparams.beam_width > 0``) or by greedy decoding.

    Args:
        encoder_outputs: encoder output tensor; only its leading (batch)
            dimension is read here.
        encoder_state: sequence of per-layer encoder states. Each of the last
            ``hparams.num_layers`` entries is indexed as ``es[0]`` / ``es[1]``
            with ``.c`` and ``.h`` attributes - presumably the forward and
            backward LSTM states; confirm against the encoder.
        decoder_inputs: target token ids, used in ``TRAIN`` mode only.
        source_sequence_length: lengths of the source sequences.
        target_sequence_length: lengths of the target sequences.
        mode: a ``tf.estimator.ModeKeys`` value.
        hparams: hyper-parameters (``beam_width``, ``embedding_size``,
            ``target_vocab_size``, ``num_layers``, ``num_units``, ``dropout``,
            ``decoding_length_factor``, ``sampling_probability``, ``sos_id``,
            ``eos_id``).

    Returns:
        ``(decoder_outputs, final_context_state, final_sequence_length)`` as
        returned by ``tf.contrib.seq2seq.dynamic_decode``.
    """
    batch_size = tf.shape(encoder_outputs)[0]
    beam_width = hparams.beam_width

    if mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
        # Beam search works on a batch in which every example is repeated
        # beam_width times.
        source_sequence_length = tf.contrib.seq2seq.tile_batch(
            source_sequence_length, multiplier=beam_width)
        encoder_state = tf.contrib.seq2seq.tile_batch(
            encoder_state, multiplier=beam_width)
        batch_size = batch_size * beam_width

    def embedding_fn(ids):
        # pass callable object to avoid OOM when using one-hot encoding
        # NOTE(review): tf.get_variable runs on every call, so the enclosing
        # variable scope presumably has to allow reuse - confirm at the
        # call site.
        if hparams.embedding_size != 0:
            target_embedding = tf.get_variable(
                'target_embedding',
                [hparams.target_vocab_size, hparams.embedding_size],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            return tf.nn.embedding_lookup(target_embedding, ids)
        else:
            return tf.one_hot(ids, hparams.target_vocab_size)

    cell_list = []
    for layer in range(hparams.num_layers):
        # The format string previously had no replacement field, so every
        # layer was created under the identical scope name 'decoder_cell_'.
        with tf.variable_scope('decoder_cell_{}'.format(layer)):
            cell = lstm_cell(hparams.num_units * 2, hparams.dropout, mode)
        cell_list.append(cell)

    decoder_cell = tf.nn.rnn_cell.MultiRNNCell(cell_list)

    projection_layer = tf.layers.Dense(
        hparams.target_vocab_size, use_bias=True, name='projection_layer')

    # Each decoder layer starts from the two encoder states of the matching
    # layer concatenated along the last axis, hence num_units * 2 above.
    initial_state = tuple([
        LSTMStateTuple(c=tf.concat([es[0].c, es[1].c], axis=-1),
                       h=tf.concat([es[0].h, es[1].h], axis=-1))
        for es in encoder_state[-hparams.num_layers:]])

    maximum_iterations = None
    if mode != tf.estimator.ModeKeys.TRAIN:
        # Outside training the output length is capped relative to the
        # longest source sequence.
        max_source_length = tf.reduce_max(source_sequence_length)
        maximum_iterations = tf.to_int32(tf.round(tf.to_float(
            max_source_length) * hparams.decoding_length_factor))

    if mode == tf.estimator.ModeKeys.TRAIN:
        decoder_inputs = embedding_fn(decoder_inputs)

        if hparams.sampling_probability > 0.0:
            helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                decoder_inputs, target_sequence_length,
                embedding_fn, hparams.sampling_probability)
        else:
            helper = tf.contrib.seq2seq.TrainingHelper(
                decoder_inputs, target_sequence_length)

        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell, helper, initial_state, output_layer=projection_layer)

    elif mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
        # batch_size was multiplied by beam_width above; the start tokens are
        # needed once per original example.
        start_tokens = tf.fill(
            [tf.div(batch_size, beam_width)], hparams.sos_id)

        decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=embedding_fn,
            start_tokens=start_tokens,
            end_token=hparams.eos_id,
            initial_state=initial_state,
            beam_width=beam_width,
            output_layer=projection_layer)

    else:
        start_tokens = tf.fill([batch_size], hparams.sos_id)

        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding_fn, start_tokens, hparams.eos_id)

        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell, helper, initial_state, output_layer=projection_layer)

    decoder_outputs, final_context_state, final_sequence_length = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=maximum_iterations)

    return decoder_outputs, final_context_state, final_sequence_length
def build_encoding_model(self):
    """Build the forward/backward language-model encoders and pooled features.

    Creates the input placeholders, a shared character ``Embedding`` model and
    two ``UniModel`` instances (``'LMFW'`` and ``'LMBW'``). Each direction is
    unrolled with a ``tf.while_loop`` that consumes the sequence in windows of
    ``self.bptt`` steps while accumulating, per RNN layer, the running max,
    the running mean, all masked outputs and the last valid output.

    The results are stored on ``self`` as ``layerwise_max``,
    ``layerwise_avg``, ``layerwise_last``, ``timewise_outputs`` and
    ``layerwise_encode`` (lists with one entry per layer). Returns nothing.
    """
    # Inputs are indexed with the sequence axis first and the batch axis
    # second (see seq_axis=0 / batch_axis=1 below).
    self.fw_inputs = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None, None),
                                    name='fw_inputs')
    # The backward model reads the same data reversed within each sequence's
    # valid length.
    self.bw_inputs = tf.reverse_sequence(input=self.fw_inputs,
                                         seq_lengths=self.seq_lens,
                                         seq_axis=0,
                                         batch_axis=1,
                                         name='bw_inputs')
    self.fw_char_lens = tf.placeholder(dtype=tf.int32,
                                       shape=[None, None],
                                       name='fw_char_lens')
    self.bw_char_lens = tf.reverse_sequence(input=self.fw_char_lens,
                                            seq_lengths=self.seq_lens,
                                            seq_axis=0,
                                            batch_axis=1,
                                            name='bw_char_lens')
    self.inputs = self.fw_inputs
    self.char_lens = self.fw_char_lens
    self.bptt = tf.placeholder(dtype=tf.int32, name='bptt', shape=())
    # seq_masks = tf.expand_dims(self.seq_masks, axis=-1)
    input_shape = tf.shape(self.inputs)
    B = input_shape[1]  # dynamic batch size
    # When fine_tune_lr is a list, its first entry is used for the embedding
    # and the remaining entries are handed to the RNN models.
    fw_model = UniModel(self.rnn_layers,
                        self.projection_dims,
                        self.skip_connection,
                        self.is_training,
                        self.fine_tune_lr[1:] if isinstance(
                            self.fine_tune_lr, list) else None,
                        self.reuse,
                        'LMFW',
                        is_cpu=self.is_cpu)
    bw_model = UniModel(self.rnn_layers,
                        self.projection_dims,
                        self.skip_connection,
                        self.is_training,
                        self.fine_tune_lr[1:] if isinstance(
                            self.fine_tune_lr, list) else None,
                        self.reuse,
                        'LMBW',
                        is_cpu=self.is_cpu)
    embed_model = Embedding(self.char_vocab_size, self.char_vec_size,
                            self.reuse, self.char_cnn_options['layers'],
                            self.char_cnn_options['n_highways'],
                            self.projection_dims, self.is_training,
                            self.drop_e)
    embed_model.build()
    if isinstance(self.fine_tune_lr, list):
        embed_custom_lr = apply_custom_lr(self.fine_tune_lr[0])
    else:

        def embed_custom_lr(x):
            # No custom learning rate: pass the embeddings through unchanged.
            return x

    fw_model.build(embed_model.output_shape)
    bw_model.build(embed_model.output_shape)

    # Loop-variable initial values, one entry per RNN layer.
    initial_states = []
    start_max_vals = []
    start_mean_vals = []
    start_outputs = []
    start_last_outputs = []
    start_output_shapes = []
    projection_dims = self.projection_dims if isinstance(
        self.projection_dims, int) and self.projection_dims > 0 else None
    for layer in self.rnn_layers:
        if self.is_cpu:
            zeros = tf.fill(value=0.0, dims=(B, layer['units']))
            initial_states.append(LSTMStateTuple(zeros, zeros))
        else:
            # The non-CPU path uses plain tuples with an extra leading axis
            # of size 1.
            # NOTE(review): presumably the layout of a fused GPU LSTM state -
            # confirm against UniModel.
            zeros = tf.fill(value=0.0, dims=(1, B, layer['units']))
            initial_states.append((zeros, zeros))
        # Output width is the projection size when one is configured,
        # otherwise the layer's unit count.
        dims = projection_dims if self.projection_dims else layer['units']
        # -1e6 acts as "minus infinity" for the running max.
        max_val = tf.fill(value=-1e6, dims=(B, dims))
        mean_val = tf.fill(value=0.0, dims=(B, dims))
        # Empty along the sequence axis; windows are concatenated onto it.
        start_output = tf.fill(value=0.0, dims=(0, B, dims))
        start_max_vals.append(max_val)
        start_mean_vals.append(mean_val)
        start_last_outputs.append(mean_val)
        start_outputs.append(start_output)
        # The sequence axis grows every iteration, so it is left unknown in
        # the shape invariant.
        start_output_shapes.append(tf.TensorShape((None, None, dims)))
    max_len = tf.reduce_max(self.seq_lens)

    def cond(i, state, max_vals, mean_vals, all_outputs, last_outputs):
        # Keep looping until the longest sequence has been consumed.
        return i < max_len

    def body(embed, model, inputs, char_lens, sl, bptt, max_len):
        # Returns the while_loop body bound to one direction's model/inputs.

        def child(i, state, max_vals, mean_vals, all_outputs, last_outputs):
            # Process the window [i, i_to) of the sequence axis.
            i_to = tf.minimum(i + bptt, max_len)
            slice_inputs = inputs[i:i_to]
            slice_char_lens = char_lens[i:i_to]
            slice_inputs = embed.call(slice_inputs, slice_char_lens)
            slice_inputs = embed_custom_lr(slice_inputs)
            output_dict = model.call(slice_inputs, state)
            # Number of valid steps each sequence has inside this window.
            slice_seq_lens = tf.minimum(sl - i, bptt)
            # sequence_mask yields one row per sequence; it is transposed to
            # put the step axis first and given a trailing axis to broadcast
            # over features.
            mask = tf.expand_dims(tf.transpose(
                tf.sequence_mask(slice_seq_lens, dtype=tf.float32), (1, 0)),
                                  axis=-1)
            next_max_vals = []
            next_mean_vals = []
            new_all_outputs = []
            new_last_outputs = []
            # The first entry of 'layer_outputs' is skipped.
            # NOTE(review): presumably the embedding-level output - confirm
            # against UniModel.call.
            for max_val, mean_val, outputs, past_outputs, last_output in zip(
                    max_vals, mean_vals, output_dict['layer_outputs'][1:],
                    all_outputs, last_outputs):
                # Padded steps are pushed to -1e6 so they never win the max.
                max_outputs = outputs * mask + (1 - mask) * -1e6
                max_val = tf.maximum(max_val,
                                     tf.reduce_max(max_outputs, axis=0))
                mean_outputs = outputs * mask
                # Running mean: previous mean times the number of valid steps
                # seen so far, plus this window's sum, divided by the new
                # number of valid steps.
                # NOTE(review): the divisor is 0 for a sequence of length 0 -
                # confirm such inputs cannot occur.
                mean_val = (
                    mean_val * tf.expand_dims(
                        tf.to_float(tf.minimum(i, sl)), axis=-1) +
                    tf.reduce_sum(mean_outputs, axis=0)) / tf.expand_dims(
                        tf.to_float(tf.minimum(i_to, sl)), axis=-1)
                next_max_vals.append(max_val)
                next_mean_vals.append(mean_val)
                new_all_outputs.append(
                    tf.concat((past_outputs, mean_outputs), axis=0))
                last_val = get_last_output(mean_outputs, slice_seq_lens)
                # Sequences with no valid step in this window keep the last
                # output recorded in an earlier window.
                last_val = tf.where(slice_seq_lens > 0, last_val, last_output)
                new_last_outputs.append(last_val)
            return i_to, output_dict[
                'states'], next_max_vals, next_mean_vals, new_all_outputs, new_last_outputs

        return child

    start_i = tf.constant(0, dtype=tf.int32, shape=(), name='start_i')
    # Forward direction.
    _, _, fw_layerwise_max, fw_layerwise_avg, fw_outputs, fw_last_output = tf.while_loop(
        cond,
        body(embed_model, fw_model, self.fw_inputs, self.fw_char_lens,
             self.seq_lens, self.bptt, max_len), [
                 start_i, initial_states, start_max_vals, start_mean_vals,
                 start_outputs, start_last_outputs
             ], [
                 start_i.get_shape(),
                 [
                     LSTMStateTuple(x.get_shape(), y.get_shape())
                     if self.is_cpu else (x.get_shape(), y.get_shape())
                     for x, y in initial_states
                 ], [x.get_shape() for x in start_max_vals],
                 [x.get_shape() for x in start_mean_vals],
                 start_output_shapes,
                 [x.get_shape() for x in start_last_outputs]
             ],
        swap_memory=True)
    # Backward direction: same loop over the reversed inputs.
    _, _, bw_layerwise_max, bw_layerwise_avg, bw_outputs, bw_last_output = tf.while_loop(
        cond,
        body(embed_model, bw_model, self.bw_inputs, self.bw_char_lens,
             self.seq_lens, self.bptt, max_len), [
                 start_i, initial_states, start_max_vals, start_mean_vals,
                 start_outputs, start_last_outputs
             ], [
                 start_i.get_shape(),
                 [
                     LSTMStateTuple(x.get_shape(), y.get_shape())
                     if self.is_cpu else (x.get_shape(), y.get_shape())
                     for x, y in initial_states
                 ], [x.get_shape() for x in start_max_vals],
                 [x.get_shape() for x in start_mean_vals],
                 start_output_shapes,
                 [x.get_shape() for x in start_last_outputs]
             ],
        swap_memory=True)
    # Per layer, forward and backward features are concatenated on the
    # feature axis.
    self.layerwise_max = [
        tf.concat((fw, bw), axis=-1)
        for fw, bw in zip(fw_layerwise_max, bw_layerwise_max)
    ]
    self.layerwise_avg = [
        tf.concat((fw, bw), axis=-1)
        for fw, bw in zip(fw_layerwise_avg, bw_layerwise_avg)
    ]
    self.layerwise_last = [
        tf.concat((fw, bw), axis=-1)
        for fw, bw in zip(fw_last_output, bw_last_output)
    ]
    # Backward outputs are reversed again so both directions line up
    # step by step.
    self.timewise_outputs = [
        tf.concat((fw,
                   tf.reverse_sequence(input=bw,
                                       seq_lengths=self.seq_lens,
                                       seq_axis=0,
                                       batch_axis=1)),
                  axis=-1) for fw, bw in zip(fw_outputs, bw_outputs)
    ]
    self.layerwise_encode = [
        tf.concat(out, axis=-1)
        for out in zip(fw_layerwise_avg, fw_layerwise_max,
                       bw_layerwise_avg, bw_layerwise_max)
    ]
def build_encoder(self, encoder_inputs, encoder_input_lengths):
    """Build an RNN encoder over embedded input characters.

    The encoder is unidirectional or bidirectional depending on
    ``self.bidir``. In ``"training"`` mode every cell is wrapped in a
    ``DropoutWrapper`` using ``1.0 - self.dropout`` as input, output and state
    keep probability. Both variants pass ``encoder_input_lengths`` to the RNN
    so that padding steps do not influence the final state.

    Args:
        encoder_inputs: integer ids looked up in the character embedding
            table. The RNN is run with ``time_major=True``.
        encoder_input_lengths: length of every input sequence.

    Returns:
        ``(encoder_outputs, encoder_final_state)``. In the bidirectional case
        the outputs are the forward and backward outputs concatenated along
        the last axis, and the final state is an ``LSTMStateTuple`` whose
        ``c`` and ``h`` are the forward and backward values concatenated
        along axis 1.
    """

    def make_cell():
        # One recurrent cell, wrapped in dropout only while training.
        cell = self.cell_class_fn(self.hidden_dims)
        if self.mode == "training":
            keep_prob = 1.0 - self.dropout
            cell = DropoutWrapper(
                cell,
                input_keep_prob=keep_prob,
                output_keep_prob=keep_prob,
                state_keep_prob=keep_prob)
        return cell

    with tf.variable_scope("encoder"):
        # Embeddings for orthographic characters
        char_embeddings = tf.Variable(
            tf.random_uniform((self.n_chars, self.embed_dims), -1.0, 1.0),
            name="char_embeddings")
        encoder_input_embeddings = tf.nn.embedding_lookup(
            char_embeddings, encoder_inputs)

        # Unidirectional Run
        if not self.bidir:
            encoder_cell = make_cell()
            # sequence_length was previously omitted here, so the final state
            # was taken after the padding steps, unlike the bidirectional run.
            encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
                encoder_cell,
                encoder_input_embeddings,
                sequence_length=encoder_input_lengths,
                dtype=tf.float32,
                time_major=True)

        # Bidirectional Run
        else:
            with tf.variable_scope("fw"):
                fw_encoder_cell = make_cell()
            with tf.variable_scope("bw"):
                bw_encoder_cell = make_cell()

            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_final_state, encoder_bw_final_state)) = (
                 tf.nn.bidirectional_dynamic_rnn(
                     cell_fw=fw_encoder_cell,
                     cell_bw=bw_encoder_cell,
                     inputs=encoder_input_embeddings,
                     sequence_length=encoder_input_lengths,
                     dtype=tf.float32,
                     time_major=True))

            # Concat final states of forward and backward run
            encoder_final_state_c = tf.concat(
                (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
            encoder_final_state_h = tf.concat(
                (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
            encoder_final_state = LSTMStateTuple(c=encoder_final_state_c,
                                                 h=encoder_final_state_h)
            encoder_outputs = tf.concat(
                (encoder_fw_outputs, encoder_bw_outputs), -1)

        return encoder_outputs, encoder_final_state
def build(self):
    """Build the sequence-to-sequence graph, optionally with a latent code.

    Placeholders, the shared embedding matrix, a multi-layer encoder and a
    multi-layer decoder (``self.enc_layers`` layers each) are created. With
    ``self.vae`` set, the decoder is initialised from a sample of a diagonal
    Gaussian whose parameters are projected from the length-averaged encoder
    outputs; otherwise it is initialised from the encoder state.

    In ``"train"`` mode the method also builds the training decoder, the
    loss (plus a KL term weighted by ``self.lambda_kl`` for the VAE), an Adam
    optimizer, ``self.model_saver`` and the ``train_output`` /
    ``valid_output`` dictionaries. ``self.infer_output`` is set in every
    mode.

    Raises:
        NotImplementedError: if ``self.vae`` is set with a prior other than
            ``"gaussian"``.
    """
    print("Building the sequence to sequence model ... ")

    vocab_size = self.vocab_size
    state_size = self.state_size
    enc_layers = self.enc_layers

    # Placeholders
    with tf.name_scope("placeholders"):
        enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
        inp_lens = tf.placeholder(tf.int32, [None], "inp_lens")
        self.drop_out = tf.placeholder(tf.float32, (), "drop_out")

        self.enc_inputs = enc_inputs
        self.inp_lens = inp_lens

        if self.mode == "train":
            dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
            targets = tf.placeholder(tf.int32, [None, None], "targets")
            out_lens = tf.placeholder(tf.int32, [None], "out_lens")
            self.learning_rate = tf.placeholder(tf.float32, (),
                                                "learning_rate")
            self.lambda_kl = tf.placeholder(tf.float32, (), "lambda_kl")

            self.dec_inputs = dec_inputs
            self.targets = targets
            self.out_lens = out_lens

        batch_size = tf.shape(enc_inputs)[0]
        max_len = tf.shape(enc_inputs)[1]

    # Embedding
    with tf.variable_scope("embeddings"):
        embedding_matrix = tf.get_variable(
            name="embedding_matrix",
            shape=[vocab_size, state_size],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.05))

        # From here on enc_inputs / dec_inputs hold embeddings, not ids.
        enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)

        if self.mode == "train":
            dec_inputs = tf.nn.embedding_lookup(embedding_matrix, dec_inputs)

    # Encoder
    with tf.variable_scope("encoder"):
        # TODO: residual LSTM, layer normalization
        # if(self.bidirectional)
        #   enc_cell_fw = [create_cell(
        #     "enc-fw-%d" % i, state_size, self.drop_out, self.no_residual)
        #     for i in range(enc_layers)]
        #   enc_cell_bw = [create_cell(
        #     "enc-bw-%d" % i, state_size, self.drop_out, self.no_residual)
        #     for i in range(enc_layers)]
        # else:
        enc_cell = [
            create_cell("enc-%d" % i, state_size, self.drop_out,
                        self.no_residual) for i in range(enc_layers)
        ]
        enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
        enc_outputs, enc_state = tf.nn.dynamic_rnn(
            enc_cell, enc_inputs, sequence_length=inp_lens, dtype=tf.float32)

    # Decoder
    with tf.variable_scope("decoder"):
        dec_cell = [
            create_cell("dec-%d" % i, state_size, self.drop_out,
                        self.no_residual) for i in range(enc_layers)
        ]
        dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)

        dec_proj = tf.layers.Dense(
            vocab_size,
            name="dec_proj",
            kernel_initializer=tf.random_normal_initializer(stddev=0.05),
            bias_initializer=tf.constant_initializer(0.))

        # latent code
        if self.vae:
            print("Using vae model")
            with tf.variable_scope("latent_code"):
                # Encoder outputs summed over axis 1 and divided by the
                # input length.
                enc_mean = tf.reduce_sum(enc_outputs, 1)
                enc_mean /= tf.expand_dims(tf.cast(inp_lens, tf.float32), [1])
                z_code = enc_mean

                if self.prior == "gaussian":
                    print("Gaussian prior")
                    latent_proj = tf.layers.Dense(
                        2 * state_size,
                        name="latent_proj",
                        kernel_initializer=tf.random_normal_initializer(
                            stddev=0.05),
                        bias_initializer=tf.constant_initializer(0.))
                    z_loc, z_scale = tf.split(latent_proj(z_code),
                                              [state_size, state_size], 1)
                    z_mvn = tfd.MultivariateNormalDiag(z_loc, z_scale)
                    z_sample = z_mvn.sample()
                else:
                    # Sketch of the unfinished von Mises-Fisher prior:
                    # latent_proj = tf.layers.Dense(state_size + 1, name="latent_proj",
                    #   kernel_initializer=tf.random_normal_initializer(stddev=0.05),
                    #   bias_initializer=tf.constant_initializer(0.))
                    # z_mu, z_conc = tf.split(
                    #   latent_proj(z_code), [state_size, 1], 1)
                    # z_mu /= tf.expand_dims(tf.norm(z_mu, axis=1), axis=1)
                    # z_conc = tf.reshape(z_conc, [batch_size])
                    # z_vmf = tfd.VonMisesFisher(z_mu, z_conc)
                    # z_sample = z_vmf.sample()
                    #
                    # Without this check the code below failed with a
                    # NameError on the undefined z_sample.
                    raise NotImplementedError(
                        "prior %r is not implemented" % (self.prior,))

                # One (c, h) pair per decoder layer. The state used to be
                # hard-coded to two layers, which only matched
                # enc_layers == 2.
                dec_init_state = tuple(
                    LSTMStateTuple(c=z_sample, h=z_sample)
                    for _ in range(enc_layers))
        else:
            print("Using normal seq2seq, no latent variable")
            dec_init_state = enc_state

        with tf.variable_scope("decoding"):
            # greedy decoding
            _, dec_outputs_predict = decoding_infer(
                self.dec_start_id,
                dec_cell,
                dec_proj,
                embedding_matrix,
                dec_init_state,
                enc_outputs,
                batch_size,
                max_len,
                inp_lens,
                max_len,
                self.is_attn,
                self.sampling_method,
                self.topk_sampling_size,
                state_size=self.state_size)

            # decoding with forward sampling
            # dec_outputs_sampling = decodeing_infer() # TBC

            if self.mode == "train":
                # training decoding
                dec_logits_train, _, _, _, _ = decoding_train(
                    dec_inputs, dec_cell, dec_proj, dec_init_state,
                    enc_outputs, max_len, inp_lens, max_len, self.is_attn,
                    self.state_size)

                # Collected before the optimizer is created, so optimizer
                # variables are not part of the saver.
                all_variables = slim.get_variables_to_restore()
                model_variables = [
                    var for var in all_variables
                    if var.name.split("/")[0] == self.model_name
                ]
                print("%s model, variable list:" % self.model_name)
                for v in model_variables:
                    print("  %s" % v.name)
                self.model_saver = tf.train.Saver(all_variables,
                                                  max_to_keep=3)

                # loss and optimizer
                dec_mask = tf.sequence_mask(out_lens, max_len,
                                            dtype=tf.float32)
                dec_loss = tf.contrib.seq2seq.sequence_loss(
                    dec_logits_train, targets, dec_mask)

                if self.vae:
                    # Only the Gaussian prior reaches this point. The KL
                    # term is estimated from the single posterior sample.
                    standard_normal = tfd.MultivariateNormalDiag(
                        tf.zeros(state_size), tf.ones(state_size))
                    prior_prob = standard_normal.log_prob(z_sample)  # [B]
                    posterior_prob = z_mvn.log_prob(z_sample)  # [B]
                    kl_loss = tf.reduce_mean(posterior_prob - prior_prob)
                    loss = dec_loss + self.lambda_kl * kl_loss
                    # Sketch for the unfinished vmf prior:
                    # vmf_mu_0 = tf.ones(state_size) / tf.cast(state_size, tf.float32)
                    # standard_vmf = tfd.VonMisesFisher(vmf_mu_0, 0)
                    # prior_prob = standard_vmf.log_prob(z_sample) # [B]
                    # posterior_prob = z_vmf.log_prob(z_sample) # [B]
                    # kl_loss = tf.reduce_mean(posterior_prob - prior_prob)
                    # loss = dec_loss + self.lambda_kl * kl_loss
                else:
                    loss = dec_loss

                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                train_op = optimizer.minimize(loss)

                self.train_output = {"train_op": train_op, "loss": loss}
                self.train_output.update(self.inspect)
                if self.vae:
                    self.train_output["dec_loss"] = dec_loss
                    self.train_output["kl_loss"] = kl_loss

                self.valid_output = {"nll": tf.exp(loss)}
                self.infer_output = {"dec_predict": dec_outputs_predict}
            else:
                self.infer_output = {"dec_predict": dec_outputs_predict}
    return
def build_graph(self):
    """Assemble the topic-conditioned generator graph.

    Everything is created under the ``seq2seq`` variable scope:

    * ``embedding`` - a trainable table initialised from ``self.pretrain_wv``;
      it is looked up for both the topic tokens and the decoder inputs.
    * ``encoder`` - topic embeddings are averaged over axis 1 and projected by
      a dense layer with ``self.hidden_size`` units.
    * ``decoder`` - a single LSTM cell whose initial ``c`` and ``h`` are both
      the encoded topic.  The same cell and output layer are decoded twice:
      teacher-forced (pre-training) and greedily (inference).

    Attributes set on ``self``: ``embedding``, ``initial_state``,
    ``decoder_cell``, ``decoder_input_embedded``, ``output_layer``,
    ``logits_pt``, ``g_predictions``, ``target_output``, ``pretrain_loss``,
    ``global_step``, ``pretrain_updates`` and ``infer_tokens``.
    """
    print("building generator graph...")

    with tf.variable_scope("seq2seq"):
        with tf.variable_scope("embedding"):
            # One table shared by encoder and decoder, seeded with the
            # pre-trained word vectors and fine-tuned during training.
            self.embedding = tf.get_variable(
                'embedding',
                [self.vocab_size, self.embedding_size],
                dtype=tf.float32,
                trainable=True,
                initializer=tf.constant_initializer(self.pretrain_wv))

        with tf.variable_scope("encoder"):
            # Encode the topic as the projected mean of its token embeddings.
            topic_vectors = tf.nn.embedding_lookup(self.embedding, self.topic_input)
            topic_mean = tf.reduce_mean(topic_vectors, axis=1)
            topic_code = tf.layers.dense(topic_mean, self.hidden_size)

        with tf.variable_scope("decoder"):
            # Single-layer decoder; the topic code seeds both c and h.
            self.initial_state = LSTMStateTuple(c=topic_code, h=topic_code)

            lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_size)
            if self.training_flag:
                # NOTE(review): no keep probabilities are passed, so the
                # wrapper's defaults apply - confirm dropout is really active.
                lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell)
            self.decoder_cell = lstm_cell

            self.decoder_input_embedded = tf.nn.embedding_lookup(
                self.embedding, self.target_input)
            self.output_layer = layers_core.Dense(self.vocab_size, use_bias=False)

            # Options common to the teacher-forced and the greedy decode.
            decode_options = dict(
                output_time_major=False,
                maximum_iterations=self.max_len,
                swap_memory=True,
                impute_finished=True,
            )

            # ---- pre-training: teacher forcing on the target sequence ----
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.decoder_input_embedded,
                sequence_length=self.sequence_lengths,
                time_major=False,
            )
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=train_helper,
                initial_state=self.initial_state,
                output_layer=self.output_layer,
            )
            train_outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder, **decode_options)[0]

            self.logits_pt = train_outputs.rnn_output
            self.g_predictions = tf.nn.softmax(self.logits_pt)

            # NOTE(review): the mask is built from self.target_len while the
            # helper above reads self.sequence_lengths - confirm they agree.
            loss_mask = tf.sequence_mask(
                lengths=self.target_len,
                maxlen=self.max_len,
                dtype=tf.float32,
                name='masks')

            self.target_output = tf.placeholder(tf.int32, [None, None])
            self.pretrain_loss = tf.contrib.seq2seq.sequence_loss(
                self.logits_pt,
                self.target_output,
                loss_mask,
                average_across_timesteps=True,
                average_across_batch=True)

            self.global_step = tf.Variable(0, trainable=False)

            # Adam step with gradients clipped by their global norm.
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            grads_and_vars = optimizer.compute_gradients(self.pretrain_loss)
            raw_grads, variables = zip(*grads_and_vars)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.grad_norm)
            self.pretrain_updates = optimizer.apply_gradients(
                zip(clipped_grads, variables), global_step=self.global_step)

            # ---- inference: greedy decoding from <GO> until <EOS> ----
            go_tokens = tf.fill([self.batch_size], self.vocab_dict['<GO>'])
            greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.embedding,
                go_tokens,
                end_token=self.vocab_dict['<EOS>'],
            )
            greedy_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=greedy_helper,
                initial_state=self.initial_state,
                output_layer=self.output_layer,
            )
            greedy_outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=greedy_decoder, **decode_options)[0]

            # One tensor of sampled ids per entry along axis 0.
            self.infer_tokens = tf.unstack(greedy_outputs.sample_id, axis=0)

    print("generator graph built successfully")
def build(self):
    """Build the Latent BOW sequence-to-sequence graph.

    Creates the input placeholders, three embedding tables (values, keys,
    locations), a stacked encoder, the encoder-side bag-of-words predictor
    with its loss and metrics, a gumbel top-k sample of BOW words used as
    decoder memory, and the decoder with its loss and train op.

    Attributes set on ``self``: the placeholders (``enc_keys``, ``enc_locs``,
    ``enc_vals``, ``enc_lens``, ``enc_targets``, ``dec_inputs``,
    ``dec_targets``, ``dec_lens``, ``drop_out``, ``gumbel_tau``),
    ``train_output``, ``infer_output`` and ``model_saver``.
    """
    print("Building the Latent BOW - sequence to sequence model ... ")

    vocab_size = self.vocab_size
    key_size = self.key_size
    state_size = self.state_size
    enc_layers = self.enc_layers
    max_enc_bow = self.max_enc_bow
    lambda_enc_loss = self.lambda_enc_loss

    # Placeholders
    with tf.name_scope("placeholders"):
        enc_keys = tf.placeholder(tf.int32, [None, None], "enc_keys")
        enc_locs = tf.placeholder(tf.int32, [None, None], "enc_locs")
        enc_vals = tf.placeholder(tf.int32, [None, None], "enc_vals")
        enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
        self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
        self.gumbel_tau = tf.placeholder(tf.float32, (), "gumbel_tau")

        self.enc_keys = enc_keys
        self.enc_locs = enc_locs
        self.enc_vals = enc_vals
        self.enc_lens = enc_lens

        enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
        dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
        dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
        dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

        self.enc_targets = enc_targets
        self.dec_inputs = dec_inputs
        self.dec_targets = dec_targets
        self.dec_lens = dec_lens

        batch_size = tf.shape(enc_keys)[0]
        max_enc_len = tf.shape(enc_keys)[1]
        max_dec_len = tf.shape(dec_targets)[1]

    # Embedding
    with tf.variable_scope("embeddings"):
        embedding_matrix_vals = tf.get_variable(
            name="embedding_matrix_vals",
            shape=[vocab_size, state_size],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.05))
        embedding_matrix_keys = tf.get_variable(
            name="embedding_matrix_keys",
            shape=[key_size, state_size],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.05))
        # The location table has a fixed size of 100 rows.
        embedding_matrix_locs = tf.get_variable(
            name="embedding_matrix_locs",
            shape=[100, state_size],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.05))

        enc_keys = tf.nn.embedding_lookup(embedding_matrix_keys, enc_keys)
        enc_vals = tf.nn.embedding_lookup(embedding_matrix_vals, enc_vals)
        enc_locs = tf.nn.embedding_lookup(embedding_matrix_locs, enc_locs)
        # Encoder input is the plain average of the three embeddings.
        enc_inputs = (enc_keys + enc_vals + enc_locs) / 3.
        dec_inputs = tf.nn.embedding_lookup(embedding_matrix_vals, dec_inputs)

    # Encoder
    with tf.variable_scope("encoder"):
        # TODO: residual LSTM, layer normalization
        enc_cell = [
            create_cell(
                "enc-%d" % i, state_size, self.drop_out, self.no_residual)
            for i in range(enc_layers)]
        enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
        enc_outputs, enc_state = tf.nn.dynamic_rnn(
            enc_cell, enc_inputs, sequence_length=enc_lens, dtype=tf.float32)

    # Encoder bow prediction
    with tf.variable_scope("bow_output"):
        (bow_topk_prob, gumbel_topk_prob,
         seq_neighbor_ind, seq_neighbor_prob) = bow_predict_seq_tag(
            vocab_size, batch_size, enc_outputs, enc_lens, max_enc_len,
            self.is_gumbel, self.gumbel_tau)
        seq_neighbor_output = {
            "seq_neighbor_ind": seq_neighbor_ind,
            "seq_neighbor_prob": seq_neighbor_prob}

    # Encoder output, loss and metrics
    with tf.name_scope("enc_output"):
        # top k prediction
        bow_pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)

        # loss function
        enc_targets = _enc_target_list_to_khot(
            enc_targets, vocab_size, self.pad_id)
        enc_loss = enc_loss_fn(
            self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
        self.train_output = {"enc_loss": enc_loss}

        # performance monitor
        bow_metrics_dict = bow_train_monitor(
            bow_topk_prob, pred_ind, vocab_size, batch_size, enc_targets)
        self.train_output.update(bow_metrics_dict)

    # Encoder soft sampling
    with tf.name_scope("gumbel_topk_sampling"):
        sample_ind, sample_prob, sample_memory = bow_gumbel_topk_sampling(
            gumbel_topk_prob, embedding_matrix_vals, self.sample_size,
            vocab_size)
        sample_memory_lens = tf.ones(batch_size, tf.int32) * self.sample_size
        sample_memory_avg = tf.reduce_mean(sample_memory, 1)  # [B, S]

        sample_memory_output = {
            "bow_pred_ind": pred_ind,
            "bow_pred_prob": bow_pred_prob,
            "sample_memory_ind": sample_ind,
            "sample_memory_prob": sample_prob}

    # Decoder
    # The initial state of the decoder =
    #   encoder meaning vector z + encoder bow vector b
    with tf.variable_scope("decoder"):
        dec_cell = [
            create_cell(
                "dec-%d" % i, state_size, self.drop_out, self.no_residual)
            for i in range(enc_layers)]
        dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)
        dec_proj = tf.layers.Dense(
            vocab_size, name="dec_proj",
            kernel_initializer=tf.random_normal_initializer(stddev=0.05),
            bias_initializer=tf.constant_initializer(0.))
        dec_ptr_k_proj = [
            tf.layers.Dense(
                state_size, name="dec_ptr_k_proj_%d" % pi,
                kernel_initializer=tf.random_normal_initializer(stddev=0.05),
                bias_initializer=tf.constant_initializer(0.))
            for pi in range(self.num_pointers)]
        dec_ptr_g_proj = tf.layers.Dense(
            1, name="dec_ptr_g_proj",
            kernel_initializer=tf.random_normal_initializer(stddev=0.05),
            bias_initializer=tf.constant_initializer(0.),
            activation=tf.nn.sigmoid)
        bow_cond_gate_proj = tf.layers.Dense(
            1, name="bow_cond_gate_proj",
            kernel_initializer=tf.random_normal_initializer(stddev=0.05),
            bias_initializer=tf.constant_initializer(0.),
            activation=tf.nn.sigmoid)

        # Seed every decoder layer from the final state of the *matching*
        # encoder layer, adding the averaged BOW sample memory to h.
        # (The earlier loop read enc_state[0] for all layers, so layers
        # above the first never saw their own encoder state.)
        dec_init_state = tuple(
            LSTMStateTuple(
                c=enc_state[layer].c,
                h=enc_state[layer].h + sample_memory_avg)
            for layer in range(enc_layers))

        if self.source_attn:
            # [B, M + T, S]
            dec_memory = [sample_memory, enc_outputs]
            dec_mem_len = [sample_memory_lens, enc_lens]
            dec_max_mem_len = [self.sample_size, max_enc_len]
        else:
            dec_memory = sample_memory
            dec_mem_len = sample_memory_lens
            dec_max_mem_len = tf.shape(dec_memory)[1]

        bow_cond = sample_memory_avg if self.bow_cond else None

        # The gate projection is only handed to the decoder when enabled.
        if self.bow_cond_gate == False:
            bow_cond_gate_proj = None

        (dec_outputs_predict, dec_logits_train, dec_prob_train, pointer_ent,
         avg_max_ptr, avg_num_copy) = decode(
            self.dec_start_id, dec_inputs,
            dec_cell, dec_proj, embedding_matrix_vals,
            dec_init_state, dec_memory, dec_mem_len, dec_max_mem_len,
            batch_size, max_dec_len, self.sampling_method,
            self.topk_sampling_size, state_size, multi_source=True,
            copy=self.copy, copy_ind=sample_ind,
            dec_ptr_g_proj=dec_ptr_g_proj, dec_ptr_k_proj=dec_ptr_k_proj,
            bow_cond=bow_cond, bow_cond_gate_proj=bow_cond_gate_proj)

    # model saver, before the optimizer (so optimizer variables created by
    # minimize() below are not part of the saved list)
    all_variables = slim.get_variables_to_restore()
    model_variables = [
        var for var in all_variables
        if var.name.split("/")[0] == self.model_name]
    print("%s model, variable list:" % self.model_name)
    for v in model_variables:
        print(" %s" % v.name)
    self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)

    with tf.variable_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

    # decoder output, training and inference, combined with encoder loss
    with tf.name_scope("dec_output"):
        dec_mask = tf.sequence_mask(dec_lens, max_dec_len, dtype=tf.float32)
        if self.copy == False:
            dec_loss = tf.contrib.seq2seq.sequence_loss(
                dec_logits_train, dec_targets, dec_mask)
        else:
            dec_loss = _copy_loss(dec_prob_train, dec_targets, dec_mask)

        loss = dec_loss + lambda_enc_loss * enc_loss
        train_op = optimizer.minimize(loss)

        dec_output = {"train_op": train_op, "dec_loss": dec_loss, "loss": loss}
        self.train_output.update(dec_output)

        if self.copy:
            # Pointer statistics, averaged over non-padding positions.
            pointer_ent = (
                tf.reduce_sum(pointer_ent * dec_mask) / tf.reduce_sum(dec_mask))
            self.train_output['pointer_ent'] = pointer_ent

            avg_max_ptr = (
                tf.reduce_sum(avg_max_ptr * dec_mask) / tf.reduce_sum(dec_mask))
            self.train_output['avg_max_ptr'] = avg_max_ptr

            # Masked sum over axis 1, then mean over the remaining axis.
            avg_num_copy = tf.reduce_sum(avg_num_copy * dec_mask, 1)
            avg_num_copy = tf.reduce_mean(avg_num_copy)
            self.train_output['avg_num_copy'] = avg_num_copy

        self.infer_output = {"dec_predict": dec_outputs_predict}
        dec_out_mem_ratio = _calculate_dec_out_mem_ratio(
            dec_outputs_predict, sample_ind, vocab_size, self.pad_id,
            self.dec_start_id, self.dec_end_id)
        self.infer_output.update(dec_out_mem_ratio)
        self.infer_output.update(sample_memory_output)
        self.infer_output.update(seq_neighbor_output)
    return
def call(self, inputs, state):
    """Run one step of the chunked LSTM cell with master gates.

    A single linear map of ``[inputs, h]`` yields
    ``4 * self._num_units + 2 * self.n_chunk`` columns.  The first
    ``2 * self.n_chunk`` columns feed the master forget / input gates (one
    value per chunk, passed through ``self.cumsum``); the rest is reshaped to
    ``[-1, 4 * n_chunk, chunk_size]`` and split into the usual forget, input
    and output gates plus the candidate cell value.

    Args:
        inputs: input tensor for this step.
        state: previous state - a ``(c, h)`` pair when
            ``self._state_is_tuple`` is true, otherwise one tensor that is
            split into two halves along axis 1.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` is
        ``LSTMStateTuple(new_c, new_h)`` or, when the state is not a tuple,
        ``new_c`` and ``new_h`` concatenated along axis 1.
    """
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)

    # Parameters of gates are concatenated into one multiply for efficiency.
    concat = _linear(
        [inputs, h], 4 * self._num_units + 2 * self.n_chunk, True)

    # Master forget gate: one value per chunk, with a trailing axis added so
    # it broadcasts over the chunk_size dimension below.
    f_master_t = concat[:, :self.n_chunk]
    f_master_t = self.cumsum(f_master_t)
    f_master_t = tf.expand_dims(f_master_t, 2)

    # Master input gate: same layout, cumulated with the 'left' option.
    i_master_t = concat[:, self.n_chunk:2 * self.n_chunk]
    i_master_t = self.cumsum(i_master_t, 'left')
    i_master_t = tf.expand_dims(i_master_t, 2)

    # Remaining columns: f, i, o gates and the candidate, n_chunk rows each.
    concat = concat[:, 2 * self.n_chunk:]
    concat = tf.reshape(concat, [-1, self.n_chunk * 4, self.chunk_size])

    # NOTE(review): no forget bias is added before the sigmoid here -
    # confirm that is intended.
    f_t = tf.nn.sigmoid(concat[:, :self.n_chunk])
    i_t = tf.nn.sigmoid(concat[:, self.n_chunk:2 * self.n_chunk])
    o_t = tf.nn.sigmoid(concat[:, 2 * self.n_chunk:3 * self.n_chunk])
    c_t_hat = tf.tanh(concat[:, 3 * self.n_chunk:])

    # Previous cell state viewed per chunk (reshaped once, used twice).
    c_chunks = tf.reshape(c, [-1, self.n_chunk, self.chunk_size])

    # Where both master gates are open (w_t) the regular LSTM update is used;
    # elsewhere the candidate is written or the old state kept, weighted by
    # the part of each master gate not covered by the overlap.
    w_t = f_master_t * i_master_t
    new_c = (w_t * (f_t * c_chunks + i_t * c_t_hat)
             + (i_master_t - w_t) * c_t_hat
             + (f_master_t - w_t) * c_chunks)
    new_h = tf.tanh(new_c) * o_t

    # Flatten the chunk axes back to [-1, num_units].
    new_c = tf.reshape(new_c, [-1, self._num_units])
    new_h = tf.reshape(new_h, [-1, self._num_units])

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def build(self):
    """Build the bow - sequence to sequence graph.

    The graph has two separately optimised halves:

    * an encoder that predicts a bag of words from the input sentence, using
      the method selected by ``self.bow_pred_method`` (``"mix_softmax"``,
      ``"seq_tag"`` or ``"seq2seq"``);
    * a decoder that generates a sentence from a given bag of words
      (``dec_bow``), whose masked mean embedding seeds every decoder layer
      and whose embeddings serve as decoder memory.

    Attributes set on ``self``: the placeholders (``enc_inputs``,
    ``enc_lens``, ``dec_bow``, ``dec_bow_len``, ``drop_out``, ``max_len`` and,
    in train mode, ``enc_targets``, ``enc_seq2seq_inputs``,
    ``enc_seq2seq_targets``, ``enc_seq2seq_lens``, ``dec_inputs``,
    ``dec_targets``, ``dec_lens``), ``enc_infer_output``,
    ``dec_infer_output``, ``model_saver`` and, in train mode,
    ``enc_train_output`` and ``dec_train_output``.
    """
    print("Building the bow - sequence to sequence model ... ")

    vocab_size = self.vocab_size
    state_size = self.state_size
    enc_layers = self.enc_layers
    max_enc_bow = self.max_enc_bow
    num_paraphrase = self.num_paraphrase

    # Placeholders
    with tf.name_scope("placeholders"):
        enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
        enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
        self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
        self.max_len = tf.placeholder(tf.int32, (), "max_len")
        dec_bow = tf.placeholder(tf.int32, [None, None], "dec_bow")
        dec_bow_len = tf.placeholder(tf.int32, [None], "dec_bow_len")

        self.enc_inputs = enc_inputs
        self.enc_lens = enc_lens
        self.dec_bow = dec_bow
        self.dec_bow_len = dec_bow_len

        # NOTE(review): the names below are bound only in train mode, yet
        # dec_inputs is passed to decode() unconditionally further down and
        # the "seq2seq" branches read enc_seq2seq_* / enc_targets - building
        # in any other mode looks like it raises NameError; confirm whether
        # non-train builds are supported.
        if self.mode == "train":
            enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
            enc_seq2seq_inputs = tf.placeholder(
                tf.int32, [None, num_paraphrase, None], "enc_seq2seq_inputs")
            enc_seq2seq_targets = tf.placeholder(
                tf.int32, [None, num_paraphrase, None], "enc_seq2seq_targets")
            enc_seq2seq_lens = tf.placeholder(
                tf.int32, [None, num_paraphrase], "enc_seq2seq_lens")
            dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
            dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
            dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

            self.enc_targets = enc_targets
            self.enc_seq2seq_inputs = enc_seq2seq_inputs
            self.enc_seq2seq_targets = enc_seq2seq_targets
            self.enc_seq2seq_lens = enc_seq2seq_lens
            self.dec_inputs = dec_inputs
            self.dec_targets = dec_targets
            self.dec_lens = dec_lens

        enc_batch_size = tf.shape(enc_inputs)[0]
        max_len = self.max_len

        dec_batch_size = tf.shape(dec_bow)[0]
        max_dec_bow = tf.shape(dec_bow)[1]

    # Embedding
    with tf.variable_scope("embeddings"):
        embedding_matrix = tf.get_variable(
            name="embedding_matrix",
            shape=[vocab_size, state_size],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.05))

        enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)
        if self.mode == "train":
            dec_inputs = tf.nn.embedding_lookup(embedding_matrix, dec_inputs)
        dec_bow = tf.nn.embedding_lookup(embedding_matrix, dec_bow)

    # Encoder
    with tf.variable_scope("encoder"):
        # TODO: residual LSTM, layer normalization
        enc_cell = [
            create_cell("enc-%d" % i, state_size, self.drop_out)
            for i in range(enc_layers)]
        enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
        enc_outputs, enc_state = tf.nn.dynamic_rnn(
            enc_cell, enc_inputs, sequence_length=enc_lens, dtype=tf.float32)

    # Encoder bow prediction
    with tf.variable_scope("bow_output"):
        if self.bow_pred_method == "mix_softmax":
            bow_topk_prob = bow_predict_mix_softmax(
                enc_batch_size, vocab_size, max_enc_bow, enc_state)
        elif self.bow_pred_method == "seq_tag":
            bow_topk_prob, _, _, _ = bow_predict_seq_tag(
                vocab_size, enc_batch_size, enc_outputs, enc_lens, max_len)
        elif self.bow_pred_method == "seq2seq":
            bow_topk_prob, enc_seq2seq_loss, enc_infer_pred = \
                bow_predict_seq2seq(
                    enc_seq2seq_inputs, enc_seq2seq_targets, enc_seq2seq_lens,
                    embedding_matrix, enc_outputs, enc_state, enc_layers,
                    num_paraphrase, max_len, enc_lens, enc_batch_size,
                    vocab_size, state_size, self.drop_out, self.dec_start_id)

    with tf.variable_scope("enc_optimizer"):
        enc_optimizer = tf.train.AdamOptimizer(self.learning_rate_enc)

    with tf.name_scope("enc_output"):
        # top k prediction, renormalised so the kept probabilities sum to 1
        pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)
        pred_prob /= tf.expand_dims(tf.reduce_sum(pred_prob, axis=1), [1])

        # Same, but with the (possibly different) size used by the decoder.
        pred_prob_dec, pred_ind_dec = tf.nn.top_k(
            bow_topk_prob, self.sample_size)
        pred_prob_dec /= tf.expand_dims(
            tf.reduce_sum(pred_prob_dec, axis=1), [1])

        if self.mode == "train":
            with tf.name_scope("enc_loss"):
                # loss function
                enc_targets = _enc_target_list_to_khot(
                    enc_targets, vocab_size, self.pad_id)
                enc_bow_loss = enc_loss_fn(
                    self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
                if self.bow_pred_method == "seq2seq":
                    # pure sequence to sequence for now: the bow loss is
                    # kept in the expression but weighted by zero
                    enc_loss = enc_seq2seq_loss + 0.0 * enc_bow_loss
                else:
                    enc_loss = enc_bow_loss
                enc_train_op = enc_optimizer.minimize(enc_loss)

            # prediction performance monitor during training
            # TODO: write this in a function; top 10 recall
            with tf.name_scope("train_output"):
                # encoder training output
                self.enc_train_output = {
                    "enc_train_op": enc_train_op,
                    "enc_bow_loss": enc_bow_loss,
                    "enc_loss": enc_loss}
                bow_metrics_dict = bow_train_monitor(
                    bow_topk_prob, pred_ind, vocab_size, enc_batch_size,
                    enc_targets)
                self.enc_train_output.update(bow_metrics_dict)
                if self.bow_pred_method == "seq2seq":
                    self.enc_train_output["enc_seq2seq_loss"] = enc_seq2seq_loss

        # encoder inference output
        with tf.name_scope("infer_output"):
            if self.bow_pred_method == "seq2seq":
                (infer_overlap, infer_pred_support, infer_target_support,
                 infer_prec, infer_recl) = bow_seq2seq_metrics(
                    enc_targets, enc_infer_pred, vocab_size, self.pad_id)
                self.enc_infer_output = {
                    "enc_infer_overlap": infer_overlap,
                    "enc_infer_pred_support": infer_pred_support,
                    "enc_infer_target_support": infer_target_support,
                    "enc_infer_precision": infer_prec,
                    "enc_infer_recall": infer_recl,
                    "enc_infer_pred": enc_infer_pred}
            else:
                self.enc_infer_output = {
                    "pred_prob": pred_prob,
                    "pred_ind": pred_ind,
                    "pred_prob_dec": pred_prob_dec,
                    "pred_ind_dec": pred_ind_dec}

    # Decoder bow encoding
    # TODO: sampling from encoder topk prediction
    with tf.variable_scope("dec_bow_encoding"):
        dec_bow_mask = tf.expand_dims(
            tf.sequence_mask(dec_bow_len, max_dec_bow, dtype=tf.float32), [2])

        # TODO: transformer based encoding, but our primary goal is to test
        # the effectiveness of sampling, so we skip it for now
        # NOTE(review): reduce_mean divides by the padded length, not by
        # dec_bow_len - confirm that is intended.
        dec_bow_enc = tf.reduce_mean(dec_bow_mask * dec_bow, axis=1)  # [B, S]

    with tf.variable_scope("decoder"):
        dec_cell = [
            create_cell("dec-%d" % i, state_size, self.drop_out)
            for i in range(enc_layers)]
        dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)

        # One (c, h) pair per stacked layer, all seeded with the bow
        # encoding.  This used to be a hard-coded pair of two states, which
        # only matched the cell stack when enc_layers == 2.
        dec_init_state = tuple(
            LSTMStateTuple(dec_bow_enc, dec_bow_enc)
            for _ in range(enc_layers))

        dec_proj = tf.layers.Dense(
            vocab_size, name="dec_proj",
            kernel_initializer=tf.random_normal_initializer(stddev=0.05),
            bias_initializer=tf.constant_initializer(0.))

        # The embedded bag of words is the decoder's memory.
        dec_memory = dec_bow
        dec_mem_len = dec_bow_len
        dec_max_mem_len = max_dec_bow

        dec_outputs_predict, dec_logits_train = decode(
            self.dec_start_id, dec_inputs,
            dec_cell, dec_proj, embedding_matrix,
            dec_init_state, dec_memory, dec_mem_len, dec_max_mem_len,
            dec_batch_size, max_len, self.sampling_method,
            self.topk_sampling_size, state_size, multi_source=False)

    # Collect the variables to save before the decoder optimizer adds its own.
    all_variables = slim.get_variables_to_restore()
    model_variables = [
        var for var in all_variables
        if var.name.split("/")[0] == self.model_name]
    print("%s model, variable list:" % self.model_name)
    for v in model_variables:
        print(" %s" % v.name)
    self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)

    with tf.variable_scope("dec_optimizer"):
        dec_optimizer = tf.train.AdamOptimizer(self.learning_rate_dec)

    with tf.name_scope("dec_output"):
        if self.mode == "train":
            dec_mask = tf.sequence_mask(dec_lens, max_len, dtype=tf.float32)
            dec_loss = tf.contrib.seq2seq.sequence_loss(
                dec_logits_train, dec_targets, dec_mask)
            dec_train_op = dec_optimizer.minimize(dec_loss)

            self.dec_train_output = {
                "dec_train_op": dec_train_op,
                "dec_loss": dec_loss}

        self.dec_infer_output = {"dec_predict": dec_outputs_predict}
    return
tf.nn.rnn_cell.LSTMCell( num_units=no_units, initializer=tf.keras.initializers.glorot_normal(), state_is_tuple=True) ])) LSTM_outputs, LSTM_fw_state, LSTM_bw_state = tf.contrib.rnn.stack_bidirectional_dynamic_rnn( cells_fw=LSTM_fw, cells_bw=LSTM_bw, inputs=train, dtype=tf.float32) # LSTM_outputs = tf.concat((LSTM_fw_output, LSTM_bw_output), 2) LSTM_state_c = tf.concat((LSTM_fw_state[-1][0].c, LSTM_bw_state[-1][0].c), 1) LSTM_state_h = tf.concat((LSTM_fw_state[-1][0].h, LSTM_bw_state[-1][0].h), 1) LSTM_final_state = LSTMStateTuple(c=LSTM_state_c, h=LSTM_state_h) output = tf.layers.Dense(2)(LSTM_state_h) #Defining Loss function losss = tf.losses.softmax_cross_entropy(target, output) trainop = tf.train.AdamOptimizer(learning_rate=0.001).minimize(losss) # losx=[] saver = tf.train.Saver() # # with tf.device('/gp): # # with tf.device('/gpu:0'): # with tf.Session() as sess: # sess.run(tf.global_variables_initializer())