def __init__(self, config, infer=False):
    self.config = config
    if infer:
        config.batch_size = 1
        config.decoder.max_ast_depth = 1

    # setup the encoder
    self.encoder = NonBayesianEncoder(config)

    # setup the decoder with the encoding as the initial state
    self.decoder = NonBayesianDecoder(config, initial_state=self.encoder.encoding,
                                      infer=infer)

    # get the decoder outputs
    output = tf.reshape(tf.concat(self.decoder.outputs, 1),
                        [-1, self.decoder.cell1.output_size])
    logits = tf.matmul(output, self.decoder.projection_w) + self.decoder.projection_b
    self.probs = tf.nn.softmax(logits)

    # 1. generation loss: log P(X | \Psi)
    self.targets = tf.placeholder(tf.int32,
                                  [config.batch_size, config.decoder.max_ast_depth])
    self.gen_loss = seq2seq.sequence_loss(
        [logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([config.batch_size * config.decoder.max_ast_depth])])

    # The optimizer
    self.loss = self.gen_loss
    self.train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(self.loss)

    var_params = [np.prod([dim.value for dim in var.get_shape()])
                  for var in tf.trainable_variables()]
    if not infer:
        print('Model parameters: {}'.format(np.sum(var_params)))
def __init__(self, config, infer=False):
    assert config.model == 'lle', \
        'Trying to load different model implementation: ' + config.model
    self.config = config
    if infer:
        config.batch_size = 1
        config.decoder.max_seq_length = 1

    # setup the encoder
    self.encoder = BayesianEncoder(config)
    samples = tf.random_normal([config.batch_size, config.latent_size],
                               mean=0., stddev=1., dtype=tf.float32)
    self.psi = self.encoder.psi_mean + tf.sqrt(self.encoder.psi_covariance) * samples

    # setup the decoder with psi as the initial state
    lift_w = tf.get_variable('lift_w', [config.latent_size, config.decoder.units])
    lift_b = tf.get_variable('lift_b', [config.decoder.units])
    self.initial_state = tf.nn.xw_plus_b(self.psi, lift_w, lift_b)
    self.decoder = BayesianDecoder(config, initial_state=self.initial_state,
                                   infer=infer)

    # get the decoder outputs
    output = tf.reshape(tf.concat(self.decoder.outputs, 1),
                        [-1, self.decoder.cell1.output_size])
    logits = tf.matmul(output, self.decoder.projection_w) + self.decoder.projection_b
    self.probs = tf.nn.softmax(logits)

    # 1. generation loss: log P(X | \Psi)
    self.targets = tf.placeholder(tf.int32,
                                  [config.batch_size, config.decoder.max_seq_length])
    self.gen_loss = seq2seq.sequence_loss(
        [logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([config.batch_size * config.decoder.max_seq_length])])

    # 2. latent loss: KL-divergence between P(\Psi | f(\Theta)) and P(\Psi)
    latent_loss = 0.5 * tf.reduce_sum(
        -tf.log(self.encoder.psi_covariance) - 1 + self.encoder.psi_covariance
        + tf.square(self.encoder.psi_mean), axis=1)
    self.latent_loss = config.alpha * latent_loss

    # 3. evidence loss: log P(f(\theta) | \Psi; \sigma)
    evidence_loss = [ev.evidence_loss(self.psi, encoding, config)
                     for ev, encoding in zip(config.evidence, self.encoder.encodings)]
    evidence_loss = [tf.reduce_sum(loss, axis=1) for loss in evidence_loss]
    self.evidence_loss = config.beta * tf.reduce_sum(tf.stack(evidence_loss), axis=0)

    # The optimizer
    self.loss = self.gen_loss + self.latent_loss + self.evidence_loss
    self.train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(self.loss)

    var_params = [np.prod([dim.value for dim in var.get_shape()])
                  for var in tf.trainable_variables()]
    if not infer:
        print('Model parameters: {}'.format(np.sum(var_params)))
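# --- A minimal, self-contained sketch (not part of the model above) of the
# closed-form KL term computed as `latent_loss`: for a diagonal Gaussian
# q = N(mu, diag(sigma^2)) against the standard-normal prior p = N(0, I),
#   KL(q || p) = 0.5 * sum_i (sigma_i^2 + mu_i^2 - 1 - log sigma_i^2).
# The names psi_mean / psi_covariance mirror the encoder attributes above;
# the numpy mirror itself is purely illustrative.
import numpy as np

def kl_diag_gaussian_vs_standard_normal(psi_mean, psi_covariance):
    # psi_mean: [batch, latent]; psi_covariance: per-dimension variances
    return 0.5 * np.sum(psi_covariance + np.square(psi_mean)
                        - 1.0 - np.log(psi_covariance), axis=1)

# Sanity check: KL is zero when q equals the prior.
# kl_diag_gaussian_vs_standard_normal(np.zeros((1, 4)), np.ones((1, 4))) -> [0.]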
def generate_sequence_output(num_encoder_symbols,
                             encoder_outputs,
                             encoder_state,
                             targets,
                             sequence_length,
                             num_decoder_symbols,
                             weights,
                             buckets,
                             softmax_loss_function=None,
                             per_example_loss=False,
                             name=None,
                             use_attention=False):
    if len(targets) < buckets[-1][1]:
        raise ValueError("Length of targets (%d) must be at least that of last "
                         "bucket (%d)." % (len(targets), buckets[-1][1]))

    all_inputs = encoder_outputs + targets + weights
    with tf.name_scope(name, "model_with_buckets", all_inputs):
        with tf.variable_scope("decoder_sequence_output", reuse=None):
            logits, attention_weights = attention_RNN(encoder_outputs,
                                                      encoder_state,
                                                      num_decoder_symbols,
                                                      sequence_length,
                                                      use_attention=use_attention)
            if per_example_loss:
                assert len(logits) == len(targets)
                # We need to make targets an int64 tensor and set its shape.
                bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets]
                crossent = sequence_loss_by_example(
                    logits, bucket_target, weights,
                    softmax_loss_function=softmax_loss_function)
            else:
                assert len(logits) == len(targets)
                bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets]
                crossent = sequence_loss(
                    logits, bucket_target, weights,
                    softmax_loss_function=softmax_loss_function)

    return logits, crossent
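# A hedged numpy mirror of the two reductions selected by `per_example_loss`
# above: sequence_loss_by_example returns one weighted cross-entropy value per
# batch element (averaged across timesteps), while sequence_loss additionally
# averages across the batch. All names and shapes below are illustrative.
import numpy as np

def _step_xent(logits_row, target_id):
    # softmax cross-entropy for one batch element at one timestep
    z = logits_row - logits_row.max()
    return -(z[target_id] - np.log(np.exp(z).sum()))

def loss_by_example(step_logits, step_targets, step_weights):
    # step_logits: list of [batch, vocab] arrays -> [batch], like
    # sequence_loss_by_example with average_across_timesteps=True
    batch = step_logits[0].shape[0]
    num, den = np.zeros(batch), np.zeros(batch)
    for logits, targets, w in zip(step_logits, step_targets, step_weights):
        num += w * np.array([_step_xent(logits[b], targets[b]) for b in range(batch)])
        den += w
    return num / np.maximum(den, 1e-12)

def loss_scalar(step_logits, step_targets, step_weights):
    # scalar, like sequence_loss (also averaged across the batch)
    return loss_by_example(step_logits, step_targets, step_weights).mean()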
def __init__(self, config, infer=False):
    self.config = config
    if infer:
        config.batch_size = 1
        config.decoder.max_tokens = 1

    # setup the encoder
    self.encoder = BayesianEncoder(config)
    samples = tf.random_normal([config.batch_size, config.latent_size],
                               mean=0., stddev=1., dtype=tf.float32)
    self.psi = self.encoder.psi_mean + tf.sqrt(self.encoder.psi_covariance) * samples

    # setup the decoder with psi as the initial state
    lift_w = tf.get_variable('lift_w', [config.latent_size, config.decoder.units])
    lift_b = tf.get_variable('lift_b', [config.decoder.units])
    self.initial_state = tf.nn.xw_plus_b(self.psi, lift_w, lift_b)
    self.decoder = BayesianDecoder(config, initial_state=self.initial_state,
                                   infer=infer)

    # get the decoder outputs
    output = tf.reshape(tf.concat(self.decoder.outputs, 1),
                        [-1, self.decoder.cell.output_size])
    logits = tf.matmul(output, self.decoder.projection_w) + self.decoder.projection_b
    self.probs = tf.nn.softmax(logits)

    # 1. generation loss: log P(X | \Psi)
    self.targets = tf.placeholder(tf.int32,
                                  [config.batch_size, config.decoder.max_tokens])
    self.gen_loss = seq2seq.sequence_loss(
        [logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([config.batch_size * config.decoder.max_tokens])])

    # 2. latent loss: KL-divergence between P(\Psi | f(\Theta)) and P(\Psi)
    latent_loss = 0.5 * tf.reduce_sum(
        -tf.log(self.encoder.psi_covariance) - 1 + self.encoder.psi_covariance
        + tf.square(self.encoder.psi_mean), axis=1)
    self.latent_loss = config.alpha * latent_loss

    # 3. evidence loss: log P(f(\theta) | \Psi; \sigma)
    evidence_loss = [ev.evidence_loss(self.psi, encoding, config)
                     for ev, encoding in zip(config.evidence, self.encoder.encodings)]
    evidence_loss = [tf.reduce_sum(loss, axis=1) for loss in evidence_loss]
    self.evidence_loss = config.beta * tf.reduce_sum(tf.stack(evidence_loss), axis=0)

    # The optimizer
    self.loss = self.gen_loss + self.latent_loss + self.evidence_loss
    self.train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(self.loss)

    var_params = [np.prod([dim.value for dim in var.get_shape()])
                  for var in tf.trainable_variables()]
    if not infer:
        print('Model parameters: {}'.format(np.sum(var_params)))
name="GO")] + encode_input[:-1]) previous_memory = tf.zeros(shape=(batch_size, memory_dim)) cell = core_rnn_cell.GRUCell(num_units=memory_dim) decode_outputs, decode_memory = legacy_seq2seq.embedding_rnn_seq2seq( encoder_inputs=encode_input, decoder_inputs=decode_input, cell=cell, num_encoder_symbols=vocab_size, num_decoder_symbols=vocab_size, embedding_size=embedding_dim) loss = legacy_seq2seq.sequence_loss( logits=decode_outputs, targets=labels, weights=weights) tf.summary.scalar("loss", loss) manitude = tf.sqrt(tf.reduce_sum(tf.square(decode_memory[1]))) tf.summary.scalar("manitude at t=1", manitude) summary_op = tf.summary.merge_all() learning_rate = 0.05 momentum = 0.9 optimizer = tf.train.MomentumOptimizer(learning_rate, momentum) train_op = optimizer.minimize(loss) summary_writer = tf.summary.FileWriter("./sources/seq2seq/log/", sess.graph)
def build_model(self, forward_only): print("[*] Building a PTRModel math model") with tf.variable_scope(self.scope): self.a = weight('a', [1, 1]) # self.c = weight('c', [1, 1]) # self.d = weight('d', [1, 1]) self.b = weight('b', [1, 1], init='constant') self.beta = 1 + tf.nn.softplus(weight('beta', [1, 1])) prev_state = self.controller.init_state() tf.get_variable_scope().reuse_variables() for seq_length in range(1, self.max_length * self.max_length + 1): true_output = tf.placeholder(tf.float32, [self.output_dim], name='true_output_%s' % seq_length) self.true_outputs.append(true_output) for seq_length in range(1, self.max_length + 1): input_1 = tf.placeholder(tf.float32, [self.input_dim], name='input_1_%s' % seq_length) input_2 = tf.placeholder(tf.float32, [self.input_dim], name='input_2_%s' % seq_length) self.inputs_1.append(input_1) self.inputs_2.append(input_2) # present inputs prev_state = self.controller.update_memory( prev_state, [ tf.reshape(input_1, [1, -1]), tf.reshape(input_2, [1, -1]), tf.zeros((1, self.W)) ]) self.collect_states[seq_length] = self.collect_states[ seq_length - 1][0:(seq_length - 1)] + [self.copy_state(prev_state)] self.debug[seq_length] = [] state = prev_state self.prev_states[seq_length] = state candidate_outputs = [] for j in range(0, self.MAX_STEP): state, _ = self.controller(state, j) new_state = self.copy_state(state) self.collect_states[seq_length].append(new_state) candidate_outputs.append( tf.unstack(state['M'][-1][0:(seq_length * seq_length)])) self.debug[seq_length].append( (new_state['ptr'], new_state['dptr'])) self.outputs[seq_length] = candidate_outputs if not forward_only: for seq_length in range(self.min_length, self.max_length + 1): print(" [*] Building a loss model for seq_length %s" % seq_length) print(len(self.outputs[seq_length]), len(self.true_outputs[0:seq_length * seq_length]), len([1] * (seq_length * seq_length))) # print(self.outputs[seq_length][0].shape,self.true_outputs[0:2*seq_length][0].shape,len([1] * (2*seq_length))) all_losses = [] for index in range(self.MAX_STEP): loss = sequence_loss( logits=self.outputs[seq_length][index], targets=self.true_outputs[0:seq_length * seq_length], weights=[1] * (seq_length * seq_length), average_across_timesteps=False, average_across_batch=False, softmax_loss_function=l2_loss) all_losses.append(loss) all_losses = tf.stack(all_losses) cn = tf.pow(tf.to_float(seq_length), self.a) + self.b max_pos = tf.clip_by_value(cn, 0, self.MAX_STEP - 1) stop_pos = D(self.MAX_STEP, max_pos, 1, self.beta) loss1 = tf.reduce_sum( tf.expand_dims(all_losses, 0) * stop_pos) + 0.0001 * tf.reduce_sum(cn) self.losses[seq_length] = loss1 if not self.params: self.params = tf.trainable_variables() grads = [] for grad in tf.gradients( loss1, self.params ): # + self.weight_decay*tf.add_n(tf.get_collection('l2')) if grad is not None: grads.append( tf.clip_by_value(grad, self.min_grad, self.max_grad)) else: grads.append(grad) self.grads[seq_length] = grads with tf.variable_scope("opt", reuse=None): if not forward_only: for seq_length in range(self.min_length, self.max_length + 1): self.optims[seq_length] = self.opt.apply_gradients( zip(self.grads[seq_length], self.params), global_step=self.global_step) self.saver = tf.train.Saver() print(" [*] Build a PTRModel math model finished")
def chat(input_text):
    word_cnt, train_dict, train_reverse_dict = load_dict(DICT_FILE)

    LINE_BREAK = u'<Break>'
    WORD_DELIMITER = u'/'
    UNK_WORD = u'<UNK>'
    PADDING_WORD = u'<PAD>'
    START_WORD = u'<GO>'
    END_WORD = u'<EOS>'
    START_ID = train_dict[START_WORD]
    END_ID = train_dict[END_WORD]
    PAD_ID = train_dict[PADDING_WORD]
    UNK_ID = train_dict[UNK_WORD]

    # Attention
    tf.reset_default_graph()
    RNN_CELL_TYPE = 'LSTMCell_Attention'
    learning_rate = 1.0
    encoder_length = 15
    decoder_length = 20
    embed_dim = 128
    cell = tf.contrib.rnn.LSTMCell(embed_dim)
    num_encoder_symbols = VOCAB_SIZE
    num_decoder_symbols = VOCAB_SIZE
    embedding_size = embed_dim

    encoder_len_placeholder = tf.placeholder(tf.int32)
    encoder_placeholders = [
        tf.placeholder(tf.int32, shape=[None], name="encoder_%d" % i)
        for i in range(encoder_length)
    ]
    decoder_placeholders = [
        tf.placeholder(tf.int32, shape=[None], name="decoder_%d" % i)
        for i in range(decoder_length)
    ]
    target_placeholders = [
        tf.placeholder(tf.int32, shape=[None], name="target_%d" % i)
        for i in range(decoder_length)
    ]
    target_weights_placeholders = [
        tf.placeholder(tf.float32, shape=[None], name="decoder_weight_%d" % i)
        for i in range(decoder_length)
    ]

    outputs, states = embedding_attention_seq2seq(encoder_placeholders,
                                                  decoder_placeholders,
                                                  cell,
                                                  num_encoder_symbols,
                                                  num_decoder_symbols,
                                                  embedding_size,
                                                  output_projection=None,
                                                  feed_previous=False)

    loss = sequence_loss(outputs, target_placeholders, target_weights_placeholders)
    # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    # train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    train_step = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

    saver = tf.train.Saver()
    sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    saved_model = MODEL_FILE
    # print('Loading model from:', saved_model)
    # t0 = time.time()
    saver.restore(sess, saved_model)
    # t1 = time.time()
    # print(t1 - t0)

    # input_text = u'你要去哪?'
    output_text = generate_response(sess, input_text, train_dict,
                                    train_reverse_dict, encoder_length,
                                    decoder_length, PAD_ID, UNK_ID, START_ID,
                                    END_ID, cell, embed_dim, VOCAB_SIZE,
                                    encoder_placeholders, decoder_placeholders,
                                    target_weights_placeholders)
    # print(output_text.encode("utf-8"))
    return output_text
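# Illustrative call, assuming MODEL_FILE and DICT_FILE point at a trained
# checkpoint and its dictionary (the sample question is the commented-out
# test input above):
if __name__ == '__main__':
    print(chat(u'你要去哪?'))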
# Decoder input: prepend a "GO" token and drop the final token of the
# encoder input (head of this statement reconstructed from the identical
# pattern later in this file).
dec_inp = ([tf.zeros_like(enc_inp[0], dtype=np.int32, name="GO")] +
           enc_inp[:-1])

weights = [
    tf.placeholder(tf.float32, shape=(None,), name="weight%i" % t)
    for t in range(seq_length)
]
labels = [
    tf.placeholder(tf.int32, shape=(None,), name="labels%i" % t)
    for t in range(seq_length)
]
# weights = [tf.ones_like(labels_t, dtype=tf.float32) for labels_t in labels]

prev_mem = tf.zeros((batch_size, memory_dim))

# One LSTM cell object per layer: reusing the same cell instance three times
# triggers a variable-sharing error in TF >= 1.1.
cell = MultiRNNCell([BasicLSTMCell(memory_dim) for _ in range(3)])

dec_outputs, dec_memory = legacy_seq2seq.embedding_rnn_seq2seq(
    enc_inp, dec_inp, cell, vocab_size, vocab_size, embedding_dim)

# legacy_seq2seq.sequence_loss takes (logits, targets, weights); it has no
# vocabulary-size argument.
loss = legacy_seq2seq.sequence_loss(dec_outputs, labels, weights)

optimizer = tf.train.AdamOptimizer(starter_learning_rate).minimize(loss)

tf.summary.scalar('loss', loss)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter('./logs', sess.graph)

sess.run(tf.global_variables_initializer())

for step in range(iterations):
    loss_t, summary = train_batch(batch_size)
    if step % 100 == 0:
        print("iterations: %d, train_loss: %.5f." % (step, loss_t), end='\r')
    if step % 500 == 0:
        summary_writer.add_summary(summary, step)
        summary_writer.flush()
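# `train_batch` is not defined in this snippet; below is a plausible sketch
# for a copy/autoencoding task. The placeholder lists enc_inp / labels /
# weights, plus seq_length, vocab_size, train_op-equivalent optimizer, loss,
# summary_op and sess come from the snippet above; the random-data sampling
# itself is hypothetical.
def train_batch(batch_size):
    # random integer sequences, time-major: [seq_length, batch_size]
    X = np.random.randint(1, vocab_size, size=(seq_length, batch_size))
    feed_dict = {enc_inp[t]: X[t] for t in range(seq_length)}
    feed_dict.update({labels[t]: X[t] for t in range(seq_length)})
    feed_dict.update({weights[t]: np.ones(batch_size, dtype=np.float32)
                      for t in range(seq_length)})
    _, loss_t, summary = sess.run([optimizer, loss, summary_op], feed_dict)
    return loss_t, summary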
def __init__(self, config, iterator, infer=False, bayou_mode=True):
    assert config.model == 'lle', \
        'Trying to load different model implementation: ' + config.model
    self.config = config

    newBatch = iterator.get_next()
    nodes, edges, targets = newBatch[:3]
    ev_data = newBatch[3:]
    nodes = tf.transpose(nodes)
    edges = tf.transpose(edges)

    with tf.variable_scope("Embedding"):
        emb = tf.get_variable('emb', [config.decoder.vocab_size, config.decoder.units])

    with tf.variable_scope("Encoder"):
        self.encoder = BayesianEncoder(config, ev_data, infer)
        samples_1 = tf.random_normal([config.batch_size, config.latent_size],
                                     mean=0., stddev=1., dtype=tf.float32)
        self.psi_encoder = self.encoder.psi_mean + \
            tf.sqrt(self.encoder.psi_covariance) * samples_1

    # setup the reverse encoder.
    with tf.variable_scope("Reverse_Encoder"):
        embAPI = tf.get_variable('embAPI', [config.reverse_encoder.vocab_size,
                                            config.reverse_encoder.units])
        embRT = tf.get_variable('embRT', [config.evidence[4].vocab_size,
                                          config.reverse_encoder.units])
        embFS = tf.get_variable('embFS', [config.evidence[5].vocab_size,
                                          config.reverse_encoder.units])
        self.reverse_encoder = BayesianReverseEncoder(config, embAPI, nodes, edges,
                                                      ev_data[4], embRT,
                                                      ev_data[5], embFS)
        samples_2 = tf.random_normal([config.batch_size, config.latent_size],
                                     mean=0., stddev=1., dtype=tf.float32)
        self.psi_reverse_encoder = self.reverse_encoder.psi_mean + \
            tf.sqrt(self.reverse_encoder.psi_covariance) * samples_2

    # setup the decoder with psi as the initial state
    with tf.variable_scope("Decoder"):
        lift_w = tf.get_variable('lift_w', [config.latent_size, config.decoder.units])
        lift_b = tf.get_variable('lift_b', [config.decoder.units])
        if bayou_mode or infer:
            initial_state = tf.nn.xw_plus_b(self.psi_encoder, lift_w, lift_b,
                                            name="Initial_State")
        else:
            initial_state = tf.nn.xw_plus_b(self.psi_reverse_encoder, lift_w, lift_b,
                                            name="Initial_State")
        self.decoder = BayesianDecoder(config, emb, initial_state, nodes, edges)

    with tf.variable_scope("RE_Decoder"):
        # RE
        emb_RE = config.evidence[4].emb * 0.0
        # emb_RE = tf.get_variable('emb_RE', [config.evidence[4].vocab_size,
        #                                     config.evidence[4].units])

        lift_w_RE = tf.get_variable('lift_w_RE',
                                    [config.latent_size, config.evidence[4].units])
        lift_b_RE = tf.get_variable('lift_b_RE', [config.evidence[4].units])
        if bayou_mode or infer:
            initial_state_RE = tf.nn.xw_plus_b(self.psi_encoder, lift_w_RE, lift_b_RE,
                                               name="Initial_State_RE")
        else:
            initial_state_RE = tf.nn.xw_plus_b(self.psi_reverse_encoder,
                                               lift_w_RE, lift_b_RE,
                                               name="Initial_State_RE")

        input_RE = tf.transpose(tf.reverse_v2(tf.zeros_like(ev_data[4]), axis=[1]))
        output = SimpleDecoder(config, emb_RE, initial_state_RE, input_RE,
                               config.evidence[4])

        projection_w_RE = tf.get_variable('projection_w_RE',
                                          [config.evidence[4].units,
                                           config.evidence[4].vocab_size])
        projection_b_RE = tf.get_variable('projection_b_RE',
                                          [config.evidence[4].vocab_size])
        logits_RE = tf.nn.xw_plus_b(output.outputs[-1], projection_w_RE,
                                    projection_b_RE)

        labels_RE = tf.one_hot(tf.squeeze(ev_data[4]),
                               config.evidence[4].vocab_size, dtype=tf.int32)
        loss_RE = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_RE,
                                                             logits=logits_RE)

        cond = tf.not_equal(tf.reduce_sum(self.encoder.psi_mean, axis=1), 0)
        # cond = tf.reshape(tf.tile(tf.expand_dims(cond, axis=1),
        #                           [1, config.evidence[5].max_depth]), [-1])
        self.loss_RE = tf.reduce_mean(tf.where(cond, loss_RE, tf.zeros(cond.shape)))

    with tf.variable_scope("FS_Decoder"):
        # FS
        emb_FS = config.evidence[5].emb
        # emb_FS = tf.get_variable('emb_FS', [config.evidence[5].vocab_size,
        #                                     config.evidence[5].units])
        lift_w_FS = tf.get_variable('lift_w_FS',
                                    [config.latent_size, config.evidence[5].units])
        lift_b_FS = tf.get_variable('lift_b_FS', [config.evidence[5].units])
        if bayou_mode or infer:
            initial_state_FS = tf.nn.xw_plus_b(self.psi_encoder, lift_w_FS, lift_b_FS,
                                               name="Initial_State_FS")
        else:
            initial_state_FS = tf.nn.xw_plus_b(self.psi_reverse_encoder,
                                               lift_w_FS, lift_b_FS,
                                               name="Initial_State_FS")
        input_FS = tf.transpose(tf.reverse_v2(ev_data[5], axis=[1]))
        self.decoder_FS = SimpleDecoder(config, emb_FS, initial_state_FS, input_FS,
                                        config.evidence[5])

        output = tf.reshape(tf.concat(self.decoder_FS.outputs, 1),
                            [-1, self.decoder_FS.cell1.output_size])
        logits_FS = tf.matmul(output, self.decoder_FS.projection_w_FS) + \
            self.decoder_FS.projection_b_FS
        # logits_FS = output

        targets_FS = tf.reverse_v2(tf.concat([tf.zeros_like(ev_data[5][:, -1:]),
                                              ev_data[5][:, :-1]], axis=1),
                                   axis=[1])
        # self.gen_loss_FS = tf.contrib.seq2seq.sequence_loss(logits_FS, target_FS,
        #     tf.ones_like(target_FS, dtype=tf.float32))

        cond = tf.not_equal(tf.reduce_sum(self.encoder.psi_mean, axis=1), 0)
        cond = tf.reshape(tf.tile(tf.expand_dims(cond, axis=1),
                                  [1, config.evidence[5].max_depth]), [-1])
        cond = tf.where(cond, tf.ones(cond.shape), tf.zeros(cond.shape))

        self.gen_loss_FS = seq2seq.sequence_loss([logits_FS],
                                                 [tf.reshape(targets_FS, [-1])],
                                                 [cond])

    # get the decoder outputs
    with tf.name_scope("Loss"):
        output = tf.reshape(tf.concat(self.decoder.outputs, 1),
                            [-1, self.decoder.cell1.output_size])
        logits = tf.matmul(output, self.decoder.projection_w) + \
            self.decoder.projection_b
        ln_probs = tf.nn.log_softmax(logits)

        # 1. generation loss: log P(Y | Z)
        cond = tf.not_equal(tf.reduce_sum(self.encoder.psi_mean, axis=1), 0)
        cond = tf.reshape(tf.tile(tf.expand_dims(cond, axis=1),
                                  [1, config.decoder.max_ast_depth]), [-1])
        cond = tf.where(cond, tf.ones(cond.shape), tf.zeros(cond.shape))
        self.gen_loss = seq2seq.sequence_loss([logits],
                                              [tf.reshape(targets, [-1])],
                                              [cond])

        # 2. latent loss: negative of the KL-divergence between
        #    P(\Psi | f(\Theta)) and P(\Psi)
        KL_loss = 0.5 * tf.reduce_mean(
            tf.log(self.encoder.psi_covariance) -
            tf.log(self.reverse_encoder.psi_covariance) - 1 +
            self.reverse_encoder.psi_covariance / self.encoder.psi_covariance +
            tf.square(self.encoder.psi_mean - self.reverse_encoder.psi_mean) /
            self.encoder.psi_covariance,
            axis=1)
        # KL_cond = tf.not_equal(tf.reduce_sum(self.encoder.psi_mean, axis=1), 0)
        self.KL_loss = KL_loss
        # self.KL_loss = tf.reduce_mean(tf.where(KL_cond, KL_loss,
        #                                        tf.zeros_like(KL_loss)), axis=0)

        if bayou_mode or infer:
            self.loss = self.gen_loss + 1 / 32 * self.loss_RE + \
                8 / 32 * self.gen_loss_FS
        else:
            self.loss = self.KL_loss + 1 * (self.gen_loss +
                                            1 / 32 * self.loss_RE +
                                            8 / 32 * self.gen_loss_FS)

    if infer:
        # self.gen_loss is P(Y|Z) where Z ~ P(Z|X)
        # P(Y) = int_Z P(Y,Z) = int_Z P(Y|Z) P(Z)
        #      = int_Z P(Y|Z) P(Z|X) P(Z) / P(Z|X)
        #      = sum_Z P(Y|Z) P(Z) / P(Z|X) where Z ~ P(Z|X)
        # (the last step is importance sampling)
        # This self.probY is approximate; you need to introduce one more tensor
        # dimension to do this efficiently over multiple trials.
        # P(Y) = P(Y|Z) P(Z) / P(Z|X) where Z ~ P(Z|X)
        self.probY = -1 * self.loss + self.get_multinormal_lnprob(self.psi_encoder) \
            - self.get_multinormal_lnprob(self.psi_encoder,
                                          self.encoder.psi_mean,
                                          self.encoder.psi_covariance)
        self.EncA, self.EncB = self.calculate_ab(self.encoder.psi_mean,
                                                 self.encoder.psi_covariance)
        self.RevEncA, self.RevEncB = self.calculate_ab(
            self.reverse_encoder.psi_mean, self.reverse_encoder.psi_covariance)
        self.allEvSigmas = [ev.sigma for ev in self.config.evidence]

    # unused if MultiGPU is being used
    with tf.name_scope("train"):
        if bayou_mode:
            train_ops = get_var_list()['decoder_vars']
        else:
            train_ops = get_var_list()['rev_encoder_vars']

        if not infer:
            opt = tf.train.AdamOptimizer(config.learning_rate)
            self.train_op = opt.minimize(self.loss, var_list=train_ops)

            var_params = [np.prod([dim.value for dim in var.get_shape()])
                          for var in tf.trainable_variables()]
            print('Model parameters: {}'.format(np.sum(var_params)))
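# `get_multinormal_lnprob` and `calculate_ab` are referenced above but not
# shown. A sketch consistent with the two call sites of the former (a
# standard-normal default, and an explicit diagonal mean/covariance) might
# look like the following; this is a labeled guess, not the original helper.
def get_multinormal_lnprob(self, samples, mean=0., covariance=1.):
    # log N(samples; mean, diag(covariance)), summed over the latent dimensions
    ln_numerator = -tf.square(samples - mean) / (2. * covariance)
    ln_denominator = 0.5 * tf.log(2. * np.pi * covariance)
    return tf.reduce_sum(ln_numerator - ln_denominator, axis=1)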
# Decoder input: prepend some "GO" token and drop the final
# token of the encoder input
dec_inp = ([tf.zeros_like(enc_inp[0], dtype=np.int32, name="GO")] +
           enc_inp[:-1])

cell = LSTMCell(memory_dim)

dec_outputs, dec_memory = embedding_rnn_seq2seq(enc_inp,
                                                dec_inp,
                                                cell,
                                                vocab_size,
                                                vocab_size,
                                                embedding_dim,
                                                feed_previous=False)

loss = sequence_loss(dec_outputs, labels, weights)

global_step = tf.Variable(0, trainable=False)
boundaries = [10000]
values = [0.01, 0.001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
# learning_rate = 0.01  # 0.97485
# momentum = 0.9
# optimizer = tf.train.GradientDescentOptimizer(learning_rate)
optimizer = tf.train.AdamOptimizer(learning_rate)
# Pass global_step so the piecewise-constant schedule actually advances.
train_op = optimizer.minimize(loss, global_step=global_step)

sess.run(tf.global_variables_initializer())


def train_batch(batch_size):
def build_model(self, forward_only): print("[*] Building a PTRModel math model") with tf.variable_scope(self.scope): #embedding_matrix = tf.eye(self.input_dim, self.W) #embedding_matrix = weight('embedding', [self.input_dim, self.W], init='xavier') self.a = weight('a', [1, 1], init='constant', value=0.45) self.b = weight('b', [1, 1], init='constant', value=-0.55) self.c = weight('c', [1, 1], init='constant', value=0.0) self.beta = 1 + tf.nn.softplus(weight('beta', [1, 1])) prev_state = self.controller.init_state() tf.get_variable_scope().reuse_variables() for seq_length in range(1, self.max_length + 1): input_1 = tf.placeholder(tf.float32, [self.input_dim], name='input_1_%s' % seq_length) true_output = tf.placeholder(tf.float32, [self.output_dim], name='true_output_%s' % seq_length) self.inputs_1.append(input_1) self.true_outputs.append(true_output) # present inputs prev_state = self.controller.update_memory( prev_state, [ tf.reshape(input_1, [1, -1]), tf.reshape(input_1, [1, -1]) ]) self.collect_states[seq_length] = self.collect_states[ seq_length - 1][0:(seq_length - 1)] + [self.copy_state(prev_state)] state = prev_state self.prev_states[seq_length] = state stops = [] candidate_outputs = [] for j in range(self.MAX_STEP): state, _ = self.controller(state, j) self.collect_states[seq_length].append( self.copy_state(state)) candidate_outputs.append( tf.unstack(state['M'][-1][0:seq_length])) # stops.append(state['stop']) self.outputs[seq_length] = candidate_outputs self.stops[seq_length] = stops if not forward_only: for seq_length in range(self.min_length, self.max_length + 1): print(" [*] Building a loss model for seq_length %s" % seq_length) all_losses = [] for index in range(self.MAX_STEP): loss = sequence_loss( logits=self.outputs[seq_length][index], targets=self.true_outputs[0:seq_length], weights=[1] * seq_length, average_across_timesteps=False, average_across_batch=False, softmax_loss_function=l2_loss) all_losses.append(loss) all_losses = tf.stack(all_losses) #step_dist = tf.nn.softmax(tf.concat(self.stops[seq_length], 1)) #step_dist = tf.nn.softmax(tf.nn.embedding_lookup(self.length2stepdist, [seq_length])) #max_pos = tf.to_float(tf.argmax(step_dist)) max_pos = tf.clip_by_value( self.a * (tf.to_float(seq_length)**2) + self.b * tf.to_float(seq_length) + self.c, 0, self.MAX_STEP - 1) stop_pos = D(self.MAX_STEP, max_pos, 1, self.beta) loss1 = tf.reduce_sum( tf.expand_dims(all_losses, 0) * stop_pos) + 0.001 * tf.reduce_sum(max_pos) self.losses[seq_length] = loss1 if not self.params: self.params = tf.trainable_variables() grads = [] for grad in tf.gradients( loss1, self.params ): # + self.weight_decay*tf.add_n(tf.get_collection('l2')) if grad is not None: grads.append( tf.clip_by_value(grad, self.min_grad, self.max_grad)) else: grads.append(grad) self.grads[seq_length] = grads with tf.variable_scope("opt", reuse=None): if not forward_only: for seq_length in range(self.min_length, self.max_length + 1): self.optims[seq_length] = self.opt.apply_gradients( zip(self.grads[seq_length], self.params), global_step=self.global_step) self.saver = tf.train.Saver() print(" [*] Build a PTRModel math model finished")
def __init__(self,
             vocab_size,
             buckets,
             size,
             num_layers,
             batch_size,
             num_softmax_samples,
             do_decode,
             num_gpus=2,
             train_and_test=False):
    """
    :param vocab_size: vocabulary size
    :param buckets: the buckets
    :param size: number of units per cell
    :param num_layers: number of network layers
    :param batch_size:
    :param do_decode: training or testing; affects the seq2seq decoding process
    :param num_gpus: number of GPUs
    :param train_and_test: run training and prediction together
    """
    self._cur_gpu = 0  # used to round-robin ops across the GPUs and the CPU
    self._num_gpus = num_gpus  # number of GPUs
    self.sess = None  # the TF session; if None, a new one is created later
    self.buckets = buckets
    self.global_step = tf.Variable(0, trainable=False)  # counts training steps

    encoder_inputs = []  # encoder inputs
    decoder_inputs = []
    target_inputs = []
    loss_weight_inputs = []

    # all encoder input tokens
    for i in range(buckets[-1][0]):
        encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[batch_size],
                           name="encoder{}".format(i)))

    squence_length = tf.placeholder(tf.int32, [batch_size], name='squence_length')
    self.squence_length = squence_length

    # all decoder output tokens
    for i in range(buckets[-1][1]):
        decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[batch_size],
                           name="decoder{}".format(i)))
        target_inputs.append(
            tf.placeholder(tf.int64, shape=[batch_size],
                           name="target{}".format(i)))
        loss_weight_inputs.append(
            tf.placeholder(tf.float32, shape=[batch_size],
                           name="loss_weight{}".format(i)))

    encoder_inputs_buckets = {}
    decoder_inputs_buckets = {}
    target_inputs_buckets = {}
    loss_weight_inputs_buckets = {}

    # per-bucket encoder / decoder / target slices
    for bucket_id, bucket in enumerate(buckets):
        encoder_inputs_buckets[bucket_id] = encoder_inputs[0:bucket[0]]
        decoder_inputs_buckets[bucket_id] = decoder_inputs[0:bucket[1]]
        target_inputs_buckets[bucket_id] = target_inputs[0:bucket[1]]
        loss_weight_inputs_buckets[bucket_id] = loss_weight_inputs[0:bucket[1]]

    self.encoder_inputs_buckets = encoder_inputs_buckets
    self.decoder_inputs_buckets = decoder_inputs_buckets
    self.target_inputs_buckets = target_inputs_buckets
    self.loss_weight_inputs_buckets = loss_weight_inputs_buckets

    # the embedding shared by the encoder and the decoder
    with tf.variable_scope('embedding',
                           reuse=True if train_and_test else None), \
            tf.device('/cpu:0'):
        embedding = tf.get_variable(
            'embedding', [vocab_size, size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))

    # every word looks up a word vector.
    emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                          for x in encoder_inputs]
    emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                          for x in decoder_inputs]

    encoder_embedding_buckets = {}
    decoder_embedding_buckets = {}
    # per-bucket encoder / decoder embeddings
    for i, bucket in enumerate(buckets):
        encoder_embedding_buckets[i] = emb_encoder_inputs[0:bucket[0]]
        decoder_embedding_buckets[i] = emb_decoder_inputs[0:bucket[1]]

    # buckets are needed here
    encoder_output_buckets = {}
    encoder_state_buckets = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        encoder_input_embedding = encoder_embedding_buckets[bucket_id]
        for layer_id in range(num_layers):
            with tf.variable_scope(
                    "encoder%d" % layer_id,
                    reuse=(True if bucket_id > 0 else None) or
                    (True if train_and_test else None)), tf.device(device):
                cell = LSTMCell(num_units=size,
                                initializer=tf.random_uniform_initializer(
                                    -0.1, 0.1, seed=123),
                                state_is_tuple=True)
                encoder_input_embedding, state = static_rnn(
                    cell=cell,
                    inputs=encoder_input_embedding,
                    sequence_length=squence_length,
                    dtype=tf.float32)
        output = encoder_input_embedding
        encoder_output_buckets[bucket_id] = output
        encoder_state_buckets[bucket_id] = state

    with tf.variable_scope('output_projection',
                           reuse=True if train_and_test else None):
        w = tf.get_variable(
            'w', [size, vocab_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        w_t = tf.transpose(w)
        v = tf.get_variable(
            'v', [vocab_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))

    loop_function = _extract_argmax_and_embed(embedding, (w, v)) if do_decode else None
    cell = LSTMCell(size,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=True)

    decoder_output_buckets = {}
    decoder_state_buckets = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        with tf.variable_scope(
                "decoder",
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            t = tf.concat(values=[tf.reshape(x, [-1, 1, size])
                                  for x in encoder_output_buckets[bucket_id]],
                          axis=1)
            decoder_output, decoder_state = attention_decoder(
                decoder_inputs=decoder_embedding_buckets[bucket_id],
                initial_state=encoder_state_buckets[bucket_id],
                attention_states=t,
                cell=cell,
                num_heads=1,
                loop_function=loop_function,
                initial_state_attention=do_decode)
            decoder_output_buckets[bucket_id] = decoder_output
            decoder_state_buckets[bucket_id] = decoder_state

    model_output_buckets = {}  # the output logits
    model_output_predict_buckets = {}
    model_output_predict_merger_buckets = {}
    model_output_accuracy = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        model_output = []
        model_output_predict = []
        model_accuracy = []
        with tf.variable_scope(
                "output",
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            for j in range(len(decoder_output_buckets[bucket_id])):
                output = tf.nn.xw_plus_b(decoder_output_buckets[bucket_id][j], w, v)
                predict = tf.argmax(input=output, axis=1,
                                    name="predict_{}_{}".format(bucket_id, j))
                accuracy_bool = tf.equal(x=target_inputs_buckets[bucket_id][j],
                                         y=predict)
                model_accuracy.append(
                    tf.reduce_mean(tf.cast(x=accuracy_bool, dtype=tf.float32)))
                model_output.append(output)
                model_output_predict.append(
                    tf.reshape(tensor=predict, shape=[-1, 1]))
        model_output_buckets[bucket_id] = model_output
        model_output_predict_buckets[bucket_id] = model_output_predict
        model_output_predict_merger_buckets[bucket_id] = tf.concat(
            values=model_output_predict, axis=1)
        model_output_accuracy[bucket_id] = tf.add_n(
            inputs=model_accuracy,
            name="bucket_id_{}".format(bucket_id)) / buckets[bucket_id][1]

    self.model_output_buckets = model_output_buckets
    self.model_output_predict_buckets = model_output_predict_buckets
    self.model_output_predict_merger_buckets = model_output_predict_merger_buckets
    self.model_output_accuracy = model_output_accuracy

    def sampled_loss_func(labels, logits):
        # the TF 1.0 API is stricter about these argument names
        with tf.device('/cpu:0'):  # Try gpu.
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(v, tf.float32)
            local_inputs = tf.cast(logits, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=local_w_t,
                                           biases=local_b,
                                           labels=labels,
                                           inputs=local_inputs,
                                           num_sampled=num_softmax_samples,
                                           num_classes=vocab_size), tf.float32)

    device = self._next_device()
    loss_buckets = {}
    for bucket_id, bucket in enumerate(buckets):
        with tf.variable_scope(
                'loss',
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            if num_softmax_samples != 0 and not do_decode:
                # the inputs differ here because the former replaces the full
                # softmax with sampled softmax, which takes the
                # pre-projection decoder outputs
                loss = sequence_loss_by_example(
                    logits=decoder_output_buckets[bucket_id],
                    targets=target_inputs_buckets[bucket_id],
                    weights=loss_weight_inputs_buckets[bucket_id],
                    average_across_timesteps=True,
                    softmax_loss_function=sampled_loss_func)
                # loss = sequence_loss(logits=model_output_buckets[bucket_id],
                #                      targets=target_inputs_buckets[bucket_id],
                #                      weights=loss_weight_inputs_buckets[bucket_id])
            else:
                loss = sequence_loss(
                    logits=model_output_buckets[bucket_id],
                    targets=target_inputs_buckets[bucket_id],
                    weights=loss_weight_inputs_buckets[bucket_id])
            loss_buckets[bucket_id] = tf.reduce_mean(loss)  # the mean loss

    self.loss_buckets = loss_buckets
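# A standalone sketch of the design choice behind `sampled_loss_func` above:
# tf.nn.sampled_softmax_loss samples negative classes instead of materialising
# the full [batch, vocab_size] logits, so it takes the *pre-projection* RNN
# outputs plus the (transposed) projection weights, whereas the full-softmax
# path takes already-projected logits. Shapes and variable names below are
# illustrative only.
batch, hidden, vocab, num_sampled = 32, 128, 50000, 512
rnn_out = tf.random_normal([batch, hidden])                    # pre-projection output
sampled_labels = tf.random_uniform([batch, 1], maxval=vocab, dtype=tf.int64)
proj_w_t = tf.get_variable('demo_proj_w_t', [vocab, hidden])   # note: [vocab, hidden]
proj_b = tf.get_variable('demo_proj_b', [vocab])
sampled_loss = tf.nn.sampled_softmax_loss(weights=proj_w_t, biases=proj_b,
                                          labels=sampled_labels, inputs=rnn_out,
                                          num_sampled=num_sampled, num_classes=vocab)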
                                                  feed_previous=False)

with tf.variable_scope('decoder', reuse=True):
    cell = tf.contrib.rnn.GRUCell(num_units)
    decode_outputs_t, decode_states_t = seq2seq.embedding_rnn_seq2seq(
        enc_in2, dec_in2, cell, vocab_size, vocab_size, embed_dim,
        output_projection=None, feed_previous=True)

loss_weights = [tf.ones(l.shape, dtype=tf.float32) for l in labels2]
# sequence_loss takes (logits, targets, weights); it has no vocabulary-size
# argument.
loss = seq2seq.sequence_loss(decode_outputs, labels2, loss_weights)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# </Model>


def train(training_data, testing_data):
    in_data_train, la_data_train, out_data_train = training_data
    in_data_test, la_data_test, out_data_test = testing_data

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        total_num = len(in_data_train)
        num_per_epoch = total_num // batch_size
        for epoch in range(epochs):
            global global_num
def add_loss_op(self, output):
    with tf.name_scope('losses'):
        all_ones = [tf.ones([self.config.batch_size * self.config.num_steps])]
        # sequence loss is the cross-entropy averaged over both the batch and
        # the sentence (sequence_loss takes logits, targets, weights; it has
        # no vocabulary-size argument)
        cross_entropy = sequence_loss([output],
                                      [tf.reshape(self.labels_placeholder, [-1])],
                                      all_ones)
        return cross_entropy
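# Standalone sketch of the call above, assuming `sequence_loss` is
# tf.contrib.legacy_seq2seq.sequence_loss: with all-ones weights and the
# [batch * num_steps] flattening used here, the result is the cross-entropy
# averaged over both the batch and the timesteps. Shapes are illustrative.
import tensorflow as tf
from tensorflow.contrib.legacy_seq2seq import sequence_loss

batch_size, num_steps, vocab = 4, 3, 10
demo_logits = tf.random_normal([batch_size * num_steps, vocab])
demo_targets = tf.random_uniform([batch_size * num_steps], maxval=vocab,
                                 dtype=tf.int32)
demo_ones = [tf.ones([batch_size * num_steps])]
demo_xent = sequence_loss([demo_logits], [demo_targets], demo_ones)  # scalar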
def build_model(self, forward_only, is_copy=True):
    print(" [*] Building a NTM model")
    with tf.variable_scope(self.scope):
        # present start symbol
        if is_copy:
            _, _, prev_state = self.cell(self.start_symbol, state=None)
            self.save_state(prev_state, 0, self.max_length)

        zeros = np.zeros(self.cell.input_dim, dtype=np.float32)

        tf.get_variable_scope().reuse_variables()
        for seq_length in range(1, self.max_length + 1):
            progress(seq_length / float(self.max_length))

            input_ = tf.placeholder(tf.float32, [self.cell.input_dim],
                                    name='input_%s' % seq_length)
            true_output = tf.placeholder(tf.float32, [self.cell.output_dim],
                                         name='true_output_%s' % seq_length)

            self.inputs.append(input_)
            self.true_outputs.append(true_output)

            # present inputs
            _, _, prev_state = self.cell(input_, prev_state)
            self.save_state(prev_state, seq_length, self.max_length)

            # present end symbol
            if is_copy:
                _, _, state = self.cell(self.end_symbol, prev_state)
                self.save_state(state, seq_length)

            self.prev_states[seq_length] = state

            if not forward_only:
                # present targets
                outputs, output_logits = [], []
                for _ in range(seq_length):
                    output, output_logit, state = self.cell(zeros, state)
                    self.save_state(state, seq_length, is_output=True)
                    outputs.append(output)
                    output_logits.append(output_logit)

                self.outputs[seq_length] = outputs
                self.output_logits[seq_length] = output_logits

        if not forward_only:
            for seq_length in range(self.min_length, self.max_length + 1):
                print(" [*] Building a loss model for seq_length %s" % seq_length)

                loss = sequence_loss(
                    logits=self.output_logits[seq_length],
                    targets=self.true_outputs[0:seq_length],
                    weights=[1] * seq_length,
                    average_across_timesteps=False,
                    average_across_batch=False,
                    softmax_loss_function=softmax_loss_function)

                self.losses[seq_length] = loss

                if not self.params:
                    self.params = tf.trainable_variables()

                # grads, norm = tf.clip_by_global_norm(
                #     tf.gradients(loss, self.params), 5)
                grads = []
                for grad in tf.gradients(loss, self.params):
                    if grad is not None:
                        grads.append(tf.clip_by_value(grad, self.min_grad,
                                                      self.max_grad))
                    else:
                        grads.append(grad)

                self.grads[seq_length] = grads

                opt = tf.train.RMSPropOptimizer(self.lr,
                                                decay=self.decay,
                                                momentum=self.momentum)

                reuse = seq_length != 1
                with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                    self.optims[seq_length] = opt.apply_gradients(
                        zip(grads, self.params),
                        global_step=self.global_step)
                if not reuse:
                    tf.get_variable_scope().reuse_variables()

    model_vars = [v for v in tf.global_variables()
                  if v.name.startswith(self.scope)]
    self.saver = tf.train.Saver(model_vars)

    print(" [*] Build a NTM model finished")
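# Design note on the clipping loop above: tf.clip_by_value bounds each
# gradient element independently, which can change the gradient's direction,
# while the commented-out tf.clip_by_global_norm rescales all gradients
# jointly and preserves direction. A minimal comparison, with `loss` and
# `params` assumed to exist:
grads = [g for g in tf.gradients(loss, params) if g is not None]
clipped_elementwise = [tf.clip_by_value(g, -10., 10.) for g in grads]
clipped_jointly, global_norm = tf.clip_by_global_norm(grads, 5.0)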