def encode(self, seq, reuse=None):
    """Embed a batch of token ids and encode it with a bidirectional LSTM.

    Args:
        seq: Integer tensor of token ids passed to ``layers.embed_sequence``
            (presumably batch-major ``[batch, time]`` -- TODO confirm
            against the caller).
        reuse: Forwarded to the embedding layer and to both LSTM cells so
            that a second call can share the encoder variables.

    Returns:
        A pair ``(encoder_states, encoder_final_state_vec)``:

        * ``encoder_states`` is an ``LSTMStateTuple`` whose ``c`` and ``h``
          are the forward and backward final states concatenated on axis 1.
        * ``encoder_final_state_vec`` is ``c`` and ``h`` concatenated on
          axis 1 and L2-normalised along that axis.
    """
    if self.embeddings_mat is not None:
        # Pre-trained embeddings: initialise from the given matrix and keep
        # the table frozen.
        input_embed = layers.embed_sequence(
            seq,
            vocab_size=self.vocab_size,
            embed_dim=self.embed_dim,
            initializer=tf.constant_initializer(self.embeddings_mat,
                                                dtype=tf.float32),
            trainable=False,
            scope='embed',
            reuse=reuse)
    else:
        input_embed = layers.embed_sequence(
            seq,
            vocab_size=self.vocab_size,
            embed_dim=self.embed_dim,
            scope='embed',
            reuse=reuse)

    # Each direction gets half of the configured units. Floor division keeps
    # the cell size an int on Python 3, where ``/`` would produce a float.
    units_per_direction = self.num_units // 2
    forward_cell = tf.contrib.rnn.LSTMCell(num_units=units_per_direction,
                                           reuse=reuse)
    backward_cell = tf.contrib.rnn.LSTMCell(num_units=units_per_direction,
                                            reuse=reuse)

    # Only the final states are used; the per-step outputs are discarded.
    _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
        forward_cell, backward_cell, input_embed, dtype=tf.float32)

    encoder_states = tf.nn.rnn_cell.LSTMStateTuple(
        c=tf.concat((fw_state[0], bw_state[0]), 1),
        h=tf.concat((fw_state[1], bw_state[1]), 1))
    encoder_final_state_vec = tf.nn.l2_normalize(
        tf.concat(encoder_states, 1), 1)
    return encoder_states, encoder_final_state_vec
def make_graph(self, mode, features, labels, params):
    """Build an attention seq2seq graph and wrap it in an ``EstimatorSpec``.

    Args:
        mode: Estimator mode, forwarded unchanged to the returned spec.
        features: Mapping with the keys ``'input'`` and ``'output'`` holding
            the source and target token-id tensors.
        labels: Unused; the targets are read from ``features['output']``.
        params: Object exposing ``embed_dim``, ``num_units``, ``optimizer``
            and ``learning_rate`` as attributes.

    Returns:
        ``tf.estimator.EstimatorSpec`` whose predictions are the greedy
        decoder's sample ids and whose loss is the sequence loss of the
        teacher-forced decoder.
    """
    embed_dim = params.embed_dim
    num_units = params.num_units

    # Named source/target so the ``input`` builtin is not shadowed.
    source, target = features['input'], features['output']
    batch_size = tf.shape(source)[0]

    # Id 0 is prepended as the start token; id 1 is the end token (see
    # ``end_token=1`` below) and lengths count the ids that differ from 1.
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), target], 1)
    input_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(source, 1)), 1)
    output_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(train_output, 1)), 1)

    # Source and target share a single embedding table in scope 'embed'.
    input_embed = layers.embed_sequence(
        source, vocab_size=self.vocab_size, embed_dim=embed_dim,
        scope='embed')
    output_embed = layers.embed_sequence(
        train_output, vocab_size=self.vocab_size, embed_dim=embed_dim,
        scope='embed', reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    cell = tf.contrib.rnn.LSTMCell(num_units=num_units)
    if self.FLAGS.use_residual_lstm:
        cell = tf.contrib.rnn.ResidualWrapper(cell)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        cell, input_embed, dtype=tf.float32)

    def decode(helper, scope, reuse=None):
        """Run an attention decoder driven by ``helper`` inside ``scope``."""
        # Decoder is partially based on @ilblackdragon//tf_example/seq2seq.py
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=num_units, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
            cell = tf.contrib.rnn.LSTMCell(num_units=num_units)
            # Floor division: the attention layer size must be an int, and
            # ``num_units / 2`` is a float on Python 3.
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism,
                attention_layer_size=num_units // 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attn_cell, self.vocab_size, reuse=reuse)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell, helper=helper,
                initial_state=out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size))
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.FLAGS.output_max_length)
            return outputs[0]

    train_helper = tf.contrib.seq2seq.TrainingHelper(
        output_embed, output_lengths)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    # Both decoders live in scope 'decode'; the second call reuses weights.
    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    # Named identity ops so the tensors can be looked up by name.
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, target, weights=weights)
    train_op = layers.optimize_loss(
        loss, tf.train.get_global_step(),
        optimizer=params.optimizer,
        learning_rate=params.learning_rate,
        summaries=['loss', 'learning_rate'])

    tf.identity(pred_outputs.sample_id[0], name='predict')
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_outputs.sample_id,
        loss=loss,
        train_op=train_op)
def seq2seq_model(inputs, targets, keep_prob, batch_size, seq_length,
                  answers_num_words, questions_num_words,
                  encoder_embedding_size, decoder_embedding_size, rnn_size,
                  num_layers, questionswords2int_dict):
    """Wire the encoder and decoder together and return both predictions.

    Args:
        inputs: Token ids fed to the encoder embedding.
        targets: Token ids handed to ``preprocess_targets``.
        keep_prob: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        batch_size: Forwarded to ``preprocess_targets`` and ``decoder_rnn``.
        seq_length: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        answers_num_words: Encoder vocabulary size (the table gets one
            extra row).
        questions_num_words: Decoder vocabulary size (the table gets one
            extra row); also forwarded to ``decoder_rnn``.
        encoder_embedding_size: Width of the encoder embeddings.
        decoder_embedding_size: Width of the decoder embeddings.
        rnn_size: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        num_layers: Forwarded to ``encoder_rnn`` and ``decoder_rnn``.
        questionswords2int_dict: Forwarded to ``preprocess_targets`` and
            ``decoder_rnn`` (presumably a word -> id mapping; verify
            against those helpers).

    Returns:
        The ``(train_pred, test_pred)`` pair produced by ``decoder_rnn``.
    """
    encoder_embedded_input = embed_sequence(
        ids=inputs,
        vocab_size=answers_num_words + 1,
        embed_dim=encoder_embedding_size,
        initializer=tf.random_uniform_initializer(minval=0, maxval=1))
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers,
                                keep_prob, seq_length)

    preprocessed_targets = preprocess_targets(targets,
                                              questionswords2int_dict,
                                              batch_size)
    decoder_embeddings_matrix = tf.Variable(
        tf.random_uniform(
            shape=[questions_num_words + 1, decoder_embedding_size],
            minval=0,
            maxval=1))
    decoder_embedded_input = tf.nn.embedding_lookup(
        params=decoder_embeddings_matrix, ids=preprocessed_targets)

    # The original passed ``num_words`` here, a name that is neither a
    # parameter nor a local. ``questions_num_words`` is the vocabulary size
    # the decoder embedding table above is built for.
    train_pred, test_pred = decoder_rnn(decoder_embedded_input,
                                        decoder_embeddings_matrix,
                                        encoder_state,
                                        questions_num_words,
                                        seq_length,
                                        rnn_size,
                                        num_layers,
                                        questionswords2int_dict,
                                        keep_prob,
                                        batch_size)
    return train_pred, test_pred
def get_encoder_layer(input_data, rnn_size, num_layers,
                      source_sequence_length, source_vocab_size,
                      encoding_embedding_size):
    '''
    Build the encoder layer.

    Parameters:
    - input_data: input tensor
    - rnn_size: number of hidden units in each RNN cell
    - num_layers: number of stacked RNN cells
    - source_sequence_length: sequence lengths of the source data
    - source_vocab_size: vocabulary size of the source data
    - encoding_embedding_size: size of the embedding

    Returns the ``(outputs, final_state)`` pair of ``tf.nn.dynamic_rnn``.
    '''
    # Encoder embedding
    embedded = layers.embed_sequence(input_data, source_vocab_size,
                                     encoding_embedding_size)

    # One LSTM cell per layer, stacked into a single multi-layer cell.
    layer_cells = []
    for _ in range(num_layers):
        layer_cells.append(
            rnn.LSTMCell(rnn_size,
                         initializer=tf.truncated_normal_initializer))
    stacked = rnn.MultiRNNCell(layer_cells)

    return tf.nn.dynamic_rnn(stacked,
                             embedded,
                             sequence_length=source_sequence_length,
                             dtype=tf.float32)
def seq2seq(inputs, targets, batch_size, questionword2int,
            encoder_embedded_size, decoder_embedding_size,
            questions_num_words, answer_num_word, rnn_size, keep_prob,
            num_of_layers, sequence_length):
    """Wire the encoder and decoder together and return both predictions.

    Args:
        inputs: Token ids fed to the encoder embedding.
        targets: Token ids handed to ``rnn_training_data``.
        batch_size: Forwarded to ``rnn_training_data`` and ``decoder_rnn``.
        questionword2int: Forwarded to ``rnn_training_data`` and
            ``decoder_rnn`` (presumably a word -> id mapping; verify
            against those helpers).
        encoder_embedded_size: Width of the encoder embeddings.
        decoder_embedding_size: Width of the decoder embeddings.
        questions_num_words: Decoder vocabulary size (the table gets one
            extra row); also forwarded to ``decoder_rnn``.
        answer_num_word: Encoder vocabulary size.
        rnn_size: Forwarded to ``rnn_encoder`` and ``decoder_rnn``.
        keep_prob: Forwarded to ``rnn_encoder`` and ``decoder_rnn``.
        num_of_layers: Forwarded to ``rnn_encoder`` and ``decoder_rnn``.
        sequence_length: Forwarded to ``rnn_encoder`` and ``decoder_rnn``.

    Returns:
        The ``(training_prediction, test_predictions)`` pair produced by
        ``decoder_rnn``.
    """
    # ``embed_sequence`` takes (ids, vocab_size, embed_dim). The original
    # positional call passed the embedding size as the vocabulary size and
    # vice versa; keywords make the mapping explicit.
    encoder_embedded_input = layers.embed_sequence(
        inputs,
        vocab_size=answer_num_word,
        embed_dim=encoder_embedded_size,
        initializer=tf.random_uniform_initializer(0, 1))
    encoder_state = rnn_encoder(encoder_embedded_input, rnn_size, keep_prob,
                                num_of_layers, sequence_length)

    preprocessing_targets = rnn_training_data(batch_size, targets,
                                              questionword2int)
    decoder_embeddings_matrix = tf.Variable(
        tf.random_uniform([questions_num_words + 1, decoder_embedding_size],
                          0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(
        decoder_embeddings_matrix, preprocessing_targets)

    training_prediction, test_predictions = decoder_rnn(
        rnn_size, keep_prob, num_of_layers, questions_num_words,
        encoder_state, questionword2int, batch_size,
        decoder_embedded_input, decoder_embeddings_matrix, sequence_length)
    return training_prediction, test_predictions
def __attention_loss_branch(self, rnn_features):
    """Build the attention decoder branch and return its sequence loss.

    Args:
        rnn_features: Features handed unchanged to ``self.__att_decode`` for
            both the training and the greedy decoder.

    Returns:
        The scalar produced by ``tf.contrib.seq2seq.sequence_loss`` for the
        teacher-forced decoder against ``self.att_target_output``.
    """
    output_embed = layers.embed_sequence(self.att_train_output,
                                         vocab_size=self.vocab_att_size,
                                         embed_dim=self.att_embed_dim,
                                         scope='embed')
    # Greedy decoding must look tokens up in the same table that the
    # teacher-forced decoder is trained with. The original created a
    # separate ``decoder_embedding`` variable here that never receives
    # gradients, so inference fed untrained random vectors to the decoder.
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    start_tokens = tf.zeros([self.batch_size], dtype=tf.int64)
    train_helper = tf.contrib.seq2seq.TrainingHelper(
        output_embed, self.att_train_length)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    # Both decoders are built in scope 'decode'; the second call reuses the
    # variables of the first. ``pred_outputs`` is not part of the loss but
    # building it adds the inference ops to the graph.
    train_outputs = self.__att_decode(train_helper, rnn_features, 'decode')
    pred_outputs = self.__att_decode(pred_helper, rnn_features, 'decode',
                                     reuse=True)

    # Every row uses the first example's length: the last step of each
    # sequence is masked out. The list repetition implies that
    # ``self.batch_size`` is a Python int -- TODO confirm.
    mask = tf.cast(
        tf.sequence_mask(self.batch_size * [self.att_train_length[0] - 1],
                         self.att_train_length[0]),
        tf.float32)
    att_loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs[0].rnn_output, self.att_target_output, weights=mask)
    return att_loss
def get_encoder_layer(input_data, rnn_size, num_layers, source_seq_len,
                      source_vocab_size, embedding_size):
    """
    Build the encoder layer.

    :param input_data: token ids to embed and encode
    :param rnn_size: number of units in each LSTM cell
    :param num_layers: number of stacked LSTM cells
    :param source_seq_len: sequence lengths passed to ``tf.nn.dynamic_rnn``
    :param source_vocab_size: vocabulary size of the embedding table
    :param embedding_size: width of the embedding vectors
    :return: ``(encoder_out, encoder_state)`` from ``tf.nn.dynamic_rnn``
    """

    def make_cell():
        # Every layer uses the same seeded uniform initializer.
        uniform_init = tf.random_uniform_initializer(-0.1, 0.1, seed=2)
        return rnn.LSTMCell(rnn_size, initializer=uniform_init)

    embedded_input = layers.embed_sequence(input_data, source_vocab_size,
                                           embedding_size)

    stacked_cell = rnn.MultiRNNCell(
        [make_cell() for _layer in range(num_layers)])

    encoder_out, encoder_state = tf.nn.dynamic_rnn(
        stacked_cell,
        embedded_input,
        sequence_length=source_seq_len,
        dtype=tf.float32)
    return encoder_out, encoder_state
def build_network(is_training):
    """Build the encoder plus attention decoder and return loss and logits.

    Reads ``image``, ``train_output``, ``train_length``, ``target_output``
    and ``cfg`` from the enclosing scope and relies on the sibling helpers
    ``encoder_net`` and ``decode``.

    Args:
        is_training: Forwarded to ``encoder_net``.

    Returns:
        ``(loss, train_decode_result, pred_decode_result)`` where ``loss``
        is the mean of the sequence loss, ``train_decode_result`` is the
        teacher-forced ``rnn_output`` without its last time step and
        ``pred_decode_result`` is the greedy decoder's ``rnn_output``.
    """
    train_output_embed = encoder_net(image, 'encode_features', is_training)

    # vocab_size is the number of distinct token classes (not the number of
    # tokens); embed_dim is the width of the embedding matrix. Using
    # VOCAB_SIZE for both gives the embedding a one-hot-like flavour.
    output_embed = layers.embed_sequence(train_output,
                                         vocab_size=cfg.VOCAB_SIZE,
                                         embed_dim=cfg.VOCAB_SIZE,
                                         scope='embed')
    # Greedy decoding must look tokens up in the same table that the
    # teacher-forced decoder is trained with. The original created a
    # separate ``decoder_embedding`` variable here that never receives
    # gradients, so inference fed untrained random vectors to the decoder.
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    start_tokens = tf.zeros([cfg.BATCH_SIZE], dtype=tf.int64)
    train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed,
                                                     train_length)
    # Inference helper: takes the argmax of the output logits to get an id
    # and runs it through the embedding to obtain the next step's input.
    # start_tokens: int32 vector shaped [batch_size], the start tokens.
    # end_token: int32 scalar, the token that marks end of decoding.
    # Here 0 and 1 are the ids of GO and EOS.
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    # Both decoders are built in scope 'decode'; the second call reuses the
    # variables of the first.
    train_outputs = decode(train_helper, train_output_embed, 'decode')
    pred_outputs = decode(pred_helper, train_output_embed, 'decode',
                          reuse=True)
    train_decode_result = train_outputs[0].rnn_output[:, :-1, :]
    pred_decode_result = pred_outputs[0].rnn_output

    # Every row uses the first example's length: the last step of each
    # sequence is masked out.
    mask = tf.cast(
        tf.sequence_mask(cfg.BATCH_SIZE * [train_length[0] - 1],
                         train_length[0]),
        tf.float32)
    att_loss = tf.contrib.seq2seq.sequence_loss(train_outputs[0].rnn_output,
                                                target_output,
                                                weights=mask)
    loss = tf.reduce_mean(att_loss)
    return loss, train_decode_result, pred_decode_result
def encoding_layer(inputs, encode_token, em_size, num_layers, num_units,
                   drop_val):
    """Embed ``inputs`` and encode them with a stacked bidirectional RNN.

    Args:
        inputs: Token ids to embed.
        encode_token: Vocabulary size of the embedding table.
        em_size: Width of the embedding vectors.
        num_layers: Number of cells stacked per direction.
        num_units: Forwarded to ``get_a_lstm`` for every cell.
        drop_val: Unused by this function; kept for interface compatibility.

    Returns:
        ``(encode_output, encode_state)``: the forward and backward outputs
        and final states, each concatenated along the last axis.
    """
    # embed_sequence maps a sequence of symbols to a sequence of embeddings;
    # it is equivalent to creating an ``[encode_token, em_size]`` variable
    # and calling ``tf.nn.embedding_lookup`` on it.
    encode_embed_input = layers.embed_sequence(inputs,
                                               vocab_size=encode_token,
                                               embed_dim=em_size)

    stacked_lstm_fw = rnn.MultiRNNCell(
        [get_a_lstm(num_units) for _ in range(num_layers)])
    stacked_lstm_bw = rnn.MultiRNNCell(
        [get_a_lstm(num_units) for _ in range(num_layers)])
    outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
        stacked_lstm_fw, stacked_lstm_bw, encode_embed_input,
        dtype=tf.float32)
    output_fw, output_bw = outputs
    state_fw, state_bw = final_states

    encode_output = tf.concat([output_fw, output_bw], -1)
    # Concatenate the states on the LAST axis rather than on axis 2. When
    # the per-layer state is a plain tensor both are the same axis, but if
    # ``get_a_lstm`` returns cells with (c, h) tuple states the packed state
    # has one more leading axis and axis 2 would be the batch axis.
    # NOTE(review): ``get_a_lstm`` is not visible here -- confirm its state
    # layout.
    encode_state = tf.concat([state_fw, state_bw], -1)
    return encode_output, encode_state
def TestModel(input1):
    """Build the inference graph and return the greedy decoder's output.

    Reads the module-level settings ``vocab_size``, ``embed_dim``,
    ``num_units``, ``dropout``, ``uni_directional``, ``decode_method`` and
    ``output_max_length``.

    Args:
        input1: Token ids of the source sequences; id 1 is excluded when
            the input lengths are counted.

    Returns:
        The first element of ``tf.contrib.seq2seq.dynamic_decode``'s result
        for the greedy helper, built with ``reuse=True`` in scope
        ``'decode'`` (so the decoder variables must already exist).
    """
    batch_size = tf.shape(input1)[0]
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    input_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(input1, 1)), 1)

    input_embed = layers.embed_sequence(
        input1, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    if dropout == 1:
        # A keep probability of 1 leaves the inputs untouched.
        cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(num_units), 1)
    else:
        cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units)

    if uni_directional == 1:
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            cell, input_embed, dtype=tf.float32)
        num_units1 = 512
    else:
        # NOTE(review): ``time_major=True`` here while the unidirectional
        # branch and ``dynamic_decode`` below are batch-major, and the same
        # cell object serves both directions -- confirm both are intended.
        # Left unchanged so behaviour matches the training graph.
        ((encoder_fw_outputs, encoder_bw_outputs),
         (encoder_fw_final_state, encoder_bw_final_state)) = (
             tf.nn.bidirectional_dynamic_rnn(cell_fw=cell,
                                             cell_bw=cell,
                                             inputs=input_embed,
                                             dtype=tf.float32,
                                             time_major=True))
        encoder_outputs = tf.concat(
            (encoder_fw_outputs, encoder_bw_outputs), 2)
        encoder_final_state_c = tf.concat(
            (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
        encoder_final_state_h = tf.concat(
            (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
        encoder_final_state = LSTMStateTuple(c=encoder_final_state_c,
                                             h=encoder_final_state_h)
        num_units1 = 1024

    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)
    decoder_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units1)
    projection_layer = Dense(units=vocab_size, use_bias=True)

    def decode(helper, scope, reuse=None):
        """Decode with ``helper`` in ``scope``, with or without attention."""
        with tf.variable_scope(scope, reuse=reuse):
            if decode_method == 1:
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units1, memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
                # Floor division: the attention layer size must be an int,
                # and ``num_units1 / 2`` is a float on Python 3.
                attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell, attention_mechanism,
                    attention_layer_size=num_units1 // 2)
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    attn_cell, vocab_size, reuse=reuse)
            else:
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    decoder_cell, vocab_size, reuse=reuse)
            # NOTE(review): the cell output is projected to ``vocab_size``
            # by the wrapper above and then again by ``projection_layer`` --
            # confirm the double projection is intended.
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell, helper=helper,
                initial_state=out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size),
                output_layer=projection_layer)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True,
                maximum_iterations=output_max_length)
            return outputs[0]

    train_outputs = decode(pred_helper, 'decode', reuse=True)
    return train_outputs
def preprocess_pandas(features_df):
    """Turn a feature frame into one concatenated feature tensor.

    Columns named in ``continuous_vars`` are cast to float32 and given a
    trailing axis; for every name in ``categorical_vars`` the matching
    ``<name>_ids`` column is embedded in its own variable scope. All pieces
    are concatenated along axis 1, continuous features first.
    """
    pieces = []

    # Continuous features: cast, then add an axis so they can be
    # concatenated with the embeddings.
    for name in continuous_vars:
        as_float = tf.cast(features_df[name], tf.float32)
        pieces.append(tf.expand_dims(as_float, 1))

    # Categorical features: embed the id column; the table has one row per
    # class known to the fitted encoder.
    pieces.extend(
        layers.embed_sequence(
            features_df[name + '_ids'],
            vocab_size=len(categorical_var_encoders[name].classes_),
            embed_dim=CATEGORICAL_EMBED_SIZE,
            scope=name)
        for name in categorical_vars)

    return tf.concat(pieces, 1)
def decode(self, encoder_out, scope, output, reuse=None):
    """Teacher-forced decoding of ``output`` starting from the encoder state.

    Args:
        encoder_out: Sequence whose first element is used as the decoder's
            initial state.
        scope: Variable scope in which the decoder is built.
        output: Target token ids; ``self.start_tokens`` is prepended.
        reuse: Forwarded to the variable scope and the output projection.

    Returns:
        The first element of ``tf.contrib.seq2seq.dynamic_decode``'s result.
    """
    initial_state = encoder_out[0]

    # Prepend the start token, then count the ids that differ from 1 to get
    # the per-example target length.
    start_column = tf.expand_dims(self.start_tokens, 1)
    train_output = tf.concat([start_column, output], 1)
    not_end = tf.to_int32(tf.not_equal(train_output, 1))
    output_lengths = tf.reduce_sum(not_end, 1)

    # The targets are embedded with the table created under 'encode/embed'.
    output_embed = layers.embed_sequence(train_output,
                                         vocab_size=self.vocab_size,
                                         embed_dim=self.embed_dim,
                                         scope='encode/embed',
                                         reuse=True)
    teacher_helper = tf.contrib.seq2seq.TrainingHelper(output_embed,
                                                       output_lengths)

    # Decoder is partially based on @ilblackdragon//tf_example/seq2seq.py
    with tf.variable_scope(scope, reuse=reuse):
        lstm = tf.contrib.rnn.LSTMCell(num_units=self.num_units)
        projected = tf.contrib.rnn.OutputProjectionWrapper(
            lstm, self.vocab_size, reuse=reuse)
        basic_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=projected, helper=teacher_helper,
            initial_state=initial_state)
        decoded = tf.contrib.seq2seq.dynamic_decode(
            decoder=basic_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.FLAGS.output_max_length + 1)
    return decoded[0]
def build_embedding(self):
    """Create the encoder and decoder embeddings under scope "Embedding".

    Sets ``self.encoder_input_embedding``, ``self.decoder_embedding`` and
    ``self.decoder_input_embedding``.
    """
    with tf.variable_scope("Embedding"):
        # Original author's shape note: inputs embed to [None, None, 15].
        encoder_embedded = tcl.embed_sequence(
            self.encoder_inputs,
            self.data.encoder_vocab_size,
            self.encoder_embedding_size,
            scope="encoder_input_embedding")
        self.encoder_input_embedding = encoder_embedded

        # Original author's shape note: [31, 15].
        table_shape = [
            self.data.decoder_vocab_size, self.decoder_embedding_size
        ]
        self.decoder_embedding = tf.Variable(tf.random_uniform(table_shape))

        # Original author's shape note: [None, None, 15].
        self.decoder_input_embedding = tf.nn.embedding_lookup(
            self.decoder_embedding,
            self.decoder_input,
            name="decoder_target_embedding")
def embed_features(feature,
                   vocab_size,
                   embed_dim,
                   scope='Embed',
                   reuse=False,
                   pretrained=None,
                   trainable=True):
    """Embed ``feature`` in ``scope``, optionally initialised from a checkpoint.

    Args:
        feature: Token ids to embed.
        vocab_size: Number of rows of the embedding table.
        embed_dim: Width of the embedding vectors.
        scope: Variable scope that holds the embedding table.
        reuse: Forwarded to the variable scope.
        pretrained: Checkpoint to initialise the scope's variables from, or
            ``None`` to keep the default initialisation.
        trainable: Forwarded to ``layers.embed_sequence``.

    Returns:
        The embedded feature tensor.
    """
    with tf.variable_scope(scope, reuse=reuse):
        embedded = layers.embed_sequence(feature,
                                         vocab_size=vocab_size,
                                         embed_dim=embed_dim,
                                         trainable=trainable)
        if pretrained is None:
            return embedded

        # Map the checkpoint's scope onto the identically named graph scope.
        scope_prefix = scope + '/'
        tf.contrib.framework.init_from_checkpoint(
            pretrained, {scope_prefix: scope_prefix})
    return embedded
def __build_model(self):
    """Build the GRU classifier graph: placeholders, loss, train op, summaries.

    Sets ``input_x``, ``input_y``, ``embed``, ``rnn_unit``, ``logits``,
    ``pred``, ``acc``, ``loss``, ``train_op``, ``global_step`` and
    ``summary`` on ``self``.
    """
    # Inputs: token ids of fixed length and per-class float targets.
    self.input_x = tf.placeholder(tf.int32, [None, self.seqlen],
                                  name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, self.total_class],
                                  name="input_y")

    # "w_e" is never read below; it is still created because doing so
    # registers a variable in the graph.
    tf.get_variable("w_e", [self.seqlen, self.embed_dim])
    self.embed = layers.embed_sequence(self.input_x,
                                       vocab_size=self.vocab_size,
                                       embed_dim=self.embed_dim)

    # One GRU cell unrolled over the time steps (axis 1 of the embeddings);
    # only the final state is used.
    self.rnn_unit = tf.nn.rnn_cell.GRUCell(self.embed_dim)
    steps = tf.unstack(self.embed, axis=1)
    _, final_state = tf.nn.static_rnn(cell=self.rnn_unit,
                                      inputs=steps,
                                      dtype=tf.float32)

    # calc logits
    self.logits = tf.layers.dense(final_state, self.total_class,
                                  activation=None)
    self.pred = tf.nn.softmax(self.logits)

    hits = tf.equal(tf.argmax(self.input_y, 1), tf.argmax(self.logits, 1))
    self.acc = tf.reduce_mean(tf.cast(hits, "float"), name="acc")

    per_class_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=self.logits, labels=self.input_y)
    self.loss = tf.reduce_mean(per_class_loss)

    # Clip the gradients by global norm before handing them to Adam.
    trainables = tf.trainable_variables()
    raw_grads = tf.gradients(self.loss, trainables)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.clip_norm)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped_grads, trainables))

    # The step counter is created here but not passed to apply_gradients.
    self.global_step = tf.Variable(self.init_step, trainable=False)

    self.summary = tf.summary.merge([
        tf.summary.scalar("loss", self.loss),
        tf.summary.scalar("acc", self.acc),
    ])
def cnn_model_fn(features, labels, mode, params):
    """Estimator model function: embedding -> conv1d -> max-pool -> dense.

    Args:
        features: Mapping whose ``"x"`` entry holds the token ids.
        labels: Labels, or ``None`` when predicting.
        mode: Estimator mode key.
        params: Mapping providing ``"embedding_initializer"``.

    Returns:
        The estimator spec created by the module-level ``head``.
    """
    print(features)
    # Map the ids into the embedding layer: [batch, sentence_len, embed_size]
    embedded = embed_sequence(
        ids=features["x"],
        vocab_size=vocab_size,
        embed_dim=embedding_size,
        initializer=params["embedding_initializer"])
    print(embedded.shape)

    # Dropout is only active in TRAIN mode.
    is_training = mode == estimator.estimator.ModeKeys.TRAIN
    embedded_dropped = tf.layers.dropout(inputs=embedded,
                                         rate=0.2,
                                         training=is_training)

    # [batch, sentence_len, filters]
    conv_out = tf.layers.conv1d(inputs=embedded_dropped,
                                filters=32,
                                kernel_size=3,
                                padding="same",
                                activation=tf.nn.relu)
    print(conv_out.shape)

    # Max over the time axis: [batch, filters]
    pooled = tf.reduce_max(input_tensor=conv_out, axis=1)
    dense_out = tf.layers.dense(inputs=pooled, units=250)
    dense_dropped = tf.layers.dropout(inputs=dense_out,
                                      rate=0.2,
                                      training=is_training)
    logits = tf.layers.dense(inputs=dense_dropped, units=1)

    # Labels are None when predicting; otherwise make them a column.
    labels = None if labels is None else tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _minimize(loss):
        return optimizer.minimize(loss=loss,
                                  global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features,
                                      labels=labels,
                                      mode=mode,
                                      logits=logits,
                                      train_op_fn=_minimize)
'dev_bleu': [], 'ig': [] } #################### model #################### tf.reset_default_graph() X = tf.placeholder(tf.int32, [None, None]) X_len = tf.placeholder(tf.int32, [None]) Y = tf.placeholder(tf.int32, [None, None]) Y_len = tf.placeholder(tf.int32, [None]) Y_mask = tf.placeholder(tf.float32, [None, None]) Star = tf.placeholder(tf.float32, [None, 5]) inputs_enc = layers.embed_sequence(X, vocab_size=vocab_dim, embed_dim=embedding_dim) outputs_enc = layers.embed_sequence(Y, vocab_size=vocab_dim, embed_dim=embedding_dim) cell_enc = tf.contrib.rnn.BasicLSTMCell(num_units=latent_dim) outputs_enc, state_enc = tf.nn.dynamic_rnn(cell=cell_enc, inputs=inputs_enc, sequence_length=X_len, dtype=tf.float32, scope='g1') cell_dec = tf.contrib.rnn.BasicLSTMCell(num_units=latent_dim // 2, state_is_tuple=False) g1 = tf.concat([state_enc.h, Star], axis=-1) latent = tf.layers.dense(g1, latent_dim) init = latent # tf.layers.dense(latent, latent_dim)
def _model(self, features, labels, mode, params):
    """Estimator model function for a question -> answer attention seq2seq.

    Args:
        features: Mapping with the keys ``'question_seq'`` and
            ``'answer_seq'`` holding token-id tensors.
        labels: Not used; targets are derived from ``answer_seq``.
        mode: Estimator mode key. In PREDICT mode the function returns
            before any loss or training op is built.
        params: Mapping providing ``"model_size"``, ``"num_layers"``,
            ``"keep_prob"``, ``"vocab_size"`` and ``"embedding_size"``.

    Returns:
        ``tf.estimator.EstimatorSpec``. In PREDICT mode its predictions are
        ``{"probs", "syms"}`` from the greedy decoder; otherwise it carries
        the sequence loss and the gradient-clipped Adam training op.
    """
    question_sequence = features['question_seq']
    answer_sequence = features['answer_seq']
    batch_size = tf.shape(question_sequence)[0]
    # NOTE(review): start_token is not read anywhere below.
    start_token = tf.ones([1], tf.int32)
    model_size = params["model_size"]
    num_layers = params["num_layers"]
    keep_prob = params["keep_prob"]
    vocab_size = params["vocab_size"]
    embedding_size = params["embedding_size"]
    # Sequence lengths: number of ids that are not <PAD>.
    question_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(question_sequence, self.vocabs["<PAD>"])), 1)
    answer_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(answer_sequence, self.vocabs["<PAD>"])), 1)
    # Questions and answers share one embedding table in scope 'embed'.
    question_embed = layers.embed_sequence(question_sequence,
                                           vocab_size=vocab_size,
                                           embed_dim=embedding_size,
                                           scope='embed')
    answer_embed = layers.embed_sequence(answer_sequence,
                                         vocab_size=vocab_size,
                                         embed_dim=embedding_size,
                                         scope='embed',
                                         reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')
    # Forward encoder: num_layers GRU cells, each wrapped in dropout.
    fcells = []
    for i in range(num_layers):
        c = tf.nn.rnn_cell.GRUCell(model_size)
        c = tf.nn.rnn_cell.DropoutWrapper(c,
                                          input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
        fcells.append(c)
    # I cant figure out how to use tuple version.
    fcell = tf.nn.rnn_cell.MultiRNNCell(fcells)
    # Backward encoder: a single GRU cell (the stacked variant is disabled).
    #bcells = []
    #for i in range(num_layers):
    #    c = tf.nn.rnn_cell.GRUCell(model_size)
    #    c = tf.nn.rnn_cell.DropoutWrapper(c, input_keep_prob=keep_prob,
    #                                      output_keep_prob=keep_prob)
    #    bcells.append(c)
    # I cant figure out how to use tuple version.
    #bcell = tf.nn.rnn_cell.MultiRNNCell(bcells)
    bcell = tf.contrib.rnn.GRUCell(num_units=model_size)
    #icell = tf.contrib.rnn.GRUCell(num_units=model_size)
    encoder_outputs, encoder_final_state = tf.nn.bidirectional_dynamic_rnn(
        fcell,
        bcell,
        question_embed,
        sequence_length=question_lengths,
        dtype=tf.float32)
    # helpers
    train_helper = tf.contrib.seq2seq.TrainingHelper(answer_embed,
                                                     answer_lengths,
                                                     time_major=False)
    start_tokens = tf.tile(tf.constant([self.vocabs['<START>']],
                                       dtype=tf.int32), [batch_size],
                           name='start_tokens')
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings,
        start_tokens=start_tokens,
        end_token=self.vocabs["<EOS>"])
    # rnn cell and dense layer
    # NOTE(review): this single GRU cell is replaced by the stacked cell
    # assigned a few lines below before it is ever used.
    cell = tf.contrib.rnn.GRUCell(num_units=model_size)
    cells = []
    for i in range(num_layers):
        c = tf.nn.rnn_cell.GRUCell(model_size)
        c = tf.nn.rnn_cell.DropoutWrapper(c,
                                          input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
        cells.append(c)
    # I cant figure out how to use tuple version.
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    projection_layer = Dense(
        units=vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))

    # Decoder of the seq2seq model. It attends over the forward encoder
    # outputs only (encoder_outputs[0]) and starts from a zero state.
    def decode(helper, scope, output_max_length, reuse=None):
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=model_size,
                memory=encoder_outputs[0],
                memory_sequence_length=question_lengths)
            #cell = tf.contrib.rnn.GRUCell(num_units=model_size)
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism, attention_layer_size=model_size)
            #out_cell = tf.contrib.rnn.OutputProjectionWrapper(
            #    attn_cell, vocab_size, reuse=reuse
            #)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attn_cell,
                helper=helper,
                initial_state=attn_cell.zero_state(dtype=tf.float32,
                                                   batch_size=batch_size),
                #initial_state=encoder_final_state,
                output_layer=projection_layer)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=output_max_length)
            return outputs[0]

    # Both decoders share the variables of scope 'decode'; they differ in
    # the helper and in the iteration cap (3000 vs. 300 steps).
    train_outputs = decode(train_helper, 'decode', 3000)
    pred_outputs = decode(pred_helper, 'decode', 300, reuse=True)
    # Targets are the answers shifted left by one step.
    targets = answer_sequence[:, 1:]
    probs = tf.nn.softmax(pred_outputs.rnn_output, name="probs")
    # in case in prediction mode return
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions={
                                              "probs": probs,
                                              "syms": pred_outputs.sample_id
                                          })
    # mask the PADs
    mask = tf.to_float(
        tf.not_equal(answer_sequence[:, :-1], self.vocabs["<PAD>"]))
    #tf.identity(mask[0], name='mask')
    #tf.identity(targets[0], name='targets')
    #tf.identity(train_outputs.rnn_output[0,output_lengths[0]-2:output_lengths[0],:], name='rnn_out')
    # Loss function
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output[:, :-1, :], targets, mask)
    tf.summary.scalar("loss", loss)
    # Optimizer
    # NOTE(review): this variable is immediately shadowed by the decayed
    # learning-rate tensor below; it still ends up in the graph.
    learning_rate = tf.Variable(0.0, trainable=False)
    initial_learning_rate = tf.constant(0.001)
    learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                               tf.train.get_global_step(),
                                               100, 0.99)
    tf.summary.scalar("learning_rate", learning_rate)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 5.0)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Visualise gradients
    # Missing gradients (None) are replaced by 0 for the histograms only;
    # the summary name is built from the tensor's string representation.
    vis_grads = [0 if i is None else i for i in grads]
    for g in vis_grads:
        tf.summary.histogram("gradients_" + str(g), g)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=tf.train.get_global_step())
    # Named identity ops so the tensors can be looked up by name.
    tf.identity(question_sequence[0], name="train_input")
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    tf.identity(pred_outputs.sample_id[0], name='predictions')
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=None,
                                      loss=loss,
                                      train_op=train_op)
def seq2seq_model_fn(self, mode, features, labels, params):
    """Estimator model function for a GRU seq2seq model.

    Attention is enabled by ``self.use_attn``; prediction uses beam search
    when ``self.use_beam_search`` is set and greedy decoding otherwise.

    Args:
        mode: Estimator mode, forwarded unchanged to the returned spec.
        features: Mapping with the keys ``'input'`` and ``'output'`` holding
            the source and target token-id tensors.
        labels: Unused; the targets are read from ``features['output']``.
        params: Mapping providing ``'src_vocab_size'``, ``'tar_vocab_size'``,
            ``'embed_dim'`` and ``'rnn_size'``, and optionally
            ``'optimizer'`` (default ``'Adam'``) and ``'learning_rate'``
            (default ``0.001``).

    Returns:
        ``tf.estimator.EstimatorSpec`` with the sequence loss of the
        teacher-forced decoder, its training op, and as predictions either
        the greedy sample ids or the first beam of the beam-search ids.
        The loss is also stored on ``self.loss``.
    """
    src_vocab_size = params['src_vocab_size']
    tar_vocab_size = params['tar_vocab_size']
    embed_dim = params['embed_dim']
    rnn_size = params['rnn_size']

    inp = features['input']
    output = features['output']
    batch_size = tf.shape(inp)[0]

    # Id 0 is prepended as the start token; lengths count the ids that
    # differ from ``self.STOP_ID``.
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
    input_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(inp, self.STOP_ID)), 1)
    output_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(train_output, self.STOP_ID)), 1)

    # Source and target use separate embedding tables.
    input_embed = layers.embed_sequence(
        inp, vocab_size=src_vocab_size, embed_dim=embed_dim,
        scope='src_embed')
    output_embed = layers.embed_sequence(
        train_output, vocab_size=tar_vocab_size, embed_dim=embed_dim,
        scope='tar_embed')
    with tf.variable_scope('tar_embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    cell = tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.GRUCell(num_units=rnn_size),
        input_keep_prob=self.keep_prob,
        output_keep_prob=self.keep_prob)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        cell, input_embed, dtype=tf.float32)

    train_helper = tf.contrib.seq2seq.TrainingHelper(
        output_embed, output_lengths)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens),
        end_token=self.STOP_ID)

    def decode(helper, scope, train=True, reuse=None):
        """Build a decoder in ``scope``.

        ``train=True`` builds a helper-driven ``BasicDecoder``;
        ``train=False`` builds a ``BeamSearchDecoder`` on beam-tiled
        encoder tensors and ignores ``helper``.
        """
        with tf.variable_scope(scope, reuse=reuse):
            cell = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(num_units=rnn_size),
                input_keep_prob=self.keep_prob,
                output_keep_prob=self.keep_prob)
            if train:
                if self.use_attn:
                    attention_mechanism = (
                        tf.contrib.seq2seq.BahdanauAttention(
                            num_units=rnn_size, memory=encoder_outputs,
                            memory_sequence_length=input_lengths))
                    # Floor division: the attention layer size must be an
                    # int, and ``rnn_size / 2`` is a float on Python 3.
                    cell = tf.contrib.seq2seq.AttentionWrapper(
                        cell, attention_mechanism,
                        attention_layer_size=rnn_size // 2)
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    cell, tar_vocab_size, reuse=reuse)
                if self.use_attn:
                    # The attention wrapper has its own state structure;
                    # start from zeros and plug in the encoder state.
                    decoder_initial_state = out_cell.zero_state(
                        dtype=tf.float32, batch_size=batch_size)
                    decoder_initial_state = decoder_initial_state.clone(
                        cell_state=encoder_final_state)
                else:
                    decoder_initial_state = encoder_final_state
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell, helper=helper,
                    initial_state=decoder_initial_state)
            else:
                # Beam search works on tensors tiled ``beam_width`` times.
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                    encoder_final_state, multiplier=self.beam_width)
                if self.use_attn:
                    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                        encoder_outputs, multiplier=self.beam_width)
                    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                        input_lengths, multiplier=self.beam_width)
                    attention_mechanism = (
                        tf.contrib.seq2seq.BahdanauAttention(
                            num_units=rnn_size,
                            memory=tiled_encoder_outputs,
                            memory_sequence_length=tiled_sequence_length))
                    cell = tf.contrib.seq2seq.AttentionWrapper(
                        cell, attention_mechanism,
                        attention_layer_size=rnn_size // 2)
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    cell, tar_vocab_size, reuse=reuse)
                if self.use_attn:
                    decoder_initial_state = out_cell.zero_state(
                        dtype=tf.float32,
                        batch_size=batch_size * self.beam_width)
                    decoder_initial_state = decoder_initial_state.clone(
                        cell_state=tiled_encoder_final_state)
                else:
                    decoder_initial_state = tiled_encoder_final_state
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=out_cell, embedding=embeddings,
                    start_tokens=tf.to_int32(start_tokens),
                    end_token=self.STOP_ID,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_width)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                impute_finished=train or not self.use_beam_search,
                maximum_iterations=2 * tf.reduce_max(output_lengths))
            return outputs[0]

    train_outputs = decode(train_helper, 'decode')
    # Greedy prediction goes through the helper-driven path (train=True);
    # beam search goes through the tiled path (train=False).
    pred_outputs = decode(pred_helper, 'decode',
                          train=not self.use_beam_search, reuse=True)

    # Loss and training op are the same for both prediction strategies.
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    weights = tf.to_float(
        tf.not_equal(train_output[:, :-1], self.STOP_ID))
    self.loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output, weights=weights)
    train_op = layers.optimize_loss(
        self.loss, tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])

    if self.use_beam_search:
        tf.identity(pred_outputs.predicted_ids[0], name='predictions')
        # Keep only the first beam of every example.
        predictions = pred_outputs.predicted_ids[:, :, 0]
    else:
        tf.identity(pred_outputs.sample_id[0], name='predictions')
        predictions = pred_outputs.sample_id

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=self.loss,
        train_op=train_op)
def __init__(self, is_testing):
    """Build the bAbI graph, open a session and start the input pipeline.

    Args:
        is_testing: when truthy every tower reads its inputs straight from
            the placeholders; otherwise the inputs are dequeued from the
            train or validation FIFO queue, selected at run time by
            ``self.is_training_ph``.

    Side effects visible in this method: loads and encodes the data set,
    creates a ``tf.Session``, initialises all variables, opens two
    TensorBoard writers under ``/tmp/tensorboard/bAbI``, starts daemon
    data-loader processes and queue-feeding threads, and blocks until both
    TensorFlow queues report ``self.qsize`` elements.

    The method reads ``self.qsize``, ``self.devices``, ``self.emb_size``,
    ``self.num_facts``, ``self.n_hidden``, ``self.n_steps``,
    ``self.revision`` and ``self.name`` without assigning them, so they must
    already be defined (presumably as class attributes -- TODO confirm).
    """
    super().__init__()
    self.is_testing = is_testing
    print("Preparing data...")
    self.train, self.valid, self.test, self.vocab = self.encode_data(
        bAbI('en-valid-10k'))
    print("Creating graph...")
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        regularizer = layers.l2_regularizer(1e-4)
        self.session = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True))
        self.global_step = tf.Variable(initial_value=0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

        # Feed points. The trailing shape notes are the original author's.
        self.facts_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs*#facts, seq)
        self.facts_pos_ph = tf.placeholder(tf.int32, shape=(None, ))  # (bs*#facts, )
        self.question_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs, seq)
        self.answers_ph = tf.placeholder(tf.int32, shape=(None, ))  # (bs, )
        self.edge_indices_ph = tf.placeholder(tf.int32, shape=(None, 2))
        self.fact_segments_ph = tf.placeholder(tf.int32, shape=(None, ))
        self.edge_segments_ph = tf.placeholder(tf.int32, shape=(None, ))
        self.q_seq_length_ph = tf.placeholder(tf.int32, shape=(None, ))
        self.f_seq_length_ph = tf.placeholder(tf.int32, shape=(None, ))
        self.task_indices_ph = tf.placeholder(tf.int32, shape=(None, ))
        self.edge_keep_prob_ph = tf.placeholder(tf.float32, shape=())
        self.is_training_ph = tf.placeholder(tf.bool)

        # Order matters: the same order is used when unpacking a dequeued
        # element and when restoring static shapes further down.
        placeholders = [
            self.facts_ph, self.facts_pos_ph, self.question_ph,
            self.answers_ph, self.edge_indices_ph, self.fact_segments_ph,
            self.edge_segments_ph, self.q_seq_length_ph,
            self.f_seq_length_ph, self.task_indices_ph,
            self.edge_keep_prob_ph
        ]

        # One queue per split; each element is the full placeholder tuple.
        self.train_queue = tf.FIFOQueue(self.qsize,
                                        [ph.dtype for ph in placeholders],
                                        name='train-queue')
        self.val_queue = tf.FIFOQueue(self.qsize,
                                      [ph.dtype for ph in placeholders],
                                      name='val-queue')
        self.train_enqueue_op = self.train_queue.enqueue(placeholders)
        self.train_qsize_op = self.train_queue.size()
        tf.summary.scalar('queues/train', self.train_qsize_op)
        self.val_enqueue_op = self.val_queue.enqueue(placeholders)
        self.val_qsize_op = self.val_queue.size()
        tf.summary.scalar('queues/val', self.val_qsize_op)

        def avg_n(x):
            # Element-wise mean over a list of same-shaped tensors.
            return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

        towers = []
        with tf.variable_scope(tf.get_variable_scope()):
            for device_nr, device in enumerate(self.devices):
                with tf.device('/cpu:0'):
                    if self.is_testing:
                        facts_ph, facts_pos_ph, question_ph, answers_ph, \
                            edge_indices_ph, fact_segments_ph, \
                            edge_segments_ph, q_seq_length_ph, \
                            f_seq_length_ph, task_indices_ph, \
                            edge_keep_prob = placeholders
                    else:
                        facts_ph, facts_pos_ph, question_ph, answers_ph, \
                            edge_indices_ph, fact_segments_ph, \
                            edge_segments_ph, q_seq_length_ph, \
                            f_seq_length_ph, task_indices_ph, \
                            edge_keep_prob = tf.cond(
                                self.is_training_ph,
                                true_fn=lambda: self.train_queue.dequeue(),
                                false_fn=lambda: self.val_queue.dequeue(),
                            )
                    # NOTE: `vars` shadows the builtin of the same name
                    # for the rest of this method.
                    vars = (facts_ph, facts_pos_ph, question_ph, answers_ph,
                            edge_indices_ph, fact_segments_ph,
                            edge_segments_ph, q_seq_length_ph,
                            f_seq_length_ph, task_indices_ph, edge_keep_prob)
                    # Copy the placeholders' static shapes onto the tensors
                    # actually used by the tower.
                    for v, ph in zip(vars, placeholders):
                        v.set_shape(ph.get_shape())
                    # Facts and questions share one embedding table.
                    facts_emb = layers.embed_sequence(
                        facts_ph,
                        self.vocab.size(),
                        self.emb_size,
                        scope='word-embeddings')
                    questions_emb = layers.embed_sequence(
                        question_ph,
                        self.vocab.size(),
                        self.emb_size,
                        scope='word-embeddings',
                        reuse=True)
                with tf.device(device), tf.name_scope("device-%s" % device_nr):

                    def mlp(x, scope, n_hidden):
                        # Three fully connected layers with the library's
                        # default activation, then one linear layer, all
                        # L2-regularised and `n_hidden` wide.
                        with tf.variable_scope(scope):
                            for i in range(3):
                                x = layers.fully_connected(
                                    x,
                                    n_hidden,
                                    weights_regularizer=regularizer)
                            return layers.fully_connected(
                                x,
                                n_hidden,
                                weights_regularizer=regularizer,
                                activation_fn=None)

                    # Keep only the second element of the final LSTM state
                    # tuple as the fact encoding.
                    _, (_, f_encoding) = tf.nn.dynamic_rnn(
                        tf.nn.rnn_cell.LSTMCell(32),
                        facts_emb,
                        dtype=tf.float32,
                        sequence_length=f_seq_length_ph,
                        scope='fact-encoder')
                    # One random offset per answer, broadcast to that
                    # answer's facts through `fact_segments_ph`, is added to
                    # the fact positions before one-hot encoding.
                    random_pos_offsets = tf.random_uniform(
                        tf.shape(answers_ph),
                        minval=0,
                        maxval=self.num_facts,
                        dtype=tf.int32)
                    fact_pos = facts_pos_ph + tf.gather(
                        random_pos_offsets, fact_segments_ph)
                    facts_pos_encoding = tf.one_hot(
                        fact_pos, 2 * self.num_facts)
                    f_encoding = tf.concat(
                        [f_encoding, facts_pos_encoding], axis=1)
                    _, (_, q_encoding) = tf.nn.dynamic_rnn(
                        tf.nn.rnn_cell.LSTMCell(32),
                        questions_emb,
                        dtype=tf.float32,
                        sequence_length=q_seq_length_ph,
                        scope='question-encoder')

                    def graph_fn(x):
                        # Read-out head: two hidden layers, then linear
                        # logits with one unit per vocabulary entry.
                        with tf.variable_scope('graph-fn'):
                            x = layers.fully_connected(
                                x,
                                self.n_hidden,
                                weights_regularizer=regularizer)
                            x = layers.fully_connected(
                                x,
                                self.n_hidden,
                                weights_regularizer=regularizer)
                            return layers.fully_connected(
                                x,
                                self.vocab.size(),
                                activation_fn=None,
                                weights_regularizer=regularizer)

                    # Initial node state: fact encoding joined with the
                    # encoding of the question that fact belongs to.
                    x = tf.concat([
                        f_encoding,
                        tf.gather(q_encoding, fact_segments_ph)
                    ], 1)
                    x0 = mlp(x, 'pre', self.n_hidden)
                    edge_features = tf.gather(q_encoding, edge_segments_ph)
                    x = x0
                    outputs = []
                    log_losses = []
                    with tf.variable_scope('steps'):
                        lstm_cell = LSTMCell(self.n_hidden)
                        state = lstm_cell.zero_state(
                            tf.shape(x)[0], tf.float32)
                        for step in range(self.n_steps):
                            x = message_passing(
                                x, edge_indices_ph, edge_features,
                                lambda x: mlp(x, 'message-fn', self.n_hidden),
                                edge_keep_prob)
                            x = mlp(tf.concat([x, x0], axis=1), 'post-fn',
                                    self.n_hidden)
                            x, state = lstm_cell(x, state)
                            with tf.variable_scope('graph-sum'):
                                # Sum node states per segment id, then
                                # predict and score this step.
                                graph_sum = tf.segment_sum(
                                    x, fact_segments_ph)
                                out = graph_fn(graph_sum)
                                outputs.append(out)
                                log_losses.append(
                                    tf.reduce_mean(
                                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                                            labels=answers_ph, logits=out)))
                            # Share all step variables across iterations.
                            tf.get_variable_scope().reuse_variables()
                    reg_loss = sum(
                        tf.get_collection(
                            tf.GraphKeys.REGULARIZATION_LOSSES))
                    loss = avg_n(log_losses) + reg_loss
                    towers.append({
                        'loss': loss,
                        'grads': self.optimizer.compute_gradients(loss),
                        'log_losses': tf.stack(log_losses),  # one scalar per step
                        'answers': answers_ph,  # same static shape as self.answers_ph
                        'outputs': tf.stack(outputs),  # stacked per-step logits, step axis first
                        'task_indices': task_indices_ph  # same static shape as self.task_indices_ph
                    })
                    # Later towers reuse the first tower's variables.
                    tf.get_variable_scope().reuse_variables()

        # Combine the towers: losses are averaged, per-example tensors are
        # concatenated (axis 1 for outputs because axis 0 is the step axis).
        self.loss = avg_n([t['loss'] for t in towers])
        self.out = tf.concat([t['outputs'] for t in towers], axis=1)
        self.answers = tf.concat([t['answers'] for t in towers], axis=0)
        self.task_indices = tf.concat([t['task_indices'] for t in towers],
                                      axis=0)
        tf.summary.scalar('losses/total', self.loss)
        # `reg_loss` here is the value left over from the last tower.
        tf.summary.scalar('losses/reg', reg_loss)
        log_losses = avg_n([t['log_losses'] for t in towers])
        for i in range(self.n_steps):
            tf.summary.scalar('steps/%d/losses/log' % i, log_losses[i])
        avg_gradients = util.average_gradients(
            [t['grads'] for t in towers])
        self.train_step = self.optimizer.apply_gradients(
            avg_gradients, global_step=self.global_step)
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        util.print_vars(tf.trainable_variables())
        self.train_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/bAbI/%s/train/%s' % (self.revision, self.name),
            self.session.graph)
        self.test_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/bAbI/%s/test/%s' % (self.revision, self.name),
            self.session.graph)
        self.summaries = tf.summary.merge_all()

    print("Starting data loaders...")
    # Multiprocessing queues carry batches from the loader processes to the
    # feeder threads, which push them into the TensorFlow queues.
    train_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
    val_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
    data_loader_processes = [
        mp.Process(target=self.data_loader, args=(train_mp_queue, True))
        for i in range(4)
    ]
    val_data_loader_processes = [
        mp.Process(target=self.data_loader, args=(val_mp_queue, False))
        for i in range(1)
    ]
    for p in data_loader_processes + val_data_loader_processes:
        p.daemon = True  # do not keep the interpreter alive on exit
        p.start()
    queue_putter_threads = [
        threading.Thread(target=self.queue_putter,
                         args=(train_mp_queue, self.train_enqueue_op,
                               'train', 1000)),
        threading.Thread(target=self.queue_putter,
                         args=(val_mp_queue, self.val_enqueue_op, 'val', 1)),
    ]
    for t in queue_putter_threads:
        t.daemon = True
        t.start()
    train_qsize, val_qsize = 0, 0
    print("Waiting for queue to fill...")
    # Poll once per second until both TensorFlow queues are full. This runs
    # regardless of `is_testing`.
    while train_qsize < self.qsize or val_qsize < self.qsize:
        train_qsize = self.session.run(self.train_qsize_op)
        val_qsize = self.session.run(self.val_qsize_op)
        print('train_qsize: %d, val_qsize: %d' % (train_qsize, val_qsize),
              flush=True)
        time.sleep(1)
# will use this to set the weights for every category in every methodology initial_emb_weights = [ np.random.rand(n, embedding_dim) for n in n_cat_by_feature ] # the actual features features = [ tf.placeholder(shape=[H, W], dtype="int32", name="feat%d" % i) for i, _ in enumerate(n_cat_by_feature) ] # 1.1) embed on channel -> concat on channel embedded1 = [] for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights): e = layers.embed_sequence(f, vocab_size=n, embed_dim=embedding_dim, initializer=tf.constant_initializer(w)) embedded1.append(e) out11 = tf.concat(embedded1, axis=2) # 1.2) onehot on channel -> 1x1 conv separately -> concat on channel embedded2 = [] for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights): one_hot = layers.one_hot_encoding(f, num_classes=n) conv_out = layers.conv2d(inputs=one_hot, num_outputs=embedding_dim, weights_initializer=tf.constant_initializer(w), kernel_size=1, stride=1)
def build_model(self):
    """Create the policy/value network, its loss, train op and summaries.

    Builds, in order: input placeholders, the unit-type embedding and
    one-hot player-relative features, the screen and minimap conv stacks
    (via ``self._build_convs``), a ConvLSTM over their concatenation, a
    1x1-conv spatial policy head, and fully connected heads for the action
    id distribution and the value estimate. It then defines the combined
    policy / value / entropy loss, the training op, scalar summaries, the
    variable initialiser and a saver.

    Results are exposed as attributes on ``self``; nothing is returned.
    """
    self.placeholders = _get_placeholders(self.spatial_dim)
    with tf.variable_scope("theta"):
        units_embedded = layers.embed_sequence(
            self.placeholders.screen_unit_type,
            vocab_size=SCREEN_FEATURES.unit_type.scale,
            embed_dim=self.unit_type_emb_dim,
            scope="unit_type_emb",
            trainable=self.trainable
        )
        # Let's not one-hot zero which is background
        player_relative_screen_one_hot = layers.one_hot_encoding(
            self.placeholders.player_relative_screen,
            num_classes=SCREEN_FEATURES.player_relative.scale
        )[:, :, :, 1:]
        player_relative_minimap_one_hot = layers.one_hot_encoding(
            self.placeholders.player_relative_minimap,
            num_classes=MINIMAP_FEATURES.player_relative.scale
        )[:, :, :, 1:]
        channel_axis = 3
        screen_numeric_all = tf.concat(
            [self.placeholders.screen_numeric, units_embedded,
             player_relative_screen_one_hot],
            axis=channel_axis
        )
        minimap_numeric_all = tf.concat(
            [self.placeholders.minimap_numeric,
             player_relative_minimap_one_hot],
            axis=channel_axis
        )
        # BUILD CONVNNs
        screen_output = self._build_convs(screen_numeric_all, "screen_network")
        minimap_output = self._build_convs(minimap_numeric_all, "minimap_network")
        # State representation (last layer before separation as described in the paper)
        self.map_output = tf.concat([screen_output, minimap_output], axis=channel_axis)
        # BUILD CONVLSTM
        # The whole batch is treated as ONE sequence: batch dimension 1,
        # time dimension inferred from the number of input rows.
        self.rnn_in = tf.reshape(self.map_output, [1, -1, 32, 32, 64])
        # NOTE(review): `rnn_in` has 64 channels while `input_shape`
        # declares 1 -- confirm this is intended.
        self.cell = tf.contrib.rnn.Conv2DLSTMCell(input_shape=[32, 32, 1],  # input dims
                                                  kernel_shape=[3, 3],  # for a 3 by 3 conv
                                                  output_channels=64)  # number of feature maps
        # All-zero initial (c, h) state that callers can feed at the start.
        c_init = np.zeros((1, 32, 32, 64), np.float32)
        h_init = np.zeros((1, 32, 32, 64), np.float32)
        self.state_init = [c_init, h_init]
        step_size = tf.shape(self.map_output)[:1]  # Get step_size from input dimensions
        c_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
        h_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
        self.state_in = (c_in, h_in)
        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
        # NOTE(review): this placeholder is not consumed anywhere in this
        # method; `dynamic_rnn` below uses the local `step_size` tensor.
        self.step_size = tf.placeholder(tf.float32, [1])
        (self.outputs, self.state) = tf.nn.dynamic_rnn(self.cell, self.rnn_in,
                                                       initial_state=state_in,
                                                       sequence_length=step_size,
                                                       time_major=False,
                                                       dtype=tf.float32)
        lstm_c, lstm_h = self.state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        # Fold the time axis back into the batch axis for the heads.
        rnn_out = tf.reshape(self.outputs, [-1, 32, 32, 64])
        # 1x1 conv layer to generate our spatial policy
        self.spatial_action_logits = layers.conv2d(
            rnn_out,
            data_format="NHWC",
            num_outputs=1,
            kernel_size=1,
            stride=1,
            activation_fn=None,
            scope='spatial_action',
            trainable=self.trainable
        )
        spatial_action_probs = tf.nn.softmax(layers.flatten(self.spatial_action_logits))
        map_output_flat = tf.reshape(self.outputs, [-1, 65536])  # (32*32*64)
        # fully connected layer for Value predictions and action_id
        self.fc1 = layers.fully_connected(
            map_output_flat,
            num_outputs=256,
            activation_fn=tf.nn.relu,
            scope="fc1",
            trainable=self.trainable
        )
        # fc/action_id
        action_id_probs = layers.fully_connected(
            self.fc1,
            num_outputs=len(actions.FUNCTIONS),
            activation_fn=tf.nn.softmax,
            scope="action_id",
            trainable=self.trainable
        )
        # fc/value
        self.value_estimate = tf.squeeze(layers.fully_connected(
            self.fc1,
            num_outputs=1,
            activation_fn=None,
            scope='value',
            trainable=self.trainable
        ), axis=1)
        # Disregard non-allowed actions by setting their probability to zero
        # and re-normalizing to 1 (this is the action mask).
        action_id_probs *= self.placeholders.available_action_ids
        action_id_probs /= tf.reduce_sum(action_id_probs, axis=1, keepdims=True)

        def logclip(x):
            # log() with the argument clipped to [1e-12, 1] so that a zero
            # probability cannot produce -inf.
            return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

        # Zero the spatial log-probs where no spatial action is available.
        spatial_action_log_probs = (
            logclip(spatial_action_probs)
            * tf.expand_dims(self.placeholders.is_spatial_action_available, axis=1)
        )
        # Non-available actions get log(1e-12) (the clip floor), but that is
        # fine because the value is never used.
        action_id_log_probs = logclip(action_id_probs)

    self.value_estimate = self.value_estimate  # no-op self-assignment
    self.action_id_probs = action_id_probs
    self.spatial_action_probs = spatial_action_probs
    self.action_id_log_probs = action_id_log_probs
    self.spatial_action_log_probs = spatial_action_log_probs
    selected_spatial_action_flat = ravel_index_pairs(
        self.placeholders.selected_spatial_action, self.spatial_dim
    )
    selected_log_probs = self._get_select_action_probs(selected_spatial_action_flat)
    # maximum is to avoid 0 / 0 because this is used to calculate some means
    sum_spatial_action_available = tf.maximum(
        1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
    )
    # Negative entropies (sum of p * log p); minimising them in the loss
    # increases the entropy of the policy.
    neg_entropy_spatial = tf.reduce_sum(
        self.spatial_action_probs * self.spatial_action_log_probs
    ) / sum_spatial_action_available
    neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
        self.action_id_probs * self.action_id_log_probs, axis=1
    ))
    # Sample actions from the distributions defined by the policy network theta
    self.sampled_action_id = weighted_random_sample(self.action_id_probs)
    self.sampled_spatial_action = weighted_random_sample(self.spatial_action_probs)
    self.value_estimate = self.value_estimate  # no-op self-assignment
    policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)
    value_loss = tf.losses.mean_squared_error(
        self.placeholders.value_target, self.value_estimate)
    loss = (
        policy_loss
        + value_loss * self.loss_value_weight
        + neg_entropy_spatial * self.entropy_weight_spatial
        + neg_entropy_action_id * self.entropy_weight_action_id
    )
    # learning_rate=None: the rate is taken from `self.optimiser` itself.
    self.train_op = layers.optimize_loss(
        loss=loss,
        global_step=tf.train.get_global_step(),
        optimizer=self.optimiser,
        clip_gradients=self.max_gradient_norm,
        summaries=OPTIMIZER_SUMMARIES,
        learning_rate=None,
        name="train_op"
    )
    self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
    self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
    self._scalar_summary("action/is_spatial_action_available",
                         tf.reduce_mean(self.placeholders.is_spatial_action_available))
    self._scalar_summary("action/selected_id_log_prob",
                         tf.reduce_mean(selected_log_probs.action_id))
    self._scalar_summary("loss/policy", policy_loss)
    self._scalar_summary("loss/value", value_loss)
    self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
    self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
    self._scalar_summary("loss/total", loss)
    self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
    self._scalar_summary("action/selected_total_log_prob",
                         tf.reduce_mean(selected_log_probs.total))
    self._scalar_summary("action/selected_spatial_log_prob",
                         tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)
    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(max_to_keep=2)
    self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
    self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
def seq2seq(mode, features, labels, params):
    """Estimator model function for a GRU encoder / attention decoder.

    Args:
        mode: a ``tf.estimator.ModeKeys`` value; only forwarded to the
            returned ``EstimatorSpec``.
        features: dict with integer id tensors under ``'input'`` and
            ``'output'``.
        labels: unused.
        params: dict-like with ``'vocab_size'``, ``'embed_dim'``,
            ``'num_units'`` and ``'output_max_length'``; optional
            ``'optimizer'`` (default ``'Adam'``) and ``'learning_rate'``
            (default ``0.001``) are read with ``params.get``.

    Returns:
        ``tf.estimator.EstimatorSpec`` whose predictions are the greedy
        decoder's sample ids, together with the training loss and train op.

    Token id ``1`` is excluded when computing sequence lengths and loss
    weights, and id ``8`` stops greedy decoding (presumably the padding and
    end-of-sequence ids -- TODO confirm against the vocabulary).
    """
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    output_max_length = params['output_max_length']

    inp = features['input']
    output_tensor = features['output']
    batch_size = tf.shape(inp)[0]

    # Decoder input = GO token followed by the target sequence.
    start_tokens = tf.zeros([batch_size], dtype=tf.int64) + GO_TOKEN
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output_tensor], 1)
    input_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(inp, 1)), 1)
    output_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(train_output, 1)), 1)

    # Encoder and decoder share a single embedding matrix.
    input_embed = layers.embed_sequence(
        inp, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    output_embed = layers.embed_sequence(
        train_output, vocab_size=vocab_size, embed_dim=embed_dim,
        scope='embed', reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    cell = tf.contrib.rnn.GRUCell(num_units=num_units)
    # The encoder's final state is not used: the decoder starts from zeros.
    encoder_outputs, _ = tf.nn.dynamic_rnn(cell, input_embed, dtype=tf.float32)

    train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed, output_lengths)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=8)

    def decode(helper, scope, reuse=None):
        """Run the attention decoder driven by `helper`; return its outputs."""
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=num_units, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
            cell = tf.contrib.rnn.GRUCell(num_units=num_units)
            # Floor division keeps the layer size an int under Python 3,
            # where `/` would yield a float.
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism,
                attention_layer_size=num_units // 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attn_cell, vocab_size, reuse=reuse)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell, helper=helper,
                initial_state=out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size))
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True, maximum_iterations=output_max_length)
            return outputs[0]

    # Both decoders live in scope 'decode' and share their weights.
    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    # Named tensors so logging hooks can look them up.
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output_tensor, weights=weights)
    train_op = layers.optimize_loss(
        loss, tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])
    tf.identity(pred_outputs.sample_id[0], name='predictions')

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_outputs.sample_id,
        loss=loss,
        train_op=train_op
    )
def seq2seq(features, labels, mode, params):
    """Estimator model function: bidirectional encoder + pluggable decoder.

    Args:
        features: dict with integer id tensor ``'input'`` and, outside of
            PREDICT mode, ``'output'``.
        labels: unused.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict-like with ``'vocab_size'``, ``'embed_dim'``,
            ``'num_units'``, ``'output_max_length'``, ``'dropout'``,
            ``'attention_mechanism_name'``, ``'cell_type'`` (``'GRU'`` or
            ``'LSTM'``, case-insensitive) and ``'beam_width'``; optional
            ``'optimizer'`` and ``'learning_rate'``.

    Returns:
        ``tf.estimator.EstimatorSpec``. In PREDICT mode it carries only
        predictions (``predicted_ids`` when ``beam_width > 0``, otherwise
        ``sample_id``); in the other modes it also carries loss and train op.

    Raises:
        ValueError: if ``cell_type`` is neither GRU nor LSTM.

    Encoder input dropout is applied only in TRAIN mode, so evaluation and
    prediction are deterministic.
    """
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    output_max_length = params['output_max_length']
    dropout = params['dropout']
    attention_mechanism_name = params['attention_mechanism_name']
    cell_type = params['cell_type']
    beam_width = params['beam_width']

    inp = features['input']
    batch_size = tf.shape(inp)[0]
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    # Token id 1 is excluded from the length count (presumably padding --
    # TODO confirm).
    input_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(inp, 1)), 1)

    input_embed = layers.embed_sequence(inp,
                                        vocab_size=vocab_size,
                                        embed_dim=embed_dim,
                                        scope='embed')
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    cell_kind = cell_type.upper()
    if cell_kind == 'GRU':
        fw_cell = tf.contrib.rnn.GRUCell(num_units=num_units)
        bw_cell = tf.contrib.rnn.GRUCell(num_units=num_units)
    elif cell_kind == 'LSTM':
        fw_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units)
        bw_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units)
    else:
        raise ValueError("The Memory Cell unit %s provided is not valid " %
                         cell_type)

    # Dropout is a training-time regulariser; leaving it on in EVAL/PREDICT
    # would make metrics and predictions stochastic.
    if dropout > 0.0 and mode == tf.estimator.ModeKeys.TRAIN:
        print(" %s, dropout=%g " % (type(fw_cell).__name__, dropout))
        fw_cell = tf.contrib.rnn.DropoutWrapper(
            cell=fw_cell, input_keep_prob=(1.0 - dropout))
        bw_cell = tf.contrib.rnn.DropoutWrapper(
            cell=bw_cell, input_keep_prob=(1.0 - dropout))

    bd_encoder_outputs, bd_encoder_final_state = \
        tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                        cell_bw=bw_cell,
                                        inputs=input_embed,
                                        dtype=tf.float32)
    # Join forward and backward results along the last axis.
    encoder_outputs = tf.concat(bd_encoder_outputs, -1)
    encoder_final_state = tf.concat(bd_encoder_final_state, -1)

    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings,
        start_tokens=tf.to_int32(start_tokens),
        end_token=END_TOKEN)

    def run_decoder(helper, reuse):
        """Build one decoder in scope 'decode' with the shared settings."""
        return set_decoder.setting_decoder(
            helper, 'decode', num_units, encoder_outputs,
            encoder_final_state, input_lengths, vocab_size, batch_size,
            output_max_length, attention_mechanism_name, cell_type,
            embeddings, start_tokens, END_TOKEN, beam_width, reuse=reuse)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Specific for Prediction
        pred_outputs = run_decoder(pred_helper, reuse=False)
        if beam_width > 0:
            tf.identity(pred_outputs.predicted_ids, name='predictions')
            return tf.estimator.EstimatorSpec(
                mode=mode, predictions=pred_outputs.predicted_ids)
        tf.identity(pred_outputs.sample_id[0], name='predictions')
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions=pred_outputs.sample_id)

    # Specific For Training
    output = features['output']
    # Decoder input = start token followed by the target sequence.
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
    output_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(train_output, 1)), 1)
    output_embed = layers.embed_sequence(train_output,
                                         vocab_size=vocab_size,
                                         embed_dim=embed_dim,
                                         scope='embed',
                                         reuse=True)
    train_helper = tf.contrib.seq2seq.TrainingHelper(
        output_embed, output_lengths)

    # The training decoder creates the variables; the greedy one reuses them.
    train_outputs = run_decoder(train_helper, reuse=None)
    pred_outputs = run_decoder(pred_helper, reuse=True)

    # Named tensors so logging hooks can look them up.
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(train_outputs.rnn_output,
                                            output,
                                            weights=weights)
    train_op = layers.optimize_loss(
        loss,
        tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])
    # NOTE(review): unlike the PREDICT branch, this branch reads `sample_id`
    # even when beam_width > 0 -- confirm `setting_decoder` provides it.
    tf.identity(pred_outputs.sample_id[0], name='predictions')
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=pred_outputs.sample_id,
                                      loss=loss,
                                      train_op=train_op)
def seq2seq(self, features, labels, params):
    """Build a seq2seq graph with an IndRNN/IndyLSTM encoder and attention.

    Args:
        features: integer id tensor of source sequences.
        labels: integer id tensor of target sequences.
        params: dict-like with ``'vocab_size'``, ``'embed_dim'``,
            ``'num_units'`` and ``'output_max_length'``; optional
            ``'optimizer'`` (default ``'Adam'``) and ``'learning_rate'``
            (default ``0.001``).

    Returns:
        Tuple ``(train_op, predicted_ids, loss)`` where ``predicted_ids`` is
        the greedy decoder's ``sample_id`` tensor.

    Reads ``self.START_TOKEN``, ``self.END_TOKEN`` and ``self.useScheduled``.
    """
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    output_max_length = params['output_max_length']

    print("获得输入张量的名字", features.name, labels.name)

    batch_size = tf.shape(features)[0]
    # One start token per batch row. (Original author's note: a tf.zeros
    # int32 vector of length batch_size can be used instead.)
    start_tokens = tf.tile([self.START_TOKEN], [batch_size])
    # Prepend the start token to the targets.
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), labels], 1)

    # Lengths count every token that differs from END_TOKEN.
    input_lengths = tf.reduce_sum(
        tf.cast(tf.not_equal(features, self.END_TOKEN), tf.int32),
        1, name="len")
    output_lengths = tf.reduce_sum(
        tf.cast(tf.not_equal(train_output, self.END_TOKEN), tf.int32),
        1, name="outlen")

    # Encoder and decoder share a single embedding matrix.
    input_embed = layers.embed_sequence(features,
                                        vocab_size=vocab_size,
                                        embed_dim=embed_dim,
                                        scope='embed')
    output_embed = layers.embed_sequence(train_output,
                                         vocab_size=vocab_size,
                                         embed_dim=embed_dim,
                                         scope='embed',
                                         reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    # Two-layer encoder, both layers pinned to GPU 0.
    Indcell = tf.nn.rnn_cell.DeviceWrapper(
        tf.contrib.rnn.IndRNNCell(num_units=num_units), "/device:GPU:0")
    IndyLSTM_cell = tf.nn.rnn_cell.DeviceWrapper(
        tf.contrib.rnn.IndyLSTMCell(num_units=num_units), "/device:GPU:0")
    multi_cell = tf.nn.rnn_cell.MultiRNNCell([Indcell, IndyLSTM_cell])
    # The encoder's final state is not used: the decoder starts from zeros.
    encoder_outputs, _ = tf.nn.dynamic_rnn(
        multi_cell, input_embed, sequence_length=input_lengths,
        dtype=tf.float32)

    # Training always decodes the full `output_max_length` steps.
    decode_lengths = tf.tile([output_max_length], [batch_size])
    if self.useScheduled:
        train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            output_embed, decode_lengths, embeddings, 0.3)
    else:
        train_helper = tf.contrib.seq2seq.TrainingHelper(
            output_embed, decode_lengths)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=start_tokens, end_token=self.END_TOKEN)

    def decode(helper, scope, reuse=None):
        """Run the attention decoder driven by `helper`; return its outputs."""
        with tf.variable_scope(scope, reuse=reuse):
            # Attention model over the encoder outputs.
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=num_units,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
            cell = tf.contrib.rnn.IndRNNCell(num_units=num_units)
            # `reuse is None` identifies the training decoder; only that one
            # gets output dropout.
            keep_prob = 0.8 if reuse is None else 1
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell, output_keep_prob=keep_prob)
            # Floor division keeps the layer size an int under Python 3,
            # where `/` would yield a float.
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism,
                attention_layer_size=num_units // 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell,
                                                              vocab_size,
                                                              reuse=reuse)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell,
                helper=helper,
                initial_state=out_cell.zero_state(dtype=tf.float32,
                                                  batch_size=batch_size))
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=output_max_length)
            return outputs[0]

    # Both decoders live in scope 'decode' and share their weights.
    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    # Loss mask: 1.0 for the first `output_lengths` positions of each row.
    masks = tf.sequence_mask(output_lengths,
                             output_max_length,
                             dtype=tf.float32,
                             name="masks")
    loss = tf.contrib.seq2seq.sequence_loss(train_outputs.rnn_output,
                                            labels,
                                            weights=masks)
    train_op = layers.optimize_loss(
        loss,
        tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])
    return train_op, pred_outputs.sample_id, loss
def fast_text_model_fn(self, features, labels, mode, params):
    """Estimator model function for a bag-of-embeddings text classifier.

    Args:
        features: dict holding a string tensor under ``self.FEATURE_COL``.
        labels: integer class ids (unused in PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: unused.

    Returns:
        ``tf.estimator.EstimatorSpec`` suited to ``mode``: predictions and
        export outputs for PREDICT, loss and train op for TRAIN, loss and
        metric ops for EVAL.

    Reads ``self.VOCAB_FILE``, ``self.PAD_WORD``, ``self.MAX_LEN``,
    ``self.VOCAB_LEN``, ``self.EMBED_DIM``, ``self.TARGET_SIZE`` and
    ``self.streaming_f1``.
    """
    vocab_table = lookup.index_table_from_file(
        vocabulary_file=self.VOCAB_FILE, num_oov_buckets=1, default_value=-1)

    # Text -> whitespace tokens -> vocabulary ids.
    text = features[self.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words,
                                            default_value=self.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    # Force every row to exactly MAX_LEN ids: append MAX_LEN padding columns,
    # then keep only the first MAX_LEN.
    padding = tf.constant([[0, 0], [0, self.MAX_LEN]])
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, self.MAX_LEN])

    # Keras Dropout layers follow the global learning phase.
    tf.keras.backend.set_learning_phase(
        mode == tf.estimator.ModeKeys.TRAIN)

    with tf.name_scope('embedding'):
        embedding_vectors = layers.embed_sequence(
            word_id_vector,
            vocab_size=self.VOCAB_LEN,
            embed_dim=self.EMBED_DIM,
            initializer=layers.xavier_initializer(seed=42))
        tf.logging.info('Word Vectors = {}'.format(embedding_vectors))

    with tf.name_scope('fast_text'):
        # NOTE(review): named "average" but computed as a sum over the
        # sequence axis (padding positions included); kept as is so trained
        # weights stay compatible.
        average_vectors = tf.reduce_sum(embedding_vectors, axis=1)
        tf.logging.info(
            'Average Word Vectors = {}'.format(average_vectors))

    with tf.name_scope('hidden_layer'):
        fc1 = tf.keras.layers.Dense(1024,
                                    activation='relu')(average_vectors)
        d1 = tf.keras.layers.Dropout(0.5)(fc1)
        # Floor division yields an int layer width; `/` would hand Keras a
        # float under Python 3.
        fc2 = tf.keras.layers.Dense(self.EMBED_DIM // 2,
                                    activation='relu')(d1)
        d2 = tf.keras.layers.Dropout(0.5)(fc2)
        tf.logging.info('Hidden Layer = {}'.format(d2))

    with tf.name_scope('output'):
        logits = tf.keras.layers.Dense(self.TARGET_SIZE,
                                       activation=None)(d2)
        tf.logging.info('Logits Layer = {}'.format(logits))

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)
    tf.summary.histogram('fasttext', average_vectors)
    tf.summary.histogram('softmax', probabilities)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': predicted_indices,
            'probabilities': probabilities
        }
        exported_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          export_outputs=exported_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)
    tf.summary.scalar('loss', loss)
    # Batch accuracy, for TensorBoard only.
    acc = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy':
            tf.metrics.accuracy(labels=labels,
                                predictions=predicted_indices),
            'precision':
            tf.metrics.precision(labels=labels,
                                 predictions=predicted_indices),
            'recall':
            tf.metrics.recall(labels=labels,
                              predictions=predicted_indices),
            'f1_score':
            self.streaming_f1(labels=labels,
                              predictions=predicted_indices,
                              n_classes=self.TARGET_SIZE)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def __init__(self, is_testing):
    """Build the sudoku message-passing graph, session and summaries.

    Args:
        is_testing: when truthy every tower reads from ``self.test``;
            otherwise the string placeholder ``self.mode`` selects between
            ``self.train`` (when it equals ``"train"``) and ``self.valid``.

    Side effects visible in this method: loads and encodes the data set,
    creates a ``tf.Session``, initialises all variables and opens two
    TensorBoard writers below ``self.tensorboard_dir``.

    Raises:
        ValueError: if an edge returned by ``self.sudoku_edges()`` has no
            reverse edge.

    The method reads ``self.revision``, ``self.message``,
    ``self.batch_size``, ``self.emb_size``, ``self.devices``,
    ``self.n_hidden``, ``self.n_steps``, ``self.edge_keep_prob`` and
    ``self.tensorboard_dir`` without assigning them, so they must already be
    defined (presumably as class attributes -- TODO confirm).
    """
    super().__init__()
    self.is_testing = is_testing
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        regularizer = layers.l2_regularizer(1e-4)
        self.name = "%s %s" % (self.revision, self.message)
        self.train, self.valid, self.test = self.encode_data(sudoku())
        print("Building graph...")
        self.session = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True))
        self.global_step = tf.Variable(initial_value=0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)
        self.mode = tf.placeholder(tf.string)

        # Replicate the single-puzzle edge list for every puzzle in the
        # batch, shifting node ids by 81 per puzzle.
        edges = self.sudoku_edges()
        edges = [(i + (b * 81), j + (b * 81))
                 for b in range(self.batch_size) for i, j in edges]

        # ridx[k] is the position of the reverse of edge k. A dict gives the
        # same first-occurrence index as `edges.index` without rescanning
        # the whole list for every edge.
        first_index = {}
        for position, edge in enumerate(edges):
            first_index.setdefault(edge, position)
        try:
            ridx = [first_index[(j, i)] for i, j in edges]
        except KeyError as err:
            # Keep the exception type `list.index` would have raised.
            raise ValueError("%r is not in list" % (err.args[0], )) from err

        edge_indices = tf.constant(edges, tf.int32)
        n_edges = tf.shape(edge_indices)[0]

        # (row, col) coordinates of the 81 cells, repeated per puzzle.
        positions = tf.constant([[(i, j) for i in range(9)
                                  for j in range(9)]
                                 for b in range(self.batch_size)],
                                tf.int32)  # (bs, 81, 2)
        rows = layers.embed_sequence(positions[:, :, 0],
                                     9,
                                     self.emb_size,
                                     scope='row-embeddings',
                                     unique=True)  # bs, 81, emb_size
        cols = layers.embed_sequence(positions[:, :, 1],
                                     9,
                                     self.emb_size,
                                     scope='cols-embeddings',
                                     unique=True)  # bs, 81, emb_size

        def avg_n(x):
            # Element-wise mean over a list of same-shaped tensors.
            return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

        towers = []
        with tf.variable_scope(tf.get_variable_scope()):
            for device_nr, device in enumerate(self.devices):
                with tf.device('/cpu:0'):
                    if self.is_testing:
                        (quizzes, answers), edge_keep_prob = \
                            self.test.get_next(), 1.0
                    else:
                        (quizzes, answers), edge_keep_prob = tf.cond(
                            tf.equal(self.mode, "train"),
                            true_fn=lambda: (self.train.get_next(),
                                             self.edge_keep_prob),
                            false_fn=lambda: (self.valid.get_next(), 1.0))
                    # Node input = digit, row and column embeddings joined,
                    # then flattened to one row per cell.
                    x = layers.embed_sequence(
                        quizzes,
                        10,
                        self.emb_size,
                        scope='nr-embeddings',
                        unique=True)  # bs, 81, emb_size
                    x = tf.concat([x, rows, cols], axis=2)
                    x = tf.reshape(x, (-1, 3 * self.emb_size))
                with tf.device(device), tf.name_scope("device-%s" % device_nr):

                    def mlp(x, scope, n_out):
                        # Three fully connected layers with the library's
                        # default activation, then one linear layer, all
                        # L2-regularised and `n_out` wide.
                        with tf.variable_scope(scope):
                            for i in range(3):
                                x = layers.fully_connected(
                                    x,
                                    n_out,
                                    weights_regularizer=regularizer)
                            return layers.fully_connected(
                                x,
                                n_out,
                                weights_regularizer=regularizer,
                                activation_fn=None)

                    x = mlp(x, 'C1', self.n_hidden)
                    dependents = tf.zeros((n_edges, 10))
                    outputs = []
                    log_losses = []
                    with tf.variable_scope('steps'):
                        for step in range(self.n_steps):
                            # M_F = c2(c1(x, p), c1(x, N_F\p), d_pF)
                            # d_pF = sum_{q \in N_F\p} (M_F)
                            # p(y_p|x) = softmax(sum(M_F))
                            logits, messages = message_passing(
                                x, edge_indices, dependents,
                                lambda x: mlp(x, 'C2', 10))
                            # Logits of each edge's first endpoint minus the
                            # message on the reverse edge.
                            dependents = tf.gather(
                                logits, edge_indices[:, 0]) - tf.gather(
                                    messages, ridx)
                            out = tf.reshape(logits, (-1, 81, 10))
                            outputs.append(out)
                            log_losses.append(
                                tf.reduce_mean(
                                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                                        labels=answers, logits=out)))
                            # Share all step variables across iterations.
                            tf.get_variable_scope().reuse_variables()
                    reg_loss = sum(
                        tf.get_collection(
                            tf.GraphKeys.REGULARIZATION_LOSSES))
                    # Only the last step's loss is optimised.
                    loss = log_losses[-1] + reg_loss
                    towers.append({
                        'loss': loss,
                        # Clip every gradient element to [-10, 10].
                        'grads': [
                            (tf.clip_by_value(g, -10.0, 10.0), v)
                            for g, v in
                            self.optimizer.compute_gradients(loss)
                        ],
                        'log_losses': tf.stack(log_losses),  # one scalar per step
                        'quizzes': quizzes,  # presumably (bs, 81) digit ids -- TODO confirm
                        'answers': answers,  # presumably (bs, 81) digit ids -- TODO confirm
                        'outputs': tf.stack(outputs)  # n_steps, bs, 81, 10
                    })
                    # Later towers reuse the first tower's variables.
                    tf.get_variable_scope().reuse_variables()

        # Combine the towers: losses are averaged, per-example tensors are
        # concatenated along the batch axis.
        self.loss = avg_n([t['loss'] for t in towers])
        self.out = tf.concat([t['outputs'] for t in towers],
                             axis=1)  # n_steps, bs, 81, 10
        self.predicted = tf.cast(tf.argmax(self.out, axis=3), tf.int32)
        self.answers = tf.concat([t['answers'] for t in towers], axis=0)
        self.quizzes = tf.concat([t['quizzes'] for t in towers], axis=0)
        tf.summary.scalar('losses/total', self.loss)
        # `reg_loss` here is the value left over from the last tower.
        tf.summary.scalar('losses/reg', reg_loss)
        log_losses = avg_n([t['log_losses'] for t in towers])
        for step in range(self.n_steps):
            equal = tf.equal(self.answers, self.predicted[step])
            digit_acc = tf.reduce_mean(tf.to_float(equal))
            tf.summary.scalar('steps/%d/digit-acc' % step, digit_acc)
            # A puzzle counts as solved only if all its cells are right.
            puzzle_acc = tf.reduce_mean(
                tf.to_float(tf.reduce_all(equal, axis=1)))
            tf.summary.scalar('steps/%d/puzzle-acc' % step, puzzle_acc)
            tf.summary.scalar('steps/%d/losses/log' % step,
                              log_losses[step])
        avg_gradients = util.average_gradients(
            [t['grads'] for t in towers])
        self.train_step = self.optimizer.apply_gradients(
            avg_gradients, global_step=self.global_step)
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        util.print_vars(tf.trainable_variables())
        self.train_writer = tf.summary.FileWriter(
            self.tensorboard_dir + '/sudoku/%s/train/%s' %
            (self.revision, self.name), self.session.graph)
        self.test_writer = tf.summary.FileWriter(
            self.tensorboard_dir + '/sudoku/%s/test/%s' %
            (self.revision, self.name), self.session.graph)
        self.summaries = tf.summary.merge_all()
def __init__(self, config, num_words, num_answers, reuse=False, device=''):
    """Build the question + image FiLM classification graph.

    Args:
        config: nested dict. Keys read here: "dropout_keep_prob", "question",
            "image", "film_block", "pooling" and "classifier".
        num_words: vocabulary size of the question word embedding.
        num_answers: number of units of the final (logit) layer.
        reuse: forwarded to the variable scopes and layers created here.
        device: forwarded to ``ResnetModel.__init__``.
    """
    ResnetModel.__init__(self, "clevr", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse):
        batch_size = None  # leading placeholder dimension is left dynamic
        self._is_training = tf.placeholder(tf.bool, name="is_training")

        # Dropout keeps everything (keep prob 1.0) unless is_training is fed True.
        keep_prob_value = float(config["dropout_keep_prob"])
        dropout_keep = tf.cond(
            self._is_training,
            lambda: tf.constant(keep_prob_value),
            lambda: tf.constant(1.0),
        )

        # ---------------------------------------------------------------
        # Question
        # ---------------------------------------------------------------
        self._question = tf.placeholder(
            tf.int32, [batch_size, None], name='question')
        self._seq_length = tf.placeholder(
            tf.int32, [batch_size], name='seq_length')
        self._answer = tf.placeholder(
            tf.int64, [batch_size], name='answer')

        question_cfg = config["question"]
        word_emb = tfc_layers.embed_sequence(
            ids=self._question,
            vocab_size=num_words,
            embed_dim=question_cfg["word_embedding_dim"],
            scope="word_embedding",
            reuse=reuse,
        )

        if question_cfg['glove']:
            # Optional 300-d vectors fed from outside, appended per token.
            self._glove = tf.placeholder(
                tf.float32, [None, None, 300], name="glove")
            word_emb = tf.concat([word_emb, self._glove], axis=2)

        word_emb = tf.nn.dropout(word_emb, dropout_keep)

        _, last_rnn_state = rnn.rnn_factory(
            inputs=word_emb,
            seq_length=self._seq_length,
            cell=question_cfg["cell"],
            num_hidden=question_cfg["rnn_state_size"],
            bidirectional=question_cfg["bidirectional"],
            max_pool=question_cfg["max_pool"],
            layer_norm=question_cfg["layer_norm"],
            reuse=reuse,
        )
        last_rnn_state = tf.nn.dropout(last_rnn_state, dropout_keep)

        # ---------------------------------------------------------------
        # Images
        # ---------------------------------------------------------------
        image_cfg = config['image']
        self._image = tf.placeholder(
            tf.float32, [batch_size] + image_cfg["dim"], name='image')

        visual_features = get_image_features(
            image=self._image,
            is_training=self._is_training,
            config=image_cfg,
        )

        # The question state is passed to the FiLM stack as its film_input.
        with tf.variable_scope("image_film_stack", reuse=reuse):
            film_stack = FiLM_Stack(
                image=visual_features,
                film_input=last_rnn_state,
                is_training=self._is_training,
                config=config["film_block"],
                reuse=reuse,
            )
            visual_features = film_stack.get()

        # Pool the image features.
        with tf.variable_scope("image_pooling"):
            multimodal_features = get_attention(
                visual_features,
                last_rnn_state,
                is_training=self._is_training,
                config=config["pooling"],
                dropout_keep=dropout_keep,
                reuse=reuse,
            )

        with tf.variable_scope("classifier"):
            batch_norm_params = {
                "center": True,
                "scale": True,
                "decay": 0.9,
                "is_training": self._is_training,
                "reuse": reuse,
            }
            self.hidden_state = tfc_layers.fully_connected(
                multimodal_features,
                num_outputs=config["classifier"]["no_mlp_units"],
                normalizer_fn=tfc_layers.batch_norm,
                normalizer_params=batch_norm_params,
                activation_fn=tf.nn.relu,
                reuse=reuse,
                scope="classifier_hidden_layer",
            )
            self.out = tfc_layers.fully_connected(
                self.hidden_state,
                num_outputs=num_answers,
                activation_fn=None,
                reuse=reuse,
                scope="classifier_softmax_layer",
            )

        # ---------------------------------------------------------------
        # Loss
        # ---------------------------------------------------------------
        self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.out, labels=self._answer, name='cross_entropy')
        self.loss = tf.reduce_mean(self.cross_entropy)

        self.softmax = tf.nn.softmax(self.out, name='answer_prob')
        # argmax over the raw logits: no need to compute the softmax first.
        self.prediction = tf.argmax(
            self.out, axis=1, name='predicted_answer')

        with tf.variable_scope('accuracy'):
            is_correct = tf.equal(self.prediction, self._answer)
            self.accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

        tf.summary.scalar('accuracy', self.accuracy)

        print('Model... build!')
def seq2seq(mode, features, labels, params):
    """Estimator ``model_fn``: GRU encoder with a Bahdanau-attention GRU decoder.

    Args:
        mode: ``tf.estimator.ModeKeys`` value, passed through to the returned spec.
        features: dict holding integer id tensors under ``'input'`` and
            ``'output'`` (assumed to be ``(batch, time)`` -- TODO confirm with
            the input_fn). Id 1 is treated as padding / end token and id 0 is
            used as the decoder start token.
        labels: unused; targets are taken from ``features['output']``.
        params: dict with ``vocab_size``, ``embed_dim``, ``num_units`` and
            ``output_max_length``. ``optimizer`` (default ``'Adam'``) and
            ``learning_rate`` (default ``0.001``) are optional.

    Returns:
        ``tf.estimator.EstimatorSpec`` whose predictions are the greedy decoder's
        sample ids and whose loss/train_op come from the teacher-forced decoder.
    """
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    output_max_length = params['output_max_length']

    inp = features['input']
    output = features['output']
    batch_size = tf.shape(inp)[0]

    # Teacher forcing: the decoder input is the target shifted right by one
    # step, with the start token (id 0) prepended.
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)

    # Sequence lengths = number of tokens that are not the padding id 1.
    input_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(inp, 1)), 1)
    output_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(train_output, 1)), 1)

    # Encoder and decoder share one embedding matrix (scope 'embed').
    input_embed = layers.embed_sequence(
        inp, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    output_embed = layers.embed_sequence(
        train_output, vocab_size=vocab_size, embed_dim=embed_dim,
        scope='embed', reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    encoder_cell = tf.contrib.rnn.GRUCell(num_units=num_units)
    encoder_outputs, _ = tf.nn.dynamic_rnn(
        encoder_cell, input_embed, dtype=tf.float32)

    train_helper = tf.contrib.seq2seq.TrainingHelper(
        output_embed, output_lengths)
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    def decode(helper, scope, reuse=None):
        """Run the attention decoder driven by ``helper`` inside ``scope``."""
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=num_units,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
            cell = tf.contrib.rnn.GRUCell(num_units=num_units)
            # Floor division: `num_units / 2` is a float under Python 3, and
            # the attention layer size must be an integer.
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism,
                attention_layer_size=num_units // 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attn_cell, vocab_size, reuse=reuse)
            # The decoder starts from a zero state; the encoder's final state
            # is not fed in (the encoder is only reached through attention).
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell,
                helper=helper,
                initial_state=out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size))
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=output_max_length)
            return outputs[0]

    # Both decoders live in the same scope so they share weights.
    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    # Named handles for the first example of the batch.
    tf.identity(train_outputs.sample_id[0], name='train_pred')

    # Loss weights: 1.0 wherever the decoder *input* token is not the id 1.
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output, weights=weights)
    train_op = layers.optimize_loss(
        loss,
        tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])

    tf.identity(pred_outputs.sample_id[0], name='predictions')

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_outputs.sample_id,
        loss=loss,
        train_op=train_op)
def fast_text_model_fn(self, features, labels, mode, params):
    """Estimator ``model_fn`` for a bag-of-embeddings text classifier.

    The text column is whitespace-split, mapped to vocabulary ids, padded or
    truncated to ``self.MAX_LEN`` tokens, embedded, pooled over the token
    axis and classified by a small dense network.

    Args:
        features: dict; the raw text tensor is read from ``self.FEATURE_COL``.
        labels: class indices (not used in PREDICT mode).
        mode: ``tf.estimator.ModeKeys`` value.
        params: unused.

    Returns:
        ``tf.estimator.EstimatorSpec`` for PREDICT, TRAIN or EVAL; ``None`` is
        returned implicitly for any other mode.
    """
    vocab_table = lookup.index_table_from_file(
        vocabulary_file=self.VOCAB_FILE, num_oov_buckets=1, default_value=-1)

    text = features[self.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(
        words, default_value=self.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    # Append MAX_LEN extra columns of id 0, then cut back to exactly MAX_LEN
    # columns, so every row has the maximum document length.
    padding = tf.constant([[0, 0], [0, self.MAX_LEN]])
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, self.MAX_LEN])

    # Switches the Keras Dropout layers below on for training only.
    tf.keras.backend.set_learning_phase(
        mode == tf.estimator.ModeKeys.TRAIN)

    with tf.name_scope('embedding'):
        embedding_vectors = layers.embed_sequence(
            word_id_vector,
            vocab_size=self.VOCAB_LEN,
            embed_dim=self.EMBED_DIM,
            initializer=layers.xavier_initializer(seed=42))
        tf.logging.info('Word Vectors = {}'.format(embedding_vectors))

    with tf.name_scope('fast_text'):
        # NOTE(review): despite the name this is a sum over the token axis,
        # not a mean -- confirm whether averaging was intended.
        average_vectors = tf.reduce_sum(embedding_vectors, axis=1)
        tf.logging.info(
            'Average Word Vectors = {}'.format(average_vectors))

    with tf.name_scope('hidden_layer'):
        fc1 = tf.keras.layers.Dense(1024, activation='relu')(average_vectors)
        d1 = tf.keras.layers.Dropout(0.5)(fc1)
        # Floor division keeps the unit count an int under Python 3
        # (`EMBED_DIM / 2` would be a float).
        fc2 = tf.keras.layers.Dense(
            self.EMBED_DIM // 2, activation='relu')(d1)
        d2 = tf.keras.layers.Dropout(0.5)(fc2)
        tf.logging.info('Hidden Layer = {}'.format(d2))

    with tf.name_scope('output'):
        logits = tf.keras.layers.Dense(
            self.TARGET_SIZE, activation=None)(d2)
        tf.logging.info('Logits Layer = {}'.format(logits))

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)

    tf.summary.histogram('fasttext', average_vectors)
    tf.summary.histogram('softmax', probabilities)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': predicted_indices,
            'probabilities': probabilities,
        }
        exported_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs=exported_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    # Batch accuracy, for the summary only (EVAL uses tf.metrics below).
    acc = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(
                labels=labels, predictions=predicted_indices),
            'precision': tf.metrics.precision(
                labels=labels, predictions=predicted_indices),
            'recall': tf.metrics.recall(
                labels=labels, predictions=predicted_indices),
            'f1_score': self.streaming_f1(
                labels=labels,
                predictions=predicted_indices,
                n_classes=self.TARGET_SIZE),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a masked mean-pooling sequence classifier.

    Args:
        features: dict; integer token ids are read from ``"sequences"``.
            Id 0 is treated as padding.
        labels: class indices (not used in PREDICT mode).
        mode: ``tf.estimator.ModeKeys`` value.
        params: dict with ``vocab_size``, ``embed_dim``, ``lstm`` (``1`` enables
            the bidirectional LSTM), ``category_size`` and ``learning_rate``.

    Returns:
        ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    sequences = features["sequences"]

    # Look up the distributed representation (embedding) of every token.
    emb_sequences = embed_sequence(
        sequences,
        params["vocab_size"],
        params["embed_dim"],
        initializer=tf.random_uniform_initializer(-1, 1))

    # Sentence length = number of non-padding (non-zero) ids.
    mask = tf.to_int32(tf.not_equal(sequences, tf.zeros_like(sequences)))
    length = tf.reduce_sum(mask, axis=-1)
    print(params)

    if params["lstm"] == 1:
        # Run a bidirectional LSTM. Each direction gets its own cell object
        # so the forward and backward passes do not share one instance.
        cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=params["embed_dim"])
        cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=params["embed_dim"])
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=emb_sequences,
            dtype=tf.float32,
            sequence_length=length)
        output_fw, output_bw = outputs
        # Concatenate the outputs of both directions.
        output = tf.concat([output_fw, output_bw], axis=-1)
    else:
        output = emb_sequences

    # Average pooling over the non-padding time steps. Pool `output` (not the
    # raw embeddings) so that the LSTM branch actually feeds the classifier.
    mask = tf.expand_dims(tf.cast(mask, tf.float32), -1)
    length = tf.expand_dims(tf.cast(length, tf.float32), -1)
    # A row of only padding has length 0; clamp to avoid a 0/0 NaN.
    pooled = tf.reduce_sum(output * mask, 1) / tf.maximum(length, 1.0)
    logits = layers.dense(pooled, params["category_size"])

    # Prediction outputs.
    predictions = {
        "classes": tf.argmax(logits, axis=1),  # top-1 category
        "probabilities": tf.nn.softmax(logits, name="probabilities"),
    }

    # Prediction mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss (training and evaluation modes).
    onehot_labels = tf.one_hot(
        indices=tf.to_int32(labels), depth=params["category_size"])
    loss = tf.losses.softmax_cross_entropy(
        onehot_labels=onehot_labels, logits=logits)

    # Training mode.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(
            learning_rate=params["learning_rate"])
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    # Evaluation mode: report accuracy.
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"]),
    }
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
# will use this to set the weights for every category in every methodology initial_emb_weights = [np.random.rand(n, embedding_dim) for n in n_cat_by_feature] # the actual features features = [ tf.placeholder(shape=[H, W], dtype="int32", name="feat%d" % i) for i, _ in enumerate(n_cat_by_feature) ] # 1.1) embed on channel -> concat on channel embedded1 = [] for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights): e = layers.embed_sequence( f, vocab_size=n, embed_dim=embedding_dim, initializer=tf.constant_initializer(w) ) embedded1.append(e) out11 = tf.concat(embedded1, axis=2) # 1.2) onehot on channel -> 1x1 conv separately -> concat on channel embedded2 = [] for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights): one_hot = layers.one_hot_encoding(f, num_classes=n) conv_out = layers.conv2d( inputs=one_hot, num_outputs=embedding_dim, weights_initializer=tf.constant_initializer(w),
def attend(pixels, word_indices, pattern_indices, char_indices,
           memory_mask, parses):
    """
    Compute a softmax attention distribution over all memory cells.

    :param pixels: reshaped below to (bs, h, w, 3)
    :param word_indices: (bs, h, w)
    :param pattern_indices: (bs, h, w)
    :param char_indices: (bs, h, w)
    :param memory_mask: reshaped below to (bs, h, w, 1)
    :param parses: (bs, h, w, 4, 2), flattened below to (bs, h, w, 8)
    :return: (spatial_attention, attention_entropy, cross_entropy_uniform,
        context); spatial_attention is (bs, h * w * n_memories, 1, 1) and
        attention_entropy is measured in bits.
    """
    bs = tf.shape(pixels)[0]

    # Normalised coordinate channels in [0, 1].
    # NOTE(review): both axes use im_size[0] -- presumably the images are
    # square; confirm.
    X, Y = tf.meshgrid(
        tf.linspace(0.0, 1.0, RealData.im_size[0]),
        tf.linspace(0.0, 1.0, RealData.im_size[0]))
    X = tf.tile(X[None, ..., None], (bs, 1, 1, 1))
    Y = tf.tile(Y[None, ..., None], (bs, 1, 1, 1))

    def embed_grid(indices, vocab_size, scope):
        """Embed a grid of ids and restore the (bs, h, w, n_hid) layout."""
        flat_indices = tf.reshape(indices, (bs, -1))
        embedded = layers.embed_sequence(
            flat_indices,
            vocab_size=vocab_size,
            embed_dim=self.n_hid,
            unique=False,
            scope=scope)
        return tf.reshape(embedded, (bs, h, w, self.n_hid))

    word_embeddings = embed_grid(
        word_indices, train.word_hash_size, "word-embeddings")
    pattern_embeddings = embed_grid(
        pattern_indices, train.pattern_hash_size, "pattern-embeddings")
    char_embeddings = embed_grid(
        char_indices, train.n_output, "char-embeddings")

    pixels = tf.reshape(pixels, (bs, h, w, 3))
    parses = tf.reshape(parses, (bs, h, w, 8))
    memory_mask = tf.reshape(memory_mask, (bs, h, w, 1))

    # Stack every per-cell feature along the channel axis.
    feats = tf.concat(
        [pixels, word_embeddings, pattern_embeddings, char_embeddings,
         parses, X, Y, memory_mask],
        axis=3)

    with tf.variable_scope('attend'):
        for _ in range(4):
            feats = tf.nn.relu(dilated_block(feats))

        feats = layers.dropout(
            feats, self.keep_prob, is_training=self.is_training_ph)
        pre_att_logits = feats

        att_logits = layers.conv2d(
            feats, train.n_memories, 3,
            activation_fn=None,
            weights_regularizer=self.regularizer)  # (bs, h, w, n_memories)
        # Cells outside the mask get a logit of -1000 so they receive
        # (almost) no probability mass.
        # TODO only sum the memory_mask idx, in the softmax
        att_logits = memory_mask * att_logits - (1.0 - memory_mask) * 1000

        flat_logits = tf.reshape(att_logits, (bs, -1))  # (bs, h * w * n_memories)
        flat_logits = flat_logits - tf.reduce_max(
            flat_logits, axis=1, keepdims=True)
        log_probs = tf.nn.log_softmax(flat_logits, axis=1)  # (bs, h * w * n_memories)
        probs = tf.nn.softmax(flat_logits, axis=1)  # (bs, h * w * n_memories)

        spatial_attention = tf.reshape(
            probs, (bs, h * w * train.n_memories, 1, 1))

        # Cross entropy between a uniform distribution over the masked cells
        # and the attention distribution.
        p_uniform = memory_mask / tf.reduce_sum(
            memory_mask, axis=(1, 2, 3), keepdims=True)
        grid_log_probs = tf.reshape(log_probs, (bs, h, w, train.n_memories))
        cross_entropy_uniform = -tf.reduce_sum(
            p_uniform * grid_log_probs, axis=(1, 2, 3))

        # Entropy of the attention distribution; dividing by log(2)
        # converts nats to bits.
        attention_entropy = -tf.reduce_sum(
            probs * log_probs, axis=1) / tf.log(2.)

        # Total probability per cell (summed over memories), used to take a
        # weighted sum of the pre-attention features.
        cell_probs = tf.reduce_sum(
            tf.reshape(probs, (bs, h, w, train.n_memories)),
            axis=3, keepdims=True)
        context = tf.reduce_sum(cell_probs * pre_att_logits, axis=(1, 2))

    return spatial_attention, attention_entropy, cross_entropy_uniform, context
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a min/max-pooled embedding text classifier.

    Args:
        features: dict; text is read from ``commons.FEATURE_COL`` and (outside
            PREDICT mode) example weights from ``commons.WEIGHT_COLUNM_NAME``.
        labels: class indices (not used in PREDICT mode).
        mode: ``tf.estimator.ModeKeys`` value.
        params: object exposing ``N_WORDS`` (embedding vocabulary size).

    Returns:
        ``tf.estimator.EstimatorSpec`` for PREDICT, TRAIN or EVAL; ``None`` is
        returned implicitly for any other mode.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    tf.keras.backend.set_learning_phase(is_training)

    # Text -> tokens -> vocabulary ids.
    table = lookup.index_table_from_file(
        vocabulary_file='data/vocab.csv',
        num_oov_buckets=1,
        default_value=-1)
    tokens = tf.string_split(features[commons.FEATURE_COL])
    dense_tokens = tf.sparse_tensor_to_dense(
        tokens, default_value=commons.PAD_WORD)
    word_ids = table.lookup(dense_tokens)

    # Pad all the word_ids entries to the maximum document length, then
    # slice back so every row has exactly MAX_DOCUMENT_LENGTH columns.
    max_len = commons.MAX_DOCUMENT_LENGTH
    word_ids_padded = tf.pad(word_ids, tf.constant([[0, 0], [0, max_len]]))
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_len])

    word_embeddings = layers.embed_sequence(
        word_id_vector, vocab_size=params.N_WORDS, embed_dim=50)

    # Document vector = element-wise min and max over the token axis.
    pooled = tf.concat(
        [tf.reduce_min(word_embeddings, axis=1),
         tf.reduce_max(word_embeddings, axis=1)],
        axis=1)

    hidden = tf.keras.layers.Dense(25, activation='relu')(pooled)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE)(hidden)

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': tf.gather(commons.TARGET_LABELS, predicted_indices),
            'probabilities': probabilities,
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'prediction': tf.estimator.export.PredictOutput(predictions),
            })

    weights = features[commons.WEIGHT_COLUNM_NAME]

    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=logits, weights=weights)
    tf.summary.scalar('loss', loss)

    # Unweighted batch accuracy, for the summary only.
    hits = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(hits, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer().minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        shared_kwargs = dict(
            labels=labels, predictions=predicted_indices, weights=weights)
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(**shared_kwargs),
            'precision': tf.metrics.precision(**shared_kwargs),
            'recall': tf.metrics.recall(**shared_kwargs),
            # The F1 metric is called without example weights.
            'f1_score': streaming_f1(
                labels=labels, predictions=predicted_indices),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)