def model():
    print("building model ...")
    with tf.variable_scope('train'):
        print("building model ...")
        X_pl = tf.placeholder(tf.float32, [None, num_features])
        X_expand = tf.expand_dims(X_pl, axis=2)
        print("X_pl", X_pl.get_shape())
        t_pl = tf.placeholder(tf.int32, [None,])
        print("t_pl", t_pl.get_shape())
        is_training_pl = tf.placeholder(tf.bool)
        cell_fw = tf.nn.rnn_cell.GRUCell(205)
        cell_bw = tf.nn.rnn_cell.GRUCell(205)
        seq_len = tf.reduce_sum(tf.ones(tf.shape(X_pl), dtype=tf.int32), axis=1)
        _, enc_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw, cell_bw=cell_bw, inputs=X_expand,
            sequence_length=seq_len, dtype=tf.float32)
        enc_states = tf.concat(1, enc_states)
        enc_states_drop = dropout(enc_states, is_training=is_training_pl)
        l1 = fully_connected(enc_states_drop, 200, activation_fn=None)
        l1 = batch_norm(l1, is_training=is_training_pl)
        l1_relu = relu(l1)
        l1_dropout = dropout(l1_relu, is_training=is_training_pl)
        l2 = fully_connected(l1_dropout, 200, activation_fn=None)
        l2 = batch_norm(l2, is_training=is_training_pl)
        l2_relu = relu(l2)
        l_out = fully_connected(l2_relu, num_outputs=num_classes, activation_fn=None)
        l_out_softmax = tf.nn.softmax(l_out)
        tf.contrib.layers.summarize_variables()
    with tf.variable_scope('metrics'):
        loss = sparse_softmax_cross_entropy_with_logits(l_out, t_pl)
        print("loss", loss.get_shape())
        loss = tf.reduce_mean(loss)
        print("loss", loss.get_shape())
        tf.summary.scalar('train/loss', loss)
        argmax = tf.to_int32(tf.argmax(l_out, 1))
        print("argmax", argmax.get_shape())
        correct = tf.to_float(tf.equal(argmax, t_pl))
        print("correct,", correct.get_shape())
        accuracy = tf.reduce_mean(correct)
        print("accuracy", accuracy.get_shape())
    with tf.variable_scope('optimizer'):
        print("building optimizer ...")
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads_and_vars)
        clipped_gradients, global_norm = (
            tf.clip_by_global_norm(gradients, clip_norm))
        clipped_grads_and_vars = zip(clipped_gradients, variables)
        tf.summary.scalar('train/global_gradient_norm', global_norm)
        train_op = optimizer.apply_gradients(clipped_grads_and_vars,
                                             global_step=global_step)
    return (X_pl, t_pl, is_training_pl, l_out, l_out_softmax, loss,
            accuracy, train_op, global_step)
def _init_body(self, scope):
    with tf.variable_scope(scope):
        word_level_inputs = tf.reshape(self.inputs_embedded, [
            self.document_size * self.sentence_size,
            self.word_size, self.embedding_size
        ])
        word_level_lengths = tf.reshape(
            self.word_lengths, [self.document_size * self.sentence_size])

        with tf.variable_scope('word') as scope:
            word_encoder_output, _ = bidirectional_rnn(
                self.word_cell, self.word_cell,
                word_level_inputs, word_level_lengths, scope=scope)

            with tf.variable_scope('attention') as scope:
                word_level_output = task_specific_attention(
                    word_encoder_output, self.word_output_size, scope=scope)

            with tf.variable_scope('dropout'):
                word_level_output = layers.dropout(
                    word_level_output, keep_prob=self.dropout_keep_proba,
                    is_training=self.is_training,
                )

        # sentence_level
        sentence_inputs = tf.reshape(
            word_level_output,
            [self.document_size, self.sentence_size, self.word_output_size])

        with tf.variable_scope('sentence') as scope:
            sentence_encoder_output, _ = bidirectional_rnn(
                self.sentence_cell, self.sentence_cell,
                sentence_inputs, self.sentence_lengths, scope=scope)

            with tf.variable_scope('attention') as scope:
                sentence_level_output = task_specific_attention(
                    sentence_encoder_output, self.sentence_output_size,
                    scope=scope)

            with tf.variable_scope('dropout'):
                sentence_level_output = layers.dropout(
                    sentence_level_output, keep_prob=self.dropout_keep_proba,
                    is_training=self.is_training,
                )

        with tf.variable_scope('classifier'):
            self.logits = layers.fully_connected(
                sentence_level_output, self.classes, activation_fn=None)
            self.prediction = tf.argmax(self.logits, axis=-1)
def conv_model(X, Y_, mode):
    XX = tf.reshape(X, [-1, 28, 28, 1])
    biasInit = tf.constant_initializer(0.1, dtype=tf.float32)
    Y1 = layers.conv2d(XX, num_outputs=6, kernel_size=[6, 6], biases_initializer=biasInit)
    Y2 = layers.conv2d(Y1, num_outputs=12, kernel_size=[5, 5], stride=2, biases_initializer=biasInit)
    Y3 = layers.conv2d(Y2, num_outputs=24, kernel_size=[4, 4], stride=2, biases_initializer=biasInit)
    Y4 = layers.flatten(Y3)
    Y5 = layers.relu(Y4, 200, biases_initializer=biasInit)
    # to deactivate dropout on the dense layer, set keep_prob=1
    Y5d = layers.dropout(Y5, keep_prob=0.75, noise_shape=None,
                         is_training=mode == learn.ModeKeys.TRAIN)
    Ylogits = layers.linear(Y5d, 10)
    predict = tf.nn.softmax(Ylogits)
    classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
    loss = conv_model_loss(Ylogits, Y_, mode)
    train_op = conv_model_train_op(loss, mode)
    eval_metrics = conv_model_eval_metrics(classes, Y_, mode)
    return learn.ModelFnOps(
        mode=mode,
        # You can name the fields of your predictions dictionary as you like.
        predictions={"predictions": predict, "classes": classes},
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics
    )
def _dnn_logits(self, features, is_training=False):
    net = layers.input_from_feature_columns(
        features,
        self._get_dnn_feature_columns(),
        weight_collections=[self._dnn_weight_collection])
    for layer_id, num_hidden_units in enumerate(self._dnn_hidden_units):
        net = layers.legacy_fully_connected(
            net,
            num_hidden_units,
            activation_fn=self._dnn_activation_fn,
            weight_collections=[self._dnn_weight_collection],
            bias_collections=[self._dnn_weight_collection],
            name="hiddenlayer_%d" % layer_id)
        if self._dnn_dropout is not None and is_training:
            net = layers.dropout(
                net,
                keep_prob=(1.0 - self._dnn_dropout))
        self._add_hidden_layer_summary(net, "hiddenlayer_%d" % layer_id)
    logit = layers.legacy_fully_connected(
        net,
        self._num_label_columns(),
        weight_collections=[self._dnn_weight_collection],
        bias_collections=[self._dnn_weight_collection],
        name="dnn_logit")
    self._add_hidden_layer_summary(logit, "dnn_logit")
    return logit
def model_fn(x, target, mode, params):
    """Model function for Estimator."""
    y_ = tf.cast(target, tf.float32)
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # first convolutional layer
    h_conv1 = layers.convolution2d(x_image, 32, [5, 5])
    h_pool1 = layers.max_pool2d(h_conv1, [2, 2])

    # second convolutional layer
    h_conv2 = layers.convolution2d(h_pool1, 64, [5, 5])
    h_pool2 = layers.max_pool2d(h_conv2, [2, 2])

    # densely connected layer
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = layers.fully_connected(h_pool2_flat, 1024)
    h_fc1_drop = layers.dropout(
        h_fc1, keep_prob=params["dropout"],
        is_training=(mode == ModeKeys.TRAIN))

    # readout layer
    y_conv = layers.fully_connected(h_fc1_drop, 10, activation_fn=None)

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))
    train_op = tf.contrib.layers.optimize_loss(
        loss=cross_entropy,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=params["learning_rate"],
        optimizer="Adam")
    predictions = tf.argmax(y_conv, 1)
    return predictions, cross_entropy, train_op
def dnn_logits_fn():
    """Builds the logits from the input layer."""
    previous_layer = input_layer
    for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
        with variable_scope.variable_scope(
                "hiddenlayer_%d" % layer_id,
                values=(previous_layer,)) as hidden_layer_scope:
            net = layers.fully_connected(
                previous_layer,
                num_hidden_units,
                activation_fn=dnn_activation_fn,
                variables_collections=[dnn_parent_scope],
                scope=hidden_layer_scope)
            if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
                net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
        _add_hidden_layer_summary(net, hidden_layer_scope.name)
        previous_layer = net
    with variable_scope.variable_scope(
            "logits", values=(previous_layer,)) as logits_scope:
        dnn_logits = layers.fully_connected(
            previous_layer,
            head.logits_dimension,
            activation_fn=None,
            variables_collections=[dnn_parent_scope],
            scope=logits_scope)
    _add_hidden_layer_summary(dnn_logits, logits_scope.name)
    return dnn_logits
def define_feedforward_model(self): layer_list=[] with self.graph.as_default() as g: is_training_batch=tf.placeholder(tf.bool,shape=(),name="is_training_batch") bn_params={"is_training":is_training_batch,"decay":0.99,"updates_collections":None} g.add_to_collection("is_training_batch",is_training_batch) with tf.name_scope("input"): input_layer=tf.placeholder(dtype=tf.float32,shape=(None,self.n_in),name="input_layer") if self.dropout_rate!=0.0: print "Using dropout to avoid overfitting and the dropout rate is",self.dropout_rate is_training_drop=tf.placeholder(dtype=tf.bool,shape=(),name="is_training_drop") input_layer_drop=dropout(input_layer,self.dropout_rate,is_training=is_training_drop) layer_list.append(input_layer_drop) g.add_to_collection(name="is_training_drop",value=is_training_drop) else: layer_list.append(input_layer) g.add_to_collection("input_layer",layer_list[0]) for i in xrange(len(self.hidden_layer_size)): with tf.name_scope("hidden_layer_"+str(i+1)): if self.dropout_rate!=0.0: last_layer=layer_list[-1] if self.hidden_layer_type[i]=="tanh": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.tanh,normalizer_fn=batch_norm,\ normalizer_params=bn_params) if self.hidden_layer_type[i]=="sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid,normalizer_fn=batch_norm,\ normalizer_params=bn_params) new_layer_drop=dropout(new_layer,self.dropout_rate,is_training=is_training_drop) layer_list.append(new_layer_drop) else: last_layer=layer_list[-1] if self.hidden_layer_type[i]=="tanh": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.tanh,normalizer_fn=batch_norm,\ normalizer_params=bn_params) if self.hidden_layer_type[i]=="sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid,normalizer_fn=batch_norm,\ normalizer_params=bn_params) layer_list.append(new_layer) with tf.name_scope("output_layer"): if self.output_type=="linear": output_layer=fully_connected(layer_list[-1],self.n_out,activation_fn=None) if self.output_type=="tanh": output_layer=fully_connected(layer_list[-1],self.n_out,activation_fn=tf.nn.tanh) g.add_to_collection(name="output_layer",value=output_layer) with tf.name_scope("training_op"): if self.optimizer=="adam": self.training_op=tf.train.AdamOptimizer()
def build_model(self, features, feature_columns, is_training): """See base class.""" self._feature_columns = feature_columns input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=self._num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( self._scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( features, self._get_feature_columns(), weight_collections=[self._scope], trainable=self._trainable, scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=self._num_ps_replicas)) for layer_id, num_hidden_units in enumerate(self._hidden_units): with variable_scope.variable_scope( self._scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=self._activation_fn, variables_collections=[self._scope], trainable=self._trainable, scope=scope) if self._dropout is not None and is_training: net = layers.dropout( net, keep_prob=(1.0 - self._dropout)) self._add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( self._scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: logits = layers.fully_connected( net, self._num_label_columns, activation_fn=None, variables_collections=[self._scope], trainable=self._trainable, scope=scope) self._add_hidden_layer_summary(logits, "logits") return logits
def model(): tf.set_random_seed(1) print("building model ...") with tf.variable_scope('train'): print("building model ...") X_pl = tf.placeholder(tf.float32, [None, num_features]) print("X_pl", X_pl.get_shape()) t_pl = tf.placeholder(tf.int32, [None,]) print("t_pl", t_pl.get_shape()) is_training_pl = tf.placeholder(tf.bool) X_bn = batch_norm(X_pl, is_training=is_training_pl) print("X_bn", X_bn.get_shape()) l1 = fully_connected(X_pl, num_outputs=100, activation_fn=relu)#, normalizer_fn=batch_norm) print("l1", l1.get_shape()) l1_drop = dropout(l1, is_training=is_training_pl) print("l1_drop", l1_drop.get_shape()) l_out = fully_connected(l1_drop, num_outputs=num_classes, activation_fn=None) print("l_out", l_out.get_shape()) l_out_softmax = tf.nn.softmax(l_out) tf.contrib.layers.summarize_variables() with tf.variable_scope('metrics'): loss = sparse_softmax_cross_entropy_with_logits(l_out, t_pl) print("loss", loss.get_shape()) loss = tf.reduce_mean(loss) print("loss", loss.get_shape()) tf.summary.scalar('train/loss', loss) argmax = tf.to_int32(tf.argmax(l_out, 1)) print("argmax", argmax.get_shape()) correct = tf.to_float(tf.equal(argmax, t_pl)) print("correct,", correct.get_shape()) accuracy = tf.reduce_mean(correct) print("accuracy", accuracy.get_shape()) with tf.variable_scope('optimizer'): print("building optimizer ...") global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars = optimizer.compute_gradients(loss) gradients, variables = zip(*grads_and_vars) clipped_gradients, global_norm = ( tf.clip_by_global_norm(gradients, clip_norm)) clipped_grads_and_vars = zip(clipped_gradients, variables) tf.summary.scalar('train/global_gradient_norm', global_norm) train_op = optimizer.apply_gradients(clipped_grads_and_vars, global_step=global_step) return X_pl, t_pl, is_training_pl, l_out, l_out_softmax, loss, accuracy, train_op, global_step
def general_module_end_operations(self, tensor, dropout_on, strided_max_pool_on):
    """
    Common end of module operations.

    :param tensor: The tensor being processed.
    :type tensor: tf.Tensor
    :param dropout_on: Whether to include dropout or not.
    :type dropout_on: bool
    :param strided_max_pool_on: Whether to include a strided max pool at the end of the module.
    :type strided_max_pool_on: bool
    :return: The processed tensor.
    :rtype: tf.Tensor
    """
    if strided_max_pool_on:
        tensor = max_pool2d(tensor, kernel_size=3, stride=2, padding='VALID')
    if dropout_on:
        tensor = dropout(tensor, self.dropout_keep_probability_tensor)
    return tensor
def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with a on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.convolution(feature, 32, kernel_size=[5, 5],
                                     activation_fn=tf.nn.relu)
        h_pool1 = max_pool_2x2(h_conv1)

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.convolution(h_pool1, 64, kernel_size=[5, 5],
                                     activation_fn=tf.nn.relu)
        h_pool2 = max_pool_2x2(h_conv2)

    # reshape tensor into a batch of vectors
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.fully_connected(h_pool2_flat, 1024, activation_fn=tf.nn.relu),
        keep_prob=0.5,
        is_training=mode == tf.contrib.learn.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.fully_connected(h_fc1, 10, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, target)

    # Create a tensor for training op.
    train_op = layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(),
        optimizer='SGD', learning_rate=0.001)

    return tf.argmax(logits, 1), loss, train_op
def build_model(self, features, feature_columns, is_training): """See base class.""" features = self._get_feature_dict(features) self._feature_columns = feature_columns net = layers.input_from_feature_columns( features, self._get_feature_columns(), weight_collections=[self._weight_collection_name]) for layer_id, num_hidden_units in enumerate(self._hidden_units): with variable_scope.variable_op_scope( [net], "hiddenlayer_%d" % layer_id, partitioner=partitioned_variables.min_max_variable_partitioner( max_partitions=self._config.num_ps_replicas)) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=self._activation_fn, variables_collections=[self._weight_collection_name], scope=scope) if self._dropout is not None and is_training: net = layers.dropout( net, keep_prob=(1.0 - self._dropout)) self._add_hidden_layer_summary(net, scope.name) with variable_scope.variable_op_scope( [net], "dnn_logits", partitioner=partitioned_variables.min_max_variable_partitioner( max_partitions=self._config.num_ps_replicas)) as scope: logits = layers.fully_connected( net, self._num_label_columns, activation_fn=None, variables_collections=[self._weight_collection_name], scope=scope) self._add_hidden_layer_summary(logits, "dnn_logits") return logits
def _dnn_logits(self, features, is_training=False): net = layers.input_from_feature_columns( features, self._get_dnn_feature_columns(), weight_collections=[self._dnn_weight_collection] ) for layer_id, num_hidden_units in enumerate(self._dnn_hidden_units): with variable_scope.variable_op_scope( [net], "hiddenlayer_%d" % layer_id, partitioner=partitioned_variables.min_max_variable_partitioner( max_partitions=self._config.num_ps_replicas ), ) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=self._dnn_activation_fn, variables_collections=[self._dnn_weight_collection], scope=scope, ) if self._dnn_dropout is not None and is_training: net = layers.dropout(net, keep_prob=(1.0 - self._dnn_dropout)) self._add_hidden_layer_summary(net, scope.name) with variable_scope.variable_op_scope( [net], "dnn_logit", partitioner=partitioned_variables.min_max_variable_partitioner(max_partitions=self._config.num_ps_replicas), ) as scope: logit = layers.fully_connected( net, self._target_column.num_label_columns, activation_fn=None, variables_collections=[self._dnn_weight_collection], scope=scope, ) self._add_hidden_layer_summary(logit, "dnn_logit") return logit
def __init__(self, max_seq_len, max_sent_len, num_classes, vocab_size, embedding_size, max_grad_norm, dropout_keep_proba, learning_rate): # Parameters self.learning_rate = learning_rate self.vocab_size = vocab_size self.num_classes = num_classes self.max_seq_len = max_seq_len self.embedding_size = embedding_size self.word_encoder_num_hidden = max_seq_len self.word_output_size = max_seq_len self.sentence_encoder_num_hidden = max_sent_len self.sentence_output_size = max_sent_len self.max_grad_norm = max_grad_norm self.dropout_keep_proba = dropout_keep_proba # tf graph input self.input_x = tf.placeholder(shape=[None, None, None], dtype=tf.int32, name="input_x") self.input_y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.int32, name="input_y") self.word_lengths = tf.placeholder(shape=[None, None], dtype=tf.int32, name="word_lengths") self.sentence_lengths = tf.placeholder(shape=[ None, ], dtype=tf.int32, name="sentence_lengths") self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") # input_x dims (self.document_size, self.sentence_size, self.word_size) = tf.unstack(tf.shape(self.input_x)) with tf.device("/gpu:0"), tf.name_scope("embedding_layer"): w = tf.Variable( tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0), dtype=tf.float32, name="w" ) # TODO check if this needs to be marked as untrainable self.input_x_embedded = tf.nn.embedding_lookup(w, self.input_x) # reshape input_x after embedding self.input_x_embedded = tf.reshape(self.input_x_embedded, [ self.document_size * self.sentence_size, self.word_size, self.embedding_size ]) self.input_x_embedded_lengths = tf.reshape( self.word_lengths, [self.document_size * self.sentence_size]) with tf.variable_scope("word_level"): self.word_encoder_outputs = self.bidirectional_RNN( num_hidden=self.word_encoder_num_hidden, inputs=self.input_x_embedded) word_level_output = self.attention( inputs=self.word_encoder_outputs, output_size=self.word_output_size) with tf.variable_scope("dropout"): print('self.is_training: {}'.format(self.is_training)) word_level_output = layers.dropout( word_level_output, keep_prob=self.dropout_keep_proba, is_training=self.is_training) # reshape word_level output self.sentence_encoder_inputs = tf.reshape( word_level_output, [self.document_size, self.sentence_size, self.word_output_size]) with tf.variable_scope("sentence_level"): self.sentence_encoder_outputs = self.bidirectional_RNN( num_hidden=self.sentence_encoder_num_hidden, inputs=self.sentence_encoder_inputs) sentence_level_output = self.attention( inputs=self.sentence_encoder_outputs, output_size=self.sentence_output_size) with tf.variable_scope("dropout"): sentence_level_output = layers.dropout( sentence_level_output, keep_prob=self.dropout_keep_proba, is_training=self.is_training) # Final model prediction with tf.variable_scope("classifier_output"): self.logits = layers.fully_connected( sentence_level_output, self.num_classes, activation_fn=None) # trainable=self.is_training) self.predictions = tf.argmax(self.logits, axis=1, name="predictions") # Calculate mean cross-entropy loss with tf.variable_scope("loss"): losses = tf.nn.softmax_cross_entropy_with_logits( labels=self.input_y, logits=self.logits) self.loss = tf.reduce_mean(losses) tf.summary.scalar("Loss", self.loss) # Accuracy with tf.variable_scope("accuracy"): correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, axis=1)) self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") tf.summary.scalar("Accuracy", self.accuracy)
def _dnn_tree_combined_model_fn( features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, tree_learner_config, num_trees, tree_examples_per_layer, config=None, dnn_optimizer="Adagrad", dnn_activation_fn=nn.relu, dnn_dropout=None, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, tree_feature_columns=None, tree_center_bias=True): """DNN and GBDT combined model_fn. Args: features: `dict` of `Tensor` objects. labels: Labels used to train on. mode: Mode we are in. (TRAIN/EVAL/INFER) head: A `Head` instance. dnn_hidden_units: List of hidden units per layer. dnn_feature_columns: An iterable containing all the feature columns used by the model's DNN. tree_learner_config: A config for the tree learner. num_trees: Number of trees to grow model to after training DNN. tree_examples_per_layer: Number of examples to accumulate before growing the tree a layer. This value has a big impact on model quality and should be set equal to the number of examples in training dataset if possible. It can also be a function that computes the number of examples based on the depth of the layer that's being built. config: `RunConfig` of the estimator. dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN. If `None`, will use the Adagrad optimizer with default learning rate of 0.001. dnn_activation_fn: Activation function applied to each layer of the DNN. If `None`, will use `tf.nn.relu`. dnn_dropout: When not `None`, the probability to drop out a given unit in the DNN. dnn_input_layer_partitioner: Partitioner for input layer of the DNN. Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20. dnn_input_layer_to_tree: Whether to provide the DNN's input layer as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. tree_center_bias: Whether a separate tree should be created for first fitting the bias. Returns: A `ModelFnOps` object. Raises: ValueError: if inputs are not valid. """ if not isinstance(features, dict): raise ValueError("features should be a dictionary of `Tensor`s. " "Given type: {}".format(type(features))) if not dnn_feature_columns: raise ValueError("dnn_feature_columns must be specified") # Build DNN Logits. 
dnn_parent_scope = "dnn" dnn_partitioner = dnn_input_layer_partitioner or ( partitioned_variables.min_max_variable_partitioner( max_partitions=config.num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( dnn_parent_scope, values=tuple(six.itervalues(features)), partitioner=dnn_partitioner): with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=dnn_partitioner) as input_layer_scope: input_layer = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=input_layer_scope) previous_layer = input_layer for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(previous_layer,)) as hidden_layer_scope: net = layers.fully_connected( previous_layer, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=hidden_layer_scope) if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout)) _add_hidden_layer_summary(net, hidden_layer_scope.name) previous_layer = net with variable_scope.variable_scope( "logits", values=(previous_layer,)) as logits_scope: dnn_logits = layers.fully_connected( previous_layer, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=logits_scope) _add_hidden_layer_summary(dnn_logits, logits_scope.name) def _dnn_train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=training_util.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), name=dnn_parent_scope, variables=ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope), # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) # Build Tree Logits. global_step = training_util.get_global_step() with ops.device(global_step.device): ensemble_handle = model_ops.tree_ensemble_variable( stamp_token=0, tree_ensemble_config="", # Initialize an empty ensemble. 
name="ensemble_model") tree_features = features.copy() if dnn_input_layer_to_tree: tree_features["dnn_input_layer"] = input_layer tree_feature_columns.append(layers.real_valued_column("dnn_input_layer")) gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( is_chief=config.is_chief, num_ps_replicas=config.num_ps_replicas, ensemble_handle=ensemble_handle, center_bias=tree_center_bias, examples_per_layer=tree_examples_per_layer, learner_config=tree_learner_config, feature_columns=tree_feature_columns, logits_dimension=head.logits_dimension, features=tree_features) with ops.name_scope("gbdt"): predictions_dict = gbdt_model.predict(mode) tree_logits = predictions_dict["predictions"] def _tree_train_op_fn(loss): """Returns the op to optimize the loss.""" update_op = gbdt_model.train(loss, predictions_dict, labels) with ops.control_dependencies( [update_op]), (ops.colocate_with(global_step)): update_op = state_ops.assign_add(global_step, 1).op return update_op tree_train_logits = dnn_logits + tree_logits def _no_train_op_fn(loss): """Returns a no-op.""" del loss return control_flow_ops.no_op() model_fn_ops = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits).train_op tree_train_op = head.create_model_fn_ops( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits).train_op if tree_center_bias: num_trees += 1 finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() model_fn_ops.training_hooks.extend([ trainer_hooks.SwitchTrainOp( dnn_train_op, dnn_steps_to_train, tree_train_op), trainer_hooks.StopAfterNTrees( num_trees, attempted_trees, finalized_trees)]) return model_fn_ops
def my_drop_out(output):
    return tf.where(self.is_training,
                    tcl.dropout(output, keep_prob=keep_prob_, is_training=True),
                    output)
def _dnn_model_fn(features, labels, mode, params): """Deep Neural Net model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `_Head` instance. * hidden_units: List of hidden units per layer. * feature_columns: An iterable containing all the feature columns used by the model. * optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training. If `None`, will use the Adagrad optimizer with a default learning rate of 0.05. * activation_fn: Activation function applied to each layer. If `None`, will use `tf.nn.relu`. * dropout: When not `None`, the probability we will drop out a given coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * num_ps_replicas: The number of parameter server replicas. Returns: predictions: A dict of `Tensor` objects. loss: A scalar containing the loss of the step. train_op: The op for training. """ head = params["head"] hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] optimizer = params.get("optimizer") or "Adagrad" activation_fn = params.get("activation_fn") dropout = params.get("dropout") gradient_clip_norm = params.get("gradient_clip_norm") num_ps_replicas = params.get("num_ps_replicas", 0) features = _get_feature_dict(features) parent_scope = "dnn" input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( parent_scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, weight_collections=[parent_scope], scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=scope) if dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dropout)) _add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( parent_scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[parent_scope], scope=scope) _add_hidden_layer_summary(logits, scope.name) def _train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=contrib_variables.get_global_step(), learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm, name=parent_scope, # Empty summaries to prevent optimizers from logging the training_loss. summaries=[]) return head.head_ops(features, labels, mode, _train_op_fn, logits)
def main(unused_argv): mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) if FLAGS.download_only: sys.exit(0) # Sanity check on the number of workers and the worker index #if FLAGS.worker_index >= FLAGS.num_workers: # raise ValueError("Worker index %d exceeds number of workers %d " % # (FLAGS.worker_index, FLAGS.num_workers)) # Sanity check on the number of parameter servers if FLAGS.num_parameter_servers <= 0: raise ValueError("Invalid num_parameter_servers value: %d" % FLAGS.num_parameter_servers) # air ps_hosts = re.findall(r'[\w\.:]+', FLAGS.ps_hosts) worker_hosts = re.findall(r'[\w\.:]+', FLAGS.worker_hosts) server = tf.train.Server({"ps":ps_hosts,"worker":worker_hosts}, job_name = FLAGS.job_name, task_index = FLAGS.worker_index) print("Worker GRPC URL: %s" % server.target) print("Worker index = %d" % FLAGS.worker_index) print("Number of workers = %d" % FLAGS.num_workers) if FLAGS.job_name == "ps": server.join() # air else: is_chief = (FLAGS.worker_index == 0) if FLAGS.sync_replicas: if FLAGS.replicas_to_aggregate is None: replicas_to_aggregate = FLAGS.num_workers else: replicas_to_aggregate = FLAGS.replicas_to_aggregate # Construct device setter object device_setter = get_device_setter(FLAGS.num_parameter_servers, FLAGS.num_workers) # The device setter will automatically place Variables ops on separate # parameter servers (ps). The non-Variable ops will be placed on the workers. with tf.device(device_setter): global_step = tf.Variable(0, name="global_step", trainable=False) ''' # Variables of the hidden layer hid_w = tf.Variable( tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], stddev=1.0 / IMAGE_PIXELS), name="hid_w") hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") # Variables of the softmax layer sm_w = tf.Variable( tf.truncated_normal([FLAGS.hidden_units, 10], stddev=1.0 / math.sqrt(FLAGS.hidden_units)), name="sm_w") sm_b = tf.Variable(tf.zeros([10]), name="sm_b") ''' #air ''' W1 = tf.Variable(tf.truncated_normal([784,1024], stddev=0.01)) b1 = tf.Variable(tf.zeros([1024])) W2 = tf.Variable(tf.truncated_normal([1024,1024], stddev=0.01)) b2 = tf.Variable(tf.zeros([1024])) W3 = tf.Variable(tf.truncated_normal([1024,512], stddev=0.01)) b3 = tf.Variable(tf.zeros([512])) W4 = tf.Variable(tf.truncated_normal([512,10], stddev=0.01)) b4 = tf.Variable(tf.zeros([10])) ''' with tf.name_scope('input'): x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input") # target 10 output classes y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input") prob = tf.placeholder(tf.float32, name='keep_prob') x_image = tf.reshape(x, [-1,28,28,1]) stack1_conv1 = layers.convolution2d(x_image, 64, [3,3], weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='stack1_Conv1') stack1_conv2 = layers.convolution2d(stack1_conv1, 64, [3,3], weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='stack1_Conv2') stack1_pool = layers.max_pool2d(stack1_conv2, [2,2], padding='SAME', scope='stack1_Pool') stack3_pool_flat = layers.flatten(stack1_pool, scope='stack3_pool_flat') fcl1 = layers.fully_connected(stack3_pool_flat, 512, weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='FCL1') fcl1_d = layers.dropout(fcl1, keep_prob=prob, scope='dropout1') fcl2 = layers.fully_connected(fcl1_d, 128, weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='FCL2') fcl2_d = 
layers.dropout(fcl2, keep_prob=prob, scope='dropout2') y, cross_entropy = skflow.models.logistic_regression(fcl2_d, y_, init_stddev=0.01) '''with tf.name_scope('Softmax'): fcl_softmax = layers.fully_connected(fcl2_d, 10, weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='Softmax') y = tf.nn.softmax(fcl_softmax, name='y-output') cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), reduction_indices=[1]))''' with tf.name_scope('train'): start_l_rate = 0.001 decay_step = 1000 decay_rate = 0.5 learning_rate = tf.train.exponential_decay(start_l_rate, global_step, decay_step, decay_rate, staircase=False) grad_op = tf.train.RMSPropOptimizer(learning_rate=learning_rate) '''rep_op = tf.train.SyncReplicasOptimizer(grad_op, replicas_to_aggregate=len(workers), replica_id=FLAGS.task_index, total_num_replicas=len(workers))''' train_op = tf.contrib.layers.optimize_loss(loss=cross_entropy, global_step=global_step, learning_rate=0.001, optimizer=grad_op, clip_gradients=1) #air # Ops: located on the worker specified with FLAGS.worker_index #x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) #y = tf.placeholder(tf.float32, [None, 10]) #y_ = None ''' hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) hid = tf.nn.relu(hid_lin) y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) cross_entropy = -tf.reduce_sum(y_*tf.log(tf.clip_by_value(y, 1e-10, 1.0))) ''' #air ''' h1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1) h1d = tf.nn.dropout(h1, 0.7) h2 = tf.nn.sigmoid(tf.matmul(h1d, W2) + b2) h2d = tf.nn.dropout(h2, 0.7) h3 = tf.nn.sigmoid(tf.matmul(h2d, W3) + b3) h3d = tf.nn.dropout(h3, 0.7) y_ = tf.nn.softmax(tf.matmul(h3d, W4) + b4) cost = -tf.reduce_sum(y*tf.log(tf.clip_by_value(y_, 1e-10, 1.0))) #air opt = tf.train.AdamOptimizer(FLAGS.learning_rate)''' '''if FLAGS.sync_replicas: opt = tf.train.SyncReplicasOptimizer( opt, replicas_to_aggregate=replicas_to_aggregate, total_num_replicas=FLAGS.num_workers, replica_id=FLAGS.worker_index, name="mnist_sync_replicas")''' '''train_step = opt.minimize(cost, global_step=global_step)''' '''if FLAGS.sync_replicas and is_chief: # Initial token and chief queue runners required by the sync_replicas mode chief_queue_runner = opt.get_chief_queue_runner() init_tokens_op = opt.get_init_tokens_op()''' #air correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) #air init_op = tf.initialize_all_variables() #train_dir = tempfile.mkdtemp() sv = tf.train.Supervisor(is_chief=is_chief, #logdir=train_dir, init_op=init_op, recovery_wait_secs=1, global_step=global_step) sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.worker_index]) # The chief worker (worker_index==0) session will prepare the session, # while the remaining workers will wait for the preparation to complete. if is_chief: print("Worker %d: Initializing session..." % FLAGS.worker_index) else: print("Worker %d: Waiting for session to be initialized..." % FLAGS.worker_index) '''sess = sv.prepare_or_wait_for_session(FLAGS.worker_grpc_url, config=sess_config)''' sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) print("Worker %d: Session initialization complete." 
% FLAGS.worker_index) '''if FLAGS.sync_replicas and is_chief: # Chief worker will start the chief queue runner and call the init op print("Starting chief queue runner and running init_tokens_op") sv.start_queue_runners(sess, [chief_queue_runner]) sess.run(init_tokens_op)''' # Perform training time_begin = time.time() print("Training begins @ %s" % time.ctime(time_begin)) local_step = 1 while True: # Training feed batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) train_feed = {x: batch_xs, y_: batch_ys, prob: 0.8} _, step, loss = sess.run([train_op, global_step, cross_entropy], feed_dict=train_feed) now = time.time() if(local_step % 2 == 0): print("%s: Worker %d: training step %d done (global step: %d), loss: %.6f" % (time.ctime(now), FLAGS.worker_index, local_step, step+1, loss)) if step+1 >= FLAGS.train_steps: break local_step += 1 time_end = time.time() print("Training ends @ %s" % time.ctime(time_end)) training_time = time_end - time_begin print("Training elapsed time: %f s" % training_time) acc_acu = 0. for i in xrange(int(10000/1000)): test_x, test_y = mnist.test.next_batch(1000) #print(test_x.shape) acc_batch = sess.run(accuracy, feed_dict={x: test_x, y_: test_y, prob: 1.0}) print(acc_batch) acc_acu += acc_batch acc = acc_acu/10.0 print ("test accuracy %g" % acc) sv.stop()
def define_sequence_model(self): seed = 12345 np.random.seed(12345) layer_list = [] with self.graph.as_default() as g: utt_length = tf.placeholder(tf.int32, shape=(None)) g.add_to_collection(name="utt_length", value=utt_length) with tf.name_scope("input"): input_layer = tf.placeholder(dtype=tf.float32, shape=(None, None, self.n_in), name="input_layer") if self.dropout_rate != 0.0: print "Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate is_training_drop = tf.placeholder(dtype=tf.bool, shape=(), name="is_training_drop") input_layer_drop = dropout(input_layer, self.dropout_rate, is_training=is_training_drop) layer_list.append(input_layer_drop) g.add_to_collection(name="is_training_drop", value=is_training_drop) else: layer_list.append(input_layer) g.add_to_collection("input_layer", layer_list[0]) with tf.name_scope("hidden_layer"): basic_cell = [] if "tanh" in self.hidden_layer_type: is_training_batch = tf.placeholder( dtype=tf.bool, shape=(), name="is_training_batch") bn_params = { "is_training": is_training_batch, "decay": 0.99, "updates_collections": None } g.add_to_collection("is_training_batch", is_training_batch) for i in xrange(len(self.hidden_layer_type)): if self.dropout_rate != 0.0: if self.hidden_layer_type[i] == "tanh": new_layer = fully_connected( layer_list[-1], self.hidden_layer_size[i], activation_fn=tf.nn.tanh, normalizer_fn=batch_norm, normalizer_params=bn_params) new_layer_drop = dropout( new_layer, self.dropout_rate, is_training=is_training_drop) layer_list.append(new_layer_drop) if self.hidden_layer_type[i] == "lstm": basic_cell.append( MyDropoutWrapper(BasicLSTMCell( num_units=self.hidden_layer_size[i]), self.dropout_rate, self.dropout_rate, is_training=is_training_drop)) if self.hidden_layer_type[i] == "gru": basic_cell.append( MyDropoutWrapper(GRUCell( num_units=self.hidden_layer_size[i]), self.dropout_rate, self.dropout_rate, is_training=is_training_drop)) else: if self.hidden_layer_type[i] == "tanh": new_layer = fully_connected( layer_list[-1], self.hidden_layer_size[i], activation_fn=tf.nn.tanh, normalizer_fn=batch_norm, normalizer_params=bn_params) layer_list.append(new_layer) if self.hidden_layer_type[i] == "lstm": basic_cell.append( LayerNormBasicLSTMCell( num_units=self.hidden_layer_size[i])) if self.hidden_layer_type[i] == "gru": basic_cell.append( LayerNormGRUCell( num_units=self.hidden_layer_size[i])) multi_cell = MultiRNNCell(basic_cell) rnn_outputs, rnn_states = tf.nn.dynamic_rnn( multi_cell, layer_list[-1], dtype=tf.float32, sequence_length=utt_length) layer_list.append(rnn_outputs) with tf.name_scope("output_layer"): if self.output_type == "linear": output_layer = tf.layers.dense(rnn_outputs, self.n_out) # stacked_rnn_outputs=tf.reshape(rnn_outputs,[-1,self.n_out]) # stacked_outputs=tf.layers.dense(stacked_rnn_outputs,self.n_out) # output_layer=tf.reshape(stacked_outputs,[-1,utt_length,self.n_out]) g.add_to_collection(name="output_layer", value=output_layer) with tf.name_scope("training_op"): if self.optimizer == "adam": self.training_op = tf.train.AdamOptimizer()
def define_feedforward_model_utt(self): """ utterance index embedding last dim of input should be index TO DO LIST: embedding matrix size is fixed not fit to data """ layer_list = [] with self.graph.as_default() as g: self.global_step = tf.Variable(0, trainable=False) self.is_training_batch = tf.placeholder(tf.bool, shape=(), name="is_training_batch") # bn_params={"is_training":is_training_batch,"decay":0.99,"updates_collections":None} # g.add_to_collection("is_training_batch", is_training_batch) with tf.name_scope("input"): # shape (N, 319) self.input_lin_layer = tf.placeholder(dtype=tf.float32, shape=(None, self.n_in), name="input_layer") # embedding shape (UTT, 10) self.utt_embeddings = tf.get_variable("utt-embeddings", [1000, 10], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) # label (N, 1) self.utt_index_t = tf.placeholder(dtype=tf.int32, shape=(None, 1), name="utt_index") # embedding result (N, 1, 10) embedding_utt = tf.nn.embedding_lookup(self.utt_embeddings, self.utt_index_t) # concatenate embedding result and linguistic feature , shape (N, 329) # shape (N, 10) embedding_utt = tf.squeeze(embedding_utt, axis=-2) self.input_layer = tf.concat([self.input_lin_layer, embedding_utt], 1) if self.dropout_rate != 0.0: print("Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate) is_training_drop = tf.placeholder(dtype=tf.bool, shape=(), name="is_training_drop") input_layer_drop = dropout(self.input_layer, self.dropout_rate, is_training=is_training_drop) layer_list.append(input_layer_drop) g.add_to_collection(name="is_training_drop", value=is_training_drop) else: layer_list.append(self.input_layer) # hidden layer for i in range(len(self.hidden_layer_size)): with tf.name_scope("hidden_layer_" + str(i + 1)): if self.dropout_rate != 0.0: last_layer = layer_list[-1] if self.hidden_layer_type[i] == "tanh": new_layer=fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=None) new_layer = tf.contrib.layers.batch_norm(new_layer,is_training=self.is_training_batch) new_layer = tf.nn.tanh(new_layer) if self.hidden_layer_type[i]=="sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid) if self.hidden_layer_type[i]=="relu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.relu) if self.hidden_layer_type[i]=="selu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.selu) new_layer_drop=dropout(new_layer,self.dropout_rate,is_training=is_training_drop) layer_list.append(new_layer_drop) else: # pdb.set_trace() last_layer = layer_list[-1] if self.hidden_layer_type[i] == "tanh": new_layer = fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=None) new_layer = tf.nn.tanh(new_layer) tf.summary.histogram("%s th layer activation" % str(i), new_layer) if self.hidden_layer_type[i] == "sigmoid": new_layer = fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=tf.nn.sigmoid) if self.hidden_layer_type[i] == "relu": new_layer = fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=tf.nn.relu) if self.hidden_layer_type[i] == "selu": new_layer = fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=tf.nn.selu) layer_list.append(new_layer) with tf.name_scope("output_layer"): if self.output_type == "linear": self.output_layer = fully_connected(layer_list[-1], self.n_out, activation_fn=None) if self.output_type == "tanh": self.output_layer = fully_connected(layer_list[-1], self.n_out, 
activation_fn=tf.nn.tanh)
def __init__(self, feature_num, class_num, is_training, step=1e-4, size=64, batch_size=100):
    self.weight_decay = 5.0
    self.bn_params = {
        # Decay for the moving averages.
        'decay': 0.999,
        'center': True,
        'scale': True,
        # epsilon to prevent 0s in variance.
        'epsilon': 0.001,
        # None to force the updates during train_op
        'updates_collections': None,
        'is_training': is_training
    }
    self.batch_size = batch_size
    self.feature_num = feature_num
    self.class_num = class_num
    self.X = tf.placeholder(tf.float32, [None, feature_num])
    self.y_ = tf.placeholder(tf.float32, [None, class_num])

    with tf.contrib.framework.arg_scope(
            [layers.convolution2d],
            kernel_size=3, stride=1, padding='SAME', activation_fn=tf.nn.relu,
            normalizer_fn=layers.batch_norm,
            #normalizer_params=self.bn_params,
            #weights_initializer=layers.variance_scaling_initializer(),
            weights_regularizer=layers.l2_regularizer(self.weight_decay)):
        self.X = tf.reshape(self.X, [-1, size, size, 3])
        self.keep_prob = tf.placeholder(tf.float32)

        net = layers.convolution2d(self.X, num_outputs=8)
        net = layers.max_pool2d(net, kernel_size=2)
        net = layers.relu(net, num_outputs=8)
        net = layers.convolution2d(net, num_outputs=16)
        net = layers.convolution2d(net, num_outputs=16)
        net = layers.max_pool2d(net, kernel_size=2)
        net = layers.relu(net, num_outputs=16)
        net = layers.convolution2d(net, num_outputs=32)
        net = layers.convolution2d(net, num_outputs=32)
        net = layers.max_pool2d(net, kernel_size=2)
        net = layers.dropout(net, keep_prob=self.keep_prob)
        net = layers.relu(net, num_outputs=32)
        net = layers.convolution2d(net, num_outputs=64)
        net = layers.convolution2d(net, num_outputs=64)
        net = layers.max_pool2d(net, kernel_size=2)
        net = layers.dropout(net, keep_prob=self.keep_prob)
        net = layers.relu(net, num_outputs=64)
        # flatten to a batch of vectors (layers.flatten infers the shape itself)
        net = layers.flatten(net)
        net = layers.fully_connected(net, num_outputs=64, activation_fn=tf.nn.relu)
        net = layers.dropout(net, keep_prob=self.keep_prob)
        net = layers.fully_connected(net, num_outputs=self.class_num)
        self.y = layers.softmax(net)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(net, self.y_))
        self.optimizer = tf.train.RMSPropOptimizer(step).minimize(self.loss)
        pred = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.y_, 1))
        self.acc = tf.reduce_mean(tf.cast(pred, tf.float32))

    self.sess = tf.Session()
def __init__(self, max_seq_len, max_sent_len, num_classes, vocab_size,
             embedding_size, max_grad_norm, dropout_proba, learn_rate):
    # Params
    self.learning_rate = learn_rate
    self.vocab_size = vocab_size
    self.num_classes = num_classes
    self.max_seq_len = max_seq_len
    self.embedding_size = embedding_size
    self.word_encoder_num_hidden = max_seq_len
    self.word_output_size = max_seq_len
    self.sentence_encoder_num_hidden = max_sent_len
    self.sentence_output_size = max_sent_len
    self.max_grad_norm = max_grad_norm
    self.dropout_keep_proba = dropout_proba

    # Input
    self.input_x = tf.placeholder(shape=[None, None, None], dtype=tf.int32, name='input_x')
    self.input_y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.int32, name='input_y')
    self.word_lengths = tf.placeholder(shape=[None, None], dtype=tf.int32, name='word_lengths')
    self.sentence_lengths = tf.placeholder(shape=[None, ], dtype=tf.int32, name='sentence_lengths')
    self.is_training = tf.placeholder(dtype=tf.bool, name='is_training')

    # Input_x dim
    self.document_size, self.sentence_size, self.word_size = tf.unstack(
        tf.shape(self.input_x))

    with tf.device('/gpu:0'), tf.name_scope('embedding_layer'):
        w = tf.Variable(tf.random_uniform(
            [self.vocab_size, self.embedding_size], -1., 1.),
            dtype=tf.float32, name='W')
        self.input_x_embedded = tf.nn.embedding_lookup(w, self.input_x)
        # reshape input_x after embedding
        self.input_x_embedded = tf.reshape(self.input_x_embedded, [
            self.document_size * self.sentence_size,
            self.word_size, self.embedding_size
        ])
        self.input_x_embedded_lengths = tf.reshape(
            self.word_lengths, [self.document_size * self.sentence_size])

    with tf.variable_scope("word_level"):
        self.word_encoder_outputs = self.bidirectional_RNN(
            num_hidden=self.word_encoder_num_hidden,
            inputs=self.input_x_embedded)
        word_level_output = self.attention(
            inputs=self.word_encoder_outputs,
            output_size=self.word_output_size)
        with tf.variable_scope('dropout'):
            print('self.is_training:{}'.format(self.is_training))
            word_level_output = layers.dropout(
                word_level_output, keep_prob=self.dropout_keep_proba,
                is_training=self.is_training)

    # reshape word level output
    self.sentence_encoder_inputs = tf.reshape(
        word_level_output,
        [self.document_size, self.sentence_size, self.word_output_size])

    with tf.variable_scope('sentence_level'):
        self.sentence_encoder_outputs = self.bidirectional_RNN(
            num_hidden=self.sentence_encoder_num_hidden,
            inputs=self.sentence_encoder_inputs)
        sentence_level_output = self.attention(
            inputs=self.sentence_encoder_outputs,
            output_size=self.sentence_output_size)
        with tf.variable_scope('dropout'):
            sentence_level_output = layers.dropout(
                sentence_level_output, keep_prob=self.dropout_keep_proba,
                is_training=self.is_training)

    # Final model prediction
    with tf.variable_scope('classifier_output'):
        self.logits = layers.fully_connected(sentence_level_output,
                                             self.num_classes,
                                             activation_fn=None)
        self.predictions = tf.argmax(self.logits, axis=1, name='predictions')

    # Calculate mean cross-entropy loss
    with tf.variable_scope('loss'):
        losses = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.input_y, logits=self.logits)
        self.loss = tf.reduce_mean(losses)
        tf.summary.scalar('Loss', self.loss)

    # Accuracy
    with tf.variable_scope('accuracy'):
        correct_predictions = tf.equal(self.predictions,
                                       tf.argmax(self.input_y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'),
                                       name='accuracy')
        tf.summary.scalar('Accuracy', self.accuracy)
def define_feedforward_model(self): """ the basic deep feedforward dnn model """ layer_list=[] with self.graph.as_default() as g: #pdb.set_trace() self.global_step = tf.Variable(0,trainable=False) g.add_to_collection(name='global_step',value=self.global_step) is_training_batch=tf.placeholder(tf.bool,shape=(),name="is_training_batch") # bn_params={"is_training":is_training_batch,"decay":0.99,"updates_collections":None} g.add_to_collection("is_training_batch",is_training_batch) with tf.name_scope("input"): input_layer=tf.placeholder(dtype=tf.float32,shape=(None,self.n_in),name="input_layer") if self.dropout_rate!=0.0: print("Using dropout to avoid overfitting and the dropout rate is",self.dropout_rate) is_training_drop=tf.placeholder(dtype=tf.bool,shape=(),name="is_training_drop") input_layer_drop=dropout(input_layer,self.dropout_rate,is_training=is_training_drop) layer_list.append(input_layer_drop) g.add_to_collection(name="is_training_drop",value=is_training_drop) else: layer_list.append(input_layer) g.add_to_collection("input_layer",layer_list[0]) for i in range(len(self.hidden_layer_size)): with tf.name_scope("hidden_layer_"+str(i+1)): if self.dropout_rate!=0.0: last_layer=layer_list[-1] if self.hidden_layer_type[i]=="tanh": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=None) new_layer = tf.contrib.layers.batch_norm(new_layer,is_training=is_training_batch) new_layer = tf.nn.tanh(new_layer) if self.hidden_layer_type[i]=="sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid) if self.hidden_layer_type[i]=="relu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.relu) if self.hidden_layer_type[i]=="selu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.selu) new_layer_drop=dropout(new_layer,self.dropout_rate,is_training=is_training_drop) layer_list.append(new_layer_drop) else: # pdb.set_trace() last_layer=layer_list[-1] if self.hidden_layer_type[i]=="tanh": new_layer = fully_connected(last_layer, self.hidden_layer_size[i], activation_fn=None) # new_layer = tf.contrib.layers.batch_norm(new_layer, is_training=is_training_batch) new_layer = tf.nn.tanh(new_layer) if self.hidden_layer_type[i]=="sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid) if self.hidden_layer_type[i]=="relu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.relu) if self.hidden_layer_type[i]=="selu": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.selu) layer_list.append(new_layer) # pdb.set_trace() with tf.name_scope("output_layer"): if self.output_type=="linear": output_layer=fully_connected(layer_list[-1],self.n_out,activation_fn=None) if self.output_type=="tanh": output_layer=fully_connected(layer_list[-1],self.n_out,activation_fn=tf.nn.tanh) g.add_to_collection(name="output_layer",value=output_layer)
### DENOISING AUTOENCODERS
# force the AE to learn useful features by adding noise to the inputs;
# the reconstruction loss is still computed against the original (clean) inputs

# v1: add Gaussian noise
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
X_noisy = X + tf.random_normal(tf.shape(X))
[...]
hidden1 = activation(tf.matmul(X_noisy, weights1) + biases1)
[...]
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

# v2: "dropout" (inputs randomly switched off)
from tensorflow.contrib.layers import dropout

keep_prob = 0.7
is_training = tf.placeholder_with_default(False, shape=(), name="is_training")
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
X_drop = dropout(X, keep_prob, is_training=is_training)
[...]
hidden1 = activation(tf.matmul(X_drop, weights1) + biases1)
[...]
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
[...]
# -> during training, do not forget to set is_training to True
sess.run(training_op, feed_dict={X: X_batch, is_training: True})
# -> for testing it must be False; no need to set it explicitly, since False is the placeholder's default

### SPARSE AUTOENCODERS
# add terms to the cost function, e.g. to limit the number of significantly active neurons
# 1) compute the actual sparsity = average activation of each neuron in the coding layer over the whole training batch
# 2) penalize neurons that are too active: add a sparsity loss to the cost function
# for the sparsity loss, KL divergence works better than MSE because it has stronger gradients
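# a minimal sketch of the sparsity penalty described above, reusing the notation of these
# notes (X = inputs, hidden1 = coding layer, outputs = reconstruction); hidden1 must use a
# sigmoid activation so its mean activation stays in (0, 1)
sparsity_target = 0.1      # desired average activation per coding neuron
sparsity_weight = 0.2      # weight of the sparsity penalty in the total cost

def kl_divergence(p, q):
    # KL divergence between two Bernoulli distributions with means p and q
    return p * tf.log(p / q) + (1 - p) * tf.log((1 - p) / (1 - q))

hidden1_mean = tf.reduce_mean(hidden1, axis=0)   # mean activation of each neuron over the batch
sparsity_loss = tf.reduce_sum(kl_divergence(sparsity_target, hidden1_mean))
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
loss = reconstruction_loss + sparsity_weight * sparsity_loss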
def _dnn_model_fn(features, labels, mode, params, config=None): """Deep Neural Net model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `_Head` instance. * hidden_units: List of hidden units per layer. * feature_columns: An iterable containing all the feature columns used by the model. * optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training. If `None`, will use the Adagrad optimizer with a default learning rate of 0.05. * activation_fn: Activation function applied to each layer. If `None`, will use `tf.nn.relu`. Note that a string containing the unqualified name of the op may also be provided, e.g., "relu", "tanh", or "sigmoid". * dropout: When not `None`, the probability we will drop out a given coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * embedding_lr_multipliers: Optional. A dictionary from `EmbeddingColumn` to a `float` multiplier. Multiplier will be used to multiply with learning rate for the embedding variables. * input_layer_min_slice_size: Optional. The min slice size of input layer partitions. If not provided, will use the default of 64M. config: `RunConfig` object to configure the runtime settings. Returns: predictions: A dict of `Tensor` objects. loss: A scalar containing the loss of the step. train_op: The op for training. """ head = params["head"] hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] optimizer = params.get("optimizer") or "Adagrad" activation_fn = _get_activation_fn(params.get("activation_fn")) dropout = params.get("dropout") gradient_clip_norm = params.get("gradient_clip_norm") input_layer_min_slice_size = ( params.get("input_layer_min_slice_size") or 64 << 20) num_ps_replicas = config.num_ps_replicas if config else 0 embedding_lr_multipliers = params.get("embedding_lr_multipliers", {}) features = _get_feature_dict(features) parent_scope = "dnn" partitioner = partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas) with variable_scope.variable_scope( parent_scope, values=tuple(six.itervalues(features)), partitioner=partitioner): input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=input_layer_min_slice_size)) with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as input_layer_scope: if all([ isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access for fc in feature_columns ]): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, weight_collections=[parent_scope], scope=input_layer_scope) else: net = fc_core.input_layer( features=features, feature_columns=feature_columns, weight_collections=[parent_scope]) for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(net,)) as hidden_layer_scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=hidden_layer_scope) if dropout 
is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dropout)) _add_hidden_layer_summary(net, hidden_layer_scope.name) with variable_scope.variable_scope( "logits", values=(net,)) as logits_scope: logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[parent_scope], scope=logits_scope) _add_hidden_layer_summary(logits, logits_scope.name) def _train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=training_util.get_global_step(), learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer), gradient_multipliers=( dnn_linear_combined._extract_embedding_lr_multipliers( # pylint: disable=protected-access embedding_lr_multipliers, parent_scope, input_layer_scope.name)), clip_gradients=gradient_clip_norm, name=parent_scope, # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) return head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_train_op_fn, logits=logits)
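# _get_activation_fn is used above but not shown; a minimal sketch consistent with the
# docstring (accepts None, a callable, or a string such as "relu", "tanh" or "sigmoid"):
def _get_activation_fn(activation_fn):
    if activation_fn is None:
        return nn.relu                    # docstring: default to tf.nn.relu
    if callable(activation_fn):
        return activation_fn
    # otherwise assume the unqualified name of an op in tf.nn
    return getattr(nn, activation_fn)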
def build_larger_lenet5(X_train_shape, n_outputs, use_batch_norm=False, use_dropout=False): lprint('Mimicing LeNet-5') print('X_train_shape', X_train_shape) X = tf.placeholder(tf.float32, shape=(None, X_train_shape[1], X_train_shape[2], X_train_shape[3]), name='X') y = tf.placeholder(tf.int64, shape=(None), name='y') #fake_is_training = tf.placeholder(tf.bool, shape=(), name='is_training') last_output = X layers = [] he_init = tf.contrib.layers.variance_scaling_initializer() norm_fn = None norm_params = None is_training = tf.placeholder(tf.bool, shape=(), name='is_training') keep_prob = 0.5 lprint('Use Batch Normalization:', use_batch_norm) lprint('Use Dropout:', use_dropout, ', keep_prob:', keep_prob) if use_batch_norm: norm_fn = batch_norm norm_params = { 'is_training': is_training, 'decay': 0.99, 'updates_collections': None } with tf.name_scope('cnn'): with tf.contrib.framework.arg_scope( [fully_connected, conv2d], activation_fn=tf.nn.relu, #normalizer_fn=norm_fn, #normalizer_params=norm_params, weights_initializer=he_init): C1 = conv2d(inputs=X, num_outputs=64, kernel_size=5, stride=1, padding='SAME', normalizer_fn=norm_fn, normalizer_params=norm_params) P1 = tf.nn.max_pool(C1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME') C2 = conv2d(inputs=P1, num_outputs=64, kernel_size=5, stride=1, padding='SAME', normalizer_fn=norm_fn, normalizer_params=norm_params) P2 = tf.nn.max_pool(C2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME') C3 = conv2d(inputs=P1, num_outputs=128, kernel_size=4, stride=1, padding='SAME', normalizer_fn=norm_fn, normalizer_params=norm_params) P3 = tf.nn.max_pool(C2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') lprint('Pool3 shape:', P3) pool_shape = P2.get_shape().as_list() #shaped = tf.reshape(last_pool, [-1, 10]) reshape = tf.reshape( P3, [-1, pool_shape[1] * pool_shape[2] * pool_shape[3]]) F1 = fully_connected(reshape, 2048) if use_dropout: hidden_drop = dropout(F1, keep_prob, is_training=is_training) last_output = hidden_drop else: last_output = F1 logits = fully_connected(last_output, n_outputs, scope='outputs', activation_fn=None, weights_initializer=he_init) lprint(C1) lprint(last_output) lprint(logits) return X, y, logits, is_training
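# build_larger_lenet5 returns only the inference graph; a minimal sketch of the training and
# evaluation ops one might attach to it (X_train, n_outputs and learning_rate are assumed to
# be defined elsewhere):
X, y, logits, is_training = build_larger_lenet5(X_train.shape, n_outputs,
                                                use_batch_norm=True, use_dropout=True)
with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy)
with tf.name_scope('train'):
    training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# feed is_training=True when running training_op so batch norm statistics are updated and
# dropout is active; feed False during evaluation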
keep_prob_h2 = 0.9
keep_prob_h3 = 1

# Regularizer: L2
Ln_reg = tf.contrib.layers.l2_regularizer(lambda_ln)

# Per-class weights: class 0, class 1
class_weights = tf.constant([1.0, 2.04])

# Definition of network architecture (a): batch norm
with tf.variable_scope("dnn"):
    with tf.contrib.framework.arg_scope([fully_connected],
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        weights_regularizer=Ln_reg):
        X_drop = dropout(X, keep_prob_h1, is_training=phase)
        hidden1 = fully_connected(inputs=X_drop, num_outputs=n_hidden1, scope='hidden1')
        hidden1_drop = dropout(hidden1, keep_prob_h1, is_training=phase)
        hidden2 = fully_connected(inputs=hidden1_drop, num_outputs=n_hidden2, scope='hidden2')
        hidden2_drop = dropout(hidden2, keep_prob_h2, is_training=phase)
        hidden3 = fully_connected(inputs=hidden2_drop, num_outputs=n_hidden3, scope='hidden3')
        hidden3_drop = dropout(hidden3, keep_prob_h3, is_training=phase)
        logits = fully_connected(inputs=hidden3_drop, num_outputs=n_outputs, scope='outputs')
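# class_weights is defined above but not applied in this snippet; a hedged sketch of one way
# to use it, assuming integer labels `y`, the `logits` built above, and a learning_rate defined
# elsewhere (not the original author's code):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
example_weights = tf.gather(class_weights, y)       # look up each example's class weight
loss = tf.reduce_mean(example_weights * xentropy)   # weighted mean cross-entropy
training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)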
def build_model(x_pl, input_width, input_height, output_dim, batch_size): # make distributed representation of input image for localization network loc_l1 = pool(x_pl, kernel_size=[2, 2], scope="localization_l1") loc_l2 = conv(loc_l1, num_outputs=8, kernel_size=[5, 5], stride=[1, 1], padding="SAME", scope="localization_l2") loc_l3 = pool(loc_l2, kernel_size=[2, 2], scope="localization_l3") loc_l4 = conv(loc_l3, num_outputs=8, kernel_size=[5, 5], stride=[1, 1], padding="SAME", scope="localization_l4") loc_l4_flatten = flatten(loc_l4, scope="localization_l4-flatten") loc_l5 = dense(loc_l4_flatten, num_outputs=50, activation_fn=relu, scope="localization_l5") # set up weights for transformation (notice we always need 6 output neurons) with tf.name_scope("localization"): W_loc_out = tf.get_variable("localization_loc-out", [50, 6], initializer=tf.constant_initializer(0.0)) initial = np.array([[0.45, 0, 0], [0, 0.45, 0]]) initial = initial.astype('float32') initial = initial.flatten() b_loc_out = tf.Variable(initial_value=initial, name='b-loc-out') loc_out = tf.matmul(loc_l5, W_loc_out) + b_loc_out # spatial transformer l_trans1 = transformer(x_pl, loc_out, out_size=(OUT_HEIGHT, OUT_WIDTH)) l_trans1.set_shape([None, OUT_HEIGHT, OUT_WIDTH, NUM_COL_CHANNELS]) print( "Transformer network output shape: ", l_trans1.get_shape()) # classification network #Blok 1 conv_l11 = conv(l_trans1, num_outputs=64, kernel_size=[3, 3]) conv_l12 = conv(conv_l11, num_outputs=64, kernel_size=[3, 3]) pool_l13 = pool(conv_l12, kernel_size=[2, 2], stride=[2,2]) #Blok 2 #conv_l21 = conv(pool_l13, num_outputs=128, kernel_size=[3, 3]) #conv_l22 = conv(conv_l21, num_outputs=128, kernel_size=[3, 3]) #pool_l23 = pool(conv_l22, kernel_size=[2, 2], stride=[2,2]) #Blok 3 #conv_l31 = conv(pool_l13, num_outputs=128, kernel_size=[3, 3]) conv_l32 = conv(pool_l13, num_outputs=64, kernel_size=[3, 3]) conv_l33 = conv(conv_l32, num_outputs=64, kernel_size=[3, 3]) pool_l34 = pool(conv_l33, kernel_size=[2, 2], stride=[2,2]) #Blok 4 conv_l41 = conv(pool_l34, num_outputs=128, kernel_size=[3, 3]) conv_l42 = conv(conv_l41, num_outputs=128, kernel_size=[3, 3]) conv_l43 = conv(conv_l42, num_outputs=128, kernel_size=[3, 3]) pool_l44 = pool(conv_l43, kernel_size=[2, 2], stride=[2,2]) #Blok 5 conv_l51 = conv(pool_l44, num_outputs=256, kernel_size=[3, 3]) conv_l52 = conv(conv_l51, num_outputs=256, kernel_size=[3, 3]) conv_l53 = conv(conv_l52, num_outputs=256, kernel_size=[3, 3]) pool_l54 = pool(conv_l53, kernel_size=[2, 2], stride=[2,2]) dense_flatten = flatten(pool_l54) dense_1 = dense(dense_flatten, num_outputs=2048, activation_fn=relu) dropout_l4 =dropout(dense_1) dense_2 = dense(dropout_l4, num_outputs=2048, activation_fn=relu) dropout_l5 =dropout(dense_2) logit = dense(dropout_l5, num_outputs=output_dim, activation_fn=None) l_out = tf.nn.softmax(logit) return l_out,logit,l_trans1, loc_out
def model(inputs, dropout_keep_prob=0.5, num_classes=43, is_training=True, scope=''):
    """
    This is the implementation of the current model:
        2D Convolution
        Inception module
        Inception module
        Max Pooling
        Fully Connected Layer, Relu, Xavier initialization
        Dropout
        Fully Connected Layer, Relu, Xavier initialization
        Dropout
        Fully Connected Layer, Relu, Xavier initialization
        Dropout
        Softmax

    `inputs` : Input data
    `dropout_keep_prob` : Float, the probability that each element is kept.
    `num_classes` : Integer, number of data classes.
    `is_training` : Bool, indicating whether or not the model is in training mode.
        If so, dropout is applied and values scaled; otherwise the inputs are
        passed through unchanged.
    `scope` : String, scope of the current model
    """
    end_points = {}
    with tf.name_scope(scope, "model", [inputs]):
        with ops.arg_scope([layers.max_pool2d], padding='SAME'):
            end_points['conv0'] = layers.conv2d(inputs, 64, [7, 7], stride=2, scope='conv0')
            with tf.variable_scope("inception_3a"):
                end_points['inception_3a'] = get_inception_layer(
                    end_points['conv0'], 64, 96, 128, 16, 32, 32)
            with tf.variable_scope("inception_3b"):
                end_points['inception_3b'] = get_inception_layer(
                    end_points['inception_3a'], 128, 128, 192, 32, 96, 64)
            end_points['pool2'] = layers.max_pool2d(end_points['inception_3b'], [3, 3], scope='pool2')
            #print(end_points['pool2'].shape)
            end_points['reshape'] = tf.reshape(end_points['pool2'], [-1, 8 * 8 * 480])
            end_points['fully_2'] = layers.fully_connected(
                end_points['reshape'], 200, activation_fn=tf.nn.relu, scope='fully_2')
            end_points['dropout1'] = layers.dropout(end_points['fully_2'],
                                                    dropout_keep_prob, is_training=is_training)
            end_points['fully_3'] = layers.fully_connected(
                end_points['dropout1'], 400, activation_fn=tf.nn.relu, scope='fully_3')
            end_points['dropout2'] = layers.dropout(end_points['fully_3'],
                                                    dropout_keep_prob, is_training=is_training)
            end_points['fully_4'] = layers.fully_connected(
                end_points['dropout2'], 300, activation_fn=tf.nn.relu, scope='fully_4')
            end_points['dropout3'] = layers.dropout(end_points['fully_4'],
                                                    dropout_keep_prob, is_training=is_training)
            end_points['logits'] = layers.fully_connected(
                end_points['dropout3'], num_classes, activation_fn=None, scope='logits')
            end_points['predictions'] = tf.nn.softmax(end_points['logits'], name='predictions')
    return end_points['logits'], end_points
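# get_inception_layer is used above but not defined here; a plausible sketch, assuming the
# standard GoogLeNet branch layout (1x1, 1x1->3x3, 1x1->5x5, 3x3 pool->1x1 projection) and
# tf.contrib.layers imported as `layers` (argument names are illustrative):
def get_inception_layer(inputs, conv_1x1, conv_3x3_reduce, conv_3x3,
                        conv_5x5_reduce, conv_5x5, pool_proj):
    with tf.variable_scope('branch_1x1'):
        branch1 = layers.conv2d(inputs, conv_1x1, [1, 1])
    with tf.variable_scope('branch_3x3'):
        branch2 = layers.conv2d(inputs, conv_3x3_reduce, [1, 1])
        branch2 = layers.conv2d(branch2, conv_3x3, [3, 3])
    with tf.variable_scope('branch_5x5'):
        branch3 = layers.conv2d(inputs, conv_5x5_reduce, [1, 1])
        branch3 = layers.conv2d(branch3, conv_5x5, [5, 5])
    with tf.variable_scope('branch_pool'):
        branch4 = layers.max_pool2d(inputs, [3, 3], stride=1, padding='SAME')
        branch4 = layers.conv2d(branch4, pool_proj, [1, 1])
    # stack the four branches along the channel axis (NHWC); with the arguments used above,
    # inception_3b ends up with 128 + 192 + 96 + 64 = 480 channels, matching the reshape
    return tf.concat([branch1, branch2, branch3, branch4], axis=3)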
def build_model(embedding, options): """ Builds the entire computational graph used for training """ # description string: #words x #samples with tf.device('/gpu:0'): with tf.variable_scope('input'): x = tf.placeholder(tf.int64, shape=[None, None, None], name='x') # 3D vector batch,news and sequence(before embedding)40*32*13 x_mask = tf.placeholder(tf.float32, shape=[None, None], name='x_mask') # mask batch,news y = tf.placeholder(tf.int64, shape=[None], name='y') x_d1 = tf.placeholder(tf.int64, shape=[None, None, None, None], name='x_d1') x_d1_mask = tf.placeholder(tf.float32, shape=[None, None, None], name='x_d1_mask') x_d2 = tf.placeholder(tf.int64, shape=[None, None, None, None], name='x_d2') x_d2_mask = tf.placeholder(tf.float32, shape=[None, None, None], name='x_d2_mask') final_mask = tf.placeholder(tf.float32, shape=[None, None], name='final_mask') tech = tf.placeholder(tf.float32, shape=[None, None,7], name='technical') #shape is batch time unit # final_mask shape is day*n_samples ##TODO important keep_prob = tf.placeholder(tf.float32, [], name='keep_prob') is_training = tf.placeholder(tf.bool, name='is_training') ##TODO important sequence_mask = tf.cast(tf.abs(tf.sign(x)), tf.float32) # 3D sequence_d1_mask = tf.cast(tf.abs(tf.sign(x_d1)), tf.float32) # 4D sequence_d2_mask = tf.cast(tf.abs(tf.sign(x_d2)), tf.float32) # 4D n_timesteps = tf.shape(x)[0] # time steps n_samples = tf.shape(x)[1] # n samples # # word embedding ##TODO word embedding emb = tf.nn.embedding_lookup(embedding, x) emb_d1 = tf.nn.embedding_lookup(embedding, x_d1) emb_d2 = tf.nn.embedding_lookup(embedding, x_d2) '''if options['use_dropout']: emb = layers.dropout(emb, keep_prob=keep_prob, is_training=is_training) ''' with tf.device('/gpu:0'): # fed into the input of BILSTM from the official document ##TODO word level LSTM with tf.name_scope('news'): att = news(emb, sequence_mask, x_mask, keep_prob, is_training, options) ##TODO att shape 32*600 att_day1 32*3*600 att_day2 32*4*600 with tf.name_scope('day1'): att_day1 = days(emb_d1, sequence_d1_mask, x_d1_mask, keep_prob, is_training, options) # TODO bilstm layers # Change the time step and batch with tf.device('/gpu:0'): with tf.name_scope('day2'): att_day2 = days(emb_d2, sequence_d2_mask, x_d2_mask, keep_prob, is_training, options) with tf.name_scope('final'): final = tf.concat([att_day2, att_day1, tf.expand_dims(att, 1)], 1) '''if options['use_dropout']: final = layers.dropout(final, keep_prob=keep_prob, is_training=is_training) ''' # final shape is 8*32*600 if options['last_layer'] == 'LSTM': final = bilstm_filter(final, final_mask, keep_prob, prefix='day_lstm', dim=100, is_training=is_training) # output shape: batch,time_step,2*lstm_unit(concate) 32*7*600 #tech_ind = lstm_filter(tech, tf.ones(shape=[tf.shape(tech)[0],tf.shape(tech)[1]]), keep_prob, prefix='tech_lstm', dim=50, # is_training=is_training) ##TODO day level attention att_final = attention_v2(tf.concat(final, 2), final_mask, name='day_attention', keep=keep_prob,r=4, is_training=is_training) # already masked after attention ##TODO take day lstm average # att_final = tf.reduce_mean(tf.concat(final,2),1) # tech_att = tf.reduce_mean(tf.concat(tech_ind,2),1) ##TODO take the lasts #tech_att=tech_ind[:,-1,:] #att_final = tf.concat([att_final,tech_att],axis=1) logit = tf.layers.dense(att_final, 100, activation=tf.nn.tanh, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='ff', reuse=tf.AUTO_REUSE) # logit = tf.layers.batch_normalization(logit, 
training=is_training) # logit=tf.nn.tanh(logit) ''' # logit1 = tf.reduce_sum(tf.concat(final,2) * tf.expand_dims(final_mask,-1),0) / tf.expand_dims(tf.reduce_sum(final_mask,0),1) # logit2 = tf.reduce_max(ctx3 * tf.expand_dims(x1_mask,2),0) ''' if options['last_layer'] == 'CNN': att_ctx = tf.concat([att_day1, tf.expand_dims(att, 1)], 1) xavier = layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32) conv1 = tf.layers.conv1d(att_ctx, filters=options['CNN_filter'], kernel_size=options['CNN_kernel'], padding='same', strides=1, activation=tf.nn.relu, kernel_initializer=xavier, name='conv1') conv2 = tf.layers.conv1d(final, filters=options['CNN_filter'], kernel_size=options['CNN_kernel'], padding='same', strides=1, activation=tf.nn.relu, kernel_initializer=xavier, name='conv2') pool1 = tf.layers.max_pooling1d(conv1, pool_size=2, strides=2, padding='same', data_format='channels_last', name='pool1') pool2 = tf.layers.max_pooling1d(conv2, pool_size=2, strides=2, padding='same', data_format='channels_last', name='pool2') d1size = math.ceil(options['delay1'] / 2) * options['CNN_filter'] d2size = math.ceil(options['delay2'] / 2) * options['CNN_filter'] pool1_flat = tf.reshape(pool1, [-1, d1size]) pool2_flat = tf.reshape(pool2, [-1, d2size]) cnn_final = tf.concat([att, pool1_flat, pool2_flat], -1) logit = tf.layers.dense(cnn_final, 300, activation=tf.nn.tanh, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='ff', reuse=tf.AUTO_REUSE) # logit = tf.layers.batch_normalization(logit, training=is_training) # logit=tf.nn.tanh(logit) if options['use_dropout']: logit = layers.dropout(logit, keep_prob=keep_prob, is_training=is_training,seed=None) pred = tf.layers.dense(logit, 2, activation=None, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='fout', reuse=tf.AUTO_REUSE) logger.info('Building f_cost...') # todo not same labels = tf.one_hot(y, depth=2, axis=1) # labels = y preds = tf.nn.softmax(pred, 1,name='softmax') # preds = tf.nn.sigmoid(pred) # pred=tf.reshape(pred,[-1]) cost = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=labels) # cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,logits=pred),1) # cost = -tf.reduce_sum((tf.cast(labels, tf.float32) * tf.log(preds + 1e-8)),axis=1) #cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=y) logger.info('Done') ''' logit1 = tf.reduce_sum(ctx1 * tf.expand_dims(x_mask, 2), 0) / tf.expand_dims(tf.reduce_sum(x_mask, 0), 1) logit2 = tf.reduce_max(ctx1 * tf.expand_dims(x_mask, 2), 0) logit = tf.concat([logit1, logit2], 1) ''' with tf.variable_scope('logging'): tf.summary.scalar('current_cost', tf.reduce_mean(cost)) tf.summary.histogram('predicted_value', preds) summary = tf.summary.merge_all() return is_training, cost, x, x_mask, y, n_timesteps, preds, summary
def _dnn_classifier_model_fn(features, targets, mode, params): """Deep Neural Net model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). targets: `Tensor` of shape [batch_size, 1] or [batch_size] target labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * hidden_units: List of hidden units per layer. * feature_columns: An iterable containing all the feature columns used by the model. * n_classes: number of target classes. * weight_column_name: A string defining the weight feature column, or None if there are no weights. * optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training. * activation_fn: Activation function applied to each layer. If `None`, will use `tf.nn.relu`. * dropout: When not `None`, the probability we will drop out a given coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * enable_centered_bias: A bool. If True, estimator will learn a centered bias variable for each class. Rest of the model structure learns the residual after centered bias. * num_ps_replicas: The number of parameter server replicas. Returns: predictions: A dict of `Tensor` objects. loss: A scalar containing the loss of the step. train_op: The op for training. """ hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] n_classes = params["n_classes"] weight_column_name = params["weight_column_name"] optimizer = params["optimizer"] activation_fn = params["activation_fn"] dropout = params["dropout"] gradient_clip_norm = params["gradient_clip_norm"] enable_centered_bias = params["enable_centered_bias"] num_ps_replicas = params["num_ps_replicas"] features = _get_feature_dict(features) parent_scope = "dnn" num_label_columns = 1 if n_classes == 2 else n_classes if n_classes == 2: loss_fn = loss_ops.sigmoid_cross_entropy else: loss_fn = loss_ops.sparse_softmax_cross_entropy input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( parent_scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, weight_collections=[parent_scope], scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected(net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=scope) if dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dropout)) _add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( parent_scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: logits = layers.fully_connected(net, num_label_columns, activation_fn=None, variables_collections=[parent_scope], scope=scope) _add_hidden_layer_summary(logits, scope.name) if enable_centered_bias: logits = nn.bias_add(logits, 
_centered_bias(num_label_columns)) if mode == estimator.ModeKeys.TRAIN: loss = loss_fn(logits, targets, weight=_get_weight_tensor(features, weight_column_name)) train_ops = [ optimizers.optimize_loss( loss=loss, global_step=contrib_variables.get_global_step(), learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm, name=parent_scope) ] if enable_centered_bias: train_ops.append( _centered_bias_step(targets, loss_fn, num_label_columns)) return None, loss, control_flow_ops.group(*train_ops) elif mode == estimator.ModeKeys.EVAL: predictions = _predictions(logits=logits, n_classes=n_classes) weight = _get_weight_tensor(features, weight_column_name) training_loss = loss_fn(logits, targets, weight=weight) loss = _rescale_eval_loss(training_loss, weight) return predictions, loss, [] else: # mode == estimator.ModeKeys.INFER: predictions = _predictions(logits=logits, n_classes=n_classes) return predictions, None, []
bn_params = {
    'is_training': train_mode,
    'decay': 0.9,
    'updates_collections': None
}

# use 'arg_scope' to avoid duplicating the same arguments across layers:
# every fully_connected call below shares the activation, initializer and batch-norm settings
with arg_scope([fully_connected],
               activation_fn=tf.nn.relu,
               weights_initializer=xavier_init,
               biases_initializer=None,
               normalizer_fn=batch_norm,
               normalizer_params=bn_params):
    hidden_layer1 = fully_connected(X, hidden_output_size, scope="h1")
    h1_drop = dropout(hidden_layer1, keep_prob, is_training=train_mode)
    hidden_layer2 = fully_connected(h1_drop, hidden_output_size, scope="h2")
    h2_drop = dropout(hidden_layer2, keep_prob, is_training=train_mode)
    hidden_layer3 = fully_connected(h2_drop, hidden_output_size, scope="h3")
    h3_drop = dropout(hidden_layer3, keep_prob, is_training=train_mode)
    hidden_layer4 = fully_connected(h3_drop, hidden_output_size, scope="h4")
    h4_drop = dropout(hidden_layer4, keep_prob, is_training=train_mode)
    hypothesis = fully_connected(h4_drop, final_output_size, activation_fn=None, scope="hypothesis")

# define cost/loss & optimizer
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
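# a minimal training/evaluation sketch for the graph above; X_batch, Y_batch, X_test, Y_test
# and num_epochs are assumed to exist, train_mode is assumed to be a tf.placeholder(tf.bool),
# and keep_prob a plain Python constant. train_mode must be fed as True while training so
# batch norm updates its moving statistics and dropout is active, and as False at evaluation.
correct = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epochs):
        _, c = sess.run([optimizer, cost],
                        feed_dict={X: X_batch, Y: Y_batch, train_mode: True})
    test_acc = sess.run(accuracy,
                        feed_dict={X: X_test, Y: Y_test, train_mode: False})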
weights_regularizer=layers.l2_regularizer(1.0), biases_regularizer=layers.l2_regularizer(1.0), scope='stack3_Conv3') stack3_pool = layers.max_pool2d(stack3_conv3, [2,2], padding='SAME', scope='stack3_Pool')''' stack3_pool_flat = layers.flatten(stack1_pool, scope='stack3_pool_flat') fcl1 = layers.fully_connected( stack3_pool_flat, 512, weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='FCL1') fcl1_d = layers.dropout(fcl1, keep_prob=0.5, scope='dropout1') fcl2 = layers.fully_connected( fcl1_d, 128, weights_regularizer=layers.l2_regularizer(0.1), biases_regularizer=layers.l2_regularizer(0.1), scope='FCL2') fcl2_d = layers.dropout(fcl2, keep_prob=0.5, scope='dropout2') y, cross_entropy = skflow.models.logistic_regression(fcl2_d, y_, init_stddev=0.01) '''train_op = tf.contrib.layers.optimize_loss(loss=cross_entropy, global_step=global_step, learning_rate=0.001, optimizer='Adam', clip_gradients=1,
def _dnn_tree_combined_model_fn( features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, tree_learner_config, num_trees, tree_examples_per_layer, config=None, dnn_optimizer="Adagrad", dnn_activation_fn=nn.relu, dnn_dropout=None, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, tree_feature_columns=None, tree_center_bias=True): """DNN and GBDT combined model_fn. Args: features: `dict` of `Tensor` objects. labels: Labels used to train on. mode: Mode we are in. (TRAIN/EVAL/INFER) head: A `Head` instance. dnn_hidden_units: List of hidden units per layer. dnn_feature_columns: An iterable containing all the feature columns used by the model's DNN. tree_learner_config: A config for the tree learner. num_trees: Number of trees to grow model to after training DNN. tree_examples_per_layer: Number of examples to accumulate before growing the tree a layer. This value has a big impact on model quality and should be set equal to the number of examples in training dataset if possible. It can also be a function that computes the number of examples based on the depth of the layer that's being built. config: `RunConfig` of the estimator. dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN. If `None`, will use the Adagrad optimizer with default learning rate of 0.001. dnn_activation_fn: Activation function applied to each layer of the DNN. If `None`, will use `tf.nn.relu`. dnn_dropout: When not `None`, the probability to drop out a given unit in the DNN. dnn_input_layer_partitioner: Partitioner for input layer of the DNN. Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20. dnn_input_layer_to_tree: Whether to provide the DNN's input layer as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. tree_center_bias: Whether a separate tree should be created for first fitting the bias. Returns: A `ModelFnOps` object. Raises: ValueError: if inputs are not valid. """ if not isinstance(features, dict): raise ValueError("features should be a dictionary of `Tensor`s. " "Given type: {}".format(type(features))) if not dnn_feature_columns: raise ValueError("dnn_feature_columns must be specified") # Build DNN Logits. 
dnn_parent_scope = "dnn" dnn_partitioner = dnn_input_layer_partitioner or ( partitioned_variables.min_max_variable_partitioner( max_partitions=config.num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( dnn_parent_scope, values=tuple(six.itervalues(features)), partitioner=dnn_partitioner): with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=dnn_partitioner) as input_layer_scope: input_layer = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=input_layer_scope) previous_layer = input_layer for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(previous_layer,)) as hidden_layer_scope: net = layers.fully_connected( previous_layer, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=hidden_layer_scope) if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout)) _add_hidden_layer_summary(net, hidden_layer_scope.name) previous_layer = net with variable_scope.variable_scope( "logits", values=(previous_layer,)) as logits_scope: dnn_logits = layers.fully_connected( previous_layer, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=logits_scope) _add_hidden_layer_summary(dnn_logits, logits_scope.name) def _dnn_train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=training_util.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), name=dnn_parent_scope, variables=ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope), # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) # Build Tree Logits. global_step = training_util.get_global_step() with ops.device(global_step.device): ensemble_handle = model_ops.tree_ensemble_variable( stamp_token=0, tree_ensemble_config="", # Initialize an empty ensemble. 
name="ensemble_model") tree_features = features.copy() if dnn_input_layer_to_tree: tree_features["dnn_input_layer"] = input_layer tree_feature_columns.append(layers.real_valued_column("dnn_input_layer")) gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( is_chief=config.is_chief, num_ps_replicas=config.num_ps_replicas, ensemble_handle=ensemble_handle, center_bias=tree_center_bias, examples_per_layer=tree_examples_per_layer, learner_config=tree_learner_config, feature_columns=tree_feature_columns, logits_dimension=head.logits_dimension, features=tree_features) with ops.name_scope("gbdt"): predictions_dict = gbdt_model.predict(mode) tree_logits = predictions_dict["predictions"] def _tree_train_op_fn(loss): """Returns the op to optimize the loss.""" update_op = gbdt_model.train(loss, predictions_dict, labels) with ops.control_dependencies( [update_op]), (ops.colocate_with(global_step)): update_op = state_ops.assign_add(global_step, 1).op return update_op tree_train_logits = dnn_logits + tree_logits def _no_train_op_fn(loss): """Returns a no-op.""" del loss return control_flow_ops.no_op() model_fn_ops = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits).train_op tree_train_op = head.create_model_fn_ops( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits).train_op if tree_center_bias: num_trees += 1 finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() model_fn_ops.training_hooks.extend([ trainer_hooks.SwitchTrainOp( dnn_train_op, dnn_steps_to_train, tree_train_op), trainer_hooks.StopAfterNTrees( num_trees, attempted_trees, finalized_trees)]) return model_fn_ops
def __init__(self, num_classes=20, pretrained_embed=None, embedding_size=100, hidden_size=64, dropout_keep_proba=0.8, max_word_num=200, train_embed=True): self.num_classes = int(num_classes) self.embedding_size = int(embedding_size) self.pretrained_embed = pretrained_embed # [vocab_size, embedding_size] self.hidden_size = int(hidden_size) self.dropout_keep_proba = dropout_keep_proba self.max_word_num = int(max_word_num) self.train_embed = train_embed with tf.variable_scope('placeholder'): self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.max_word_num], name='input_x_rnn') if self.num_classes > 0: self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, self.num_classes], name='input_y_label') else: self.input_y = tf.placeholder(dtype=tf.float32, shape=[ None, ], name='input_y_label') self.is_training = tf.placeholder(dtype=tf.bool, name='is_training') with tf.variable_scope("word_embedding"): word_embedding_valid = tf.Variable( initial_value=self.pretrained_embed, trainable=self.train_embed, dtype=tf.float32) word_embedding_pad = tf.constant(value=0, dtype=tf.float32, shape=[1, self.embedding_size]) self.word_embedding_mat = tf.concat( [word_embedding_pad, word_embedding_valid], axis=0) #shape: [batch_size, max_word_num, embedding_size] self.embedded_input = tf.nn.embedding_lookup( self.word_embedding_mat, self.input_x) with tf.variable_scope("doc2vec"): # doc_encoded: [batch_size, max_word_num, hidden_size*2] doc_encoded = self.BidirectionalGRUEncoder(self.embedded_input, self.hidden_size, name='bi-gru') print("bi-GRU out shape: ", doc_encoded.shape) # doc_vec: [batch_size, hidden_size*2] doc_vec, self.weights = self.AttentionLayer(doc_encoded, self.hidden_size, name='attention') print("attention out shape: ", doc_vec.shape) doc_vec_dropped = layers.dropout(doc_vec, keep_prob=self.dropout_keep_proba, is_training=self.is_training) if self.num_classes > 0: out = layers.fully_connected(inputs=doc_vec_dropped, num_outputs=self.num_classes, activation_fn=None) else: out = layers.fully_connected(inputs=doc_vec_dropped, num_outputs=1, activation_fn=None) print("logit shape: ", out.shape) if self.num_classes > 0: with tf.variable_scope('cross_entro_loss'): # cross-entropy loss self.cross_entro = tf.losses.softmax_cross_entropy( onehot_labels=self.input_y, logits=out, reduction=tf.losses.Reduction.MEAN) else: with tf.variable_scope('mse_loss'): # mse loss self.mse = tf.losses.mean_squared_error( labels=self.input_y, predictions=tf.squeeze(out), reduction=tf.losses.Reduction.MEAN) self.predict = tf.argmax(out, axis=1, name='predict') if self.num_classes > 0: with tf.variable_scope('accuracy'): self.label = tf.argmax(self.input_y, axis=1, name='label') self.acc = tf.reduce_mean( tf.cast(tf.equal(self.predict, self.label), tf.float32))
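# BidirectionalGRUEncoder and AttentionLayer are methods referenced above but not shown.
# A hedged sketch of the encoder (forward and backward GRU over the word sequence, outputs
# concatenated to [batch, max_word_num, hidden_size * 2]); this is an assumption about the
# helper, not the original implementation:
def BidirectionalGRUEncoder(self, inputs, hidden_size, name):
    with tf.variable_scope(name):
        cell_fw = tf.nn.rnn_cell.GRUCell(hidden_size)
        cell_bw = tf.nn.rnn_cell.GRUCell(hidden_size)
        (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw, cell_bw=cell_bw, inputs=inputs, dtype=tf.float32)
        # concatenate the two directions along the feature axis
        return tf.concat([fw_out, bw_out], axis=2)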
def _dnn_linear_combined_model_fn(features, labels, mode, params): """Deep Neural Net and Linear combined model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `Head` instance. * linear_feature_columns: An iterable containing all the feature columns used by the Linear model. * linear_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the Linear model. * joint_linear_weights: If True a single (possibly partitioned) variable will be used to store the linear model weights. It's faster, but requires all columns are sparse and have the 'sum' combiner. * dnn_feature_columns: An iterable containing all the feature columns used by the DNN model. * dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN model. * dnn_hidden_units: List of hidden units per DNN layer. * dnn_activation_fn: Activation function applied to each DNN layer. If `None`, will use `tf.nn.relu`. * dnn_dropout: When not `None`, the probability we will drop out a given DNN coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * num_ps_replicas: The number of parameter server replicas. Returns: `estimator.ModelFnOps` Raises: ValueError: If both `linear_feature_columns` and `dnn_features_columns` are empty at the same time. """ head = params["head"] linear_feature_columns = params.get("linear_feature_columns") linear_optimizer = params.get("linear_optimizer") joint_linear_weights = params.get("joint_linear_weights") dnn_feature_columns = params.get("dnn_feature_columns") dnn_optimizer = params.get("dnn_optimizer") dnn_hidden_units = params.get("dnn_hidden_units") dnn_activation_fn = params.get("dnn_activation_fn") dnn_dropout = params.get("dnn_dropout") gradient_clip_norm = params.get("gradient_clip_norm") num_ps_replicas = params["num_ps_replicas"] if not linear_feature_columns and not dnn_feature_columns: raise ValueError( "Either linear_feature_columns or dnn_feature_columns must be defined.") features = _get_feature_dict(features) # Build DNN Logits. 
dnn_parent_scope = "dnn" if not dnn_feature_columns: dnn_logits = None else: input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( dnn_parent_scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( dnn_parent_scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=scope) if dnn_dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dnn_dropout)) # TODO(b/31209633): Consider adding summary before dropout. _add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( dnn_parent_scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: dnn_logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=scope) _add_hidden_layer_summary(dnn_logits, scope.name) # Build Linear logits. linear_parent_scope = "linear" if not linear_feature_columns: linear_logits = None else: linear_partitioner = partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20) with variable_scope.variable_scope( linear_parent_scope, values=features.values(), partitioner=linear_partitioner) as scope: if joint_linear_weights: linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) else: linear_logits, _, _ = layers.weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) # Combine logits and build full model. if dnn_logits is not None and linear_logits is not None: logits = dnn_logits + linear_logits elif dnn_logits is not None: logits = dnn_logits else: logits = linear_logits def _make_training_op(training_loss): """Training op for the DNN linear combined model.""" train_ops = [] if dnn_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=contrib_variables.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), clip_gradients=gradient_clip_norm, variables=ops.get_collection(dnn_parent_scope), name=dnn_parent_scope, # Empty summaries, because head already logs "loss" summary. summaries=[])) if linear_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=contrib_variables.get_global_step(), learning_rate=_linear_learning_rate(len(linear_feature_columns)), optimizer=_get_optimizer(linear_optimizer), clip_gradients=gradient_clip_norm, variables=ops.get_collection(linear_parent_scope), name=linear_parent_scope, # Empty summaries, because head already logs "loss" summary. 
summaries=[])) return control_flow_ops.group(*train_ops) return head.head_ops( features, labels, mode, _make_training_op, logits=logits)
def build_model(embedding, options): """ Builds the entire computational graph used for training """ # description string: #words x #samples with tf.device('/gpu:0'): with tf.variable_scope('input'): x = tf.placeholder( tf.int64, shape=[None, None, None], name='x' ) # 3D vector batch,N and instances(before embedding)40*32*13 x_mask = tf.placeholder(tf.float32, shape=[None, None], name='x_mask') # mask batch,N y = tf.placeholder(tf.int64, shape=[None], name='y') #group actual tech = tf.placeholder(tf.float32, shape=[None, None, 7], name='technical') #shape is batch time unit ##TODO important keep_prob = tf.placeholder(tf.float32, [], name='keep_prob') is_training = tf.placeholder(tf.bool, name='is_training') #alpha_balance = tf.placeholder(tf.float32,[],name = 'alpha_balance') ##TODO important sequence_mask = tf.cast(tf.abs(tf.sign(x)), tf.float32) # 3D n_timesteps = tf.shape(x)[0] # time steps ##TODO word embedding emb = tf.nn.embedding_lookup(embedding, x) with tf.device('/gpu:0'): # fed into the input of BILSTM from the official document with tf.name_scope('sentence_enc'): batch = tf.shape(emb)[0] #32 N = tf.shape(emb)[1] #40 N instances in a group word = tf.shape(emb)[2] #13 ##TODO make instances prediction through attention encoding and MLP with tf.variable_scope(name_or_scope='sentence_enc', reuse=tf.AUTO_REUSE): word_level_inputs = tf.reshape( emb, [batch * N, word, options['dim_word']]) word_level_mask = tf.reshape(sequence_mask, [batch * N, word]) ##TODO word level LSTM word_encoder_out = bilstm_filter( word_level_inputs, word_level_mask, keep_prob, prefix='sequence_encode', dim=options['dim'], is_training=is_training ) # output shape: batch*news,sequence,2*lstm_units(32*40)*12*600 word_encoder_out = tf.concat( word_encoder_out, 2) * tf.expand_dims(word_level_mask, -1) ################################### TODO word-attention word_level_output = attention_v2(word_encoder_out, word_level_mask, name='word_attention', keep=keep_prob, r=10, is_training=is_training) if options['use_dropout']: word_level_output = layers.dropout(word_level_output, keep_prob=keep_prob, is_training=is_training, seed=None) #32*N,D att = tf.reshape(word_level_output, [batch, N, 2 * options['dim']]) ##TODO att shape 32*40*200 with tf.name_scope('instance_prediction'): temp = tf.layers.dense( word_level_output, 150, activation=tf.nn.tanh, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='inst_temp', reuse=tf.AUTO_REUSE) if options['use_dropout']: temp = layers.dropout(temp, keep_prob=keep_prob, is_training=is_training, seed=None) pred_sig_ = tf.layers.dense( temp, 1, activation=None, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='inst_pred', reuse=tf.AUTO_REUSE) inst_pred = tf.nn.tanh(pred_sig_) #32*N,1 NOT 32,N,1, float32 ##tf.sigmoid 改成 tf.relu L = tf.reshape(inst_pred, [batch, N, 1]) L_input = L * tf.expand_dims(x_mask, -1) # mask before attention #coef = tf.concat([L for i in range(2*options['dim'])], 2) coef = tf.concat([L_input for i in range(2 * options['dim'])], 2) with tf.name_scope('Group_prediction'): bag_repre = tf.multiply(coef, att) bag_repre = tf.reduce_mean(bag_repre, axis=1) #32,200 tech_ind = lstm_filter( tech, tf.ones(shape=[tf.shape(tech)[0], tf.shape(tech)[1]]), keep_prob, prefix='tech_lstm', dim=50, is_training=is_training) #32,N,50 #TODO take day lstm average tech_att = tf.reduce_mean(tf.concat(tech_ind, 2), 1) #32,50 bag_repre = tf.concat([bag_repre, tech_att], axis=1) 
#32,250 ##TODO take the lasts #tech_att=tech_ind[:,-1,:] logit = tf.layers.dense( bag_repre, 300, activation=tf.nn.tanh, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='group_mil', reuse=tf.AUTO_REUSE) ##TODO new cost logger.info('Building f_cost...') """x_simil = Euclidean_distance(att) #32,N,N 有placeholder l_diff = instance_diff(L) #32,N,N 有placeholder simil_cost = tf.reduce_sum(tf.multiply(x_simil,l_diff),[1,2])/tf.cast(N*N,tf.float32) #32, group_cost = tf.cast(tf.square(y-group_pred),tf.float32) #32 # cost由int64变为float32 total_cost = simil_cost + alpha_balance * group_cost #[32,1] cost = tf.reshape(total_cost,(1,-1)) #1,32""" group_ = tf.layers.dense( logit, 2, activation=None, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='fout', reuse=tf.AUTO_REUSE) #32,2 labels = tf.one_hot(y, depth=2, axis=1) #32,2 group_pred = tf.nn.softmax(group_, 1, name='softmax') #32,2 cost = tf.nn.softmax_cross_entropy_with_logits_v2( logits=group_, labels=labels) #1,32""" """pred = tf.layers.dense(logit, 2, activation=None, use_bias=True, kernel_initializer=layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32), name='fout', reuse=tf.AUTO_REUSE)#32,2 labels = tf.one_hot(y, depth=2, axis=1)#32,2 preds = tf.nn.softmax(pred, 1,name='softmax') #32,2 cost = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=labels) #1,32""" logger.info('Done') with tf.variable_scope('logging'): tf.summary.scalar('current_cost', tf.reduce_mean(cost)) tf.summary.histogram('predicted_value', group_pred) summary = tf.summary.merge_all() return is_training, cost, x, x_mask, y, n_timesteps, group_pred, summary
def get_conv_model(features, labels, mode, params): parent_scope = "cnn" # TODO Need to have two: one for expand, one for conquer # features = _get_feature_dict(features) head = params.get("head") feature_columns = params.get("feature_columns") activation_fn = params.get("activation_fn") dropout = params.get("dropout") learning_rate = params.get("learning_rate") optimizer = params.get("optimizer") # with variable_scope.variable_scope( # parent_scope + "/input_from_feature_columns", # values=features.values()) as scope: # net = layers.input_from_feature_columns( # columns_to_tensors=features, # feature_columns=feature_columns, # weight_collections=[parent_scope], # scope=scope) with variable_scope.variable_scope( parent_scope + "/convlayer_1", values=[features]) as scope: net = layers.conv2d(features, num_outputs=32, kernel_size=3, variables_collections=[parent_scope], scope=scope) net = layers.max_pool2d(net, 2, stride=1, padding='SAME') with variable_scope.variable_scope( parent_scope + "/convlayer_2", values=[features]) as scope: net = layers.conv2d(features, num_outputs=64, kernel_size=5, padding='VALID', variables_collections=[parent_scope], scope=scope) # net = layers.max_pool2d(net, 1, # stride=1, # padding='SAME') # # with variable_scope.variable_scope( # parent_scope + "/max_pool_1", # values=[net]) as scope: shape = net.get_shape() net = tf.reshape(net, [-1, shape[3].value], name="reshape_1") hidden_units = [256, 128] for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, values=[net]) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=scope) if dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dropout)) with variable_scope.variable_scope( parent_scope + "/logits", values=[net]) as scope: logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[parent_scope], scope=scope) def _train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=contrib_variables.get_global_step(), learning_rate=learning_rate, optimizer=optimizer, name=parent_scope, # Empty summaries to prevent optimizers from logging the training_loss. summaries=[]) return head.head_ops(features, labels, mode, _train_op_fn, logits)
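# Note on get_conv_model above: tf.reshape(net, [-1, shape[3].value]) keeps only the channel
# dimension and folds every spatial position into the batch axis, so each spatial location is
# treated as a separate example by the dense layers. If that is not intended, the usual pattern
# before fully connected layers is to flatten all per-example dimensions instead, e.g.
#     net = layers.flatten(net)   # -> [batch, height * width * channels]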
def _dnn_tree_combined_model_fn( features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, tree_learner_config, num_trees, tree_examples_per_layer, config=None, dnn_optimizer="Adagrad", dnn_activation_fn=nn.relu, dnn_dropout=None, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, predict_with_tree_only=False, tree_feature_columns=None, tree_center_bias=False, dnn_to_tree_distillation_param=None, use_core_versions=False, output_type=model.ModelBuilderOutputType.MODEL_FN_OPS): """DNN and GBDT combined model_fn. Args: features: `dict` of `Tensor` objects. labels: Labels used to train on. mode: Mode we are in. (TRAIN/EVAL/INFER) head: A `Head` instance. dnn_hidden_units: List of hidden units per layer. dnn_feature_columns: An iterable containing all the feature columns used by the model's DNN. tree_learner_config: A config for the tree learner. num_trees: Number of trees to grow model to after training DNN. tree_examples_per_layer: Number of examples to accumulate before growing the tree a layer. This value has a big impact on model quality and should be set equal to the number of examples in training dataset if possible. It can also be a function that computes the number of examples based on the depth of the layer that's being built. config: `RunConfig` of the estimator. dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN. If `None`, will use the Adagrad optimizer with default learning rate of 0.001. dnn_activation_fn: Activation function applied to each layer of the DNN. If `None`, will use `tf.nn.relu`. dnn_dropout: When not `None`, the probability to drop out a given unit in the DNN. dnn_input_layer_partitioner: Partitioner for input layer of the DNN. Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20. dnn_input_layer_to_tree: Whether to provide the DNN's input layer as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. predict_with_tree_only: Whether to use only the tree model output as the final prediction. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. tree_center_bias: Whether a separate tree should be created for first fitting the bias. dnn_to_tree_distillation_param: A Tuple of (float, loss_fn), where the float defines the weight of the distillation loss, and the loss_fn, for computing distillation loss, takes dnn_logits, tree_logits and weight tensor. If the entire tuple is None, no distillation will be applied. If only the loss_fn is None, we will take the sigmoid/softmax cross entropy loss be default. When distillation is applied, `predict_with_tree_only` will be set to True. use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. Returns: A `ModelFnOps` object. Raises: ValueError: if inputs are not valid. """ if not isinstance(features, dict): raise ValueError("features should be a dictionary of `Tensor`s. " "Given type: {}".format(type(features))) if not dnn_feature_columns: raise ValueError("dnn_feature_columns must be specified") if dnn_to_tree_distillation_param: if not predict_with_tree_only: logging.warning("update predict_with_tree_only to True since distillation" "is specified.") predict_with_tree_only = True # Build DNN Logits. 
dnn_parent_scope = "dnn" dnn_partitioner = dnn_input_layer_partitioner or ( partitioned_variables.min_max_variable_partitioner( max_partitions=config.num_ps_replicas, min_slice_size=64 << 20)) if (output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC and not use_core_versions): raise ValueError("You must use core versions with Estimator Spec") with variable_scope.variable_scope( dnn_parent_scope, values=tuple(six.itervalues(features)), partitioner=dnn_partitioner): with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=dnn_partitioner) as input_layer_scope: if use_core_versions: input_layer = feature_column_lib.input_layer( features=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope]) else: input_layer = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=input_layer_scope) previous_layer = input_layer for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(previous_layer,)) as hidden_layer_scope: net = layers.fully_connected( previous_layer, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=hidden_layer_scope) if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout)) _add_hidden_layer_summary(net, hidden_layer_scope.name) previous_layer = net with variable_scope.variable_scope( "logits", values=(previous_layer,)) as logits_scope: dnn_logits = layers.fully_connected( previous_layer, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=logits_scope) _add_hidden_layer_summary(dnn_logits, logits_scope.name) def _dnn_train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=training_util.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), name=dnn_parent_scope, variables=ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope), # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) # Build Tree Logits. global_step = training_util.get_global_step() with ops.device(global_step.device): ensemble_handle = model_ops.tree_ensemble_variable( stamp_token=0, tree_ensemble_config="", # Initialize an empty ensemble. 
name="ensemble_model") tree_features = features.copy() if dnn_input_layer_to_tree: tree_features["dnn_input_layer"] = input_layer tree_feature_columns.append(layers.real_valued_column("dnn_input_layer")) gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( is_chief=config.is_chief, num_ps_replicas=config.num_ps_replicas, ensemble_handle=ensemble_handle, center_bias=tree_center_bias, examples_per_layer=tree_examples_per_layer, learner_config=tree_learner_config, feature_columns=tree_feature_columns, logits_dimension=head.logits_dimension, features=tree_features, use_core_columns=use_core_versions) with ops.name_scope("gbdt"): predictions_dict = gbdt_model.predict(mode) tree_logits = predictions_dict["predictions"] def _tree_train_op_fn(loss): """Returns the op to optimize the loss.""" if dnn_to_tree_distillation_param: loss_weight, loss_fn = dnn_to_tree_distillation_param weight_tensor = head_lib._weight_tensor( # pylint: disable=protected-access features, head.weight_column_name) dnn_logits_fixed = array_ops.stop_gradient(dnn_logits) if loss_fn is None: # we create the loss_fn similar to the head loss_fn for # multi_class_head used previously as the default one. n_classes = 2 if head.logits_dimension == 1 else head.logits_dimension loss_fn = distillation_loss.create_dnn_to_tree_cross_entropy_loss_fn( n_classes) dnn_to_tree_distillation_loss = loss_weight * loss_fn( dnn_logits_fixed, tree_logits, weight_tensor) summary.scalar("dnn_to_tree_distillation_loss", dnn_to_tree_distillation_loss) loss += dnn_to_tree_distillation_loss update_op = gbdt_model.train(loss, predictions_dict, labels) with ops.control_dependencies( [update_op]), (ops.colocate_with(global_step)): update_op = state_ops.assign_add(global_step, 1).op return update_op if predict_with_tree_only: if mode == model_fn.ModeKeys.TRAIN or mode == model_fn.ModeKeys.INFER: tree_train_logits = tree_logits else: tree_train_logits = control_flow_ops.cond( global_step > dnn_steps_to_train, lambda: tree_logits, lambda: dnn_logits) else: tree_train_logits = dnn_logits + tree_logits def _no_train_op_fn(loss): """Returns a no-op.""" del loss return control_flow_ops.no_op() if tree_center_bias: num_trees += 1 finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() if output_type == model.ModelBuilderOutputType.MODEL_FN_OPS: if use_core_versions: model_fn_ops = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits) dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops( dnn_train_op).train_op tree_train_op = head.create_estimator_spec( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits) tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops( tree_train_op).train_op model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops( model_fn_ops) else: model_fn_ops = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits).train_op tree_train_op = head.create_model_fn_ops( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits).train_op # Add the hooks model_fn_ops.training_hooks.extend([ 
trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train, tree_train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees) ]) return model_fn_ops elif output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC: fusion_spec = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_spec = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits) tree_spec = head.create_estimator_spec( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits) training_hooks = [ trainer_hooks.SwitchTrainOp(dnn_spec.train_op, dnn_steps_to_train, tree_spec.train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees) ] fusion_spec = fusion_spec._replace(training_hooks=training_hooks + list(fusion_spec.training_hooks)) return fusion_spec
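# Hedged sketch (not the contrib implementation): in the combined model_fn above, the
# hand-off from the DNN tower to the tree tower is a step-conditioned choice between two
# logits tensors, and the default fusion path simply sums them. All names and sizes below
# are illustrative assumptions.
import tensorflow as tf

dnn_steps_to_train = 10000
global_step = tf.train.get_or_create_global_step()

dnn_logits = tf.placeholder(tf.float32, [None, 1], name="dnn_logits")
tree_logits = tf.placeholder(tf.float32, [None, 1], name="tree_logits")

# predict_with_tree_only-style switch: use DNN logits for the first
# dnn_steps_to_train steps, then the tree logits take over.
switched_logits = tf.cond(
    tf.greater(global_step, dnn_steps_to_train),
    lambda: tree_logits,
    lambda: dnn_logits)

# Default path when predict_with_tree_only is False: sum the two towers.
fused_logits = dnn_logits + tree_logits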
def _init_network(self): """Defines the tensorflow network.""" # Placeholders for dataset self.state_data = tf.placeholder(tf.float32, (None, None, self.dX)) self.K_data = tf.placeholder(tf.float32, (None, self.dU, self.dX)) self.k_data = tf.placeholder(tf.float32, (None, self.dU)) self.precision_data = tf.placeholder(tf.float32, (None, self.dU, self.dU)) dataset = tf.data.Dataset.from_tensor_slices( ( self.state_data, self.K_data, self.k_data, self.precision_data, ) ).shuffle(10000).batch(self.batch_size).repeat() # Batch iterator self.iterator = dataset.make_initializable_iterator() state_batch, self.K_batch, self.k_batch, self.precision_batch = self.iterator.get_next() # Compose and normalize state batch state_batch = tf.concat( values=[ state_batch, ], axis=1 ) # Other placeholders self.state_batch = tf.reshape(state_batch, (-1, self.dX)) self.is_training = tf.placeholder(tf.bool, ()) self.K_center = tf.placeholder(tf.float32, (self.dU, self.dX)) self.K_scale = tf.placeholder(tf.float32, (self.dU, self.dX)) with tf.variable_scope('state_normalization'): state_batch_normalized = tf.layers.batch_normalization( self.state_batch, training=self.is_training, center=False, scale=False, renorm=True ) # Action estimator with tf.variable_scope('action_estimator'), arg_scope( [layers.fully_connected], activation_fn=tf.nn.leaky_relu, weights_regularizer=layers.l2_regularizer(scale=self.weight_decay) ): h = layers.fully_connected(state_batch_normalized, self.N_hidden) h = layers.fully_connected(h, self.N_hidden) h = layers.fully_connected(h, self.N_hidden) self.action_estimation = layers.fully_connected(h, self.dU, activation_fn=None) # Stabilizer estimator with tf.variable_scope('stabilizer_estimator'), arg_scope( [layers.fully_connected], activation_fn=tf.nn.leaky_relu, weights_regularizer=layers.l2_regularizer(scale=self.weight_decay), ): # Encoder h = layers.fully_connected(state_batch_normalized, self.N_hidden * self.dX) self.latent = layers.fully_connected(h, self.dZ, activation_fn=None) # Stabilizer Translation h = layers.fully_connected(self.latent, self.N_hidden * self.dX, biases_initializer=None) h = layers.dropout(h, keep_prob=1 - self.dropout_rate, is_training=self.is_training) h = layers.fully_connected(h, self.N_hidden * self.dX, biases_initializer=None) h = layers.dropout(h, keep_prob=1 - self.dropout_rate, is_training=self.is_training) self.stabilizer_estimation = tf.reshape( layers.fully_connected(h, self.dX * self.dU, activation_fn=None, biases_initializer=None), (-1, self.dU, self.dX) ) self.action_regulation = tf.einsum( 'inm,im->in', self.stabilizer_estimation * self.K_scale + self.K_center, # Reverse K standardization self.state_batch, ) self.action_out = self.action_estimation + self.action_regulation
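# Hedged usage sketch for the placeholder-fed tf.data pipeline built in _init_network
# above: the initializable iterator is (re)initialized with the full arrays bound to the
# placeholders, after which get_next() yields shuffled batches. Shapes are illustrative.
import numpy as np
import tensorflow as tf

dX, dU, batch_size = 6, 2, 32
state_data = tf.placeholder(tf.float32, (None, None, dX))
K_data = tf.placeholder(tf.float32, (None, dU, dX))

dataset = (tf.data.Dataset
           .from_tensor_slices((state_data, K_data))
           .shuffle(10000)
           .batch(batch_size)
           .repeat())
iterator = dataset.make_initializable_iterator()
state_batch, K_batch = iterator.get_next()

with tf.Session() as sess:
    # Bind dummy trajectories to the placeholders when initializing the iterator.
    sess.run(iterator.initializer, feed_dict={
        state_data: np.zeros((100, 5, dX), np.float32),
        K_data: np.zeros((100, dU, dX), np.float32),
    })
    s, K = sess.run([state_batch, K_batch])  # one shuffled batch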
def dnn_sampled_softmax_classifier_model_fn(features, target_indices, mode, params): """model_fn that uses candidate sampling. Args: features: Single Tensor or dict of Tensor (depends on data passed to `fit`) target_indices: A single Tensor of shape [batch_size, n_labels] containing the target indices. mode: Represents if this training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters that are listed below. hidden_units- List of hidden units per layer. All layers are fully connected. Ex. `[64, 32]` means first layer has 64 nodes and second one has 32. feature_columns- An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. n_classes- number of target classes. It must be greater than 2. n_samples- number of sample target classes. Needs to be tuned - A good starting point could be 2% of n_classes. n_labels- number of labels in each example. top_k- The number of classes to predict. optimizer- An instance of `tf.Optimizer` used to train the model. If `None`, will use an Adagrad optimizer. dropout- When not `None`, the probability we will drop out a given coordinate. gradient_clip_norm- A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. See tf.clip_by_global_norm for more details. num_ps_replicas- The number of parameter server replicas. Returns: predictions: A single Tensor or a dict of Tensors. loss: A scalar containing the loss of the step. train_op: The op for training. """ hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] n_classes = params["n_classes"] n_samples = params["n_samples"] n_labels = params["n_labels"] top_k = params["top_k"] optimizer = params["optimizer"] dropout = params["dropout"] gradient_clip_norm = params["gradient_clip_norm"] num_ps_replicas = params["num_ps_replicas"] parent_scope = "dnn_ss" # Setup the input layer partitioner. input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) # Create the input layer. with variable_scope.variable_scope( parent_scope + "/input_from_feature_columns", features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( features, feature_columns, weight_collections=[parent_scope], scope=scope) # Setup the hidden layer partitioner. hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) final_hidden_layer_dim = None # Create hidden layers using fully_connected. for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, [net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected(net, num_hidden_units, variables_collections=[parent_scope], scope=scope) final_hidden_layer_dim = num_hidden_units # Add dropout if it is enabled. if dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dropout)) # Create the weights and biases for the logit layer. 
with variable_scope.variable_scope( parent_scope + "/logits", [net], partitioner=hidden_layer_partitioner) as scope: dtype = net.dtype.base_dtype weights_shape = [n_classes, final_hidden_layer_dim] weights = variables.model_variable( "weights", shape=weights_shape, dtype=dtype, initializer=initializers.xavier_initializer(), trainable=True, collections=[parent_scope]) biases = variables.model_variable( "biases", shape=[n_classes,], dtype=dtype, initializer=init_ops.zeros_initializer, trainable=True, collections=[parent_scope]) if mode == estimator.ModeKeys.TRAIN: # Call the candidate sampling APIs and calculate the loss. sampled_values = nn.learned_unigram_candidate_sampler( true_classes=math_ops.to_int64(target_indices), num_true=n_labels, num_sampled=n_samples, unique=True, range_max=n_classes) sampled_softmax_loss = nn.sampled_softmax_loss( weights=weights, biases=biases, inputs=net, labels=math_ops.to_int64(target_indices), num_sampled=n_samples, num_classes=n_classes, num_true=n_labels, sampled_values=sampled_values) loss = math_ops.reduce_mean(sampled_softmax_loss, name="loss") train_op = optimizers.optimize_loss( loss=loss, global_step=contrib_framework.get_global_step(), learning_rate=_DEFAULT_LEARNING_RATE, optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm, name=parent_scope) return None, loss, train_op elif mode == estimator.ModeKeys.EVAL: logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)), biases) predictions = {} predictions[_PROBABILITIES] = nn.softmax(logits) predictions[_CLASSES] = math_ops.argmax(logits, 1) _, predictions[_TOP_K] = nn.top_k(logits, top_k) # Since the targets have multiple labels, setup the target probabilities # as 1.0/n_labels for each of the labels. target_one_hot = array_ops.one_hot( indices=target_indices, depth=n_classes, on_value=1.0 / n_labels) target_one_hot = math_ops.reduce_sum( input_tensor=target_one_hot, reduction_indices=[1]) loss = math_ops.reduce_mean( nn.softmax_cross_entropy_with_logits(logits, target_one_hot)) return predictions, loss, None elif mode == estimator.ModeKeys.INFER: logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)), biases) predictions = {} predictions[_PROBABILITIES] = nn.softmax(logits) predictions[_CLASSES] = math_ops.argmax(logits, 1) _, predictions[_TOP_K] = nn.top_k(logits, top_k) return predictions, None, None
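# Hedged sketch of the candidate-sampling pattern used by the model_fn above: sampled
# softmax is only used for the training loss, while eval/inference rebuilds full logits
# from the same weights and biases. Unlike the contrib code, this sketch relies on the
# default candidate sampler rather than an explicit learned-unigram sampler; sizes are
# illustrative.
import tensorflow as tf

n_classes, n_samples, dim = 50000, 100, 64
net = tf.placeholder(tf.float32, [None, dim])
labels = tf.placeholder(tf.int64, [None, 1])       # one true class per example

weights = tf.get_variable("softmax_w", [n_classes, dim])
biases = tf.get_variable("softmax_b", [n_classes])

# Training: loss computed over a sampled subset of the output classes.
train_loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=weights, biases=biases,
    labels=labels, inputs=net,
    num_sampled=n_samples, num_classes=n_classes))

# Eval/inference: full logits from the same parameters.
logits = tf.nn.bias_add(tf.matmul(net, weights, transpose_b=True), biases)
probabilities = tf.nn.softmax(logits)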
bn_params = { 'is_training': train_mode, 'decay': 0.9, 'updates_collections': None } # We can write shorter code using 'arg_scope' to avoid duplicating code # the same function is applied with different arguments with arg_scope([fully_connected], activation_fn=tf.nn.relu, weights_initializer=xavier_init, biases_initializer=None, normalizer_fn=batch_norm, normalizer_params=bn_params ): hidden_layer1 = fully_connected(X, hidden_output_size, scope="h1") h1_drop = dropout(hidden_layer1, keep_prob, is_training=train_mode) hidden_layer2 = fully_connected(h1_drop, hidden_output_size, scope="h2") h2_drop = dropout(hidden_layer2, keep_prob, is_training=train_mode) hidden_layer3 = fully_connected(h2_drop, hidden_output_size, scope="h3") h3_drop = dropout(hidden_layer3, keep_prob, is_training=train_mode) hidden_layer4 = fully_connected(h3_drop, hidden_output_size, scope="h4") h4_drop = dropout(hidden_layer4, keep_prob, is_training=train_mode) hypothesis = fully_connected(h4_drop, final_output_size, activation_fn=None, scope="hypothesis") # define cost/loss & optimizer cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( logits=hypothesis, labels=Y)) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # initialize
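# Hedged usage sketch for the arg_scope/batch_norm/dropout block above: the same graph is
# driven in train and eval mode purely through the feed_dict. The placeholder names and
# sizes below are assumptions, not taken from the original script.
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected, batch_norm, dropout

X = tf.placeholder(tf.float32, [None, 784])
train_mode = tf.placeholder(tf.bool, name="train_mode")
keep_prob = 0.7

bn_params = {'is_training': train_mode, 'decay': 0.9, 'updates_collections': None}
h = fully_connected(X, 256, normalizer_fn=batch_norm, normalizer_params=bn_params)
h = dropout(h, keep_prob, is_training=train_mode)
logits = fully_connected(h, 10, activation_fn=None)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_feed = {X: [[0.0] * 784], train_mode: True}   # BN uses batch stats, dropout active
    eval_feed = {X: [[0.0] * 784], train_mode: False}    # BN uses moving stats, dropout is a no-op
    sess.run(logits, feed_dict=train_feed)
    sess.run(logits, feed_dict=eval_feed)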
with tf.variable_scope("M-LSTM-1", reuse=tf.AUTO_REUSE): cell_1 = SkipLSTMCell(num_units=64) initial_state_1 = cell_1.trainable_initial_state(batch_size=batch_size) hidden_1 = conv1d(x_m_lstm, num_outputs=1, kernel_size=1, padding='VALID', stride=1, weights_regularizer=l2_regularizer(scale=1.0e-3)) rnn_outputs_1, _ = tf.nn.dynamic_rnn(cell_1, hidden_1, dtype=tf.float32, initial_state=initial_state_1) rnn_outputs_1 = rnn_outputs_1.h[:, -1, :] hidden_2 = dropout(inputs=rnn_outputs_1, keep_prob=0.7) output_1 = fully_connected(hidden_2, num_outputs=32) # M-LSTM (2) with tf.variable_scope("M-LSTM-2", reuse=tf.AUTO_REUSE): cell_2 = SkipLSTMCell(num_units=64) initial_state_2 = cell_2.trainable_initial_state(batch_size=batch_size) hidden_3 = conv1d(x_m_lstm, num_outputs=1, kernel_size=4, padding='VALID', stride=2, weights_regularizer=l2_regularizer(scale=1.0e-3)) rnn_outputs_2, _ = tf.nn.dynamic_rnn(cell_2, hidden_3,
def _dnn_linear_combined_model_fn(features, labels, mode, params): """Deep Neural Net and Linear combined model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `Head` instance. * linear_feature_columns: An iterable containing all the feature columns used by the Linear model. * linear_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the Linear model. * joint_linear_weights: If True a single (possibly partitioned) variable will be used to store the linear model weights. It's faster, but requires all columns are sparse and have the 'sum' combiner. * dnn_feature_columns: An iterable containing all the feature columns used by the DNN model. * dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN model. * dnn_hidden_units: List of hidden units per DNN layer. * dnn_activation_fn: Activation function applied to each DNN layer. If `None`, will use `tf.nn.relu`. * dnn_dropout: When not `None`, the probability we will drop out a given DNN coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * num_ps_replicas: The number of parameter server replicas. Returns: `estimator.ModelFnOps` Raises: ValueError: If both `linear_feature_columns` and `dnn_features_columns` are empty at the same time. """ head = params["head"] linear_feature_columns = params.get("linear_feature_columns") linear_optimizer = params.get("linear_optimizer") joint_linear_weights = params.get("joint_linear_weights") dnn_feature_columns = params.get("dnn_feature_columns") dnn_optimizer = params.get("dnn_optimizer") dnn_hidden_units = params.get("dnn_hidden_units") dnn_activation_fn = params.get("dnn_activation_fn") dnn_dropout = params.get("dnn_dropout") gradient_clip_norm = params.get("gradient_clip_norm") num_ps_replicas = params["num_ps_replicas"] if not linear_feature_columns and not dnn_feature_columns: raise ValueError( "Either linear_feature_columns or dnn_feature_columns must be defined.") features = _get_feature_dict(features) # Build DNN Logits. 
dnn_parent_scope = "dnn" if not dnn_feature_columns: dnn_logits = None else: input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( dnn_parent_scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( dnn_parent_scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=scope) if dnn_dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dnn_dropout)) # TODO(b/31209633): Consider adding summary before dropout. _add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( dnn_parent_scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: dnn_logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=scope) _add_hidden_layer_summary(dnn_logits, scope.name) # Build Linear logits. linear_parent_scope = "linear" if not linear_feature_columns: linear_logits = None else: linear_partitioner = partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20) with variable_scope.variable_scope( linear_parent_scope, values=features.values(), partitioner=linear_partitioner) as scope: if joint_linear_weights: linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) else: linear_logits, _, _ = layers.weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) # Combine logits and build full model. if dnn_logits is not None and linear_logits is not None: logits = dnn_logits + linear_logits elif dnn_logits is not None: logits = dnn_logits else: logits = linear_logits def _make_training_op(training_loss): """Training op for the DNN linear combined model.""" train_ops = [] if dnn_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=contrib_variables.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), clip_gradients=gradient_clip_norm, variables=ops.get_collection(dnn_parent_scope), name=dnn_parent_scope, # Empty summaries, because head already logs "loss" summary. summaries=[])) if linear_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=contrib_variables.get_global_step(), learning_rate=_linear_learning_rate(len(linear_feature_columns)), optimizer=_get_optimizer(linear_optimizer), clip_gradients=gradient_clip_norm, variables=ops.get_collection(linear_parent_scope), name=linear_parent_scope, # Empty summaries, because head already logs "loss" summary. 
summaries=[])) return control_flow_ops.group(*train_ops) return head.head_ops( features, labels, mode, _make_training_op, logits=logits)
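# Hedged, simplified sketch of the wide-and-deep training pattern above (not the contrib
# implementation): the linear and DNN towers get separate optimizers restricted to their
# own variable scopes, and the two update ops are grouped into a single train op.
# Learning rates and layer sizes are illustrative.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 10])
y = tf.placeholder(tf.float32, [None, 1])

with tf.variable_scope("linear"):
    linear_logits = tf.layers.dense(x, 1)
with tf.variable_scope("dnn"):
    h = tf.layers.dense(x, 32, activation=tf.nn.relu)
    dnn_logits = tf.layers.dense(h, 1)

logits = linear_logits + dnn_logits                      # combine the two towers
loss = tf.losses.sigmoid_cross_entropy(y, logits)

linear_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="linear")
dnn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dnn")

train_op = tf.group(
    tf.train.FtrlOptimizer(0.2).minimize(loss, var_list=linear_vars),
    tf.train.AdagradOptimizer(0.05).minimize(loss, var_list=dnn_vars))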
def __init__(self, word_embeddings, setting): self.vocab_size = setting.vocab_size self.len_sentence= len_sentence = setting.len_sentence self.num_epochs = setting.num_epochs self.num_classes = num_classes =setting.num_classes self.cnn_size = setting.cnn_size self.num_layers = setting.num_layers self.pos_size = setting.pos_size self.pos_num = setting.pos_num self.word_embedding = setting.word_embedding self.lr = setting.lr word_embedding = tf.get_variable(initializer=word_embeddings, name='word_embedding') pos1_embedding = tf.get_variable('pos1_embedding', [self.pos_num, self.pos_size]) pos2_embedding = tf.get_variable('pos2_embedding', [self.pos_num, self.pos_size]) #relation_embedding = tf.get_variable('relation_embedding', [self.num_classes, self.cnn_size]) self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_word') self.input_pos1 = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_pos1') self.input_pos2 = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_pos2') self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') self.keep_prob = tf.placeholder(tf.float32) self.input_word_ebd = tf.nn.embedding_lookup(word_embedding, self.input_word) self.input_pos1_ebd = tf.nn.embedding_lookup(pos1_embedding, self.input_pos1) self.input_pos2_ebd = tf.nn.embedding_lookup(pos2_embedding, self.input_pos2) self.inputs = tf.concat(axis=2,values=[self.input_word_ebd,self.input_pos1_ebd,self.input_pos2_ebd]) self.inputs = tf.reshape(self.inputs, [-1,self.len_sentence,self.word_embedding+self.pos_size*2,1] ) conv = layers.conv2d(inputs =self.inputs ,num_outputs = self.cnn_size ,kernel_size = [3,60],stride=[1,60],padding='SAME') max_pool = layers.max_pool2d(conv,kernel_size = [70,1],stride=[1,1]) self.sentence = tf.reshape(max_pool, [-1, self.cnn_size]) tanh = tf.nn.tanh(self.sentence) drop = layers.dropout(tanh,keep_prob=self.keep_prob) self.outputs = layers.fully_connected(inputs = drop,num_outputs = self.num_classes,activation_fn = tf.nn.softmax) ''' self.y_index = tf.argmax(self.input_y,1,output_type=tf.int32) self.indexes = tf.range(0, tf.shape(self.outputs)[0]) * tf.shape(self.outputs)[1] + self.y_index self.responsible_outputs = - tf.reduce_mean(tf.log(tf.gather(tf.reshape(self.outputs, [-1]),self.indexes))) ''' #loss self.cross_loss = -tf.reduce_mean( tf.log(tf.reduce_sum( self.input_y * self.outputs ,axis=1))) self.reward = tf.log(tf.reduce_sum( self.input_y * self.outputs ,axis=1)) self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), weights_list=tf.trainable_variables()) self.final_loss = self.cross_loss + self.l2_loss #accuracy self.pred = tf.argmax(self.outputs,axis=1) self.pred_prob = tf.reduce_max(self.outputs,axis=1) self.y_label = tf.argmax(self.input_y,axis=1) self.accuracy = tf.reduce_mean(tf.cast( tf.equal(self.pred,self.y_label), 'float')) #minimize loss optimizer = tf.train.AdamOptimizer(self.lr) self.train_op = optimizer.minimize(self.final_loss) self.tvars = tf.trainable_variables() # manual update parameters self.tvars_holders = [] for idx, var in enumerate(self.tvars): placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder') self.tvars_holders.append(placeholder) self.update_tvar_holder = [] for idx, var in enumerate(self.tvars): update_tvar = tf.assign(var, self.tvars_holders[idx]) self.update_tvar_holder.append(update_tvar)
def __call__(self, inputs, is_training=False, reuse=None): with tf.variable_scope(self.name, reuse=reuse): with arg_scope([layers.batch_norm], scale=True, fused=True, data_format=self.data_format, is_training=is_training): with arg_scope([layers.conv2d], activation_fn=tf.nn.relu, normalizer_fn=layers.batch_norm, biases_initializer=None, weights_regularizer=layers.l2_regularizer( self.weight_decay), data_format=self.data_format): if self.data_format == 'NCHW': inputs = tf.transpose(inputs, [0, 3, 1, 2]) with tf.variable_scope('conv1'): net = layers.conv2d(inputs, num_outputs=64, kernel_size=7, stride=2) net = layers.max_pool2d(net, kernel_size=3, stride=2, padding='SAME', data_format=self.data_format) with tf.variable_scope('conv2'): net = layers.repeat(net, self.num_block[0], self.SEresBlock, self.num_outputs[0]) with tf.variable_scope('conv3'): net = self.resBlock(net, num_outputs=self.num_outputs[1], stride=2) net = layers.repeat(net, self.num_block[1] - 1, self.SEresBlock, self.num_outputs[1]) with tf.variable_scope('conv4'): net = self.resBlock(net, num_outputs=self.num_outputs[2], stride=2) net = layers.repeat(net, self.num_block[2] - 1, self.SEresBlock, self.num_outputs[2]) with tf.variable_scope('conv5'): net = self.resBlock(net, num_outputs=self.num_outputs[3], stride=2) net = layers.repeat(net, self.num_block[3] - 1, self.SEresBlock, self.num_outputs[3]) if self.data_format == 'NCHW': net = tf.reduce_mean(net, [2, 3]) net = tf.reshape(net, [-1, net.get_shape().as_list()[1]]) else: net = tf.reduce_mean(net, [1, 2]) net = tf.reshape( net, [-1, net.get_shape().as_list()[-1]]) if is_training: net = layers.dropout(net, keep_prob=0.5) pre_logits = layers.fully_connected( net, num_outputs=128, activation_fn=None, weights_regularizer=layers.l2_regularizer( self.weight_decay)) return pre_logits
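# Hedged sketch of the reuse pattern behind the __call__ above: the same variable scope is
# entered twice (reuse=None, then reuse=True) so two image batches are embedded with shared
# weights, e.g. for a verification pair. The toy embed() below stands in for the SE-ResNet;
# all names and sizes are illustrative.
import tensorflow as tf
from tensorflow.contrib import layers

def embed(images, reuse=None, is_training=False):
    with tf.variable_scope("se_resnet", reuse=reuse):
        net = layers.conv2d(images, 64, kernel_size=7, stride=2)
        net = tf.reduce_mean(net, [1, 2])            # global average pool (NHWC)
        if is_training:
            net = layers.dropout(net, keep_prob=0.5)
        return layers.fully_connected(net, 128, activation_fn=None)

anchor = tf.placeholder(tf.float32, [None, 112, 112, 3])
other = tf.placeholder(tf.float32, [None, 112, 112, 3])
emb_a = embed(anchor, reuse=None, is_training=True)
emb_b = embed(other, reuse=True)                      # shares all weights with emb_a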
def _dnn_classifier_model_fn(features, targets, mode, params): """Deep Neural Net model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). targets: `Tensor` of shape [batch_size, 1] or [batch_size] target labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * hidden_units: List of hidden units per layer. * feature_columns: An iterable containing all the feature columns used by the model. * n_classes: number of target classes. * weight_column_name: A string defining the weight feature column, or None if there are no weights. * optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training. * activation_fn: Activation function applied to each layer. If `None`, will use `tf.nn.relu`. * dropout: When not `None`, the probability we will drop out a given coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * enable_centered_bias: A bool. If True, estimator will learn a centered bias variable for each class. Rest of the model structure learns the residual after centered bias. * num_ps_replicas: The number of parameter server replicas. Returns: predictions: A dict of `Tensor` objects. loss: A scalar containing the loss of the step. train_op: The op for training. """ hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] n_classes = params["n_classes"] weight_column_name = params["weight_column_name"] optimizer = params["optimizer"] activation_fn = params["activation_fn"] dropout = params["dropout"] gradient_clip_norm = params["gradient_clip_norm"] enable_centered_bias = params["enable_centered_bias"] num_ps_replicas = params["num_ps_replicas"] features = _get_feature_dict(features) parent_scope = "dnn" num_label_columns = 1 if n_classes == 2 else n_classes if n_classes == 2: loss_fn = loss_ops.sigmoid_cross_entropy else: loss_fn = loss_ops.sparse_softmax_cross_entropy input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( parent_scope + "/input_from_feature_columns", values=features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, weight_collections=[parent_scope], scope=scope) hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, values=[net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=scope) if dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dropout)) _add_hidden_layer_summary(net, scope.name) with variable_scope.variable_scope( parent_scope + "/logits", values=[net], partitioner=hidden_layer_partitioner) as scope: logits = layers.fully_connected( net, num_label_columns, activation_fn=None, variables_collections=[parent_scope], scope=scope) _add_hidden_layer_summary(logits, scope.name) if enable_centered_bias: logits = nn.bias_add(logits, 
_centered_bias(num_label_columns)) if mode == estimator.ModeKeys.TRAIN: targets = _reshape_targets(targets) weight = _get_weight_tensor(features, weight_column_name) training_loss = loss_fn(logits, targets, weight=weight) loss = _rescale_eval_loss(training_loss, weight) train_ops = [optimizers.optimize_loss( loss=training_loss, global_step=contrib_variables.get_global_step(), learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm, name=parent_scope, # Empty summaries to prevent optimizers from logging the training_loss. summaries=[])] if enable_centered_bias: train_ops.append(_centered_bias_step(targets, loss_fn, num_label_columns)) logging_ops.scalar_summary("loss", loss) return None, loss, control_flow_ops.group(*train_ops) elif mode == estimator.ModeKeys.EVAL: predictions = _predictions(logits=logits, n_classes=n_classes) targets = _reshape_targets(targets) weight = _get_weight_tensor(features, weight_column_name) training_loss = loss_fn(logits, targets, weight=weight) loss = _rescale_eval_loss(training_loss, weight) return predictions, loss, [] else: # mode == estimator.ModeKeys.INFER: predictions = _predictions(logits=logits, n_classes=n_classes) return predictions, None, []
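# Hedged sketch of the loss selection in the classifier model_fn above: binary heads use
# sigmoid cross entropy on a single logit column, multi-class heads use sparse softmax over
# n_classes columns. Shapes in the comments are illustrative.
import tensorflow as tf

def classifier_loss(logits, targets, n_classes):
    if n_classes == 2:
        # logits: [batch, 1], targets: [batch, 1] in {0, 1}
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.to_float(targets), logits=logits))
    # logits: [batch, n_classes], targets: [batch] in [0, n_classes)
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets, logits=logits))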
def _encoder(self, images, embedding, scope_name="encoder", reuse_variables=False): with tf.variable_scope(scope_name) as scope: if reuse_variables: scope.reuse_variables() # Encode image # 32 * 32 * 64 images = ly.dropout(images, keep_prob=0.9, is_training=self.is_training) node1 = tf_utils.cust_conv2d(images, 64, h_f=4, w_f=4, batch_norm=False, scope_name="node1") # 16 * 16 * 128 node1 = tf_utils.cust_conv2d(node1, 128, h_f=4, w_f=4, is_training=self.is_training, scope_name="node1_1") # 8 * 8 * 256 node1 = tf_utils.cust_conv2d(node1, 256, h_f=4, w_f=4, is_training=self.is_training, scope_name="node1_2") # 4 * 4 * 512 node1 = tf_utils.cust_conv2d(node1, 512, h_f=4, w_f=4, activation_fn=None, is_training=self.is_training, scope_name="node1_3") node1 = ly.dropout(node1, keep_prob=0.7, is_training=self.is_training) # 4 * 4 * 128 node2 = tf_utils.cust_conv2d(node1, 256, h_f=1, w_f=1, h_s=1, w_s=1, is_training=self.is_training, scope_name="node2_1") # 4 * 4 * 128 node2 = tf_utils.cust_conv2d(node2, 256, h_f=3, w_f=3, h_s=1, w_s=1, is_training=self.is_training, scope_name="node2_2") # 4 * 4 * 512 node2 = tf_utils.cust_conv2d(node2, 512, h_f=3, w_f=3, h_s=1, w_s=1, activation_fn=None, is_training=self.is_training, scope_name="node2_3") node2 = ly.dropout(node2, keep_prob=0.7, is_training=self.is_training) # 4 * 4 * 512 node = tf.add(node1, node2) node = tf_utils.leaky_rectify(node) # Encode embedding # 1 x 1 x nb_emb emb = tf.expand_dims(tf.expand_dims(embedding, 1), 1) # 4 x 4 x nb_emb emb = tf.tile(emb, [1, 4, 4, 1]) # 4 x 4 x 356 comb = tf.concat([node, emb], axis=3) # Compress embedding # 4 * 4 * 256 result = tf_utils.cust_conv2d(comb, 512, h_f=3, w_f=3, w_s=1, h_s=1, scope_name="node3") result = tf_utils.cust_conv2d(result, 256, h_f=3, w_f=3, w_s=1, h_s=1, scope_name="node4") if scope_name == "discriminator": result = tf_utils.cust_conv2d(result, 128, h_f=3, w_f=3, w_s=1, h_s=1, scope_name="node5") result = tf_utils.cust_conv2d(result, 64, h_f=3, w_f=3, w_s=2, h_s=2, scope_name="node6") # 1 x 1 x 16 result = tf_utils.cust_conv2d(result, 16, h_f=3, w_f=3, w_s=2, h_s=2, scope_name="node7") return result
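# Hedged sketch of the conditioning trick in the _encoder above: a 1-D text embedding is
# broadcast to every spatial location of a 4x4 feature map and concatenated along the
# channel axis. Channel sizes are illustrative.
import tensorflow as tf

feature_map = tf.placeholder(tf.float32, [None, 4, 4, 512])
embedding = tf.placeholder(tf.float32, [None, 128])

emb = tf.expand_dims(tf.expand_dims(embedding, 1), 1)   # [batch, 1, 1, 128]
emb = tf.tile(emb, [1, 4, 4, 1])                        # [batch, 4, 4, 128]
conditioned = tf.concat([feature_map, emb], axis=3)     # [batch, 4, 4, 640]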
def define_feedforward_model(self): layer_list = [] with self.graph.as_default() as g: is_training_batch = tf.placeholder(tf.bool, shape=(), name="is_training_batch") bn_params = { "is_training": is_training_batch, "decay": 0.99, "updates_collections": None } g.add_to_collection("is_training_batch", is_training_batch) with tf.name_scope("input"): input_layer = tf.placeholder(dtype=tf.float32, shape=(None, self.n_in), name="input_layer") if self.dropout_rate != 0.0: print "Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate is_training_drop = tf.placeholder(dtype=tf.bool, shape=(), name="is_training_drop") input_layer_drop = dropout(input_layer, self.dropout_rate, is_training=is_training_drop) layer_list.append(input_layer_drop) g.add_to_collection(name="is_training_drop", value=is_training_drop) else: layer_list.append(input_layer) g.add_to_collection("input_layer", layer_list[0]) for i in range(len(self.hidden_layer_size)): with tf.name_scope("hidden_layer_" + str(i + 1)): if self.dropout_rate != 0.0: last_layer = layer_list[-1] if self.hidden_layer_type[i] == "tanh": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.tanh,normalizer_fn=batch_norm,\ normalizer_params=bn_params) if self.hidden_layer_type[i] == "sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid,normalizer_fn=batch_norm,\ normalizer_params=bn_params) new_layer_drop = dropout(new_layer, self.dropout_rate, is_training=is_training_drop) layer_list.append(new_layer_drop) else: last_layer = layer_list[-1] if self.hidden_layer_type[i] == "tanh": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.tanh,normalizer_fn=batch_norm,\ normalizer_params=bn_params) if self.hidden_layer_type[i] == "sigmoid": new_layer=fully_connected(last_layer,self.hidden_layer_size[i],activation_fn=tf.nn.sigmoid,normalizer_fn=batch_norm,\ normalizer_params=bn_params) layer_list.append(new_layer) with tf.name_scope("output_layer"): if self.output_type == "linear": output_layer = fully_connected(layer_list[-1], self.n_out, activation_fn=None) if self.output_type == "tanh": output_layer = fully_connected(layer_list[-1], self.n_out, activation_fn=tf.nn.tanh) g.add_to_collection(name="output_layer", value=output_layer) with tf.name_scope("training_op"): if self.optimizer == "adam": self.training_op = tf.train.AdamOptimizer()
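# Hedged usage sketch (not part of the original class): assuming define_feedforward_model()
# above has already been built into the default graph with dropout_rate == 0.0, the
# registered collections can be looked up by name to run inference without keeping Python
# references to the tensors. n_in is an illustrative assumption.
import numpy as np
import tensorflow as tf

n_in = 425                                            # assumed input width
g = tf.get_default_graph()
input_layer = g.get_collection("input_layer")[0]      # raw placeholder in the no-dropout case
output_layer = g.get_collection("output_layer")[0]
is_training_batch = g.get_collection("is_training_batch")[0]

with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    prediction = sess.run(output_layer, feed_dict={
        input_layer: np.zeros((1, n_in), np.float32),
        is_training_batch: False,                     # batch norm uses moving statistics
    })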
def _dnn_model_fn(features, labels, mode, params, config=None): """Deep Neural Net model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `_Head` instance. * hidden_units: List of hidden units per layer. * feature_columns: An iterable containing all the feature columns used by the model. * optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training. If `None`, will use the Adagrad optimizer with a default learning rate of 0.05. * activation_fn: Activation function applied to each layer. If `None`, will use `tf.nn.relu`. * dropout: When not `None`, the probability we will drop out a given coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * embedding_lr_multipliers: Optional. A dictionary from `EmbeddingColumn` to a `float` multiplier. Multiplier will be used to multiply with learning rate for the embedding variables. * input_layer_min_slice_size: Optional. The min slice size of input layer partitions. If not provided, will use the default of 64M. config: `RunConfig` object to configure the runtime settings. Returns: predictions: A dict of `Tensor` objects. loss: A scalar containing the loss of the step. train_op: The op for training. """ head = params["head"] hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] optimizer = params.get("optimizer") or "Adagrad" activation_fn = params.get("activation_fn") dropout = params.get("dropout") gradient_clip_norm = params.get("gradient_clip_norm") input_layer_min_slice_size = ( params.get("input_layer_min_slice_size") or 64 << 20) num_ps_replicas = config.num_ps_replicas if config else 0 embedding_lr_multipliers = params.get("embedding_lr_multipliers", {}) features = _get_feature_dict(features) parent_scope = "dnn" partitioner = partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas) with variable_scope.variable_scope( parent_scope, values=tuple(six.itervalues(features)), partitioner=partitioner): input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=input_layer_min_slice_size)) with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as input_layer_scope: if all([ isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access for fc in feature_columns ]): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, weight_collections=[parent_scope], scope=input_layer_scope) else: net = fc_core.input_layer( features=features, feature_columns=feature_columns, weight_collections=[parent_scope]) for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(net,)) as hidden_layer_scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=activation_fn, variables_collections=[parent_scope], scope=hidden_layer_scope) if dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dropout)) _add_hidden_layer_summary(net, 
hidden_layer_scope.name) with variable_scope.variable_scope( "logits", values=(net,)) as logits_scope: logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[parent_scope], scope=logits_scope) _add_hidden_layer_summary(logits, logits_scope.name) def _train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=contrib_variables.get_global_step(), learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer), gradient_multipliers=( dnn_linear_combined._extract_embedding_lr_multipliers( # pylint: disable=protected-access embedding_lr_multipliers, parent_scope, input_layer_scope.name)), clip_gradients=gradient_clip_norm, name=parent_scope, # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) return head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_train_op_fn, logits=logits)
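# Hedged sketch of the optimize_loss call pattern used by _dnn_model_fn above: contrib's
# optimize_loss bundles optimizer selection, gradient clipping and the global-step
# increment into one op. The toy network and hyperparameter values are illustrative.
import tensorflow as tf
from tensorflow.contrib.layers import optimize_loss

x = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.int32, [None])
logits = tf.layers.dense(x, 3)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))

train_op = optimize_loss(
    loss=loss,
    global_step=tf.train.get_or_create_global_step(),
    learning_rate=0.05,
    optimizer="Adagrad",          # string name, Optimizer instance, or callable
    clip_gradients=5.0,           # clip gradients to global norm 5.0
    summaries=[])                 # suppress the default training_loss summary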
def _inference(self, x, dropout, is_training=True): with tf.variable_scope('pretrain_model', reuse=None) as training_scope: if self.freeze_opt == None: weights = {} weights = self.build_emb_weights(weights) weights = self.build_lstm_weights(weights) weights = self.build_fc_weights(self.n_hidden, weights) # embedding with tf.variable_scope("embedding"): xemb = self.embedding(x, weights["emb_W"], weights["emb_mask_W"]) # recurrent neural networks with tf.variable_scope("rnn"): lstm_cell = LSTMCell(self.n_hidden, weights["lstm_W_xh"], weights["lstm_W_hh"], weights["lstm_b"]) # lstm_cell = LSTMCell(self.n_hidden) xemb = tf.unstack(xemb, self.timesteps, 1) #c, h W_state_c = tf.random_normal( [self.batch_size, self.n_hidden], stddev=0.1) W_state_h = tf.random_normal( [self.batch_size, self.n_hidden], stddev=0.1) outputs, state = tf.nn.static_rnn( lstm_cell, xemb, initial_state=(W_state_c, W_state_h), dtype=tf.float32) _, hout = state with tf.variable_scope("dropout"): h_ = layers.dropout(hout, keep_prob=dropout) for i, dim in enumerate(self.dim_hidden[:-1]): h_ = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)]) h_ = tf.nn.dropout(h_, dropout) # Logits linear layer, i.e. softmax without normalization. N, Min = h_.get_shape() i = len(self.dim_hidden) - 1 logits = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)], relu=False) else: with tf.variable_scope("embedding"): Wemb = self.finetune_weights["emb_W"] Wemb_mask = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False) xemb = self.embedding(x, Wemb, Wemb_mask) # convolutional network with tf.variable_scope("rnn"): lstm_cell = LSTMCell(self.n_hidden, self.finetune_weights["lstm_W_xh"], self.finetune_weights["lstm_W_hh"], self.finetune_weights["lstm_b"]) xemb = tf.unstack(xemb, self.timesteps, 1) W_state_c = tf.random_normal( [self.batch_size, self.n_hidden], stddev=0.1) W_state_h = tf.random_normal( [self.batch_size, self.n_hidden], stddev=0.1) outputs, state = tf.nn.static_rnn( lstm_cell, xemb, initial_state=(W_state_c, W_state_h), dtype=tf.float32) _, hout = state with tf.variable_scope("dropout"): h_ = layers.dropout(hout, keep_prob=dropout) for i, dim in enumerate(self.dim_hidden[:-1]): Wfc = self.finetune_weights["fc_W" + str(i)] bfc = self.finetune_weights["fc_b" + str(i)] h_ = self.fc(h_, Wfc, bfc) h_ = tf.nn.dropout(h_, dropout) # finetune the last layer i = len(self.dim_hidden) - 1 weights = {} dim_in = self.n_hidden_2 weights["fc_W" + str(i)] = self.weight_variable( [int(dim_in), FLAGS.n_classes], name="fc_W" + str(i)) weights["fc_b" + str(i)] = self.bias_variable( [FLAGS.n_classes], name="fc_b" + str(i)) # Logits linear layer, i.e. softmax without normalization. N, Min = h_.get_shape() i = len(self.dim_hidden) - 1 logits = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)], relu=False) return logits
def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None): """Deep Neural Net and Linear combined model_fn. Args: features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`). labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype `int32` or `int64` in the range `[0, n_classes)`. mode: Defines whether this is training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters. The following hyperparameters are expected: * head: A `Head` instance. * linear_feature_columns: An iterable containing all the feature columns used by the Linear model. * linear_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the Linear model. Defaults to the Ftrl optimizer. * joint_linear_weights: If True a single (possibly partitioned) variable will be used to store the linear model weights. It's faster, but requires all columns are sparse and have the 'sum' combiner. * dnn_feature_columns: An iterable containing all the feature columns used by the DNN model. * dnn_optimizer: string, `Optimizer` object, or callable that defines the optimizer to use for training the DNN model. Defaults to the Adagrad optimizer. * dnn_hidden_units: List of hidden units per DNN layer. * dnn_activation_fn: Activation function applied to each DNN layer. If `None`, will use `tf.nn.relu`. * dnn_dropout: When not `None`, the probability we will drop out a given DNN coordinate. * gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. * embedding_lr_multipliers: Optional. A dictionary from `EmbeddingColumn` to a `float` multiplier. Multiplier will be used to multiply with learning rate for the embedding variables. * input_layer_partitioner: Optional. Partitioner for input layer. config: `RunConfig` object to configure the runtime settings. Returns: `ModelFnOps` Raises: ValueError: If both `linear_feature_columns` and `dnn_features_columns` are empty at the same time, or `input_layer_partitioner` is missing. """ head = params["head"] linear_feature_columns = params.get("linear_feature_columns") linear_optimizer = params.get("linear_optimizer") or "Ftrl" joint_linear_weights = params.get("joint_linear_weights") dnn_feature_columns = params.get("dnn_feature_columns") dnn_optimizer = params.get("dnn_optimizer") or "Adagrad" dnn_hidden_units = params.get("dnn_hidden_units") dnn_activation_fn = params.get("dnn_activation_fn") or nn.relu dnn_dropout = params.get("dnn_dropout") gradient_clip_norm = params.get("gradient_clip_norm") num_ps_replicas = config.num_ps_replicas if config else 0 input_layer_partitioner = params.get("input_layer_partitioner") or ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) embedding_lr_multipliers = params.get("embedding_lr_multipliers", {}) fix_global_step_increment_bug = params.get( "fix_global_step_increment_bug", True) if not linear_feature_columns and not dnn_feature_columns: raise ValueError( "Either linear_feature_columns or dnn_feature_columns must be defined.") features = _get_feature_dict(features) linear_optimizer = _get_optimizer(linear_optimizer) _check_no_sync_replicas_optimizer(linear_optimizer) dnn_optimizer = _get_optimizer(dnn_optimizer) _check_no_sync_replicas_optimizer(dnn_optimizer) # Build DNN Logits. 
dnn_parent_scope = "dnn" if not dnn_feature_columns: dnn_logits = None else: if not dnn_hidden_units: raise ValueError( "dnn_hidden_units must be defined when dnn_feature_columns is " "specified.") dnn_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) with variable_scope.variable_scope( dnn_parent_scope, values=tuple(six.itervalues(features)), partitioner=dnn_partitioner): with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as dnn_input_scope: if all( isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access for fc in dnn_feature_columns ): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=dnn_input_scope) else: net = fc_core.input_layer( features=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope]) for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(net,)) as dnn_hidden_layer_scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=dnn_hidden_layer_scope) if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout( net, keep_prob=(1.0 - dnn_dropout)) # TODO(b/31209633): Consider adding summary before dropout. _add_layer_summary(net, dnn_hidden_layer_scope.name) with variable_scope.variable_scope( "logits", values=(net,)) as dnn_logits_scope: dnn_logits = layers.fully_connected( net, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=dnn_logits_scope) _add_layer_summary(dnn_logits, dnn_logits_scope.name) # Build Linear logits. linear_parent_scope = "linear" if not linear_feature_columns: linear_logits = None else: linear_partitioner = partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20) with variable_scope.variable_scope( linear_parent_scope, values=tuple(six.itervalues(features)), partitioner=linear_partitioner) as scope: if all(isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access for fc in linear_feature_columns): if joint_linear_weights: linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) else: linear_logits, _, _ = layers.weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=linear_feature_columns, num_outputs=head.logits_dimension, weight_collections=[linear_parent_scope], scope=scope) else: linear_logits = fc_core.linear_model( features=features, feature_columns=linear_feature_columns, units=head.logits_dimension, weight_collections=[linear_parent_scope]) _add_layer_summary(linear_logits, scope.name) # Combine logits and build full model. 
if dnn_logits is not None and linear_logits is not None: logits = dnn_logits + linear_logits elif dnn_logits is not None: logits = dnn_logits else: logits = linear_logits def _make_training_op(training_loss): """Training op for the DNN linear combined model.""" train_ops = [] global_step = training_util.get_global_step() if dnn_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=global_step, learning_rate=_DNN_LEARNING_RATE, optimizer=dnn_optimizer, gradient_multipliers=_extract_embedding_lr_multipliers( # pylint: disable=protected-access embedding_lr_multipliers, dnn_parent_scope, dnn_input_scope.name), clip_gradients=gradient_clip_norm, variables=ops.get_collection(dnn_parent_scope), name=dnn_parent_scope, # Empty summaries, because head already logs "loss" summary. summaries=[], increment_global_step=not fix_global_step_increment_bug)) if linear_logits is not None: train_ops.append( optimizers.optimize_loss( loss=training_loss, global_step=global_step, learning_rate=_linear_learning_rate(len(linear_feature_columns)), optimizer=linear_optimizer, clip_gradients=gradient_clip_norm, variables=ops.get_collection(linear_parent_scope), name=linear_parent_scope, # Empty summaries, because head already logs "loss" summary. summaries=[], increment_global_step=not fix_global_step_increment_bug)) train_op = control_flow_ops.group(*train_ops) if fix_global_step_increment_bug: with ops.control_dependencies([train_op]): with ops.colocate_with(global_step): return state_ops.assign_add(global_step, 1).op return train_op return head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_make_training_op, logits=logits)
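# Hedged sketch of the fix_global_step_increment_bug handling above: when both towers
# produce a train op, the ops are grouped and the global step is incremented exactly once,
# under a control dependency, instead of once per tower. The assign_sub ops below are
# illustrative stand-ins for the two optimize_loss train ops.
import tensorflow as tf

loss = tf.get_variable("toy_loss", initializer=1.0)
global_step = tf.train.get_or_create_global_step()

dnn_update = tf.assign_sub(loss, 0.1)      # stand-in for the DNN tower's train op
linear_update = tf.assign_sub(loss, 0.05)  # stand-in for the linear tower's train op

train_op = tf.group(dnn_update, linear_update)
with tf.control_dependencies([train_op]):
    with tf.colocate_with(global_step):
        train_op = tf.assign_add(global_step, 1).op   # single global-step increment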
def _inference(self, x, dropout, is_training=True): with tf.variable_scope('pretrain_model', reuse=None) as training_scope: weights = {} if self.freeze_opt == None: weights = self.build_emb_weights(weights) weights = self.build_conv_weights(weights) weights = self.build_fc_weights( self.n_filters * len(self.filter_sizes), weights) with tf.variable_scope("embedding"): self.embedding(x, weights["emb_W"], weights["emb_mask_W"]) # convolutional network with tf.variable_scope("conv"): hout = self.conv(weights, is_training) with tf.variable_scope("dropout"): h_ = layers.dropout(hout, keep_prob=dropout) for i, dim in enumerate(self.dim_hidden[:-1]): h_ = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)]) h_ = tf.nn.dropout(h_, dropout) # Logits linear layer, i.e. softmax without normalization. N, Min = h_.get_shape() i = len(self.dim_hidden) - 1 logits = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)], relu=False) else: with tf.variable_scope("embedding"): Wemb = self.finetune_weights["emb_W"] Wemb_mask = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False) self.embedding(x, Wemb, Wemb_mask) # convolutional network with tf.variable_scope("conv"): # w = {} # for i, filter_size in enumerate(self.filter_sizes): # w["conv_W"+str(filter_size)] = self.finetune_weights["conv_W"+str(filter_size)] # w["conv_b"+str(filter_size)] = self.finetune_weights["conv_b"+str(filter_size)] hout = self.conv(self.finetune_weights, is_training) with tf.variable_scope("dropout"): h_ = layers.dropout(hout, keep_prob=dropout) for i, dim in enumerate(self.dim_hidden[:-1]): Wfc = self.finetune_weights["fc_W" + str(i)] bfc = self.finetune_weights["fc_b" + str(i)] h_ = self.fc(h_, Wfc, bfc) h_ = tf.nn.dropout(h_, dropout) # finetune the last layer i = len(self.dim_hidden) - 1 weights = {} dim_in = self.n_hidden_2 weights["fc_W" + str(i)] = self.weight_variable( [int(dim_in), FLAGS.n_classes], name="fc_W" + str(i)) weights["fc_b" + str(i)] = self.bias_variable( [FLAGS.n_classes], name="fc_b" + str(i)) # Logits linear layer, i.e. softmax without normalization. N, Min = h_.get_shape() i = len(self.dim_hidden) - 1 logits = self.fc(h_, weights["fc_W" + str(i)], weights["fc_b" + str(i)], relu=False) return logits
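# Hedged sketch related to the freeze/fine-tune branch above (a common alternative, not the
# snippet's own mechanism): fine-tuning only the final layer can also be done by restricting
# the optimizer's var_list to that layer's variables, so the pretrained weights stay fixed.
# All names and sizes are illustrative.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 128])
y = tf.placeholder(tf.int32, [None])

with tf.variable_scope("pretrained"):
    h = tf.layers.dense(x, 64, activation=tf.nn.relu)   # frozen feature extractor
with tf.variable_scope("head"):
    logits = tf.layers.dense(h, 5)                       # only this layer is trained

loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))

head_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="head")
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, var_list=head_vars)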