def _build_q_net(self, state, action, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] state = tf.concat( [state, tf.expand_dims(action, axis=1, name="2d-action")], axis=1) fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1') fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2') q = tf.layers.dense(fc2, units=self.action_dim, name='q') return q[:, 0]
def loss_layer(logits, labels, num_labels, lengths, input_mask): FLAGS = tf.flags.FLAGS trans = tf.get_variable("transitions", shape=[num_labels, num_labels], initializer=initializers.xavier_initializer()) if FLAGS.use_crf: with tf.variable_scope("crf-loss"): log_likelihood, trans = tf.contrib.crf.crf_log_likelihood( inputs=logits, tag_indices=labels, transition_params=trans, sequence_lengths=lengths) per_example_loss = -log_likelihood loss = tf.reduce_mean(per_example_loss) return loss, per_example_loss, trans else: labels_one_hot = tf.one_hot(labels, num_labels) cross_entropy = labels_one_hot * tf.log(tf.nn.softmax(logits)) cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) cross_entropy *= tf.to_float(input_mask) cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1) cross_entropy /= tf.cast(lengths, tf.float32) per_example_loss = cross_entropy loss = tf.reduce_mean(per_example_loss) return loss, per_example_loss, trans
def __init__(self, config): self.config = config self.task_name = config["task_name"] self.lstm_dim = config["lstm_dim"] self.embedding_size = config["embedding_size"] self.max_epoch = config["max_epoch"] ######原为10 epoch self.learning_rate = config["learning_rate"] self.checkpoint_dir = config["checkpoint_dir"] self.checkpoint_path = config["checkpoint_path"] self.initializer = initializers.xavier_initializer() self.is_training = True if ARGS.entry=="train" else False self.bert_config = config["bert_config"] self.init_checkpoint = config["init_checkpoint"] self.vocab_dir = config["vocab_dir"] self.tf_serving_save_dir = config["tf_serving_save_dir"] self.predict_file = config["predict_file"] self.predict_result = config["predict_result"] self.require_improvement = config["require_improvement"] self.global_steps = tf.Variable(0, trainable=False) self.best_dev_f1 = tf.Variable(0.0, trainable=False) self.best_f1 = 0.0 self.best_match_num = 0 self.steps = 0 # 迭代次数 self.last_improved = 0 # 记录上一次提升批次 self.tokenizer = tokenization.FullTokenizer( vocab_file=self.vocab_dir, )
def dense(cls, input_layer, shape, dtype=tf.float32, activation=tf.nn.relu, name="dense", detailed_summary=False): with tf.variable_scope(name): w = tf.get_variable("w", shape=shape, dtype=dtype, initializer=initializers.xavier_initializer()) b = tf.get_variable("b", shape=shape[1], dtype=dtype, initializer=tf.zeros_initializer()) out = tf.nn.bias_add(tf.matmul(input_layer, w), b) if detailed_summary: with tf.name_scope('w'): cls.variable_summaries(w) with tf.name_scope('b'): cls.variable_summaries(b) with tf.name_scope('output'): cls.variable_summaries(out) if activation is not None: return activation(out) else: return out
def _build_action_net(self, state, variable_scope): with tf.variable_scope(variable_scope): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features // 2, activation=tf.nn.relu, name='fc1') actions = tf.layers.dense(fc1, self.action_dim, activation=tf.nn.sigmoid, name='a') scaled_a = tf.multiply(actions, 1, name='scaled_a') return scaled_a[:, 0]
def __init__(self, embeddings, lstm_dim_=100, num_tags_=4, lr_=0.001): self.lstm_dim = lstm_dim_ self.num_tags = num_tags_ self.lr = lr_ self.initializer = initializers.xavier_initializer() self.dropout = tf.placeholder(dtype=tf.float32, name='dropout') self.max_steps = tf.placeholder(dtype=tf.int32, shape=[ None, ], name='seq_length') self.x_input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='x_input') self.y_target = tf.placeholder(dtype=tf.int32, shape=[None, None], name='y_target') self.num_steps = tf.shape(self.x_input)[-1] with tf.variable_scope("char_embedding"): self.embeddings = tf.get_variable(name='embeddings', initializer=embeddings) self.logits = self.project_layer_single(self.bigru_layer()) with tf.variable_scope("crf_loss"): self.trans = tf.get_variable("transitions", shape=[self.num_tags, self.num_tags], initializer=self.initializer) self.loss = self.loss_layer(self.logits) self.train_step = tf.train.AdamOptimizer(self.lr).minimize( self.loss)
def linear(input_, output_size, weights_initializer=initializers.xavier_initializer(), biases_initializer=tf.zeros_initializer, activation_fn=None, trainable=True, name='linear'): """ Constructs a fully connected layer. """ # Get shape of input. shape = input_.get_shape().as_list() if len(shape) > 2: # Flatten. input_ = tf.reshape(input_, [-1, reduce(lambda x, y: x * y, shape[1:])]) shape = input_.get_shape().as_list() with tf.variable_scope(name): # Weights, bias, output. w = tf.get_variable('w', [shape[1], output_size], tf.float32, initializer=weights_initializer, trainable=trainable) b = tf.get_variable('b', [output_size], initializer=biases_initializer, trainable=trainable) out = tf.nn.bias_add(tf.matmul(input_, w), b) if activation_fn != None: # Apply activation function. out = activation_fn(out) return out, w, b
def fractal_conv2d(inputs, num_columns, num_outputs, kernel_size, joined=True, stride=1, padding='SAME', # rate=1, activation_fn=nn.relu, normalizer_fn=slim.batch_norm, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=None, biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, is_training=True, trainable=True, scope=None): """Builds a fractal block with slim.conv2d. The fractal will have `num_columns` columns, and have Args: inputs: a 4-D tensor `[batch_size, height, width, channels]`. num_columns: integer, the columns in the fractal. """ locs = locals() fractal_args = ['inputs','num_columns','joined','is_training'] asc_fn = lambda : slim.arg_scope([slim.conv2d], **{arg:val for (arg,val) in locs.items() if arg not in fractal_args}) return fractal_template(inputs, num_columns, slim.conv2d, asc_fn, joined, is_training, reuse, scope)
def arcface_softmax(feature, targets, num_outputs, s=32, m=0.5, name="arcface_softmax"): feature_shape = feature.get_shape() kernel = tf.get_variable("arcface_softmax/W", [feature_shape[1], num_outputs], dtype=tf.float32, initializer=initializers.xavier_initializer()) feature_norm = tf.nn.l2_normalize(feature, dim=1) kernel_norm = tf.nn.l2_normalize(kernel, dim=0) cos_theta = tf.matmul(feature_norm, kernel_norm) theta = tf.acos(cos_theta) phi_theta = tf.cos(theta + m) # cos_m = math.cos(m) # sin_m = math.sin(m) # cos_theta2 = tf.square(cos_theta) # sin_theta2 = tf.subtract(1.0, cos_theta2) # sin_theta = tf.sqrt(sin_theta2) # phi_theta = tf.subtract(tf.multiply(cos_theta, cos_m), tf.multiply(sin_theta, sin_m)) logits = s * cos_theta logits_ = s * phi_theta adjust_logits = tf.where(tf.equal(targets, 1.0), logits_, logits) return adjust_logits
def xavier(uniform=True, seed=None, dtype=tf.float32): """ Xavier. Returns an initializer performing "Xavier" initialization for weights. This initializer is designed to keep the scale of the gradients roughly the same in all layers. In uniform distribution this ends up being the range: `x = sqrt(6. / (in + out)); [-x, x]` and for normal distribution a standard deviation of `sqrt(3. / (in + out))` is used. Arguments: uniform: Whether to use uniform or normal distributed random initialization. seed: A Python integer. Used to create random seeds. See `set_random_seed` for behavior. dtype: The data type. Only floating point types are supported. Returns: An initializer for a weight matrix. References: Understanding the difficulty of training deep feedforward neural networks. International conference on artificial intelligence and statistics. Xavier Glorot and Yoshua Bengio (2010). Links: [http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf] (http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf) """ return xavier_initializer(uniform=uniform, seed=seed, dtype=dtype)
def linear(inputs, output_size, weights_initializer=initializers.xavier_initializer(), biases_initializer=tf.zeros_initializer, synthetic=False, activation_fn=None, batch_norm=True, name='linear'): var = {} shape = inputs.get_shape().as_list() with tf.variable_scope(name): var['w'] = tf.get_variable('w', [shape[1], output_size], tf.float32, initializer=weights_initializer) var['b'] = tf.get_variable('b', [output_size], initializer=biases_initializer) out = tf.nn.bias_add(tf.matmul(inputs, var['w']), var['b']) if batch_norm: out = tf.contrib.layers.batch_norm(out) if activation_fn is not None: out = activation_fn(out) if synthetic: with tf.variable_scope('synthetic_grad'): out_shape = out.get_shape() h1, var['l1_w'], var['l1_b'] = linear(out, 4000, weights_initializer=tf.zeros_initializer, biases_initializer=tf.zeros_initializer, activation_fn=tf.nn.relu, batch_norm=True, name='l1') synthetic_grad, var['l2_w'], var['l2_b'] = linear(h1, out_shape[1], weights_initializer=tf.zeros_initializer, biases_initializer=tf.zeros_initializer, activation_fn=tf.nn.relu, batch_norm=True, name='l2') return out, var['w'], var['b'], synthetic_grad else: return out, var['w'], var['b']
def res_block(self, input, name): with tf.variable_scope(name): with slim.arg_scope( [slim.conv2d], activation_fn=tf.nn.relu, # normalizer_fn=self.normalizer, # normalizer_params=self.norm_params1, weights_initializer=initializers.xavier_initializer( uniform=True), weights_regularizer=slim.l1_regularizer(1e-4)): # print('......................................') split1 = input split1_1 = input conv3_1 = slim.conv2d(split1, 48, [3, 3], 1, scope='conv_3_1') conv3_2 = slim.conv2d(conv3_1, 48, [3, 3], 1, scope='conv3_2') slice1_1, slice1_2 = tf.split(conv3_2, [16, 32], axis=3) conv3_3 = slim.conv2d(slice1_2, 48, [3, 3], scope='conv3_3') conv3_4 = slim.conv2d(conv3_3, 64, [3, 3], scope='conv3_4') slice2_1, slice2_2 = tf.split(conv3_4, [16, 48], axis=3) conv3_5 = slim.conv2d(slice2_2, 48, [3, 3], 1, scope='conv3_5') conv3_6 = slim.conv2d(conv3_5, 96, [3, 3], 1, scope='conv3_6') concat1 = tf.concat([split1_1, slice1_1, slice2_1], axis=3) sum1 = concat1 + conv3_6 down1 = slim.conv2d(sum1, 64, [1, 1], 1, scope='down1') return down1
def pre_convolution(self, image1, image2, image3, name): with tf.variable_scope(name): with slim.arg_scope( [slim.conv2d], activation_fn=tf.nn.relu, # normalizer_fn=self.normalizer, # normalizer_params=self.norm_params1, weights_initializer=initializers.xavier_initializer( uniform=True), weights_regularizer=slim.l1_regularizer(1e-4)): image1 = slim.conv2d(image1, 64, [3, 3], 1, scope='conv1_1') image1 = slim.conv2d(image1, 64, [3, 3], 1, scope='conv1_2') image2 = slim.conv2d(image2, 64, [3, 3], 1, scope='conv2_1') image2 = slim.conv2d(image2, 64, [3, 3], 1, scope='conv2_2') image3 = slim.conv2d(image3, 64, [3, 3], 1, scope='conv3_1') image3 = slim.conv2d(image3, 64, [3, 3], 1, scope='conv3_2') image_1_2 = tf.concat([image1, image2], axis=3) image_1_2 = slim.conv2d(image_1_2, 64, [3, 3], 1, scope='conv_1_2') image_2_3 = tf.concat([image2, image3], axis=3) image_2_3 = slim.conv2d(image_2_3, 64, [3, 3], 1, scope='conv_2_3') image_1_2_3 = tf.concat([image_1_2, image_2_3], axis=3) image_1_2_3 = slim.conv2d(image_1_2_3, 64, [3, 3], 1, scope='conv_1_2_3') return image_1_2_3
def __init__(self, bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, max_seq_length, use_one_hot_embeddings): # load bert bert = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # 获取bert的输出 output_layer = bert.get_sequence_output() # self.all_encoder_layers = bert.get_all_encoder_layers() if is_training: output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) hidden_size = output_layer.shape[-1].value output_layer = tf.reshape(output_layer, [-1, hidden_size]) tf.logging.info(" The dimension of bert output:%s" % output_layer.shape) # 全连接层 output_weight = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) logits = tf.matmul(output_layer, output_weight, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) self.logits = tf.reshape(logits, [-1, max_seq_length, num_labels]) # 使用全连接层的输出计算MNLP分数 self.probs = tf.nn.softmax(self.logits, axis=-1) self.best_probs = tf.reduce_max(self.probs, axis=-1) self.mnlp_score = tf.reduce_mean(tf.log(self.best_probs), axis=-1) # 计算输入样本的长度 used = tf.sign(tf.abs(input_ids)) lengths = tf.reduce_sum(used, reduction_indices=1) # crf层 with tf.variable_scope("crf"): trans = tf.get_variable( "transitions", shape=[num_labels, num_labels], initializer=initializers.xavier_initializer()) if labels is None: self.loss = None else: log_likelihood, trans = tf.contrib.crf.crf_log_likelihood( inputs=self.logits, tag_indices=labels, transition_params=trans, sequence_lengths=lengths) self.loss = tf.reduce_mean(-log_likelihood) self.predicts, self.score = crf.crf_decode(potentials=self.logits, transition_params=trans, sequence_length=lengths)
def _create_lstm_policy(self, args): # Create LSTM portion of network lstms_pol = [ rnn.LSTMCell(args.policy_size, state_is_tuple=True, initializer=initializers.xavier_initializer()) for _ in range(args.num_policy_layers) ] self.policy_lstm = rnn.MultiRNNCell(lstms_pol, state_is_tuple=True) self.policy_state = self.policy_lstm.zero_state( args.batch_size * args.sample_size, tf.float32) # Get samples from standard normal distribution, transform to match z-distribution samples = tf.random_normal( [args.sample_size, args.batch_size, args.z_dim], name="z_samples") z_samples = samples * tf.exp(self.z_logstd) + self.z_mean self.z_samples = tf.transpose(z_samples, [1, 0, 2]) # Construct policy input policy_input = tf.reshape(tf.concat([self.states, self.z_samples], 2), [-1, 1, args.z_dim + args.state_dim]) # Forward pass helper = seq2seq.TrainingHelper(policy_input, sequence_length=[1] * args.batch_size * args.sample_size) decoder = seq2seq.BasicDecoder(cell=self.policy_lstm, helper=helper, initial_state=self.policy_state) output, self.final_policy_state, _ = seq2seq.dynamic_decode( decoder, scope='policy_cell') #output = tf.squeeze(tf.gather(output[0], output[1])) output = output[0][:, -1, :] # Fully connected layer to latent variable distribution parameters W = tf.get_variable("lstm_w", [args.policy_size, args.action_dim], initializer=initializers.xavier_initializer()) b = tf.get_variable("lstm_b", [args.action_dim]) a_mean = tf.nn.xw_plus_b(output, W, b) self.a_mean = tf.reshape( a_mean, [args.batch_size, args.sample_size, args.action_dim], name="a_mean") # Initialize logstd self.a_logstd = tf.Variable(np.zeros(args.action_dim), name="a_logstd", dtype=tf.float32)
def fully_connected(in_c, out_c, name): with tf.variable_scope(name): w = tf.get_variable(name='w', shape=[in_c, out_c], initializer=initializers.xavier_initializer()) b = tf.get_variable(name='b', shape=[out_c], initializer=initializers.xavier_initializer()) def wx_b(x, actication=True): x = tf.matmul(x, w) + b if actication == True: return tf.nn.relu((x)) else: return x return wx_b
def forward(inputs, num_outputs, input_dim=None, hiddens=[200], activation_fn=tf.nn.relu, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, scope=None): """ similary as melt.slim.layers.mlp but the first step(from input to first hidden adjusted so input can be sparse) """ assert len(hiddens) >= 1, "must at least contain one hidden layer" scope = 'mlp' if scope is None else scope with tf.variable_scope(scope): outputs = melt.layers.fully_connected( inputs, num_outputs, input_dim=input_dim, activation_fn=activation_fn, weights_initializer=weights_initializer, weights_regularizer=weights_regularizer, biases_initializer=biases_initializer, biases_regularizer=biases_regularizer, reuse=reuse, scope='fc_0') #--------other hidden layers # for i in xrange(len(hiddens) -1): # outputs = slim.fully_connected(outputs, hiddens[i + 1], # activation_fn=activation_fn, # weights_initializer=weights_initializer, # weights_regularizer=weights_regularizer, # biases_initializer=biases_initializer, # biases_regularizer=biases_regularizer, # scope='fc_%d'%i+1) slim.stack(outputs, slim.fully_connected, hiddens[1:], activation_fn=activation_fn, weights_initializer=weights_initializer, weights_regularizer=weights_regularizer, biases_initializer=biases_initializer, biases_regularizer=biases_regularizer, scope='fc') return slim.linear(outputs, num_outputs, weights_initializer=weights_initializer, weights_regularizer=weights_regularizer, biases_initializer=biases_initializer, biases_regularizer=biases_regularizer, scope='linear')
def version_1(inputs, is_training): with tf.name_scope('version_1'): n_filter = 20 n_hidden = 100 n_out = 10 # Reshaping for convolutional operation x = tf.reshape(inputs, [-1, 28, 28, 1]) # Convolutional layer net = slim.conv2d(x, n_filter, [5, 5], padding='VALID', activation_fn=tf.nn.sigmoid, scope='conv1') # Pooling layer net = slim.max_pool2d(net, [2, 2], stride=2, padding='VALID', scope='pool1') # Flatten for fully-connected layer net = slim.flatten(net, scope='flatten3') # 100 sigmoid neurons net = slim.fully_connected( net, n_hidden, scope='fc1', activation_fn=tf.nn.sigmoid, weights_initializer=initializers.xavier_initializer(), biases_initializer=init_ops.zeros_initializer()) # 10 neurons (softmax) logits = slim.fully_connected( net, n_out, activation_fn=None, scope='fco', weights_initializer=initializers.xavier_initializer(), biases_initializer=init_ops.zeros_initializer()) out_layer = tf.nn.softmax(logits) return out_layer, logits
def multi_input_dense_layer( inputs, units, activation=corrnet.activations.leaky_relu, use_bias=True, kernel_initializer=initializers.xavier_initializer(), bias_initializer=init_ops.zeros_initializer(), kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, trainable=True, name=None, reuse=None): """Functional interface for the multi input densely-connected layer. This layer implements the operation: `outputs = activation(inputs1.kernel1 + inputs2.kernel2 + bias)` Where `activation` is the activation function passed as the `activation` argument (if not `None`), `kernel` is a weights matrix created by the layer, and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). Note: if the `inputs` tensor has a rank greater than 2, then it is flattened prior to the initial matrix multiply by `kernel`. Arguments: inputs: a list of Tensor inputs. units: Integer or Long, dimensionality of the output space. activation: Activation function (callable). Set it to None to maintain a linear activation. use_bias: Boolean, whether the layer uses a bias. kernel_initializer: Initializer function for the weight matrix. If `None` (default), weights are initialized using the default initializer used by `tf.get_variable`. bias_initializer: Initializer function for the bias. kernel_regularizer: Regularizer function for the weight matrix. bias_regularizer: Regularizer function for the bias. activity_regularizer: Regularizer function for the output. trainable: Boolean, if `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). name: String, the name of the layer. reuse: Boolean, whether to reuse the weights of a previous layer by the same name. Returns: The created layer. Use .apply method to pass inputs """ layer = MultiInputDense(units, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer, trainable=trainable, name=name, dtype=inputs[0].dtype.base_dtype, _scope=name, _reuse=reuse) return layer # .apply(inputs)
def trans_conv2d(inputs, num_outputs, kernel_size, output_shape, stride=1, padding='SAME', activation_fn=nn_ops.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, scope=None): """ trans_convolution with specified output_shape """ if type(stride) in (int, float): stride = (stride, stride) if type(kernel_size) is int: kernel_size = (kernel_size, kernel_size) with tf.variable_scope(scope, 'trans_conv2d', reuse=reuse) as sc: indim = tensor_shape(inputs)[-1] filters = get_variable(name='weights', shape=kernel_size + (num_outputs, indim), init=weights_initializer, reg=weights_regularizer, collections=variables_collections) if biases_initializer is not None: biases = get_variable(name='biases', shape=(num_outputs), init=biases_initializer, reg=biases_regularizer, collections=variables_collections) outputs = tf.nn.conv2d_transpose(inputs, filters, output_shape, strides=(1) + stride + (1), padding=padding, name=scope) if biases_initializer is not None: outputs = outputs + biases if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def _create_lstm_policy(self, args): raise NotImplementedError # Create LSTM portion of network lstm = rnn_cell.LSTMCell(args.policy_size, state_is_tuple=True, initializer=initializers.xavier_initializer()) self.policy_lstm = rnn_cell.MultiRNNCell([lstm] * args.num_policy_layers, state_is_tuple=True) self.policy_state = self.policy_lstm.zero_state( args.batch_size * args.sample_size, tf.float32) # Get samples from standard normal distribution, transform to match z-distribution samples = tf.random_normal( [args.sample_size, args.batch_size, args.z_dim], name="z_samples") self.z_samples = samples * tf.exp(self.z_logstd) + self.z_mean self.z_samples = tf.transpose(self.z_samples, perm=[1, 0, 2]) # Construct policy input policy_input = tf.concat(2, [self.states, self.z_samples]) policy_input = tf.reshape( policy_input, [args.batch_size * args.sample_size, args.state_dim + args.z_dim], name="policy_input") # Forward pass with tf.variable_scope("policy"): output, self.final_policy_state = seq2seq.rnn_decoder( [policy_input], self.policy_state, self.policy_lstm) output = tf.reshape(tf.concat(1, output), [-1, args.policy_size]) # Fully connected layer to latent variable distribution parameters W = tf.get_variable("lstm_w", [args.policy_size, args.action_dim], initializer=initializers.xavier_initializer()) b = tf.get_variable("lstm_b", [args.action_dim]) a_mean = tf.nn.xw_plus_b(output, W, b) self.a_mean = tf.reshape( a_mean, [args.batch_size, args.sample_size, args.action_dim], name="a_mean") # Initialize logstd self.a_logstd = tf.Variable(np.zeros(args.action_dim), name="a_logstd", dtype=tf.float32)
def __init__(self, words_id, segs_id, labels, lengths, config, is_train=True): self.config = config self.is_train = is_train self.lr = config.lr self.char_dim = config.char_dim self.lstm_dim = config.lstm_dim self.seg_dim = config.seg_dim self.num_tags = config.num_tags self.num_chars = config.num_chars self.num_segs = 4 self.initializer = initializers.xavier_initializer() self.global_step = tf.Variable(0, trainable=False) self.char_inputs = words_id self.seg_inputs = segs_id self.targets = labels self.dropout = config.dropout self.lengths = tf.cast(lengths, tf.int32) self.batch_size = tf.shape(self.char_inputs)[0] self.num_steps = config.max_seq_length self.model_type = config.model_type self.layers = [{'dilation': 1}, {'dilation': 1}, {'dilation': 2}] self.filter_width = 3 self.num_filter = self.lstm_dim self.embedding_dim = self.char_dim + self.seg_dim self.repeat_times = 4 self.cnn_output_width = 0 embedding = self.embedding_layer(self.char_inputs, self.seg_inputs, config) if self.model_type == "bilstm": if self.is_train: model_inputs = tf.nn.dropout(embedding, self.dropout) model_outputs = self.biLSTM_layer(model_inputs, self.lstm_dim, self.lengths) self.logits = self.project_layer_bilstm(model_outputs) elif self.model_type == "idcnn": if self.is_train: model_inputs = tf.nn.dropout(embedding, self.dropout) model_outputs = self.IDCNN_layer(model_inputs) self.logits = self.project_layer_idcnn(model_outputs) else: raise KeyError
def _encode(self): """ Employs two Bi-LSTMs to encode passage and question separately """ init = None if self.para_init: init_w = tf.constant_initializer(self.init1) init_b = tf.constant_initializer(self.init1) else: init_w = initializers.xavier_initializer() init_b = tf.zeros_initializer() if self.simple_net in [0, 1, 4]: with tf.variable_scope('passage_encoding'): self.sep_p_encodes = tc.layers.fully_connected( self.p_emb, num_outputs=2 * self.hidden_size, activation_fn=tf.nn.tanh, weights_initializer=init_w, biases_initializer=init_b) with tf.variable_scope('question_encoding'): self.sep_q_encodes = tc.layers.fully_connected( self.q_emb, num_outputs=2 * self.hidden_size, activation_fn=tf.nn.tanh, weights_initializer=init_w, biases_initializer=init_b) if self.simple_net in [2, 3, 5, 7, 8]: with tf.variable_scope('passage_encoding'): self.sep_p_encodes, self.seq_p_states, self.p_r = rnn( 'bi-lstm', self.p_emb, self.p_length, self.hidden_size, self.init1, batch_size=self.batch_size, debug=self.para_init) with tf.variable_scope('question_encoding'): self.sep_q_encodes, self.seq_q_states, _ = rnn( 'bi-lstm', self.q_emb, self.q_length, self.hidden_size, self.init1, batch_size=self.batch_size, debug=self.para_init) if self.use_dropout: self.sep_p_encodes = tf.nn.dropout(self.sep_p_encodes, self.dropout_keep_prob) self.sep_q_encodes = tf.nn.dropout(self.sep_q_encodes, self.dropout_keep_prob) #self.sep_p_encodes *= tf.expand_dims(self.passage_mask, -1) #self.sep_q_encodes *= tf.expand_dims(self.question_mask, -1) variable_summaries(self.sep_p_encodes) variable_summaries(self.sep_q_encodes)
def __init__(self, config): # super parameters self.config = config self.lr = config['lr'] self.char_dim = config['char_dim'] self.seg_dim = config['seg_dim'] self.lstm_dim = config['lstm_dim'] self.num_tags = config['num_tags'] self.num_chars = config['num_chars'] self.num_segs = 4 self.global_step = tf.Variable(0, trainable=False) self.best_dev_f1 =tf.Variable(0.0, trainable=False) self.best_test_f1 = tf.Variable(0.0, trainable=False) self.initializer = initializers.xavier_initializer() # placeholder self.char_input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='CharInputs') # batch_size * LenSentence self.seg_input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='SgeInputs') self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='Targets') self.dropout = tf.placeholder(dtype=tf.float32, name='Dropout') # lengths, batch_size, num_steps used = tf.sign(tf.abs(self.char_input)) length = tf.reduce_sum(used, reduction_indices=1) self.lengths = tf.cast(length, tf.int32) self.batch_size = tf.shape(self.char_input[0]) self.num_steps = tf.shape(self.char_input[1]) # Net Structure embedding_output = self.embedding_layer(char_inputs=self.char_input, seg_inputs=self.seg_input, config=config) embedding_dropout = tf.nn.dropout(embedding_output, self.dropout) lstm_output = self.biLSTM_layer(lstm_inputs=embedding_dropout, lstm_dim=self.lstm_dim, lengths=self.lengths) self.logits = self.project_layer(lstm_outputs=lstm_output) self.loss = self.loss_layer(project_logits=self.logits, lengths=self.lengths) # optimizer opt_kind = self.config['optimizer'] if opt_kind == 'sgd': self.opt = tf.train.GradientDescentOptimizer(self.lr) elif opt_kind == 'adam': self.opt = tf.train.AdamOptimizer(self.lr) elif opt_kind == 'adgrad': self.opt = tf.train.AdagradOptimizer(self.lr) else: raise KeyError # apply grad clip to avoid gradient explosion grads_vars = self.opt.compute_gradients(self.loss) capped_grads_vars = [[tf.clip_by_value(g, -self.config['clip'], self.config['clip']), v] for g, v in grads_vars] self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step) # model saver self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
def __init__(self, config, embeddings): self.config = config self.lstm_dim = config["lstm_dim"] self.num_chars = config["num_chars"] self.num_tags = config["num_tags"] self.char_dim = config["char_dim"] self.lr = config["lr"] zero_pad = tf.constant(0.0, dtype=tf.float32, shape=[1, config["char_dim"]]) self.char_embeding = tf.concat(axis=0, values=[zero_pad, tf.get_variable(name="char_embeding", initializer=embeddings)]) self.global_step = tf.Variable(0, trainable=False) self.initializer = initializers.xavier_initializer() self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs") self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets") self.dropout = tf.placeholder(dtype=tf.float32, name="dropout") self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths") # self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob") # self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob") self.input_dropout_keep_prob = tf.placeholder_with_default(config["input_dropout_keep"], [], name="input_dropout_keep_prob") self.batch_size = tf.shape(self.char_inputs)[0] self.num_steps = tf.shape(self.char_inputs)[-1] # forward embedding = self.embedding_layer(self.char_inputs) lstm_inputs = tf.nn.dropout(embedding, self.input_dropout_keep_prob) ## bi-directional lstm layer lstm_outputs = self.bilstm_layer(lstm_inputs) ## logits for tags self.project_layer(lstm_outputs) ## loss of the model self.loss = self.loss_layer(self.logits, self.lengths) with tf.variable_scope("optimizer"): optimizer = self.config["optimizer"] if optimizer == "sgd": self.opt = tf.train.GradientDescentOptimizer(self.lr) elif optimizer == "adam": self.opt = tf.train.AdamOptimizer(self.lr) elif optimizer == "adgrad": self.opt = tf.train.AdagradOptimizer(self.lr) else: raise KeyError grads_vars = self.opt.compute_gradients(self.loss) capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars] self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
def embedding_to_logit(self, embedding, is_training=True, seed=0): """Create a graph, transforming embedding vectors to logit classs scores.""" from tensorflow.contrib.layers.python.layers import initializers with tf.variable_scope('net', reuse=is_training): return slim.fully_connected( embedding, self.num_labels, activation_fn=None, weights_regularizer=slim.l2_regularizer(1e-4), weights_initializer=initializers.xavier_initializer(seed=seed))
def lstm_model(self): with tf.variable_scope(name_or_scope='lstm_model', reuse=tf.AUTO_REUSE): # 输入 # 特征数量为25,batch_size,seq_length并不固定 x = tf.placeholder(shape=(None, None, 25), name='input', dtype=tf.float64) print(x.name) # 占位符,说明是否正在训练(影响batch_normalization层) is_training = tf.placeholder(name='is_training', dtype=tf.bool) # lstm构造 lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.num_units, initializer=tf.orthogonal_initializer(), num_proj=48, reuse=tf.AUTO_REUSE, name='lstm_cell', activation='sigmoid') #lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.num_units, initializer=tf.random_normal_initializer(), #num_proj = 48, reuse = tf.AUTO_REUSE, #name = 'lstm_cell', activation = 'sigmoid') c = (tf.nn.dynamic_rnn(cell=lstm_cell, inputs=x, dtype=tf.float64))[0][:, -1, :] with tf.variable_scope(name_or_scope='fully_connection', reuse=tf.AUTO_REUSE): # 全连接层 b_1 = tf.get_variable(name='bias_1', dtype=tf.float64, initializer=tf.constant(value=np.zeros((16,)), dtype=tf.float64)) w_1 = tf.get_variable(name='weight_1', shape=(48, 16), dtype=tf.float64, initializer=initializers.xavier_initializer()) b_2 = tf.get_variable(name='bias_2', dtype=tf.float64, initializer=tf.constant(value=np.zeros((2,)), dtype=tf.float64)) w_2 = tf.get_variable(name='weight_2', shape=(16, 2), dtype=tf.float64, initializer=initializers.xavier_initializer()) #w = tf.get_variable(name='weight', shape=(32, 2), dtype=tf.float64, #initializer=tf.random_normal_initializer()) c2 = tf.matmul(c, w_1) + b_1 # batch normalization层 batch_norm = tf.layers.batch_normalization(inputs=tf.matmul(c2, w_2) + b_2, training=is_training, name='batch_norm', epsilon=0) # 激活函数加softmax输出概率 [P涨,P跌] # tanh_output = tf.tanh(batch_norm, 'tanh') # output = tf.nn.softmax(tanh_output) leaky_relu_output = tf.nn.leaky_relu(features=batch_norm, alpha=0.2, name='relu') output = tf.nn.softmax(logits=leaky_relu_output, name='output') # 返回模型输出值,和占位符 return output, x, is_training
def __init__(self, config): # 从参数列表中获取模型参数 self.config = config self.lr = config["lr"] self.char_dim = config["char_dim"] # 字符的词向量维度 self.lstm_dim = config["lstm_dim"] # lstm隐层神经元数量 self.seg_dim = config["seg_dim"] # 字符的分割特征维度 self.num_tags = config["num_tags"] # 标签数量 self.num_chars = config["num_chars"] # 字符数量 self.num_segs = 4 # 分割特征的数量 # 设置全局变量 self.global_step = tf.Variable(0, trainable=False) self.best_dev_f1 = tf.Variable(0.0, trainable=False) self.best_test_f1 = tf.Variable(0.0, trainable=False) self.initializer = initializers.xavier_initializer() # 设置输入占位符 self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="ChatInputs") # 字符特征,由字符的索引id组成 self.seg_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="SegInputs") # 分割特征,由每个字符的分割特征索引组成 self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="Targets") # 真实标签 self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout") # 设置变量 self.char_lookup = None # 词向量矩阵,初始化模型的时候,通过预训练词向量进行初始化 self.seg_lookup = None # 分割特征向量矩阵 self.trans = None # 状态转移矩阵,在loss层中进行计算 used = tf.sign(tf.abs(self.char_inputs)) # 计算序列中索引非0字符的数量 length = tf.reduce_sum(used, reduction_indices=1) self.lengths = tf.cast(length, tf.int32) # 记录序列除去padding(索引为0)的真实长度 self.batch_size = tf.shape(self.char_inputs)[0] self.num_steps = tf.shape(self.char_inputs)[-1] # 序列总长度 # 构造tensor的传递 embedding = self.embedding_layer() # 通过embedding_layer得到字词向量拼接后的特征向量 lstm_inputs = tf.nn.dropout(embedding, self.dropout) # dropout层 lstm_outputs = self.bilstm_layer(lstm_inputs) # 双向BiLSTM层 self.logits = self.project_layer(lstm_outputs) # 进行预测,得到对每个字符是每个标签的概率 self.loss = self.loss_layer(self.logits) # 计算loss # 设置训练阶段的优化算法 with tf.variable_scope("optimizer"): optimizer = self.config["optimizer"] if optimizer == "sgd": self.opt = tf.train.GradientDescentOptimizer(self.lr) elif optimizer == "adam": self.opt = tf.train.AdamOptimizer(self.lr) elif optimizer == "adgrad": self.opt = tf.train.AdagradOptimizer(self.lr) else: raise KeyError # 设置梯度裁剪(grad clip)以避免梯度爆炸 grads_vars = self.opt.compute_gradients(self.loss) capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars] self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) # 模型保存设置
def __init__(self, config, embeddings): self.config = config self.lstm_dim = config["lstm_dim"] self.num_chars = config["num_chars"] self.num_tags = config["num_tags"] self.char_dim = config["char_dim"] self.lr = config["lr"] self.char_embeding = tf.get_variable(name="char_embeding", initializer=embeddings) self.global_step = tf.Variable(0, trainable=False) self.initializer = initializers.xavier_initializer() self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs") self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets") self.dropout = tf.placeholder(dtype=tf.float32, name="dropout") self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths") # self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob") # self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob") self.input_dropout_keep_prob = tf.placeholder_with_default(config["input_dropout_keep"], [], name="input_dropout_keep_prob") self.batch_size = tf.shape(self.char_inputs)[0] self.num_steps = tf.shape(self.char_inputs)[-1] # forward embedding = self.embedding_layer(self.char_inputs) lstm_inputs = tf.nn.dropout(embedding, self.input_dropout_keep_prob) ## bi-directional lstm layer lstm_outputs = self.bilstm_layer(lstm_inputs) ## logits for tags self.project_layer(lstm_outputs) ## loss of the model self.loss = self.loss_layer(self.logits, self.lengths) with tf.variable_scope("optimizer"): optimizer = self.config["optimizer"] if optimizer == "sgd": self.opt = tf.train.GradientDescentOptimizer(self.lr) elif optimizer == "adam": self.opt = tf.train.AdamOptimizer(self.lr) elif optimizer == "adgrad": self.opt = tf.train.AdagradOptimizer(self.lr) else: raise KeyError grads_vars = self.opt.compute_gradients(self.loss) capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars] self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
def __init__(self): self.learning_rate = ner_tv.initial_learning_rate self.num_hidden = ner_tv.hidden_neural_size #lstm隐层个数 self.embedding_size = ner_tv.embedding_dim self.num_tags = ner_tv.tags_num self.max_grad_norm = ner_tv.max_grad_norm self.max_sentence_len = ner_tv.sentence_length self.w2v_model_path = ner_tv.word2vec_path self.model_save_path = ner_tv.train_model_bi_lstm self.train_epoch = ner_tv.num_epochs self.dropout_train = ner_tv.dropout self.decay_step = ner_tv.decay_step self.decay_rate = ner_tv.decay_rate self.min_learning_rate = ner_tv.min_learning_rate self.initializer = initializers.xavier_initializer() self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, self.max_sentence_len], name="inputs") self.labels = tf.placeholder(dtype=tf.int32, shape=[None, self.max_sentence_len], name='labels') self.dropout = tf.placeholder(dtype=tf.float32, name='dropout') with tf.variable_scope("word2vec_embedding"): self.embedding_vec = tf.Variable(change_gensim_mode2array(), name='word2vec', dtype=tf.float32) inputs_embedding = tf.nn.embedding_lookup(self.embedding_vec, self.inputs) lengths = self.get_length(self.inputs) self.lengths = tf.cast(lengths, tf.int32) lstm_outputs = self.biLSTM_layer(inputs_embedding, self.lengths) self.logits = self.project_layer(lstm_outputs) self.loss = self.loss_layer(self.logits, self.lengths) self.global_step = tf.Variable(0, trainable=False, name="global_step") self.train_learning_rate = tf.maximum( tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_step, self.decay_rate, staircase=True), self.min_learning_rate) self.optimizer = tf.train.AdamOptimizer( learning_rate=self.train_learning_rate) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), self.max_grad_norm) self.train_op = self.optimizer.apply_gradients( zip(grads, tvars), global_step=self.global_step) self.saver = tf.train.Saver(max_to_keep=3)
def _build_q_net(self, state, n_actions, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense( state, units=n_features, activation=tf.nn.relu, name='fc1', kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense( fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense( fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) q_out = tf.maximum( tf.layers.dense( fc3, units=n_actions, name='q', kernel_initializer=initializers.xavier_initializer()), 0) return q_out
def _create_mlp_policy(self, args): # Get samples from standard normal distribution, transform to match z-distribution samples = tf.random_normal( [args.sample_size, args.batch_size, args.z_dim], name="z_samples") self.z_samples = samples * tf.exp(self.z_logstd) + self.z_mean self.z_samples = tf.transpose(self.z_samples, perm=[1, 0, 2]) # Construct encoder input enc_in = tf.concat(2, [self.states, self.z_samples]) enc_in = tf.reshape( enc_in, [args.batch_size * args.sample_size, args.state_dim + args.z_dim], name="enc_in") # Create fully connected network of desired size W = tf.get_variable("w_0", [args.state_dim + args.z_dim, args.policy_size], initializer=initializers.xavier_initializer()) b = tf.get_variable("b_0", [args.policy_size]) output = tf.nn.dropout(tf.nn.relu(tf.nn.xw_plus_b(enc_in, W, b)), args.dropout_level) for i in xrange(1, args.num_policy_layers): W = tf.get_variable("w_" + str(i), [args.policy_size, args.policy_size], initializer=initializers.xavier_initializer()) b = tf.get_variable("b_" + str(i), [args.policy_size]) output = tf.nn.dropout(tf.nn.relu(tf.nn.xw_plus_b(output, W, b)), args.dropout_level) W = tf.get_variable("w_end", [args.policy_size, args.action_dim], initializer=initializers.xavier_initializer()) b = tf.get_variable("b_end", [args.action_dim]) a_mean = tf.nn.xw_plus_b(output, W, b) self.a_mean = tf.reshape( a_mean, [args.batch_size, args.sample_size, args.action_dim], name="a_mean") # Initialize logstd self.a_logstd = tf.Variable(np.zeros(args.action_dim), name="a_logstd", dtype=tf.float32)
def project_layer(inputs, out_dim, seq_length, scope='project'): with tf.variable_scope(scope): in_dim = inputs.get_shape().as_list()[-1] weight = tf.get_variable('W', shape=[in_dim, out_dim], dtype=tf.float32, initializer=initializers.xavier_initializer()) bias = tf.get_variable('b', shape=[out_dim], dtype=tf.float32, initializer=tf.zeros_initializer()) t_output = tf.reshape(inputs, [-1, in_dim]) # (batch_size*seq_length, in_dim) output = tf.matmul(t_output, weight) + bias # (batch_size*seq_length, out_dim) output = tf.reshape(output, [-1, seq_length, out_dim]) # (batch_size, seq_length, out_dim) return output
def _create_tf_embed_nn(self, x_in, is_training, layer_sizes, name): # type: (tf.Tensor, tf.Tensor, List[int], Text) -> tf.Tensor """Create nn with hidden layers and name""" reg = tf.contrib.layers.l2_regularizer(self.C2) x = x_in for i, layer_size in enumerate(layer_sizes): x = tf.layers.dense(inputs=x, kernel_initializer=initializers.xavier_initializer(), units=layer_size, activation=tf.nn.relu, kernel_regularizer=reg, name='hidden_layer_{}_{}'.format(name, i)) x = tf.layers.dropout(x, rate=self.droprate, training=is_training) x = tf.layers.dense(inputs=x, kernel_initializer=initializers.xavier_initializer(), units=self.embed_dim, kernel_regularizer=reg, name='embed_layer_{}'.format(name)) return x
def test_conv_layer(self): g = ops.Graph() with g.as_default(): inputs = array_ops.placeholder(dtypes.float32, shape=[8, 5, 5, 3]) with contrib_ops.arg_scope( [layers.batch_norm], fused=True, is_training=True, trainable=True): return layers.convolution( inputs, num_outputs=16, kernel_size=3, stride=1, padding='VALID', activation_fn=nn_ops.relu, normalizer_fn=layers.batch_norm, normalizer_params={}, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, trainable=True, scope=None) inputs_pattern = graph_matcher.OpTypePattern('*', name='inputs') relu_pattern = graph_matcher.OpTypePattern( 'Relu', name='relu', inputs=[ graph_matcher.OpTypePattern( 'FusedBatchNorm', inputs=[ graph_matcher.OpTypePattern( 'Conv2D', inputs=[inputs_pattern, '*']), '*', '*', '*', '*' ]) ]) matcher = graph_matcher.GraphMatcher(relu_pattern) match_results = list(matcher.match_graph(g)) self.assertEqual(1, len(match_results)) match_result = match_results[0] self.assertEqual(match_result.get_tensor(inputs_pattern), inputs) self.assertEqual(match_result.get_tensor('inputs'), inputs)
def _get_weights_wrapper( name, shape, dtype=tf.float32, initializer=initializers.xavier_initializer(), weights_decay_factor=None ): """Wrapper over _get_variable_wrapper() to get weights, with weights decay factor in loss. """ weights = _get_variable_wrapper( name=name, shape=shape, dtype=dtype, initializer=initializer ) if weights_decay_factor is not None and weights_decay_factor > 0.0: weights_wd = tf.multiply( tf.nn.l2_loss(weights), weights_decay_factor, name=name + '/l2loss' ) tf.add_to_collection('losses', weights_wd) return weights
def __init__(self, sess, dataset, conf, num_train=50000, input_size=3072, test_filename='/data2/andrewliao11/cifar-10-batches-py/test_batch'): self.sess = sess self.test_filename = test_filename self.w = 32 self.h = 32 self.channel = 3 self.synthetic = conf.synthetic self.optim_type = conf.optim_type self.test_per_iter = conf.test_per_iter self.max_step = conf.max_step self.ckpt_dir = conf.checkpoint_dir self.batch_size = conf.batch_size self.num_train = num_train self.max_epoch = math.floor(conf.max_step/math.floor(self.num_train/self.batch_size)) self.input_dims = input_size self.hidden_size = conf.hidden_size self.weight_initializer = initializers.xavier_initializer() self.bias_initializer = tf.constant_initializer(0.1) self.output_size = conf.output_size self.max_to_keep = conf.max_to_keep self.dataset = dataset self.var = {} self.grad_output = {} self.synthetic_grad = {} self.layer_out = {} self.grad_loss = [] self.global_step = tf.get_variable('global_step', [],initializer=tf.constant_initializer(0), trainable=False) if self.optim_type == 'exp_decay': decay_steps = int(math.floor(self.num_train/self.batch_size)* conf.num_epoch_per_decay) self.lr = tf.train.exponential_decay(conf.init_lr, self.global_step, decay_steps, conf.decay_factor, staircase=True) self.optim = tf.train.GradientDescentOptimizer(self.lr) elif self.optim_type == 'adam': self.optim = tf.train.AdamOptimizer(conf.init_lr)
def linear(input_, output_size, weights_initializer=initializers.xavier_initializer(), biases_initializer=tf.zeros_initializer, activation_fn=None, trainable=True, name='linear'): shape = input_.get_shape().as_list() if len(shape) > 2: input_ = tf.reshape(input_, [-1, reduce(lambda x, y: x * y, shape[1:])]) shape = input_.get_shape().as_list() with tf.variable_scope(name): w = tf.get_variable('w', [shape[1], output_size], tf.float32, initializer=weights_initializer, trainable=trainable) b = tf.get_variable('b', [output_size], initializer=biases_initializer, trainable=trainable) out = tf.nn.bias_add(tf.matmul(input_, w), b) if activation_fn != None: return activation_fn(out), w, b else: return out, w, b
def class_capsules(inputs, num_classes, iterations, batch_size, name): """ :param inputs: ((24, 4, 4, 32, 4, 4), (24, 4, 4, 32)) :param num_classes: 10 :param iterations: 3 :param batch_size: 24 :param name: :return poses, activations: poses (24, 10, 4, 4), activation (24, 10). """ inputs_poses, inputs_activations = inputs # (24, 4, 4, 32, 4, 4), (24, 4, 4, 32) inputs_shape = inputs_poses.get_shape() spatial_size = int(inputs_shape[1]) # 4 pose_size = int(inputs_shape[-1]) # 4 i_size = int(inputs_shape[3]) # 32 # inputs_poses (24*4*4=384, 32, 16) inputs_poses = tf.reshape(inputs_poses, shape=[batch_size*spatial_size*spatial_size, inputs_shape[-3], inputs_shape[-2]*inputs_shape[-2] ]) with tf.variable_scope(name) as scope: with tf.variable_scope('votes') as scope: # inputs_poses (384, 32, 16) # votes: (384, 32, 10, 16) votes = mat_transform(inputs_poses, num_classes, size=batch_size*spatial_size*spatial_size) tf.logging.info(f"{name} votes shape: {votes.get_shape()}") # votes (24, 4, 4, 32, 10, 16) votes = tf.reshape(votes, shape=[batch_size, spatial_size, spatial_size, i_size, num_classes, pose_size*pose_size]) # (24, 4, 4, 32, 10, 16) votes = coord_addition(votes, spatial_size, spatial_size) tf.logging.info(f"{name} votes shape with coord addition: {votes.get_shape()}") with tf.variable_scope('routing') as scope: # beta_v and beta_a one for each output capsule: (1, 10) beta_v = tf.get_variable( name='beta_v', shape=[1, num_classes], dtype=tf.float32, initializer=initializers.xavier_initializer() ) beta_a = tf.get_variable( name='beta_a', shape=[1, num_classes], dtype=tf.float32, initializer=initializers.xavier_initializer() ) # votes (24, 4, 4, 32, 10, 16) -> (24, 512, 10, 16) votes_shape = votes.get_shape() votes = tf.reshape(votes, shape=[batch_size, votes_shape[1] * votes_shape[2] * votes_shape[3], votes_shape[4], votes_shape[5]] ) # inputs_activations (24, 4, 4, 32) -> (24, 512) inputs_activations = tf.reshape(inputs_activations, shape=[batch_size, votes_shape[1] * votes_shape[2] * votes_shape[3]]) # votes (24, 512, 10, 16), inputs_activations (24, 512) # poses (24, 10, 16), activation (24, 10) poses, activations = matrix_capsules_em_routing( votes, inputs_activations, beta_v, beta_a, iterations, name='em_routing' ) # poses (24, 10, 16) -> (24, 10, 4, 4) poses = tf.reshape(poses, shape=[batch_size, num_classes, pose_size, pose_size] ) # poses (24, 10, 4, 4), activation (24, 10) return poses, activations
def legacy_fully_connected(x, num_output_units, activation_fn=None, weight_init=initializers.xavier_initializer(), bias_init=init_ops.zeros_initializer, name=None, weight_collections=(ops.GraphKeys.WEIGHTS,), bias_collections=(ops.GraphKeys.BIASES,), output_collections=(ops.GraphKeys.ACTIVATIONS,), trainable=True, weight_regularizer=None, bias_regularizer=None): # pylint: disable=anomalous-backslash-in-string r"""Adds the parameters for a fully connected layer and returns the output. A fully connected layer is generally defined as a matrix multiply: `y = f(w * x + b)` where `f` is given by `activation_fn`. If `activation_fn` is `None`, the result of `y = w * x + b` is returned. If `x` has shape [\\\(\\text{dim}_0, \\text{dim}_1, ..., \\text{dim}_n\\\)] with more than 2 dimensions (\\\(n > 1\\\)), then we repeat the matrix multiply along the first dimensions. The result r is a tensor of shape [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`], where \\\( r_{i_0, ..., i_{n-1}, k} = \\sum_{0 \\leq j < \\text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\\). This is accomplished by reshaping `x` to 2-D [\\\(\\text{dim}_0 \\cdot ... \\cdot \\text{dim}_{n-1}, \\text{dim}_n\\\)] before the matrix multiply and afterwards reshaping it to [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`]. This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting `bias_init` to `None`. The variable creation is compatible with `tf.variable_scope` and so can be reused with `tf.variable_scope` or `tf.make_template`. Most of the details of variable creation can be controlled by specifying the initializers (`weight_init` and `bias_init`) and in which collections to place the created variables (`weight_collections` and `bias_collections`; note that the variables are always added to the `VARIABLES` collection). The output of the layer can be placed in custom collections using `output_collections`. The collections arguments default to `WEIGHTS`, `BIASES` and `ACTIVATIONS`, respectively. A per layer regularization can be specified by setting `weight_regularizer` and `bias_regularizer`, which are applied to the weights and biases respectively, and whose output is added to the `REGULARIZATION_LOSSES` collection. Args: x: The input `Tensor`. num_output_units: The size of the output. activation_fn: A function that requires a single Tensor that is applied as a non-linearity. If None is used, do not apply any activation. weight_init: An optional weight initialization, defaults to `xavier_initializer`. bias_init: An initializer for the bias, defaults to 0. Set to `None` in order to disable bias. name: The name for this operation is used to name operations and to find variables. If specified it must be unique for this scope, otherwise a unique name starting with "fully_connected" will be created. See `tf.variable_op_scope` for details. weight_collections: List of graph collections to which weights are added. bias_collections: List of graph collections to which biases are added. output_collections: List of graph collections to which outputs are added. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). weight_regularizer: A regularizer like the result of `l1_regularizer` or `l2_regularizer`. Used for weights. bias_regularizer: A regularizer like the result of `l1_regularizer` or `l2_regularizer`. Used for biases. Returns: The output of the fully connected layer. 
Raises: ValueError: if x has rank less than 2 or if its last dimension is not set. """ with variable_scope.variable_op_scope([x], name, 'fully_connected'): dims = x.get_shape().dims if dims is None: raise ValueError('dims of x must be known but is None') if len(dims) < 2: raise ValueError('rank of x must be at least 2 not: %d' % len(dims)) num_input_units = dims[-1].value if num_input_units is None: raise ValueError('last dimension of x must be known but is None') dtype = x.dtype.base_dtype weight_collections = set(list(weight_collections or []) + [ops.GraphKeys.VARIABLES]) w = variable_scope.get_variable('weights', shape=[num_input_units, num_output_units], dtype=dtype, initializer=weight_init, collections=weight_collections, regularizer=weight_regularizer, trainable=trainable) x_2_dim = x if len(dims) <= 2 else array_ops.reshape(x, [-1, num_input_units]) y = standard_ops.matmul(x_2_dim, w) if bias_init is not None: bias_collections = set(list(bias_collections or []) + [ops.GraphKeys.VARIABLES]) b = variable_scope.get_variable('bias', shape=[num_output_units], dtype=dtype, initializer=bias_init, collections=bias_collections, regularizer=bias_regularizer, trainable=trainable) y = nn.bias_add(y, b) if len(dims) > 2: out_shape = array_ops.unpack(array_ops.shape(x)) out_shape[-1] = num_output_units y = array_ops.reshape(y, array_ops.pack(out_shape)) static_shape = x.get_shape().as_list() static_shape[-1] = num_output_units y.set_shape(static_shape) return _apply_activation(y, activation_fn, output_collections)
def fully_connected(inputs, num_outputs, activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer, biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Adds a fully connected layer. `fully_connected` creates a variable called `weights`, representing a fully connected weight matrix, which is multiplied by the `inputs` to produce a `Tensor` of hidden units. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the hidden units. Finally, if `activation_fn` is not `None`, it is applied to the hidden units as well. Note: that if `inputs` have a rank greater than 2, then `inputs` is flattened prior to the initial matrix multiply by `weights`. Args: inputs: A tensor of with at least rank 2 and value for the last dimension, i.e. `[batch_size, depth]`, `[None, None, None, channels]`. num_outputs: Integer, the number of output units in the layer. activation_fn: activation function. normalizer_fn: normalization function to use instead of `biases`. If `normalize_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. normalizer_params: normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collections per variable. outputs_collections: collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for variable_op_scope. Returns: the tensor variable representing the result of the series of operations. Raises: ValueError: if x has rank less than 2 or if its last dimension is not set. 
""" if not isinstance(num_outputs, int): raise ValueError('num_outputs should be integer, got %s.', num_outputs) with variable_scope.variable_op_scope([inputs], scope, 'fully_connected', reuse=reuse) as sc: dtype = inputs.dtype.base_dtype num_input_units = utils.last_dimension(inputs.get_shape(), min_rank=2) static_shape = inputs.get_shape().as_list() static_shape[-1] = num_outputs out_shape = array_ops.unpack(array_ops.shape(inputs)) out_shape[-1] = num_outputs weights_shape = [num_input_units, num_outputs] weights_collections = utils.get_variable_collections( variables_collections, 'weights') weights = variables.model_variable('weights', shape=weights_shape, dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, collections=weights_collections, trainable=trainable) if len(static_shape) > 2: # Reshape inputs inputs = array_ops.reshape(inputs, [-1, num_input_units]) outputs = standard_ops.matmul(inputs, weights) if normalizer_fn: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) else: if biases_initializer is not None: biases_collections = utils.get_variable_collections( variables_collections, 'biases') biases = variables.model_variable('biases', shape=[num_outputs,], dtype=dtype, initializer=biases_initializer, regularizer=biases_regularizer, collections=biases_collections, trainable=trainable) outputs = nn.bias_add(outputs, biases) if len(static_shape) > 2: # Reshape back outputs outputs = array_ops.reshape(outputs, array_ops.pack(out_shape)) outputs.set_shape(static_shape) if activation_fn: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def convolution2d(inputs, num_outputs, kernel_size, stride=1, padding='SAME', activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer, biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Adds a 2D convolution followed by an optional batch_norm layer. `convolution2d` creates a variable called `weights`, representing the convolutional kernel, that is convolved with the `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the activations. Finally, if `activation_fn` is not `None`, it is applied to the activations as well. Args: inputs: a 4-D tensor `[batch_size, height, width, channels]`. num_outputs: integer, the number of output filters. kernel_size: a list of length 2 `[kernel_height, kernel_width]` of of the filters. Can be an int if both values are the same. stride: a list of length 2 `[stride_height, stride_width]`. Can be an int if both strides are the same. Note that presently both strides must have the same value. padding: one of `VALID` or `SAME`. activation_fn: activation function. normalizer_fn: normalization function to use instead of `biases`. If `normalize_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. normalizer_params: normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional list of collections for all the variables or a dictionay containing a different list of collection per variable. outputs_collections: collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_op_scope`. Returns: a tensor representing the output of the operation. 
""" with variable_scope.variable_op_scope([inputs], scope, 'Conv', reuse=reuse) as sc: dtype = inputs.dtype.base_dtype kernel_h, kernel_w = utils.two_element_tuple(kernel_size) stride_h, stride_w = utils.two_element_tuple(stride) num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4) weights_shape = [kernel_h, kernel_w, num_filters_in, num_outputs] weights_collections = utils.get_variable_collections( variables_collections, 'weights') weights = variables.model_variable('weights', shape=weights_shape, dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, collections=weights_collections, trainable=trainable) outputs = nn.conv2d(inputs, weights, [1, stride_h, stride_w, 1], padding=padding) if normalizer_fn: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) else: if biases_initializer is not None: biases_collections = utils.get_variable_collections( variables_collections, 'biases') biases = variables.model_variable('biases', shape=[num_outputs,], dtype=dtype, initializer=biases_initializer, regularizer=biases_regularizer, collections=biases_collections, trainable=trainable) outputs = nn.bias_add(outputs, biases) if activation_fn: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def separable_convolution2d( inputs, num_outputs, kernel_size, depth_multiplier, stride=1, padding='SAME', rate=1, activation_fn=tf.nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, data_format='NHWC', scope=None): """Adds a depth-separable 2D convolution with optional batch_norm layer. This op first performs a depthwise convolution that acts separately on channels, creating a variable called `depthwise_weights`. If `num_outputs` is not None, it adds a pointwise convolution that mixes channels, creating a variable called `pointwise_weights`. Then, if `batch_norm_params` is None, it adds bias to the result, creating a variable called 'biases', otherwise it adds a batch normalization layer. It finally applies an activation function to produce the end result. Args: inputs: A tensor of size [batch_size, height, width, channels]. num_outputs: The number of pointwise convolution output filters. If is None, then we skip the pointwise convolution stage. kernel_size: A list of length 2: [kernel_height, kernel_width] of of the filters. Can be an int if both values are the same. depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. stride: A list of length 2: [stride_height, stride_width], specifying the depthwise convolution stride. Can be an int if both strides are the same. padding: One of 'VALID' or 'SAME'. rate: A list of length 2: [rate_height, rate_width], specifying the dilation rates for a'trous convolution. Can be an int if both rates are the same. If any value is larger than one, then both stride values need to be one. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. normalizer_fn: Normalization function to use instead of `biases`. If `normalizer_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. default set to None for no normalizer function normalizer_params: Normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collection per variable. outputs_collections: Collection to add the outputs. trainable: Whether or not the variables should be trainable or not. scope: Optional scope for variable_scope. Returns: A `Tensor` representing the output of the operation. 
""" layer_variable_getter = _build_variable_getter( {'bias': 'biases', 'depthwise_kernel': 'depthwise_weights', 'pointwise_kernel': 'pointwise_weights'}) with variable_scope.variable_scope( scope, 'SeparableConv2d', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) if num_outputs is not None: channel_format = 'channels_last' if data_format == 'NHWC' else 'channels_first' # Apply separable conv using the SeparableConvolution2D layer. layer = convolutional_layers.SeparableConvolution2D( filters=num_outputs, kernel_size=kernel_size, strides=stride, padding=padding, data_format=channel_format, dilation_rate=utils.two_element_tuple(rate), activation=None, depth_multiplier=depth_multiplier, use_bias=not normalizer_fn and biases_initializer, depthwise_initializer=weights_initializer, pointwise_initializer=weights_initializer, bias_initializer=biases_initializer, depthwise_regularizer=weights_regularizer, pointwise_regularizer=weights_regularizer, bias_regularizer=biases_regularizer, activity_regularizer=None, trainable=trainable, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. _add_variable_to_collections(layer.depthwise_kernel, variables_collections, 'weights') _add_variable_to_collections(layer.pointwise_kernel, variables_collections, 'weights') if layer.bias: _add_variable_to_collections(layer.bias, variables_collections, 'biases') if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) else: outputs = depthwise_convolution2d( inputs, kernel_size, depth_multiplier, stride, padding, rate, activation_fn, normalizer_fn, normalizer_params, weights_initializer, weights_regularizer, biases_initializer, biases_regularizer, reuse, variables_collections, outputs_collections, trainable, data_format, scope=None) return outputs if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def __init__(self, config): self.config = config self.lr = config["lr"] self.char_dim = config["char_dim"] self.lstm_dim = config["lstm_dim"] self.seg_dim = config["seg_dim"] self.num_tags = config["num_tags"] self.num_chars = config["num_chars"] self.num_segs = 4 self.global_step = tf.Variable(0, trainable=False) self.best_dev_f1 = tf.Variable(0.0, trainable=False) self.best_test_f1 = tf.Variable(0.0, trainable=False) self.initializer = initializers.xavier_initializer() # add placeholders for the model self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="ChatInputs") self.seg_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="SegInputs") self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="Targets") # dropout keep prob self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout") used = tf.sign(tf.abs(self.char_inputs)) length = tf.reduce_sum(used, reduction_indices=1) self.lengths = tf.cast(length, tf.int32) self.batch_size = tf.shape(self.char_inputs)[0] self.num_steps = tf.shape(self.char_inputs)[-1] # embeddings for chinese character and segmentation representation embedding = self.embedding_layer(self.char_inputs, self.seg_inputs, config) # apply dropout before feed to lstm layer lstm_inputs = tf.nn.dropout(embedding, self.dropout) # bi-directional lstm layer lstm_outputs = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths) # logits for tags self.logits = self.project_layer(lstm_outputs) # loss of the model self.loss = self.loss_layer(self.logits, self.lengths) with tf.variable_scope("optimizer"): optimizer = self.config["optimizer"] if optimizer == "sgd": self.opt = tf.train.GradientDescentOptimizer(self.lr) elif optimizer == "adam": self.opt = tf.train.AdamOptimizer(self.lr) elif optimizer == "adgrad": self.opt = tf.train.AdagradOptimizer(self.lr) else: raise KeyError # apply grad clip to avoid gradient explosion grads_vars = self.opt.compute_gradients(self.loss) capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars] self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step) # saver of the model self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
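# A minimal sketch (illustrative names, not from the source) of the clip-then-apply
# pattern used in the optimizer block above: every gradient is clipped elementwise
# to [-clip, clip] before apply_gradients, and passing global_step lets the
# optimizer advance the step counter.
import tensorflow as tf

def make_train_op(loss, learning_rate, clip, global_step):
    opt = tf.train.AdamOptimizer(learning_rate)
    grads_vars = opt.compute_gradients(loss)
    capped = [(tf.clip_by_value(g, -clip, clip), v)
              for g, v in grads_vars if g is not None]   # skip variables without gradients
    return opt.apply_gradients(capped, global_step=global_step)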
def depthwise_convolution2d( inputs, kernel_size, depth_multiplier=1, stride=1, padding='SAME', rate=1, activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, data_format='NHWC', scope=None): """Adds a depthwise 2D convolution with optional batch_norm layer. This op performs a depthwise convolution that acts separately on channels, creating a variable called `depthwise_weights`. Then, if `normalizer_fn` is None, it adds bias to the result, creating a variable called 'biases', otherwise, the `normalizer_fn` is applied. It finally applies an activation function to produce the end result. Args: inputs: A tensor of size [batch_size, height, width, channels]. num_outputs: The number of pointwise convolution output filters. If is None, then we skip the pointwise convolution stage. kernel_size: A list of length 2: [kernel_height, kernel_width] of of the filters. Can be an int if both values are the same. depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. stride: A list of length 2: [stride_height, stride_width], specifying the depthwise convolution stride. Can be an int if both strides are the same. padding: One of 'VALID' or 'SAME'. rate: A list of length 2: [rate_height, rate_width], specifying the dilation rates for atrous convolution. Can be an int if both rates are the same. If any value is larger than one, then both stride values need to be one. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. normalizer_fn: Normalization function to use instead of `biases`. If `normalizer_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. default set to None for no normalizer function normalizer_params: Normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collection per variable. outputs_collections: Collection to add the outputs. trainable: Whether or not the variables should be trainable or not. scope: Optional scope for variable_scope. Returns: A `Tensor` representing the output of the operation. """ with variable_scope.variable_scope(scope, 'DepthwiseConv2d', [inputs], reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) # Actually apply depthwise conv instead of separable conv. 
dtype = inputs.dtype.base_dtype kernel_h, kernel_w = utils.two_element_tuple(kernel_size) stride_h, stride_w = utils.two_element_tuple(stride) if data_format == 'NHWC': num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4) strides = [1, stride_h, stride_w, 1] else: num_filters_in = inputs.get_shape().as_list()[1] strides = [1, 1, stride_h, stride_w] weights_collections = utils.get_variable_collections( variables_collections, 'weights') # Depthwise weights variable. depthwise_shape = [kernel_h, kernel_w, num_filters_in, depth_multiplier] depthwise_weights = variables.model_variable( 'depthwise_weights', shape=depthwise_shape, dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, trainable=trainable, collections=weights_collections) outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding, rate=utils.two_element_tuple(rate), data_format=data_format) num_outputs = depth_multiplier * num_filters_in if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) else: if biases_initializer is not None: biases_collections = utils.get_variable_collections( variables_collections, 'biases') biases = variables.model_variable('biases', shape=[num_outputs,], dtype=dtype, initializer=biases_initializer, regularizer=biases_regularizer, trainable=trainable, collections=biases_collections) outputs = nn.bias_add(outputs, biases, data_format=data_format) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
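# Hedged sketch of the core op above using the plain tf.nn API (names are
# illustrative): a depthwise convolution produces in_channels * depth_multiplier
# output channels, matching the num_outputs computation in the wrapper.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 28, 28, 16])
depth_multiplier = 2
dw_filter = tf.get_variable('depthwise_weights', [3, 3, 16, depth_multiplier],
                            initializer=tf.contrib.layers.xavier_initializer())
y = tf.nn.depthwise_conv2d(x, dw_filter, strides=[1, 1, 1, 1], padding='SAME')
# y has shape [None, 28, 28, 32], i.e. 16 input channels * depth_multiplier of 2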
def masked_fully_connected( inputs, num_outputs, activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Adds a sparse fully connected layer. The weight matrix is masked. `fully_connected` creates a variable called `weights`, representing a fully connected weight matrix, which is multiplied by the `inputs` to produce a `Tensor` of hidden units. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the hidden units. Finally, if `activation_fn` is not `None`, it is applied to the hidden units as well. Note: that if `inputs` have a rank greater than 2, then `inputs` is flattened prior to the initial matrix multiply by `weights`. Args: inputs: A tensor of at least rank 2 and static value for the last dimension; i.e. `[batch_size, depth]`, `[None, None, None, channels]`. num_outputs: Integer or long, the number of output units in the layer. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. normalizer_fn: Normalization function to use instead of `biases`. If `normalizer_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. default set to None for no normalizer function normalizer_params: Normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collections per variable. outputs_collections: Collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for variable_scope. Returns: The tensor variable representing the result of the series of operations. Raises: ValueError: If x has rank less than 2 or if its last dimension is not set. """ if not isinstance(num_outputs, six.integer_types): raise ValueError('num_outputs should be int or long, got %s.' % (num_outputs,)) layer_variable_getter = _build_variable_getter({ 'bias': 'biases', 'kernel': 'weights' }) with variable_scope.variable_scope( scope, 'fully_connected', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) layer = core.MaskedFullyConnected( units=num_outputs, activation=None, use_bias=not normalizer_fn and biases_initializer, kernel_initializer=weights_initializer, bias_initializer=biases_initializer, kernel_regularizer=weights_regularizer, bias_regularizer=biases_regularizer, activity_regularizer=None, trainable=trainable, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. 
        _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
        if layer.bias is not None:
            _add_variable_to_collections(layer.bias, variables_collections, 'biases')
        # Apply normalizer function / layer.
        if normalizer_fn is not None:
            if not normalizer_params:
                normalizer_params = {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
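# Hedged usage sketch, assuming the layer above is exported as
# tf.contrib.model_pruning.masked_fully_connected (TF 1.x); layer widths and tensor
# names are illustrative. The mask variables the layer creates are what the
# pruning ops later drive toward sparsity.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
net = tf.contrib.model_pruning.masked_fully_connected(x, 300)
logits = tf.contrib.model_pruning.masked_fully_connected(net, 10, activation_fn=None)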
def masked_convolution(inputs, num_outputs, kernel_size, stride=1, padding='SAME', data_format=None, rate=1, activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Adds an 2D convolution followed by an optional batch_norm layer. The layer creates a mask variable on top of the weight variable. The input to the convolution operation is the elementwise multiplication of the mask variable and the weigh It is required that 1 <= N <= 3. `convolution` creates a variable called `weights`, representing the convolutional kernel, that is convolved (actually cross-correlated) with the `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the activations. Finally, if `activation_fn` is not `None`, it is applied to the activations as well. Performs atrous convolution with input stride/dilation rate equal to `rate` if a value > 1 for any dimension of `rate` is specified. In this case `stride` values != 1 are not supported. Args: inputs: A Tensor of rank N+2 of shape `[batch_size] + input_spatial_shape + [in_channels]` if data_format does not start with "NC" (default), or `[batch_size, in_channels] + input_spatial_shape` if data_format starts with "NC". num_outputs: Integer, the number of output filters. kernel_size: A sequence of N positive integers specifying the spatial dimensions of the filters. Can be a single integer to specify the same value for all spatial dimensions. stride: A sequence of N positive integers specifying the stride at which to compute output. Can be a single integer to specify the same value for all spatial dimensions. Specifying any `stride` value != 1 is incompatible with specifying any `rate` value != 1. padding: One of `"VALID"` or `"SAME"`. data_format: A string or None. Specifies whether the channel dimension of the `input` and output is the last dimension (default, or if `data_format` does not start with "NC"), or the second dimension (if `data_format` starts with "NC"). For N=1, the valid values are "NWC" (default) and "NCW". For N=2, the valid values are "NHWC" (default) and "NCHW". For N=3, the valid values are "NDHWC" (default) and "NCDHW". rate: A sequence of N positive integers specifying the dilation rate to use for atrous convolution. Can be a single integer to specify the same value for all spatial dimensions. Specifying any `rate` value != 1 is incompatible with specifying any `stride` value != 1. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. normalizer_fn: Normalization function to use instead of `biases`. If `normalizer_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. default set to None for no normalizer function normalizer_params: Normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. 
reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collection per variable. outputs_collections: Collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. Returns: A tensor representing the output of the operation. Raises: ValueError: If `data_format` is invalid. ValueError: Both 'rate' and `stride` are not uniformly 1. """ if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']: raise ValueError('Invalid data_format: %r' % (data_format,)) layer_variable_getter = _build_variable_getter({ 'bias': 'biases', 'kernel': 'weights' }) with variable_scope.variable_scope( scope, 'Conv', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) input_rank = inputs.get_shape().ndims if input_rank == 3: raise ValueError('Sparse Convolution not supported for input with rank', input_rank) elif input_rank == 4: layer_class = core.MaskedConv2D elif input_rank == 5: raise ValueError('Sparse Convolution not supported for input with rank', input_rank) else: raise ValueError('Sparse Convolution not supported for input with rank', input_rank) if data_format is None or data_format == 'NHWC': df = 'channels_last' elif data_format == 'NCHW': df = 'channels_first' else: raise ValueError('Unsupported data format', data_format) layer = layer_class( filters=num_outputs, kernel_size=kernel_size, strides=stride, padding=padding, data_format=df, dilation_rate=rate, activation=None, use_bias=not normalizer_fn and biases_initializer, kernel_initializer=weights_initializer, bias_initializer=biases_initializer, kernel_regularizer=weights_regularizer, bias_regularizer=biases_regularizer, activity_regularizer=None, trainable=trainable, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. _add_variable_to_collections(layer.kernel, variables_collections, 'weights') if layer.use_bias: _add_variable_to_collections(layer.bias, variables_collections, 'biases') if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
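# A minimal sketch (not the library class) of the masking idea the docstring
# describes: the effective kernel is weights * mask, so pruning can zero mask
# entries without touching the stored weights. All names are illustrative.
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 32, 32, 16])
weights = tf.get_variable('weights', [3, 3, 16, 32],
                          initializer=tf.contrib.layers.xavier_initializer())
mask = tf.get_variable('mask', [3, 3, 16, 32], trainable=False,
                       initializer=tf.ones_initializer())
masked_kernel = tf.multiply(weights, mask)      # elementwise gate on the kernel
outputs = tf.nn.conv2d(images, masked_kernel, strides=[1, 1, 1, 1], padding='SAME')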
def conv2d_leaders(inputs, num_outputs, kernel_size, rates=[1], stride=1, padding='SAME', activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer, biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None,): """Adds a 2D convolution followed by an optional batch_norm layer. `convolution2d` creates a variable called `weights`, representing the convolutional kernel, that is convolved with the `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the activations. Finally, if `activation_fn` is not `None`, it is applied to the activations as well. Performs a'trous convolution with input stride equal to rate if rate is greater than one. Args: inputs: a 4-D tensor `[batch_size, height, width, channels]`. num_outputs: integer, the number of output filters. kernel_size: a list of length 2 `[kernel_height, kernel_width]` of of the filters. Can be an int if both values are the same. stride: a list of length 2 `[stride_height, stride_width]`. Can be an int if both strides are the same. Note that presently both strides must have the same value. padding: one of `VALID` or `SAME`. rate: integer. If less than or equal to 1, a standard convolution is used. If greater than 1, than the a'trous convolution is applied and `stride` must be set to 1. activation_fn: activation function. normalizer_fn: normalization function to use instead of `biases`. If `normalize_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. normalizer_params: normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional list of collections for all the variables or a dictionay containing a different list of collection per variable. outputs_collections: collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_op_scope`. Returns: a tensor representing the output of the operation. Raises: ValueError: if both 'rate' and `stride` are larger than one. """ with variable_scope.variable_scope(scope, 'Conv', [inputs], reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) dtype = inputs.dtype.base_dtype # inshape = tf.shape(inputs) # Leading kernel size. kernel_h, kernel_w = utils.two_element_tuple(kernel_size) stride_h, stride_w = utils.two_element_tuple(stride) num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4) # Weights variable. 
weights_shape = [kernel_h, kernel_w, num_filters_in, num_outputs] weights_collections = utils.get_variable_collections( variables_collections, 'weights') weights = variables.model_variable('weights', shape=weights_shape, dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, collections=weights_collections, trainable=trainable) # # Bias variable. # biases = None # if biases_initializer is not None: # biases_collections = utils.get_variable_collections( # variables_collections, 'biases') # biases = variables.model_variable('biases', # shape=[num_outputs, ], # dtype=dtype, # initializer=biases_initializer, # regularizer=biases_regularizer, # collections=biases_collections, # trainable=trainable) # Convolution at different scales. outputs_pool = [] for rate in rates: if rate > 1: conv = nn.atrous_conv2d(inputs, weights, rate, padding='SAME') else: conv = nn.conv2d(inputs, weights, [1, 1, 1, 1], padding='SAME') outputs_pool.append(conv) # 'Pooling' at different scales. A bit hacky. Use of concat + max_pool? outputs = None outputs_pool.reverse() for node in outputs_pool: if outputs is None: outputs = node else: outputs = tf.maximum(outputs, node) # # Add bias? # if biases is not None: # outputs = tf.nn.bias_add(outputs, biases) # Fix padding and stride. A bit hacky too and not so efficient! if padding == 'VALID' or stride > 1: padfilter = np.zeros(shape=(kernel_h, kernel_w, num_filters_in, 1), dtype=dtype) x = (kernel_h - 1) / 2 y = (kernel_w - 1) / 2 padfilter[x, y, :, 0] = 1. outputs = tf.nn.depthwise_conv2d(outputs, padfilter, [1, stride_h, stride_w, 1], padding=padding) # Batch norm / bias and activation... if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) else: if biases_initializer is not None: biases_collections = utils.get_variable_collections( variables_collections, 'biases') biases = variables.model_variable('biases', shape=[num_outputs, ], dtype=dtype, initializer=biases_initializer, regularizer=biases_regularizer, collections=biases_collections, trainable=trainable) outputs = nn.bias_add(outputs, biases) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
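# A minimal sketch (illustrative names) of the multi-rate "leaders" trick above:
# the same kernel is applied at several dilation rates and the responses are
# merged with an elementwise maximum.
import tensorflow as tf

def leaders_response(x, w, rates=(1, 2, 4)):
    responses = []
    for rate in rates:
        if rate > 1:
            responses.append(tf.nn.atrous_conv2d(x, w, rate, padding='SAME'))
        else:
            responses.append(tf.nn.conv2d(x, w, [1, 1, 1, 1], padding='SAME'))
    out = responses[0]
    for r in responses[1:]:
        out = tf.maximum(out, r)                # keep the strongest response per position
    return out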
def fully_connected(x, num_output_units, activation_fn=None, weight_init=initializers.xavier_initializer(), bias_init=standard_ops.constant_initializer(0.), name=None, weight_collections=(ops.GraphKeys.WEIGHTS,), bias_collections=(ops.GraphKeys.BIASES,), output_collections=(ops.GraphKeys.ACTIVATIONS,), weight_regularizer=None, bias_regularizer=None): """Adds the parameters for a fully connected layer and returns the output. A fully connected layer is generally defined as a matrix multiply: `y = f(w * x + b)` where `f` is given by `activation_fn`. If `activation_fn` is `None`, the result of `y = w * x + b` is returned. This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting `bias_init` to `None`. The variable creation is compatible with `tf.variable_scope` and so can be reused with `tf.variable_scope` or `tf.make_template`. Most of the details of variable creation can be controlled by specifying the initializers (`weight_init` and `bias_init`) and which in collections to place the created variables (`weight_collections` and `bias_collections`; note that the variables are always added to the `VARIABLES` collection). The output of the layer can be placed in custom collections using `output_collections`. The collections arguments default to `WEIGHTS`, `BIASES` and `ACTIVATIONS`, respectively. A per layer regularization can be specified by setting `weight_regularizer` and `bias_regularizer`, which are applied to the weights and biases respectively, and whose output is added to the `REGULARIZATION_LOSSES` collection. Args: x: The input `Tensor`. num_output_units: The size of the output. activation_fn: A function that requires a single Tensor that is applied as a non-linearity. If None is used, do not apply any activation. weight_init: An optional weight initialization, defaults to `xavier_initializer`. bias_init: An initializer for the bias, defaults to 0. Set to `None` in order to disable bias. name: The name for this operation is used to name operations and to find variables. If specified it must be unique for this scope, otherwise a unique name starting with "fully_connected" will be created. See `tf.variable_op_scope` for details. weight_collections: List of graph collections to which weights are added. bias_collections: List of graph collections to which biases are added. output_collections: List of graph collections to which outputs are added. weight_regularizer: A regularizer like the result of `l1_regularizer` or `l2_regularizer`. Used for weights. bias_regularizer: A regularizer like the result of `l1_regularizer` or `l2_regularizer`. Used for biases. Returns: The output of the fully connected layer. """ with variable_scope.variable_op_scope([x], name, 'fully_connected'): num_input_units = x.get_shape().dims[1].value dtype = x.dtype.base_dtype w = _weight_variable(shape=[num_input_units, num_output_units], dtype=dtype, initializer=weight_init, collections=weight_collections, regularizer=weight_regularizer) y = standard_ops.matmul(x, w) if bias_init is not None: b = _bias_variable(shape=[num_output_units], dtype=dtype, initializer=bias_init, collections=bias_collections, regularizer=bias_regularizer) y = nn.bias_add(y, b) return _apply_activation(y, activation_fn, output_collections)
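# Hedged equivalence sketch: the layer above builds y = f(x @ w + b); the manual
# version below (illustrative shapes and names) creates the same graph structure
# with xavier-initialized weights and zero biases.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 256])
w = tf.get_variable('w', [256, 128], initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b', [128], initializer=tf.zeros_initializer())
y = tf.nn.relu(tf.nn.bias_add(tf.matmul(x, w), b))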
def dnn_sampled_softmax_classifier_model_fn(features, target_indices, mode, params): """model_fn that uses candidate sampling. Args: features: Single Tensor or dict of Tensor (depends on data passed to `fit`) target_indices: A single Tensor of shape [batch_size, n_labels] containing the target indices. mode: Represents if this training, evaluation or prediction. See `ModeKeys`. params: A dict of hyperparameters that are listed below. hidden_units- List of hidden units per layer. All layers are fully connected. Ex. `[64, 32]` means first layer has 64 nodes and second one has 32. feature_columns- An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. n_classes- number of target classes. It must be greater than 2. n_samples- number of sample target classes. Needs to be tuned - A good starting point could be 2% of n_classes. n_labels- number of labels in each example. top_k- The number of classes to predict. optimizer- An instance of `tf.Optimizer` used to train the model. If `None`, will use an Adagrad optimizer. dropout- When not `None`, the probability we will drop out a given coordinate. gradient_clip_norm- A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. See tf.clip_by_global_norm for more details. num_ps_replicas- The number of parameter server replicas. Returns: predictions: A single Tensor or a dict of Tensors. loss: A scalar containing the loss of the step. train_op: The op for training. """ hidden_units = params["hidden_units"] feature_columns = params["feature_columns"] n_classes = params["n_classes"] n_samples = params["n_samples"] n_labels = params["n_labels"] top_k = params["top_k"] optimizer = params["optimizer"] dropout = params["dropout"] gradient_clip_norm = params["gradient_clip_norm"] num_ps_replicas = params["num_ps_replicas"] parent_scope = "dnn_ss" # Setup the input layer partitioner. input_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas, min_slice_size=64 << 20)) # Create the input layer. with variable_scope.variable_scope( parent_scope + "/input_from_feature_columns", features.values(), partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( features, feature_columns, weight_collections=[parent_scope], scope=scope) # Setup the hidden layer partitioner. hidden_layer_partitioner = ( partitioned_variables.min_max_variable_partitioner( max_partitions=num_ps_replicas)) final_hidden_layer_dim = None # Create hidden layers using fully_connected. for layer_id, num_hidden_units in enumerate(hidden_units): with variable_scope.variable_scope( parent_scope + "/hiddenlayer_%d" % layer_id, [net], partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected(net, num_hidden_units, variables_collections=[parent_scope], scope=scope) final_hidden_layer_dim = num_hidden_units # Add dropout if it is enabled. if dropout is not None and mode == estimator.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dropout)) # Create the weights and biases for the logit layer. 
with variable_scope.variable_scope( parent_scope + "/logits", [net], partitioner=hidden_layer_partitioner) as scope: dtype = net.dtype.base_dtype weights_shape = [n_classes, final_hidden_layer_dim] weights = variables.model_variable( "weights", shape=weights_shape, dtype=dtype, initializer=initializers.xavier_initializer(), trainable=True, collections=[parent_scope]) biases = variables.model_variable( "biases", shape=[n_classes,], dtype=dtype, initializer=init_ops.zeros_initializer, trainable=True, collections=[parent_scope]) if mode == estimator.ModeKeys.TRAIN: # Call the candidate sampling APIs and calculate the loss. sampled_values = nn.learned_unigram_candidate_sampler( true_classes=math_ops.to_int64(target_indices), num_true=n_labels, num_sampled=n_samples, unique=True, range_max=n_classes) sampled_softmax_loss = nn.sampled_softmax_loss( weights=weights, biases=biases, inputs=net, labels=math_ops.to_int64(target_indices), num_sampled=n_samples, num_classes=n_classes, num_true=n_labels, sampled_values=sampled_values) loss = math_ops.reduce_mean(sampled_softmax_loss, name="loss") train_op = optimizers.optimize_loss( loss=loss, global_step=contrib_framework.get_global_step(), learning_rate=_DEFAULT_LEARNING_RATE, optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm, name=parent_scope) return None, loss, train_op elif mode == estimator.ModeKeys.EVAL: logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)), biases) predictions = {} predictions[_PROBABILITIES] = nn.softmax(logits) predictions[_CLASSES] = math_ops.argmax(logits, 1) _, predictions[_TOP_K] = nn.top_k(logits, top_k) # Since the targets have multiple labels, setup the target probabilities # as 1.0/n_labels for each of the labels. target_one_hot = array_ops.one_hot( indices=target_indices, depth=n_classes, on_value=1.0 / n_labels) target_one_hot = math_ops.reduce_sum( input_tensor=target_one_hot, reduction_indices=[1]) loss = math_ops.reduce_mean( nn.softmax_cross_entropy_with_logits(logits, target_one_hot)) return predictions, loss, None elif mode == estimator.ModeKeys.INFER: logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)), biases) predictions = {} predictions[_PROBABILITIES] = nn.softmax(logits) predictions[_CLASSES] = math_ops.argmax(logits, 1) _, predictions[_TOP_K] = nn.top_k(logits, top_k) return predictions, None, None
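# Hedged mini-sketch of the train/eval asymmetry above: sampled softmax over a
# handful of negative classes during training, full logits (net @ weights^T + biases)
# for evaluation and inference. Function and argument names are illustrative.
import tensorflow as tf

def sampled_training_loss(weights, biases, net, target_indices,
                          n_samples, n_classes, n_labels):
    per_example = tf.nn.sampled_softmax_loss(
        weights=weights, biases=biases,
        labels=tf.to_int64(target_indices), inputs=net,
        num_sampled=n_samples, num_classes=n_classes, num_true=n_labels)
    return tf.reduce_mean(per_example)

def full_softmax_logits(weights, biases, net):
    # weights is stored as [n_classes, hidden_dim], hence the transpose
    return tf.nn.bias_add(tf.matmul(net, weights, transpose_b=True), biases)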
def conv_capsule(inputs, shape, strides, iterations, batch_size, name): """This constructs a convolution capsule layer from a primary or convolution capsule layer. i: input capsules (32) o: output capsules (32) batch size: 24 spatial dimension: 14x14 kernel: 3x3 :param inputs: a primary or convolution capsule layer with poses and activations pose: (24, 14, 14, 32, 4, 4) activation: (24, 14, 14, 32) :param shape: the shape of convolution operation kernel, [kh, kw, i, o] = (3, 3, 32, 32) :param strides: often [1, 2, 2, 1] (stride 2), or [1, 1, 1, 1] (stride 1). :param iterations: number of iterations in EM routing. 3 :param name: name. :return: (poses, activations). """ inputs_poses, inputs_activations = inputs with tf.variable_scope(name) as scope: stride = strides[1] # 2 i_size = shape[-2] # 32 o_size = shape[-1] # 32 pose_size = inputs_poses.get_shape()[-1] # 4 # Tile the input capusles' pose matrices to the spatial dimension of the output capsules # Such that we can later multiple with the transformation matrices to generate the votes. inputs_poses = kernel_tile(inputs_poses, 3, stride) # (?, 14, 14, 32, 4, 4) -> (?, 6, 6, 3x3=9, 32x16=512) # Tile the activations needed for the EM routing inputs_activations = kernel_tile(inputs_activations, 3, stride) # (?, 14, 14, 32) -> (?, 6, 6, 9, 32) spatial_size = int(inputs_activations.get_shape()[1]) # 6 # Reshape it for later operations inputs_poses = tf.reshape(inputs_poses, shape=[-1, 3 * 3 * i_size, 16]) # (?, 9x32=288, 16) inputs_activations = tf.reshape(inputs_activations, shape=[-1, spatial_size, spatial_size, 3 * 3 * i_size]) # (?, 6, 6, 9x32=288) with tf.variable_scope('votes') as scope: # Generate the votes by multiply it with the transformation matrices votes = mat_transform(inputs_poses, o_size, size=batch_size*spatial_size*spatial_size) # (864, 288, 32, 16) # Reshape the vote for EM routing votes_shape = votes.get_shape() votes = tf.reshape(votes, shape=[batch_size, spatial_size, spatial_size, votes_shape[-3], votes_shape[-2], votes_shape[-1]]) # (24, 6, 6, 288, 32, 16) tf.logging.info(f"{name} votes shape: {votes.get_shape()}") with tf.variable_scope('routing') as scope: # beta_v and beta_a one for each output capsule: (1, 1, 1, 32) beta_v = tf.get_variable( name='beta_v', shape=[1, 1, 1, o_size], dtype=tf.float32, initializer=initializers.xavier_initializer() ) beta_a = tf.get_variable( name='beta_a', shape=[1, 1, 1, o_size], dtype=tf.float32, initializer=initializers.xavier_initializer() ) # Use EM routing to compute the pose and activation # votes (24, 6, 6, 3x3x32=288, 32, 16), inputs_activations (?, 6, 6, 288) # poses (24, 6, 6, 32, 16), activation (24, 6, 6, 32) poses, activations = matrix_capsules_em_routing( votes, inputs_activations, beta_v, beta_a, iterations, name='em_routing' ) # Reshape it back to 4x4 pose matrix poses_shape = poses.get_shape() # (24, 6, 6, 32, 4, 4) poses = tf.reshape( poses, [ poses_shape[0], poses_shape[1], poses_shape[2], poses_shape[3], pose_size, pose_size ] ) tf.logging.info(f"{name} pose shape: {poses.get_shape()}") tf.logging.info(f"{name} activations shape: {activations.get_shape()}") return poses, activations
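# Hedged usage sketch for conv_capsule above, assuming a primary-capsule layer that
# produces poses of shape (24, 14, 14, 32, 4, 4) and activations of shape
# (24, 14, 14, 32), as in the docstring; tensor names are illustrative.
import tensorflow as tf

primary_poses = tf.placeholder(tf.float32, [24, 14, 14, 32, 4, 4])
primary_activations = tf.placeholder(tf.float32, [24, 14, 14, 32])
poses, activations = conv_capsule(
    inputs=(primary_poses, primary_activations),
    shape=[3, 3, 32, 32],       # 3x3 kernel, 32 input -> 32 output capsule types
    strides=[1, 2, 2, 1],       # spatial stride 2: 14x14 -> 6x6
    iterations=3,               # EM routing iterations
    batch_size=24,
    name='conv_caps1')
# poses: (24, 6, 6, 32, 4, 4), activations: (24, 6, 6, 32)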
def test_xavier_wrong_dtype(self):
    with self.assertRaisesRegexp(
            TypeError, 'Cannot create initializer for non-floating point type.'):
        initializers.xavier_initializer(dtype=dtypes.int32)
    self.assertIsNone(regularizers.l1_regularizer(0.)(None))
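# Hedged companion check (not from the source): float dtypes are accepted, so the
# call below builds an initializer and a variable without raising.
import tensorflow as tf

init = tf.contrib.layers.xavier_initializer(dtype=tf.float32)
w_ok = tf.get_variable('w_ok', shape=[3, 4], initializer=init)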