def Nature(c_i, out_num, nonlin, is_training):
    # Adjust bn_args dictionary
    bn_args["activation_fn"] = nonlin
    bn_args["is_training"] = is_training
    c_i = conv(c_i, 32, 8, 4, **conv_args)
    c_i = bn(c_i, **bn_args)
    c_i = conv(c_i, 64, 4, 2, **conv_args)
    c_i = bn(c_i, **bn_args)
    c_i = conv(c_i, 64, 3, 1, **conv_args)
    c_i = bn(c_i, **bn_args)
    c_i = tf.reshape(c_i, [-1, np.prod([int(s) for s in c_i.get_shape()[1:]])])
    c_i = fc(c_i, num_outputs=512, **dense_args)
    c_i = bn(c_i, **bn_args)
    # We can't use "center" here. We have to apply scale first and then shift.
    # This will be done with the "scale_shift" layer.
    bn_args["center"] = False
    bn_args["activation_fn"] = None
    c_i = fc(c_i, num_outputs=out_num, **dense_args)
    c_i = bn(c_i, **bn_args)
    # We can't add scale to this batch norm because we might want to apply L2 reg
    # and we would like this scale to be centered at 1 instead of 0.
    c_i = scale_shift(c_i, name="scale")
    return c_i
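# The scale_shift layer used above is not defined in this snippet. A minimal
# sketch of what such a layer might look like (hypothetical implementation,
# TF1-style variables): a per-feature affine transform y = x * scale + shift,
# with scale initialized at 1 and shift at 0 so the layer starts as the identity.
def scale_shift(inputs, name="scale_shift"):
    depth = int(inputs.get_shape()[-1])
    with tf.variable_scope(name):
        scale = tf.get_variable("scale", shape=[depth],
                                initializer=tf.ones_initializer())
        shift = tf.get_variable("shift", shape=[depth],
                                initializer=tf.zeros_initializer())
        return inputs * scale + shift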
def _output(self):
    # TODO: check whether to use the encodes before dropout or after dropout
    self.start_logits = tf.squeeze(
        fc(tf.concat([self.model_encodes[-3], self.model_encodes[-2]], axis=-1),
           1, activation_fn=None, biases_initializer=None,
           scope='start_pointer'), -1)
    self.end_logits = tf.squeeze(
        fc(tf.concat([self.model_encodes[-3], self.model_encodes[-1]], axis=-1),
           1, activation_fn=None, biases_initializer=None,
           scope='end_pointer'), -1)
    self.start_logits = mask_logits(self.start_logits, mask=self.c_mask)
    self.end_logits = mask_logits(self.end_logits, mask=self.c_mask)
    self.start_probs = tf.nn.softmax(self.start_logits)
    self.end_probs = tf.nn.softmax(self.end_logits)
    self.outer_product = tf.matmul(
        tf.expand_dims(self.start_probs, axis=2),
        tf.expand_dims(self.end_probs, axis=1))
    self.outer_product = tf.matrix_band_part(
        self.outer_product, 0,
        tf.cast(
            tf.minimum(tf.shape(self.outer_product)[2] - 1, self.max_answer_len),
            tf.int64))
    self.pred_start = tf.argmax(tf.reduce_max(self.outer_product, axis=2), axis=1)
    self.pred_end = tf.argmax(tf.reduce_max(self.outer_product, axis=1), axis=1)
def discriminator(self, x, is_training=True, reuse=False):
    # Network architecture is exactly the same as in infoGAN (https://arxiv.org/abs/1606.03657)
    # Architecture: (64)4c2s-(128)4c2s_BL-FC1024_BL-FC1_S
    with tf.variable_scope("discriminator", reuse=reuse):
        net = lrelu(
            conv2d(x, 64, 4, 4, 2, 2, name='d_conv1', data_type=self.dtype))
        net = lrelu(
            bn(conv2d(net, 128, 4, 4, 2, 2, name='d_conv2', data_type=self.dtype),
               is_training=is_training, scope='d_bn2'))
        net = tf.reshape(net, [self.batch_size, -1])
        net = lrelu(
            bn(fc(net, 1024, scope='d_fc3', activation_fn=None),
               is_training=is_training, scope='d_bn3'))
        # out_logit = linear(net, 1, scope='d_fc4', data_type=self.dtype)
        # net = tf.cast(net, tf.float32)
        out_logit = fc(net, 1, scope='d_fc4', activation_fn=None)
        out = tf.nn.sigmoid(out_logit)
        print("discriminator: ", out, out_logit, net)
        return out, out_logit, net
def _initialize_weights(self):
    cell_factory = GRUCell(num_units=self.params['n_units'])
    cell_drop = DropoutWrapper(cell_factory, self.params['k_prob'])
    __, states = tf.nn.dynamic_rnn(cell_drop, self.x, dtype=tf.float32,
                                   sequence_length=self.seq_length)
    hidden = fc(states, self.params['n_hidden_neurons'])
    self.output = fc(hidden, self.params['n_outputs'], activation_fn=None)
def testComplicated():
    """ https://towardsdatascience.com/howto-profile-tensorflow-1a49fb18073d """
    import os
    import tempfile

    import tensorflow as tf
    from tensorflow.contrib.layers import fully_connected as fc
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.python.client import timeline

    batch_size = 100
    inputs = tf.placeholder(tf.float32, [batch_size, 784])
    targets = tf.placeholder(tf.float32, [batch_size, 10])

    with tf.variable_scope("layer_1"):
        fc_1_out = fc(inputs, num_outputs=500, activation_fn=tf.nn.sigmoid)
    with tf.variable_scope("layer_2"):
        fc_2_out = fc(fc_1_out, num_outputs=784, activation_fn=tf.nn.sigmoid)
    with tf.variable_scope("layer_3"):
        logits = fc(fc_2_out, num_outputs=10)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    mnist_save_dir = os.path.join(tempfile.gettempdir(), 'MNIST_data')
    mnist = input_data.read_data_sets(mnist_save_dir, one_hot=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        for i in range(3):
            batch_input, batch_target = mnist.train.next_batch(batch_size)
            feed_dict = {inputs: batch_input, targets: batch_target}
            sess.run(train_op, feed_dict=feed_dict, options=options,
                     run_metadata=run_metadata)
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open('timeline_02_step_%d.json' % i, 'w') as f:
                f.write(chrome_trace)
def fanet8ss_conv_1_1_16_16_16_exp(input_tensor, num_label, **kwargs):
    with tf.variable_scope('fanet8ss_conv_1_1_16_16_16_exp') as scope:
        normal_tensor = input_tensor / 127.5 - 1.0
        h_conv1 = conv2d(normal_tensor, 16, 3, stride=2)
        h_conv2 = conv2d(h_conv1, 16, 1)
        h_conv3 = conv2d(h_conv2, 16, 3, stride=2)
        h_conv4 = conv2d(h_conv3, 16, 1)
        h_conv5 = conv2d(h_conv4, 32, 3, stride=2)
        h_conv6 = conv2d(h_conv5, 64, 3)
        h_conv7 = conv2d(h_conv6, 64, 3, stride=2)
        h_conv8 = conv2d(h_conv7, 128, 3)
        h_pool1 = max_pool2d(h_conv8, 2, 2)
        h_pool1_flat = flatten(h_pool1)
        h_fc1 = fc(h_pool1_flat, 512)
        point = fc(h_fc1, num_label, activation_fn=None)
        return point
def build_decoder(self, z, n_layers, architecture, reuse=False):
    with tf.variable_scope('decoder') as scope:
        if reuse is True:
            scope.reuse_variables()
        # Q3
        layers = []
        layers.append(fc(z, architecture[-1], activation_fn=tf.nn.relu))
        for i in range(1, n_layers):
            layers.append(fc(layers[-1], architecture[n_layers - i - 1],
                             activation_fn=tf.nn.relu))
        # esp_x_given_z
        x_hat = fc(layers[-1], d_inputs, activation_fn=None)
        # End of Q3
        return x_hat
def fc_module(input_layer, hiddens, activation_fn=tf.nn.relu):
    """Fully connected module."""
    out = input_layer
    for num_outputs in hiddens:
        out = fc(out, num_outputs=num_outputs, activation_fn=activation_fn,
                 weights_initializer=xavier())
    return out
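# Usage sketch for fc_module (hypothetical shapes): a small two-layer MLP head
# on top of some flattened features.
features = tf.placeholder(tf.float32, [None, 256])
mlp_out = fc_module(features, hiddens=[128, 64])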
def classifier(self, x, is_training=True, reuse=False):
    with tf.variable_scope("classifier", reuse=reuse):
        net = fc(x, 128, scope='c_fc1', activation_fn=None)
        # Batch normalization should be calculated in float32
        net = tf.cast(net, tf.float32)
        net = bn(net, is_training=is_training, scope='c_bn1')
        # Leverage the tensor cores for the fully connected weights.
        net = tf.cast(net, tf.float16)
        net = tf.nn.leaky_relu(net, alpha=0.2)
        out_logit = fc(net, self.y_dim, scope='c_fc2', activation_fn=None)
        # Softmax should be calculated in float32
        out_logit = tf.cast(out_logit, tf.float32)
        out = tf.nn.softmax(out_logit)
        return out, out_logit
def fanet8ss_inference(inputs, num_label, **kwargs):
    with tf.variable_scope('fanet8ss_inference') as scope:
        inputs = inputs / 127.5 - 1.0
        h_conv1 = conv2d(inputs, 16, 3, stride=2)
        h_conv2 = conv2d(h_conv1, 32, 3)
        h_conv3 = conv2d(h_conv2, 32, 3, stride=2)
        h_conv4 = conv2d(h_conv3, 32, 3)
        h_conv5 = conv2d(h_conv4, 32, 3, stride=2)
        h_conv6 = conv2d(h_conv5, 64, 3)
        h_conv7 = conv2d(h_conv6, 64, 3, stride=2)
        h_conv8 = conv2d(h_conv7, 128, 3)
        h_pool1 = max_pool2d(h_conv8, 2, 2)
        h_pool1_flat = flatten(h_pool1)
        h_fc1 = fc(h_pool1_flat, 512)
        h_fc2 = fc(h_fc1, num_label, activation_fn=tf.nn.sigmoid)
        return h_fc2
def attn_pooling(pooling_vectors, hidden_size, ref_vector=None, mask=None,
                 scope=None):
    with tf.variable_scope(scope or 'attn_pooling'):
        u = fc(pooling_vectors, num_outputs=hidden_size, activation_fn=None,
               biases_initializer=None)
        if ref_vector is not None:
            u += fc(tf.expand_dims(ref_vector, 1), num_outputs=hidden_size,
                    activation_fn=None)
        logits = fc(tf.tanh(u), num_outputs=1, activation_fn=None)
        if mask is not None:
            logits = mask_logits(logits, mask=tf.expand_dims(mask, -1))
        scores = tf.nn.softmax(logits, 1)
        pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1)
        return pooled_vector
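# For reference, this is (roughly) additive attention pooling:
#   s_i = w^T tanh(W p_i + W_r r),  alpha = softmax(s),  pooled = sum_i alpha_i * p_i,
# where the W_r r term is only added when ref_vector is given.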
def generator(self, z, y, is_training=True, reuse=False):
    # Network architecture is exactly the same as in infoGAN (https://arxiv.org/abs/1606.03657)
    # Architecture: FC1024_BR-FC7x7x128_BR-(64)4dc2s_BR-(1)4dc2s_S
    with tf.variable_scope("generator", reuse=reuse):
        # merge noise and code
        z = concat([z, y], 1)
        net = tf.nn.relu(
            bn(fc(z, 1024, scope='g_fc1', activation_fn=None),
               is_training=is_training, scope='g_bn1'))
        net = tf.nn.relu(
            bn(fc(net, 128 * 8 * 8, scope='g_fc2', activation_fn=None),
               is_training=is_training, scope='g_bn2'))
        net = tf.reshape(net, [self.batch_size, 8, 8, 128])
        net = tf.nn.relu(
            bn(deconv2d(net, [self.batch_size, 16, 16, 64], 4, 4, 2, 2,
                        name='g_dc3', data_type=self.dtype),
               is_training=is_training, scope='g_bn3'))
        out = tf.nn.sigmoid(
            deconv2d(net, [self.batch_size, 32, 32, 3], 4, 4, 2, 2,
                     name='g_dc4', data_type=self.dtype))
        print("generator: ", out)
        return out
def build_network(self, architecture, d_inputs):
    self.x = tf.placeholder(tf.float32, shape=[None, d_inputs])
    n_layers = len(architecture)
    layers = []

    # encoder / Q1
    layers.append(fc(self.x, architecture[0], activation_fn=tf.nn.relu))
    for i in range(1, n_layers):
        layers.append(fc(layers[-1], architecture[i], activation_fn=tf.nn.relu))
    esp_z_given_x = fc(layers[-1], self.d_z, activation_fn=None)
    log_sigma_sq_z_given_x = fc(layers[-1], self.d_z, activation_fn=None)

    # Sampling z given x / Q2 (reparameterization trick)
    epsi = tf.random_normal([self.d_z])
    self.z = tf.math.add(
        esp_z_given_x,
        tf.math.multiply(tf.math.exp(0.5 * log_sigma_sq_z_given_x), epsi))

    # decoder
    self.x_hat = self.build_decoder(self.z, n_layers, architecture)

    # Q4
    cross_ent_term = 0.5 * tf.math.reduce_sum(tf.square(self.x_hat - self.x))
    # end of Q4

    # Q5
    kl_term = -0.5 * tf.reduce_sum(
        1 + log_sigma_sq_z_given_x
        - tf.math.exp(log_sigma_sq_z_given_x)
        - tf.math.square(esp_z_given_x))
    # end of Q5

    self.J = cross_ent_term + kl_term
    self.train_op = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate).minimize(self.J)
    self.losses = {
        'cross entropy loss term': cross_ent_term,
        'KL divergence loss term': kl_term
    }

    # for generation / Q8
    self.z_gen = tf.placeholder(tf.float32, shape=[None, self.d_z])
    # end of Q8
    self.x_gen = self.build_decoder(self.z_gen, n_layers, architecture, reuse=True)
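# Reference: with a diagonal Gaussian posterior q(z|x) = N(mu, diag(sigma^2)) and
# a standard normal prior, the closed-form KL term computed above is
#   KL(q(z|x) || N(0, I)) = -0.5 * sum_j (1 + log sigma_j^2 - sigma_j^2 - mu_j^2).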
def _embedding_encoder(self):
    # TODO: check the paper, it uses conv here
    self.emb_c = fc(self.emb_c, self.hidden_size, activation_fn=None,
                    scope='input_projection', reuse=None)
    self.emb_q = fc(self.emb_q, self.hidden_size, activation_fn=None,
                    scope='input_projection', reuse=True)
    # TODO: consider masking the input and output, since they will affect the convolution.
    self.enc_c = encoder_block(self.emb_c,
                               num_conv_layers=self.emb_num_convs,
                               kernel_size=self.emb_kernel_size,
                               hidden_size=self.hidden_size,
                               num_heads=self.num_heads,
                               num_blocks=self.emb_num_blocks,
                               mask=self.c_mask,
                               dropout=self.dropout,
                               scope='encoder_block',
                               reuse=None)
    self.enc_q = encoder_block(self.emb_q,
                               num_conv_layers=self.emb_num_convs,
                               kernel_size=self.emb_kernel_size,
                               hidden_size=self.hidden_size,
                               num_heads=self.num_heads,
                               num_blocks=self.emb_num_blocks,
                               mask=self.q_mask,
                               dropout=self.dropout,
                               scope='encoder_block',
                               reuse=True)
    self.enc_c = tf.nn.dropout(self.enc_c, 1 - self.dropout)
    self.enc_q = tf.nn.dropout(self.enc_q, 1 - self.dropout)
def highway(x, size=None, activation=None, num_layers=2, dropout=0.0,
            scope='highway', reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if size is None:
            size = x.shape.as_list()[-1]
        else:
            x = fc(x, size, weights_initializer=xa_init(),
                   biases_initializer=bias_init(), activation_fn=None,
                   scope='input_projection', reuse=reuse)
        for i in range(num_layers):
            T = fc(x, size, weights_initializer=xa_init(),
                   biases_initializer=bias_init(), activation_fn=tf.sigmoid,
                   scope='gate_%d' % i, reuse=reuse)
            H = fc(x, size, weights_initializer=he_init(),
                   biases_initializer=bias_init(), activation_fn=activation,
                   scope='activation_%d' % i, reuse=reuse)
            H = tf.nn.dropout(H, 1.0 - dropout)
            x = H * T + x * (1.0 - T)
        return x
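# For reference, each iteration above is a standard highway layer
# (Srivastava et al., 2015): y = H(x) * T(x) + x * (1 - T(x)), where T is the
# sigmoid transform gate and H the (optionally dropped-out) nonlinear transform.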
def decoder(z):
    r"""Make reconstruction network.

    Parameters
    ----------
    z

    Returns
    -------
    Reconstruction distribution p(x|z;\theta) with Bernoulli distribution.
    Here, Bernoulli was chosen since pixel space is bounded by [0, 255].
    """
    net = stack(flatten(z), fc, [256, 512])
    logits = fc(net, 28 * 28, activation_fn=None)
    return Bernoulli(logits=logits)
def encoder(x):
    """Make logits for variational proposal distribution.

    This is logits for K categorical distributions (K=num_cat_dists), where
    each categorical distribution is defined on N categories (N=num_classes).

    Parameters
    ----------
    x

    Returns
    -------
    logits: unnormalized log probability of shape
        (batch_size, num_cat_dists, num_classes)
    """
    net = stack(x, fc, [512, 256])
    return tf.reshape(
        fc(net, FLAGS.num_cat_dists * FLAGS.num_classes, activation_fn=None),
        [-1, FLAGS.num_cat_dists, FLAGS.num_classes])
def _model_encoder(self):
    self.model_encodes = [
        fc(self.attn_out, self.hidden_size, activation_fn=None,
           scope='input_projection', reuse=None)
    ]
    for i in range(self.num_enc_layers):
        output = encoder_block(self.model_encodes[i],
                               num_conv_layers=self.enc_num_convs,
                               kernel_size=self.enc_kernel_size,
                               hidden_size=self.hidden_size,
                               num_heads=self.num_heads,
                               num_blocks=self.enc_num_blocks,
                               mask=self.c_mask,
                               dropout=self.dropout,
                               scope='encoder_block',
                               reuse=True if i > 0 else None)
        self.model_encodes.append(tf.nn.dropout(output, 1 - self.dropout))
def build_network(self, architecture, d_inputs):
    self.x = tf.placeholder(tf.float32, shape=[None, d_inputs])
    n_layers = len(architecture)
    layers = []

    # encoder
    for i in range(n_layers):
        if i == 0:
            layers.append(fc(self.x, architecture[i], activation_fn=tf.nn.relu))
        else:
            layers.append(fc(layers[-1], architecture[i], activation_fn=tf.nn.relu))
    self.z = fc(layers[-1], self.d_z, activation_fn=tf.nn.relu)

    # decoder
    for i in range(n_layers):
        if i == 0:
            layers.append(fc(self.z, architecture[n_layers - i - 1],
                             activation_fn=tf.nn.relu))
        else:
            layers.append(fc(layers[-1], architecture[n_layers - i - 1],
                             activation_fn=tf.nn.relu))
    self.x_hat = fc(layers[-1], d_inputs, activation_fn=tf.nn.relu)

    # J
    self.J = tf.reduce_mean(tf.square(self.x_hat - self.x))
    self.train_op = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate).minimize(self.J)
    self.losses = {
        'reconstruction loss': self.J
    }
import os
import tempfile

import tensorflow as tf
from tensorflow.contrib.layers import fully_connected as fc
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.client import timeline

batch_size = 100
inputs = tf.placeholder(tf.float32, [batch_size, 784])
targets = tf.placeholder(tf.float32, [batch_size, 10])

with tf.variable_scope("layer_1"):
    fc_1_out = fc(inputs, num_outputs=500, activation_fn=tf.nn.sigmoid)
with tf.variable_scope("layer_2"):
    fc_2_out = fc(fc_1_out, num_outputs=784, activation_fn=tf.nn.sigmoid)
with tf.variable_scope("layer_3"):
    logits = fc(fc_2_out, num_outputs=10)

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

if __name__ == '__main__':
    mnist_save_dir = os.path.join(tempfile.gettempdir(), 'MNIST_data')
    mnist = input_data.read_data_sets(mnist_save_dir, one_hot=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
print("version of our model with only one recurrent layer)") tf.reset_default_graph() X = tf.placeholder(tf.float32, shape=(None, n_steps, n_inputs)) y = tf.placeholder(tf.int64, shape=(None)) seq_length = tf.placeholder(tf.int64, shape=(None)) cell_factory = GRUCell(num_units=n_neurons) cell_drop = DropoutWrapper(cell_factory, k_prob) rnn_outputs, states = tf.nn.dynamic_rnn(cell_drop, X, dtype=tf.float32, sequence_length=seq_length) with arg_scope([fc], weights_regularizer=l2(reg_param)): hidden = fc(states, n_hidden) logits = fc(hidden, n_outputs, activation_fn=None) xentropy = softmax(labels=y, logits=logits) base_loss = tf.reduce_mean(xentropy) reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) cost = tf.add_n([base_loss] + reg_loss) optimizer = tf.train.AdamOptimizer(learning_rate) training_op = optimizer.minimize(cost) correct = tf.nn.in_top_k(logits, y, 2) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) init = tf.global_variables_initializer() saver = tf.train.Saver()
def generator(self, z, y, is_training=True, reuse=False):
    if self.mixed:
        with tf.variable_scope(
                "generator", reuse=reuse,
                custom_getter=float32_variable_storage_getter):
            # merge noise and code
            z = concat([z, y], 1)
            net = fc(z, 1024, scope='g_fc1', activation_fn=None)
            # Batch normalization should be calculated in float32
            net = tf.cast(net, tf.float32)
            net = bn(net, is_training=is_training, scope='g_bn1')
            # Leverage the tensor cores for the fully connected weights.
            net = tf.cast(net, tf.float16)
            net = tf.nn.relu(net)
            net = fc(net, 128 * 7 * 7, scope='g_fc2', activation_fn=None)
            # Batch normalization should be calculated in float32
            net = tf.cast(net, tf.float32)
            net = bn(net, is_training=is_training, scope='g_bn2')
            # Leverage the tensor cores
            net = tf.cast(net, tf.float16)
            net = tf.nn.relu(net)
            net = tf.reshape(net, [self.batch_size, 7, 7, 128])
            net = deconv2d(net, [self.batch_size, 14, 14, 64], 4, 4, 2, 2,
                           name='g_dc3', data_type=self.dtype)
            # Batch normalization should be calculated in float32
            net = tf.cast(net, tf.float32)
            net = bn(net, is_training=is_training, scope='g_bn3')
            # Leverage the tensor cores
            net = tf.cast(net, tf.float16)
            net = tf.nn.relu(net)
            net = deconv2d(net, [
                self.batch_size, self.output_height, self.output_width,
                self.c_dim
            ], 4, 4, 2, 2, name='g_dc4', data_type=self.dtype)
            # Sigmoid should be calculated in float32
            net = tf.cast(net, tf.float32)
            out = tf.nn.sigmoid(net)
            return out
    else:
        with tf.variable_scope("generator", reuse=reuse):
            # merge noise and code
            z = concat([z, y], 1)
            net = fc(z, 1024, scope='g_fc1', activation_fn=None)
            net = bn(net, is_training=is_training, scope='g_bn1')
            net = tf.nn.relu(net)
            net = fc(net, 128 * 7 * 7, scope='g_fc2', activation_fn=None)
            net = bn(net, is_training=is_training, scope='g_bn2')
            net = tf.nn.relu(net)
            net = tf.reshape(net, [self.batch_size, 7, 7, 128])
            net = deconv2d(net, [self.batch_size, 14, 14, 64], 4, 4, 2, 2,
                           name='g_dc3', data_type=self.dtype)
            net = bn(net, is_training=is_training, scope='g_bn3')
            net = tf.nn.relu(net)
            net = deconv2d(net, [
                self.batch_size, self.output_height, self.output_width,
                self.c_dim
            ], 4, 4, 2, 2, name='g_dc4', data_type=self.dtype)
            out = tf.nn.sigmoid(net)
            return out
def discriminator(self, x, is_training=True, reuse=False):
    if self.mixed:
        with tf.variable_scope(
                "discriminator", reuse=reuse,
                custom_getter=float32_variable_storage_getter):
            # Cast the input to float16
            x = tf.cast(x, tf.float16)
            net = conv2d(x, 64, 4, 4, 2, 2, name='d_conv1', data_type=self.dtype)
            net = tf.nn.leaky_relu(net, alpha=0.2)
            net = conv2d(net, 128, 4, 4, 2, 2, name='d_conv2', data_type=self.dtype)
            # Batch normalization should be calculated in float32
            net = tf.cast(net, tf.float32)
            net = bn(net, is_training=is_training, scope='d_bn2')
            # Leverage the tensor cores for the fully connected weights.
            net = tf.cast(net, tf.float16)
            net = tf.nn.leaky_relu(net, alpha=0.2)
            net = tf.reshape(net, [self.batch_size, -1])
            net = fc(net, 1024, scope='d_fc3', activation_fn=None)
            # Batch normalization should be calculated in float32
            net = tf.cast(net, tf.float32)
            net = bn(net, is_training=is_training, scope='d_bn3')
            # Leverage the tensor cores for the fully connected weights.
            net = tf.cast(net, tf.float16)
            net = tf.nn.leaky_relu(net, alpha=0.2)
            out_logit = fc(net, 1, scope='d_fc4', activation_fn=None)
            # Sigmoid should be calculated in float32
            out_logit = tf.cast(out_logit, tf.float32)
            out = tf.nn.sigmoid(out_logit)
            return out, out_logit, net
    else:
        with tf.variable_scope("discriminator", reuse=reuse):
            net = conv2d(x, 64, 4, 4, 2, 2, name='d_conv1', data_type=self.dtype)
            net = tf.nn.leaky_relu(net, alpha=0.2)
            net = conv2d(net, 128, 4, 4, 2, 2, name='d_conv2', data_type=self.dtype)
            net = bn(net, is_training=is_training, scope='d_bn2')
            net = tf.nn.leaky_relu(net, alpha=0.2)
            net = tf.reshape(net, [self.batch_size, -1])
            net = fc(net, 1024, scope='d_fc3', activation_fn=None)
            net = bn(net, is_training=is_training, scope='d_bn3')
            net = tf.nn.leaky_relu(net, alpha=0.2)
            out_logit = fc(net, 1, scope='d_fc4', activation_fn=None)
            out = tf.nn.sigmoid(out_logit)
            return out, out_logit, net
def encoder_block(inputs, num_conv_layers, kernel_size, hidden_size, num_heads,
                  num_blocks=1, mask=None, dropout=0.0, use_relative_pos=False,
                  scope='encoder_block', reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        outputs = inputs
        for block_repeat_idx in range(num_blocks):
            with tf.variable_scope('block_%d' % block_repeat_idx, reuse=reuse):
                total_sublayers = num_conv_layers + 2
                sublayer = 0

                # position encoding
                if not use_relative_pos:
                    outputs += get_timing_signal_1d(
                        tf.shape(outputs)[1], tf.shape(outputs)[2])

                # convolutions
                outputs = tf.expand_dims(outputs, 2)
                for i in range(num_conv_layers):
                    residual = outputs
                    outputs = layer_norm(outputs,
                                         begin_norm_axis=-1,
                                         begin_params_axis=-1,
                                         scope='conv_layer_norm_%d' % i,
                                         reuse=reuse)  # TODO: change layer norm
                    if i % 2 == 0:
                        outputs = tf.nn.dropout(outputs, 1.0 - dropout)
                    if isinstance(kernel_size, list):
                        kernel_num = len(kernel_size)
                        kernel_outputs = [
                            depthwise_conv(outputs,
                                           hidden_size,
                                           bias=True,
                                           activation=tf.nn.relu,
                                           kernel_size=k,
                                           scope='depthwise_conv_%d_kernel_%d' % (i, k),
                                           reuse=reuse) for k in kernel_size
                        ]
                        kernel_weights = tf.nn.softmax(
                            tf.get_variable(
                                'kernel_weights_conv_%d' % i, [kernel_num],
                                dtype=tf.float32,
                                trainable=True,
                                initializer=tf.constant_initializer(1.0 / kernel_num)),
                            axis=0)
                        outputs = 0
                        for j in range(kernel_num):
                            outputs += kernel_outputs[j] * kernel_weights[j]
                    else:
                        outputs = depthwise_conv(
                            outputs,
                            hidden_size,
                            bias=True,
                            activation=tf.nn.relu,
                            # activation=tf.nn.relu if i < num_conv_layers - 1 else None,
                            kernel_size=kernel_size,
                            scope='depthwise_conv_%d' % i,
                            reuse=reuse)
                    sublayer += 1
                    outputs = layer_dropout(
                        residual, residual + tf.nn.dropout(outputs, 1 - dropout),
                        dropout * float(sublayer) / total_sublayers)
                outputs = tf.squeeze(outputs, 2)

                # self attention
                residual = outputs
                outputs = layer_norm(outputs,
                                     begin_norm_axis=-1,
                                     begin_params_axis=-1,
                                     scope='self_attention_layer_norm',
                                     reuse=reuse)
                outputs = tf.nn.dropout(outputs, 1.0 - dropout)
                outputs = self_attention(outputs,
                                         hidden_size,
                                         num_heads,
                                         use_relative_pos=use_relative_pos,
                                         mask=mask,
                                         scope='self_attention_layer',
                                         reuse=reuse)
                sublayer += 1
                outputs = layer_dropout(
                    residual, residual + tf.nn.dropout(outputs, 1 - dropout),
                    dropout * float(sublayer) / total_sublayers)

                # feed forward
                residual = outputs
                outputs = layer_norm(outputs,
                                     begin_norm_axis=-1,
                                     begin_params_axis=-1,
                                     scope='fc_layer_norm',
                                     reuse=reuse)
                outputs = tf.nn.dropout(outputs, 1.0 - dropout)
                outputs = fc(outputs,
                             hidden_size,
                             tf.nn.relu,
                             weights_initializer=he_init(),
                             biases_initializer=bias_init(),
                             scope='fc_layer_1',
                             reuse=reuse)
                # outputs = tf.nn.dropout(outputs, 1 - dropout)
                outputs = fc(outputs,
                             hidden_size,
                             None,
                             weights_initializer=xa_init(),
                             biases_initializer=bias_init(),
                             scope='fc_layer_2',
                             reuse=reuse)
                sublayer += 1
                outputs = layer_dropout(
                    residual, residual + tf.nn.dropout(outputs, 1 - dropout),
                    dropout * float(sublayer) / total_sublayers)
        return outputs
def self_attention(inputs, hidden_size, num_heads, use_relative_pos=False,
                   max_relative_position=16, mask=None, scope='self_attention',
                   reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        Q = split_heads(
            fc(inputs, hidden_size, activation_fn=None,
               weights_initializer=xa_init(), biases_initializer=None,
               scope='q_projection', reuse=reuse), num_heads)
        K = split_heads(
            fc(inputs, hidden_size, activation_fn=None,
               weights_initializer=xa_init(), biases_initializer=None,
               scope='k_projection', reuse=reuse), num_heads)
        V = split_heads(
            fc(inputs, hidden_size, activation_fn=None,
               weights_initializer=xa_init(), biases_initializer=None,
               scope='v_projection', reuse=reuse), num_heads)
        Q *= (float(hidden_size) // num_heads)**-0.5
        length = tf.shape(V)[2]
        depth = V.shape[3]

        # calculate similarity matrix
        if use_relative_pos:
            relations_keys = get_relative_positions_embeddings(
                length, depth, max_relative_position, "relative_positions_keys")
            sim_logits = _relative_attention_inner(Q, K, relations_keys,
                                                   transpose=True)
        else:
            sim_logits = tf.matmul(Q, K, transpose_b=True)
        if mask is not None:
            logit_mask = tf.expand_dims(
                tf.expand_dims(tf.cast(mask, tf.float32), 1), 1)
            sim_logits = mask_logits(sim_logits, logit_mask)

        # compute the attention output
        attn_weights = tf.nn.softmax(sim_logits, name='attention_weights')
        if use_relative_pos:
            relations_values = get_relative_positions_embeddings(
                length, depth, max_relative_position, "relative_positions_values")
            multi_head_attns = _relative_attention_inner(attn_weights, V,
                                                         relations_values,
                                                         transpose=False)
        else:
            multi_head_attns = tf.matmul(attn_weights, V)
        outputs = merge_heads(multi_head_attns)
        outputs = fc(outputs, hidden_size, activation_fn=None,
                     weights_initializer=xa_init(), biases_initializer=None,
                     scope='merge_projection', reuse=reuse)
        return outputs
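# For reference: without relative positions this is standard multi-head scaled
# dot-product attention, softmax(Q K^T / sqrt(d_head)) V with
# d_head = hidden_size / num_heads (the 1/sqrt(d_head) factor is folded into Q
# above); the relative-position branch follows Shaw et al. (2018),
# "Self-Attention with Relative Position Representations".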
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected as fc
import gym
import numpy as np

env = gym.make("CartPole-v0")

n_inputs = 4
n_hidden = 4
n_outputs = 1

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fc(X, n_hidden, activation_fn=tf.nn.elu,
            weights_initializer=initializer)
logits = fc(hidden, n_outputs, activation_fn=None,
            weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

plr = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(plr), num_samples=1)

y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(0.01)
grads_vars = optimizer.compute_gradients(cross_entropy)
# print(x_test.max(), x_test.min())
input_ = tf.placeholder(tf.float32, [None, image_px], name='Input')
dim_z = 10  # dimensions of embedding vector
learning_rate = tf.constant(1e-5, tf.float32)
batch_size = 64
beta = 1


def lrelu(x, alpha=0.1):
    return tf.maximum(alpha * x, x)


'''Encoder'''
with tf.variable_scope('encoding'):
    f1 = fc(input_, 512, scope='enc_fc1', activation_fn=tf.nn.relu)
    f2 = fc(f1, 384, scope='enc_fc2', activation_fn=tf.nn.relu)
    f3 = fc(f2, 256, scope='enc_fc3', activation_fn=tf.nn.relu)
    z_mu = fc(f3, dim_z, scope='Z_Mu', activation_fn=None)
    z_log_sigma_sq = fc(f3, dim_z, scope='Z_LogOf_SigmaSq',
                        activation_fn=None)  # use log(sig^2) instead of log(sigma)
    # Now dim_z mu's and sigma's, one for each dimension of z

    # latent space
    # encoded_dist = tf.distributions.Normal(loc=z_mu, scale=tf.sqrt(tf.exp(z_log_sigma_sq)))
    encoded_dist = tfd.MultivariateNormalDiag(
        loc=z_mu, scale_diag=tf.sqrt(tf.exp(z_log_sigma_sq)))
    encoded = encoded_dist.sample()  # sampled embedding

'''Decoder'''
def _embed(self):
    with tf.device('/cpu:0'):
        word_pad_emb = tf.get_variable('word_pad_embedding',
                                       shape=(1, self.word_vocab.embed_dim),
                                       initializer=tf.zeros_initializer,
                                       trainable=False)
        word_unk_emb = tf.get_variable('word_unk_embedding',
                                       shape=(1, self.word_vocab.embed_dim),
                                       initializer=tf.zeros_initializer,
                                       trainable=True)
        word_emb_init = tf.constant_initializer(self.word_vocab.embeddings[2:]) \
            if self.word_vocab.embeddings is not None \
            else tf.random_normal_initializer()
        normal_word_embs = tf.get_variable(
            'normal_word_embeddings',
            shape=(self.word_vocab.size() - 2, self.word_vocab.embed_dim),
            initializer=word_emb_init,
            trainable=False)
        self.word_emb_mat = tf.concat(
            [word_pad_emb, word_unk_emb, normal_word_embs], 0)

        char_pad_emb = tf.get_variable('char_pad_embedding',
                                       shape=(1, self.char_vocab.embed_dim),
                                       initializer=tf.zeros_initializer,
                                       trainable=False)
        char_emb_init = tf.constant_initializer(self.char_vocab.embeddings[1:]) \
            if self.char_vocab.embeddings is not None \
            else tf.random_normal_initializer()
        normal_char_embs = tf.get_variable(
            'normal_char_embeddings',
            shape=(self.char_vocab.size() - 1, self.char_vocab.embed_dim),
            initializer=char_emb_init,
            trainable=True)
        self.char_emb_mat = tf.concat([char_pad_emb, normal_char_embs], 0)

    self.emb_c = tf.nn.dropout(
        tf.nn.embedding_lookup(self.word_emb_mat, self.c), 1.0 - self.dropout)
    self.emb_q = tf.nn.dropout(
        tf.nn.embedding_lookup(self.word_emb_mat, self.q), 1.0 - self.dropout)
    self.emb_cc = tf.nn.dropout(
        tf.nn.embedding_lookup(self.char_emb_mat, self.cc),
        1.0 - 0.5 * self.dropout)
    self.emb_qc = tf.nn.dropout(
        tf.nn.embedding_lookup(self.char_emb_mat, self.qc),
        1.0 - 0.5 * self.dropout)

    # check the paper, it seems to use another operation
    # self.conv_emb_cc = conv(self.emb_cc, self.hidden_size, kernel_size=5, activation=tf.nn.relu, reuse=None)
    # self.conv_emb_qc = conv(self.emb_qc, self.hidden_size, kernel_size=5, activation=tf.nn.relu, reuse=True)
    self.conv_emb_cc = tf.reduce_max(self.emb_cc, 2)
    self.conv_emb_qc = tf.reduce_max(self.emb_qc, 2)
    self.conv_emb_cc = fc(self.conv_emb_cc, self.char_vocab.embed_dim,
                          activation_fn=None)
    self.conv_emb_qc = fc(self.conv_emb_qc, self.char_vocab.embed_dim,
                          activation_fn=None)

    self.emb_c = highway(tf.concat([self.emb_c, self.conv_emb_cc], axis=2),
                         size=self.hidden_size,
                         dropout=self.dropout,
                         num_layers=2,
                         scope='highway',
                         reuse=None)
    self.emb_q = highway(tf.concat([self.emb_q, self.conv_emb_qc], axis=2),
                         size=self.hidden_size,
                         dropout=self.dropout,
                         num_layers=2,
                         scope='highway',
                         reuse=True)
ns = 28   # n_steps
ni = 28   # n_inputs
no = 10   # n_outputs
nn = 150  # n_neurons
learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, ns, ni])
y = tf.placeholder(tf.int32, [None])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=nn)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)
print(outputs.shape, states.shape)

logits = fc(states, no, activation_fn=None)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, ns, ni))
y_test = mnist.test.labels
final_visual = tf.reshape(conv6, [batch_size, 298, 1, 256], name='final_visual')
final_input = tf.concat([final_audio, final_visual], 3)
final_input = tf.reshape(final_input, [batch_size, 298, 2312])
unstack_input = tf.unstack(final_input, 298, 1)

lstm_fw_cell = rnn.BasicLSTMCell(lstm_hidden, forget_bias=1.0)
lstm_bw_cell = rnn.BasicLSTMCell(lstm_hidden, forget_bias=1.0)
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell,
                                             unstack_input, dtype=tf.float32)

fc1_output = fc(outputs, 600, tf.nn.relu)
fc2_output = fc(fc1_output, 600, tf.nn.relu)
fc3_output = fc(fc2_output, 257 * 2, tf.nn.sigmoid)

complex_mask = tf.reshape(fc3_output, [2, 298, 257])
complex_mask_result = tf.complex(complex_mask[0], complex_mask[1])

label_signal_tensor = tf.convert_to_tensor(label_spectrogram)
spectrogram_result = complex_mask_result * tf.convert_to_tensor(spectrogram)

loss_op = tf.losses.mean_squared_error(label_signal_tensor, spectrogram_result)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(loss_op)

with tf.Session() as sess: