def build_model(self):
    '''Build the MemNN model.'''
    # the input.
    self.inputs = tf.placeholder(tf.int32, [None, None], name="inputs")
    self.last_inputs = tf.placeholder(tf.int32, [None], name="last_inputs")
    batch_size = tf.shape(self.inputs)[0]
    self.sequence_length = tf.placeholder(tf.int64, [None], name='sequence_length')
    self.lab_input = tf.placeholder(tf.int32, [None], name="lab_input")

    # the lookup dict.
    self.embe_dict = tf.Variable(self.pre_embedding, dtype=tf.float32,
                                 trainable=self.emb_up)
    self.pe_mask = tf.Variable(self.pre_embedding_mask, dtype=tf.float32,
                               trainable=False)
    self.embe_dict *= self.pe_mask

    sent_bitmap = tf.ones_like(tf.cast(self.inputs, tf.float32))
    inputs = tf.nn.embedding_lookup(self.embe_dict, self.inputs, max_norm=1)
    lastinputs = tf.nn.embedding_lookup(self.embe_dict, self.last_inputs, max_norm=1)

    org_memory = inputs
    pool_out = pooler(
        org_memory, 'mean', axis=1,
        sequence_length=tf.cast(tf.reshape(self.sequence_length, [batch_size, 1]),
                                tf.float32))
    pool_out = tf.reshape(pool_out, [-1, self.hidden_size])

    attlayer = FwNnAttLayer(self.edim, active=self.active,
                            stddev=self.stddev, norm_type='none')
    attout, alph = attlayer.forward(org_memory, lastinputs, pool_out, sent_bitmap)
    attout = tf.reshape(attout, [-1, self.edim]) + pool_out
    self.alph = tf.reshape(alph, [batch_size, 1, -1])

    self.w1 = tf.Variable(tf.random_normal([self.edim, self.edim], stddev=self.stddev),
                          trainable=True)
    self.w2 = tf.Variable(tf.random_normal([self.edim, self.edim], stddev=self.stddev),
                          trainable=True)

    attout = tf.tanh(tf.matmul(attout, self.w1))
    lastinputs = tf.tanh(tf.matmul(lastinputs, self.w2))
    prod = attout * lastinputs
    sco_mat = tf.matmul(prod, self.embe_dict[1:], transpose_b=True)
    self.softmax_input = sco_mat
    self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=sco_mat, labels=self.lab_input)

    # the optimizer.
    self.params = tf.trainable_variables()
    self.optimize = super(Seq2SeqAttNN, self).optimize_normal(self.loss, self.params)

def __init__(self, num_units, n_input, alpha, sigma_rec=0,
             activation='softplus', w_rec_init='diag', rng=None,
             reuse=None, name=None):
    super(LeakyRNNCell, self).__init__(_reuse=reuse, name=name)

    # Inputs must be 2-dimensional.
    # self.input_spec = base_layer.InputSpec(ndim=2)

    self._num_units = num_units
    self._w_rec_init = w_rec_init
    self._reuse = reuse

    if activation == 'softplus':
        self._activation = tf.nn.softplus
        self._w_in_start = 1.0
        self._w_rec_start = 0.5
    elif activation == 'tanh':
        self._activation = tf.tanh
        self._w_in_start = 1.0
        self._w_rec_start = 1.0
    elif activation == 'relu':
        self._activation = tf.nn.relu
        self._w_in_start = 1.0
        self._w_rec_start = 0.5
    elif activation == 'power':
        self._activation = lambda x: tf.square(tf.nn.relu(x))
        self._w_in_start = 1.0
        self._w_rec_start = 0.01
    elif activation == 'retanh':
        self._activation = lambda x: tf.tanh(tf.nn.relu(x))
        self._w_in_start = 1.0
        self._w_rec_start = 0.5
    else:
        raise ValueError('Unknown activation')

    self._alpha = alpha
    self._sigma = np.sqrt(2 / alpha) * sigma_rec
    if rng is None:
        self.rng = np.random.RandomState()
    else:
        self.rng = rng

    # Generate initialization matrix
    n_hidden = self._num_units
    w_in0 = (self.rng.randn(n_input, n_hidden) /
             np.sqrt(n_input) * self._w_in_start)

    if self._w_rec_init == 'diag':
        w_rec0 = self._w_rec_start * np.eye(n_hidden)
    elif self._w_rec_init == 'randortho':
        w_rec0 = self._w_rec_start * tools.gen_ortho_matrix(n_hidden,
                                                            rng=self.rng)
    elif self._w_rec_init == 'randgauss':
        w_rec0 = (self._w_rec_start *
                  self.rng.randn(n_hidden, n_hidden) / np.sqrt(n_hidden))
    else:
        raise ValueError('Unknown w_rec_init')

    matrix0 = np.concatenate((w_in0, w_rec0), axis=0)

    self.w_rnn0 = matrix0
    self._initializer = tf.constant_initializer(matrix0, dtype=tf.float32)

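# Added note: sigma = sqrt(2 / alpha) * sigma_rec above is the scaling one uses
# when a continuous-time "leaky" rate RNN is Euler-discretised with
# alpha = dt / tau. The cell's call() body is not shown here, so the following
# numpy sketch of the update it presumably implements is an assumption, not the
# repository's exact code.
import numpy as np

def leaky_rnn_step(h, x, w_in, w_rec, b, alpha, sigma, f=np.tanh,
                   rng=np.random):
    """One Euler step of tau * dh/dt = -h + f(W_in x + W_rec h + b + noise)."""
    noise = sigma * rng.randn(*h.shape)  # private recurrent noise
    return (1 - alpha) * h + alpha * f(x @ w_in + h @ w_rec + b + noise)
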
def _build_fused(self, hp):
    n_input = hp['n_input']
    n_rnn = hp['n_rnn']
    n_output = hp['n_output']

    self.x = tf.placeholder("float", [None, None, n_input])
    self.y = tf.placeholder("float", [None, None, n_output])
    if hp['loss_type'] == 'lsq':
        self.c_mask = tf.placeholder("float", [None, n_output])
    else:
        # Mask on time
        self.c_mask = tf.placeholder("float", [None])

    # Activation functions
    if hp['activation'] == 'power':
        f_act = lambda x: tf.square(tf.nn.relu(x))
    elif hp['activation'] == 'retanh':
        f_act = lambda x: tf.tanh(tf.nn.relu(x))
    elif hp['activation'] == 'relu+':
        f_act = lambda x: tf.nn.relu(x + tf.constant(1.))
    else:
        f_act = getattr(tf.nn, hp['activation'])

    # Recurrent activity
    if hp['rnn_type'] == 'LeakyRNN':
        n_in_rnn = self.x.get_shape().as_list()[-1]
        cell = LeakyRNNCell(n_rnn, n_in_rnn, hp['alpha'],
                            sigma_rec=hp['sigma_rec'],
                            activation=hp['activation'],
                            w_rec_init=hp['w_rec_init'],
                            rng=self.rng)
    elif hp['rnn_type'] == 'LeakyGRU':
        cell = LeakyGRUCell(n_rnn, hp['alpha'],
                            sigma_rec=hp['sigma_rec'], activation=f_act)
    elif hp['rnn_type'] == 'LSTM':
        cell = tf.contrib.rnn.LSTMCell(n_rnn, activation=f_act)
    elif hp['rnn_type'] == 'GRU':
        cell = tf.contrib.rnn.GRUCell(n_rnn, activation=f_act)
    else:
        raise NotImplementedError("""rnn_type must be one of LeakyRNN,
            LeakyGRU, EILeakyGRU, LSTM, GRU """)

    # Dynamic rnn with time major
    self.h, states = rnn.dynamic_rnn(cell, self.x, dtype=tf.float32,
                                     time_major=True)

    # Output
    with tf.variable_scope("output"):
        # Using default initialization `glorot_uniform_initializer`
        w_out = tf.get_variable('weights', [n_rnn, n_output], dtype=tf.float32)
        b_out = tf.get_variable(
            'biases', [n_output], dtype=tf.float32,
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))

    h_shaped = tf.reshape(self.h, (-1, n_rnn))
    y_shaped = tf.reshape(self.y, (-1, n_output))
    # y_hat_ shape (n_time*n_batch, n_unit)
    y_hat_ = tf.matmul(h_shaped, w_out) + b_out
    if hp['loss_type'] == 'lsq':
        # Least-square loss
        y_hat = tf.sigmoid(y_hat_)
        self.cost_lsq = tf.reduce_mean(
            tf.square((y_shaped - y_hat) * self.c_mask))
    else:
        y_hat = tf.nn.softmax(y_hat_)
        # Cross-entropy loss
        self.cost_lsq = tf.reduce_mean(
            self.c_mask * tf.nn.softmax_cross_entropy_with_logits(
                labels=y_shaped, logits=y_hat_))

    self.y_hat = tf.reshape(y_hat, (-1, tf.shape(self.h)[1], n_output))
    y_hat_fix, y_hat_ring = tf.split(self.y_hat, [1, n_output - 1], axis=-1)
    self.y_hat_loc = tf_popvec(y_hat_ring)

def cyclegan_generator_resnet(images,
                              arg_scope_fn=cyclegan_arg_scope,
                              num_resnet_blocks=6,
                              num_filters=64,
                              upsample_fn=cyclegan_upsample,
                              kernel_size=3,
                              tanh_linear_slope=0.0,
                              is_training=False):
  """Defines the cyclegan resnet network architecture.

  As closely as possible following
  https://github.com/junyanz/CycleGAN/blob/master/models/architectures.lua#L232

  FYI: This network requires input height and width to be divisible by 4 in
  order to generate an output with shape equal to input shape. Assertions will
  catch this if input dimensions are known at graph construction time, but
  there's no protection if unknown at graph construction time (you'll see an
  error).

  Args:
    images: Input image tensor of shape [batch_size, h, w, 3].
    arg_scope_fn: Function to create the global arg_scope for the network.
    num_resnet_blocks: Number of ResNet blocks in the middle of the generator.
    num_filters: Number of filters of the first hidden layer.
    upsample_fn: Upsampling function for the decoder part of the generator.
    kernel_size: Size w or list/tuple [h, w] of the filter kernels for all
      inner layers.
    tanh_linear_slope: Slope of the linear function to add to the tanh over the
      logits.
    is_training: Whether the network is created in training mode or inference
      only mode. Not actually needed, just for compliance with other generator
      network functions.

  Returns:
    A `Tensor` representing the model output and a dictionary of model end
    points.

  Raises:
    ValueError: If the input height or width is known at graph construction
      time and not a multiple of 4.
  """
  # Neither dropout nor batch norm -> dont need is_training
  del is_training
  end_points = {}

  input_size = images.shape.as_list()
  height, width = input_size[1], input_size[2]
  if height and height % 4 != 0:
    raise ValueError('The input height must be a multiple of 4.')
  if width and width % 4 != 0:
    raise ValueError('The input width must be a multiple of 4.')
  num_outputs = input_size[3]

  if not isinstance(kernel_size, (list, tuple)):
    kernel_size = [kernel_size, kernel_size]
  kernel_height = kernel_size[0]
  kernel_width = kernel_size[1]
  pad_top = (kernel_height - 1) // 2
  pad_bottom = kernel_height // 2
  pad_left = (kernel_width - 1) // 2
  pad_right = kernel_width // 2
  paddings = np.array(
      [[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]],
      dtype=np.int32)
  spatial_pad_3 = np.array([[0, 0], [3, 3], [3, 3], [0, 0]])

  with slim.arg_scope(arg_scope_fn()):

    ###########
    # Encoder #
    ###########
    with tf.variable_scope('input'):
      # 7x7 input stage
      net = tf.pad(tensor=images, paddings=spatial_pad_3, mode='REFLECT')
      net = slim.conv2d(net, num_filters, kernel_size=[7, 7], padding='VALID')
      end_points['encoder_0'] = net

    with tf.variable_scope('encoder'):
      with slim.arg_scope([slim.conv2d],
                          kernel_size=kernel_size,
                          stride=2,
                          activation_fn=tf.nn.relu,
                          padding='VALID'):
        net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
        net = slim.conv2d(net, num_filters * 2)
        end_points['encoder_1'] = net
        net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
        net = slim.conv2d(net, num_filters * 4)
        end_points['encoder_2'] = net

    ###################
    # Residual Blocks #
    ###################
    with tf.variable_scope('residual_blocks'):
      with slim.arg_scope([slim.conv2d],
                          kernel_size=kernel_size,
                          stride=1,
                          activation_fn=tf.nn.relu,
                          padding='VALID'):
        for block_id in xrange(num_resnet_blocks):
          with tf.variable_scope('block_{}'.format(block_id)):
            res_net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
            res_net = slim.conv2d(res_net, num_filters * 4)
            res_net = tf.pad(tensor=res_net, paddings=paddings,
                             mode='REFLECT')
            res_net = slim.conv2d(res_net, num_filters * 4,
                                  activation_fn=None)
            net += res_net
            end_points['resnet_block_%d' % block_id] = net

    ###########
    # Decoder #
    ###########
    with tf.variable_scope('decoder'):
      with slim.arg_scope([slim.conv2d],
                          kernel_size=kernel_size,
                          stride=1,
                          activation_fn=tf.nn.relu):
        with tf.variable_scope('decoder1'):
          net = upsample_fn(net, num_outputs=num_filters * 2, stride=[2, 2])
          end_points['decoder1'] = net
        with tf.variable_scope('decoder2'):
          net = upsample_fn(net, num_outputs=num_filters, stride=[2, 2])
          end_points['decoder2'] = net

    with tf.variable_scope('output'):
      net = tf.pad(tensor=net, paddings=spatial_pad_3, mode='REFLECT')
      # Note: padding normalized to uppercase 'VALID', consistent with the
      # other conv layers above (tf.nn requires the uppercase spelling).
      logits = slim.conv2d(net, num_outputs, [7, 7], activation_fn=None,
                           normalizer_fn=None, padding='VALID')
      logits = tf.reshape(logits, _dynamic_or_static_shape(images))
      end_points['logits'] = logits
      end_points['predictions'] = tf.tanh(logits) + logits * tanh_linear_slope

  return end_points['predictions'], end_points

def gelu(x):
    return 0.5 * x * (
        1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))

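# Added note: the expression above is the tanh approximation of GELU
# (Hendrycks & Gimpel); the exact form is 0.5 * x * (1 + erf(x / sqrt(2))).
# A small self-contained numpy check of how close the two are (illustrative
# only, not part of the original file):
import math
import numpy as np

def gelu_exact(x):
    return 0.5 * x * (1 + math.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))

for v in (-3.0, -1.0, 0.0, 1.0, 3.0):
    # differences are on the order of 1e-3 or smaller on this range
    print(v, gelu_exact(v), gelu_tanh(v))
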
def __call__(self, x, prev_state):
    prev_read_vector_list = prev_state['read_vector_list']

    controller_input = tf.concat([x] + prev_read_vector_list, axis=-1)

    # next we pass the controller, which is the RNN cell, the controller_input
    # and prev_controller_state
    controller_output = self.controller(controller_input)

    num_parameters_per_head = self.memory_vector_dim + 1
    total_parameter_num = num_parameters_per_head * self.head_num

    # Initialize weight matrix and bias and compute the parameters
    weights = tf.Variable(
        tf.random_normal([
            controller_output.get_shape()[0],
            controller_output.get_shape()[2], total_parameter_num
        ], stddev=0.35))
    biases = tf.Variable(tf.zeros([total_parameter_num]))
    parameters = tf.nn.xw_plus_b(controller_output, weights, biases)

    head_parameter_list = tf.split(parameters, self.head_num, axis=2)

    # previous read weight vector
    prev_w_r_list = prev_state['w_r_list']
    # previous memory
    prev_M = prev_state['M']
    # previous usage weight vector
    prev_w_u = prev_state['w_u']
    # previous indices and least used weight vector
    prev_indices, prev_w_lu = self.least_used(prev_w_u)

    # read weight vectors
    w_r_list = []
    # write weight vectors
    w_w_list = []
    # key vectors
    k_list = []

    # now, we will initialize some of the important parameters that we use
    # for addressing.
    for i, head_parameter in enumerate(head_parameter_list):
        with tf.variable_scope('addressing_head_%d' % i):
            # key vector
            k = tf.tanh(head_parameter[:, :, 0:self.memory_vector_dim], name='k')
            # sig_alpha
            sig_alpha = tf.sigmoid(head_parameter[:, :, -1:], name='sig_alpha')
            # read weights
            w_r = self.read_head_addressing(k, prev_M)
            # write weights
            w_w = self.write_head_addressing(sig_alpha, prev_w_r_list[i], prev_w_lu)

        w_r_list.append(w_r)
        w_w_list.append(w_w)
        k_list.append(k)

    # usage weight vector
    w_u = self.gamma * prev_w_u + tf.add_n(w_r_list) + tf.add_n(w_w_list)

    # update the memory
    M_ = prev_M * tf.expand_dims(
        1. - tf.one_hot(prev_indices[:, :, -1], self.memory_size), dim=3)

    # write operation
    M = M_
    with tf.variable_scope('writing'):
        for i in range(self.head_num):
            w = tf.expand_dims(w_w_list[i], axis=3)
            k = tf.expand_dims(k_list[i], axis=2)
            M = M + tf.matmul(w, k)

    # read operation
    read_vector_list = []
    with tf.variable_scope('reading'):
        for i in range(self.head_num):
            read_vector = tf.reduce_sum(
                tf.expand_dims(w_r_list[i], dim=3) * M, axis=2)
            read_vector_list.append(read_vector)

    # controller output
    state = {
        'read_vector_list': read_vector_list,
        'w_r_list': w_r_list,
        'w_w_list': w_w_list,
        'w_u': w_u,
        'M': M,
    }

    self.step += 1
    return controller_output, state

def create_generator(generator_inputs, generator_outputs_channels):
    layers = []

    print('encoder:')
    print(generator_inputs.shape)

    # encoder_1: [batch, 256, 256, in_channels] => [batch, 128, 128, ngf]
    with tf.variable_scope("encoder_1"):
        output = gen_conv(generator_inputs, a.ngf)
        layers.append(output)
        print(output.shape)

    layer_specs = [
        a.ngf * 2,  # encoder_2: [batch, 128, 128, ngf] => [batch, 64, 64, ngf * 2]
        a.ngf * 4,  # encoder_3: [batch, 64, 64, ngf * 2] => [batch, 32, 32, ngf * 4]
        a.ngf * 8,  # encoder_4: [batch, 32, 32, ngf * 4] => [batch, 16, 16, ngf * 8]
        a.ngf * 8,  # encoder_5: [batch, 16, 16, ngf * 8] => [batch, 8, 8, ngf * 8]
        a.ngf * 8,  # encoder_6: [batch, 8, 8, ngf * 8] => [batch, 4, 4, ngf * 8]
        a.ngf * 8,  # encoder_7: [batch, 4, 4, ngf * 8] => [batch, 2, 2, ngf * 8]
        a.ngf * 8,  # encoder_8: [batch, 2, 2, ngf * 8] => [batch, 1, 1, ngf * 8]
    ]

    for out_channels in layer_specs:
        with tf.variable_scope("encoder_%d" % (len(layers) + 1)):
            rectified = lrelu(layers[-1], 0.2)
            # [batch, in_height, in_width, in_channels] => [batch, in_height/2, in_width/2, out_channels]
            convolved = gen_conv(rectified, out_channels)
            output = batchnorm(convolved)
            layers.append(output)
            print(output.shape)

    print('decoder:')
    layer_specs = [
        (a.ngf * 8, 0.5),  # decoder_8: [batch, 1, 1, ngf * 8] => [batch, 2, 2, ngf * 8 * 2]
        (a.ngf * 8, 0.5),  # decoder_7: [batch, 2, 2, ngf * 8 * 2] => [batch, 4, 4, ngf * 8 * 2]
        (a.ngf * 8, 0.5),  # decoder_6: [batch, 4, 4, ngf * 8 * 2] => [batch, 8, 8, ngf * 8 * 2]
        (a.ngf * 8, 0.0),  # decoder_5: [batch, 8, 8, ngf * 8 * 2] => [batch, 16, 16, ngf * 8 * 2]
        (a.ngf * 4, 0.0),  # decoder_4: [batch, 16, 16, ngf * 8 * 2] => [batch, 32, 32, ngf * 4 * 2]
        (a.ngf * 2, 0.0),  # decoder_3: [batch, 32, 32, ngf * 4 * 2] => [batch, 64, 64, ngf * 2 * 2]
        (a.ngf, 0.0),      # decoder_2: [batch, 64, 64, ngf * 2 * 2] => [batch, 128, 128, ngf * 2]
    ]

    num_encoder_layers = len(layers)
    for decoder_layer, (out_channels, dropout) in enumerate(layer_specs):
        skip_layer = num_encoder_layers - decoder_layer - 1
        with tf.variable_scope("decoder_%d" % (skip_layer + 1)):
            if decoder_layer == 0:
                # first decoder layer doesn't have skip connections
                # since it is directly connected to the skip_layer
                input = layers[-1]
            else:
                input = tf.concat([layers[-1], layers[skip_layer]], axis=3)

            rectified = tf.nn.relu(input)
            # [batch, in_height, in_width, in_channels] => [batch, in_height*2, in_width*2, out_channels]
            output = gen_deconv(rectified, out_channels)
            output = batchnorm(output)

            if dropout > 0.0:
                output = tf.nn.dropout(output, keep_prob=1 - dropout)

            layers.append(output)
            print(output.shape)

    # decoder_1: [batch, 128, 128, ngf * 2] => [batch, 256, 256, generator_outputs_channels]
    with tf.variable_scope("decoder_1"):
        input = tf.concat([layers[-1], layers[0]], axis=3)
        rectified = tf.nn.relu(input)
        output = gen_deconv(rectified, generator_outputs_channels)
        output = tf.tanh(output)
        layers.append(output)
        print(output.shape)

    return layers[-1]

def __call__(self, inputs, state, scope=None):
    """Run this RNN cell on inputs, starting from the given state.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: `2-D Tensor` with shape `[batch_size, self.state_size]`.
      scope: optional cell scope.

    Returns:
      A pair containing:
      - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`.
      - New state: A single `2-D` tensor.
    """
    batch_size, hidden_size = inputs.shape
    fixed_arc = self._params.fixed_arc
    num_layers = len(fixed_arc) // 2
    prev_s = self.prev_s
    w_prev = self.w_prev
    w_skip = self.w_skip
    input_mask = self._input_mask
    layer_mask = self._layer_mask

    if layer_mask is not None:
        assert input_mask is not None
        ht = tf.matmul(
            tf.concat([inputs * input_mask, state * layer_mask], axis=1),
            w_prev)
    else:
        ht = tf.matmul(tf.concat([inputs, state], axis=1), w_prev)
    h, t = tf.split(ht, 2, axis=1)
    h = tf.tanh(h)
    t = tf.sigmoid(t)
    s = state + t * (h - state)
    layers = [s]

    def _select_function(h, function_id):
        if function_id == 0:
            return tf.tanh(h)
        elif function_id == 1:
            return tf.nn.relu(h)
        elif function_id == 2:
            return tf.sigmoid(h)
        elif function_id == 3:
            return h
        raise ValueError('Unknown func_idx {0}'.format(function_id))

    start_idx = 0
    used = np.zeros(num_layers + 1, dtype=np.float32)
    for layer_id in range(num_layers):
        prev_idx = fixed_arc[start_idx]
        func_idx = fixed_arc[start_idx + 1]
        prev_s = layers[prev_idx]
        used[prev_idx] = 1
        if layer_mask is not None:
            ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id])
        else:
            ht = tf.matmul(prev_s, w_skip[layer_id])
        h, t = tf.split(ht, 2, axis=1)

        h = _select_function(h, func_idx)
        t = tf.sigmoid(t)
        s = prev_s + t * (h - prev_s)
        s.set_shape([batch_size, hidden_size])
        layers.append(s)
        start_idx += 2

    if self._params.average_loose_ends:
        layers = [l for l, u in zip(layers, used) if u == 0]
        next_s = tf.add_n(layers) / np.sum(1. - used)
    else:
        next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32)
    return next_s, next_s

def __init__(
    self,
    learning_rate,
    num_layers,
    size,
    size_layer,
    output_size,
    forget_bias=0.1,
    attention_size=10,
):
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple=False)

    backward_rnn_cells = tf.nn.rnn_cell.MultiRNNCell(
        [lstm_cell() for _ in range(num_layers)], state_is_tuple=False)
    forward_rnn_cells = tf.nn.rnn_cell.MultiRNNCell(
        [lstm_cell() for _ in range(num_layers)], state_is_tuple=False)
    self.X = tf.placeholder(tf.float32, [None, None, size])
    self.Y = tf.placeholder(tf.float32, [None, output_size])
    drop_backward = tf.nn.rnn_cell.DropoutWrapper(
        backward_rnn_cells, output_keep_prob=forget_bias)
    drop_forward = tf.nn.rnn_cell.DropoutWrapper(
        forward_rnn_cells, output_keep_prob=forget_bias)
    self.backward_hidden_layer = tf.placeholder(
        tf.float32, shape=(None, num_layers * 2 * size_layer))
    self.forward_hidden_layer = tf.placeholder(
        tf.float32, shape=(None, num_layers * 2 * size_layer))

    outputs, last_state = tf.nn.bidirectional_dynamic_rnn(
        drop_forward,
        drop_backward,
        self.X,
        initial_state_fw=self.forward_hidden_layer,
        initial_state_bw=self.backward_hidden_layer,
        dtype=tf.float32,
    )
    outputs = list(outputs)

    attention_w = tf.get_variable('attention_v1', [attention_size], tf.float32)
    query = tf.layers.dense(
        tf.expand_dims(last_state[0][:, size_layer:], 1), attention_size)
    keys = tf.layers.dense(outputs[0], attention_size)
    align = tf.reduce_sum(attention_w * tf.tanh(keys + query), [2])
    align = tf.nn.tanh(align)
    outputs[0] = tf.squeeze(
        tf.matmul(tf.transpose(outputs[0], [0, 2, 1]),
                  tf.expand_dims(align, 2)),
        2,
    )
    outputs[0] = tf.concat([outputs[0], last_state[0][:, size_layer:]], 1)

    attention_w = tf.get_variable('attention_v2', [attention_size], tf.float32)
    query = tf.layers.dense(
        tf.expand_dims(last_state[1][:, size_layer:], 1), attention_size)
    keys = tf.layers.dense(outputs[1], attention_size)
    align = tf.reduce_sum(attention_w * tf.tanh(keys + query), [2])
    align = tf.nn.tanh(align)
    outputs[1] = tf.squeeze(
        tf.matmul(tf.transpose(outputs[1], [0, 2, 1]),
                  tf.expand_dims(align, 2)),
        2,
    )
    outputs[1] = tf.concat([outputs[1], last_state[1][:, size_layer:]], 1)

    with tf.variable_scope('decoder', reuse=False):
        self.backward_rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell(
            [lstm_cell() for _ in range(num_layers)], state_is_tuple=False)
        self.forward_rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell(
            [lstm_cell() for _ in range(num_layers)], state_is_tuple=False)
        backward_drop_dec = tf.nn.rnn_cell.DropoutWrapper(
            self.backward_rnn_cells_dec, output_keep_prob=forget_bias)
        forward_drop_dec = tf.nn.rnn_cell.DropoutWrapper(
            self.forward_rnn_cells_dec, output_keep_prob=forget_bias)
        self.outputs, self.last_state = tf.nn.bidirectional_dynamic_rnn(
            forward_drop_dec,
            backward_drop_dec,
            self.X,
            initial_state_fw=outputs[0],
            initial_state_bw=outputs[1],
            dtype=tf.float32,
        )
    self.outputs = list(self.outputs)

    attention_w = tf.get_variable('attention_v3', [attention_size], tf.float32)
    query = tf.layers.dense(
        tf.expand_dims(self.last_state[0][:, size_layer:], 1),
        attention_size,
    )
    keys = tf.layers.dense(self.outputs[0], attention_size)
    align = tf.reduce_sum(attention_w * tf.tanh(keys + query), [2])
    align = tf.nn.tanh(align)
    self.outputs[0] = tf.squeeze(
        tf.matmul(
            tf.transpose(self.outputs[0], [0, 2, 1]),
            tf.expand_dims(align, 2),
        ),
        2,
    )

    attention_w = tf.get_variable('attention_v4', [attention_size], tf.float32)
    query = tf.layers.dense(
        tf.expand_dims(self.last_state[1][:, size_layer:], 1),
        attention_size,
    )
    keys = tf.layers.dense(self.outputs[1], attention_size)
    align = tf.reduce_sum(attention_w * tf.tanh(keys + query), [2])
    align = tf.nn.tanh(align)
    self.outputs[1] = tf.squeeze(
        tf.matmul(
            tf.transpose(self.outputs[1], [0, 2, 1]),
            tf.expand_dims(align, 2),
        ),
        2,
    )

    self.outputs = tf.concat(self.outputs, 1)
    self.logits = tf.layers.dense(self.outputs, output_size)
    self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(self.cost)

def build(self, inputs):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.

    Returns:
      A dict of outputs that includes the 'predictions', 'init_ops', the
      'push_ops', and the 'quantized_input'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    num_z = 16

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    batch_size = self.batch_size
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    encoding = tf.placeholder(
        name='encoding', shape=[batch_size, num_z], dtype=tf.float32)
    en = tf.expand_dims(encoding, 1)

    init_ops, push_ops = [], []

    ###
    # The WaveNet Decoder.
    ###
    l = x_scaled
    l, inits, pushs = utils.causal_linear(
        x=l,
        n_inputs=1,
        n_outputs=width,
        name='startconv',
        rate=1,
        batch_size=batch_size,
        filter_length=filter_length)

    for init in inits:
        init_ops.append(init)
    for push in pushs:
        push_ops.append(push)

    # Set up skip connections.
    s = utils.linear(l, width, skip_width, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
        dilation = 2**(i % num_stages)

        # dilated masked cnn
        d, inits, pushs = utils.causal_linear(
            x=l,
            n_inputs=width,
            n_outputs=width * 2,
            name='dilatedconv_%d' % (i + 1),
            rate=dilation,
            batch_size=batch_size,
            filter_length=filter_length)

        for init in inits:
            init_ops.append(init)
        for push in pushs:
            push_ops.append(push)

        # local conditioning
        d += utils.linear(en, num_z, width * 2, name='cond_map_%d' % (i + 1))

        # gated cnn
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

        # residuals
        l += utils.linear(d, width, width, name='res_%d' % (i + 1))

        # skips
        s += utils.linear(d, width, skip_width, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = (utils.linear(s, skip_width, skip_width, name='out1') +
         utils.linear(en, num_z, skip_width, name='cond_map_out1'))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = utils.linear(s, skip_width, 256, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    return {
        'init_ops': init_ops,
        'push_ops': push_ops,
        'predictions': probs,
        'encoding': encoding,
        'quantized_input': x_quantized,
    }

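# Added note: utils.mu_law above is 8-bit mu-law companding; the surrounding
# code treats its output as integer codes in roughly [-128, 128) (it divides by
# 128.0 to rescale, and the autoencoder config below adds 128 to form class
# indices). A minimal numpy sketch of that companding under those assumptions;
# the exact library implementation may differ in rounding/edge handling:
import numpy as np

def mu_law_sketch(x, mu=255):
    """Map float audio in [-1, 1] to integer codes in [-128, 127]."""
    x = np.clip(np.asarray(x, dtype=np.float64), -1.0, 1.0)
    companded = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    # floor(companded * 128) lands in [-128, 128]; the clip guards the x == 1.0 edge.
    return np.clip(np.floor(companded * 128.0), -128, 127).astype(np.int32)
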
def __call__(self, x, state, timestep=0, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
        total_h, total_c = tf.split(state, 2, 1)
        h = total_h[:, 0:self.num_units]
        c = total_c[:, 0:self.num_units]
        self.hyper_state = tf.concat(
            [total_h[:, self.num_units:], total_c[:, self.num_units:]], 1)

        batch_size = x.get_shape().as_list()[0]
        x_size = x.get_shape().as_list()[1]
        self._input_size = x_size

        w_init = None  # uniform
        h_init = lstm_ortho_initializer(1.0)

        w_xh = tf.get_variable('W_xh', [x_size, 4 * self.num_units],
                               initializer=w_init)
        w_hh = tf.get_variable('W_hh', [self.num_units, 4 * self.num_units],
                               initializer=h_init)
        bias = tf.get_variable('bias', [4 * self.num_units],
                               initializer=tf.constant_initializer(0.0))

        # concatenate the input and hidden states for hyperlstm input
        hyper_input = tf.concat([x, h], 1)
        hyper_output, hyper_new_state = self.hyper_cell(hyper_input,
                                                        self.hyper_state)
        self.hyper_output = hyper_output
        self.hyper_state = hyper_new_state

        xh = tf.matmul(x, w_xh)
        hh = tf.matmul(h, w_hh)

        # split Wxh contributions
        ix, jx, fx, ox = tf.split(xh, 4, 1)
        ix = self.hyper_norm(ix, 'hyper_ix', use_bias=False)
        jx = self.hyper_norm(jx, 'hyper_jx', use_bias=False)
        fx = self.hyper_norm(fx, 'hyper_fx', use_bias=False)
        ox = self.hyper_norm(ox, 'hyper_ox', use_bias=False)

        # split Whh contributions
        ih, jh, fh, oh = tf.split(hh, 4, 1)
        ih = self.hyper_norm(ih, 'hyper_ih', use_bias=True)
        jh = self.hyper_norm(jh, 'hyper_jh', use_bias=True)
        fh = self.hyper_norm(fh, 'hyper_fh', use_bias=True)
        oh = self.hyper_norm(oh, 'hyper_oh', use_bias=True)

        # split bias
        ib, jb, fb, ob = tf.split(bias, 4, 0)  # bias is to be broadcasted.

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i = ix + ih + ib
        j = jx + jh + jb
        f = fx + fh + fb
        o = ox + oh + ob

        if self.use_layer_norm:
            concat = tf.concat([i, j, f, o], 1)
            concat = layer_norm_all(concat, batch_size, 4, self.num_units,
                                    'ln_all')
            i, j, f, o = tf.split(concat, 4, 1)

        if self.use_recurrent_dropout:
            g = tf.nn.dropout(tf.tanh(j), self.dropout_keep_prob)
        else:
            g = tf.tanh(j)

        new_c = c * tf.sigmoid(f + self.forget_bias) + tf.sigmoid(i) * g
        new_h = tf.tanh(layer_norm(new_c, self.num_units, 'ln_c')) * tf.sigmoid(o)

        hyper_h, hyper_c = tf.split(hyper_new_state, 2, 1)
        new_total_h = tf.concat([new_h, hyper_h], 1)
        new_total_c = tf.concat([new_c, hyper_c], 1)
        new_total_state = tf.concat([new_total_h, new_total_c], 1)
    return new_h, new_total_state

def build(self, inputs, is_training, rescale_inputs=True,
          include_decoder=True, use_reduce_mean_to_pool=False):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.
      rescale_inputs: Whether to convert inputs to mu-law and back to unit
        scaling before passing through the model (loses gradients).
      include_decoder: bool, whether to include the decoder in the build().
      use_reduce_mean_to_pool: whether to use reduce_mean (instead of pool1d)
        for pooling.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the
      'encoding', the 'quantized_input', and whatever metrics we want to track
      for eval.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)
    x = tf.expand_dims(x, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled if rescale_inputs else x,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv',
        is_training=is_training)

    for num_layer in range(ae_num_layers):
        dilation = 2**(num_layer % ae_num_stages)
        d = tf.nn.relu(en)
        d = masked.conv1d(
            d,
            causal=False,
            num_filters=ae_width,
            filter_length=ae_filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (num_layer + 1),
            is_training=is_training)
        d = tf.nn.relu(d)
        en += masked.conv1d(
            d,
            num_filters=ae_width,
            filter_length=1,
            name='ae_res_%d' % (num_layer + 1),
            is_training=is_training)

    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck',
        is_training=is_training)

    if use_reduce_mean_to_pool:
        # Depending on the accelerator used for training, masked.pool1d may
        # lead to out of memory error.
        # reduce_mean is equivalent to masked.pool1d when the stride is the
        # same as the window length (which is the case here).
        batch_size, unused_length, depth = en.shape.as_list()
        en = tf.reshape(en, [batch_size, -1, self.ae_hop_length, depth])
        en = tf.reduce_mean(en, axis=2)
    else:
        en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = en

    if not include_decoder:
        return {'encoding': encoding}

    ###
    # The WaveNet Decoder.
    ###
    l = masked.shift_right(x_scaled if rescale_inputs else x)
    l = masked.conv1d(
        l,
        num_filters=width,
        filter_length=filter_length,
        name='startconv',
        is_training=is_training)

    # Set up skip connections.
    s = masked.conv1d(
        l,
        num_filters=skip_width,
        filter_length=1,
        name='skip_start',
        is_training=is_training)

    # Residual blocks with skip connections.
    for i in range(num_layers):
        dilation = 2**(i % num_stages)
        d = masked.conv1d(
            l,
            num_filters=2 * width,
            filter_length=filter_length,
            dilation=dilation,
            name='dilatedconv_%d' % (i + 1),
            is_training=is_training)
        d = self._condition(
            d,
            masked.conv1d(
                en,
                num_filters=2 * width,
                filter_length=1,
                name='cond_map_%d' % (i + 1),
                is_training=is_training))

        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(
            d,
            num_filters=width,
            filter_length=1,
            name='res_%d' % (i + 1),
            is_training=is_training)
        s += masked.conv1d(
            d,
            num_filters=skip_width,
            filter_length=1,
            name='skip_%d' % (i + 1),
            is_training=is_training)

    s = tf.nn.relu(s)
    s = masked.conv1d(
        s,
        num_filters=skip_width,
        filter_length=1,
        name='out1',
        is_training=is_training)
    s = self._condition(
        s,
        masked.conv1d(
            en,
            num_filters=skip_width,
            filter_length=1,
            name='cond_map_out1',
            is_training=is_training))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(
        s,
        num_filters=256,
        filter_length=1,
        name='logits',
        is_training=is_training)
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }

def self_normalizing_tanh(x):
    return 1.592537419722831 * tf.tanh(x)

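# Added note: the constant 1.592537419722831 appears to equal
# 1 / sqrt(E[tanh(Z)^2]) for Z ~ N(0, 1), i.e. the scale that makes the output
# roughly unit-variance on standard-normal input. This rationale is an
# assumption (it is not stated in the original file); a quick Monte Carlo check:
import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(1000000)
print(np.std(1.592537419722831 * np.tanh(z)))  # prints a value close to 1.0
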
def deep_reg(dimensions=[784, 512, 256, 64], ct=[0.1, 0.1, 0.1, 0.1],
             activation='leaky_relu'):
    """
    This method builds the architecture corresponding to a NN regression.

    Parameters:
    -----------
    dimensions: list of ints
        List of number of neurons per layer.
    ct: list of floats
        List of values of the contamination level applied to the NN per layer.
    activation: str {'leaky_relu', 'relu_tanh', 'leaky_relu_l', 'tanh'}
        It defines the activation functions applied (default is 'leaky_relu')
        'leaky_relu': Leaky ReLU for all activation functions
        'relu_tanh': tanh on last hidden layer, Leaky ReLU for the rest
        'leaky_relu_l': Linear function on last hidden layer, Leaky ReLU for the rest
        'tanh': tanh for all activation functions

    Returns:
    --------
    dict
        Dictionary with:
        x: np.ndarray
            Input training data
        theta: np.ndarray
            Labels for supervised learning
        z: np.ndarray
            NN output
        W: np.ndarray
            Weights of NN
        b: np.ndarray
            Bias
        cost: float
            Value of cost function
    """
    L = len(dimensions) - 1

    # INPUT DATA
    x = tf.placeholder(tf.float32, [None, dimensions[0]], name='x')
    theta = tf.placeholder(tf.float32, [None, dimensions[-1]], name='theta')
    pcost = tf.placeholder(tf.float32, [1], name='pcost')

    # NOISY ENCODER
    encoder = []
    all_h = []
    b_enc = []
    noise = tf.random_normal(shape=tf.shape(x), stddev=ct[0], dtype=tf.float32)
    current_input = x + noise
    all_h.append(current_input)

    for layer_i in range(1, L + 1):
        # Defining the variables
        n_input = dimensions[layer_i - 1]
        n_output = dimensions[layer_i]
        low = -np.sqrt(6.0 / (n_input + n_output))
        high = np.sqrt(6.0 / (n_input + n_output))
        nameW = 'Weights'
        W = tf.Variable(
            tf.random_uniform([n_input, n_output], minval=low, maxval=high),
            name=nameW.format(layer_i - 1))
        nameB = 'Bias'
        be = tf.Variable(tf.zeros([n_output]), name=nameB.format(layer_i - 1))
        b_enc.append(be)
        encoder.append(W)

        if layer_i == L + 1:
            output = tf.matmul(current_input, W) + be
        else:
            if activation == 'leaky_relu' or (
                    activation in ['relu_tanh', 'leaky_relu_l'] and layer_i < L):
                output = leaky_relu(tf.matmul(current_input, W) + be)
            elif activation == 'leaky_relu_l' and layer_i == L:
                output = tf.matmul(current_input, W) + be
            elif activation == 'tanh' or (activation == 'relu_tanh' and layer_i == L):
                output = tf.tanh(tf.matmul(current_input, W) + be)

        noise = tf.random_normal(shape=tf.shape(output), stddev=ct[layer_i],
                                 dtype=tf.float32)
        current_input = output + noise

    z_out = output
    cost = tf.reduce_mean(tf.square(output - theta))

    return {'x': x, 'theta': theta, 'z': z_out, 'W': encoder, 'b': b_enc,
            'cost': cost}

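# Added usage sketch (not from the original module): the dict returned by
# deep_reg only defines the graph, so training attaches an optimizer to
# net['cost'] and feeds net['x'] / net['theta']. The arrays x_batch and
# theta_batch below are hypothetical stand-ins for the caller's data.
net = deep_reg(dimensions=[784, 512, 256, 64], ct=[0.1, 0.1, 0.1, 0.1])
train_op = tf.train.AdamOptimizer(1e-3).minimize(net['cost'])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, c = sess.run([train_op, net['cost']],
                    feed_dict={net['x']: x_batch, net['theta']: theta_batch})
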
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops."""
    hidden_size = self.params.controller_hidden_size
    num_layers = self.params.controller_num_layers

    arc_seq = []
    sample_log_probs = []
    sample_entropy = []
    all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)]
    all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)]

    # sampler ops
    inputs = self.g_emb
    prev_c = tf.zeros([1, hidden_size], dtype=tf.float32)
    prev_h = tf.zeros([1, hidden_size], dtype=tf.float32)

    inputs = self.g_emb
    for layer_id in range(1, num_layers + 1):
        next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h

        all_h.append(next_h)
        all_h_w.append(tf.matmul(next_h, self.attn_w_1))

        query = tf.matmul(next_h, self.attn_w_2)
        query = query + tf.concat(all_h_w[:-1], axis=0)
        query = tf.tanh(query)
        logits = tf.matmul(query, self.attn_v)
        logits = tf.reshape(logits, [1, layer_id])

        if self.params.controller_temperature:
            logits /= self.params.controller_temperature
        if self.params.controller_tanh_constant:
            logits = self.params.controller_tanh_constant * tf.tanh(logits)
        diff = tf.to_float(layer_id - tf.range(0, layer_id))**2
        logits -= tf.reshape(diff, [1, layer_id]) / 6.0

        skip_index = tf.multinomial(logits, 1)
        skip_index = tf.to_int32(skip_index)
        skip_index = tf.reshape(skip_index, [1])
        arc_seq.append(skip_index)

        log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=skip_index)
        sample_log_probs.append(log_prob)

        entropy = log_prob * tf.exp(-log_prob)
        sample_entropy.append(tf.stop_gradient(entropy))

        inputs = tf.nn.embedding_lookup(
            tf.concat(all_h[:-1], axis=0), skip_index)
        inputs /= (0.1 + tf.to_float(layer_id - skip_index))

        next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h

        logits = tf.matmul(next_h, self.w_emb, transpose_b=True)
        if self.params.controller_temperature:
            logits /= self.params.controller_temperature
        if self.params.controller_tanh_constant:
            logits = self.params.controller_tanh_constant * tf.tanh(logits)
        func = tf.multinomial(logits, 1)
        func = tf.to_int32(func)
        func = tf.reshape(func, [1])
        arc_seq.append(func)

        log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=func)
        sample_log_probs.append(log_prob)

        entropy = log_prob * tf.exp(-log_prob)
        sample_entropy.append(tf.stop_gradient(entropy))

        inputs = tf.nn.embedding_lookup(self.w_emb, func)

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = arc_seq

    self.sample_log_probs = tf.concat(sample_log_probs, axis=0)
    self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs))

    sample_entropy = tf.concat(sample_entropy, axis=0)
    self.sample_entropy = tf.reduce_sum(sample_entropy)

    self.all_h = all_h

def create(self):
    tf.reset_default_graph()
    self.weight_bias_init()
    self.x_ph = tf.placeholder("float32", [1, self.batch.shape[0], self.batch.shape[1]])
    self.y_ph = tf.placeholder("float32", self.batch_targ.shape)
    self.seq = tf.constant(self.truncated, shape=[1])
    self.seq2 = tf.constant(self.truncated, shape=[1])
    self.dropout_ph = tf.placeholder("float32")
    self.fw_cell = self.cell_create('1')
    self.fw_cell2 = self.cell_create('2')

    if self.configuration == 'R':
        self.outputs, self.states = tf.nn.dynamic_rnn(
            self.fw_cell, self.x_ph, sequence_length=self.seq, dtype=tf.float32)
        if self.attention_number > 0:
            self.outputs_zero_padded = tf.pad(
                self.outputs, [[0, 0], [self.attention_number, 0], [0, 0]])
            self.RNNout1 = tf.stack(
                [tf.reshape(self.outputs_zero_padded[:, g:g + (self.attention_number + 1)],
                            [self.n_hidden[(len(self.n_hidden) - 1)] * ((self.attention_number) + 1)])
                 for g in range(self.batch_size)])
            self.presoft = tf.matmul(self.RNNout1, self.weights) + self.biases
        else:
            self.presoft = tf.matmul(self.outputs[0][0], self.weights) + self.biases

    elif self.configuration == 'B':
        self.bw_cell = self.cell_create('1')
        self.bw_cell2 = self.cell_create('2')
        with tf.variable_scope('1'):
            self.outputs, self.states = tf.nn.bidirectional_dynamic_rnn(
                self.fw_cell, self.bw_cell, self.x_ph,
                sequence_length=self.seq, dtype=tf.float32)
        self.first_out = tf.concat((self.outputs[0], self.outputs[1]), 2)
        with tf.variable_scope('2'):
            self.outputs2, self.states2 = tf.nn.bidirectional_dynamic_rnn(
                self.fw_cell2, self.bw_cell2, self.first_out,
                sequence_length=self.seq2, dtype=tf.float32)
        self.second_out = tf.concat((self.outputs2[0], self.outputs2[1]), 2)

        for i in range((self.attention_number * 2) + 1):
            self.attention_weight_init(i)

        self.zero_pad_second_out = tf.pad(
            tf.squeeze(self.second_out),
            [[self.attention_number, self.attention_number], [0, 0]])
        # self.attention_chunks.append(self.zero_pad_second_out[j:j+attention_number*2])
        self.attention_m = [
            tf.tanh(tf.matmul(tf.concat((self.zero_pad_second_out[j:j + self.batch_size],
                                         tf.squeeze(self.first_out)), 1),
                              self.attention_weights[j]))
            for j in range((self.attention_number * 2) + 1)]
        self.attention_s = tf.nn.softmax(
            tf.stack([tf.matmul(self.attention_m[i], self.sm_attention_weights[i])
                      for i in range(self.attention_number * 2 + 1)]), 0)
        self.attention_z = tf.reduce_sum(
            [self.attention_s[i] * self.zero_pad_second_out[i:self.batch_size + i]
             for i in range(self.attention_number * 2 + 1)], 0)
        self.presoft = tf.matmul(self.attention_z, self.weights) + self.biases

    if self.output_act == 'softmax':
        self.pred = tf.nn.softmax(self.presoft)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.presoft, labels=self.y_ph))
    elif self.output_act == 'sigmoid':
        self.pred = tf.nn.sigmoid(self.presoft)
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.presoft, labels=self.y_ph))

    if self.optimizer == 'GD':
        self.optimize = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate).minimize(self.cost)
    elif self.optimizer == 'Adam':
        self.optimize = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self.cost)
    elif self.optimizer == 'RMS':
        self.optimize = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate).minimize(self.cost)

    self.correct_pred = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.y_ph, 1))
    self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
    self.init = tf.global_variables_initializer()
    self.saver = tf.train.Saver()
    self.saver_var = tf.train.Saver(tf.trainable_variables())
    if self.save_location == []:
        self.save_location = os.getcwd()