def update_loss(self, n_task):
    """Rebuild the loss and the train op for task ``n_task``.

    Nothing is done for the first task (``n_task == 0``).  For later tasks
    the new loss is the base loss plus ``self.multiplier`` times a penalty
    (named ``ewc_penalty`` below) built from the Fisher terms, the current
    weights and the stored old weights.  When both ``self.init_change`` and
    ``self.incr_change`` are set, a per-task "epsilon" variable is also
    created and the cross-entropy term of the previous task is swapped for
    one gated by ``self.indicator(epsilon - penalty)``.

    Side effects: updates ``self.vars['loss']``, ``self.vars['losses']``,
    ``self.vars['distances']``, ``self.vars['train_op']`` and
    ``self.vars['train_ops']``, runs initializers in ``self.objs['sess']``
    and prints progress messages.

    Args:
        n_task: index of the task about to be trained.

    Returns:
        None.
    """
    if n_task == 0:
        return

    session = self.objs['sess']
    # `is not None` instead of `!= None`: the latter is ambiguous (and
    # raises inside `and`) if either setting is an array-like value.
    constrained = (self.init_change is not None
                   and self.incr_change is not None)

    # Radius of influence (Constrained minimization)
    if constrained:
        epsilon_name = 'epsilon_task%d' % (n_task - 1)
        task_var = tf.Variable(self.init_change,
                               name=epsilon_name,
                               trainable=False)
        session.run(task_var.initializer)
        self.vars[epsilon_name] = task_var
        # Widen the radius of every earlier task.
        for prev_n_task in range(n_task - 1):
            session.run(
                tf.assign_add(self.vars['epsilon_task%d' % prev_n_task],
                              self.incr_change))

    if self.use_orig_loss:
        loss = self.vars['losses'][0]
    else:
        loss = self.vars['losses'][n_task - 1]

    if self.use_latest_theta_star:
        old_vars = self.objs['fisher_old_ws']
        fisher_vars = self.objs['fisher_diags']
    else:
        old_vars = self.saved_wts[n_task - 1]
        fisher_vars = self.saved_fishers[n_task - 1]

    penalties = [
        tf.multiply(fisher, self.norm_op(tf.subtract(var, old_var)))
        for var, old_var, fisher in zip(self.objs['fisher_ws'], old_vars,
                                        fisher_vars)
    ]
    ewc_penalty = tf.add_n(
        [tf.reduce_sum(penalty) for penalty in penalties])

    # Create new cross entropy loss
    if constrained:
        self.vars['ce_losses'][n_task] = (
            self.vars['ce_losses'][n_task - 1]
            * self.indicator(task_var - ewc_penalty))

    new_loss = tf.add(
        loss,
        tf.multiply(tf.constant(self.multiplier, tf.float32), ewc_penalty))

    # Remove previous CE loss and add the gated one in its place.
    if constrained:
        new_loss -= self.vars['ce_losses'][n_task - 1]
        new_loss += self.vars['ce_losses'][n_task]

    self.vars['loss'] = new_loss
    self.vars['losses'][n_task] = new_loss
    self.vars['distances'][n_task] = self.setup_distances(n_task)

    orig_var_list = self.vars['orig_var_list']
    print("Trainable vars:")
    self.print_vars(orig_var_list)

    optimizer = self.objs['opt']
    if self.reset_opt:
        print('Reset opt')
        session.run(tf.variables_initializer(optimizer.variables()))

    op = optimizer.minimize(new_loss, var_list=orig_var_list)
    self.vars['train_op'] = op
    self.vars['train_ops'][n_task] = op
    print('Updated train_op and loss')
def __init__(self, n_inputs, n_outputs, n_hiddens, act_fun,
             output_order='sequential', mode='sequential',
             input=None, output=None):
    """Build the graph of a conditional masked Gaussian network.

    :param n_inputs: number of (conditional) inputs
    :param n_outputs: number of outputs
    :param n_hiddens: list with number of hidden units for each hidden layer
    :param act_fun: tensorflow activation function
    :param output_order: order of outputs
    :param mode: strategy for assigning degrees to hidden nodes: can be
        'random' or 'sequential'
    :param input: tensorflow placeholder to serve as input; if None, a new
        placeholder is created
    :param output: tensorflow placeholder to serve as output; if None, a
        new placeholder is created
    """
    # Remember the configuration.
    self.n_inputs = n_inputs
    self.n_outputs = n_outputs
    self.n_hiddens = n_hiddens
    self.act_fun = act_fun
    self.mode = mode

    # Degrees, masks and trainable parameters.
    degrees = create_degrees(n_outputs, n_hiddens, output_order, mode)
    masks, out_mask = create_masks(degrees)
    (w_cond, weights, biases,
     w_mean, b_mean, w_logp, b_logp) = create_weights_conditional(
         n_inputs, n_outputs, n_hiddens, None)
    self.parms = [w_cond] + weights + biases + [w_mean, b_mean,
                                                w_logp, b_logp]
    self.output_order = degrees[0]

    activation = self.act_fun

    # Conditioning input and modelled output; reuse the caller's tensors
    # when they are supplied.
    if input is None:
        self.input = tf.placeholder(dtype=dtype,
                                    shape=[None, n_inputs],
                                    name='x')
    else:
        self.input = input
    if output is None:
        self.y = tf.placeholder(dtype=dtype,
                                shape=[None, n_outputs],
                                name='y')
    else:
        self.y = output

    # First hidden layer sees both the conditioning input and the masked
    # output; the remaining layers are masked dense layers.
    pre_activation = (tf.matmul(self.input, w_cond)
                      + tf.matmul(self.y, masks[0] * weights[0])
                      + biases[0])
    hidden = activation(pre_activation, name='h1')
    deeper = zip(masks[1:], weights[1:], biases[1:])
    for layer_no, (mask, weight, bias) in enumerate(deeper, start=2):
        hidden = activation(tf.matmul(hidden, mask * weight) + bias,
                            name='h' + str(layer_no))

    # Output means and log precisions.
    self.m = tf.add(tf.matmul(hidden, out_mask * w_mean), b_mean, name='m')
    self.logp = tf.add(tf.matmul(hidden, out_mask * w_logp), b_logp,
                       name='logp')

    # Standardised residuals ("random numbers driving made").
    self.u = tf.exp(0.5 * self.logp) * (self.y - self.m)

    # Per-example log likelihoods, kept as a column (keepdims=True).
    residual_term = tf.reduce_sum(self.u ** 2 - self.logp,
                                  axis=1,
                                  keepdims=True)
    log_norm = n_outputs * np.log(2 * np.pi)
    self.L = tf.multiply(-0.5, log_norm + residual_term, name='L')

    # Training objective: negative mean log likelihood.
    self.trn_loss = -tf.reduce_mean(self.L, name='trn_loss')
def implicit_quantile_network(num_actions, quantile_embedding_dim,
                              network_type, state, num_quantiles):
    """Build the Implicit Quantile convolutional network.

    The state goes through three convolutions and is flattened; uniformly
    sampled quantiles are embedded with a cosine basis followed by a fully
    connected layer; the two are multiplied elementwise and mapped to one
    value per action.

    Args:
      num_actions: int, number of actions.
      quantile_embedding_dim: int, embedding dimension for the quantile
        input.
      network_type: namedtuple, collection of expected values to return.
      state: `tf.Tensor`, contains the agent's current state.
      num_quantiles: int, number of quantile inputs.

    Returns:
      A `network_type` instance with fields `quantile_values` and
      `quantiles`.
    """
    initializer = contrib_slim.variance_scaling_initializer(
        factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True)

    # State torso: scale by 1/255, then three convolutions.
    features = tf.div(tf.cast(state, tf.float32), 255.)
    for depth, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
        features = contrib_slim.conv2d(features,
                                       depth, [kernel, kernel],
                                       stride=stride,
                                       weights_initializer=initializer)
    features = contrib_slim.flatten(features)
    static_shape = features.get_shape().as_list()
    feature_dim = static_shape[-1]
    tiled_features = tf.tile(features, [num_quantiles, 1])

    # The static batch dimension is used directly, so it must be known.
    batch_size = static_shape[0]
    quantiles = tf.random_uniform([num_quantiles * batch_size, 1],
                                  minval=0,
                                  maxval=1,
                                  dtype=tf.float32)

    # Cosine embedding of the sampled quantiles.
    embedding = tf.tile(quantiles, [1, quantile_embedding_dim])
    pi = tf.constant(math.pi)
    harmonics = tf.cast(tf.range(1, quantile_embedding_dim + 1, 1),
                        tf.float32)
    embedding = tf.cos(harmonics * pi * embedding)
    embedding = contrib_slim.fully_connected(
        embedding, feature_dim, weights_initializer=initializer)

    # Hadamard product.
    merged = tf.multiply(tiled_features, embedding)
    merged = contrib_slim.fully_connected(merged,
                                          512,
                                          weights_initializer=initializer)
    quantile_values = contrib_slim.fully_connected(
        merged,
        num_actions,
        activation_fn=None,
        weights_initializer=initializer)

    return network_type(quantile_values=quantile_values,
                        quantiles=quantiles)
def attention_layer(from_tensor,
                    to_tensor,
                    layer_idx,
                    total_layers,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    num_partitions=1):
    """Multi-headed attention from `from_tensor` to `to_tensor`.

    `from_tensor` is projected to queries and `to_tensor` to keys and
    values.  Queries are scaled by 1/sqrt(size_per_head) and combined with
    the keys to give raw scores; masked positions receive -10000.0.  The
    softmax is computed by hand (subtract max, exponentiate) and its
    normalisation is applied after the value contraction rather than
    before it.  Dropout is likewise split: the keep mask is applied to the
    un-normalised probabilities and the 1/keep_prob scaling is applied to
    the final context.

    Shape letters used below:
      B = batch size, F = `from_tensor` sequence length,
      T = `to_tensor` sequence length, N = `num_attention_heads`,
      H = `size_per_head`.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length,
        to_width].
      layer_idx: the index of the current layer.
      total_layers: total number of layers.
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length] holding 1 (attend) or 0 (masked).
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability
        of the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. Required when the inputs are rank 2.
      from_seq_length: (Optional) Required when the inputs are rank 2.
      to_seq_length: (Optional) Required when the inputs are rank 2.
      num_partitions: (optional) Number of SPMD partitions.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads, size_per_head].

    Raises:
      ValueError: if the ranks of the two inputs differ, or the inputs are
        rank 2 and any of the three size arguments is missing.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    rank = len(from_shape)

    if rank != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if rank == 3:
        batch_size, from_seq_length = from_shape[0], from_shape[1]
        to_seq_length = to_shape[1]
    elif rank == 2:
        required = (batch_size, from_seq_length, to_seq_length)
        if any(dim is None for dim in required):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    def project(tensor, activation, name):
        """Per-head dense projection of `tensor`."""
        return dense_layer_3d(tensor,
                              layer_idx,
                              total_layers,
                              num_attention_heads,
                              size_per_head,
                              create_initializer(initializer_range),
                              activation,
                              name=name)

    def shard_heads(tensor):
        """Partition `tensor` along the heads dimension."""
        return xla_sharding.split(tensor,
                                  2,
                                  num_partitions,
                                  use_sharding_op=True)

    # `query_layer` = [B, F, N, H]; `key_layer`, `value_layer` = [B, T, N, H]
    query_layer = project(from_tensor, query_act, "query")
    key_layer = project(to_tensor, key_act, "key")
    value_layer = project(to_tensor, value_act, "value")

    if num_partitions > 1:
        query_layer, key_layer, value_layer = [
            shard_heads(layer)
            for layer in (query_layer, key_layer, value_layer)
        ]

    # Fold the 1/sqrt(H) scaling into the queries.
    query_layer = tf.multiply(query_layer,
                              1.0 / math.sqrt(float(size_per_head)))

    # Raw attention scores, [B, N, F, T].
    attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_layer, query_layer)

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # 0.0 where attention is allowed, -10000.0 where it is masked;
        # added before the exponentiation so masked entries vanish.
        mask_as_float = tf.cast(attention_mask, attention_scores.dtype)
        attention_scores += (1.0 - mask_as_float) * -10000.0

    # Hand-rolled softmax in float32; the division by the row sum is
    # deferred until after the context contraction.
    attention_scores = tf.cast(attention_scores, tf.float32)
    row_max = tf.stop_gradient(tf.reduce_max(attention_scores, -1, True))
    attention_scores = tf.exp(attention_scores - row_max)
    attention_sum = tf.reduce_sum(attention_scores, -1, True)
    attention_probs = tf.cast(attention_scores, key_layer.dtype)

    # Dropout keep mask only (this drops whole tokens to attend to); the
    # matching rescale happens at the end of the function.
    random_u = tf.random_uniform(attention_probs.shape, dtype=tf.bfloat16)
    keep_mask = tf.cast(random_u >= attention_probs_dropout_prob,
                        dtype=attention_probs.dtype)
    attention_probs = tf.multiply(keep_mask, attention_probs)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs,
                              value_layer)
    # Apply the deferred softmax normalisation ([B, N, F, 1] -> [B, F, N, 1]).
    normaliser = tf.cast(tf.transpose(attention_sum, [0, 2, 1, 3]),
                         context_layer.dtype)
    context_layer = context_layer / normaliser

    if num_partitions > 1:
        context_layer = shard_heads(context_layer)

    # Dropout rescale, moved here from the mask step to save mul ops.
    # TODO(yuemmawang) automate this optimization in xla
    dropout_scale = 1 / (1 - attention_probs_dropout_prob)
    return tf.multiply(context_layer, dropout_scale)
def build_model(batch, seq_len, vocab_size, d_model, head):
    """Build a single multi-head self-attention block with a binary head.

    The graph takes a pre-embedded input (no embedding lookup is done),
    optionally adds a static positional encoding, applies scaled
    dot-product self-attention with a padding mask, projects the result
    back to `d_model`, and classifies from the first token's hidden state.

    Args:
        batch: static batch size.
        seq_len: static sequence length.
        vocab_size: unused by this function; kept for interface
            compatibility.
        d_model: model width (last dimension of the input).
        head: number of attention heads. Each head gets
            `d_model // head` units.

    Returns:
        Tuple `(input_tensor, mask_tensor, prediction_tensor,
        disagreement_cost, logprob_tensor)`.  `prediction_tensor` is the
        raw logit; `logprob_tensor` is `sigmoid(prediction_tensor)` (a
        probability, despite the name).
    """
    input_tensor = tf.placeholder(shape=(batch, seq_len, d_model),
                                  dtype=tf.int32)
    mask_tensor = tf.placeholder(shape=(batch, seq_len), dtype=tf.float32)

    # We are not using embedding here
    input_ids = tf.cast(input_tensor, tf.float32)

    # Add positional encoding. We use static positional encoding here.
    if USE_POSITIONAL_ENCODING:
        pos_enc = generate_position_embedding(input_len=seq_len,
                                              d_model=d_model)
        pos_enc = tf.constant(pos_enc, dtype=tf.float32)
        input_ids = input_ids + pos_enc

    # Convert input to 2D tensor
    input_batch = tf.reshape(input_ids, (-1, d_model))

    # Transform input to Q, K and V tensor.  Integer division avoids the
    # float round-trip of int(d_model / head).
    size_per_head = d_model // head
    attention_width = size_per_head * head
    K = tf.layers.dense(input_batch, attention_width, name='K')
    Q = tf.layers.dense(input_batch, attention_width, name='Q')
    V = tf.layers.dense(input_batch, attention_width, name='V')

    # [Batch, Head, Len, Size_per_Head]
    K = transpose_for_scores(K, batch, head, seq_len, size_per_head)
    Q = transpose_for_scores(Q, batch, head, seq_len, size_per_head)
    V = transpose_for_scores(V, batch, head, seq_len, size_per_head)

    # Scaled Dot-Product attention [Batch, Head, Len-Q, Len-K]
    attention_scores = tf.matmul(Q, K, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    # Generate attention mask to prevent attention to padding tokens
    to_mask = tf.reshape(mask_tensor, [batch, 1, seq_len])
    broadcast_ones = tf.ones(shape=[batch, seq_len, 1], dtype=tf.float32)
    # Attention mask [Batch, Len, Len]
    attention_mask = broadcast_ones * to_mask
    # `attention_mask` = [Batch, 1, Len, Len]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])
    # Add -10000.0 to the scores of padding tokens
    adder = (1.0 - attention_mask) * -10000.0
    attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)

    # `context_layer` = [Batch, Head, Len-Q, Size_per_Head]
    context_layer = tf.matmul(attention_probs, V)

    # `context_layer` = [Batch, Len-Q, Head, Size_per_Head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    # Also calculate cost of attention head output difference here.
    disagreement_cost = get_attention_heads_disagreement_cost(context_layer)

    # `output_tensor` = [Batch x Len-Q, Head x Size_per_Head]
    output_tensor = tf.reshape(context_layer,
                               [batch * seq_len, attention_width])

    # Final linear projection. Note that this weight has permutation set
    # divided by row instead of column as in K/Q/V
    output_tensor = tf.layers.dense(output_tensor, d_model, name='output')

    # `output_tensor` = [Batch, Len-Q, D_Model].  The dense layer above
    # emits d_model columns, so reshape with d_model; using
    # head * size_per_head here fails when d_model % head != 0.
    output_tensor = tf.reshape(output_tensor, [batch, seq_len, d_model])

    # Pooled output is the hidden state of the 1st token
    pooled_output_tensor = output_tensor[:, 0]

    # Add binary classification layers
    prediction_tensor = tf.layers.dense(pooled_output_tensor,
                                        1,
                                        name='prediction')
    logprob_tensor = tf.nn.sigmoid(prediction_tensor, name='sigmoid')

    return (input_tensor, mask_tensor, prediction_tensor, disagreement_cost,
            logprob_tensor)
def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                         fact_embedding, fact_lengths,
                                         margin, num_negative_samples):
    """Compute knowledge selection and loss.

    Args:
      features: dict of input tensors. Reads "inputs", "subject_mask",
        "predicate_mask", "object_mask" and, outside PREDICT mode,
        "triple_labels".
      encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
      fact_embedding: <tf.float32>[batch_size*triple_num, max_triple_length,
        emb_dim]
      fact_lengths: <tf.int32>[batch_size*triple_num]
      margin: integer value for max margin in TransE loss.
      num_negative_samples: number of shuffled negative examples drawn for
        the TransE loss.

    Returns:
      A 4-tuple `(triple_logits, avg_triple_loss,
      original_knowledge_encoder_output, transe_loss)`:
        triple_logits: per-triple selection logits.
        avg_triple_loss: mean weighted cross-entropy over the triples, or
          0.0 in PREDICT mode.
        original_knowledge_encoder_output: averaged fact embeddings
          reshaped to [batch_size, triple_num, hidden_dim].
        transe_loss: `margin + positive_loss - negative_loss`, clipped to
          [0, 100].

    Raises:
      ValueError: if `hparams.similarity_fuction` is neither "dot_product"
        nor "bilinear".
    """
    hparams = self._hparams
    encoder_hidden_dim = common_layers.shape_list(encoder_output)[-1]

    # <tf.float32>[batch_size, input_length, emb_dim]
    inputs = tf.squeeze(features["inputs"], 2)
    # <tf.float32>[batch_size, input_length]
    context_padding = common_attention.embedding_to_padding(inputs)
    # <tf.float32>[batch_size]
    context_lens = tf.to_float(
        common_attention.padding_to_length(context_padding))
    # <tf.float32>[batch_size, 1]
    context_lens = tf.expand_dims(context_lens, -1)

    # Compute context vector summary.
    # <tf.float32>[batch_size, hidden_dim]
    context_vector_summary = compute_summary_embedding(
        encoder_output, context_lens, hparams)
    knowledge_encoder_output = compute_average_embedding(
        fact_embedding, fact_lengths)
    # <tf.float32>[batch_size, triple_num, emb_dim]
    knowledge_encoder_output = tf.reshape(
        knowledge_encoder_output,
        [-1, self.triple_num, encoder_hidden_dim])
    original_knowledge_encoder_output = knowledge_encoder_output

    similarity = hparams.similarity_fuction
    if similarity == "dot_product":
        triple_logits = tf.squeeze(
            tf.matmul(knowledge_encoder_output,
                      tf.expand_dims(context_vector_summary, 2)), -1)
    elif similarity == "bilinear":
        # Tile the context vector summary.
        # <tf.float32>[batch_size, triple_num*hidden_dim]
        tiled_context_vector = tf.tile(context_vector_summary,
                                       [1, self.triple_num])
        # <tf.float32>[batch_size, triple_num, hidden_dim]
        context_vector = tf.reshape(
            tiled_context_vector,
            [-1, self.triple_num, encoder_hidden_dim])
        # compute outer product
        context_vector = tf.expand_dims(context_vector, -1)
        knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output,
                                                  2)
        # <tf.float32>[batch_size, triple_num, hidden_dim, hidden_dim]
        outer_product = tf.matmul(context_vector, knowledge_encoder_output)
        outer_product = tf.reshape(
            outer_product,
            [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
        # NOTE: the layer name (including its spelling) is part of the
        # variable names, so it is left untouched.
        triple_logits = tf.squeeze(
            tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"),
            -1)
    else:
        # Previously this fell through and left `triple_logits` unbound.
        raise ValueError(
            "Unsupported similarity function: %r (expected 'dot_product' "
            "or 'bilinear')." % (similarity,))

    def flat_mask(key):
        """Reshape features[key] to [batch_size*triple_num, max_len]."""
        mask = tf.reshape(
            features[key],
            [-1, self.triple_num, hparams.max_triple_length])
        return tf.reshape(mask, [-1, hparams.max_triple_length])

    def token_count(mask):
        """Number of active positions per row, as a float column."""
        return tf.cast(tf.expand_dims(tf.reduce_sum(mask, -1), 1),
                       tf.float32)

    def masked_mean(mask, length):
        """Mean of the fact embeddings over the positions kept by mask."""
        # The small constant guards against division by zero for rows
        # whose mask is empty.
        return tf.reduce_sum(tf.multiply(fact_embedding, mask),
                             1) / (length + 1e-5)

    subject_mask = flat_mask("subject_mask")
    predicate_mask = flat_mask("predicate_mask")
    object_mask = flat_mask("object_mask")

    subject_length = token_count(subject_mask)
    object_length = token_count(object_mask)
    predicate_length = token_count(predicate_mask)

    # expand dimension 2 to be able to broadcast
    subject_mask = tf.cast(tf.expand_dims(subject_mask, 2), tf.float32)
    predicate_mask = tf.cast(tf.expand_dims(predicate_mask, 2), tf.float32)
    object_mask = tf.cast(tf.expand_dims(object_mask, 2), tf.float32)

    # Each is [bs*tn, d].
    subject_vect = masked_mean(subject_mask, subject_length)
    object_vect = masked_mean(object_mask, object_length)
    predicate_vect = masked_mean(predicate_mask, predicate_length)

    # Shuffled rows to generate adversarial samples
    shuffled_subject_vect = []
    shuffled_object_vect = []
    for _ in range(num_negative_samples):
        shuffled_subject_vect.append(
            tf.gather(
                subject_vect,
                tf.random.shuffle(tf.range(tf.shape(subject_vect)[0]))))
        shuffled_object_vect.append(
            tf.gather(
                object_vect,
                tf.random.shuffle(tf.range(tf.shape(object_vect)[0]))))

    # KB pretraining loss
    positive_loss = tf.reduce_mean(
        tf.squared_difference(subject_vect + predicate_vect, object_vect))
    negative_loss = 0
    for bad_subject, bad_object in zip(shuffled_subject_vect,
                                       shuffled_object_vect):
        negative_loss += tf.reduce_mean(
            tf.squared_difference(bad_subject + predicate_vect,
                                  object_vect))
        negative_loss += tf.reduce_mean(
            tf.squared_difference(subject_vect + predicate_vect,
                                  bad_object))

    # TransE Loss
    negative_loss = negative_loss / (2 * num_negative_samples)
    transe_loss = tf.clip_by_value(margin + positive_loss - negative_loss,
                                   clip_value_min=0,
                                   clip_value_max=100)

    avg_triple_loss = 0.0
    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
        # Labels are only needed (and only looked up) when a loss is built.
        triple_labels = features["triple_labels"]
        triple_losses = tf.nn.weighted_cross_entropy_with_logits(
            labels=triple_labels,
            logits=triple_logits,
            pos_weight=hparams.pos_weight)
        avg_triple_loss = tf.reduce_mean(triple_losses)
        tf.summary.scalar("triple_loss", avg_triple_loss)

    return (triple_logits, avg_triple_loss,
            original_knowledge_encoder_output, transe_loss)
def build_model(self, hps):
    """Define model architecture.

    Builds the decoder RNN (and, when `hps.conditional` is set, the
    encoder cells and latent sampling), the mixture-density output layer,
    the reconstruction and KL costs and, in training mode, the optimizer
    with value-clipped gradients.

    Args:
        hps: hyper-parameter object. The code reads both `hps` and
            `self.hps`; they are presumably the same object — confirm
            against the caller.

    Raises:
        ValueError: if `hps.dec_model` or `hps.enc_model` is not one of
            'lstm', 'layer_norm' or 'hyper'.
    """

    def select_cell(model_name):
        """Map a cell-type name to its RNN cell class."""
        if model_name == 'lstm':
            return rnn.LSTMCell
        if model_name == 'layer_norm':
            return rnn.LayerNormLSTMCell
        if model_name == 'hyper':
            return rnn.HyperLSTMCell
        # Raised explicitly: an `assert` would be stripped under -O.
        raise ValueError('please choose a respectable cell')

    if hps.is_training:
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)

    cell_fn = select_cell(hps.dec_model)
    enc_cell_fn = select_cell(hps.enc_model)

    use_recurrent_dropout = self.hps.use_recurrent_dropout
    use_input_dropout = self.hps.use_input_dropout
    use_output_dropout = self.hps.use_output_dropout

    cell = cell_fn(hps.dec_rnn_size,
                   use_recurrent_dropout=use_recurrent_dropout,
                   dropout_keep_prob=self.hps.recurrent_dropout_prob)

    if hps.conditional:  # vae mode:
        # Forward and backward encoder cells share one configuration for
        # every cell type.
        self.enc_cell_fw = enc_cell_fn(
            hps.enc_rnn_size,
            use_recurrent_dropout=use_recurrent_dropout,
            dropout_keep_prob=self.hps.recurrent_dropout_prob)
        self.enc_cell_bw = enc_cell_fn(
            hps.enc_rnn_size,
            use_recurrent_dropout=use_recurrent_dropout,
            dropout_keep_prob=self.hps.recurrent_dropout_prob)

    # dropout:
    tf.logging.info('Input dropout mode = %s.', use_input_dropout)
    tf.logging.info('Output dropout mode = %s.', use_output_dropout)
    tf.logging.info('Recurrent dropout mode = %s.', use_recurrent_dropout)
    if use_input_dropout:
        tf.logging.info('Dropout to input w/ keep_prob = %4.4f.',
                        self.hps.input_dropout_prob)
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell, input_keep_prob=self.hps.input_dropout_prob)
    if use_output_dropout:
        tf.logging.info('Dropout to output w/ keep_prob = %4.4f.',
                        self.hps.output_dropout_prob)
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell, output_keep_prob=self.hps.output_dropout_prob)
    self.cell = cell

    self.sequence_lengths = tf.placeholder(dtype=tf.int32,
                                           shape=[self.hps.batch_size])
    self.input_data = tf.placeholder(
        dtype=tf.float32,
        shape=[self.hps.batch_size, self.hps.max_seq_len + 1, 5])

    # The target/expected vectors of strokes
    self.output_x = self.input_data[:, 1:self.hps.max_seq_len + 1, :]
    # vectors of strokes to be fed to decoder (same as above, but lagged
    # behind one step to include initial dummy value of (0, 0, 1, 0, 0))
    self.input_x = self.input_data[:, :self.hps.max_seq_len, :]

    # either do vae-bit and get z, or do unconditional, decoder-only
    if hps.conditional:  # vae mode:
        self.mean, self.presig = self.encoder(self.output_x,
                                              self.sequence_lengths)
        # sigma > 0. div 2.0 -> sqrt.
        self.sigma = tf.exp(self.presig / 2.0)
        eps = tf.random_normal((self.hps.batch_size, self.hps.z_size),
                               0.0,
                               self.hps.scale,
                               dtype=tf.float32)
        self.batch_z = self.mean + tf.multiply(self.sigma, eps)
        # KL cost, floored at kl_tolerance.
        self.kl_cost = -0.5 * tf.reduce_mean(
            (1 + self.presig - tf.square(self.mean)
             - tf.exp(self.presig)))
        self.kl_cost = tf.maximum(self.kl_cost, self.hps.kl_tolerance)
        # Append z to the decoder input at every time step.
        pre_tile_y = tf.reshape(
            self.batch_z, [self.hps.batch_size, 1, self.hps.z_size])
        overlay_x = tf.tile(pre_tile_y, [1, self.hps.max_seq_len, 1])
        actual_input_x = tf.concat([self.input_x, overlay_x], 2)
        self.initial_state = tf.nn.tanh(
            rnn.super_linear(self.batch_z,
                             cell.state_size,
                             init_w='gaussian',
                             weight_start=0.001,
                             input_size=self.hps.z_size))
    else:  # unconditional, decoder-only generation
        self.batch_z = tf.zeros((self.hps.batch_size, self.hps.z_size),
                                dtype=tf.float32)
        self.kl_cost = tf.zeros([], dtype=tf.float32)
        actual_input_x = self.input_x
        self.initial_state = cell.zero_state(batch_size=hps.batch_size,
                                             dtype=tf.float32)

    self.num_mixture = hps.num_mixture

    # TODO(deck): Better understand this comment.
    # Number of outputs is 3 (one logit per pen state) plus 6 per mixture
    # component: mean_x, stdev_x, mean_y, stdev_y, correlation_xy, and the
    # mixture weight/probability (Pi_k)
    n_out = (3 + self.num_mixture * 6)

    with tf.variable_scope('RNN'):
        output_w = tf.get_variable('output_w',
                                   [self.hps.dec_rnn_size, n_out])
        output_b = tf.get_variable('output_b', [n_out])

    # decoder module of sketch-rnn is below
    output, last_state = tf.nn.dynamic_rnn(
        cell,
        actual_input_x,
        initial_state=self.initial_state,
        time_major=False,
        swap_memory=True,
        dtype=tf.float32,
        scope='RNN')

    output = tf.reshape(output, [-1, hps.dec_rnn_size])
    output = tf.nn.xw_plus_b(output, output_w, output_b)
    self.final_state = last_state

    # NB: the below are inner functions, not methods of Model
    def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho):
        """Returns result of eq # 24 of http://arxiv.org/abs/1308.0850."""
        norm1 = tf.subtract(x1, mu1)
        norm2 = tf.subtract(x2, mu2)
        s1s2 = tf.multiply(s1, s2)
        # eq 25
        z = (tf.square(tf.div(norm1, s1)) + tf.square(tf.div(norm2, s2))
             - 2 * tf.div(tf.multiply(rho, tf.multiply(norm1, norm2)),
                          s1s2))
        neg_rho = 1 - tf.square(rho)
        result = tf.exp(tf.div(-z, 2 * neg_rho))
        denom = 2 * np.pi * tf.multiply(s1s2, tf.sqrt(neg_rho))
        result = tf.div(result, denom)
        return result

    def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr,
                     z_pen_logits, x1_data, x2_data, pen_data):
        """Returns a loss fn based on eq #26 of http://arxiv.org/abs/1308.0850."""
        # This represents the L_R only (i.e. does not include the KL loss
        # term).
        result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2, z_sigma1,
                               z_sigma2, z_corr)
        epsilon = 1e-6
        # result1 is the loss wrt pen offset (L_s in equation 9 of
        # https://arxiv.org/pdf/1704.03477.pdf)
        result1 = tf.multiply(result0, z_pi)
        result1 = tf.reduce_sum(result1, 1, keep_dims=True)
        result1 = -tf.log(result1 + epsilon)  # avoid log(0)

        fs = 1.0 - pen_data[:, 2]  # use training data for this
        fs = tf.reshape(fs, [-1, 1])
        # Zero out loss terms beyond N_s, the last actual stroke
        result1 = tf.multiply(result1, fs)

        # result2: loss wrt pen state, (L_p in equation 9)
        result2 = tf.nn.softmax_cross_entropy_with_logits(
            labels=pen_data, logits=z_pen_logits)
        result2 = tf.reshape(result2, [-1, 1])
        if not self.hps.is_training:  # eval mode, mask eos columns
            result2 = tf.multiply(result2, fs)

        result = result1 + result2
        return result

    # below is where we need to do MDN (Mixture Density Network) splitting
    # of distribution params
    def get_mixture_coef(output):
        """Returns the tf slices containing mdn dist params."""
        # This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850.
        z = output
        z_pen_logits = z[:, 0:3]  # pen states
        z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(
            z[:, 3:], 6, 1)

        # process output z's into MDN parameters

        # softmax all the pi's and pen states:
        z_pi = tf.nn.softmax(z_pi)
        z_pen = tf.nn.softmax(z_pen_logits)

        # exponentiate the sigmas and also make corr between -1 and 1.
        z_sigma1 = tf.exp(z_sigma1)
        z_sigma2 = tf.exp(z_sigma2)
        z_corr = tf.tanh(z_corr)

        r = [
            z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen,
            z_pen_logits
        ]
        return r

    out = get_mixture_coef(output)
    [o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_pen,
     o_pen_logits] = out

    self.pi = o_pi
    self.mu1 = o_mu1
    self.mu2 = o_mu2
    self.sigma1 = o_sigma1
    self.sigma2 = o_sigma2
    self.corr = o_corr
    self.pen_logits = o_pen_logits
    # pen state probabilities (result of applying softmax to
    # self.pen_logits)
    self.pen = o_pen

    # reshape target data so that it is compatible with prediction shape
    target = tf.reshape(self.output_x, [-1, 5])
    [x1_data, x2_data, eos_data, eoc_data,
     cont_data] = tf.split(target, 5, 1)
    pen_data = tf.concat([eos_data, eoc_data, cont_data], 1)

    lossfunc = get_lossfunc(o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr,
                            o_pen_logits, x1_data, x2_data, pen_data)

    self.r_cost = tf.reduce_mean(lossfunc)

    if self.hps.is_training:
        self.lr = tf.Variable(self.hps.learning_rate, trainable=False)
        optimizer = tf.train.AdamOptimizer(self.lr)

        self.kl_weight = tf.Variable(self.hps.kl_weight_start,
                                     trainable=False)
        self.cost = self.r_cost + self.kl_cost * self.kl_weight

        gvs = optimizer.compute_gradients(self.cost)
        g = self.hps.grad_clip
        # compute_gradients yields (None, var) for variables the cost does
        # not depend on; clipping None would raise, so skip those pairs.
        capped_gvs = [(tf.clip_by_value(grad, -g, g), var)
                      for grad, var in gvs
                      if grad is not None]
        self.train_op = optimizer.apply_gradients(
            capped_gvs, global_step=self.global_step, name='train_step')
def recenter(rv_constructor, *rv_args, **rv_kwargs):
    """Interceptor that swaps selected Gaussian RVs for reparameterised ones.

    The branch taken depends on ``rv_constructor.__name__``:

    * ``Normal`` whose name does not start with ``'y'``: a "standardised"
      Normal is built from parameters returned by ``get_or_init`` and mapped
      back through a ``tfb.AffineScalar`` bijector.
    * ``MultivariateNormal*`` / ``GaussianProcess*`` whose name does not start
      with ``'y'``: same idea, with the scale built according to the
      enclosing-scope ``parameterisation_type`` ('eig', 'chol', 'indep' or
      'eigindep') and mapped back through a ``tfb.AffineLinearOperator``.
    * Anything else is forwarded to ``interceptable(rv_constructor)``.

    A ``TransformedDistribution`` whose bijector is ``Invert(SoftClip)`` is
    first unwrapped to its base distribution; for the Normal branch the clip
    is re-applied to the standardised variable.

    In the two reparameterised branches the bijector is stored in
    ``bijectors[name]``; the Normal branch additionally stores the prior
    loc/scale in ``learnable_parameters``.

    Relies on names from the enclosing scope: ``get_or_init``,
    ``learnable_parameters``, ``bijectors``, ``parameterisation_type``,
    ``tied_pparams``, ``FLAGS``, ``interceptable``, ``edward2``, ``tfb``,
    ``eigh_with_safe_gradient``, ``LinearOperatorEigenScale`` and
    ``LinearOperatorOrthogonal``.

    Args:
      rv_constructor: The random-variable constructor being intercepted.
      *rv_args: Positional arguments for the constructor.
      **rv_kwargs: Keyword arguments for the constructor. ``name`` is
        required by the reparameterised branches; ``value`` is optional.

    Returns:
      ``bijector.forward(rv_std)`` for reparameterised variables, otherwise
      the result of ``interceptable(rv_constructor)(*rv_args, **rv_kwargs)``.

    Raises:
      Exception: In the MVN/GP branch, if ``parameterisation_type`` is not
        one of the four recognised strategies.
    """
    rv_name = rv_kwargs.get('name')
    # `value` is removed from the kwargs here, so every branch below has to
    # pass it on explicitly (after mapping it through the right bijector).
    rv_value = rv_kwargs.pop('value', None)

    base_bijector = None
    if rv_constructor.__name__ == 'TransformedDistribution':
        if (rv_args[1].__class__.__name__ == 'Invert'
                and rv_args[1].bijector.__class__.__name__ == 'SoftClip'):
            distribution = rv_args[0]
            base_bijector = rv_args[1].bijector
            rv_constructor = distribution.__class__
            rv_kwargs = distribution.parameters
            rv_args = rv_args[2:]
            # We were given a value for the transformed RV. Let's pretend it
            # was for the original.
            if rv_value is not None:
                rv_value = base_bijector.forward(rv_value)

    if rv_constructor.__name__ == 'Normal' and not rv_name.startswith('y'):
        # NB: assume everything is kwargs for now.
        x_loc = rv_kwargs['loc']
        x_scale = rv_kwargs['scale']
        name = rv_kwargs['name']

        a, b, _ = get_or_init(name,
                              loc_shape=tf.shape(x_loc),
                              scale_shape=tf.shape(x_scale),
                              parameterisation_type='scalar')

        kwargs_std = {}
        kwargs_std['loc'] = tf.multiply(x_loc, a)
        kwargs_std['scale'] = tf.pow(x_scale, b)
        kwargs_std['name'] = name

        # scale is x_scale ** (1 - b) by construction of kwargs_std['scale'].
        scale = x_scale / kwargs_std['scale']
        shift = x_loc - tf.multiply(scale, kwargs_std['loc'])
        bijector = tfb.AffineScalar(scale=scale, shift=shift)
        if rv_value is not None:
            rv_value = bijector.inverse(rv_value)

        learnable_parameters[name + '_prior_mean'] = tf.convert_to_tensor(
            x_loc)
        learnable_parameters[name + '_prior_scale'] = tf.convert_to_tensor(
            x_scale)

        # If original RV was constrained, transform the constraint to the new
        # standardized RV. For now we assume a double-sided constraint.
        if base_bijector is not None:
            constraint_std = tfb.SoftClip(
                low=bijector.inverse(base_bijector.low),
                high=bijector.inverse(base_bijector.high),
                hinge_softness=(base_bijector.hinge_softness / scale
                                if base_bijector.hinge_softness is not None
                                else None))
            rv_std = edward2.TransformedDistribution(
                rv_constructor(**kwargs_std),
                tfb.Invert(constraint_std),
                value=(constraint_std.inverse(rv_value)
                       if rv_value is not None else None))
            bijector = bijector(constraint_std)
        else:
            kwargs_std['value'] = rv_value
            rv_std = interceptable(rv_constructor)(*rv_args, **kwargs_std)

        bijectors[name] = bijector
        return bijector.forward(rv_std)

    if ((rv_constructor.__name__.startswith('MultivariateNormal')
         or rv_constructor.__name__.startswith('GaussianProcess'))
            and not rv_kwargs['name'].startswith('y')):
        name = rv_kwargs['name']

        if rv_constructor.__name__.startswith('GaussianProcess'):
            gp_dist = rv_constructor(*rv_args, **rv_kwargs).distribution
            X = gp_dist._get_index_points()
            x_loc = gp_dist.mean_fn(X)
            x_cov = gp_dist._compute_covariance(index_points=X)
        else:
            x_loc = rv_kwargs['loc']
            x_cov = rv_kwargs['covariance_matrix']

        a, b, c = get_or_init(name,
                              loc_shape=tf.shape(x_loc),
                              scale_shape=tf.shape(x_cov)[:-1],
                              parameterisation_type=parameterisation_type)
        ndims = tf.shape(x_cov)[-1]
        x_loc = tf.broadcast_to(x_loc, tf.shape(x_cov)[:-1])
        cov_dtype = tf.float64 if FLAGS.float64 else x_cov.dtype
        x_cov = tf.cast(x_cov, cov_dtype)

        # Each strategy below produces `reparam_loc`, `reparam_scale` and the
        # affine `bijector` that maps the standardised RV back.
        if parameterisation_type == 'eig':
            # Cost notes (the author's unfinished remarks, kept for context):
            # the eigendecomposition yields Lambda and Q; Lambda is rescaled
            # to build the prior's linear operator, so the prior here is an
            # MVN (albeit an efficient one) where plain NCP would use a
            # Normal. Building the remaining scale matrix needs an n**3
            # matmul, and unlike a Cholesky factor these matrices are not
            # triangular.
            Lambda, Q = eigh_with_safe_gradient(x_cov)
            Lambda = tf.abs(Lambda)
            Lambda = tf.cast(Lambda, tf.float32)
            Q = tf.cast(Q, tf.float32)
            Lambda_hat_b = tf.pow(Lambda, b)
            if tied_pparams:
                # If the scale parameterization is in the eigenbasis,
                # apply it to the mean in the same basis.
                loc_in_eigenbasis = tf.linalg.matvec(Q, x_loc, adjoint_a=True)
                reparam_loc = tf.linalg.matvec(
                    Q, tf.multiply(loc_in_eigenbasis, a))
            else:
                reparam_loc = tf.multiply(x_loc, a)
            reparam_scale = LinearOperatorEigenScale(
                Q, d=tf.sqrt(Lambda_hat_b))

            Q_linop = LinearOperatorOrthogonal(Q, det_is_positive=True)
            scale = tf.linalg.LinearOperatorComposition([
                Q_linop,
                tf.linalg.LinearOperatorDiag(tf.sqrt(Lambda + 1e-10)),
                tf.linalg.LinearOperatorDiag(
                    1. / tf.sqrt(Lambda_hat_b + 1e-10)),
                Q_linop.adjoint(),
            ])
            shift = x_loc - scale.matvec(reparam_loc)
            bijector = tfb.AffineLinearOperator(scale=scale, shift=shift)

        elif parameterisation_type == 'chol':
            # Jitter on the diagonal before the Cholesky factorisation.
            L = tf.linalg.cholesky(
                x_cov + 1e-6 * tf.eye(ndims, dtype=x_cov.dtype))
            L = tf.cast(L, tf.float32)
            reparam_loc = x_loc * a
            reparam_scale = tf.linalg.LinearOperatorLowerTriangular(
                tf.linalg.diag(1 - b) + b[..., tf.newaxis] * L)

            # Invert the dense reparameterised scale in `cov_dtype`, then go
            # back to float32 for the bijector.
            Dinv = tf.linalg.triangular_solve(
                tf.cast(reparam_scale.to_dense(), cov_dtype),
                tf.eye(ndims, dtype=cov_dtype))
            Dinv = tf.cast(Dinv, tf.float32)
            scale = tf.matmul(L, Dinv)
            shift = x_loc - tf.linalg.matvec(scale, reparam_loc)
            bijector = tfb.AffineLinearOperator(
                scale=tf.linalg.LinearOperatorFullMatrix(scale), shift=shift)

        elif parameterisation_type == 'indep':
            # Assumes `C^-1 = diag(c)` is a learned diagonal matrix of
            # 'evidence precisions'. This approximates the true posterior
            # under an iid Gaussian observation model:
            prior_chol = tf.linalg.cholesky(x_cov)
            prior_inv = tf.linalg.cholesky_solve(
                prior_chol, tf.eye(ndims, dtype=prior_chol.dtype))
            approx_posterior_prec = prior_inv + tf.cast(
                tf.linalg.diag(c), prior_inv.dtype)
            approx_posterior_prec_chol = tf.linalg.cholesky(
                approx_posterior_prec)
            approx_posterior_cov = tf.linalg.cholesky_solve(
                approx_posterior_prec_chol,
                tf.eye(ndims, dtype=approx_posterior_prec_chol.dtype))
            cov_chol = tf.linalg.cholesky(approx_posterior_cov)
            cov_chol = tf.cast(cov_chol, tf.float32)
            prior_chol = tf.cast(prior_chol, tf.float32)

            scale_linop = tf.linalg.LinearOperatorLowerTriangular(cov_chol)
            reparam_loc = x_loc * a
            reparam_scale = tf.linalg.LinearOperatorComposition([
                tf.linalg.LinearOperatorInversion(scale_linop),
                tf.linalg.LinearOperatorLowerTriangular(prior_chol),
            ])
            shift = x_loc - scale_linop.matvec(reparam_loc)
            bijector = tfb.AffineLinearOperator(scale=scale_linop,
                                                shift=shift)

        elif parameterisation_type == 'eigindep':
            # Combines 'eig' and 'indep' parameterizations, modeling the
            # posterior as
            #   (V D**(-b) V' + diag(c))^-1
            # where VDV' is the eigendecomposition of the prior cov, and b
            # and c are learned vectors.
            b, c = [tf.cast(x, cov_dtype) for x in (b, c)]
            Lambda, Q = eigh_with_safe_gradient(x_cov)
            Lambda = tf.abs(Lambda)
            Lambda_hat_b = 1e-6 + tf.pow(Lambda, b)
            prior = tf.matmul(
                Q, tf.matmul(tf.linalg.diag(Lambda_hat_b), Q, adjoint_b=True))
            prior_chol = tf.linalg.cholesky(
                prior + 1e-6 * tf.eye(ndims, dtype=prior.dtype))
            prior_prec = tf.linalg.cholesky_solve(
                prior_chol + 1e-6 * tf.eye(ndims, dtype=prior_chol.dtype),
                tf.eye(ndims, dtype=prior_chol.dtype))
            approx_posterior_prec = prior_prec + tf.linalg.diag(c)
            approx_posterior_prec_chol = tf.linalg.cholesky(
                approx_posterior_prec)
            approx_posterior_cov = tf.linalg.cholesky_solve(
                approx_posterior_prec_chol + 1e-6 * tf.eye(
                    ndims, dtype=approx_posterior_prec_chol.dtype),
                tf.eye(ndims, dtype=approx_posterior_prec_chol.dtype))
            cov_chol = tf.linalg.cholesky(
                approx_posterior_cov +
                1e-6 * tf.eye(ndims, dtype=approx_posterior_cov.dtype))
            cov_chol = tf.cast(cov_chol, tf.float32)
            prior_chol = tf.cast(prior_chol, tf.float32)

            scale_linop = tf.linalg.LinearOperatorLowerTriangular(cov_chol)
            reparam_loc = tf.multiply(x_loc, a)
            reparam_scale = tf.linalg.LinearOperatorComposition([
                tf.linalg.LinearOperatorInversion(scale_linop),
                tf.linalg.LinearOperatorLowerTriangular(prior_chol),
            ])
            shift = x_loc - scale_linop.matvec(reparam_loc)
            bijector = tfb.AffineLinearOperator(scale=scale_linop,
                                                shift=shift)

        else:
            raise Exception('unrecognized reparameterization strategy!')

        kwargs_std = {
            'loc': reparam_loc,
            'scale': reparam_scale,
            'name': name,
        }
        # `value` was popped from rv_kwargs at the top of the function, so it
        # must be read from `rv_value` (a lookup in rv_kwargs can never
        # succeed). Map it into the standardised space, as the Normal branch
        # does.
        if rv_value is not None:
            kwargs_std['value'] = bijector.inverse(rv_value)

        if rv_constructor.__name__.startswith('GaussianProcess'):
            rv_std = edward2.MultivariateNormalLinearOperator(
                *rv_args, **kwargs_std)
        else:
            rv_std = interceptable(rv_constructor)(*rv_args, **kwargs_std)

        bijectors[name] = bijector
        return bijector.forward(rv_std)

    # NOTE(review): `value` was popped above and is not forwarded on this
    # fall-through path -- confirm that observed values for untouched RVs
    # are supplied some other way.
    return interceptable(rv_constructor)(*rv_args, **rv_kwargs)
def prepare_processing_graph(self, flags):
    """Build the TensorFlow graph that turns one clip into a model input.

    The graph loads a clip (decoding a WAVE file when ``flags.wav`` is set,
    otherwise via ``tf_np_load``), optionally stretches or squeezes it,
    scales its volume, pads and slices it to shift it in time, mixes in
    background noise, clips the mix to [-1, 1] and finally applies the
    feature extraction selected by ``flags.preprocess``.

    This must be called with an active TensorFlow session running. It stores
    these graph endpoints on ``self``:

      - wav_filename_placeholder_: Filename of the clip to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Stretch/squeeze factor.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Processed audio (raw samples or a 2D fingerprint).

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the micro frontend op wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = flags.desired_samples

        # ---- Load the clip. ----
        filename_ph = tf.placeholder(tf.string, [], name='wav_filename')
        self.wav_filename_placeholder_ = filename_ph
        if flags.wav:
            wav_decoder = tf.audio.decode_wav(
                io_ops.read_file(filename_ph),
                desired_channels=1,
                desired_samples=desired_samples)
            wav_data = wav_decoder.audio
        else:
            wav_data = tf_np_load(filename_ph)

        # ---- Volume and optional resampling. ----
        volume_ph = tf.placeholder(tf.float32, [], name='foreground_volume')
        self.foreground_volume_placeholder_ = volume_ph

        # Resampling augments the training data by stretching or squeezing
        # the input signal proportionally to this factor.
        resampling_ph = tf.placeholder(tf.float32, [])
        self.foreground_resampling_placeholder_ = resampling_ph

        # NOTE(review): this compares the placeholder object with a Python
        # float while the graph is being built, not the value fed at run
        # time -- confirm which branch is expected to be taken.
        if resampling_ph != 1.0:
            # Treat the clip as a one-pixel-wide image so the image resize
            # ops can change its length.
            image = tf.expand_dims(wav_data, 0)
            image = tf.expand_dims(image, 2)
            shape = tf.shape(wav_data)
            resized_length = tf.cast(
                tf.cast(shape[0], tf.float32) * resampling_ph, tf.int32)
            image_resized = tf.image.resize(
                images=image,
                size=(resized_length, 1),
                preserve_aspect_ratio=False)
            # Bring the stretched clip back to exactly `desired_samples`.
            image_resized_cropped = tf.image.resize_with_crop_or_pad(
                image_resized,
                target_height=desired_samples,
                target_width=1,
            )
            image_resized_cropped = tf.squeeze(image_resized_cropped,
                                               axis=[0, 3])
            scaled_foreground = tf.multiply(image_resized_cropped, volume_ph)
        else:
            scaled_foreground = tf.multiply(wav_data, volume_ph)

        # ---- Time shift: pad with zeros, then slice the wanted window. ----
        padding_ph = tf.placeholder(tf.int32, [2, 2],
                                    name='time_shift_padding')
        offset_ph = tf.placeholder(tf.int32, [2], name='time_shift_offset')
        self.time_shift_padding_placeholder_ = padding_ph
        self.time_shift_offset_placeholder_ = offset_ph
        padded_foreground = tf.pad(tensor=scaled_foreground,
                                   paddings=padding_ph,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground, offset_ph,
                                     [desired_samples, -1])

        # ---- Background noise. ----
        background_ph = tf.placeholder(tf.float32, [desired_samples, 1],
                                       name='background_data')
        background_volume_ph = tf.placeholder(tf.float32, [],
                                              name='background_volume')
        self.background_data_placeholder_ = background_ph
        self.background_volume_placeholder_ = background_volume_ph
        background_mul = tf.multiply(background_ph, background_volume_ph)
        background_add = tf.add(background_mul, sliced_foreground)
        # background_clamp dims: [time, channels]
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        # ---- Feature extraction. ----
        mode = flags.preprocess
        if mode == 'raw':
            # Drop the channel dimension and hand back plain samples.
            self.output_ = tf.squeeze(background_clamp, axis=1)
        elif mode == 'mfcc':
            # The 'mfcc' and 'micro' modes exist for backward compatibility
            # with the previous hotword detection on microcontrollers, where
            # feature extraction is done separately from the neural net.
            #
            # Short-time FFTs; spectrogram: [channels/batch, frames,
            # fft_feature]
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=flags.window_size_samples,
                stride=flags.window_stride_samples,
                magnitude_squared=flags.fft_magnitude_squared)
            # audio_ops.mfcc: mel filterbank weighting, log scaling, then a
            # DCT keeping the lowest `dct_coefficient_count` coefficients.
            mfcc = audio_ops.mfcc(
                spectrogram=spectrogram,
                sample_rate=flags.sample_rate,
                upper_frequency_limit=flags.mel_upper_edge_hertz,
                lower_frequency_limit=flags.mel_lower_edge_hertz,
                filterbank_channel_count=flags.mel_num_bins,
                dct_coefficient_count=flags.dct_num_features)
            # mfcc: [channels/batch, frames, dct_coefficient_count];
            # remove the channel dim.
            self.output_ = tf.squeeze(mfcc, axis=0)
        elif mode == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            # int16_input dims: [time, channels]
            int16_input = tf.cast(
                tf.multiply(background_clamp, du.MAX_ABS_INT16), tf.int16)
            # audio_microfrontend: windowing, short-time FFTs, filterbank,
            # noise reduction, PCAN auto gain control, log scaling.
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=flags.sample_rate,
                window_size=flags.window_size_ms,
                window_step=flags.window_stride_ms,
                num_channels=flags.mel_num_bins,
                upper_band_limit=flags.mel_upper_edge_hertz,
                lower_band_limit=flags.mel_lower_edge_hertz,
                out_scale=1,
                out_type=tf.float32)
            # presumably micro_frontend dims are [frames, num_channels]
            # -- TODO confirm
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "raw", '
                             ' "mfcc", or "micro")' % (flags.preprocess))
# Training targets; `train_X` (defined earlier) supplies the sample count.
train_Y = numpy.asarray([
    1.7, 2.76, 2.09, 3.19, 1.694, 1.573, 3.366, 2.596, 2.53,
    1.221, 2.827, 3.465, 1.65, 2.904, 2.42, 2.94, 1.3,
])
n_samples = train_X.shape[0]

# Graph inputs.
X = tf.placeholder("float")
Y = tf.placeholder("float")

# Model parameters, randomly initialised.
W = tf.Variable(rng.randn(), name="weight")
b = tf.Variable(rng.randn(), name="bias")

# Linear model: pred = X * W + b.
weighted_input = tf.multiply(X, W)
pred = tf.add(weighted_input, b)

# Mean squared error, halved.
squared_error = tf.pow(pred - Y, 2)
cost = tf.reduce_sum(squared_error) / (2 * n_samples)

# Gradient descent. minimize() updates W and b because Variable objects are
# trainable=True by default.
gradient_descent = tf.train.GradientDescentOptimizer(learning_rate)
optimizer = gradient_descent.minimize(cost)

# Op that assigns every variable its initial value.
init = tf.global_variables_initializer()

# Start training.
with tf.Session() as sess:
    # Run the initializer first.
    sess.run(init)
def true_fn():
    """Return ``x`` (taken from the enclosing scope) multiplied by 10."""
    scaled = tf.multiply(x, 10)
    return scaled
test_house_price_norm = normalize(test_house_price)

# Placeholders that are fed with data on every gradient-descent step.
tf_house_size = tf.placeholder("float", name="house_size")
tf_price = tf.placeholder("float", name="price")

# Trainable parameters, started from draws of the standard normal
# distribution.
tf_size_factor = tf.Variable(np.random.randn(), name="size_factor")
tf_price_offset = tf.Variable(np.random.randn(), name="price_offset")

# 2. Prediction: predicted price = (size_factor * house_size) + price_offset.
# The TensorFlow add/multiply functions are used so the operations become
# part of the computation graph and work on Tensors; do not substitute numpy
# or other library functions here.
tf_scaled_size = tf.multiply(tf_size_factor, tf_house_size)
tf_price_pred = tf.add(tf_scaled_size, tf_price_offset)

# 3. Loss: mean squared error, halved.
tf_squared_error = tf.pow(tf_price_pred - tf_price, 2)
tf_cost = tf.reduce_sum(tf_squared_error) / (2 * num_train_samples)

# Optimizer learning rate: the size of the steps down the gradient.
learning_rate = 0.1

# 4. Gradient-descent optimizer that minimises the loss defined above.
gradient_descent = tf.train.GradientDescentOptimizer(learning_rate)
optimizer = gradient_descent.minimize(tf_cost)

# Op that initialises all variables.
init = tf.global_variables_initializer()
def forward(self):
    """Build the dense forward pass and the per-sample Fisher graph.

    For every dense layer ``d1 .. dN`` three zero-initialised variables are
    created per parameter (weights and bias) and stored in ``self.vars``:
    ``fisher_diag_<layer>_<p>``, ``fisher_diag_<layer>_<p>c`` and
    ``fisher_old_<layer>_<p>``. The parameters and those variables are also
    collected, in creation order, in ``self.objs['fisher_ws']``,
    ``['fisher_diags']``, ``['fisher_diagcs']`` and ``['fisher_old_ws']``.

    The network output is stored in ``self.vars['fX']``. A second graph over
    ``self.phs['fisher_X']`` / ``['fisher_Y']`` re-applies the layers one
    sample at a time; the summed label-weighted log-softmax is stored in
    ``self.vars['batch_log_likelihood']`` and the per-sample parameter
    tensors in ``self.objs['fisher_var_lists']``.
    """
    X = self.phs['X']
    if not self.embedding:
        X = tf.cast(X, tf.float32) * (1.0 / 255)
    layer = self.apply_feature_extractor(X)

    fisher_ws = []
    fisher_diags = []
    fisher_diagcs = []
    fisher_old_ws = []

    def track_param(param, diag_name, diagc_name, old_name):
        # Create the three zero-initialised companions of `param` and record
        # everything in the four parallel lists.
        self.vars[diag_name] = tf.Variable(tf.zeros_like(param),
                                           name=diag_name)
        self.vars[diagc_name] = tf.Variable(tf.zeros_like(param),
                                            name=diagc_name)
        self.vars[old_name] = tf.Variable(tf.zeros_like(param),
                                          name=old_name)
        fisher_ws.append(param)
        fisher_diags.append(self.vars[diag_name])
        fisher_diagcs.append(self.vars[diagc_name])
        fisher_old_ws.append(self.vars[old_name])

    n_layers = len(self.layer_sizes) - 1
    for i in range(n_layers):
        layer_name = "d%d" % (i + 1)
        layer = utils.dense2(layer,
                             self.layer_sizes[i],
                             self.layer_sizes[i + 1],
                             name=layer_name)
        print('Applied dense (%d, %d) of name %s' %
              (self.layer_sizes[i], self.layer_sizes[i + 1], layer_name))

        w = utils.get_var("%s/w" % layer_name)
        track_param(w,
                    "fisher_diag_%s_w" % layer_name,
                    "fisher_diag_%s_wc" % layer_name,
                    "fisher_old_%s_w" % layer_name)

        b = utils.get_var("%s/b" % layer_name)
        track_param(b,
                    "fisher_diag_%s_b" % layer_name,
                    "fisher_diag_%s_bc" % layer_name,
                    "fisher_old_%s_b" % layer_name)
        print('Created zero fishers')

        # Every layer except the last is followed by the activation (and
        # dropout, when enabled).
        if i + 1 != n_layers:
            layer = self.activation(layer)
            if self.use_dropout:
                layer = tf.keras.layers.Dropout(
                    rate=self.dropoutv,
                    seed=self.seed)(layer, training=self.glob_training_ph)
                print('Applied activation -> dropout')
            else:
                print('Applied activation')

    self.vars['fX'] = layer
    self.objs['fisher_ws'] = fisher_ws
    self.objs['fisher_diagcs'] = fisher_diagcs
    self.objs['fisher_diags'] = fisher_diags
    self.objs['fisher_old_ws'] = fisher_old_ws

    # Create fisher graph
    print('Creating fisher batch_log_likelihood')
    fisher_X = tf.cast(self.phs['fisher_X'], tf.float32) * (1.0 / 255)
    fisher_Y = tf.one_hot(self.phs['fisher_Y'],
                          depth=self.layer_sizes[-1],
                          dtype=tf.float32)

    # Split the batch into single samples, each reshaped to a batch of one.
    if self.feature_extractor_needed:
        fisher_X = self.apply_feature_extractor(fisher_X)
        sample_shape = (1, self.layer_sizes[0])
    else:
        sample_shape = (1, *self.it.reshape_dims)
    fisher_Xs = [
        tf.reshape(fx, shape=sample_shape)
        for fx in tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
    ]
    fisher_Ys = tf.unstack(fisher_Y, num=self.fisher_batch_size, axis=0)

    log_likelihoods = []
    fisher_var_lists = []
    for sample_x, sample_y in zip(fisher_Xs, fisher_Ys):
        raw_output = sample_x
        sample_vars = []
        for j in range(n_layers):
            layer_name = "d%d" % (j + 1)
            # tf.identity yields separate tensors of the shared parameters
            # for each sample.
            w = tf.identity(utils.get_var("%s/w" % layer_name))
            b = tf.identity(utils.get_var("%s/b" % layer_name))
            sample_vars += [w, b]
            raw_output = tf.add(tf.matmul(raw_output, w), b)
            if j + 1 != n_layers:
                raw_output = self.activation(raw_output)
                # No dropout; TODO
        log_likelihoods.append(
            tf.multiply(sample_y, tf.nn.log_softmax(raw_output)))
        fisher_var_lists.append(sample_vars)

    batch_log_likelihood = tf.reduce_sum(log_likelihoods)
    self.vars['batch_log_likelihood'] = batch_log_likelihood
    self.objs['fisher_var_lists'] = fisher_var_lists
def update_loss(self, n_task):
    """Add the EWC penalty for task ``n_task`` and rebuild the train op.

    Does nothing for the first task (``n_task == 0``). Otherwise the new loss
    is the base loss plus ``self.ewc_const`` times the Fisher-weighted
    squared distance between the current parameters and the stored ones.
    Gradients of that loss are multiplied element-wise by a mask before
    being applied: the stored masks in ``self.all_masks[n_task]`` when
    ``self.correctmask`` is set, otherwise the mask returned by
    ``self.get_mask`` for each Fisher diagonal.

    Updates ``self.vars['loss']``, ``['losses'][n_task]``,
    ``['distances'][n_task]``, ``['train_op']`` and ``['train_ops'][n_task]``.

    Args:
      n_task: Index of the task that is about to be trained.
    """
    if n_task == 0:
        return

    # Loss to extend: the first task's loss or the previous task's.
    if self.use_orig_loss:
        base_loss = self.vars['losses'][0]
    else:
        base_loss = self.vars['losses'][n_task - 1]

    # Anchor weights and Fisher information: latest graph variables or the
    # snapshot saved for the previous task.
    if self.use_latest_theta_star:
        anchor_vars = self.objs['fisher_old_ws']
        fisher_vars = self.objs['fisher_diags']
    else:
        anchor_vars = self.saved_wts[n_task - 1]
        fisher_vars = self.saved_fishers[n_task - 1]

    penalties = [
        tf.multiply(fisher, tf.square(tf.subtract(var, anchor)))
        for var, anchor, fisher in zip(self.objs['fisher_ws'], anchor_vars,
                                       fisher_vars)
    ]
    ewc_penalty = tf.add_n([tf.reduce_sum(term) for term in penalties])
    if self.fisher_avg:
        ewc_penalty = tf.multiply(1.0 / n_task, ewc_penalty)

    new_loss = tf.add(
        base_loss,
        tf.multiply(tf.constant(self.ewc_const, tf.float32), ewc_penalty))
    self.vars['loss'] = new_loss
    self.vars['losses'][n_task] = new_loss
    self.vars['distances'][n_task] = self.setup_distances(n_task)

    orig_var_list = self.vars['orig_var_list']
    print("Trainable vars:")
    self.print_vars(orig_var_list)

    sess = self.objs['sess']
    opt = self.objs['opt']
    if self.reset_opt:
        print('Reset opt')
        sess.run(tf.variables_initializer(opt.variables()))

    grads = opt.compute_gradients(new_loss, var_list=orig_var_list)
    new_grads = []
    if self.correctmask:
        # Copy the stored masks into fresh non-trainable variables.
        temp_masks = []
        init_ops = []
        for mask in self.all_masks[n_task]:
            temp_mask = tf.Variable(tf.zeros_like(mask), trainable=False)
            temp_masks.append(temp_mask)
            init_ops.append(tf.assign(temp_mask, mask))
        sess.run(init_ops)
        print("Created temp_masks")

        for (grad, var), mask, fd in zip(grads, temp_masks,
                                         self.objs['fisher_diags']):
            s = sess.run(tf.reduce_sum(fd))
            fd_filtered = tf.identity(mask)
            num = sess.run(tf.reduce_sum(fd_filtered))
            total_num = sess.run(tf.reduce_sum(tf.ones_like(fd_filtered)))
            print("%s => can modify %d/%d params, sum = %f" %
                  (var.name, num, total_num, s))
            new_grads.append((tf.multiply(fd_filtered, grad), var))
    else:
        for (grad, var), fd in zip(grads, self.objs['fisher_diags']):
            fd_filtered, num, total_num = self.get_mask(fd, fix=self.fix)
            s = sess.run(tf.reduce_sum(fd))
            print("%s => can modify %d/%d params, sum = %f" %
                  (var.name, num, total_num, s))
            new_grads.append((tf.multiply(fd_filtered, grad), var))

    op = opt.apply_gradients(new_grads)
    self.vars['train_op'] = op
    self.vars['train_ops'][n_task] = op
    print('Updated train_op and loss')
def build_model(self):
    """Build the inference model.

    Creates the input placeholders, instantiates ``B_VGGNet`` and wires up
    its four exits (softmax predictions, per-exit cross-entropy, the weighted
    total loss and the per-exit ``top_k_error`` metric), then opens a session
    and restores the weights from ``B_VGG.ckpt`` under
    ``self.checkpoint_path``.

    :return: None
    """
    # Placeholders.
    image_shape = [
        self.test_batch_size, self.input_w, self.input_h, self.input_c
    ]
    self.img_placeholder = tf.placeholder(dtype=tf.float32, shape=image_shape)
    self.label_placeholder = tf.placeholder(dtype=tf.int32,
                                            shape=[self.test_batch_size])
    self.training_flag = tf.placeholder(dtype=tf.bool, shape=[])
    self.earlyexit_lossweights_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[len(self.earlyexit_lossweights)])

    # Model and graph: one logits tensor per exit.
    self.B_VGGNet_instance = B_VGGNet(num_class=self.num_class)
    (self.logits_exit0, self.logits_exit1, self.logits_exit2,
     self.logits_exit3) = self.B_VGGNet_instance.model(
         self.img_placeholder, is_train=self.training_flag)
    labels = self.label_placeholder

    # Predictions of the branches.
    self.pred0 = tf.nn.softmax(self.logits_exit0)
    self.pred1 = tf.nn.softmax(self.logits_exit1)
    self.pred2 = tf.nn.softmax(self.logits_exit2)
    self.pred3 = tf.nn.softmax(self.logits_exit3)

    # Per-branch losses and their weighted sum.
    self.loss_exit0 = cross_entropy(self.logits_exit0, labels)
    self.loss_exit1 = cross_entropy(self.logits_exit1, labels)
    self.loss_exit2 = cross_entropy(self.logits_exit2, labels)
    self.loss_exit3 = cross_entropy(self.logits_exit3, labels)
    exit_losses = [
        self.loss_exit0, self.loss_exit1, self.loss_exit2, self.loss_exit3
    ]
    self.total_loss = tf.reduce_sum(
        tf.multiply(self.earlyexit_lossweights_placeholder, exit_losses))

    # Per-branch top-1 metric, as computed by top_k_error.
    self.train_acc0 = top_k_error(self.pred0, labels, 1)
    self.train_acc1 = top_k_error(self.pred1, labels, 1)
    self.train_acc2 = top_k_error(self.pred2, labels, 1)
    self.train_acc3 = top_k_error(self.pred3, labels, 1)

    # Session, then restore the trained weights from the checkpoint.
    self.sess = tf.Session()
    self.saver = tf.train.Saver()
    checkpoint_file = os.path.join(self.checkpoint_path, 'B_VGG.ckpt')
    self.saver.restore(self.sess, checkpoint_file)
def build_bifpn_layer(feats, feat_sizes, fpn_name, fpn_config, is_training,
                      fpn_num_filters, min_level, max_level, separable_conv,
                      apply_bn_for_resampling, conv_after_downsample,
                      use_native_resize_op, conv_bn_relu_pattern,
                      pooling_type, use_tpu=False):
    """Builds a feature pyramid given previous feature pyramid and config.

    For every node in the config, the node's inputs are resampled to the
    node's width, combined according to ``config.weight_method`` ('attn',
    'fastattn' or 'sum'), passed through a conv + batch norm, and appended
    to ``feats`` (the list is modified in place).

    Returns:
      A dict mapping each level in ``[min_level, max_level]`` to the last
      node built with that ``width_index``; levels without such a node are
      absent.

    Raises:
      ValueError: If ``config.weight_method`` is not recognised.
    """
    config = fpn_config or get_fpn_config(fpn_name, min_level, max_level)

    num_output_connections = [0] * len(feats)
    for node_idx, fnode in enumerate(config.nodes):
        with tf.variable_scope('fnode{}'.format(node_idx)):
            logging.info('fnode %d : %s', node_idx, fnode)
            new_node_width = feat_sizes[fnode['width_index']]
            input_offsets = fnode['inputs_offsets']

            # Resample every input to the width of the node being built.
            nodes = []
            for idx, input_offset in enumerate(input_offsets):
                num_output_connections[input_offset] += 1
                resampled = resample_feature_map(
                    feats[input_offset],
                    '{}_{}_{}'.format(idx, input_offset, len(feats)),
                    new_node_width, fpn_num_filters,
                    apply_bn_for_resampling, is_training,
                    conv_after_downsample, use_native_resize_op,
                    pooling_type)
                nodes.append(resampled)

            # Combine all nodes.
            dtype = nodes[0].dtype
            method = config.weight_method
            if method == 'attn':
                # Softmax-normalised learned weight per input.
                edge_weights = [
                    tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype)
                    for _ in input_offsets
                ]
                normalized_weights = tf.nn.softmax(tf.stack(edge_weights))
                stacked = tf.stack(nodes, axis=-1)
                new_node = tf.reduce_sum(
                    tf.multiply(stacked, normalized_weights), -1)
            elif method == 'fastattn':
                # ReLU keeps the weights non-negative; normalise by their
                # sum (plus a small constant).
                edge_weights = [
                    tf.nn.relu(
                        tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype))
                    for _ in input_offsets
                ]
                weights_sum = tf.add_n(edge_weights)
                weighted = [
                    node * weight / (weights_sum + 0.0001)
                    for node, weight in zip(nodes, edge_weights)
                ]
                new_node = tf.add_n(weighted)
            elif method == 'sum':
                new_node = tf.add_n(nodes)
            else:
                raise ValueError('unknown weight_method {}'.format(
                    config.weight_method))

            with tf.variable_scope('op_after_combine{}'.format(len(feats))):
                # With the conv-bn-relu pattern the activation comes after
                # batch norm instead of before the conv.
                if not conv_bn_relu_pattern:
                    new_node = utils.relu_fn(new_node)

                if separable_conv:
                    conv_op = functools.partial(tf.layers.separable_conv2d,
                                                depth_multiplier=1)
                else:
                    conv_op = tf.layers.conv2d

                new_node = conv_op(new_node,
                                   filters=fpn_num_filters,
                                   kernel_size=(3, 3),
                                   padding='same',
                                   use_bias=not conv_bn_relu_pattern,
                                   name='conv')
                new_node = utils.batch_norm_relu(
                    new_node,
                    is_training_bn=is_training,
                    relu=bool(conv_bn_relu_pattern),
                    data_format='channels_last',
                    use_tpu=use_tpu,
                    name='bn')

            feats.append(new_node)
            num_output_connections.append(0)

    # For each level pick the most recently built node of that width.
    output_feats = {}
    for level in range(min_level, max_level + 1):
        for back_idx, fnode in enumerate(reversed(config.nodes)):
            if fnode['width_index'] == level:
                output_feats[level] = feats[-1 - back_idx]
                break
    return output_feats
def meta_optimize(self):
    """Meta optimization step.

    Builds a look-ahead copy of the network parameters from a weighted,
    label-mixed loss on the training batch, evaluates that look-ahead model
    on the probe data, and uses the gradient of the probe loss with respect
    to the per-example coefficients to produce new example weights and new
    re-labeling coefficients.

    Returns:
      A tuple ``(weight, new_eps, meta_loss, meta_acc)``; ``weight`` and
      ``new_eps`` are wrapped in ``tf.stop_gradient``.
    """
    probe_images = self.probe_images
    probe_labels = self.probe_labels
    labels = self.labels
    net = self.net
    logits = self.logits
    gate_gradients = 1
    batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)
    init_eps_val = 1.0 / batch_size

    meta_net = networks.MetaImage(self.net, name='meta_model')

    if FLAGS.meta_momentum and not self.optimizer.variables():
        # Initializing momentum state of optimizer for meta momentum update.
        # It is a hacky implementation
        logging.info('Pre-initialize optimizer momentum states.')
        idle_net_cost = tf.losses.sparse_softmax_cross_entropy(
            self.labels, logits)
        tmp_var_grads = self.optimizer.compute_gradients(
            tf.reduce_mean(idle_net_cost), net.trainable_variables)
        self.optimizer.apply_gradients(tmp_var_grads)

    with tf.name_scope('coefficient'):
        # Data weight coefficient
        target = tf.constant([init_eps_val] * batch_size,
                             shape=(batch_size,),
                             dtype=np.float32,
                             name='weight')
        # Data re-labeling coefficient
        eps = tf.constant([FLAGS.grad_eps_init] * batch_size,
                          shape=(batch_size,),
                          dtype=tf.float32,
                          name='eps')

    # Blend the given labels with the guessed labels, per example.
    onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
    onehot_labels = tf.cast(onehot_labels, tf.float32)
    eps_k = tf.reshape(eps, [batch_size, 1])
    mixed_labels = eps_k * onehot_labels + (1 - eps_k) * self.guessed_label

    # raw softmax loss
    log_softmax = tf.nn.log_softmax(logits)
    net_cost = -tf.reduce_sum(mixed_labels * log_softmax, 1)

    lookahead_loss = tf.reduce_sum(tf.multiply(target, net_cost))
    lookahead_loss = lookahead_loss + net.regularization_loss

    with tf.control_dependencies([lookahead_loss]):
        train_vars = net.trainable_variables
        var_grads = tf.gradients(lookahead_loss,
                                 train_vars,
                                 gate_gradients=gate_gradients)

        # One look-ahead tensor per trainable variable: var - stepsize * grad.
        static_vars = []
        for train_var, var_grad in zip(train_vars, var_grads):
            if FLAGS.meta_momentum > 0:
                step_grad = self.meta_momentum_update(
                    var_grad, train_var.name, self.optimizer)
            else:
                step_grad = var_grad
            lookahead_var = tf.math.subtract(
                train_var, FLAGS.meta_stepsize * step_grad)
            static_vars.append(lookahead_var)
            # new style
            meta_net.add_variable_alias(lookahead_var,
                                        var_name=train_var.name)

        for uv in net.updates_variables:
            meta_net.add_variable_alias(uv,
                                        var_name=uv.name,
                                        var_type='updates_variables')
        meta_net.verbose()

    with tf.control_dependencies(static_vars):
        g_logits = meta_net(probe_images,
                            name='meta_model',
                            reuse=True,
                            training=True)

        desired_y = tf.one_hot(probe_labels, self.dataset.num_classes)
        meta_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            desired_y, g_logits)
        meta_loss = tf.reduce_mean(meta_loss, name='meta_loss')
        meta_loss = meta_loss + meta_net.get_regularization_loss(net.wd)
        meta_acc, meta_acc_op = tf.metrics.accuracy(
            probe_labels, tf.argmax(g_logits, axis=1))

    with tf.control_dependencies([meta_loss, meta_acc_op]):
        meta_train_vars = meta_net.trainable_variables
        grad_meta_vars = tf.gradients(meta_loss,
                                      meta_train_vars,
                                      gate_gradients=gate_gradients)
        # Chain the probe-loss gradients back through the look-ahead
        # tensors to the per-example coefficients.
        grad_target, grad_eps = tf.gradients(static_vars, [target, eps],
                                             grad_ys=grad_meta_vars,
                                             gate_gradients=gate_gradients)

    # updates weight: step against the gradient, remove the initial value,
    # clip at zero and normalise.
    raw_weight = target - grad_target
    raw_weight = raw_weight - init_eps_val
    unorm_weight = tf.clip_by_value(raw_weight,
                                    clip_value_min=0,
                                    clip_value_max=float('inf'))
    norm_c = tf.reduce_sum(unorm_weight)
    weight = tf.divide(unorm_weight, norm_c + 0.00001)

    # gets new lambda by the sign of gradient
    new_eps = tf.where(grad_eps < 0,
                       x=tf.ones_like(eps),
                       y=tf.zeros_like(eps))

    return (tf.stop_gradient(weight), tf.stop_gradient(new_eps), meta_loss,
            meta_acc)
def single_score(self, u, i, feat_cate, feat_val, reuse=False):
    """Score (user, item) pairs with a DeepFM-style model.

    Three parts are concatenated and fed to a final linear layer ``f4``:
    a first-order term (``w_first`` embeddings times the feature values), a
    second-order factorisation-machine term, and a deep part (batch norm
    followed by dense layers ``f1``-``f3`` over the user, item and flattened
    feature embeddings).

    Args:
      u: User ids to look up in ``self.user_emb_w``.
      i: Item ids to look up in ``self.item_emb_w``.
      feat_cate: Feature ids to look up in ``self.feature_emb_w`` and
        ``self.w_first``.
      feat_val: Feature values; reshaped to ``[-1, self.fieldSize, 1]``.
      reuse: If truthy, the layers ``bn`` and ``f1``-``f4`` are created with
        ``reuse=True`` so that they share the weights of an earlier call.
        Otherwise ``reuse`` is left at the ``tf.layers`` default.

    Returns:
      The output of the final dense layer ``f4`` (one unit, no activation).
    """
    feat_val = tf.reshape(feat_val, shape=[-1, self.fieldSize, 1])

    u_emb = tf.nn.embedding_lookup(self.user_emb_w, u)
    i_emb = tf.nn.embedding_lookup(self.item_emb_w, i)
    feature_embeddings = tf.nn.embedding_lookup(self.feature_emb_w,
                                                feat_cate)

    # first-order
    first_emb = tf.nn.embedding_lookup(self.w_first, feat_cate)
    y_first_part = tf.reduce_sum(tf.multiply(first_emb, feat_val), 2)

    # second-order: 0.5 * ((sum of embeddings)^2 - sum of squared embeddings)
    emb = tf.multiply(feature_embeddings, feat_val)
    sum_squared_part = tf.square(tf.reduce_sum(emb, 1))
    squared_sum_part = tf.reduce_sum(tf.square(emb), 1)
    y_second_part = 0.5 * tf.subtract(sum_squared_part, squared_sum_part)

    # fcn
    flat_emb = tf.reshape(feature_embeddings,
                          [-1, self.fieldSize * self.Hidden_units // 2])
    all_emb = tf.concat([u_emb, i_emb, flat_emb], axis=1)

    # `None` is the tf.layers default for `reuse`, so passing it explicitly
    # is the same as omitting the argument on the first (non-reusing) call.
    layer_reuse = True if reuse else None

    bn_layer = tf.layers.batch_normalization(inputs=all_emb,
                                             name='bn',
                                             reuse=layer_reuse)
    layer1 = tf.layers.dense(bn_layer, 128, activation=tf.nn.sigmoid,
                             name='f1', reuse=layer_reuse)
    layer2 = tf.layers.dense(layer1, 64, activation=tf.nn.sigmoid,
                             name='f2', reuse=layer_reuse)
    layer3 = tf.layers.dense(layer2, 1, activation=tf.nn.sigmoid,
                             name='f3', reuse=layer_reuse)

    # deepfm
    deep_out = tf.concat([y_first_part, y_second_part, layer3], axis=1)
    res_out = tf.layers.dense(deep_out, 1, activation=None, name='f4',
                              reuse=layer_reuse)
    return res_out
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    input_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    softmax_temperature=1.0,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    to_proj_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    This is an implementation of multi-headed attention based on "Attention
    is all you Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.

    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities and returned per head (this function
    does not apply the output projection).

    In practice, the multi-headed attention is done with tf.einsum as
    follows:
      Input_tensor: [BFD]
      Wq, Wk, Wv: [DNH]
      Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
      K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
      V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
      attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
      attention_probs:[BNFT] = softmax(attention_scores)
      context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
      Wout:[DNH]
      Output:[BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any
        positions in the mask that are 0, and will be unchanged for positions
        that are 1. Ignored when `to_proj_length` is set.
      input_mask: Only required when using `to_proj_length`; it is reshaped to
        [batch_size, to_seq_length, 1, 1] and multiplied into the keys and
        values before they are projected.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      softmax_temperature: The temperature for the softmax attention.
      batch_size: (Optional) int. If the input is 2D, this might be the batch
        size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.
      to_proj_length: (Optional) Int. Down-project keys and values to this
        length.

    Returns:
      A tuple `(context_layer, attention_probs)`:
        context_layer: float Tensor of shape [batch_size, from_seq_length,
          num_attention_heads, size_per_head].
        attention_probs: float Tensor of shape [batch_size,
          num_attention_heads, from_seq_length, to_seq_length] holding the
          softmax output before dropout (the last axis is `to_proj_length`
          when projection is used).

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid, including
        `to_proj_length` being set without an `input_mask`.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or
                to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Fail early with a clear message instead of deep inside tf.cast, and
    # before any variables have been created.
    if to_proj_length is not None and input_mask is None:
        raise ValueError(
            "`input_mask` must be specified when `to_proj_length` is used.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    query_layer = dense_layer_3d(from_tensor, num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 query_act, "query")

    # `key_layer` = [B, T, N, H]
    key_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), key_act,
                               "key")

    # `value_layer` = [B, T, N, H]
    value_layer = dense_layer_3d(to_tensor, num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 value_act, "value")

    if to_proj_length is not None:
        # This gives one project matrix per layer (shared by heads and
        # value/key). In the paper they also look into other sharing schemes.
        with tf.variable_scope("proj_seq_length"):
            proj_kernel = tf.get_variable(
                name="kernel",
                shape=[to_seq_length, to_proj_length],
                initializer=create_initializer(initializer_range))

        # Zero out masked positions before mixing them along the sequence
        # axis.
        input_mask = tf.cast(input_mask, tf.float32)
        input_mask4d = tf.reshape(input_mask,
                                  (batch_size, to_seq_length, 1, 1))

        key_layer = key_layer * input_mask4d
        # [B, K, N, H]
        key_layer = tf.einsum("BTNH,TK->BKNH", key_layer, proj_kernel)

        value_layer = value_layer * input_mask4d
        # [B, K, N, H]
        value_layer = tf.einsum("BTNH,TK->BKNH", value_layer, proj_kernel)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    attention_scores = tf.einsum("BFNH,BTNH->BNFT",
                                 query_layer,
                                 key_layer,
                                 name="query_key_einsum")
    attention_scores = attention_scores / softmax_temperature
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    # The mask is skipped when keys are projected: its last axis has length
    # T, which no longer matches the projected scores.
    if attention_mask is not None and to_proj_length is None:
        # `attention_mask` = [B, 1, F, T] or [B, N, F, T]
        # Caller can pass a rank 3 tensor for a constant mask or rank 4 for a
        # per-head attention mask.
        attention_mask = tf.reshape(
            attention_mask,
            shape=[batch_size, -1, from_seq_length, to_seq_length])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0
        # for masked positions, this operation will create a tensor which is
        # 0.0 for positions we want to attend and -10000.0 for masked
        # positions.
        attention_mask_float = tf.cast(attention_mask, tf.float32)
        # Please keep this tf.where as it fixes back propagation issues: It
        # removes NaNs when using tf.math.log.
        attention_mask_float = tf.where(attention_mask_float > 0.0,
                                        attention_mask_float,
                                        tf.zeros_like(attention_mask_float))
        adder = tf.math.log(attention_mask_float)
        adder = tf.where(tf.is_finite(adder), adder,
                         tf.zeros_like(adder, dtype=tf.float32) - 10000.0)

        # Since we are adding it to the raw scores before the softmax, this
        # is effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs_do = dropout(attention_probs,
                                 attention_probs_dropout_prob)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.einsum("BNFT,BTNH->BFNH",
                              attention_probs_do,
                              value_layer,
                              name="attention_value_einsum")

    return context_layer, attention_probs
def trainGraph(inp, out, sess):
    """Train the Q-network on Pong with epsilon-greedy exploration and replay.

    Builds a squared-error loss between the Q-value of the action taken and a
    bootstrapped target, then plays the game forever: each step picks an
    action, stores the transition in a bounded replay buffer, and (once more
    than OBSERVE steps have elapsed) trains on a random minibatch. A
    checkpoint is written every 10000 steps. This function never returns.

    Args:
        inp: placeholder the network reads its stacked-frame input from.
        out: network output tensor with one value per action.
        sess: session used to initialise variables and to save checkpoints.
            `out.eval` / `train_step.run` rely on a default session, so the
            caller presumably uses an InteractiveSession or a
            `with sess.as_default()` block -- TODO confirm at the call site.
    """
    # One-hot of the action taken, and the training target for its Q-value.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])

    # Select the Q-value of the chosen action and regress it onto the target.
    action = tf.reduce_sum(tf.multiply(out, argmax), axis=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game = pong.PongGame()
    D = deque()  # replay memory of (state, action, reward, next_state)

    # Initial state: the first frame, binarised, repeated four times.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    t = 0
    epsilon = INITIAL_EPSILON

    while True:
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]

        # Epsilon-greedy action selection, encoded as a one-hot vector.
        argmax_t = np.zeros([ACTIONS])
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        # Linearly anneal epsilon over EXPLORE steps.
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Advance the game and push the new frame onto the front of the
        # stack, dropping the oldest of the four.
        reward_t, frame = game.getNextFrame(argmax_t)
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        D.append((inp_t, argmax_t, reward_t, inp_t1))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        if t > OBSERVE:
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            # Bootstrapped target: reward + discounted best next-state value.
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})
            gt_batch = [
                reward + GAMMA * np.max(next_q)
                for reward, next_q in zip(reward_batch, out_batch)
            ]

            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        inp_t = inp_t1
        t = t + 1

        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5,
                              use_einsum_gather=False):
    """Crop and resize boxes on a set of multi-level feature maps (ROIAlign).

    Every box is assigned to one feature level. For each box an
    (output_size, output_size) grid of sample points is laid over the box on
    its level, and the feature value at each sample point is obtained by
    bilinear interpolation of its four neighbouring feature points (see
    https://arxiv.org/pdf/1703.06870.pdf, figure 3).

    All levels are handled in a single operation:
      1. The stacked multi-level features are gathered into a
         [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
         tensor holding the four neighbours of every vertex of the output
         grid.
      2. An interpolation kernel of shape
         [batch_size, num_boxes, output_size*2, output_size*2] is computed;
         its last two axes can be seen as stacked 2x2 kernels, one per
         vertex.
      3. Gathered features and kernel are multiplied element-wise and reduced
         with 2x2 average pooling down to output_size.

    Args:
      features: a 5-D tensor of shape [batch_size, num_levels, max_height,
        max_width, num_filters] from which boxes are cropped.
      boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] describing each
        box w.r.t. its feature map. boxes[:, :, 0:2] is the (y, x) grid
        position (float) of the top-left corner; boxes[:, :, 2:4] is the
        (h, w) size (float) in pixels of that feature map.
      box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] with the
        0-based feature level index of each box.
      boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] with the
        (y, x) boundary of each box's feature map. Resampled grid points
        beyond the boundary are clipped.
      output_size: a scalar giving the output crop size.
      sample_offset: a float in [0, 1], the sub-pixel sample offset from the
        grid point.
      use_einsum_gather: whether to use einsum instead of gather. Einsum can
        be faster when the feature size is not large and is friendly to model
        partitioning; gather performs better when the feature size is very
        large and there are multiple box levels.

    Returns:
      features_per_box: a 5-D tensor of shape [batch_size, num_boxes,
        output_size, output_size, num_filters] with the cropped features.
    """
    (batch_size, num_levels, max_feature_height, max_feature_width,
     num_filters) = features.get_shape().as_list()
    _, num_boxes, _ = boxes.get_shape().as_list()
    # Two neighbouring feature points per output vertex along each axis.
    grid_len = output_size * 2

    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
        boxes, boundaries, output_size, sample_offset)

    flat_grid_shape = [batch_size, num_boxes, grid_len]
    x_indices = tf.cast(tf.reshape(box_gridx0x1, flat_grid_shape),
                        dtype=tf.int32)
    y_indices = tf.cast(tf.reshape(box_gridy0y1, flat_grid_shape),
                        dtype=tf.int32)

    if use_einsum_gather:
        # Bilinear interpolation is done during the last two gathers:
        #        f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
        #                              [f10, f11]]
        # [[f00, f01],
        #  [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
        # where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.

        # shape is [batch_size, boxes, output_size, 2, 1]
        grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(
            box_gridy0y1, box_gridx0x1, max_feature_height,
            max_feature_width)

        # shape is [batch_size, num_boxes, output_size, height]
        grid_y_weight = tf.reduce_sum(tf.multiply(grid_y_one_hot, kernel_y),
                                      axis=-2)
        # shape is [batch_size, num_boxes, output_size, width]
        grid_x_weight = tf.reduce_sum(tf.multiply(grid_x_one_hot, kernel_x),
                                      axis=-2)

        # Gather for y_axis.
        # shape is [batch_size, num_boxes, output_size, width, features]
        rows_gathered = tf.einsum('bmhwf,bmoh->bmowf', features,
                                  tf.cast(grid_y_weight, features.dtype))
        # Gather for x_axis.
        # shape is [batch_size, num_boxes, output_size, output_size, features]
        return tf.einsum('bmhwf,bmow->bmhof', rows_gathered,
                         tf.cast(grid_x_weight, features.dtype))

    # Gather path: flatten features to [-1, num_filters] and build, for every
    # sample point, its row index as
    # batch * batch_stride + level * level_stride + y * row_stride + x.
    height_dim_offset = max_feature_width
    level_dim_offset = max_feature_height * height_dim_offset
    batch_dim_offset = num_levels * level_dim_offset

    batch_term = tf.tile(
        tf.reshape(
            tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
        [1, num_boxes, grid_len, grid_len])
    level_term = tf.tile(
        tf.reshape(box_levels * level_dim_offset,
                   [batch_size, num_boxes, 1, 1]),
        [1, 1, grid_len, grid_len])
    row_term = tf.tile(
        tf.reshape(y_indices * height_dim_offset,
                   [batch_size, num_boxes, grid_len, 1]),
        [1, 1, 1, grid_len])
    col_term = tf.tile(
        tf.reshape(x_indices, [batch_size, num_boxes, 1, grid_len]),
        [1, 1, grid_len, 1])
    flat_indices = tf.reshape(batch_term + level_term + row_term + col_term,
                              [-1])

    flat_features = tf.reshape(features, [-1, num_filters])
    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get
    # similar performance.
    gathered = tf.reshape(
        tf.gather(flat_features, flat_indices),
        [batch_size, num_boxes, grid_len, grid_len, num_filters])
    return feature_bilinear_interpolation(gathered, kernel_y, kernel_x)
def evaluate(session, d, y):
    """Return half the sum of squared differences between `y` and `d`.

    The ops are built on each call and evaluated with `session.run`.
    """
    residual = tf.subtract(y, d)  # difference
    squared = tf.multiply(residual, residual)  # element-wise square
    total = session.run(tf.reduce_sum(squared))  # sum over all elements
    return total / 2  # halve
def scale_image_value(image):
    """Shift `image` by -0.5 and double it.

    For inputs in [0, 1] this yields values in [-1, +1].
    """
    centered = tf.subtract(image, 0.5)
    return tf.multiply(centered, 2.0)
def tf_DotProduct(tensorA, tensorB):
    """Return the dot product of two tensors along their last axis.

    The operands are multiplied element-wise (with broadcasting) and summed
    over the last axis. That axis is kept with size 1, so the result has the
    same rank as the broadcast product.
    """
    # `keepdims` replaces the deprecated `keep_dims` spelling, which no
    # longer exists in TF 2.x.
    return tf.reduce_sum(tf.multiply(tensorA, tensorB),
                         axis=-1,
                         keepdims=True)
iris = load_iris()

# Pick two variables whose values lie roughly in the 0-1 range.
X = iris.data
y_data = X[:, 2]  # petal length (column 3)
x_data = X[:, 3]  # petal width (column 4)

# Hyper parameters
learning_rate = 0.1  # learning rate: 0.01 > 0.1
iter_size = 50  # number of training iterations: 50 > 500

# NOTE: `X` is rebound here from the raw data array to the input placeholder.
X = tf.placeholder(dtype=tf.float32, shape=[None])
y = tf.placeholder(dtype=tf.float32, shape=[None])

a = tf.Variable(tf.random_normal(shape=[1], seed=123))
b = tf.Variable(tf.random_normal(shape=[1], seed=123))

# Simple linear regression model: y = a * X + b
model_output = tf.add(tf.multiply(X, a), b)

'''cost function'''
cost_l1 = tf.reduce_mean(tf.abs(y - model_output))  # L1
cost_l2 = tf.reduce_mean(tf.square(y - model_output))  # L2

# Optimise the L1 cost
opt_l1 = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_l1 = opt_l1.minimize(cost_l1)

# Optimise the L2 cost. Each cost gets its own optimizer; this previously
# called opt_l1.minimize by mistake, leaving opt_l2 unused.
opt_l2 = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_l2 = opt_l2.minimize(cost_l2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
def prelu(self, inp, name):
    """Apply a parametric ReLU with one learnable slope per last-axis entry.

    Positive inputs pass through unchanged; negative inputs are multiplied
    by `alpha`, a variable of shape (last dimension of `inp`,) created via
    `self.make_var` inside the variable scope `name`.
    """
    with tf.variable_scope(name):
        channels = int(inp.get_shape()[-1])
        alpha = self.make_var('alpha', shape=(channels,))
        positive_part = tf.nn.relu(inp)
        # -relu(-inp) equals min(inp, 0); scale it by the learned slope.
        negative_part = tf.multiply(alpha, -tf.nn.relu(-inp))
        return positive_part + negative_part
def draw_samples(alpha, scale):
    r"""Draw samples from the robust distribution.

    This function implements Algorithm 1 of the paper. This code is written
    to allow for sampling from a set of different distributions, each
    parametrized by its own alpha and scale values, as opposed to the more
    standard approach of drawing N samples from the same distribution. This
    is done by repeatedly performing N instances of rejection sampling for
    each of the N distributions until at least one proposal for each of the N
    distributions has been accepted. All samples are drawn with a zero mean,
    to use a non-zero mean just add each mean to each sample.

    Args:
      alpha: A TF tensor/scalar or numpy array/scalar of floats where each
        element is the shape parameter of that element's distribution.
        NOTE(review): the code reads `alpha.dtype` and compares `alpha >= 0.`
        directly, so the argument must expose `.dtype` -- confirm that plain
        Python scalars are really supported.
      scale: A TF tensor/scalar or numpy array/scalar of floats where each
        element is the scale parameter of that element's distribution. Must
        be the same shape as `alpha`.

    Returns:
      A TF tensor with the same shape and precision as `alpha` and `scale`
      where each element is a sample drawn from the distribution specified
      for that element by `alpha` and `scale`.
    """
    # `scale` must have the same type as `alpha`.
    float_dtype = alpha.dtype
    tf.assert_type(scale, float_dtype)
    assert_ops = [
        # `scale` must be > 0.
        tf.Assert(tf.reduce_all(scale > 0.), [scale]),
        # `alpha` must be >= 0.
        tf.Assert(tf.reduce_all(alpha >= 0.), [alpha]),
        # `alpha` and `scale` must have the same shape.
        tf.Assert(tf.reduce_all(tf.equal(tf.shape(alpha), tf.shape(scale))),
                  [tf.shape(alpha), tf.shape(scale)]),
    ]

    # Everything below is built under the assertions so they run before any
    # sampling op does.
    with tf.control_dependencies(assert_ops):
        shape = tf.shape(alpha)

        # The distributions we will need for rejection sampling. The sqrt(2)
        # scaling of the Cauchy distribution corrects for our differing
        # conventions for standardization.
        cauchy = tfp.distributions.Cauchy(loc=0., scale=tf.sqrt(2.))
        uniform = tfp.distributions.Uniform(low=0., high=1.)

        def while_cond(_, accepted):
            """Terminate the loop only when all samples have been accepted."""
            return ~tf.reduce_all(accepted)

        def while_body(samples, accepted):
            """Generate N proposal samples, and then perform rejection sampling."""
            # Draw N samples from a Cauchy, our proposal distribution.
            cauchy_sample = tf.cast(cauchy.sample(shape), float_dtype)

            # Compute the likelihood of each sample under its target
            # distribution.
            nll = nllfun(cauchy_sample, alpha, tf.cast(1, float_dtype))

            # Bound the NLL. We don't use the approximate loss as it may cause
            # unpredictable behavior in the context of sampling.
            nll_bound = general.lossfun(
                cauchy_sample,
                tf.cast(0, float_dtype),
                tf.cast(1, float_dtype),
                approximate=False) + log_base_partition_function(alpha)

            # Draw N samples from a uniform distribution, and use each uniform
            # sample to decide whether or not to accept each proposal sample.
            uniform_sample = tf.cast(uniform.sample(shape), float_dtype)
            accept = uniform_sample <= tf.math.exp(nll_bound - nll)

            # If a sample is accepted, replace its element in `samples` with
            # the proposal sample, and set its bit in `accepted` to True.
            # Elements accepted in an earlier iteration keep being redrawn
            # whenever a later proposal is also accepted.
            samples = tf.where(accept, cauchy_sample, samples)
            accepted = accept | accepted
            return (samples, accepted)

        # Initialize the loop. The first item does not matter as it will get
        # overwritten, the second item must be all False.
        while_loop_vars = (tf.zeros(shape, float_dtype),
                           tf.zeros(shape, dtype=bool))

        # Perform rejection sampling until all N samples have been accepted.
        terminal_state = tf.while_loop(cond=while_cond,
                                       body=while_body,
                                       loop_vars=while_loop_vars)

        # Because our distribution is a location-scale family, we sample from
        # p(x | 0, \alpha, 1) and then scale each sample by `scale`.
        samples = tf.multiply(terminal_state[0], scale)

        return samples
# Hyperparameter settings
lamb = 0.001  # L2 regularisation strength
batch_size = 80
learning_rate = 0.15
training_epochs = 2000
display_step = 20

# Construct the model. Examples are stored column-wise: `x` has 785 rows and
# `y` has 5 rows, one column per example.
x = tf.placeholder(tf.float32, [785, None])
y = tf.placeholder(tf.float32, [5, None])
theta = tf.Variable(tf.zeros([785, 5], dtype=tf.float32) + 0.001)
x_next = tf.matmul(theta, x, transpose_a=True)  # theta^T x

#%% gradient calculation for Theta
sig = tf.exp(x_next)  # unnormalised class scores exp(theta^T x)
grad_regression = tf.multiply(theta, 2*lamb)  # gradient of lamb * ||theta||^2
grad_softmax = sig / tf.reduce_sum(sig, 0)  # softmax over the class axis
grad_LCL = -tf.matmul(x, y - grad_softmax, transpose_b=True)
grad_LCL_regression = grad_LCL + grad_regression
grad = grad_LCL_regression / batch_size
print(grad.shape)

# Gradient-descent update of theta
theta_update = tf.assign(theta, theta - learning_rate*grad)

# Compare predicted classes with the true classes
y2 = tf.argmax(sig, 0)
y3 = tf.argmax(y, 0)
score = tf.reduce_mean(tf.cast(tf.equal(y2, y3), tf.float32))
def add_input_distortions(flip_left_right, random_crop, random_scale,
                          random_brightness):
    """Creates the operations to apply the specified distortions.

    Running training images through simple distortions (crops, scales, flips
    and brightness changes) mimics the variation found in real-world data and
    can help the model cope with it. This function builds a small graph that
    decodes one JPEG and applies the requested distortions to it.

    Cropping
    ~~~~~~~~

    A bounding box is placed at a random position in the full image. The
    cropping parameter controls the size of that box relative to the input:
    zero means the box equals the input and nothing is cropped, 50% means the
    box is half the width and height of the input:

      <       width         >
      +---------------------+
      |                     |
      |   width - crop%     |
      |    <      >         |
      |    +------+         |
      |    |      |         |
      |    |      |         |
      |    |      |         |
      |    +------+         |
      |                     |
      |                     |
      +---------------------+

    Scaling
    ~~~~~~~

    Like cropping, except the bounding box is always centered and its size
    varies randomly within the given range. Zero means no scaling; 50% means
    the box size is drawn between half and full width/height.

    Args:
      flip_left_right: Boolean whether to randomly mirror images
        horizontally.
      random_crop: Integer percentage setting the total margin used around
        the crop box.
      random_scale: Integer percentage of how much to vary the scale by.
      random_brightness: Integer range to randomly multiply the pixel values
        by.

    Returns:
      The jpeg input layer and the distorted result tensor.
    """
    # Decode to float and add a batch axis, which resize_bilinear expects.
    jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
    raw_image = tf.image.decode_jpeg(jpeg_data, channels=MODEL_INPUT_DEPTH)
    image_batch = tf.expand_dims(tf.cast(raw_image, dtype=tf.float32), 0)

    # Enlarge the image by the fixed crop margin times a random scale factor,
    # so that cropping back to the model size realises both distortions.
    margin_scale_value = tf.constant(1.0 + (random_crop / 100.0))
    resize_scale_value = tf.random_uniform(tensor_shape.scalar(),
                                           minval=1.0,
                                           maxval=1.0 + (random_scale / 100.0))
    scale_value = tf.multiply(margin_scale_value, resize_scale_value)
    precrop_width = tf.multiply(scale_value, MODEL_INPUT_WIDTH)
    precrop_height = tf.multiply(scale_value, MODEL_INPUT_HEIGHT)
    precrop_size = tf.cast(tf.stack([precrop_height, precrop_width]),
                           dtype=tf.int32)
    enlarged_batch = tf.image.resize_bilinear(image_batch, precrop_size)
    enlarged_image = tf.squeeze(enlarged_batch, squeeze_dims=[0])

    distorted = tf.random_crop(
        enlarged_image,
        [MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH, MODEL_INPUT_DEPTH])
    if flip_left_right:
        distorted = tf.image.random_flip_left_right(distorted)

    # Multiply all pixels by one random factor in [1 - b%, 1 + b%].
    brightness_delta = random_brightness / 100.0
    brightness_value = tf.random_uniform(tensor_shape.scalar(),
                                         minval=1.0 - brightness_delta,
                                         maxval=1.0 + brightness_delta)
    distorted = tf.multiply(distorted, brightness_value)

    distort_result = tf.expand_dims(distorted, 0, name='DistortResult')
    return jpeg_data, distort_result
def forward(self):
    """Build the dense network and the per-example graph used for Fisher terms.

    Two sub-graphs are created:

    1. The main network: `self.phs['X']` (scaled by 1/255 unless
       `self.embedding` is set) goes through `self.apply_feature_extractor`
       and then `len(self.layer_sizes) - 1` dense layers named "d1", "d2",
       ... The activation (and optional dropout) is applied after every
       layer except the last. The result is stored in `self.vars['fX']`.
       For every layer weight and bias, three zero-initialised variables of
       the same shape are created (`fisher_diag_*`, `fisher_diag_*c`,
       `fisher_old_*`) and collected into `self.objs['fisher_diags']`,
       `['fisher_diagcs']`, `['fisher_old_ws']`, alongside the parameters
       themselves in `self.objs['fisher_ws']`.

    2. The Fisher graph: the batch `self.phs['fisher_X']` is unstacked into
       `self.fisher_batch_size` single examples, each pushed through the same
       dense parameters via its own `tf.identity` copies (kept in
       `self.objs['fisher_var_lists']`). Depending on `self.version`
       ('case1' .. 'case4'), a likelihood-based scalar (or a dict of scalars
       keyed by class index) is stored in `self.vars`.

    Returns:
        None. All results are stored on `self.vars` / `self.objs`.
    """
    X = self.phs['X']
    if not self.embedding:
        # Scale by 1/255 (presumably raw 8-bit pixel input -- TODO confirm).
        X = tf.cast(X, tf.float32) * (1.0 / 255)
    layer = self.apply_feature_extractor(X)
    fisher_ws = []
    fisher_diags = []
    fisher_diagcs = []
    fisher_old_ws = []
    n_layers = len(self.layer_sizes) - 1
    for i in range(n_layers):
        layer_name = "d%d" % (i + 1)
        layer = utils.dense2(layer,
                             self.layer_sizes[i],
                             self.layer_sizes[i + 1],
                             name=layer_name)
        print('Applied dense (%d, %d) of name %s' %
              (self.layer_sizes[i], self.layer_sizes[i + 1], layer_name))

        # Companion variables for the layer's weight: all start at zero and
        # share the weight's shape.
        w = utils.get_var("%s/w" % layer_name)
        fisher_w_name = "fisher_diag_%s_w" % layer_name
        fisher_wc_name = "fisher_diag_%s_wc" % layer_name
        fisher_old_w_name = "fisher_old_%s_w" % layer_name
        self.vars[fisher_w_name] = tf.Variable(tf.zeros_like(w),
                                               name=fisher_w_name)
        self.vars[fisher_wc_name] = tf.Variable(tf.zeros_like(w),
                                                name=fisher_wc_name)
        self.vars[fisher_old_w_name] = tf.Variable(tf.zeros_like(w),
                                                   name=fisher_old_w_name)
        fisher_ws += [w]
        fisher_diags += [self.vars[fisher_w_name]]
        fisher_diagcs += [self.vars[fisher_wc_name]]
        fisher_old_ws += [self.vars[fisher_old_w_name]]

        # Same again for the layer's bias. The four lists stay aligned:
        # entry k of each refers to the same parameter.
        b = utils.get_var("%s/b" % layer_name)
        fisher_b_name = "fisher_diag_%s_b" % layer_name
        fisher_bc_name = "fisher_diag_%s_bc" % layer_name
        fisher_old_b_name = "fisher_old_%s_b" % layer_name
        self.vars[fisher_b_name] = tf.Variable(tf.zeros_like(b),
                                               name=fisher_b_name)
        self.vars[fisher_bc_name] = tf.Variable(tf.zeros_like(b),
                                                name=fisher_bc_name)
        self.vars[fisher_old_b_name] = tf.Variable(tf.zeros_like(b),
                                                   name=fisher_old_b_name)
        fisher_ws += [b]
        fisher_diags += [self.vars[fisher_b_name]]
        fisher_diagcs += [self.vars[fisher_bc_name]]
        fisher_old_ws += [self.vars[fisher_old_b_name]]
        print('Created zero fishers')

        # No activation/dropout after the final layer.
        if i + 1 != len(self.layer_sizes) - 1:
            if self.use_dropout:
                layer = self.activation(layer)
                layer = tf.keras.layers.Dropout(
                    rate=self.dropoutv,
                    seed=self.seed)(layer, training=self.glob_training_ph)
                print('Applied activation -> dropout')
            else:
                layer = self.activation(layer)
                print('Applied activation')

    self.vars['fX'] = layer
    self.objs['fisher_ws'] = fisher_ws
    self.objs['fisher_diagcs'] = fisher_diagcs
    self.objs['fisher_diags'] = fisher_diags
    self.objs['fisher_old_ws'] = fisher_old_ws

    # Create fisher graph
    print('Creating fisher batch_log_likelihood')
    # NOTE(review): unlike `X` above, this input is always scaled by 1/255,
    # regardless of `self.embedding` -- confirm this is intended.
    fisher_X = tf.cast(self.phs['fisher_X'], tf.float32) * (1.0 / 255)
    fisher_Y = tf.one_hot(self.phs['fisher_Y'],
                          depth=self.layer_sizes[-1],
                          dtype=tf.float32)
    # Split the batch into single examples, each reshaped to a batch of one.
    if self.feature_extractor_needed:
        fisher_X = self.apply_feature_extractor(fisher_X)
        fisher_Xs = [
            tf.reshape(fx, shape=(1, self.layer_sizes[0]))
            for fx in tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
        ]
    else:
        fisher_Xs = [
            tf.reshape(fx, shape=(1, *self.it.reshape_dims))
            for fx in tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
        ]
    fisher_Ys = tf.unstack(fisher_Y, num=self.fisher_batch_size, axis=0)
    fisher_var_lists = []

    # Classwise direct predicted likelihoods and direct predicted likelihoods
    # Case I, II, III, IV
    nout = self.layer_sizes[-1]
    # One one-hot row vector per output class.
    onehots_n = tf.unstack(tf.one_hot(list(range(nout)), nout),
                           num=nout,
                           axis=0)
    if self.version == 'case1':
        jlikelihoods = {ii: [] for ii in range(nout)}
    if self.version == 'case2':
        jlikelihoodsqs = {ii: [] for ii in range(nout)}
    if self.version == 'case3':
        likelihoods = []
    if self.version == 'case4':
        likelihoodsqs = []

    for i in range(self.fisher_batch_size):
        raw_output = fisher_Xs[i]
        fisher_var_list = []
        # Re-run the dense stack on this single example using fresh
        # tf.identity copies of the parameters, recorded per example in
        # `fisher_var_list`.
        for j in range(n_layers):
            layer_name = "d%d" % (j + 1)
            w = tf.identity(utils.get_var("%s/w" % layer_name))
            b = tf.identity(utils.get_var("%s/b" % layer_name))
            fisher_var_list += [w, b]
            raw_output = tf.add(tf.matmul(raw_output, w), b)
            if j + 1 != len(self.layer_sizes) - 1:
                raw_output = self.activation(raw_output)
                # No dropout; TODO
        fisher_var_lists += [fisher_var_list]

        # Case I, II, III, IV
        if self.version == 'case1':
            # Softmax output masked to each class in turn.
            for key in jlikelihoods.keys():
                jlikelihoods[key] += [
                    tf.multiply(onehots_n[key], tf.nn.softmax(raw_output))
                ]
        if self.version == 'case2':
            # As case1, but squared.
            for key in jlikelihoodsqs.keys():
                jlikelihoodsqs[key] += [
                    tf.square(
                        tf.multiply(onehots_n[key],
                                    tf.nn.softmax(raw_output)))
                ]
        if self.version == 'case3':
            # Softmax output masked by the example's one-hot label.
            likelihood = tf.multiply(fisher_Ys[i], tf.nn.softmax(raw_output))
            likelihoods += [likelihood]
        if self.version == 'case4':
            # As case3, but squared.
            likelihood = tf.multiply(fisher_Ys[i], tf.nn.softmax(raw_output))
            likelihoodsq = tf.square(likelihood)
            likelihoodsqs += [likelihoodsq]

    self.objs['fisher_var_lists'] = fisher_var_lists

    # Finally, reduce_sum and add to vars
    if self.version == 'case1':
        jbatch_likelihood = {
            key: tf.reduce_sum(jlikelihoods[key])
            for key in jlikelihoods.keys()
        }
        self.vars['jbatch_likelihood'] = jbatch_likelihood
    if self.version == 'case2':
        jbatch_likelihoodsq = {
            key: tf.multiply(tf.constant(0.5),
                             tf.reduce_sum(jlikelihoodsqs[key]))
            for key in jlikelihoodsqs.keys()
        }
        self.vars['jbatch_likelihoodsq'] = jbatch_likelihoodsq
    if self.version == 'case3':
        batch_likelihood = tf.reduce_sum(likelihoods)
        self.vars['batch_likelihood'] = batch_likelihood
    if self.version == 'case4':
        batch_likelihoodsq = tf.multiply(tf.constant(0.5),
                                         tf.reduce_sum(likelihoodsqs))
        self.vars['batch_likelihoodsq'] = batch_likelihoodsq