def update_loss(self, n_task):

        if n_task == 0:
            return

        # Radius of influence (Constrained minimization)
        if self.init_change is not None and self.incr_change is not None:
            task_var = tf.Variable(self.init_change,
                                   name="epsilon_task%d" % (n_task - 1),
                                   trainable=False)
            self.objs['sess'].run(task_var.initializer)
            self.vars['epsilon_task%d' % (n_task - 1)] = task_var
            for prev_n_task in range(n_task - 1):
                self.objs['sess'].run(
                    tf.assign_add(self.vars['epsilon_task%d' % (prev_n_task)],
                                  self.incr_change))

        loss = self.vars['losses'][0] if self.use_orig_loss else self.vars[
            'losses'][n_task - 1]

        penalties = []
        old_vars = self.objs[
            'fisher_old_ws'] if self.use_latest_theta_star else self.saved_wts[
                n_task - 1]
        fisher_vars = self.objs[
            'fisher_diags'] if self.use_latest_theta_star else self.saved_fishers[
                n_task - 1]
        for var, old_var, fisher in zip(self.objs['fisher_ws'], old_vars,
                                        fisher_vars):

            penalties += [
                tf.multiply(fisher, self.norm_op(tf.subtract(var, old_var)))
            ]

        ewc_penalty = tf.add_n(
            [tf.reduce_sum(penalty) for penalty in penalties])
        # Create new cross entropy loss
        if self.init_change is not None and self.incr_change is not None:
            self.vars['ce_losses'][n_task] = self.vars['ce_losses'][
                n_task - 1] * self.indicator(task_var - ewc_penalty)
        new_loss = tf.add(
            loss,
            tf.multiply(tf.constant(self.multiplier, tf.float32), ewc_penalty))

        # Remove previous CE loss
        if self.init_change is not None and self.incr_change is not None:
            new_loss -= self.vars['ce_losses'][n_task - 1]
            new_loss += self.vars['ce_losses'][n_task]

        self.vars['loss'] = new_loss
        self.vars['losses'][n_task] = new_loss
        self.vars['distances'][n_task] = self.setup_distances(n_task)

        orig_var_list = self.vars['orig_var_list']
        # print("Trainable vars: %s" % str(orig_var_list))
        print("Trainable vars:")
        self.print_vars(orig_var_list)
        if self.reset_opt:
            print('Reset opt')
            self.objs['sess'].run(
                tf.variables_initializer(self.objs['opt'].variables()))
        op = self.objs['opt'].minimize(new_loss, var_list=orig_var_list)
        self.vars['train_op'] = op
        self.vars['train_ops'][n_task] = op

        print('Updated train_op and loss')
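
# A minimal NumPy sketch (not part of the original) of the EWC-style penalty
# assembled above, assuming `norm_op` is the elementwise square of standard
# EWC: each parameter's drift from the saved weights theta* is weighted by
# its Fisher diagonal, summed, and scaled by the loss multiplier.
import numpy as np

def ewc_penalty(params, saved_params, fisher_diags, multiplier):
    """params, saved_params, fisher_diags: lists of equally shaped arrays."""
    total = sum(np.sum(f * (p - p_old) ** 2)
                for p, p_old, f in zip(params, saved_params, fisher_diags))
    return multiplier * total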
Example #2
    def __init__(self,
                 n_inputs,
                 n_outputs,
                 n_hiddens,
                 act_fun,
                 output_order='sequential',
                 mode='sequential',
                 input=None,
                 output=None):
        """
        Constructor.
        :param n_inputs: number of (conditional) inputs
        :param n_outputs: number of outputs
        :param n_hiddens: list with number of hidden units for each hidden layer
        :param act_fun: tensorflow activation function
        :param output_order: order of outputs
        :param mode: strategy for assigning degrees to hidden nodes: can be 'random' or 'sequential'
        :param input: tensorflow placeholder to serve as input; if None, a new placeholder is created
        :param output: tensorflow placeholder to serve as output; if None, a new placeholder is created
        """

        # save input arguments
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hiddens = n_hiddens
        self.act_fun = act_fun
        self.mode = mode

        # create network's parameters
        degrees = create_degrees(n_outputs, n_hiddens, output_order, mode)
        Ms, Mmp = create_masks(degrees)
        Wx, Ws, bs, Wm, bm, Wp, bp = create_weights_conditional(
            n_inputs, n_outputs, n_hiddens, None)
        self.parms = [Wx] + Ws + bs + [Wm, bm, Wp, bp]
        self.output_order = degrees[0]

        # activation function
        f = self.act_fun

        # input matrices
        self.input = tf.placeholder(dtype=dtype,
                                    shape=[None, n_inputs],
                                    name='x') if input is None else input
        self.y = tf.placeholder(dtype=dtype, shape=[None, n_outputs],
                                name='y') if output is None else output

        # feedforward propagation
        h = f(tf.matmul(self.input, Wx) + tf.matmul(self.y, Ms[0] * Ws[0]) +
              bs[0],
              name='h1')
        for l, (M, W, b) in enumerate(zip(Ms[1:], Ws[1:], bs[1:])):
            h = f(tf.matmul(h, M * W) + b, name='h' + str(l + 2))

        # output means
        self.m = tf.add(tf.matmul(h, Mmp * Wm), bm, name='m')

        # output log precisions
        self.logp = tf.add(tf.matmul(h, Mmp * Wp), bp, name='logp')

        # random numbers driving MADE
        self.u = tf.exp(0.5 * self.logp) * (self.y - self.m)

        # log likelihoods
        self.L = tf.multiply(
            -0.5,
            n_outputs * np.log(2 * np.pi) +
            tf.reduce_sum(self.u ** 2 - self.logp, axis=1, keepdims=True),
            name='L')

        # train objective
        self.trn_loss = -tf.reduce_mean(self.L, name='trn_loss')
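
# A minimal NumPy sketch (not part of the original) of the log-likelihood
# computed above: u = exp(0.5 * logp) * (y - m) standardizes y under a
# diagonal Gaussian with mean m and log precision logp, giving
# L = -0.5 * (n_outputs * log(2*pi) + sum(u**2 - logp)).
import numpy as np

def made_log_likelihood(y, m, logp):
    u = np.exp(0.5 * logp) * (y - m)
    return -0.5 * (y.shape[1] * np.log(2 * np.pi)
                   + np.sum(u ** 2 - logp, axis=1))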
Example #3
def implicit_quantile_network(num_actions, quantile_embedding_dim,
                              network_type, state, num_quantiles):
    """The Implicit Quantile ConvNet.

  Args:
    num_actions: int, number of actions.
    quantile_embedding_dim: int, embedding dimension for the quantile input.
    network_type: namedtuple, collection of expected values to return.
    state: `tf.Tensor`, contains the agent's current state.
    num_quantiles: int, number of quantile inputs.

  Returns:
    net: _network_type object containing the tensors output by the network.
  """
    weights_initializer = contrib_slim.variance_scaling_initializer(
        factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True)

    state_net = tf.cast(state, tf.float32)
    state_net = tf.div(state_net, 255.)
    state_net = contrib_slim.conv2d(state_net,
                                    32, [8, 8],
                                    stride=4,
                                    weights_initializer=weights_initializer)
    state_net = contrib_slim.conv2d(state_net,
                                    64, [4, 4],
                                    stride=2,
                                    weights_initializer=weights_initializer)
    state_net = contrib_slim.conv2d(state_net,
                                    64, [3, 3],
                                    stride=1,
                                    weights_initializer=weights_initializer)
    state_net = contrib_slim.flatten(state_net)
    state_net_size = state_net.get_shape().as_list()[-1]
    state_net_tiled = tf.tile(state_net, [num_quantiles, 1])

    batch_size = state_net.get_shape().as_list()[0]
    quantiles_shape = [num_quantiles * batch_size, 1]
    quantiles = tf.random_uniform(quantiles_shape,
                                  minval=0,
                                  maxval=1,
                                  dtype=tf.float32)

    quantile_net = tf.tile(quantiles, [1, quantile_embedding_dim])
    pi = tf.constant(math.pi)
    quantile_net = tf.cast(tf.range(1, quantile_embedding_dim + 1, 1),
                           tf.float32) * pi * quantile_net
    quantile_net = tf.cos(quantile_net)
    quantile_net = contrib_slim.fully_connected(
        quantile_net, state_net_size, weights_initializer=weights_initializer)
    # Hadamard product.
    net = tf.multiply(state_net_tiled, quantile_net)

    net = contrib_slim.fully_connected(net,
                                       512,
                                       weights_initializer=weights_initializer)
    quantile_values = contrib_slim.fully_connected(
        net,
        num_actions,
        activation_fn=None,
        weights_initializer=weights_initializer)

    return network_type(quantile_values=quantile_values, quantiles=quantiles)
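
# A minimal NumPy sketch (not part of the original) of the cosine quantile
# embedding used above: each sampled quantile tau is expanded to
# cos(pi * i * tau) for i = 1..quantile_embedding_dim before being passed
# through the fully connected layer and multiplied into the state features.
import numpy as np

def cosine_quantile_embedding(taus, embedding_dim):
    # taus: [num_quantiles * batch_size, 1] uniform samples in (0, 1)
    i = np.arange(1, embedding_dim + 1, dtype=np.float32)
    return np.cos(np.pi * i * taus)  # broadcasts to [N, embedding_dim]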
Example #4
def attention_layer(from_tensor,
                    to_tensor,
                    layer_idx,
                    total_layers,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    num_partitions=1):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with tf.einsum as follows:
    Input_tensor: [BFD]
    Wq, Wk, Wv: [DNH]
    Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
    K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
    V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
    attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
    attention_probs:[BNFT] = softmax(attention_scores)
    context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
    Wout:[DNH]
    Output:[BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    layer_idx: the index of the current layer.
    total_layers: total number of layers.
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.
    num_partitions: (optional) Number of SPMD partitions.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    query_layer = dense_layer_3d(from_tensor,
                                 layer_idx,
                                 total_layers,
                                 num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 query_act,
                                 name="query")

    # `key_layer` = [B, T, N, H]
    key_layer = dense_layer_3d(to_tensor,
                               layer_idx,
                               total_layers,
                               num_attention_heads,
                               size_per_head,
                               create_initializer(initializer_range),
                               key_act,
                               name="key")

    # `value_layer` = [B, T, N, H]
    value_layer = dense_layer_3d(to_tensor,
                                 layer_idx,
                                 total_layers,
                                 num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 value_act,
                                 name="value")
    if num_partitions > 1:
        # partition along the heads dimension
        query_layer = xla_sharding.split(query_layer,
                                         2,
                                         num_partitions,
                                         use_sharding_op=True)
        key_layer = xla_sharding.split(key_layer,
                                       2,
                                       num_partitions,
                                       use_sharding_op=True)
        value_layer = xla_sharding.split(value_layer,
                                         2,
                                         num_partitions,
                                         use_sharding_op=True)

    query_layer = tf.multiply(query_layer,
                              1.0 / math.sqrt(float(size_per_head)))
    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_layer, query_layer)

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 -
                 tf.cast(attention_mask, attention_scores.dtype)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_scores = tf.cast(attention_scores, tf.float32)
    attention_scores = attention_scores - tf.stop_gradient(
        tf.reduce_max(attention_scores, -1, True))
    attention_scores = tf.exp(attention_scores)
    attention_sum = tf.reduce_sum(attention_scores, -1, True)
    attention_probs = tf.cast(attention_scores, key_layer.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    # Split mask and scaling ops in dropout
    random_u = tf.random_uniform(attention_probs.shape, dtype=tf.bfloat16)
    keep_mask = random_u >= attention_probs_dropout_prob
    keep_mask = tf.cast(keep_mask, dtype=attention_probs.dtype)

    attention_probs = tf.multiply(keep_mask, attention_probs)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer)
    context_layer = context_layer / tf.cast(
        tf.transpose(attention_sum, [0, 2, 1, 3]), context_layer.dtype)

    if num_partitions > 1:
        # partition along the heads dimension
        context_layer = xla_sharding.split(context_layer,
                                           2,
                                           num_partitions,
                                           use_sharding_op=True)

    # split mask and scaling ops in dropout
    # move the scaling from dropout to here to save same mul ops
    # TODO(yuemmawang) automate this optimization in xla
    keep_prob = 1 - attention_probs_dropout_prob
    scale = 1 / keep_prob
    context_layer = tf.multiply(context_layer, scale)
    return context_layer
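
# A minimal NumPy sketch (not part of the original) of the einsum attention
# described in the docstring above: scaled dot-product scores in [B, N, F, T],
# a max-subtracted (numerically stable) softmax over T, then interpolation of
# the values, mirroring the unnormalized exp/sum split used in the function.
import numpy as np

def einsum_attention(q, k, v):
    # q: [B, F, N, H]; k, v: [B, T, N, H]
    scores = np.einsum('BFNH,BTNH->BNFT', q, k) / np.sqrt(q.shape[-1])
    scores -= scores.max(axis=-1, keepdims=True)  # stabilize the softmax
    probs = np.exp(scores)
    probs /= probs.sum(axis=-1, keepdims=True)
    return np.einsum('BNFT,BTNH->BFNH', probs, v)  # context: [B, F, N, H]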
Example #5

    def build_model(batch, seq_len, vocab_size, d_model, head):
        input_tensor = tf.placeholder(shape=(batch, seq_len, d_model),
                                      dtype=tf.int32)
        mask_tensor = tf.placeholder(shape=(batch, seq_len), dtype=tf.float32)

        # We are not using embedding here
        input_ids = tf.cast(input_tensor, tf.float32)

        # Add positional encoding. We use static positional encoding here.
        if USE_POSITIONAL_ENCODING:
            pos_enc = generate_position_embedding(input_len=seq_len,
                                                  d_model=d_model)
            pos_enc = tf.constant(pos_enc, dtype=tf.float32)
            input_ids = input_ids + pos_enc

        # Convert input to 2D tensor
        input_batch = tf.reshape(input_ids, (-1, d_model))

        # Transform input to Q, K and V tensor
        size_per_head = int(d_model / head)
        K = tf.layers.dense(input_batch, size_per_head * head, name='K')
        Q = tf.layers.dense(input_batch, size_per_head * head, name='Q')
        V = tf.layers.dense(input_batch, size_per_head * head, name='V')

        # [Batch, Head, Len, Size_per_Head]
        K = transpose_for_scores(K, batch, head, seq_len, size_per_head)
        Q = transpose_for_scores(Q, batch, head, seq_len, size_per_head)
        V = transpose_for_scores(V, batch, head, seq_len, size_per_head)

        # Scaled Dot-Product attention [Batch, Head, Len-Q, Len-K]
        attention_scores = tf.matmul(Q, K, transpose_b=True)
        attention_scores = tf.multiply(attention_scores,
                                       1.0 / math.sqrt(float(size_per_head)))

        # Generate attention mask to prevent attention to padding tokens
        to_mask = tf.reshape(mask_tensor, [batch, 1, seq_len])
        broadcast_ones = tf.ones(shape=[batch, seq_len, 1], dtype=tf.float32)
        # Attention mask [Batch, Len, Len]
        attention_mask = broadcast_ones * to_mask
        # `attention_mask` = [Batch, 1, Len, Len]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Make adding -10000.0 to attention of padding tokens
        adder = (1.0 - attention_mask) * -10000.0
        attention_scores += adder

        attention_probs = tf.nn.softmax(attention_scores)

        # `context_layer` = [Batch, Head, Len-Q, Size_per_Head]
        context_layer = tf.matmul(attention_probs, V)

        # `context_layer` = [Batch, Len-Q, Head, Size_per_Head]
        context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

        # Also calculate cost of attention head output difference here.
        disagreement_cost = get_attention_heads_disagreement_cost(
            context_layer)

        # `output_tensor` = [Batch x Len-Q, Head x Size_per_Head = D_Model]
        output_tensor = tf.reshape(context_layer,
                                   [batch * seq_len, head * size_per_head])

        # Final linear projection. Note that this weight has permutation set divided by row instead of column as in K/Q/V
        output_tensor = tf.layers.dense(output_tensor, d_model, name='output')

        # `output_tensor` = [Batch, Len-Q, Head x Size_per_Head = D_Model]
        output_tensor = tf.reshape(output_tensor,
                                   [batch, seq_len, head * size_per_head])

        # Pooled output is the hidden state of the 1st token
        pooled_output_tensor = output_tensor[:, 0]

        # Add binary classification layers
        prediction_tensor = tf.layers.dense(pooled_output_tensor,
                                            1,
                                            name='prediction')
        logprob_tensor = tf.nn.sigmoid(prediction_tensor, name='sigmoid')

        return (input_tensor, mask_tensor, prediction_tensor,
                disagreement_cost, logprob_tensor)
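
# A minimal NumPy sketch (not part of the original) of the padding-mask trick
# used in build_model above: positions whose mask is 0 receive -10000.0 before
# the softmax, driving their attention probability to (effectively) zero.
import numpy as np

def attention_mask_adder(mask):
    # mask: [batch, seq_len], 1.0 for real tokens and 0.0 for padding
    batch, seq_len = mask.shape
    to_mask = mask.reshape(batch, 1, seq_len)
    att_mask = np.ones((batch, seq_len, 1)) * to_mask   # [batch, len, len]
    return (1.0 - att_mask[:, None, :, :]) * -10000.0   # [batch, 1, len, len]

# Usage: attention_scores += attention_mask_adder(mask) before the softmax.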
Example #6

    def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                             fact_embedding, fact_lengths,
                                             margin, num_negative_samples):
        """Compute knowledge selection and loss.

    Args:
      features: features.
      encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
      fact_embedding: <tf.float32>[batch_size*triple_num, max_triple_length,
        emb_dim]
      fact_lengths: <tf.int32>[batch_size*triple_num]
      margin: integer value for the max margin in TransE loss.
      num_negative_samples: number of negative samples to shuffle and draw
        for the TransE loss.

    Returns:
      knowledge_weights:
      knowledge_loss:
    """
        hparams = self._hparams
        encoder_output_shape = common_layers.shape_list(encoder_output)
        encoder_hidden_dim = encoder_output_shape[-1]
        inputs = features["inputs"]
        # <tf.float32>[batch_size, input_length, emb_dim]
        inputs = tf.squeeze(inputs, 2)
        # <tf.float32>[batch_size, input_length]
        context_padding = common_attention.embedding_to_padding(inputs)
        # <tf.float32>[batch_size]
        context_lens = tf.to_float(
            common_attention.padding_to_length(context_padding))
        # <tf.float32>[batch_size, 1]
        context_lens = tf.expand_dims(context_lens, -1)
        # Compute context vector summary.
        # <tf.float32>[batch_size, hidden_dim]
        context_vector_summary = compute_summary_embedding(
            encoder_output, context_lens, hparams)
        knowledge_encoder_output = compute_average_embedding(
            fact_embedding, fact_lengths)
        # <tf.float32>[batch_size, triple_num, emb_dim]
        knowledge_encoder_output = tf.reshape(
            knowledge_encoder_output,
            [-1, self.triple_num, encoder_hidden_dim])
        original_knowledge_encoder_output = knowledge_encoder_output
        if hparams.similarity_fuction == "dot_product":
            triple_logits = tf.squeeze(
                tf.matmul(knowledge_encoder_output,
                          tf.expand_dims(context_vector_summary, 2)), -1)
        elif hparams.similarity_fuction == "bilinear":
            # Tile the context vector summary.
            # <tf.float32>[batch_size, triple_num*hidden_dim]
            tiled_context_vector = tf.tile(context_vector_summary,
                                           [1, self.triple_num])
            # <tf.float32>[batch_size, triple_num, hidden_dim]
            context_vector = tf.reshape(
                tiled_context_vector,
                [-1, self.triple_num, encoder_hidden_dim])
            # compute outer product
            context_vector = tf.expand_dims(context_vector, -1)
            knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output,
                                                      2)
            # <tf.float32>[batch_size, triple_num, hidden_dim, hidden_dim]
            outer_product = tf.matmul(context_vector, knowledge_encoder_output)
            outer_product = tf.reshape(
                outer_product,
                [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
            triple_logits = tf.squeeze(
                tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"),
                -1)

        avg_triple_loss = 0.0
        triple_labels = features["triple_labels"]

        subject_mask = tf.reshape(
            features["subject_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        subject_mask = tf.reshape(subject_mask,
                                  [-1, hparams.max_triple_length])

        predicate_mask = tf.reshape(
            features["predicate_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        predicate_mask = tf.reshape(predicate_mask,
                                    [-1, hparams.max_triple_length])

        object_mask = tf.reshape(
            features["object_mask"],
            [-1, self.triple_num, hparams.max_triple_length])
        object_mask = tf.reshape(object_mask, [-1, hparams.max_triple_length])

        # mask : [bs, max_seq_len, triple_num]
        # the below operation will result in [bs*triple_num,emb_dim]
        subject_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(subject_mask, -1), 1),
            tf.float32)  # [bs*tn]
        object_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(object_mask, -1), 1), tf.float32)
        predicate_length = tf.cast(
            tf.expand_dims(tf.reduce_sum(predicate_mask, -1), 1), tf.float32)

        # expand dimension 2 to be able to broadcast
        subject_mask = tf.cast(tf.expand_dims(subject_mask, 2), tf.float32)
        predicate_mask = tf.cast(tf.expand_dims(predicate_mask, 2), tf.float32)
        object_mask = tf.cast(tf.expand_dims(object_mask, 2), tf.float32)

        subject_vect = tf.reduce_sum(tf.multiply(
            fact_embedding, subject_mask), 1) / (
                subject_length +
                tf.broadcast_to(tf.constant([1e-5]), tf.shape(subject_length)))
        object_vect = tf.reduce_sum(tf.multiply(
            fact_embedding, object_mask), 1) / (
                object_length +
                tf.broadcast_to(tf.constant([1e-5]), tf.shape(object_length)))
        predicate_vect = tf.reduce_sum(
            tf.multiply(fact_embedding, predicate_mask),
            1) / (predicate_length + tf.broadcast_to(
                tf.constant([1e-5]), tf.shape(predicate_length)))

        # Shuffled rows to generate adversarial samples
        shuffled_subject_vect = []
        shuffled_object_vect = []

        for _ in range(num_negative_samples):
            shuffled_subject_vect += [
                tf.gather(
                    subject_vect,
                    tf.random.shuffle(tf.range(tf.shape(subject_vect)[0])))
            ]  # [bs*tn,d]
            shuffled_object_vect += [
                tf.gather(
                    object_vect,
                    tf.random.shuffle(tf.range(tf.shape(object_vect)[0])))
            ]  # [bs*tn,d]

        # KB pretraining loss

        positive_loss = tf.reduce_mean(
            tf.squared_difference(subject_vect + predicate_vect, object_vect))
        negative_loss = 0
        for n_adv in range(num_negative_samples):
            negative_loss += tf.reduce_mean(
                tf.squared_difference(
                    shuffled_subject_vect[n_adv] + predicate_vect,
                    object_vect))
            negative_loss += tf.reduce_mean(
                tf.squared_difference(subject_vect + predicate_vect,
                                      shuffled_object_vect[n_adv]))

        # TransE Loss

        negative_loss = negative_loss / (2 * num_negative_samples)

        transe_loss = tf.clip_by_value(margin + positive_loss - negative_loss,
                                       clip_value_min=0,
                                       clip_value_max=100)
        if hparams.mode != tf.estimator.ModeKeys.PREDICT:
            triple_losses = tf.nn.weighted_cross_entropy_with_logits(
                labels=triple_labels,
                logits=triple_logits,
                pos_weight=hparams.pos_weight)
            avg_triple_loss = tf.reduce_mean(triple_losses)
            tf.summary.scalar("triple_loss", avg_triple_loss)

        return triple_logits, avg_triple_loss, original_knowledge_encoder_output, transe_loss
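
# A minimal NumPy sketch (not part of the original) of the max-margin TransE
# objective assembled above, shown for a single negative sample: positive
# triples should satisfy subject + predicate ~ object, shuffled (negative)
# triples should not, and the margin loss is clipped to [0, 100].
import numpy as np

def transe_loss(subj, pred, obj, subj_neg, obj_neg, margin):
    positive = np.mean((subj + pred - obj) ** 2)
    negative = 0.5 * (np.mean((subj_neg + pred - obj) ** 2)
                      + np.mean((subj + pred - obj_neg) ** 2))
    return np.clip(margin + positive - negative, 0.0, 100.0)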
Example #7
    def build_model(self, hps):
        """Define model architecture."""
        if hps.is_training:
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

        if hps.dec_model == 'lstm':
            cell_fn = rnn.LSTMCell
        elif hps.dec_model == 'layer_norm':
            cell_fn = rnn.LayerNormLSTMCell
        elif hps.dec_model == 'hyper':
            cell_fn = rnn.HyperLSTMCell
        else:
            assert False, 'please choose a respectable cell'

        if hps.enc_model == 'lstm':
            enc_cell_fn = rnn.LSTMCell
        elif hps.enc_model == 'layer_norm':
            enc_cell_fn = rnn.LayerNormLSTMCell
        elif hps.enc_model == 'hyper':
            enc_cell_fn = rnn.HyperLSTMCell
        else:
            assert False, 'please choose a respectable cell'

        use_recurrent_dropout = self.hps.use_recurrent_dropout
        use_input_dropout = self.hps.use_input_dropout
        use_output_dropout = self.hps.use_output_dropout

        cell = cell_fn(hps.dec_rnn_size,
                       use_recurrent_dropout=use_recurrent_dropout,
                       dropout_keep_prob=self.hps.recurrent_dropout_prob)

        if hps.conditional:  # vae mode:
            if hps.enc_model == 'hyper':
                self.enc_cell_fw = enc_cell_fn(
                    hps.enc_rnn_size,
                    use_recurrent_dropout=use_recurrent_dropout,
                    dropout_keep_prob=self.hps.recurrent_dropout_prob)
                self.enc_cell_bw = enc_cell_fn(
                    hps.enc_rnn_size,
                    use_recurrent_dropout=use_recurrent_dropout,
                    dropout_keep_prob=self.hps.recurrent_dropout_prob)
            else:
                self.enc_cell_fw = enc_cell_fn(
                    hps.enc_rnn_size,
                    use_recurrent_dropout=use_recurrent_dropout,
                    dropout_keep_prob=self.hps.recurrent_dropout_prob)
                self.enc_cell_bw = enc_cell_fn(
                    hps.enc_rnn_size,
                    use_recurrent_dropout=use_recurrent_dropout,
                    dropout_keep_prob=self.hps.recurrent_dropout_prob)

        # dropout:
        tf.logging.info('Input dropout mode = %s.', use_input_dropout)
        tf.logging.info('Output dropout mode = %s.', use_output_dropout)
        tf.logging.info('Recurrent dropout mode = %s.', use_recurrent_dropout)
        if use_input_dropout:
            tf.logging.info('Dropout to input w/ keep_prob = %4.4f.',
                            self.hps.input_dropout_prob)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell, input_keep_prob=self.hps.input_dropout_prob)
        if use_output_dropout:
            tf.logging.info('Dropout to output w/ keep_prob = %4.4f.',
                            self.hps.output_dropout_prob)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell, output_keep_prob=self.hps.output_dropout_prob)
        self.cell = cell

        self.sequence_lengths = tf.placeholder(dtype=tf.int32,
                                               shape=[self.hps.batch_size])
        self.input_data = tf.placeholder(
            dtype=tf.float32,
            shape=[self.hps.batch_size, self.hps.max_seq_len + 1, 5])

        # The target/expected vectors of strokes
        self.output_x = self.input_data[:, 1:self.hps.max_seq_len + 1, :]
        # vectors of strokes to be fed to decoder (same as above, but lagged behind
        # one step to include initial dummy value of (0, 0, 1, 0, 0))
        self.input_x = self.input_data[:, :self.hps.max_seq_len, :]

        # either do vae-bit and get z, or do unconditional, decoder-only
        if hps.conditional:  # vae mode:
            self.mean, self.presig = self.encoder(self.output_x,
                                                  self.sequence_lengths)
            self.sigma = tf.exp(self.presig /
                                2.0)  # sigma > 0. div 2.0 -> sqrt.
            eps = tf.random_normal((self.hps.batch_size, self.hps.z_size),
                                   0.0,
                                   self.hps.scale,
                                   dtype=tf.float32)
            self.batch_z = self.mean + tf.multiply(self.sigma, eps)
            # KL cost
            self.kl_cost = -0.5 * tf.reduce_mean(
                (1 + self.presig - tf.square(self.mean) - tf.exp(self.presig)))
            self.kl_cost = tf.maximum(self.kl_cost, self.hps.kl_tolerance)
            pre_tile_y = tf.reshape(self.batch_z,
                                    [self.hps.batch_size, 1, self.hps.z_size])
            overlay_x = tf.tile(pre_tile_y, [1, self.hps.max_seq_len, 1])
            actual_input_x = tf.concat([self.input_x, overlay_x], 2)
            self.initial_state = tf.nn.tanh(
                rnn.super_linear(self.batch_z,
                                 cell.state_size,
                                 init_w='gaussian',
                                 weight_start=0.001,
                                 input_size=self.hps.z_size))
        else:  # unconditional, decoder-only generation
            self.batch_z = tf.zeros((self.hps.batch_size, self.hps.z_size),
                                    dtype=tf.float32)
            self.kl_cost = tf.zeros([], dtype=tf.float32)
            actual_input_x = self.input_x
            self.initial_state = cell.zero_state(batch_size=hps.batch_size,
                                                 dtype=tf.float32)

        self.num_mixture = hps.num_mixture

        # TODO(deck): Better understand this comment.
        # Number of outputs is 3 (one logit per pen state) plus 6 per mixture
        # component: mean_x, stdev_x, mean_y, stdev_y, correlation_xy, and the
        # mixture weight/probability (Pi_k)
        n_out = (3 + self.num_mixture * 6)

        with tf.variable_scope('RNN'):
            output_w = tf.get_variable('output_w',
                                       [self.hps.dec_rnn_size, n_out])
            output_b = tf.get_variable('output_b', [n_out])

        # decoder module of sketch-rnn is below
        output, last_state = tf.nn.dynamic_rnn(
            cell,
            actual_input_x,
            initial_state=self.initial_state,
            time_major=False,
            swap_memory=True,
            dtype=tf.float32,
            scope='RNN')

        output = tf.reshape(output, [-1, hps.dec_rnn_size])
        output = tf.nn.xw_plus_b(output, output_w, output_b)
        self.final_state = last_state

        # NB: the below are inner functions, not methods of Model
        def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho):
            """Returns result of eq # 24 of http://arxiv.org/abs/1308.0850."""
            norm1 = tf.subtract(x1, mu1)
            norm2 = tf.subtract(x2, mu2)
            s1s2 = tf.multiply(s1, s2)
            # eq 25
            z = (tf.square(tf.div(norm1, s1)) + tf.square(tf.div(norm2, s2)) -
                 2 * tf.div(tf.multiply(rho, tf.multiply(norm1, norm2)), s1s2))
            neg_rho = 1 - tf.square(rho)
            result = tf.exp(tf.div(-z, 2 * neg_rho))
            denom = 2 * np.pi * tf.multiply(s1s2, tf.sqrt(neg_rho))
            result = tf.div(result, denom)
            return result

        def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr,
                         z_pen_logits, x1_data, x2_data, pen_data):
            """Returns a loss fn based on eq #26 of http://arxiv.org/abs/1308.0850."""
            # This represents the L_R only (i.e. does not include the KL loss term).

            result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2, z_sigma1,
                                   z_sigma2, z_corr)
            epsilon = 1e-6
            # result1 is the loss wrt pen offset (L_s in equation 9 of
            # https://arxiv.org/pdf/1704.03477.pdf)
            result1 = tf.multiply(result0, z_pi)
            result1 = tf.reduce_sum(result1, 1, keep_dims=True)
            result1 = -tf.log(result1 + epsilon)  # avoid log(0)

            fs = 1.0 - pen_data[:, 2]  # use training data for this
            fs = tf.reshape(fs, [-1, 1])
            # Zero out loss terms beyond N_s, the last actual stroke
            result1 = tf.multiply(result1, fs)

            # result2: loss wrt pen state, (L_p in equation 9)
            result2 = tf.nn.softmax_cross_entropy_with_logits(
                labels=pen_data, logits=z_pen_logits)
            result2 = tf.reshape(result2, [-1, 1])
            if not self.hps.is_training:  # eval mode, mask eos columns
                result2 = tf.multiply(result2, fs)

            result = result1 + result2
            return result

        # below is where we need to do MDN (Mixture Density Network) splitting of
        # distribution params
        def get_mixture_coef(output):
            """Returns the tf slices containing mdn dist params."""
            # This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850.
            z = output
            z_pen_logits = z[:, 0:3]  # pen states
            z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(
                z[:, 3:], 6, 1)

            # process output z's into MDN parameters

            # softmax all the pi's and pen states:
            z_pi = tf.nn.softmax(z_pi)
            z_pen = tf.nn.softmax(z_pen_logits)

            # exponentiate the sigmas and also make corr between -1 and 1.
            z_sigma1 = tf.exp(z_sigma1)
            z_sigma2 = tf.exp(z_sigma2)
            z_corr = tf.tanh(z_corr)

            r = [
                z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen,
                z_pen_logits
            ]
            return r

        out = get_mixture_coef(output)
        [o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_pen,
         o_pen_logits] = out

        self.pi = o_pi
        self.mu1 = o_mu1
        self.mu2 = o_mu2
        self.sigma1 = o_sigma1
        self.sigma2 = o_sigma2
        self.corr = o_corr
        self.pen_logits = o_pen_logits
        # pen state probabilities (result of applying softmax to self.pen_logits)
        self.pen = o_pen

        # reshape target data so that it is compatible with prediction shape
        target = tf.reshape(self.output_x, [-1, 5])
        [x1_data, x2_data, eos_data, eoc_data,
         cont_data] = tf.split(target, 5, 1)
        pen_data = tf.concat([eos_data, eoc_data, cont_data], 1)

        lossfunc = get_lossfunc(o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr,
                                o_pen_logits, x1_data, x2_data, pen_data)

        self.r_cost = tf.reduce_mean(lossfunc)

        if self.hps.is_training:
            self.lr = tf.Variable(self.hps.learning_rate, trainable=False)
            optimizer = tf.train.AdamOptimizer(self.lr)

            self.kl_weight = tf.Variable(self.hps.kl_weight_start,
                                         trainable=False)
            self.cost = self.r_cost + self.kl_cost * self.kl_weight

            gvs = optimizer.compute_gradients(self.cost)
            g = self.hps.grad_clip
            capped_gvs = [(tf.clip_by_value(grad, -g, g), var)
                          for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(
                capped_gvs, global_step=self.global_step, name='train_step')
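
# A minimal NumPy sketch (not part of the original) of eqs. 24-25 of
# http://arxiv.org/abs/1308.0850, the bivariate normal density evaluated by
# tf_2d_normal inside build_model above.
import numpy as np

def bivariate_normal(x1, x2, mu1, mu2, s1, s2, rho):
    n1, n2 = (x1 - mu1) / s1, (x2 - mu2) / s2
    z = n1 ** 2 + n2 ** 2 - 2.0 * rho * n1 * n2
    neg_rho = 1.0 - rho ** 2
    return np.exp(-z / (2.0 * neg_rho)) / (
        2.0 * np.pi * s1 * s2 * np.sqrt(neg_rho))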
Example #8

    def recenter(rv_constructor, *rv_args, **rv_kwargs):

        rv_name = rv_kwargs.get('name')
        rv_value = rv_kwargs.pop('value', None)

        base_bijector = None
        if rv_constructor.__name__ == 'TransformedDistribution':
            if (rv_args[1].__class__.__name__ == 'Invert'
                    and rv_args[1].bijector.__class__.__name__ == 'SoftClip'):
                distribution = rv_args[0]
                base_bijector = rv_args[1].bijector
                rv_constructor = distribution.__class__
                rv_kwargs = distribution.parameters
                rv_args = rv_args[2:]
                # We were given a value for the transformed RV. Let's pretend it was
                # for the original.
                if rv_value is not None:
                    rv_value = base_bijector.forward(rv_value)

        if (rv_constructor.__name__ == 'Normal'
                and not rv_name.startswith('y')):

            # NB: assume everything is kwargs for now.
            x_loc = rv_kwargs['loc']
            x_scale = rv_kwargs['scale']

            name = rv_kwargs['name']
            a, b, _ = get_or_init(name,
                                  loc_shape=tf.shape(x_loc),
                                  scale_shape=tf.shape(x_scale),
                                  parameterisation_type='scalar')

            kwargs_std = {}
            kwargs_std['loc'] = tf.multiply(x_loc, a)
            kwargs_std['scale'] = tf.pow(
                x_scale, b)  # tf.multiply(x_scale - 1., b) + 1.
            kwargs_std['name'] = name

            scale = x_scale / kwargs_std['scale']  # tf.pow(x_scale, 1. - b)
            shift = x_loc - tf.multiply(scale, kwargs_std['loc'])
            b = tfb.AffineScalar(scale=scale, shift=shift)
            if rv_value is not None:
                rv_value = b.inverse(rv_value)
            learnable_parameters[name +
                                 '_prior_mean'] = tf.convert_to_tensor(x_loc)
            learnable_parameters[name + '_prior_scale'] = tf.convert_to_tensor(
                x_scale)

            # If original RV was constrained, transform the constraint to the new
            # standardized RV. For now we assume a double-sided constraint.
            if base_bijector is not None:
                constraint_std = tfb.SoftClip(
                    low=b.inverse(base_bijector.low),
                    high=b.inverse(base_bijector.high),
                    hinge_softness=base_bijector.hinge_softness / scale
                    if base_bijector.hinge_softness is not None else None)
                rv_std = edward2.TransformedDistribution(
                    rv_constructor(**kwargs_std),
                    tfb.Invert(constraint_std),
                    value=constraint_std.inverse(rv_value)
                    if rv_value is not None else None)
                b = b(constraint_std)
            else:
                kwargs_std['value'] = rv_value
                rv_std = interceptable(rv_constructor)(*rv_args, **kwargs_std)
            bijectors[name] = b
            return b.forward(rv_std)

        elif ((rv_constructor.__name__.startswith('MultivariateNormal')
               or rv_constructor.__name__.startswith('GaussianProcess'))
              and not rv_kwargs['name'].startswith('y')):

            name = rv_kwargs['name']

            if rv_constructor.__name__.startswith('GaussianProcess'):
                gp_dist = rv_constructor(*rv_args, **rv_kwargs).distribution
                X = gp_dist._get_index_points()
                x_loc = gp_dist.mean_fn(X)
                x_cov = gp_dist._compute_covariance(index_points=X)
            else:
                x_loc = rv_kwargs['loc']
                x_cov = rv_kwargs['covariance_matrix']

            a, b, c = get_or_init(name,
                                  loc_shape=tf.shape(x_loc),
                                  scale_shape=tf.shape(x_cov)[:-1],
                                  parameterisation_type=parameterisation_type)
            ndims = tf.shape(x_cov)[-1]
            x_loc = tf.broadcast_to(x_loc, tf.shape(x_cov)[:-1])
            cov_dtype = tf.float64 if FLAGS.float64 else x_cov.dtype
            x_cov = tf.cast(x_cov, cov_dtype)
            if parameterisation_type == 'eig':
                """Extra cost of the eigendecomposition?

        we do the eig to get Lambda, Q.
        We rescale Lambda and create the prior dist linop
           - point one: the prior is an MVN (albeit an efficient one), where
              in NCP it's just Normal
        Then we construct the remaining scale matrix. (an n**3 matmul)
        And unlike a cholesky factor these matrices aren't triangular, so
        multiplication or division

        - can we
        """

                Lambda, Q = eigh_with_safe_gradient(x_cov)
                Lambda = tf.abs(Lambda)
                Lambda = tf.cast(Lambda, tf.float32)
                Q = tf.cast(Q, tf.float32)
                Lambda_hat_b = tf.pow(Lambda, b)
                if tied_pparams:
                    # If the scale parameterization is in the eigenbasis,
                    # apply it to the mean in the same basis.
                    loc_in_eigenbasis = tf.linalg.matvec(Q,
                                                         x_loc,
                                                         adjoint_a=True)
                    reparam_loc = tf.linalg.matvec(
                        Q, tf.multiply(loc_in_eigenbasis, a))
                else:
                    reparam_loc = tf.multiply(x_loc, a)

                kwargs_std = {}
                kwargs_std['loc'] = reparam_loc
                kwargs_std['scale'] = LinearOperatorEigenScale(
                    Q, d=tf.sqrt(Lambda_hat_b))
                kwargs_std['name'] = name

                Q_linop = LinearOperatorOrthogonal(Q, det_is_positive=True)
                scale = tf.linalg.LinearOperatorComposition([
                    Q_linop,
                    tf.linalg.LinearOperatorDiag(tf.sqrt(Lambda + 1e-10)),
                    tf.linalg.LinearOperatorDiag(
                        1. / tf.sqrt(Lambda_hat_b + 1e-10)),
                    Q_linop.adjoint(),
                ])
                shift = x_loc - scale.matvec(reparam_loc)
                b = tfb.AffineLinearOperator(scale=scale, shift=shift)

                if 'value' in rv_kwargs:
                    kwargs_std['value'] = b.inverse(rv_kwargs['value'])

            elif parameterisation_type == 'chol':
                L = tf.linalg.cholesky(x_cov +
                                       1e-6 * tf.eye(ndims, dtype=x_cov.dtype))
                L = tf.cast(L, tf.float32)

                reparam_loc = x_loc * a
                reparam_scale = tf.linalg.LinearOperatorLowerTriangular(
                    tf.linalg.diag(1 - b) + b[..., tf.newaxis] * L)
                kwargs_std = {}
                kwargs_std['loc'] = reparam_loc
                kwargs_std['scale'] = reparam_scale
                kwargs_std['name'] = name

                Dinv = tf.linalg.triangular_solve(
                    tf.cast(reparam_scale.to_dense(), cov_dtype),
                    tf.eye(ndims, dtype=cov_dtype))
                Dinv = tf.cast(Dinv, tf.float32)
                scale = tf.matmul(L, Dinv)
                shift = x_loc - tf.linalg.matvec(scale, reparam_loc)
                b = tfb.AffineLinearOperator(
                    scale=tf.linalg.LinearOperatorFullMatrix(scale),
                    shift=shift)
                if 'value' in rv_kwargs:
                    kwargs_std['value'] = b.inverse(rv_kwargs['value'])

            elif parameterisation_type == 'indep':
                # Assumes `C^-1 = diag(c)` is a learned diagonal matrix of 'evidence
                # precisions'. This approximates the true posterior under an iid
                # Gaussian observation model:
                prior_chol = tf.linalg.cholesky(x_cov)
                prior_inv = tf.linalg.cholesky_solve(
                    prior_chol, tf.eye(ndims, dtype=prior_chol.dtype))
                approx_posterior_prec = prior_inv + tf.cast(
                    tf.linalg.diag(c), prior_inv.dtype)
                approx_posterior_prec_chol = tf.linalg.cholesky(
                    approx_posterior_prec)
                approx_posterior_cov = tf.linalg.cholesky_solve(
                    approx_posterior_prec_chol,
                    tf.eye(ndims, dtype=approx_posterior_prec_chol.dtype))
                cov_chol = tf.linalg.cholesky(approx_posterior_cov)

                cov_chol = tf.cast(cov_chol, tf.float32)
                prior_chol = tf.cast(prior_chol, tf.float32)
                scale_linop = tf.linalg.LinearOperatorLowerTriangular(cov_chol)

                reparam_loc = x_loc * a
                reparam_scale = tf.linalg.LinearOperatorComposition([
                    tf.linalg.LinearOperatorInversion(scale_linop),
                    tf.linalg.LinearOperatorLowerTriangular(prior_chol)
                ])
                kwargs_std = {}
                kwargs_std['loc'] = reparam_loc
                kwargs_std['scale'] = reparam_scale
                kwargs_std['name'] = name

                shift = x_loc - scale_linop.matvec(reparam_loc)
                b = tfb.AffineLinearOperator(scale=scale_linop, shift=shift)
                if 'value' in rv_kwargs:
                    kwargs_std['value'] = b.inverse(rv_kwargs['value'])

            elif parameterisation_type == 'eigindep':
                # Combines 'eig' and 'indep' parameterizations, modeling the posterior
                # as
                # (V D**(-b) V' + diag(c))^-1
                # where VDV' is the eigendecomposition of the prior cov, and b and c
                # are learned vectors.
                b, c = [tf.cast(x, cov_dtype) for x in (b, c)]
                Lambda, Q = eigh_with_safe_gradient(x_cov)
                Lambda = tf.abs(Lambda)
                Lambda_hat_b = 1e-6 + tf.pow(Lambda, b)
                prior = tf.matmul(
                    Q,
                    tf.matmul(tf.linalg.diag(Lambda_hat_b), Q, adjoint_b=True))
                prior_chol = tf.linalg.cholesky(
                    prior + 1e-6 * tf.eye(ndims, dtype=prior.dtype))
                prior_prec = tf.linalg.cholesky_solve(
                    prior_chol + 1e-6 * tf.eye(ndims, dtype=prior_chol.dtype),
                    tf.eye(ndims, dtype=prior_chol.dtype))

                approx_posterior_prec = prior_prec + tf.linalg.diag(c)
                approx_posterior_prec_chol = tf.linalg.cholesky(
                    approx_posterior_prec)
                approx_posterior_cov = tf.linalg.cholesky_solve(
                    approx_posterior_prec_chol + 1e-6 *
                    tf.eye(ndims, dtype=approx_posterior_prec_chol.dtype),
                    tf.eye(ndims, dtype=approx_posterior_prec_chol.dtype))
                cov_chol = tf.linalg.cholesky(
                    approx_posterior_cov +
                    1e-6 * tf.eye(ndims, dtype=approx_posterior_cov.dtype))
                cov_chol = tf.cast(cov_chol, tf.float32)
                prior_chol = tf.cast(prior_chol, tf.float32)
                scale_linop = tf.linalg.LinearOperatorLowerTriangular(cov_chol)

                reparam_loc = tf.multiply(x_loc, a)

                reparam_scale = tf.linalg.LinearOperatorComposition([
                    tf.linalg.LinearOperatorInversion(scale_linop),
                    tf.linalg.LinearOperatorLowerTriangular(prior_chol)
                ])
                kwargs_std = {}
                kwargs_std['loc'] = reparam_loc
                kwargs_std['scale'] = reparam_scale
                kwargs_std['name'] = name

                shift = x_loc - scale_linop.matvec(reparam_loc)
                b = tfb.AffineLinearOperator(scale=scale_linop, shift=shift)
                if 'value' in rv_kwargs:
                    kwargs_std['value'] = b.inverse(rv_kwargs['value'])
            else:
                raise Exception('unrecognized reparameterization strategy!')

            if rv_constructor.__name__.startswith('GaussianProcess'):
                rv_std = edward2.MultivariateNormalLinearOperator(
                    *rv_args, **kwargs_std)
            else:
                rv_std = interceptable(rv_constructor)(*rv_args, **kwargs_std)

            bijectors[name] = b
            return b.forward(rv_std)
        else:
            return interceptable(rv_constructor)(*rv_args, **rv_kwargs)
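
# A minimal NumPy sketch (not part of the original) of the partially
# non-centered reparameterization applied above in the scalar Normal case:
# sample a standardized variable with loc a*mu and scale sigma**b, then map
# it back through the affine bijector; the result is still Normal(mu, sigma).
import numpy as np

def partially_noncentered_sample(mu, sigma, a, b, rng):
    x_std = rng.normal(a * mu, sigma ** b)
    scale = sigma / sigma ** b              # = sigma ** (1 - b)
    shift = mu - scale * (a * mu)
    return shift + scale * x_std            # mean mu, scale sigma

# Usage: partially_noncentered_sample(0.0, 2.0, 0.5, 0.3,
#                                     np.random.default_rng(0))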
Example #9

  def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = flags.desired_samples
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      if flags.wav:
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        wav_data = wav_decoder.audio
      else:
        wav_data = tf_np_load(self.wav_filename_placeholder_)

      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      # signal resampling to generate more training data
      # it will stretch or squeeze the input signal proportionally to:
      self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, [])

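      # NB: this comparison of a placeholder with a Python float is evaluated
      # at graph-construction time and is always True, so the resize branch
      # below is always built; a resampling factor of 1.0 then makes the
      # resize a no-op.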
      if self.foreground_resampling_placeholder_ != 1.0:
        image = tf.expand_dims(wav_data, 0)
        image = tf.expand_dims(image, 2)
        shape = tf.shape(wav_data)
        image_resized = tf.image.resize(
            images=image,
            size=(tf.cast((tf.cast(shape[0], tf.float32) *
                           self.foreground_resampling_placeholder_),
                          tf.int32), 1),
            preserve_aspect_ratio=False)
        image_resized_cropped = tf.image.resize_with_crop_or_pad(
            image_resized,
            target_height=desired_samples,
            target_width=1,
        )
        image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
        scaled_foreground = tf.multiply(image_resized_cropped,
                                        self.foreground_volume_placeholder_)
      else:
        scaled_foreground = tf.multiply(wav_data,
                                        self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          tensor=scaled_foreground,
          paddings=self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

      if flags.preprocess == 'raw':
        # background_clamp dims: [time, channels]
        # remove channel dim
        self.output_ = tf.squeeze(background_clamp, axis=1)
      # The options below are kept for backward compatibility with a previous
      # version of hotword detection on microcontrollers; in that case audio
      # feature extraction is done separately from the neural net and the
      # user has to manage it.
      elif flags.preprocess == 'mfcc':
        # Run the spectrogram and MFCC ops to get a 2D audio fingerprint
        # (short-time FFTs).
        # background_clamp dims: [time, channels]
        spectrogram = audio_ops.audio_spectrogram(
            background_clamp,
            window_size=flags.window_size_samples,
            stride=flags.window_stride_samples,
            magnitude_squared=flags.fft_magnitude_squared)
        # spectrogram: [channels/batch, frames, fft_feature]

        # Extract MFCC features from the spectrogram with audio_ops.mfcc:
        # 1. Takes spectrogram frames as input.
        # 2. Weights the spectrogram into bands using a triangular mel
        #    filterbank.
        # 3. Applies logarithmic scaling.
        # 4. Applies a discrete cosine transform (DCT) and keeps the lowest
        #    dct_coefficient_count coefficients.
        mfcc = audio_ops.mfcc(
            spectrogram=spectrogram,
            sample_rate=flags.sample_rate,
            upper_frequency_limit=flags.mel_upper_edge_hertz,
            lower_frequency_limit=flags.mel_lower_edge_hertz,
            filterbank_channel_count=flags.mel_num_bins,
            dct_coefficient_count=flags.dct_num_features)
        # mfcc: [channels/batch, frames, dct_coefficient_count]
        # remove channel dim
        self.output_ = tf.squeeze(mfcc, axis=0)
      elif flags.preprocess == 'micro':
        if not frontend_op:
          raise Exception(
              'Micro frontend op is currently not available when running'
              ' TensorFlow directly from Python, you need to build and run'
              ' through Bazel')
        int16_input = tf.cast(
            tf.multiply(background_clamp, du.MAX_ABS_INT16), tf.int16)
        # audio_microfrontend does:
        # 1. A slicing window function of raw audio
        # 2. Short-time FFTs
        # 3. Filterbank calculations
        # 4. Noise reduction
        # 5. PCAN Auto Gain Control
        # 6. Logarithmic scaling

        # int16_input dims: [time, channels]
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=flags.sample_rate,
            window_size=flags.window_size_ms,
            window_step=flags.window_stride_ms,
            num_channels=flags.mel_num_bins,
            upper_band_limit=flags.mel_upper_edge_hertz,
            lower_band_limit=flags.mel_lower_edge_hertz,
            out_scale=1,
            out_type=tf.float32)
        # micro_frontend dims: [frames, num_channels]
        self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "raw", '
                         '"mfcc", or "micro")' % flags.preprocess)
Beispiel #10
import numpy
import tensorflow as tf

rng = numpy.random
learning_rate = 0.01

# Training inputs: the excerpt omits train_X, so the values below are
# restored from the canonical TensorFlow-Examples linear-regression snippet
# that train_Y matches.
train_X = numpy.asarray([
    3.3, 4.4, 5.5, 6.71, 6.93, 4.168, 9.779, 6.182, 7.59, 2.167, 7.042,
    10.791, 5.313, 7.997, 5.654, 9.27, 3.1
])
train_Y = numpy.asarray([
    1.7, 2.76, 2.09, 3.19, 1.694, 1.573, 3.366, 2.596, 2.53, 1.221, 2.827,
    3.465, 1.65, 2.904, 2.42, 2.94, 1.3
])
n_samples = train_X.shape[0]

# tf Graph Input
X = tf.placeholder("float")
Y = tf.placeholder("float")

# Set model weights
W = tf.Variable(rng.randn(), name="weight")
b = tf.Variable(rng.randn(), name="bias")

# Construct a linear model
pred = tf.add(tf.multiply(X, W), b)

# Mean squared error
cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / (2 * n_samples)
# Gradient descent
#  Note, minimize() knows to modify W and b because Variable objects are trainable=True by default
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)
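
    # A hedged completion of the excerpt, following the usual pattern for
    # this example: fit W and b by feeding one (x, y) pair at a time.
    for epoch in range(1000):
        for (x, y) in zip(train_X, train_Y):
            sess.run(optimizer, feed_dict={X: x, Y: y})
    print("Final cost:", sess.run(cost, feed_dict={X: train_X, Y: train_Y}),
          "W:", sess.run(W), "b:", sess.run(b))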
Beispiel #11
def true_fn():
    return tf.multiply(x, 10)  # x * 10
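
# A hedged usage sketch (x, y, and false_fn are assumed to be defined in the
# original example): true_fn is meant to be passed to tf.cond, which selects
# a branch based on a boolean tensor.
#
#   def false_fn():
#       return tf.add(x, 10)  # x + 10
#
#   result = tf.cond(tf.less(x, y), true_fn, false_fn)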
Beispiel #12
test_house_price_norm = normalize(test_house_price)

#  Set up the TensorFlow placeholders that get updated as we descend down the gradient
tf_house_size = tf.placeholder("float", name="house_size")
tf_price = tf.placeholder("float", name="price")

# Define the variables holding the size_factor and price we set during training.
# We initialize them to some random values based on the normal distribution.
tf_size_factor = tf.Variable(np.random.randn(), name="size_factor")
tf_price_offset = tf.Variable(np.random.randn(), name="price_offset")

# 2. Define the operations for the predicting values - predicted price = (size_factor * house_size ) + price_offset
#  Notice, the use of the tensorflow add and multiply functions.  These add the operations to the computation graph,
#  AND the tensorflow methods understand how to deal with Tensors.  Therefore do not try to use numpy or other library
#  methods.
tf_price_pred = tf.add(tf.multiply(tf_size_factor, tf_house_size),
                       tf_price_offset)

# 3. Define the Loss Function (how much error) - Mean squared error
tf_cost = tf.reduce_sum(tf.pow(tf_price_pred - tf_price,
                               2)) / (2 * num_train_samples)

# Optimizer learning rate.  The size of the steps down the gradient
learning_rate = 0.1

# 4. define a Gradient descent optimizer that will minimize the loss defined in the operation "cost".
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_cost)

# Initializing the variables
init = tf.global_variables_initializer()
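
# A hedged sketch of how the pieces above are typically driven (assumed; the
# excerpt ends before the session, and the train_* names are illustrative):
#
#   with tf.Session() as sess:
#       sess.run(init)
#       for epoch in range(100):
#           for (x, y) in zip(train_house_size_norm, train_house_price_norm):
#               sess.run(optimizer, feed_dict={tf_house_size: x, tf_price: y})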
    def forward(self):

        X = self.phs['X']
        if not self.embedding: X = tf.cast(X, tf.float32) * (1.0 / 255)
        layer = self.apply_feature_extractor(X)
        fisher_ws = []
        fisher_diags = []
        fisher_diagcs = []
        fisher_old_ws = []

        n_layers = len(self.layer_sizes) - 1
        for i in range(n_layers):

            layer_name = "d%d" % (i + 1)

            layer = utils.dense2(layer,
                                 self.layer_sizes[i],
                                 self.layer_sizes[i + 1],
                                 name=layer_name)
            print('Applied dense (%d, %d) of name %s' %
                  (self.layer_sizes[i], self.layer_sizes[i + 1], layer_name))

            w = utils.get_var("%s/w" % layer_name)
            fisher_w_name = "fisher_diag_%s_w" % layer_name
            fisher_wc_name = "fisher_diag_%s_wc" % layer_name
            fisher_old_w_name = "fisher_old_%s_w" % layer_name
            self.vars[fisher_w_name] = tf.Variable(tf.zeros_like(w),
                                                   name=fisher_w_name)
            self.vars[fisher_wc_name] = tf.Variable(tf.zeros_like(w),
                                                    name=fisher_wc_name)
            self.vars[fisher_old_w_name] = tf.Variable(tf.zeros_like(w),
                                                       name=fisher_old_w_name)
            fisher_ws += [w]
            fisher_diags += [self.vars[fisher_w_name]]
            fisher_diagcs += [self.vars[fisher_wc_name]]
            fisher_old_ws += [self.vars[fisher_old_w_name]]

            b = utils.get_var("%s/b" % layer_name)
            fisher_b_name = "fisher_diag_%s_b" % layer_name
            fisher_bc_name = "fisher_diag_%s_bc" % layer_name
            fisher_old_b_name = "fisher_old_%s_b" % layer_name
            self.vars[fisher_b_name] = tf.Variable(tf.zeros_like(b),
                                                   name=fisher_b_name)
            self.vars[fisher_bc_name] = tf.Variable(tf.zeros_like(b),
                                                    name=fisher_bc_name)
            self.vars[fisher_old_b_name] = tf.Variable(tf.zeros_like(b),
                                                       name=fisher_old_b_name)
            fisher_ws += [b]
            fisher_diags += [self.vars[fisher_b_name]]
            fisher_diagcs += [self.vars[fisher_bc_name]]
            fisher_old_ws += [self.vars[fisher_old_b_name]]

            print('Created zero fishers')

            if i + 1 != len(self.layer_sizes) - 1:
                if self.use_dropout:
                    layer = self.activation(layer)
                    layer = tf.keras.layers.Dropout(
                        rate=self.dropoutv,
                        seed=self.seed)(layer, training=self.glob_training_ph)
                    print('Applied activation -> dropout')
                else:
                    layer = self.activation(layer)
                    print('Applied activation')

        self.vars['fX'] = layer
        self.objs['fisher_ws'] = fisher_ws
        self.objs['fisher_diagcs'] = fisher_diagcs
        self.objs['fisher_diags'] = fisher_diags
        self.objs['fisher_old_ws'] = fisher_old_ws

        # Create fisher graph
        print('Creating fisher batch_log_likelihood')

        fisher_X = tf.cast(self.phs['fisher_X'], tf.float32) * (1.0 / 255)
        fisher_Y = tf.one_hot(self.phs['fisher_Y'],
                              depth=self.layer_sizes[-1],
                              dtype=tf.float32)

        if self.feature_extractor_needed:
            fisher_X = self.apply_feature_extractor(fisher_X)
            fisher_Xs = [
                tf.reshape(fx, shape=(1, self.layer_sizes[0])) for fx in
                tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
            ]
        else:
            fisher_Xs = [
                tf.reshape(fx, shape=(1, *self.it.reshape_dims)) for fx in
                tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
            ]

        fisher_Ys = tf.unstack(fisher_Y, num=self.fisher_batch_size, axis=0)

        log_likelihoods = []
        fisher_var_lists = []

        for i in range(self.fisher_batch_size):

            raw_output = fisher_Xs[i]

            fisher_var_list = []
            for j in range(n_layers):

                layer_name = "d%d" % (j + 1)

                w = tf.identity(utils.get_var("%s/w" % layer_name))
                b = tf.identity(utils.get_var("%s/b" % layer_name))
                fisher_var_list += [w, b]
                raw_output = tf.add(tf.matmul(raw_output, w), b)

                if j + 1 != len(self.layer_sizes) - 1:

                    raw_output = self.activation(raw_output)
                    # No dropout; TODO

            log_likelihood = tf.multiply(fisher_Ys[i],
                                         tf.nn.log_softmax(raw_output))
            log_likelihoods += [log_likelihood]
            fisher_var_lists += [fisher_var_list]

        batch_log_likelihood = tf.reduce_sum(log_likelihoods)
        self.vars['batch_log_likelihood'] = batch_log_likelihood
        self.objs['fisher_var_lists'] = fisher_var_lists
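
        # A hedged sketch of how this graph is typically consumed (the
        # accumulation op itself is not shown in this excerpt): squaring the
        # per-example gradients of the log-likelihood and averaging them
        # estimates the Fisher diagonal, e.g.
        #
        #   for fvl in fisher_var_lists:
        #       grads = tf.gradients(batch_log_likelihood, fvl)
        #       for diagc, g in zip(fisher_diagcs, grads):
        #           diagc = tf.assign_add(diagc, tf.square(g) / n_examples)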
    def update_loss(self, n_task):

        if n_task == 0:
            return

        loss = self.vars['losses'][0] if self.use_orig_loss else self.vars[
            'losses'][n_task - 1]

        penalties = []
        old_vars = self.objs[
            'fisher_old_ws'] if self.use_latest_theta_star else self.saved_wts[
                n_task - 1]
        fisher_vars = self.objs[
            'fisher_diags'] if self.use_latest_theta_star else self.saved_fishers[
                n_task - 1]
        for var, old_var, fisher in zip(self.objs['fisher_ws'], old_vars,
                                        fisher_vars):

            penalties += [
                tf.multiply(fisher, tf.square(tf.subtract(var, old_var)))
            ]

        ewc_penalty = tf.add_n(
            [tf.reduce_sum(penalty) for penalty in penalties])
        if self.fisher_avg:
            ewc_penalty = tf.multiply(1.0 / n_task, ewc_penalty)
        new_loss = tf.add(
            loss,
            tf.multiply(tf.constant(self.ewc_const, tf.float32), ewc_penalty))

        self.vars['loss'] = new_loss
        self.vars['losses'][n_task] = new_loss
        self.vars['distances'][n_task] = self.setup_distances(n_task)

        orig_var_list = self.vars['orig_var_list']
        # print("Trainable vars: %s" % str(orig_var_list))
        print("Trainable vars:")
        self.print_vars(orig_var_list)
        if self.reset_opt:
            print('Reset opt')
            self.objs['sess'].run(
                tf.variables_initializer(self.objs['opt'].variables()))

        # op = self.objs['opt'].minimize(new_loss, var_list = orig_var_list)
        grads = self.objs['opt'].compute_gradients(new_loss,
                                                   var_list=orig_var_list)
        new_grads = []

        if self.correctmask:
            temp_masks = []
            init_ops = []
            for fi, mask in enumerate(self.all_masks[n_task]):
                temp_masks += [
                    tf.Variable(tf.zeros_like(mask), trainable=False)
                ]
                init_ops += [tf.assign(temp_masks[fi], mask)]
            self.objs['sess'].run(init_ops)
            print("Created temp_masks")
            for gv, mask, fd in zip(grads, temp_masks,
                                    self.objs['fisher_diags']):
                grad, var = gv
                s = self.objs['sess'].run(tf.reduce_sum(fd))
                fd_filtered = tf.identity(mask)
                num = self.objs['sess'].run(tf.reduce_sum(fd_filtered))
                total_num = self.objs['sess'].run(
                    tf.reduce_sum(tf.ones_like(fd_filtered)))
                print("%s => can modify %d/%d params, sum = %f" %
                      (var.name, num, total_num, s))
                new_grad = tf.multiply(fd_filtered, grad)
                new_grads += [(new_grad, var)]
        else:
            for gv, fd in zip(grads, self.objs['fisher_diags']):
                grad, var = gv
                fd_filtered, num, total_num = self.get_mask(fd, fix=self.fix)
                s = self.objs['sess'].run(tf.reduce_sum(fd))
                print("%s => can modify %d/%d params, sum = %f" %
                      (var.name, num, total_num, s))
                new_grad = tf.multiply(fd_filtered, grad)
                new_grads += [(new_grad, var)]

        op = self.objs['opt'].apply_gradients(new_grads)

        self.vars['train_op'] = op
        self.vars['train_ops'][n_task] = op

        print('Updated train_op and loss')
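
        # For reference, the penalty assembled above is the standard EWC
        # regularizer (lambda = self.ewc_const):
        #
        #   L(theta) = L_task(theta)
        #              + lambda * sum_i F_i * (theta_i - theta*_i)^2
        #
        # where F_i is the Fisher-diagonal estimate for parameter i and
        # theta*_i is its value saved after the previous task.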
Beispiel #15
    def build_model(self):
        """
        Build the inference model.
        :return: None
        """
        # create placeholder
        self.img_placeholder = tf.placeholder(dtype=tf.float32,
                                              shape=[
                                                  self.test_batch_size,
                                                  self.input_w, self.input_h,
                                                  self.input_c
                                              ])
        self.label_placeholder = tf.placeholder(dtype=tf.int32,
                                                shape=[self.test_batch_size])
        self.training_flag = tf.placeholder(dtype=tf.bool, shape=[])
        self.earlyexit_lossweights_placeholder = tf.placeholder(
            dtype=tf.float32, shape=[len(self.earlyexit_lossweights)])

        # create MODEL and build graph
        self.B_VGGNet_instance = B_VGGNet(num_class=self.num_class)
        [
            self.logits_exit0, self.logits_exit1, self.logits_exit2,
            self.logits_exit3
        ] = self.B_VGGNet_instance.model(self.img_placeholder,
                                         is_train=self.training_flag)

        # prediction from branches
        self.pred0 = tf.nn.softmax(self.logits_exit0)
        self.pred1 = tf.nn.softmax(self.logits_exit1)
        self.pred2 = tf.nn.softmax(self.logits_exit2)
        self.pred3 = tf.nn.softmax(self.logits_exit3)

        # logits of branches
        #print(logits_exit0.shape, logits_exit1.shape, logits_exit2.shape)
        self.loss_exit0 = cross_entropy(self.logits_exit0,
                                        self.label_placeholder)
        self.loss_exit1 = cross_entropy(self.logits_exit1,
                                        self.label_placeholder)
        self.loss_exit2 = cross_entropy(self.logits_exit2,
                                        self.label_placeholder)
        self.loss_exit3 = cross_entropy(self.logits_exit3,
                                        self.label_placeholder)
        self.total_loss = tf.reduce_sum(
            tf.multiply(self.earlyexit_lossweights_placeholder, [
                self.loss_exit0, self.loss_exit1, self.loss_exit2,
                self.loss_exit3
            ]))

        # accuracy from each branch
        self.train_acc0 = top_k_error(self.pred0, self.label_placeholder, 1)
        self.train_acc1 = top_k_error(self.pred1, self.label_placeholder, 1)
        self.train_acc2 = top_k_error(self.pred2, self.label_placeholder, 1)
        self.train_acc3 = top_k_error(self.pred3, self.label_placeholder, 1)

        # Initialize MODEL and create session
        self.sess = tf.Session()

        # Construct saver and restore graph
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess,
                           os.path.join(self.checkpoint_path, 'B_VGG.ckpt'))
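
        # A hedged usage sketch (names assumed from this class): the branch
        # losses are combined with weights fed at run time, e.g. weighting
        # later exits more heavily:
        #
        #   feed = {self.img_placeholder: images,
        #           self.label_placeholder: labels,
        #           self.training_flag: False,
        #           self.earlyexit_lossweights_placeholder: [0.1, 0.2, 0.3, 0.4]}
        #   loss_val = self.sess.run(self.total_loss, feed_dict=feed)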
Beispiel #16
def build_bifpn_layer(feats,
                      feat_sizes,
                      fpn_name,
                      fpn_config,
                      is_training,
                      fpn_num_filters,
                      min_level,
                      max_level,
                      separable_conv,
                      apply_bn_for_resampling,
                      conv_after_downsample,
                      use_native_resize_op,
                      conv_bn_relu_pattern,
                      pooling_type,
                      use_tpu=False):
    """Builds a feature pyramid given previous feature pyramid and config."""
    config = fpn_config or get_fpn_config(fpn_name, min_level, max_level)

    num_output_connections = [0 for _ in feats]
    for i, fnode in enumerate(config.nodes):
        with tf.variable_scope('fnode{}'.format(i)):
            logging.info('fnode %d : %s', i, fnode)
            new_node_width = feat_sizes[fnode['width_index']]
            nodes = []
            for idx, input_offset in enumerate(fnode['inputs_offsets']):
                input_node = feats[input_offset]
                num_output_connections[input_offset] += 1
                input_node = resample_feature_map(
                    input_node, '{}_{}_{}'.format(idx, input_offset,
                                                  len(feats)), new_node_width,
                    fpn_num_filters, apply_bn_for_resampling, is_training,
                    conv_after_downsample, use_native_resize_op, pooling_type)
                nodes.append(input_node)

            # Combine all nodes.
            dtype = nodes[0].dtype
            if config.weight_method == 'attn':
                edge_weights = [
                    tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype)
                    for _ in range(len(fnode['inputs_offsets']))
                ]
                normalized_weights = tf.nn.softmax(tf.stack(edge_weights))
                nodes = tf.stack(nodes, axis=-1)
                new_node = tf.reduce_sum(
                    tf.multiply(nodes, normalized_weights), -1)
            elif config.weight_method == 'fastattn':
                edge_weights = [
                    tf.nn.relu(
                        tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype))
                    for _ in range(len(fnode['inputs_offsets']))
                ]
                weights_sum = tf.add_n(edge_weights)
                nodes = [
                    nodes[i] * edge_weights[i] / (weights_sum + 0.0001)
                    for i in range(len(nodes))
                ]
                new_node = tf.add_n(nodes)
            elif config.weight_method == 'sum':
                new_node = tf.add_n(nodes)
            else:
                raise ValueError('unknown weight_method {}'.format(
                    config.weight_method))

            with tf.variable_scope('op_after_combine{}'.format(len(feats))):
                if not conv_bn_relu_pattern:
                    new_node = utils.relu_fn(new_node)

                if separable_conv:
                    conv_op = functools.partial(tf.layers.separable_conv2d,
                                                depth_multiplier=1)
                else:
                    conv_op = tf.layers.conv2d

                new_node = conv_op(
                    new_node,
                    filters=fpn_num_filters,
                    kernel_size=(3, 3),
                    padding='same',
                    use_bias=True if not conv_bn_relu_pattern else False,
                    name='conv')

                new_node = utils.batch_norm_relu(
                    new_node,
                    is_training_bn=is_training,
                    relu=False if not conv_bn_relu_pattern else True,
                    data_format='channels_last',
                    use_tpu=use_tpu,
                    name='bn')

            feats.append(new_node)
            num_output_connections.append(0)

    output_feats = {}
    for l in range(min_level, max_level + 1):
        for i, fnode in enumerate(reversed(config.nodes)):
            if fnode['width_index'] == l:
                output_feats[l] = feats[-1 - i]
                break
    return output_feats
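
# For reference, the two learned fusion schemes above reduce to (w_i are the
# per-input 'WSM' scalars, eps = 0.0001):
#
#   attn:     out = sum_i softmax(w)_i * node_i
#   fastattn: out = sum_i (relu(w_i) / (sum_j relu(w_j) + eps)) * node_i
#
# 'fastattn' avoids the softmax exponentials, which the EfficientDet paper
# reports as faster with comparable accuracy.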
Beispiel #17
    def meta_optimize(self):
        """Meta optimization step."""

        probe_images, probe_labels = self.probe_images, self.probe_labels
        labels = self.labels
        net = self.net
        logits = self.logits
        gate_gradients = 1

        batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)
        init_eps_val = float(1) / batch_size

        meta_net = networks.MetaImage(self.net, name='meta_model')

        if FLAGS.meta_momentum and not self.optimizer.variables():
            # Initialize the optimizer's momentum state for the meta momentum
            # update. This is a hacky workaround.
            logging.info('Pre-initialize optimizer momentum states.')
            idle_net_cost = tf.losses.sparse_softmax_cross_entropy(
                self.labels, logits)
            tmp_var_grads = self.optimizer.compute_gradients(
                tf.reduce_mean(idle_net_cost), net.trainable_variables)
            self.optimizer.apply_gradients(tmp_var_grads)

        with tf.name_scope('coefficient'):
            # Data weight coefficient
            target = tf.constant([init_eps_val] * batch_size,
                                 shape=(batch_size, ),
                                 dtype=np.float32,
                                 name='weight')
            # Data re-labeling coefficient
            eps = tf.constant([FLAGS.grad_eps_init] * batch_size,
                              shape=(batch_size, ),
                              dtype=tf.float32,
                              name='eps')

        onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
        onehot_labels = tf.cast(onehot_labels, tf.float32)
        eps_k = tf.reshape(eps, [batch_size, 1])

        mixed_labels = eps_k * onehot_labels + (1 - eps_k) * self.guessed_label
        # raw softmax loss
        log_softmax = tf.nn.log_softmax(logits)
        net_cost = -tf.reduce_sum(mixed_labels * log_softmax, 1)

        lookahead_loss = tf.reduce_sum(tf.multiply(target, net_cost))
        lookahead_loss = lookahead_loss + net.regularization_loss

        with tf.control_dependencies([lookahead_loss]):
            train_vars = net.trainable_variables
            var_grads = tf.gradients(lookahead_loss,
                                     train_vars,
                                     gate_gradients=gate_gradients)

            static_vars = []
            for i in range(len(train_vars)):
                if FLAGS.meta_momentum > 0:
                    actual_grad = self.meta_momentum_update(
                        var_grads[i], train_vars[i].name, self.optimizer)
                    static_vars.append(
                        tf.math.subtract(train_vars[i],
                                         FLAGS.meta_stepsize * actual_grad))
                else:
                    static_vars.append(
                        tf.math.subtract(train_vars[i],
                                         FLAGS.meta_stepsize * var_grads[i]))
                # new style
                meta_net.add_variable_alias(static_vars[-1],
                                            var_name=train_vars[i].name)

            for uv in net.updates_variables:
                meta_net.add_variable_alias(uv,
                                            var_name=uv.name,
                                            var_type='updates_variables')
            meta_net.verbose()

        with tf.control_dependencies(static_vars):
            g_logits = meta_net(probe_images,
                                name='meta_model',
                                reuse=True,
                                training=True)

            desired_y = tf.one_hot(probe_labels, self.dataset.num_classes)
            meta_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                desired_y, g_logits)
            meta_loss = tf.reduce_mean(meta_loss, name='meta_loss')
            meta_loss = meta_loss + meta_net.get_regularization_loss(net.wd)
            meta_acc, meta_acc_op = tf.metrics.accuracy(
                probe_labels, tf.argmax(g_logits, axis=1))

        with tf.control_dependencies([meta_loss] + [meta_acc_op]):
            meta_train_vars = meta_net.trainable_variables
            grad_meta_vars = tf.gradients(meta_loss,
                                          meta_train_vars,
                                          gate_gradients=gate_gradients)
            grad_target, grad_eps = tf.gradients(static_vars, [target, eps],
                                                 grad_ys=grad_meta_vars,
                                                 gate_gradients=gate_gradients)
        # updates weight
        raw_weight = target - grad_target
        raw_weight = raw_weight - init_eps_val
        unorm_weight = tf.clip_by_value(raw_weight,
                                        clip_value_min=0,
                                        clip_value_max=float('inf'))
        norm_c = tf.reduce_sum(unorm_weight)
        weight = tf.divide(unorm_weight, norm_c + 0.00001)

        # gets new lambda by the sign of gradient
        new_eps = tf.where(grad_eps < 0,
                           x=tf.ones_like(eps),
                           y=tf.zeros_like(eps))

        return tf.stop_gradient(weight), tf.stop_gradient(
            new_eps), meta_loss, meta_acc
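
    # In summary (a sketch of the update implemented above): each example
    # weight moves opposite its meta-gradient,
    #   w_raw = clip(target - d(meta_loss)/d(target) - 1/B, min=0)
    #   w     = w_raw / (sum(w_raw) + 1e-5),
    # and eps flips to 1 exactly where d(meta_loss)/d(eps) < 0, i.e. where
    # keeping the original label would lower the probe loss.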
Beispiel #18
    def single_score(self, u, i, feat_cate, feat_val, reuse=False):

        feat_val = tf.reshape(feat_val, shape=[-1, self.fieldSize, 1])
        u_emb = tf.nn.embedding_lookup(self.user_emb_w, u)  # [None,h]
        i_emb = tf.nn.embedding_lookup(self.item_emb_w, i)  # [None,h]
        feature_embeddings = tf.nn.embedding_lookup(self.feature_emb_w,
                                                    feat_cate)  # [None,h2]

        # first-order
        first_emb = tf.nn.embedding_lookup(self.w_first, feat_cate)
        y_first_part = tf.reduce_sum(tf.multiply(first_emb, feat_val),
                                     2)  # [None,f]

        # second-order
        emb = tf.multiply(feature_embeddings, feat_val)
        sum_squared_part = tf.square(tf.reduce_sum(emb, 1))
        squared_sum_part = tf.reduce_sum(tf.square(emb), 1)
        y_second_part = 0.5 * tf.subtract(sum_squared_part,
                                          squared_sum_part)  # [None * k]

        # fcn
        flat_emb = tf.reshape(feature_embeddings,
                              [-1, self.fieldSize * self.Hidden_units // 2])
        all_emb = tf.concat([u_emb, i_emb, flat_emb], axis=1)

        if reuse:
            bn_layer = tf.layers.batch_normalization(inputs=all_emb,
                                                     name='bn',
                                                     reuse=True)
            layer1 = tf.layers.dense(bn_layer,
                                     128,
                                     activation=tf.nn.sigmoid,
                                     name='f1',
                                     reuse=True)
            layer2 = tf.layers.dense(layer1,
                                     64,
                                     activation=tf.nn.sigmoid,
                                     name='f2',
                                     reuse=True)
            layer3 = tf.layers.dense(layer2,
                                     1,
                                     activation=tf.nn.sigmoid,
                                     name='f3',
                                     reuse=True)
            # deepfm
            deep_out = tf.concat([y_first_part, y_second_part, layer3], axis=1)
            res_out = tf.layers.dense(deep_out,
                                      1,
                                      activation=None,
                                      name='f4',
                                      reuse=True)
        else:
            bn_layer = tf.layers.batch_normalization(inputs=all_emb, name='bn')
            layer1 = tf.layers.dense(bn_layer,
                                     128,
                                     activation=tf.nn.sigmoid,
                                     name='f1')
            layer2 = tf.layers.dense(layer1,
                                     64,
                                     activation=tf.nn.sigmoid,
                                     name='f2')
            layer3 = tf.layers.dense(layer2,
                                     1,
                                     activation=tf.nn.sigmoid,
                                     name='f3')
            # deepfm
            deep_out = tf.concat([y_first_part, y_second_part, layer3], axis=1)
            res_out = tf.layers.dense(deep_out, 1, activation=None, name='f4')

        return res_out
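
    # For reference, the second-order block above uses the standard FM
    # identity (v_i are the weighted feature embeddings):
    #
    #   sum_{i<j} <v_i, v_j> = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2)
    #
    # which is what sum_squared_part and squared_sum_part compute, reducing
    # the pairwise interaction cost from O(f^2) to O(f).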
Beispiel #19
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    input_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    softmax_temperature=1.0,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    to_proj_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with tf.einsum as follows:
    Input_tensor: [BFD]
    Wq, Wk, Wv: [DNH]
    Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
    K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
    V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
    attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
    attention_probs:[BNFT] = softmax(attention_scores)
    context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
    Wout:[DNH]
    Output:[BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    input_mask: Only required when using to_proj_length.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    softmax_temperature: The temperature for the softmax attention.
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.
    to_proj_length: (Optional) Int. Down-project keys and values to this length.

  Returns:
    A tuple of the context float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads, size_per_head] and the attention probabilities of
      shape [batch_size, num_attention_heads, from_seq_length, to_seq_length].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    query_layer = dense_layer_3d(from_tensor, num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 query_act, "query")

    # `key_layer` = [B, T, N, H]
    key_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), key_act,
                               "key")

    # `value_layer` = [B, T, N, H]
    value_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                                 create_initializer(initializer_range),
                                 value_act, "value")

    if to_proj_length is not None:
        # This gives one project matrix per layer (shared by heads and value/key).
        # In the paper they also look into other sharing schemes.
        with tf.variable_scope("proj_seq_length"):
            proj_kernel = tf.get_variable(
                name="kernel",
                shape=[to_seq_length, to_proj_length],
                initializer=create_initializer(initializer_range))

        input_mask = tf.cast(input_mask, tf.float32)
        input_mask4d = tf.reshape(input_mask,
                                  (batch_size, to_seq_length, 1, 1))

        key_layer = key_layer * input_mask4d
        # [B, K, N, H]
        key_layer = tf.einsum("BTNH,TK->BKNH", key_layer, proj_kernel)

        value_layer = value_layer * input_mask4d
        # [B, K, N, H]
        value_layer = tf.einsum("BTNH,TK->BKNH", value_layer, proj_kernel)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    attention_scores = tf.einsum("BFNH,BTNH->BNFT",
                                 query_layer,
                                 key_layer,
                                 name="query_key_einsum")

    attention_scores = attention_scores / softmax_temperature
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None and to_proj_length is None:
        # `attention_mask` = [B, 1, F, T] or [B, N, F, T]
        # The caller can pass a rank 3 tensor for a constant mask or a rank 4
        # tensor for a per-head attention mask.
        attention_mask = tf.reshape(
            attention_mask,
            shape=[batch_size, -1, from_seq_length, to_seq_length])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        attention_mask_float = tf.cast(attention_mask, tf.float32)
        # Please keep this tf.where as it fixes back propagation issues: It removes
        # NaNs when using tf.math.log.
        attention_mask_float = tf.where(attention_mask_float > 0.0,
                                        attention_mask_float,
                                        tf.zeros_like(attention_mask_float))

        adder = tf.math.log(attention_mask_float)
        adder = tf.where(tf.is_finite(adder), adder,
                         tf.zeros_like(adder, dtype=tf.float32) - 10000.0)

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs_do = dropout(attention_probs, attention_probs_dropout_prob)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.einsum("BNFT,BTNH->BFNH",
                              attention_probs_do,
                              value_layer,
                              name="attention_value_einsum")

    return context_layer, attention_probs
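
# A hedged usage sketch (dense_layer_3d, create_initializer, and dropout come
# from the surrounding module; shapes are illustrative):
#
#   from_tensor = tf.placeholder(tf.float32, [8, 128, 512])  # [B, F, D]
#   context, probs = attention_layer(
#       from_tensor, from_tensor,  # self-attention
#       num_attention_heads=8, size_per_head=64)
#   # context: [B, F, N, H] = [8, 128, 8, 64]
#   # probs:   [B, N, F, T] = [8, 8, 128, 128]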
Beispiel #20
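# Assumed module-level context for this excerpt (not shown in the source):
# `import tensorflow as tf`, `import numpy as np`, `import cv2`,
# `import random`, `import pong`, `from collections import deque`, and
# hyperparameter constants such as ACTIONS, GAMMA, INITIAL_EPSILON,
# FINAL_EPSILON, EXPLORE, OBSERVE, REPLAY_MEMORY, and BATCH.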
def trainGraph(inp, out, sess):

    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])

    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)

    cost = tf.reduce_mean(tf.square(action - gt))

    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game = pong.PongGame()

    D = deque()

    frame = game.getPresentFrame()

    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)

    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)

    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()

    sess.run(tf.initialize_all_variables())

    t = 0
    epsilon = INITIAL_EPSILON

    while (1):

        out_t = out.eval(feed_dict={inp: [inp_t]})[0]

        argmax_t = np.zeros([ACTIONS])

        if (random.random() <= epsilon):
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        reward_t, frame = game.getNextFrame(argmax_t)

        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))

        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        D.append((inp_t, argmax_t, reward_t, inp_t1))

        if len(D) > REPLAY_MEMORY:
            D.popleft()

        if t > OBSERVE:

            minibatch = random.sample(D, BATCH)

            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        inp_t = inp_t1
        t = t + 1

        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Beispiel #21
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5,
                              use_einsum_gather=False):
    """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, it selectively crops and resizes
  boxes from the corresponding feature maps to generate the box features.

  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference). Specifically, for each feature map, we select an
  (output_size, output_size) set of pixels corresponding to the box location,
  and then use bilinear interpolation to select the feature value for each
  pixel.

  For performance, we perform the gather and interpolation on all layers as a
  single operation. In this op the multi-level features are first stacked and
  gathered into [2*output_size, 2*output_size] feature points. Then bilinear
  interpolation is performed on the gathered feature points to generate
  [output_size, output_size] RoIAlign feature map.

  Here is the step-by-step algorithm:
    1. The multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor. The Tensor contains four neighboring feature points for each
       vertex in the output grid.
    2. Compute the interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
       can be seen as stacking 2x2 interpolation kernels for all vertices in the
       output grid.
    3. Element-wise multiply the gathered features and interpolation kernel.
       Then apply 2x2 average pooling to reduce spatial dimension to
       output_size.

  Args:
    features: a 5-D tensor of shape [batch_size, num_levels, max_height,
      max_width, num_filters] where cropping and resizing are based.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
        in terms of the number of pixels of the corresponding feature map size.
    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
      the 0-based corresponding feature level index of each box.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
      from grid point.
    use_einsum_gather: whether to use einsum instead of gather. Replacing
      gather with einsum can improve performance when the feature size is not
      large, and einsum is friendly to model partitioning as well. Gather
      performs better when the feature size is very large and there are
      multiple box levels.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      representing the cropped features.
  """
    (batch_size, num_levels, max_feature_height, max_feature_width,
     num_filters) = features.get_shape().as_list()
    _, num_boxes, _ = boxes.get_shape().as_list()

    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
        boxes, boundaries, output_size, sample_offset)
    x_indices = tf.cast(tf.reshape(box_gridx0x1,
                                   [batch_size, num_boxes, output_size * 2]),
                        dtype=tf.int32)
    y_indices = tf.cast(tf.reshape(box_gridy0y1,
                                   [batch_size, num_boxes, output_size * 2]),
                        dtype=tf.int32)

    if use_einsum_gather:
        # Bilinear interpolation is done during the last two gathers:
        #   f(y, x) = [hy, ly] * [[f00, f01],  * [hx, lx]^T
        #                         [f10, f11]]
        #   [[f00, f01],
        #    [f10, f11]] = einsum(einsum(features, y_one_hot), x_one_hot)
        #   where [hy, ly] and [hx, lx] are the bilinear interpolation kernels.

        # shape is [batch_size, boxes, output_size, 2, 1]
        grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(
            box_gridy0y1, box_gridx0x1, max_feature_height, max_feature_width)

        # shape is [batch_size, num_boxes, output_size, height]
        grid_y_weight = tf.reduce_sum(tf.multiply(grid_y_one_hot, kernel_y),
                                      axis=-2)
        # shape is [batch_size, num_boxes, output_size, width]
        grid_x_weight = tf.reduce_sum(tf.multiply(grid_x_one_hot, kernel_x),
                                      axis=-2)

        # Gather for y_axis.
        # shape is [batch_size, num_boxes, output_size, width, features]
        features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
                                     tf.cast(grid_y_weight, features.dtype))
        # Gather for x_axis.
        # shape is [batch_size, num_boxes, output_size, output_size, features]
        features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
                                     tf.cast(grid_x_weight, features.dtype))
    else:
        height_dim_offset = max_feature_width
        level_dim_offset = max_feature_height * height_dim_offset
        batch_dim_offset = num_levels * level_dim_offset

        batch_size_offset = tf.tile(
            tf.reshape(
                tf.range(batch_size) * batch_dim_offset,
                [batch_size, 1, 1, 1]),
            [1, num_boxes, output_size * 2, output_size * 2])
        box_levels_offset = tf.tile(
            tf.reshape(box_levels * level_dim_offset,
                       [batch_size, num_boxes, 1, 1]),
            [1, 1, output_size * 2, output_size * 2])
        y_indices_offset = tf.tile(
            tf.reshape(y_indices * height_dim_offset,
                       [batch_size, num_boxes, output_size * 2, 1]),
            [1, 1, 1, output_size * 2])
        x_indices_offset = tf.tile(
            tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
            [1, 1, output_size * 2, 1])

        indices = tf.reshape(
            batch_size_offset + box_levels_offset + y_indices_offset +
            x_indices_offset, [-1])

        features = tf.reshape(features, [-1, num_filters])
        # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
        # performance.
        features_per_box = tf.reshape(tf.gather(features, indices), [
            batch_size, num_boxes, output_size * 2, output_size * 2,
            num_filters
        ])
        features_per_box = feature_bilinear_interpolation(
            features_per_box, kernel_y, kernel_x)

    return features_per_box
Beispiel #22
def evaluate(session, d, y):
    sub = tf.subtract(y, d)  # difference
    power = tf.multiply(sub, sub)  # square
    E = session.run(tf.reduce_sum(power))  # sum
    E /= 2  # divide by 2: E = 0.5 * sum((y - d)^2)
    return E
Beispiel #23
def scale_image_value(image):
    # scale values between -1 and +1
    image = tf.subtract(image, 0.5)
    image = tf.multiply(image, 2.0)
    return image
Beispiel #24
def tf_DotProduct(tensorA, tensorB):
    return tf.reduce_sum(tf.multiply(tensorA, tensorB),
                         axis=-1,
                         keep_dims=True)
Beispiel #25
from sklearn.datasets import load_iris
import tensorflow as tf

iris = load_iris()  # pick two closely related variables
X = iris.data
y_data = X[:, 2]  # petal length (column 3)
x_data = X[:, 3]  # petal width (column 4)

# Hyperparameters
learning_rate = 0.1  # learning rate: 0.01 -> 0.1
iter_size = 50  # number of training iterations: 50 -> 500

X = tf.placeholder(dtype=tf.float32, shape=[None])
y = tf.placeholder(dtype=tf.float32, shape=[None])
a = tf.Variable(tf.random_normal(shape=[1], seed=123))
b = tf.Variable(tf.random_normal(shape=[1], seed=123))

# Simple linear regression model
model_output = tf.add(tf.multiply(X, a), b)
'''cost function'''
cost_l1 = tf.reduce_mean(tf.abs(y - model_output))  #  L1
cost_l2 = tf.reduce_mean(tf.square(y - model_output))  # L2

# Optimize the L1 cost
opt_l1 = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_l1 = opt_l1.minimize(cost_l1)

# Optimize the L2 cost
opt_l2 = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_l2 = opt_l2.minimize(cost_l2)  # was opt_l1; use the L2 optimizer

sess = tf.Session()
sess.run(tf.global_variables_initializer())
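
# A hedged sketch of the missing training loop (the excerpt ends here):
#
#   for step in range(iter_size):
#       _, l1 = sess.run([train_l1, cost_l1], feed_dict={X: x_data, y: y_data})
#       _, l2 = sess.run([train_l2, cost_l2], feed_dict={X: x_data, y: y_data})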
    def prelu(self, inp, name):
        with tf.variable_scope(name):
            i = int(inp.get_shape()[-1])
            alpha = self.make_var('alpha', shape=(i,))
            output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp))
        return output
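
    # For reference: prelu computes f(x) = max(0, x) + alpha * min(0, x),
    # since -relu(-x) == min(0, x), with one learned alpha per input channel.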
Beispiel #27
def draw_samples(alpha, scale):
    r"""Draw samples from the robust distribution.

  This function implements Algorithm 1 of the paper. This code is written to allow
  for sampling from a set of different distributions, each parametrized by its
  own alpha and scale values, as opposed to the more standard approach of
  drawing N samples from the same distribution. This is done by repeatedly
  performing N instances of rejection sampling for each of the N distributions
  until at least one proposal for each of the N distributions has been accepted.
  All samples are drawn with a zero mean, to use a non-zero mean just add each
  mean to each sample.

  Args:
    alpha: A TF tensor/scalar or numpy array/scalar of floats where each element
      is the shape parameter of that element's distribution.
    scale: A TF tensor/scalar or numpy array/scalar of floats where each element
      is the scale parameter of that element's distribution. Must be the same
      shape as `alpha`.

  Returns:
    A TF tensor with the same shape and precision as `alpha` and `scale` where
    each element is a sample drawn from the distribution specified for that
    element by `alpha` and `scale`.
  """
    # `scale` must have the same type as `alpha`.
    float_dtype = alpha.dtype
    tf.assert_type(scale, float_dtype)
    assert_ops = [
        # `scale` must be > 0.
        tf.Assert(tf.reduce_all(scale > 0.), [scale]),
        # `alpha` must be >= 0.
        tf.Assert(tf.reduce_all(alpha >= 0.), [alpha]),
        # `alpha` and `scale` must have the same shape.
        tf.Assert(tf.reduce_all(tf.equal(tf.shape(alpha), tf.shape(scale))),
                  [tf.shape(alpha), tf.shape(scale)]),
    ]

    with tf.control_dependencies(assert_ops):
        shape = tf.shape(alpha)

        # The distributions we will need for rejection sampling. The sqrt(2) scaling
        # of the Cauchy distribution corrects for our differing conventions for
        # standardization.
        cauchy = tfp.distributions.Cauchy(loc=0., scale=tf.sqrt(2.))
        uniform = tfp.distributions.Uniform(low=0., high=1.)

        def while_cond(_, accepted):
            """Terminate the loop only when all samples have been accepted."""
            return ~tf.reduce_all(accepted)

        def while_body(samples, accepted):
            """Generate N proposal samples, and then perform rejection sampling."""
            # Draw N samples from a Cauchy, our proposal distribution.
            cauchy_sample = tf.cast(cauchy.sample(shape), float_dtype)

            # Compute the likelihood of each sample under its target distribution.
            nll = nllfun(cauchy_sample, alpha, tf.cast(1, float_dtype))
            # Bound the NLL. We don't use the approximate loss as it may cause
            # unpredictable behavior in the context of sampling.
            nll_bound = general.lossfun(
                cauchy_sample,
                tf.cast(0, float_dtype),
                tf.cast(1, float_dtype),
                approximate=False) + log_base_partition_function(alpha)

            # Draw N samples from a uniform distribution, and use each uniform sample
            # to decide whether or not to accept each proposal sample.
            uniform_sample = tf.cast(uniform.sample(shape), float_dtype)
            accept = uniform_sample <= tf.math.exp(nll_bound - nll)

            # If a sample is accepted, replace its element in `samples` with the
            # proposal sample, and set its bit in `accepted` to True.
            samples = tf.where(accept, cauchy_sample, samples)
            accepted = accept | accepted
            return (samples, accepted)

        # Initialize the loop. The first item does not matter as it will get
        # overwritten, the second item must be all False.
        while_loop_vars = (tf.zeros(shape,
                                    float_dtype), tf.zeros(shape, dtype=bool))

        # Perform rejection sampling until all N samples have been accepted.
        terminal_state = tf.while_loop(cond=while_cond,
                                       body=while_body,
                                       loop_vars=while_loop_vars)

        # Because our distribution is a location-scale family, we sample from
        # p(x | 0, \alpha, 1) and then scale each sample by `scale`.
        samples = tf.multiply(terminal_state[0], scale)

        return samples
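
# A hedged usage sketch for draw_samples (shapes illustrative; nllfun,
# general.lossfun, and log_base_partition_function come from the module this
# excerpt was taken from):
#
#   alpha = tf.fill([1000], 1.5)
#   scale = tf.fill([1000], 2.0)
#   samples = draw_samples(alpha, scale)  # -> [1000] tensor of samples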
# Hyperparameter settings
lamb = 0.001
batch_size = 80
learning_rate = 0.15
training_epochs = 2000
display_step = 20

# construct models
x = tf.placeholder('float32',[785,None])
y = tf.placeholder('float32',[5,None])
theta = tf.Variable(tf.zeros([785,5],dtype='float32') + 0.001)
x_next = tf.matmul(theta,x,transpose_a=True)

#%% gradient calculation for theta
sig = tf.exp(tf.matmul(theta,x,transpose_a=True))
grad_regression = tf.multiply(theta,2*lamb)
grad_softmax = tf.divide(sig,tf.reduce_sum(sig,0))
grad_LCL = -tf.matmul(x,tf.subtract(y,grad_softmax),transpose_b=True)
grad_LCL_regression  = tf.add(grad_LCL,grad_regression)
grad = tf.divide(grad_LCL_regression,batch_size)


print(grad.shape)

# update theta
theta_update = tf.assign(theta,tf.subtract(theta,learning_rate*grad))

# compare the estimated results with the true results
y2 = tf.argmax(sig,0)
y3 = tf.argmax(y,0)
score = tf.reduce_mean(tf.cast(tf.equal(y2,y3),'float32'))
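
# For reference, a sketch of the math the graph above encodes: with the
# column-wise softmax P = softmax(theta^T x), the regularized gradient is
#
#   grad = ( -x (y - P)^T + 2 * lamb * theta ) / batch_size
#
# and theta is updated by plain gradient descent through tf.assign.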
def add_input_distortions(flip_left_right, random_crop, random_scale,
                          random_brightness):
    """Creates the operations to apply the specified distortions.

  During training it can help to improve the results if we run the images
  through simple distortions like crops, scales, and flips. These reflect the
  kind of variations we expect in the real world, and so can help train the
  model to cope with natural data more effectively. Here we take the supplied
  parameters and construct a network of operations to apply them to an image.

  Cropping
  ~~~~~~~~

  Cropping is done by placing a bounding box at a random position in the full
  image. The cropping parameter controls the size of that box relative to the
  input image. If it's zero, then the box is the same size as the input and no
  cropping is performed. If the value is 50%, then the crop box will be half the
  width and height of the input. In a diagram it looks like this:

  <       width         >
  +---------------------+
  |                     |
  |   width - crop%     |
  |    <      >         |
  |    +------+         |
  |    |      |         |
  |    |      |         |
  |    |      |         |
  |    +------+         |
  |                     |
  |                     |
  +---------------------+

  Scaling
  ~~~~~~~

  Scaling is a lot like cropping, except that the bounding box is always
  centered and its size varies randomly within the given range. For example, if
  the scale percentage is zero, then the bounding box is the same size as the
  input and no scaling is applied. If it's 50%, then the bounding box is chosen
  at a random size between half and the full width and height of the input.

  Args:
    flip_left_right: Boolean whether to randomly mirror images horizontally.
    random_crop: Integer percentage setting the total margin used around the
      crop box.
    random_scale: Integer percentage of how much to vary the scale by.
    random_brightness: Integer range to randomly multiply the pixel values by.

  Returns:
    The jpeg input layer and the distorted result tensor.
  """

    jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
    decoded_image = tf.image.decode_jpeg(jpeg_data, channels=MODEL_INPUT_DEPTH)
    decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
    decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
    margin_scale = 1.0 + (random_crop / 100.0)
    resize_scale = 1.0 + (random_scale / 100.0)
    margin_scale_value = tf.constant(margin_scale)
    resize_scale_value = tf.random_uniform(tensor_shape.scalar(),
                                           minval=1.0,
                                           maxval=resize_scale)
    scale_value = tf.multiply(margin_scale_value, resize_scale_value)
    precrop_width = tf.multiply(scale_value, MODEL_INPUT_WIDTH)
    precrop_height = tf.multiply(scale_value, MODEL_INPUT_HEIGHT)
    precrop_shape = tf.stack([precrop_height, precrop_width])
    precrop_shape_as_int = tf.cast(precrop_shape, dtype=tf.int32)
    precropped_image = tf.image.resize_bilinear(decoded_image_4d,
                                                precrop_shape_as_int)
    precropped_image_3d = tf.squeeze(precropped_image, axis=[0])
    cropped_image = tf.random_crop(
        precropped_image_3d,
        [MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH, MODEL_INPUT_DEPTH])
    if flip_left_right:
        flipped_image = tf.image.random_flip_left_right(cropped_image)
    else:
        flipped_image = cropped_image
    brightness_min = 1.0 - (random_brightness / 100.0)
    brightness_max = 1.0 + (random_brightness / 100.0)
    brightness_value = tf.random_uniform(tensor_shape.scalar(),
                                         minval=brightness_min,
                                         maxval=brightness_max)
    brightened_image = tf.multiply(flipped_image, brightness_value)
    distort_result = tf.expand_dims(brightened_image, 0, name='DistortResult')
    return jpeg_data, distort_result
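
# Hedged usage sketch for add_input_distortions (an addition, not from the
# original source). It assumes the TF1-style runtime used above, the module
# constants MODEL_INPUT_WIDTH/HEIGHT/DEPTH, and a hypothetical 'example.jpg'.
jpeg_in, distorted_out = add_input_distortions(flip_left_right=True,
                                               random_crop=10,
                                               random_scale=10,
                                               random_brightness=10)
with tf.Session() as sess:
    with open('example.jpg', 'rb') as f:
        jpeg_bytes = f.read()
    # Each run re-samples the random scale, crop position, flip, and
    # brightness, so repeated calls yield different distorted images.
    distorted_image = sess.run(distorted_out, feed_dict={jpeg_in: jpeg_bytes})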
    def forward(self):

        X = self.phs['X']
        if not self.embedding:
            X = tf.cast(X, tf.float32) * (1.0 / 255)   # scale raw pixels to [0, 1]
        layer = self.apply_feature_extractor(X)
        fisher_ws = []
        fisher_diags = []
        fisher_diagcs = []
        fisher_old_ws = []

        n_layers = len(self.layer_sizes) - 1
        for i in range(n_layers):

            layer_name = "d%d" % (i + 1)

            layer = utils.dense2(layer,
                                 self.layer_sizes[i],
                                 self.layer_sizes[i + 1],
                                 name=layer_name)
            print('Applied dense (%d, %d) of name %s' %
                  (self.layer_sizes[i], self.layer_sizes[i + 1], layer_name))

            w = utils.get_var("%s/w" % layer_name)
            fisher_w_name = "fisher_diag_%s_w" % layer_name
            fisher_wc_name = "fisher_diag_%s_wc" % layer_name
            fisher_old_w_name = "fisher_old_%s_w" % layer_name
            self.vars[fisher_w_name] = tf.Variable(tf.zeros_like(w),
                                                   name=fisher_w_name)
            self.vars[fisher_wc_name] = tf.Variable(tf.zeros_like(w),
                                                    name=fisher_wc_name)
            self.vars[fisher_old_w_name] = tf.Variable(tf.zeros_like(w),
                                                       name=fisher_old_w_name)
            fisher_ws += [w]
            fisher_diags += [self.vars[fisher_w_name]]
            fisher_diagcs += [self.vars[fisher_wc_name]]
            fisher_old_ws += [self.vars[fisher_old_w_name]]

            b = utils.get_var("%s/b" % layer_name)
            fisher_b_name = "fisher_diag_%s_b" % layer_name
            fisher_bc_name = "fisher_diag_%s_bc" % layer_name
            fisher_old_b_name = "fisher_old_%s_b" % layer_name
            self.vars[fisher_b_name] = tf.Variable(tf.zeros_like(b),
                                                   name=fisher_b_name)
            self.vars[fisher_bc_name] = tf.Variable(tf.zeros_like(b),
                                                    name=fisher_bc_name)
            self.vars[fisher_old_b_name] = tf.Variable(tf.zeros_like(b),
                                                       name=fisher_old_b_name)
            fisher_ws += [b]
            fisher_diags += [self.vars[fisher_b_name]]
            fisher_diagcs += [self.vars[fisher_bc_name]]
            fisher_old_ws += [self.vars[fisher_old_b_name]]

            print('Created zero fishers')

            if i + 1 != len(self.layer_sizes) - 1:
                if self.use_dropout:
                    layer = self.activation(layer)
                    layer = tf.keras.layers.Dropout(
                        rate=self.dropoutv,
                        seed=self.seed)(layer, training=self.glob_training_ph)
                    print('Applied activation -> dropout')
                else:
                    layer = self.activation(layer)
                    print('Applied activation')

        self.vars['fX'] = layer
        self.objs['fisher_ws'] = fisher_ws
        self.objs['fisher_diagcs'] = fisher_diagcs
        self.objs['fisher_diags'] = fisher_diags
        self.objs['fisher_old_ws'] = fisher_old_ws

        # Create fisher graph
        print('Creating fisher batch_log_likelihood')

        fisher_X = tf.cast(self.phs['fisher_X'], tf.float32) * (1.0 / 255)
        fisher_Y = tf.one_hot(self.phs['fisher_Y'],
                              depth=self.layer_sizes[-1],
                              dtype=tf.float32)

        if self.feature_extractor_needed:
            fisher_X = self.apply_feature_extractor(fisher_X)
            fisher_Xs = [
                tf.reshape(fx, shape=(1, self.layer_sizes[0])) for fx in
                tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
            ]
        else:
            fisher_Xs = [
                tf.reshape(fx, shape=(1, *self.it.reshape_dims)) for fx in
                tf.unstack(fisher_X, num=self.fisher_batch_size, axis=0)
            ]

        fisher_Ys = tf.unstack(fisher_Y, num=self.fisher_batch_size, axis=0)

        fisher_var_lists = []

        # Class-wise predicted likelihoods (cases I and II) and label-matched
        # predicted likelihoods (cases III and IV)
        nout = self.layer_sizes[-1]
        onehots_n = tf.unstack(tf.one_hot(list(range(nout)), nout),
                               num=nout,
                               axis=0)
        if self.version == 'case1':
            jlikelihoods = {ii: [] for ii in range(nout)}
        if self.version == 'case2':
            jlikelihoodsqs = {ii: [] for ii in range(nout)}
        if self.version == 'case3':
            likelihoods = []
        if self.version == 'case4':
            likelihoodsqs = []

        for i in range(self.fisher_batch_size):

            raw_output = fisher_Xs[i]

            fisher_var_list = []
            for j in range(n_layers):

                layer_name = "d%d" % (j + 1)

                # tf.identity gives each example its own alias of the shared
                # weights, so gradients can later be taken per example rather
                # than summed over the batch.
                w = tf.identity(utils.get_var("%s/w" % layer_name))
                b = tf.identity(utils.get_var("%s/b" % layer_name))
                fisher_var_list += [w, b]
                raw_output = tf.add(tf.matmul(raw_output, w), b)

                if j + 1 != len(self.layer_sizes) - 1:

                    raw_output = self.activation(raw_output)
                    # No dropout; TODO

            fisher_var_lists += [fisher_var_list]

            # Case I, II, III, IV
            if self.version == 'case1':
                for key in jlikelihoods.keys():
                    jlikelihoods[key] += [
                        tf.multiply(onehots_n[key], tf.nn.softmax(raw_output))
                    ]
            if self.version == 'case2':
                for key in jlikelihoodsqs.keys():
                    jlikelihoodsqs[key] += [
                        tf.square(
                            tf.multiply(onehots_n[key],
                                        tf.nn.softmax(raw_output)))
                    ]
            if self.version == 'case3':
                likelihood = tf.multiply(fisher_Ys[i],
                                         tf.nn.softmax(raw_output))
                likelihoods += [likelihood]
            if self.version == 'case4':
                likelihood = tf.multiply(fisher_Ys[i],
                                         tf.nn.softmax(raw_output))
                likelihoodsq = tf.square(likelihood)
                likelihoodsqs += [likelihoodsq]

        self.objs['fisher_var_lists'] = fisher_var_lists

        # Finally, reduce_sum and add to vars
        if self.version == 'case1':
            jbatch_likelihood = {
                key: tf.reduce_sum(jlikelihoods[key])
                for key in jlikelihoods.keys()
            }
            self.vars['jbatch_likelihood'] = jbatch_likelihood
        if self.version == 'case2':
            jbatch_likelihoodsq = {
                key: tf.multiply(tf.constant(0.5),
                                 tf.reduce_sum(jlikelihoodsqs[key]))
                for key in jlikelihoodsqs.keys()
            }
            self.vars['jbatch_likelihoodsq'] = jbatch_likelihoodsq
        if self.version == 'case3':
            batch_likelihood = tf.reduce_sum(likelihoods)
            self.vars['batch_likelihood'] = batch_likelihood
        if self.version == 'case4':
            batch_likelihoodsq = tf.multiply(tf.constant(0.5),
                                             tf.reduce_sum(likelihoodsqs))
            self.vars['batch_likelihoodsq'] = batch_likelihoodsq
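
# Hedged, self-contained sketch (illustrative only; not this class's actual
# accumulation code): a common way to turn a per-example log-likelihood into
# an EWC Fisher diagonal is to accumulate the squared gradient into a
# zero-initialized variable, analogous to the `fisher_diagcs` accumulators
# above, then divide by the number of examples once the task ends.
x_ex = tf.placeholder(tf.float32, [1, 4])      # a single example
y_ex = tf.placeholder(tf.float32, [1, 3])      # its one-hot label
w_ex = tf.Variable(tf.random_normal([4, 3]))
log_lik = tf.reduce_sum(y_ex * tf.nn.log_softmax(tf.matmul(x_ex, w_ex)))
(g_w,) = tf.gradients(log_lik, [w_ex])
fisher_acc = tf.Variable(tf.zeros_like(w_ex))
# Run once per example; E[(d log p / dw)^2] approximates the Fisher diagonal.
accumulate_fisher = tf.assign_add(fisher_acc, tf.square(g_w))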