Example #1
def resnet_model_fn_w_pruning(features, labels, mode, params):
    """The model_fn for ResNet-50 with pruning.

  Args:
    features: A float32 batch of images.
    labels: A int32 batch of labels.
    mode: Specifies whether training or evaluation.
    params: Dictionary of parameters passed to the model.

  Returns:
    A TPUEstimatorSpec for the model
  """

    width = 1. if FLAGS.width <= 0 else FLAGS.width
    if isinstance(features, dict):
        features = features['feature']

    if FLAGS.data_format == 'channels_first':
        assert not FLAGS.transpose_input  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if FLAGS.transpose_input and mode != tf_estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    pruning_method = params['pruning_method']
    use_tpu = params['use_tpu']
    log_alpha_threshold = params['log_alpha_threshold']

    def build_network():
        """Construct the network in the graph."""
        model_pruning_method = pruning_method
        if pruning_method == 'scratch':
            model_pruning_method = 'threshold'

        network = resnet_model.resnet_v1_(
            resnet_depth=FLAGS.resnet_depth,
            num_classes=FLAGS.num_label_classes,
            # we need to construct the model with the pruning masks, but they won't
            # be updated if we're doing scratch training
            pruning_method=model_pruning_method,
            init_method=FLAGS.init_method,
            width=width,
            prune_first_layer=FLAGS.prune_first_layer,
            prune_last_layer=FLAGS.prune_last_layer,
            data_format=FLAGS.data_format,
            end_sparsity=FLAGS.end_sparsity,
            clip_log_alpha=FLAGS.clip_log_alpha,
            log_alpha_threshold=log_alpha_threshold,
            weight_decay=FLAGS.weight_decay)
        return network(inputs=features,
                       is_training=(mode == tf_estimator.ModeKeys.TRAIN))

    if FLAGS.precision == 'bfloat16':
        with contrib_tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif FLAGS.precision == 'float32':
        logits = build_network()

    if mode == tf_estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf_estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf_estimator.export.PredictOutput(predictions)
            })

    output_dir = params['output_dir']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)

    # Make sure we reuse the same label smoothing parameter if we're doing
    # scratch / lottery ticket experiments.
    label_smoothing = FLAGS.label_smoothing
    if FLAGS.pruning_method == 'scratch':
        label_smoothing = float(FLAGS.load_mask_dir.split('/')[15])
    loss = tf.losses.softmax_cross_entropy(logits=logits,
                                           onehot_labels=one_hot_labels,
                                           label_smoothing=label_smoothing)
    # Add regularization loss term
    loss += tf.losses.get_regularization_loss()

    if pruning_method == 'variational_dropout':
        reg_loss = utils.variational_dropout_dkl_loss(
            reg_scalar=FLAGS.reg_scalar,
            start_reg_ramp_up=FLAGS.sparsity_begin_step,
            end_reg_ramp_up=FLAGS.sparsity_end_step,
            warm_up=FLAGS.is_warm_up,
            use_tpu=use_tpu)
        loss += reg_loss
        tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)
    elif pruning_method == 'l0_regularization':
        reg_loss = utils.l0_regularization_loss(
            reg_scalar=FLAGS.reg_scalar,
            start_reg_ramp_up=FLAGS.sparsity_begin_step,
            end_reg_ramp_up=FLAGS.sparsity_end_step,
            warm_up=FLAGS.is_warm_up,
            use_tpu=use_tpu)
        loss += reg_loss
        tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)

    host_call = None
    if mode == tf_estimator.ModeKeys.TRAIN:
        host_call, train_op = train_function(pruning_method, loss, output_dir,
                                             use_tpu)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf_estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Calculate eval metrics."""
            logging.info('In metric function')
            eval_metrics = {}
            predictions = tf.cast(tf.argmax(logits, axis=1), tf.int32)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            eval_metrics['top_5_eval_accuracy'] = tf.metrics.mean(in_top_5)
            eval_metrics['eval_accuracy'] = tf.metrics.accuracy(
                labels=labels, predictions=predictions)

            return eval_metrics

        def vd_metric_fn(labels, logits, global_sparsity):
            eval_metrics = metric_fn(labels, logits)
            eval_metrics['global_sparsity'] = tf.metrics.mean(global_sparsity)
            return eval_metrics

        tensors = [labels, logits]
        metric_function = metric_fn

        if FLAGS.pruning_method == 'variational_dropout':
            batch_size = labels.shape[0]
            ones = tf.ones([batch_size, 1])
            mask_metrics = utils.add_vd_pruning_summaries(
                threshold=FLAGS.log_alpha_threshold)
            tensors.append(mask_metrics['global_sparsity'] * ones)
            metric_function = vd_metric_fn

        eval_metrics = (metric_function, tensors)

    # define a custom scaffold function to enable initializing the mask from an
    # already trained checkpoint.
    def initialize_mask_from_ckpt(ckpt_path):
        """Load mask from an existing checkpoint."""
        model_dir = FLAGS.output_dir
        already_has_ckpt = model_dir and tf.train.latest_checkpoint(
            model_dir) is not None
        if already_has_ckpt:
            tf.logging.info(
                'Training already started on this model, not loading masks '
                'from previously trained model')
            return

        reader = tf.train.NewCheckpointReader(ckpt_path)
        mask_names = reader.get_variable_to_shape_map().keys()
        mask_names = [x for x in mask_names if x.endswith('mask')]

        variable_map = {}
        for var in tf.global_variables():
            var_name = var.name.split(':')[0]
            if var_name in mask_names:
                tf.logging.info('Loading mask variable from checkpoint: %s',
                                var_name)
                variable_map[var_name] = var
            elif 'mask' in var_name:
                tf.logging.info(
                    'Cannot find mask variable in checkpoint, skipping: %s',
                    var_name)
        tf.train.init_from_checkpoint(ckpt_path, variable_map)

    def initialize_parameters_from_ckpt(ckpt_path):
        """Load parameters from an existing checkpoint."""
        model_dir = FLAGS.output_dir
        already_has_ckpt = model_dir and tf.train.latest_checkpoint(
            model_dir) is not None
        if already_has_ckpt:
            tf.logging.info(
                'Training already started on this model, not loading parameters '
                'from previously trained model')
            return

        reader = tf.train.NewCheckpointReader(ckpt_path)
        param_names = reader.get_variable_to_shape_map().keys()
        param_names = [x for x in param_names if not x.endswith('mask')]

        variable_map = {}
        for var in tf.global_variables():
            var_name = var.name.split(':')[0]
            if var_name in param_names:
                tf.logging.info(
                    'Loading parameter variable from checkpoint: %s', var_name)
                variable_map[var_name] = var
            elif 'mask' not in var_name:
                tf.logging.info(
                    'Cannot find parameter variable in checkpoint, skipping: %s',
                    var_name)
        tf.train.init_from_checkpoint(ckpt_path, variable_map)

    if FLAGS.pruning_method == 'scratch':
        if FLAGS.load_mask_dir:

            def scaffold_fn():
                initialize_mask_from_ckpt(FLAGS.load_mask_dir)
                if FLAGS.initial_value_checkpoint:
                    initialize_parameters_from_ckpt(
                        FLAGS.initial_value_checkpoint)
                return tf.train.Scaffold()
        else:
            raise ValueError(
                'Must supply a mask directory to use scratch method')
    else:
        scaffold_fn = None

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics,
                                        scaffold_fn=scaffold_fn)
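Below is a minimal wiring sketch showing how a model_fn like this is typically handed to a TPUEstimator with the params keys it reads. The RunConfig values, flag names, and imagenet_train_input_fn are illustrative assumptions, not part of the original script.

# Hypothetical wiring sketch; flag names and the input_fn below are assumptions.
from tensorflow.contrib import tpu as contrib_tpu

run_config = contrib_tpu.RunConfig(
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    tpu_config=contrib_tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop))

classifier = contrib_tpu.TPUEstimator(
    model_fn=resnet_model_fn_w_pruning,
    use_tpu=FLAGS.use_tpu,
    config=run_config,
    train_batch_size=FLAGS.train_batch_size,
    eval_batch_size=FLAGS.eval_batch_size,
    params={
        'pruning_method': FLAGS.pruning_method,
        'use_tpu': FLAGS.use_tpu,
        'log_alpha_threshold': FLAGS.log_alpha_threshold,
        'output_dir': FLAGS.output_dir,
    })

classifier.train(input_fn=imagenet_train_input_fn,  # hypothetical input_fn
                 max_steps=FLAGS.train_steps)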
Example #2
def transformer_xl(inp_k,
                   n_token,
                   n_layer,
                   d_model,
                   n_head,
                   d_head,
                   d_inner,
                   dropout,
                   dropatt,
                   attn_type,
                   bi_data,
                   initializer,
                   is_training,
                   mem_len=None,
                   inp_q=None,
                   mems=None,
                   same_length=False,
                   clamp_len=-1,
                   untie_r=False,
                   use_tpu=True,
                   input_mask=None,
                   perm_mask=None,
                   seg_id=None,
                   reuse_len=None,
                   ff_activation='relu',
                   target_mapping=None,
                   use_bfloat16=False,
                   scope='transformer',
                   **kwargs):
    """
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:

    inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
    seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
    input_mask: float32 Tensor in shape [len, bsz], the input mask.
      0 for real tokens and 1 for padding.
    mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
      from previous batches. The length of the list equals n_layer.
      If None, no memory is used.
    perm_mask: float32 Tensor in shape [len, len, bsz].
      If perm_mask[i, j, k] = 0, i attends to j in batch k;
      if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
      If None, each position attends to all the others.
    target_mapping: float32 Tensor in shape [num_predict, len, bsz].
      If target_mapping[i, j, k] = 1, the i-th prediction in batch k is
      on the j-th token.
      Only used during pretraining for partial prediction.
      Set to None during finetuning.
    inp_q: float32 Tensor in shape [len, bsz].
      1 for tokens with losses and 0 for tokens without losses.
      Only used during pretraining for two-stream attention.
      Set to None during finetuning.

    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.

    is_training: bool, whether in training mode.
    use_tpu: bool, whether TPUs are used.
    use_bfloat16: bool, use bfloat16 instead of float32.
    dropout: float, dropout rate.
    dropatt: float, dropout rate on attention probabilities.
    init: str, the initialization scheme, either "normal" or "uniform".
    init_range: float, initialize the parameters with a uniform distribution
      in [-init_range, init_range]. Only effective when init="uniform".
    init_std: float, initialize the parameters with a normal distribution
      with mean 0 and stddev init_std. Only effective when init="normal".
    mem_len: int, the number of tokens to cache.
    reuse_len: int, the number of tokens in the current batch to be cached
      and reused in the future.
    bi_data: bool, whether to use bidirectional input pipeline.
      Usually set to True during pretraining and False during finetuning.
    clamp_len: int, clamp all relative distances larger than clamp_len.
      -1 means no clamping.
    same_length: bool, whether to use the same attention length for each token.
    summary_type: str, "last", "first", "mean", or "attn". The method
      to pool the input to get a vector representation.
    initializer: A tf initializer.
    scope: scope name for the computation graph.
  """
    tf.logging.info('memory input {}'.format(mems))
    tf_float = tf.bfloat16 if use_bfloat16 else tf.float32
    tf.logging.info('Use float type {}'.format(tf_float))

    new_mems = []
    with tf.variable_scope(scope):
        if untie_r:
            r_w_bias = tf.get_variable(
                'r_w_bias',
                [n_layer, n_head, d_head],
                dtype=tf_float,
                initializer=initializer,
            )
            r_r_bias = tf.get_variable(
                'r_r_bias',
                [n_layer, n_head, d_head],
                dtype=tf_float,
                initializer=initializer,
            )
        else:
            r_w_bias = tf.get_variable(
                'r_w_bias',
                [n_head, d_head],
                dtype=tf_float,
                initializer=initializer,
            )
            r_r_bias = tf.get_variable(
                'r_r_bias',
                [n_head, d_head],
                dtype=tf_float,
                initializer=initializer,
            )

        bsz = tf.shape(inp_k)[1]
        qlen = tf.shape(inp_k)[0]
        mlen = tf.shape(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen

        # Attention mask
        # causal attention mask
        if attn_type == 'uni':
            attn_mask = _create_mask(qlen, mlen, tf_float, same_length)
            attn_mask = attn_mask[:, :, None, None]
        elif attn_type == 'bi':
            attn_mask = None
        else:
            raise ValueError(
                'Unsupported attention type: {}'.format(attn_type))

        # data mask: input mask & perm mask
        if input_mask is not None and perm_mask is not None:
            data_mask = input_mask[None] + perm_mask
        elif input_mask is not None and perm_mask is None:
            data_mask = input_mask[None]
        elif input_mask is None and perm_mask is not None:
            data_mask = perm_mask
        else:
            data_mask = None

        if data_mask is not None:
            # all mems can be attended to
            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
                                 dtype=tf_float)
            data_mask = tf.concat([mems_mask, data_mask], 1)
            if attn_mask is None:
                attn_mask = data_mask[:, :, :, None]
            else:
                attn_mask += data_mask[:, :, :, None]

        if attn_mask is not None:
            attn_mask = tf.cast(attn_mask > 0, dtype=tf_float)

        if attn_mask is not None:
            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
            non_tgt_mask = tf.concat(
                [tf.zeros([qlen, mlen], dtype=tf_float), non_tgt_mask],
                axis=-1,
            )
            non_tgt_mask = tf.cast(
                (attn_mask + non_tgt_mask[:, :, None, None]) > 0,
                dtype=tf_float,
            )
        else:
            non_tgt_mask = None

        # Word embedding
        word_emb_k, lookup_table, lookup_table_2 = embedding_lookup(
            x=inp_k,
            n_token=n_token,
            d_embed=128,
            hidden_size=d_model,
            initializer=initializer,
            use_tpu=use_tpu,
            dtype=tf_float,
            scope='word_embedding',
        )

        if inp_q is not None:
            with tf.variable_scope('mask_emb'):
                mask_emb = tf.get_variable('mask_emb', [1, 1, d_model],
                                           dtype=tf_float)
                if target_mapping is not None:
                    word_emb_q = tf.tile(mask_emb,
                                         [tf.shape(target_mapping)[0], bsz, 1])
                else:
                    inp_q_ext = inp_q[:, :, None]
                    word_emb_q = (inp_q_ext * mask_emb +
                                  (1 - inp_q_ext) * word_emb_k)
        output_h = tf.layers.dropout(word_emb_k, dropout, training=is_training)
        if inp_q is not None:
            output_g = tf.layers.dropout(word_emb_q,
                                         dropout,
                                         training=is_training)

        # Segment embedding
        if seg_id is not None:
            if untie_r:
                r_s_bias = tf.get_variable(
                    'r_s_bias',
                    [n_layer, n_head, d_head],
                    dtype=tf_float,
                    initializer=initializer,
                )
            else:
                # default case (tie)
                r_s_bias = tf.get_variable(
                    'r_s_bias',
                    [n_head, d_head],
                    dtype=tf_float,
                    initializer=initializer,
                )

            seg_embed = tf.get_variable(
                'seg_embed',
                [n_layer, 2, n_head, d_head],
                dtype=tf_float,
                initializer=initializer,
            )

            # Convert `seg_id` to one-hot `seg_mat`
            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
            cat_ids = tf.concat([mem_pad, seg_id], 0)

            # `1` indicates not in the same segment [qlen x klen x bsz]
            seg_mat = tf.cast(
                tf.logical_not(tf.equal(seg_id[:, None], cat_ids[None, :])),
                tf.int32,
            )
            seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
        else:
            seg_mat = None

        # Positional encoding
        pos_emb = relative_positional_encoding(
            qlen,
            klen,
            d_model,
            clamp_len,
            attn_type,
            bi_data,
            bsz=bsz,
            dtype=tf_float,
        )
        pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

        # Attention layers
        if mems is None:
            mems = [None] * n_layer

        name_variable_scope = 'layer_shared'

        for i in range(n_layer):
            # cache new mems
            new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len))

            # segment bias
            if seg_id is None:
                r_s_bias_i = None
                seg_embed_i = None
            else:
                r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i]
                seg_embed_i = seg_embed[i]

            with tf.variable_scope(name_variable_scope,
                                   reuse=True if i > 0 else False):
                if inp_q is not None:
                    output_h, output_g = two_stream_rel_attn(
                        h=output_h,
                        g=output_g,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask_h=non_tgt_mask,
                        attn_mask_g=attn_mask,
                        mems=mems[i],
                        target_mapping=target_mapping,
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                    )
                    reuse = True
                else:
                    reuse = False

                    output_h = rel_multihead_attn(
                        h=output_h,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask=non_tgt_mask,
                        mems=mems[i],
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                        reuse=reuse,
                    )

                if inp_q is not None:
                    output_g = positionwise_ffn(
                        inp=output_g,
                        d_model=d_model,
                        d_inner=d_inner,
                        dropout=dropout,
                        kernel_initializer=initializer,
                        activation_type=ff_activation,
                        is_training=is_training,
                    )

                output_h = positionwise_ffn(
                    inp=output_h,
                    d_model=d_model,
                    d_inner=d_inner,
                    dropout=dropout,
                    kernel_initializer=initializer,
                    activation_type=ff_activation,
                    is_training=is_training,
                    reuse=reuse,
                )

        if inp_q is not None:
            output = tf.layers.dropout(output_g, dropout, training=is_training)
        else:
            output = tf.layers.dropout(output_h, dropout, training=is_training)

        return output, new_mems, lookup_table, lookup_table_2
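An illustrative call sketch for transformer_xl follows; the hyperparameters and shapes are invented, and the helpers the function relies on (embedding_lookup, relative_positional_encoding, rel_multihead_attn, positionwise_ffn, _cache_mem) are assumed to be defined in the same module.

# Illustrative only: hyperparameters and shapes below are invented.
import tensorflow as tf

seq_len, batch_size = 128, 4
inp_k = tf.placeholder(tf.int32, [seq_len, batch_size])
initializer = tf.initializers.random_normal(stddev=0.02)

output, new_mems, lookup_table, lookup_table_2 = transformer_xl(
    inp_k=inp_k,
    n_token=32000,
    n_layer=6,
    d_model=512,
    n_head=8,
    d_head=64,
    d_inner=2048,
    dropout=0.1,
    dropatt=0.1,
    attn_type='bi',
    bi_data=False,
    initializer=initializer,
    is_training=True,
    use_tpu=False)
# output: [seq_len, batch_size, d_model]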
Example #3
    def __init__(self,
                 simulation,
                 rnn_dim,
                 rnn_cell,
                 my_scope,
                 num_actions,
                 internal_states=2,
                 learning_rate=0.0001,
                 extra_layer=False):
        """The network receives the observation from both eyes, processes it
        #through convolutional layers, concatenates it with the internal state
        #and feeds it to the RNN."""

        self.num_arms = len(
            simulation.fish.left_eye.vis_angles)  # Rays for each eye
        self.rnn_dim = rnn_dim
        self.rnn_output_size = self.rnn_dim
        self.actions = tf.placeholder(shape=[None],
                                      dtype=tf.int32,
                                      name='actions')
        self.actions_one_hot = tf.one_hot(self.actions,
                                          num_actions,
                                          dtype=tf.float32)

        self.prev_actions = tf.placeholder(shape=[None],
                                           dtype=tf.int32,
                                           name='prev_actions')
        self.prev_actions_one_hot = tf.one_hot(self.prev_actions,
                                               num_actions,
                                               dtype=tf.float32)

        self.internal_state = tf.placeholder(shape=[None, internal_states],
                                             dtype=tf.float32,
                                             name='internal_state')

        self.observation = tf.placeholder(shape=[None, 3, 2],
                                          dtype=tf.float32,
                                          name='obs')
        self.reshaped_observation = tf.reshape(self.observation,
                                               shape=[-1, self.num_arms, 3, 2])
        self.left_eye = self.reshaped_observation[:, :, :, 0]
        self.right_eye = self.reshaped_observation[:, :, :, 1]

        #                ------------ Common to Both ------------                   #

        self.exp_keep = tf.placeholder(shape=None, dtype=tf.float32)
        self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
        self.trainLength = tf.placeholder(dtype=tf.int32)
        self.batch_size = tf.placeholder(dtype=tf.int32, shape=[])
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)

        #                ------------ Normal network ------------                   #

        self.conv1l = tf.layers.conv1d(inputs=self.left_eye,
                                       filters=16,
                                       kernel_size=16,
                                       strides=4,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv1l')
        self.conv2l = tf.layers.conv1d(inputs=self.conv1l,
                                       filters=8,
                                       kernel_size=8,
                                       strides=2,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv2l')
        self.conv3l = tf.layers.conv1d(inputs=self.conv2l,
                                       filters=8,
                                       kernel_size=4,
                                       strides=1,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv3l')
        self.conv4l = tf.layers.conv1d(inputs=self.conv3l,
                                       filters=64,
                                       kernel_size=4,
                                       strides=1,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv4l')

        self.conv1r = tf.layers.conv1d(inputs=self.right_eye,
                                       filters=16,
                                       kernel_size=16,
                                       strides=4,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv1r')
        self.conv2r = tf.layers.conv1d(inputs=self.conv1r,
                                       filters=8,
                                       kernel_size=8,
                                       strides=2,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv2r')
        self.conv3r = tf.layers.conv1d(inputs=self.conv2r,
                                       filters=8,
                                       kernel_size=4,
                                       strides=1,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv3r')
        self.conv4r = tf.layers.conv1d(inputs=self.conv3r,
                                       filters=64,
                                       kernel_size=4,
                                       strides=1,
                                       padding='valid',
                                       activation=tf.nn.relu,
                                       name=my_scope + '_conv4r')

        # We take the output from the final convolutional layer and send it to a recurrent layer.
        # The input must be reshaped into [batch x trace x units] for rnn processing,
        # and then returned to [batch x units] when sent through the upper levels.

        self.conv4l_flat = tf.layers.flatten(self.conv4l)
        self.conv4r_flat = tf.layers.flatten(self.conv4r)

        self.conv_with_states = tf.concat([
            self.conv4l_flat, self.conv4r_flat, self.prev_actions_one_hot,
            self.internal_state
        ], 1)
        self.rnn_in = tf.layers.dense(
            self.conv_with_states,
            self.rnn_dim,
            activation=tf.nn.relu,
            kernel_initializer=tf.orthogonal_initializer,
            trainable=True,
            name=my_scope + '_rnn_in')
        self.convFlat = tf.reshape(
            self.rnn_in, [self.batch_size, self.trainLength, self.rnn_dim])

        self.rnn, self.rnn_state = tf.nn.dynamic_rnn(
            inputs=self.convFlat,
            cell=rnn_cell,
            dtype=tf.float32,
            initial_state=self.state_in,
            scope=my_scope + '_rnn',
        )
        self.rnn = tf.reshape(self.rnn, shape=[-1, self.rnn_dim])
        self.rnn_output = self.rnn

        if extra_layer:
            self.rnn_in2 = tf.layers.dense(
                self.rnn_output,
                self.rnn_dim,
                activation=tf.nn.relu,
                kernel_initializer=tf.orthogonal_initializer,
                trainable=True,
                name=my_scope + "_rnn_in_2")
            self.rnnFlat = tf.reshape(
                self.rnn_in2,
                [self.batch_size, self.trainLength, self.rnn_dim])

            self.rnn2, self.rnn_state2 = tf.nn.dynamic_rnn(
                inputs=self.rnnFlat,
                cell=rnn_cell,
                dtype=tf.float32,
                initial_state=self.state_in,
                scope=my_scope + '_rnn2')
            self.rnn2 = tf.reshape(self.rnn2, shape=[-1, self.rnn_dim])
            self.rnn2_output = self.rnn2
            # The output from the recurrent layer is then split into separate Value and Advantage streams
            self.streamA, self.streamV = tf.split(self.rnn2_output, 2, 1)

        else:
            self.rnn_state2 = self.rnn_state
            self.streamA, self.streamV = tf.split(self.rnn_output, 2, 1)
        self.AW = tf.Variable(tf.random_normal(
            [self.rnn_output_size // 2, num_actions]),
                              name=my_scope + "aw")
        self.VW = tf.Variable(tf.random_normal([self.rnn_output_size // 2, 1]),
                              name=my_scope + "vw")
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        #                ------------ Reflected network ------------                   #

        self.ref_left_eye = tf.reverse(self.right_eye,
                                       [1])  # TODO: Note swapping here.
        self.ref_right_eye = tf.reverse(self.left_eye, [1])

        self.conv1l_ref = tf.layers.conv1d(inputs=self.ref_left_eye,
                                           filters=16,
                                           kernel_size=16,
                                           strides=4,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv1l',
                                           reuse=True)
        self.conv2l_ref = tf.layers.conv1d(inputs=self.conv1l_ref,
                                           filters=8,
                                           kernel_size=8,
                                           strides=2,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv2l',
                                           reuse=True)
        self.conv3l_ref = tf.layers.conv1d(inputs=self.conv2l_ref,
                                           filters=8,
                                           kernel_size=4,
                                           strides=1,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv3l',
                                           reuse=True)
        self.conv4l_ref = tf.layers.conv1d(inputs=self.conv3l_ref,
                                           filters=64,
                                           kernel_size=4,
                                           strides=1,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv4l',
                                           reuse=True)

        self.conv1r_ref = tf.layers.conv1d(inputs=self.ref_right_eye,
                                           filters=16,
                                           kernel_size=16,
                                           strides=4,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv1r',
                                           reuse=True)
        self.conv2r_ref = tf.layers.conv1d(inputs=self.conv1r_ref,
                                           filters=8,
                                           kernel_size=8,
                                           strides=2,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv2r',
                                           reuse=True)
        self.conv3r_ref = tf.layers.conv1d(inputs=self.conv2r_ref,
                                           filters=8,
                                           kernel_size=4,
                                           strides=1,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv3r',
                                           reuse=True)
        self.conv4r_ref = tf.layers.conv1d(inputs=self.conv3r_ref,
                                           filters=64,
                                           kernel_size=4,
                                           strides=1,
                                           padding='valid',
                                           activation=tf.nn.relu,
                                           name=my_scope + '_conv4r',
                                           reuse=True)

        self.conv4l_flat_ref = tf.layers.flatten(self.conv4l_ref)
        self.conv4r_flat_ref = tf.layers.flatten(self.conv4r_ref)
        self.prev_actions_one_hot_rev = tf.reverse(self.prev_actions_one_hot,
                                                   [1])
        self.internal_state_rev = tf.reverse(self.internal_state, [1])

        self.conv_with_states_ref = tf.concat([
            self.conv4l_flat_ref, self.conv4r_flat_ref,
            self.prev_actions_one_hot_rev, self.internal_state_rev
        ], 1)
        self.rnn_in_ref = tf.layers.dense(
            self.conv_with_states_ref,
            self.rnn_dim,
            activation=tf.nn.relu,
            kernel_initializer=tf.orthogonal_initializer,
            trainable=True,
            name=my_scope + '_rnn_in',
            reuse=True)
        self.convFlat_ref = tf.reshape(
            self.rnn_in_ref, [self.batch_size, self.trainLength, self.rnn_dim])

        self.rnn_ref, self.rnn_state_ref = tf.nn.dynamic_rnn(
            inputs=self.convFlat_ref,
            cell=rnn_cell,
            dtype=tf.float32,
            initial_state=self.state_in,
            scope=my_scope + '_rnn')
        self.rnn_ref = tf.reshape(self.rnn_ref, shape=[-1, self.rnn_dim])
        self.rnn_output_ref = self.rnn_ref

        if extra_layer:
            self.rnn_in2_ref = tf.layers.dense(
                self.rnn_output_ref,
                self.rnn_dim,
                activation=tf.nn.relu,
                kernel_initializer=tf.orthogonal_initializer,
                trainable=True,
                name=my_scope + "_rnn_in_2",
                reuse=True)
            self.rnnFlat_ref = tf.reshape(
                self.rnn_in2_ref,
                [self.batch_size, self.trainLength, self.rnn_dim])

            self.rnn2_ref, self.rnn_state2_ref = tf.nn.dynamic_rnn(
                inputs=self.rnnFlat_ref,
                cell=rnn_cell,
                dtype=tf.float32,
                initial_state=self.state_in,
                scope=my_scope + '_rnn2')
            self.rnn2_ref = tf.reshape(self.rnn2_ref, shape=[-1, self.rnn_dim])
            self.rnn2_output_ref = self.rnn2_ref
            # The output from the recurrent layer is then split into separate Value and Advantage streams
            self.streamA_ref, self.streamV_ref = tf.split(
                self.rnn2_output_ref, 2, 1)

        else:
            self.rnn_state2_ref = self.rnn_state_ref
            self.streamA_ref, self.streamV_ref = tf.split(
                self.rnn_output_ref, 2, 1)

        self.Value_ref = tf.matmul(self.streamV_ref, self.VW)
        self.Advantage_ref = tf.matmul(self.streamA_ref, self.AW)

        # Swapping rows in advantage - Note that this is specific to the current action space and order
        self.Advantage_ref = tf.concat([
            self.Advantage_ref[:, :1], self.Advantage_ref[:, 2:3],
            self.Advantage_ref[:, 1:2], self.Advantage_ref[:, 3:4],
            self.Advantage_ref[:, 5:6], self.Advantage_ref[:, 4:5],
            self.Advantage_ref[:, 6:7], self.Advantage_ref[:, 8:9],
            self.Advantage_ref[:, 7:8], self.Advantage_ref[:, 9:]
        ], axis=1)

        #                ------------ Integrating Normal and Reflected ------------                   #

        self.Value_final = tf.divide(tf.add(self.Value, self.Value_ref), 2)
        self.Advantage_final = tf.divide(
            tf.add(self.Advantage, self.Advantage_ref), 2)

        self.salience = tf.gradients(self.Advantage_final, self.observation)
        # Then combine them together to get our final Q-values.
        self.Q_out = self.Value_final + tf.subtract(
            self.Advantage_final,
            tf.reduce_mean(self.Advantage_final, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Q_out, 1)
        self.Q_dist = tf.nn.softmax(self.Q_out / self.Temp)
        # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)

        self.Q = tf.reduce_sum(tf.multiply(self.Q_out, self.actions_one_hot),
                               axis=1)

        self.td_error = tf.square(self.targetQ - self.Q)

        # In order to only propagate accurate gradients through the network, we will mask the first
        # half of the losses for each trace, as per Lample & Chaplot 2016.
        self.maskA = tf.zeros([self.batch_size, self.trainLength // 2])
        self.maskB = tf.ones([self.batch_size, self.trainLength // 2])
        self.mask = tf.concat([self.maskA, self.maskB], 1)
        self.mask = tf.reshape(self.mask, [-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)

        self.trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.updateModel = self.trainer.minimize(self.loss)
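The Value/Advantage combination at the end of this constructor is the dueling-DQN aggregation Q = V + (A - mean(A)); the short NumPy sketch below (with invented numbers) shows the arithmetic in isolation.

# Standalone illustration of the dueling aggregation used above; numbers are invented.
import numpy as np

value = np.array([[0.5]])                 # V(s): [batch, 1]
advantage = np.array([[1.0, -1.0, 0.0]])  # A(s, a): [batch, num_actions]

q_values = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q_values)  # [[ 1.5 -0.5  0.5]]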
Example #4
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        # Create the variable outside the assertion to avoid TF2 compatibility
        # issues.
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range))

        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
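A hedged usage sketch for embedding_postprocessor, assuming the BERT-style helpers it references (get_shape_list, create_initializer, layer_norm_and_dropout) are in scope; the shapes, vocabulary size, and variable names are invented.

# Usage sketch; shapes, vocab size, and variable names are invented.
import tensorflow as tf

input_ids = tf.placeholder(tf.int32, [8, 128])      # [batch_size, seq_length]
token_type_ids = tf.placeholder(tf.int32, [8, 128])

embedding_table = tf.get_variable(
    'word_embeddings', [30522, 768],
    initializer=create_initializer(0.02))
word_embeddings = tf.nn.embedding_lookup(embedding_table, input_ids)

embedding_output = embedding_postprocessor(
    input_tensor=word_embeddings,                    # [8, 128, 768]
    use_token_type=True,
    token_type_ids=token_type_ids,
    token_type_vocab_size=2,
    use_position_embeddings=True,
    max_position_embeddings=512,
    dropout_prob=0.1)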
Example #5
    def build():
        """Builds the Tensorflow graph."""
        inputs, labels, lengths = None, None, None

        if mode in ('train', 'eval'):
            if isinstance(no_event_label, numbers.Number):
                label_shape = []
            else:
                label_shape = [len(no_event_label)]
            inputs, labels, lengths = magenta.common.get_padded_batch(
                sequence_example_file_paths,
                hparams.batch_size,
                input_size,
                label_shape=label_shape,
                shuffle=mode == 'train')

        elif mode == 'generate':
            inputs = tf.placeholder(tf.float32,
                                    [hparams.batch_size, None, input_size])

        if isinstance(encoder_decoder,
                      note_seq.OneHotIndexEventSequenceEncoderDecoder):
            expanded_inputs = tf.one_hot(
                tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
                encoder_decoder.input_depth)
        else:
            expanded_inputs = inputs

        dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

        cell = make_rnn_cell(hparams.rnn_layer_sizes,
                             dropout_keep_prob=dropout_keep_prob,
                             attn_length=hparams.attn_length,
                             residual_connections=hparams.residual_connections)

        initial_state = cell.zero_state(hparams.batch_size, tf.float32)

        outputs, final_state = tf.nn.dynamic_rnn(cell,
                                                 expanded_inputs,
                                                 sequence_length=lengths,
                                                 initial_state=initial_state,
                                                 swap_memory=True)

        outputs_flat = magenta.common.flatten_maybe_padded_sequences(
            outputs, lengths)
        if isinstance(num_classes, numbers.Number):
            num_logits = num_classes
        else:
            num_logits = sum(num_classes)
        logits_flat = tf_slim.layers.linear(outputs_flat, num_logits)

        if mode in ('train', 'eval'):
            labels_flat = magenta.common.flatten_maybe_padded_sequences(
                labels, lengths)

            if isinstance(num_classes, numbers.Number):
                softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_flat, logits=logits_flat)
                predictions_flat = tf.argmax(logits_flat, axis=1)
            else:
                logits_offsets = np.cumsum([0] + num_classes)
                softmax_cross_entropy = []
                predictions = []
                for i in range(len(num_classes)):
                    softmax_cross_entropy.append(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            labels=labels_flat[:, i],
                            logits=logits_flat[:, logits_offsets[i]:
                                               logits_offsets[i + 1]]))
                    predictions.append(
                        tf.argmax(
                            logits_flat[:,
                                        logits_offsets[i]:logits_offsets[i +
                                                                         1]],
                            axis=1))
                predictions_flat = tf.stack(predictions, 1)

            correct_predictions = tf.to_float(
                tf.equal(labels_flat, predictions_flat))
            event_positions = tf.to_float(
                tf.not_equal(labels_flat, no_event_label))
            no_event_positions = tf.to_float(
                tf.equal(labels_flat, no_event_label))

            # Compute the total number of time steps across all sequences in the
            # batch. For some models this will be different from the number of RNN
            # steps.
            def batch_labels_to_num_steps(batch_labels, lengths):
                num_steps = 0
                for labels, length in zip(batch_labels, lengths):
                    num_steps += encoder_decoder.labels_to_num_steps(
                        labels[:length])
                return np.float32(num_steps)

            num_steps = tf.py_func(batch_labels_to_num_steps,
                                   [labels, lengths], tf.float32)

            if mode == 'train':
                loss = tf.reduce_mean(softmax_cross_entropy)
                perplexity = tf.exp(loss)
                accuracy = tf.reduce_mean(correct_predictions)
                event_accuracy = (
                    tf.reduce_sum(correct_predictions * event_positions) /
                    tf.reduce_sum(event_positions))
                no_event_accuracy = (
                    tf.reduce_sum(correct_predictions * no_event_positions) /
                    tf.reduce_sum(no_event_positions))

                loss_per_step = tf.reduce_sum(
                    softmax_cross_entropy) / num_steps
                perplexity_per_step = tf.exp(loss_per_step)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=hparams.learning_rate)

                train_op = tf_slim.learning.create_train_op(
                    loss, optimizer, clip_gradient_norm=hparams.clip_norm)
                tf.add_to_collection('train_op', train_op)

                vars_to_summarize = {
                    'loss': loss,
                    'metrics/perplexity': perplexity,
                    'metrics/accuracy': accuracy,
                    'metrics/event_accuracy': event_accuracy,
                    'metrics/no_event_accuracy': no_event_accuracy,
                    'metrics/loss_per_step': loss_per_step,
                    'metrics/perplexity_per_step': perplexity_per_step,
                }
            elif mode == 'eval':
                vars_to_summarize, update_ops = tf_slim.metrics.aggregate_metric_map(
                    {
                        'loss':
                        tf.metrics.mean(softmax_cross_entropy),
                        'metrics/accuracy':
                        tf.metrics.accuracy(labels_flat, predictions_flat),
                        'metrics/per_class_accuracy':
                        tf.metrics.mean_per_class_accuracy(
                            labels_flat, predictions_flat, num_classes),
                        'metrics/event_accuracy':
                        tf.metrics.recall(event_positions,
                                          correct_predictions),
                        'metrics/no_event_accuracy':
                        tf.metrics.recall(no_event_positions,
                                          correct_predictions),
                        'metrics/loss_per_step':
                        tf.metrics.mean(tf.reduce_sum(softmax_cross_entropy) /
                                        num_steps,
                                        weights=num_steps),
                    })
                for updates_op in update_ops.values():
                    tf.add_to_collection('eval_ops', updates_op)

                # Perplexity is just exp(loss) and doesn't need its own update op.
                vars_to_summarize['metrics/perplexity'] = tf.exp(
                    vars_to_summarize['loss'])
                vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
                    vars_to_summarize['metrics/loss_per_step'])

            for var_name, var_value in vars_to_summarize.items():
                tf.summary.scalar(var_name, var_value)
                tf.add_to_collection(var_name, var_value)

        elif mode == 'generate':
            temperature = tf.placeholder(tf.float32, [])
            if isinstance(num_classes, numbers.Number):
                softmax_flat = tf.nn.softmax(
                    tf.div(logits_flat, tf.fill([num_classes], temperature)))
                softmax = tf.reshape(softmax_flat,
                                     [hparams.batch_size, -1, num_classes])
            else:
                logits_offsets = np.cumsum([0] + num_classes)
                softmax = []
                for i in range(len(num_classes)):
                    sm = tf.nn.softmax(
                        tf.div(
                            logits_flat[:,
                                        logits_offsets[i]:logits_offsets[i +
                                                                         1]],
                            tf.fill([num_classes[i]], temperature)))
                    sm = tf.reshape(sm,
                                    [hparams.batch_size, -1, num_classes[i]])
                    softmax.append(sm)

            tf.add_to_collection('inputs', inputs)
            tf.add_to_collection('temperature', temperature)
            tf.add_to_collection('softmax', softmax)
            # Flatten state tuples for metagraph compatibility.
            for state in tf.nest.flatten(initial_state):
                tf.add_to_collection('initial_state', state)
            for state in tf.nest.flatten(final_state):
                tf.add_to_collection('final_state', state)
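When num_classes is a list, the graph above slices the flat logits with offsets from np.cumsum([0] + num_classes); the small NumPy sketch below (invented numbers) shows that slicing on its own.

# Minimal illustration of the logits_offsets slicing; numbers are invented.
import numpy as np

num_classes = [3, 2]
logits_offsets = np.cumsum([0] + num_classes)   # array([0, 3, 5])
logits_flat = np.array([[0.1, 0.2, 0.3, 0.4, 0.5]])

groups = [logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]]
          for i in range(len(num_classes))]
# groups[0].shape == (1, 3); groups[1].shape == (1, 2)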
Example #6
def batch_gather_by_one_hot(params: tf.Tensor,
                            indices: tf.Tensor,
                            batch_dims: Optional[int] = None,
                            name: Optional[Text] = None) -> tf.Tensor:
  """Performs a batched version of gather using tf.one_hot multiplication.

  The first `batch_dims` dimensions of `params` and `indices` must match in
  shape.

  This is intended for TPU friendliness but comes with additional complexity
  costs. In particular, the materialized one-hot tensor has
  `lookup_size * indices.shape.num_elements()` elements.
  The time complexity is higher by a factor of `lookup_size` also.

  Args:
    params: <float32>[...some_batch_dims, lookup_size, ...] Tensor of values to
      gather from.
    indices: <int>[...some_batch_dims, ...index_dims...] Tensor of ids to index
      into `params`. Any values outside the range [0, lookup_size) will
      translate to 0 values in the output.
    batch_dims: Number of batched dimensions. Must be positive. Defaults to
      len(indices.shape) - 1.
    name: A name for the operation (optional).

  Returns:
    [indices.shape, params.shape[(batch_dims+1):]] Tensor.
  """
  # We rename `batch_dims` to `num_batch_dims` since it refers to a single
  # integer rather than a list of the dimensions themselves. The argument
  # name is kept to match `tf.gather`.
  num_batch_dims = batch_dims
  del batch_dims

  with tf.name_scope(name or 'batch_gather_by_one_hot'):
    params = tf.convert_to_tensor(params)
    indices = tf.convert_to_tensor(indices)

    if num_batch_dims is None:
      num_batch_dims = len(indices.shape) - 1
    if num_batch_dims <= 0:
      raise ValueError('`num_batch_dims` must be positive.')
    if len(params.shape) <= num_batch_dims:
      raise ValueError('`params` has too few dimensions.')
    if len(indices.shape) < num_batch_dims:
      raise ValueError('`indices` has too few dimensions.')
    if not params.shape[:num_batch_dims].is_compatible_with(
        indices.shape[:num_batch_dims]):
      raise ValueError('`params` and `indices` must have compatible batch '
                       'dimensions.')

    lookup_size = tf.shape(params)[num_batch_dims]

    # Flatten all "index_dims" in `indices` into a single dimension.
    flat_indices_shape = tf.concat([tf.shape(indices)[:num_batch_dims], [-1]],
                                   0)
    flat_indices = tf.reshape(indices, flat_indices_shape)
    one_hot_matrices = tf.one_hot(flat_indices, lookup_size, dtype=params.dtype)

    # Flatten all `params` dims after the "lookup_size" dimension. (If there
    # aren't any, then expand a final dimension.)
    flat_params_shape = tf.concat(
        [tf.shape(params)[:(num_batch_dims + 1)], [-1]], 0)
    flat_params = tf.reshape(params, flat_params_shape)

    flat_result = tf.matmul(one_hot_matrices, flat_params)
    output_shape = tf.concat(
        [tf.shape(indices),
         tf.shape(params)[(num_batch_dims + 1):]], 0)
    return tf.reshape(flat_result, output_shape)
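A small usage sketch, assuming a TF1-style graph environment in which batch_gather_by_one_hot above is importable; shapes follow the docstring, and the result matches tf.gather with batch_dims=1.

import tensorflow as tf  # TF1-style API assumed, as in the snippet above

params = tf.reshape(tf.range(2 * 5 * 3, dtype=tf.float32), [2, 5, 3])  # [batch=2, lookup_size=5, 3]
indices = tf.constant([[0, 4], [2, 2]])                                 # [batch=2, 2]
gathered = batch_gather_by_one_hot(params, indices)                     # -> [2, 2, 3]

with tf.Session() as sess:
    print(sess.run(gathered))  # rows 0 and 4 from batch 0, row 2 twice from batch 1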
Exemple #7
0
    def call(self,
             yesno_logits,
             yesno_labels,
             supporting_fact_logits,
             supporting_fact_labels,
             block_ids,
             num_replicas=None,
             eps=0):
        """Calls the layer.

    Args:
      yesno_logits: <float32>[batch_size, 3] Logits per position.
      yesno_labels: <int32>[batch_size] Yes/no/span class labels per position.
      supporting_fact_logits: <float32>[batch_size] Logits per position for
        supporting facts classification.
      supporting_fact_labels: <int32>[batch_size] Supporting fact labels.
      block_ids: <int32>[batch_size] Block IDs of every sample in the batch.
      num_replicas: Number of replicas to gather summaries from. If None
        (default) then cross-replica summaries are not used.
      eps: <float> Small constant for numerical stability.

    Returns:
      A tuple (yes_no_span_loss, supporting_facts_loss) of scalar <float32>
      losses.
    """
        batch_size = tf.shape(supporting_fact_logits)[0]
        supporting_fact_logits = tf.expand_dims(supporting_fact_logits, 1)
        supporting_fact_labels = tf.expand_dims(supporting_fact_labels, 1)
        example_mask = tf.cast(tf.expand_dims(tf.not_equal(block_ids, 0), 1),
                               tf.float32)

        # (1) Aggregate block_ids across global batch. Compute cross block mask.
        all_block_ids = block_ids
        if num_replicas:
            all_block_ids = tpu_utils.cross_replica_concat(
                tensor=all_block_ids,
                num_replicas=num_replicas,
                name='block_ids_concat')

        # [batch_size, global_batch_size]
        cross_blocks_eq_mask = tf.cast(
            tf.equal(tf.expand_dims(block_ids, 1),
                     tf.expand_dims(all_block_ids, 0)), tf.float32)

        # (2) Apply softmax over all positions in the (global) batch
        # across the blocks with the same `block_id`.

        # [batch_size, 3, 1]
        yes_no_span_probs = losses.cross_batch_softmax(
            tf.expand_dims(yesno_logits, 2), cross_blocks_eq_mask,
            num_replicas)
        yes_no_span_probs = tf.squeeze(yes_no_span_probs, 2)

        # [batch_size, 1]
        supporting_facts_probs = losses.cross_batch_softmax(
            tf.expand_dims(supporting_fact_logits, 2), cross_blocks_eq_mask,
            num_replicas)
        supporting_facts_probs = tf.squeeze(supporting_facts_probs, 2)

        # (3) Prepare one-hot labels based on annotation begins and ends

        supporting_fact_labels = tf.cast(supporting_fact_labels, tf.float32)

        # [batch_size, 3]
        yes_no_span_one_hot = tf.one_hot(yesno_labels,
                                         depth=3,
                                         dtype=tf.float32)
        yes_no_span_one_hot = yes_no_span_one_hot * supporting_fact_labels

        # (4) Compute the probability of the current begin / end positions across
        # the blocks with the same `block_id`.

        def mean_loss(all_losses):
            return tf.reduce_sum(all_losses * example_mask) / (
                tf.reduce_sum(example_mask) + eps)

        supporting_facts_loss = -mean_loss(
            tf.log(supporting_facts_probs * supporting_fact_labels + eps))

        yes_no_span_loss = -mean_loss(
            tf.log(yes_no_span_probs * yes_no_span_one_hot + eps))

        return yes_no_span_loss, supporting_facts_loss
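For intuition, a minimal NumPy sketch (hypothetical values; the project's losses.cross_batch_softmax and tpu_utils helpers are not reproduced) of the block-equality mask built in step (1) above:

import numpy as np

block_ids = np.array([7, 7, 3, 0])        # block id 0 marks padding examples
all_block_ids = block_ids                  # single replica, so no cross_replica_concat
# [batch_size, global_batch_size]: 1.0 where two examples share a block id
cross_blocks_eq_mask = (block_ids[:, None] == all_block_ids[None, :]).astype(np.float32)
print(cross_blocks_eq_mask)
# [[1. 1. 0. 0.]
#  [1. 1. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]]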
Exemple #8
0
    def testLoss(self):
        """
        Tests the loss of the FasterRCNN
        """

        # Create prediction_dict's structure
        prediction_dict_random = {
            'rpn_prediction': {},
            'classification_prediction': {
                'rcnn': {
                    'cls_score': None,
                    'bbox_offsets': None
                },
                'target': {},
                '_debug': {
                    'losses': {}
                }
            }
        }
        prediction_dict_perf = {
            'rpn_prediction': {},
            'classification_prediction': {
                'rcnn': {
                    'cls_score': None,
                    'bbox_offsets': None
                },
                'target': {},
                '_debug': {
                    'losses': {}
                }
            }
        }

        # Set seeds for stable results
        rand_seed = 13
        target_seed = 43
        image_size = (60, 80)
        num_anchors = 1000

        config = EasyDict(self.config)
        config.model.rpn.l2_regularization_scale = 0.0
        config.model.rcnn.l2_regularization_scale = 0.0
        config.model.base_network.arg_scope.weight_decay = 0.0

        #   RPN

        # Random generation of cls_targets for rpn
        # where:
        #       {-1}:   Ignore
        #       { 0}:   Background
        #       { 1}:   Object
        rpn_cls_target = tf.floor(
            tf.random_uniform([num_anchors],
                              minval=-1,
                              maxval=2,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None))

        # Creation of cls_scores with:
        #   score 100 in correct class
        #   score 0 in wrong class

        # Generation of opposite cls_score for rpn
        rpn_cls_score = tf.cast(
            tf.one_hot(tf.cast(tf.mod(tf.identity(rpn_cls_target) + 1, 2),
                               tf.int32),
                       depth=2,
                       on_value=10), tf.float32)
        # Generation of correct cls_score for rpn
        rpn_cls_perf_score = tf.cast(
            tf.one_hot(tf.cast(tf.identity(rpn_cls_target), tf.int32),
                       depth=2,
                       on_value=100), tf.float32)

        # Random generation of target bbox deltas
        rpn_bbox_target = tf.floor(
            tf.random_uniform([num_anchors, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None))

        # Random generation of predicted bbox deltas
        rpn_bbox_predictions = tf.floor(
            tf.random_uniform([num_anchors, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=rand_seed,
                              name=None))

        prediction_dict_random['rpn_prediction'][
            'rpn_cls_score'] = rpn_cls_score
        prediction_dict_random['rpn_prediction'][
            'rpn_cls_target'] = rpn_cls_target
        prediction_dict_random['rpn_prediction'][
            'rpn_bbox_target'] = rpn_bbox_target
        prediction_dict_random['rpn_prediction'][
            'rpn_bbox_pred'] = rpn_bbox_predictions

        prediction_dict_perf['rpn_prediction'][
            'rpn_cls_score'] = rpn_cls_perf_score
        prediction_dict_perf['rpn_prediction'][
            'rpn_cls_target'] = rpn_cls_target
        prediction_dict_perf['rpn_prediction'][
            'rpn_bbox_target'] = rpn_bbox_target
        prediction_dict_perf['rpn_prediction'][
            'rpn_bbox_pred'] = rpn_bbox_target

        #   RCNN

        # Set the number of classes
        num_classes = config.model.network.num_classes

        # Randomly generate the bbox_offsets for the correct class = 1
        prediction_dict_random['classification_prediction']['target'] = {
            'bbox_offsets':
            tf.random_uniform([1, 4],
                              minval=-1,
                              maxval=1,
                              dtype=tf.float32,
                              seed=target_seed,
                              name=None),
            'cls': [1]
        }

        # Set the same bbox_offsets and cls for the perfect prediction
        prediction_dict_perf['classification_prediction'][
            'target'] = prediction_dict_random['classification_prediction'][
                'target'].copy()

        # Generate random scores for the num_classes + the background class
        rcnn_cls_score = tf.random_uniform([1, num_classes + 1],
                                           minval=-100,
                                           maxval=100,
                                           dtype=tf.float32,
                                           seed=rand_seed,
                                           name=None)

        # Generate a perfect prediction with the correct class score = 100
        # and the rest set to 0
        rcnn_cls_perf_score = tf.cast(
            tf.one_hot([1], depth=num_classes + 1, on_value=100), tf.float32)

        # Generate the random delta prediction for each class
        rcnn_bbox_offsets = tf.random_uniform([1, num_classes * 4],
                                              minval=-1,
                                              maxval=1,
                                              dtype=tf.float32,
                                              seed=rand_seed,
                                              name=None)

        # Copy the random prediction and set the correct class prediction
        # as the target one
        target_bbox_offsets = prediction_dict_random[
            'classification_prediction']['target']['bbox_offsets']
        initial_val = 1 * 4  # cls value * 4
        rcnn_bbox_perf_offsets = tf.Variable(
            tf.reshape(
                tf.random_uniform([1, num_classes * 4],
                                  minval=-1,
                                  maxval=1,
                                  dtype=tf.float32,
                                  seed=target_seed,
                                  name=None), [-1]))
        rcnn_bbox_perf_offsets = tf.reshape(
            tf.scatter_update(rcnn_bbox_perf_offsets,
                              tf.range(initial_val, initial_val + 4),
                              tf.reshape(target_bbox_offsets, [-1])), [1, -1])

        prediction_dict_random['classification_prediction']['rcnn'][
            'cls_score'] = rcnn_cls_score
        prediction_dict_random['classification_prediction']['rcnn'][
            'bbox_offsets'] = rcnn_bbox_offsets

        prediction_dict_perf['classification_prediction']['rcnn'][
            'cls_score'] = rcnn_cls_perf_score
        prediction_dict_perf['classification_prediction']['rcnn'][
            'bbox_offsets'] = rcnn_bbox_perf_offsets

        loss_perfect = self._get_losses(config, prediction_dict_perf,
                                        image_size)
        loss_random = self._get_losses(config, prediction_dict_random,
                                       image_size)

        loss_random_compare = {
            'rcnn_cls_loss': 5,
            'rcnn_reg_loss': 3,
            'rpn_cls_loss': 5,
            'rpn_reg_loss': 3,
            'no_reg_loss': 16,
            'regularization_loss': 0,
            'total_loss': 22,
        }
        for loss in loss_random:
            self.assertGreaterEqual(loss_random[loss],
                                    loss_random_compare[loss], loss)
            self.assertEqual(loss_perfect[loss], 0, loss)
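As a side note, a tiny NumPy sketch (illustration only, with hypothetical values) of the label-flipping trick used above to build deliberately wrong RPN class scores:

import numpy as np

rpn_cls_target = np.array([-1., 0., 1.])                  # ignore / background / object
flipped = np.mod(rpn_cls_target + 1, 2).astype(np.int32)   # -1 -> 0, 0 -> 1, 1 -> 0
opposite_scores = np.eye(2)[flipped] * 10                   # high score on the wrong class
print(opposite_scores)
# [[10.  0.]
#  [ 0. 10.]
#  [10.  0.]]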
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images. If transpose_input is enabled, it
        is transposed to device layout and reshaped to 1D tensor.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features["feature"]

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if params["data_format"] == "channels_first":
        assert not params["transpose_input"]  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if params["transpose_input"] and mode != tf.estimator.ModeKeys.PREDICT:
        image_size = tf.sqrt(tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
        features = tf.reshape(features, [image_size, image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    # DropBlock keep_prob for the 4 block groups of ResNet architecture.
    # None means applying no DropBlock at the corresponding block group.
    dropblock_keep_probs = [None] * 4
    if params["dropblock_groups"]:
        # Scheduled keep_prob for DropBlock.
        train_steps = tf.cast(params["train_steps"], tf.float32)
        current_step = tf.cast(tf.train.get_global_step(), tf.float32)
        current_ratio = current_step / train_steps
        dropblock_keep_prob = 1 - current_ratio * (
            1 - params["dropblock_keep_prob"])

        # Computes DropBlock keep_prob for different block groups of ResNet.
        dropblock_groups = [
            int(x) for x in params["dropblock_groups"].split(",")
        ]
        for block_group in dropblock_groups:
            if block_group < 1 or block_group > 4:
                raise ValueError(
                    "dropblock_groups should be a comma separated list of integers "
                    "between 1 and 4 (dropblcok_groups: {}).".format(
                        params["dropblock_groups"]))
            dropblock_keep_probs[block_group - 1] = 1 - (
                (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        network = resnet_model.resnet_v1(
            resnet_depth=params["resnet_depth"],
            num_classes=params["num_label_classes"],
            dropblock_size=params["dropblock_size"],
            dropblock_keep_probs=dropblock_keep_probs,
            data_format=params["data_format"],
        )
        return network(inputs=features,
                       is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    if params["precision"] == "bfloat16":
        with tf.tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif params["precision"] == "float32":
        logits = build_network()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "classes": tf.argmax(logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                "classify": tf.estimator.export.PredictOutput(predictions)
            },
        )

    # If necessary, in the model_fn, use params['batch_size'] instead of the
    # batch size flags (--train_batch_size or --eval_batch_size).
    batch_size = params["batch_size"]  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params["num_label_classes"])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params["label_smoothing"],
    )

    # Add weight decay to the loss for non-batch-normalization variables.
    if params["enable_lars"]:
        loss = cross_entropy
    else:
        loss = cross_entropy + params["weight_decay"] * tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if "batch_normalization" not in v.name
        ])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from global_step.
        global_step = tf.train.get_global_step()
        steps_per_epoch = params["num_train_images"] / params[
            "train_batch_size"]
        current_epoch = tf.cast(global_step, tf.float32) / steps_per_epoch
        # LARS is a large batch optimizer. LARS enables higher accuracy at batch 16K
        # and larger batch sizes.
        if params["enable_lars"]:
            learning_rate = 0.0
            optimizer = lars_util.init_lars_optimizer(current_epoch, params)
        else:
            learning_rate = learning_rate_schedule(params, current_epoch)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=params["momentum"],
                use_nesterov=True,
            )
        if params["use_tpu"]:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not params["skip_host_call"]:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the
                model to the `metric_fn`, provide as part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor` with shape `[batch]` for the global_step.
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times
                # after one TPU loop is finished. Setting max_queue to the same
                # value as the number of iterations makes the summary writer
                # flush the data to storage only once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params["iterations_per_loop"]).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar("loss", loss[0], step=gs)
                        tf2.summary.scalar("learning_rate", lr[0], step=gs)
                        tf2.summary.scalar("current_epoch", ce[0], step=gs)

                    return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the model
            to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the second
            element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                "top_1_accuracy": top_1_accuracy,
                "top_5_accuracy": top_5_accuracy
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
    )
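A plain-Python sketch (hypothetical step counts and keep_prob) of the DropBlock schedule computed near the top of resnet_model_fn above: keep_prob decays linearly toward params["dropblock_keep_prob"], and earlier block groups are attenuated by a factor of 4 per group.

def scheduled_dropblock_keep_probs(current_step, train_steps, final_keep_prob, groups):
    """Mirrors the schedule above in plain Python; groups are 1-based (1..4)."""
    current_ratio = current_step / train_steps
    dropblock_keep_prob = 1 - current_ratio * (1 - final_keep_prob)
    probs = [None] * 4
    for block_group in groups:
        probs[block_group - 1] = 1 - (
            (1 - dropblock_keep_prob) / 4.0 ** (4 - block_group))
    return probs

print(scheduled_dropblock_keep_probs(5000, 10000, 0.9, [3, 4]))
# -> [None, None, 0.9875, 0.95]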
Exemple #10
0
def train(model_path, learning_rate, epoch, noisy=False):
    total_epoch = epoch
    teacher = nin()
    student = lenet()
    if noisy == True:
        drop_scale = 1 / Nratio
        noisy_mask = tf.nn.dropout(tf.constant(
            np.float32(np.ones((batch_size, 1))) / drop_scale),
                                   keep_prob=Nratio)  #(batchsize,1)
        gaussian = tf.random_normal(shape=[batch_size, 1],
                                    mean=0.0,
                                    stddev=Nsigma)
        noisy = tf.multiply(noisy_mask, gaussian)
        #noisy_add = tf.add(tf.constant(np.float32(np.ones((batch_size,1)))), noisy)
        teacher = tf.multiply(teacher,
                              tf.tile(noisy, tf.constant([1, 10])))  # (batchsize, 10)
        #teacher = tf.add(teacher, tf.tile(noisy,tf.constant([1,10])))
        print(bcolors.G + "prepare for training, noisy mode" + bcolors.END)
        tf_loss = tf.nn.l2_loss(teacher - student) / batch_size
    elif KD == True:  # correct Hinton method at 2017.1.3
        print(bcolors.G + "prepare for training, knowledge distilling mode" +
              bcolors.END)
        one_hot = tf.one_hot(y, n_classes, 1.0, 0.0)
        #one_hot = tf.cast(one_hot_int, tf.float32)
        teacher_tau = tf.scalar_mul(1.0 / tau, teacher)
        student_tau = tf.scalar_mul(1.0 / tau, student)
        objective1 = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=one_hot, logits=student_tau)
        objective2 = tf.scalar_mul(0.5, tf.square(student_tau - teacher_tau))
        tf_loss = (lamda * tf.reduce_sum(objective1) +
                   (1 - lamda) * tf.reduce_sum(objective2)) / batch_size
    else:
        print(bcolors.G + "prepare for training, NIPS2014 mode" + bcolors.END)
        tf_loss = tf.nn.l2_loss(teacher - student) / batch_size

    optimizer1 = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(tf_loss)
    optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate /
                                        10).minimize(tf_loss)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=gpu_options, allow_soft_placement=True))
    tf.global_variables_initializer().run()
    with tf.device('/cpu:0'):
        saver = tf.train.Saver(max_to_keep=100)
        #saver.restore(sess, os.path.join(model_path,'model-99')
    data, label = read_cifar10('train')
    index = np.array(range(len(data)))  # index randomly ordered
    mean = cal_mean()
    begin = time.time()
    iterations = len(data) // batch_size
    decay_step = int(total_epoch * 0.8)
    cnt = 0
    dropout_rate = dropout
    print(bcolors.G + "number of iterations (per epoch) =" +
          str(len(data) / batch_size) + bcolors.END)
    for i in range(total_epoch):
        np.random.shuffle(index)
        cost_sum = 0
        for j in range(iterations):
            batch_x = np.float32(
                data[index[j * batch_size:(j + 1) * batch_size]]) - mean
            batch_y = np.squeeze(
                np.float32(label[index[j * batch_size:(j + 1) * batch_size]]))
            if cnt // decay_step == 0:
                lr = learning_rate
                _, cost = sess.run([optimizer1, tf_loss],
                                   feed_dict={
                                       x: batch_x,
                                       y: batch_y,
                                       keep_prob: 1 - dropout_rate
                                   })
            elif cnt // decay_step == 1:
                lr = learning_rate / 10
                _, cost = sess.run([optimizer2, tf_loss],
                                   feed_dict={
                                       x: batch_x,
                                       y: batch_y,
                                       keep_prob: 1 - dropout_rate
                                   })
            cost_sum += cost
            #pdb.set_trace()
            #if (j % int(iterations*0.25) == 0):
            #    print(("epoch %d-iter %d, cost = %f , avg-cost = %f"%(i, j, cost, cost/n_classes))
            #    sys.stdout.flush()
        cnt += 1
        avg_time = time.time() - begin
        print(
            "epoch %d - avg. %f seconds in each epoch, lr = %.0e, cost = %f , avg-cost-per-logits = %f"
            % (i, avg_time / cnt, lr, cost_sum,
               cost_sum / iterations / n_classes))
        if np.mod(i + 1, 10) == 0:
            print("Epoch ", i + 1, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=i)
        sys.stdout.flush()
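A minimal NumPy sketch of the knowledge-distilling objective used in the KD branch above (tau, lamda, and the batch are hypothetical; the sigmoid cross-entropy uses the numerically stable closed form):

import numpy as np

def distillation_loss(student_logits, teacher_logits, one_hot, tau=2.0, lamda=0.5):
    s_tau = student_logits / tau
    t_tau = teacher_logits / tau
    # Stable sigmoid cross-entropy of the tempered student logits vs. hard labels.
    objective1 = np.maximum(s_tau, 0) - s_tau * one_hot + np.log1p(np.exp(-np.abs(s_tau)))
    # Squared distance between tempered student and teacher logits.
    objective2 = 0.5 * np.square(s_tau - t_tau)
    batch_size = one_hot.shape[0]
    return (lamda * objective1.sum() + (1 - lamda) * objective2.sum()) / batch_size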
Exemple #11
0
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    if bert_teacher_config is None:
      model = modeling.BertModel(
          config=bert_config,
          is_training=is_training,
          input_ids=input_ids,
          input_mask=input_mask,
          token_type_ids=segment_ids,
          use_one_hot_embeddings=use_one_hot_embeddings,
          use_einsum=use_einsum)

      label_ids = tf.reshape(masked_lm_ids, [-1])
      true_labels = tf.one_hot(
          label_ids, depth=bert_config.vocab_size,
          dtype=model.get_sequence_output().dtype)
      one_hot_labels = true_labels
    else:
      model = modeling.BertModel(
          config=bert_config,
          is_training=False,
          input_ids=input_ids,
          input_mask=input_mask,
          token_type_ids=segment_ids,
          use_one_hot_embeddings=use_one_hot_embeddings,
          use_einsum=use_einsum)

      with tf.variable_scope("teacher"):
        teacher_model = modeling.BertModel(
            config=bert_teacher_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            use_einsum=use_einsum)

        label_ids = tf.reshape(masked_lm_ids, [-1])

        true_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size,
            dtype=model.get_sequence_output().dtype)

        teacher_logits = get_logits(
            bert_teacher_config,
            distill_temperature * teacher_model.get_sequence_output(),
            teacher_model.get_embedding_table(),
            masked_lm_positions)

        teacher_labels = tf.nn.softmax(teacher_logits, axis=-1)

        if distill_ground_truth_ratio == 1.0:
          one_hot_labels = true_labels
        else:
          one_hot_labels = (
              teacher_labels * (1 - distill_ground_truth_ratio)
              + true_labels * distill_ground_truth_ratio)

        teacher_attentions = teacher_model.get_all_attention_maps()
        student_attentions = model.get_all_attention_maps()

        teacher_hiddens = teacher_model.get_all_encoder_layers()
        student_hiddens = model.get_all_encoder_layers()

    (masked_lm_loss, _, masked_lm_example_loss,
     masked_lm_log_probs, _) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, tf.stop_gradient(one_hot_labels),
         true_labels, masked_lm_weights)

    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    extra_loss1 = 0.0
    extra_loss2 = 0.0
    extra_loss3 = 0.0
    extra_loss4 = 0.0

    scalars_to_summarize = {}

    def get_layerwise_gate(layer_id):
      steps_per_phase = num_train_steps // bert_config.num_hidden_layers
      layer_wise_gate = distill_util.layer_wise_learning_rate(
          layer_id=layer_id, steps_per_phase=steps_per_phase, binary=True)
      return layer_wise_gate

    if layer_wise_warmup and hidden_distill_factor != 0.0:
      layer_id = 0
      for teacher_hidden, student_hidden in (
          zip(teacher_hiddens[1:], student_hiddens[1:])):
        with tf.variable_scope("hidden_distill_%d" % layer_id):
          mse_loss = tf.losses.mean_squared_error(
              tf.stop_gradient(
                  contrib_layers.layer_norm(
                      inputs=teacher_hidden,
                      begin_norm_axis=-1,
                      begin_params_axis=-1,
                      trainable=False)),
              contrib_layers.layer_norm(
                  inputs=student_hidden,
                  begin_norm_axis=-1,
                  begin_params_axis=-1,
                  trainable=False))
          layer_wise_gate = get_layerwise_gate(layer_id)
          extra_loss1 += layer_wise_gate * mse_loss
        layer_id += 1
      extra_loss1 = extra_loss1 * hidden_distill_factor / layer_id

    if layer_wise_warmup and (
        beta_distill_factor != 0 and gamma_distill_factor != 0.0):
      layer_id = 0
      for teacher_hidden, student_hidden in (
          zip(teacher_hiddens[1:], student_hiddens[1:])):
        with tf.variable_scope("hidden_distill_%d" % layer_id):
          teacher_mean = tf.reduce_mean(
              teacher_hidden, axis=[-1], keepdims=True)
          student_mean = tf.reduce_mean(
              student_hidden, axis=[-1], keepdims=True)
          teacher_variance = tf.reduce_mean(
              tf.squared_difference(teacher_hidden, teacher_mean),
              axis=[-1], keepdims=True)
          student_variance = tf.reduce_mean(
              tf.squared_difference(student_hidden, student_mean),
              axis=[-1], keepdims=True)
          beta_distill_loss = tf.reduce_mean(
              tf.squared_difference(
                  tf.stop_gradient(teacher_mean), student_mean))
          gamma_distill_loss = tf.reduce_mean(
              tf.abs(tf.stop_gradient(teacher_variance) - student_variance))
          layer_wise_gate = get_layerwise_gate(layer_id)
          extra_loss3 += layer_wise_gate * beta_distill_loss
          extra_loss4 += layer_wise_gate * gamma_distill_loss
        layer_id += 1
      extra_loss3 = extra_loss3 * beta_distill_factor / layer_id
      extra_loss4 = extra_loss4 * gamma_distill_factor / layer_id

    if layer_wise_warmup and attention_distill_factor != 0.0:
      layer_id = 0
      for teacher_attention, student_attention in (
          zip(teacher_attentions, student_attentions)):
        with tf.variable_scope("attention_distill_%d" % layer_id):
          teacher_attention_prob = tf.nn.softmax(
              teacher_attention, axis=-1)
          student_attention_log_prob = tf.nn.log_softmax(
              student_attention, axis=-1)
          kl_divergence = - (
              tf.stop_gradient(teacher_attention_prob)
              * student_attention_log_prob)
          kl_divergence = tf.reduce_mean(tf.reduce_sum(kl_divergence, axis=-1))
          layer_wise_gate = get_layerwise_gate(layer_id)
          extra_loss2 += layer_wise_gate * kl_divergence
        layer_id += 1
      extra_loss2 = extra_loss2 * attention_distill_factor / layer_id

    if layer_wise_warmup:
      total_loss = extra_loss1 + extra_loss2 + extra_loss3 + extra_loss4
    else:
      total_loss = masked_lm_loss + next_sentence_loss

    if summary_dir is not None:
      if layer_wise_warmup:
        scalars_to_summarize["feature_map_transfer_loss"] = extra_loss1
        scalars_to_summarize["attention_transfer_loss"] = extra_loss2
        scalars_to_summarize["mean_transfer_loss"] = extra_loss3
        scalars_to_summarize["variance_transfer_loss"] = extra_loss4
      else:
        scalars_to_summarize["masked_lm_loss"] = masked_lm_loss
        scalars_to_summarize["next_sentence_loss"] = next_sentence_loss

        masked_lm_predictions = tf.argmax(
            masked_lm_log_probs, axis=-1, output_type=tf.int32)
        masked_lm_accuracy = tf.cast(tf.math.equal(
            tf.reshape(masked_lm_ids, [-1]),
            tf.reshape(masked_lm_predictions, [-1])), tf.float32)
        numerator = tf.reduce_sum(
            tf.reshape(masked_lm_weights, [-1]) * masked_lm_accuracy)
        denominator = tf.reduce_sum(masked_lm_weights) + 1e-5
        masked_lm_accuracy = numerator / denominator
        scalars_to_summarize["masked_lm_accuracy"] = masked_lm_accuracy

        next_sentence_predictions = tf.argmax(
            next_sentence_log_probs, axis=-1, output_type=tf.int32)
        next_sentence_accuracy = tf.reduce_mean(
            tf.cast(tf.math.equal(
                tf.reshape(next_sentence_labels, [-1]),
                tf.reshape(next_sentence_predictions, [-1])), tf.float32))
        scalars_to_summarize["next_sentence_accuracy"] = next_sentence_accuracy

      scalars_to_summarize["global_step"] = tf.train.get_or_create_global_step()
      scalars_to_summarize["loss"] = total_loss

    host_call = None
    if summary_dir is not None:
      if use_tpu:
        for name in scalars_to_summarize:
          scalars_to_summarize[name] = tf.reshape(
              scalars_to_summarize[name], [1])

        def host_call_fn(*args):
          """Host call function to compute training summaries."""
          scalars = _list_to_dicts(args, scalars_to_summarize.keys())[0]
          for name in scalars:
            scalars[name] = scalars[name][0]

          with contrib_summary.create_file_writer(
              summary_dir, max_queue=1000).as_default():
            with contrib_summary.always_record_summaries():
              for name, value in scalars.items():
                if name not in ["global_step"]:
                  contrib_summary.scalar(
                      name, value, step=scalars["global_step"])

          return contrib_summary.all_summary_ops()

        host_call = (host_call_fn, _dicts_to_list([scalars_to_summarize],
                                                  scalars_to_summarize.keys()))
      else:
        for name in scalars_to_summarize:
          tf.summary.scalar(name, scalars_to_summarize[name])

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    teacher_initialized_variable_names = {}
    scaffold_fn = None

    if init_checkpoint:
      if not init_from_teacher:
        # Initializes from the checkpoint for all variables.
        (assignment_map, initialized_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:

          def tpu_scaffold():
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            return tf.train.Scaffold()

          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      elif bert_teacher_config is not None:
        # Initializes from the pre-trained checkpoint only for teacher model
        # and embeddings for distillation.
        (assignment_map, initialized_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint, init_embedding=True)
        (teacher_assignment_map, teacher_initialized_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint, init_from_teacher=True)
        if use_tpu:

          def teacher_tpu_scaffold():
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            tf.train.init_from_checkpoint(init_checkpoint,
                                          teacher_assignment_map)
            return tf.train.Scaffold()

          scaffold_fn = teacher_tpu_scaffold
        else:
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          tf.train.init_from_checkpoint(init_checkpoint, teacher_assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    total_size = 0
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      if var.name in teacher_initialized_variable_names:
        init_string = ", *INIT_FROM_TEACHER_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)
      if not var.name.startswith("teacher"):
        total_size += functools.reduce(lambda x, y: x * y,
                                       var.get_shape().as_list())
    tf.logging.info("  total variable parameters: %d", total_size)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      if layer_wise_warmup:
        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps,
            num_warmup_steps, use_tpu, optimizer,
            end_lr_rate=1.0, use_layer_wise_warmup=True,
            total_warmup_phases=bert_config.num_hidden_layers)
      else:
        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps,
            num_warmup_steps, use_tpu, optimizer)

      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn,
          host_call=host_call)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, next_sentence_example_loss,
                    next_sentence_log_probs, next_sentence_labels):
        """Computes the loss and accuracy of the model."""
        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                         [-1, masked_lm_log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(
            masked_lm_log_probs, axis=-1, output_type=tf.int32)
        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
        masked_lm_accuracy = tf.metrics.accuracy(
            labels=masked_lm_ids,
            predictions=masked_lm_predictions,
            weights=masked_lm_weights)
        masked_lm_mean_loss = tf.metrics.mean(
            values=masked_lm_example_loss, weights=masked_lm_weights)

        next_sentence_log_probs = tf.reshape(
            next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
        next_sentence_predictions = tf.argmax(
            next_sentence_log_probs, axis=-1, output_type=tf.int32)
        next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
        next_sentence_accuracy = tf.metrics.accuracy(
            labels=next_sentence_labels, predictions=next_sentence_predictions)
        next_sentence_mean_loss = tf.metrics.mean(
            values=next_sentence_example_loss)

        return {
            "masked_lm_accuracy": masked_lm_accuracy,
            "masked_lm_loss": masked_lm_mean_loss,
            "next_sentence_accuracy": next_sentence_accuracy,
            "next_sentence_loss": next_sentence_mean_loss,
        }

      eval_metrics = (metric_fn, [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights, next_sentence_example_loss,
          next_sentence_log_probs, next_sentence_labels
      ])
      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

    return output_spec
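For reference, a NumPy sketch (illustration only) of the per-layer attention-transfer term used during layer-wise warmup above: -sum(softmax(teacher) * log_softmax(student)) over the last axis, then averaged.

import numpy as np

def attention_transfer_loss(teacher_attention, student_attention):
    t = teacher_attention - teacher_attention.max(axis=-1, keepdims=True)
    teacher_prob = np.exp(t) / np.exp(t).sum(axis=-1, keepdims=True)
    s = student_attention - student_attention.max(axis=-1, keepdims=True)
    student_log_prob = s - np.log(np.exp(s).sum(axis=-1, keepdims=True))
    return np.mean(np.sum(-teacher_prob * student_log_prob, axis=-1))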
def cnn_model_fn(features, labels, mode):
	input_layer = tf.reshape(features["x"], [-1, image_x, image_y, 1], name="input")

	conv1 = tf.layers.conv2d(
	  inputs=input_layer,
	  filters=16,
	  kernel_size=[2, 2],
	  padding="same",
	  activation=tf.nn.relu,
	  name="conv1")
	print("conv1",conv1.shape)
	pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2, name="pool1")
	print("pool1",pool1.shape)

	conv2 = tf.layers.conv2d(
	  inputs=pool1,
	  filters=32,
	  kernel_size=[5, 5],
	  padding="same",
	  activation=tf.nn.relu,
	  name="conv2")
	print("conv2",conv2.shape)
	pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[5, 5], strides=5, name="pool2")
	print("pool2",pool2.shape)

	conv3 = tf.layers.conv2d(
	  inputs=pool2,
	  filters=64,
	  kernel_size=[5, 5],
	  padding="same",
	  activation=tf.nn.relu,
	  name="conv3")
	print("conv3",conv3.shape)

	# Dense Layer
	flat = tf.reshape(conv3, [-1, 5*5*64], name="flat")
	print(flat.shape)
	dense = tf.layers.dense(inputs=flat, units=128, activation=tf.nn.relu, name="dense")
	print(dense.shape)
	dropout = tf.layers.dropout(inputs=dense, rate=0.2, training=mode == tf.estimator.ModeKeys.TRAIN, name="dropout")

	# Logits Layer
	num_of_classes = get_num_of_classes()
	logits = tf.layers.dense(inputs=dropout, units=num_of_classes, name="logits")

	output_class = tf.argmax(input=logits, axis=1, name="output_class")
	output_probab = tf.nn.softmax(logits, name="softmax_tensor")
	predictions = {"classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax(logits, name="softmax_tensor")}
	#tf.Print(tf.nn.softmax(logits, name="softmax_tensor"), [tf.nn.softmax(logits, name="softmax_tensor")])
	if mode == tf.estimator.ModeKeys.PREDICT:
		return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

	# Calculate Loss (for both TRAIN and EVAL modes)
	onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_of_classes)
	loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

	# Configure the Training Op (for TRAIN mode)
	if mode == tf.estimator.ModeKeys.TRAIN:
		optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
		train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
		return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

	# Add evaluation metrics (for EVAL mode)
	eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])}
	return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
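A quick sanity check on the shape bookkeeping above: the flatten to 5*5*64 only works out if the input is 50x50 (hypothetical image_x = image_y = 50), since the conv layers use "same" padding and only pool1 (stride 2) and pool2 (stride 5) shrink the spatial size.

def flat_size(image_x=50, image_y=50):
    # "same"-padded convs keep the spatial size; only the two pools change it
    return (image_x // 2 // 5) * (image_y // 2 // 5) * 64

assert flat_size() == 5 * 5 * 64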
def _greedy_decode(input_embeddings,
                   output_vocab_size,
                   target_end_id,
                   target_start_id,
                   output_vocab_embeddings_table,
                   source_len,
                   model_config,
                   mode,
                   input_copy_mask=None,
                   clean_output_mask=None):
    """Fast decoding."""
    encoder_output = common_layers.linear_transform(
        input_embeddings,
        output_size=model_config.model_parameters.encoder_dims,
        scope="bert_to_transformer")

    decode_length = model_config.data_options.max_decode_length

    # Expand the inputs into the beam width.
    def symbols_to_logits_fn(logit_indices, current_index):
        """Go from targets to logits."""
        logit_indices = tf.expand_dims(logit_indices, 0)
        decode_steps = decode_utils.get_decode_steps(logit_indices,
                                                     output_vocab_size,
                                                     model_config)
        target_embeddings = _get_target_embeddings(
            input_embeddings, output_vocab_embeddings_table, decode_steps,
            model_config)
        decoder_output = _build_transformer_decoder(
            encoder_output,
            source_len,
            target_embeddings,
            mode,
            model_config,
            single_step_index=current_index)

        logits = _get_action_logits(encoder_output,
                                    decoder_output,
                                    output_vocab_embeddings_table,
                                    output_vocab_size,
                                    model_config,
                                    input_copy_mask=input_copy_mask,
                                    clean_output_mask=clean_output_mask)

        # Squeeze batch dimension and length dimension, as both should be 1.
        logits = tf.squeeze(logits, axis=[0, 1])
        # Shape of logits should now be (output_vocab_size).
        return logits

    def loop_cond(i, decoded_ids, unused_logprobs):
        """Loop conditional that returns false to stop loop."""
        return tf.logical_and(
            tf.reduce_all(tf.not_equal(decoded_ids, target_end_id)),
            tf.less(i, decode_length))

    def inner_loop(i, decoded_ids, logprobs):
        """Decoder function invoked on each while loop iteration."""
        logits = symbols_to_logits_fn(decoded_ids, i)
        next_id = tf.argmax(logits, axis=0)
        softmax = tf.nn.softmax(logits)
        extended_vocab_size = tf.shape(softmax)[-1]
        mask = tf.one_hot(next_id, extended_vocab_size)
        prob = tf.reduce_sum(softmax * mask)
        logprob = tf.log(prob)

        # Add one-hot values to output Tensors, since values at index > i+1 should
        # still be zero.
        logprobs += tf.one_hot(i + 1,
                               decode_length + 1,
                               on_value=logprob,
                               dtype=tf.float32)
        decoded_ids += tf.one_hot(i + 1,
                                  decode_length + 1,
                                  on_value=next_id,
                                  dtype=tf.int64)

        return i + 1, decoded_ids, logprobs

    initial_ids = tf.zeros(dtype=tf.int64, shape=[decode_length + 1])
    initial_ids += tf.one_hot(0,
                              decode_length + 1,
                              on_value=tf.cast(target_start_id, tf.int64))
    initial_logprob = tf.zeros(dtype=tf.float32, shape=[decode_length + 1])
    initial_i = tf.constant(0)

    initial_values = [initial_i, initial_ids, initial_logprob]

    _, decoded_ids, logprobs = tf.while_loop(loop_cond, inner_loop,
                                             initial_values)

    # Remove <START> symbol.
    decoded_ids = decoded_ids[1:]
    logprobs = logprobs[1:]
    # Sum logprobs to get scores for overall sequence.
    logprobs = tf.reduce_sum(logprobs, axis=0)

    # Expand decoded_ids and logprobs to reflect beam width dimension of 1.
    decoded_ids = tf.expand_dims(decoded_ids, 0)
    logprobs = tf.expand_dims(logprobs, 0)

    # This is the output dict that the function returns.
    output_decode_steps = decode_utils.get_decode_steps(
        decoded_ids, output_vocab_size, model_config)
    predictions = decode_utils.get_predictions(output_decode_steps)
    predictions[constants.SCORES_KEY] = logprobs

    return predictions
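A NumPy sketch (hypothetical values) of the one-hot update trick used in inner_loop above: adding a one-hot vector whose on_value is the new id writes position i+1 while leaving every other position untouched, which avoids in-place assignment inside tf.while_loop.

import numpy as np

decode_length = 4
decoded_ids = np.zeros(decode_length + 1, dtype=np.int64)
i, next_id = 0, 42
decoded_ids = decoded_ids + np.eye(decode_length + 1, dtype=np.int64)[i + 1] * next_id
print(decoded_ids)  # [ 0 42  0  0  0]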
  def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
    """Masked language modeling softmax layer."""
    masked_lm_weights = inputs.masked_lm_weights
    with tf.variable_scope("generator_predictions"):
      if self._config.uniform_generator or self._config.identity_generator or self._config.heuristic_generator:
        logits = tf.zeros(self._bert_config.vocab_size)
        logits_tiled = tf.zeros(
            modeling.get_shape_list(inputs.masked_lm_ids) +
            [self._bert_config.vocab_size])
        logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
        logits = logits_tiled
      else:
        relevant_hidden = pretrain_helpers.gather_positions(
            model.get_sequence_output(), inputs.masked_lm_positions)
        hidden = tf.layers.dense(
            relevant_hidden,
            units=modeling.get_shape_list(model.get_embedding_table())[-1],
            activation=modeling.get_activation(self._bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                self._bert_config.initializer_range))
        hidden = modeling.layer_norm(hidden)
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self._bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(hidden, model.get_embedding_table(),
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

      oh_labels = tf.one_hot(
          inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
          dtype=tf.float32)

      probs = tf.nn.softmax(logits)

      if self._config.identity_generator:
          identity_logits = tf.zeros(self._bert_config.vocab_size)
          identity_logits_tiled = tf.zeros(
              modeling.get_shape_list(inputs.masked_lm_ids) +
              [self._bert_config.vocab_size])
          masked_identity_weights = tf.one_hot(inputs.masked_lm_ids, depth=self._bert_config.vocab_size, dtype=tf.float32)
          identity_logits_tiled += 25.0 * masked_identity_weights
          identity_logits_tiled += tf.reshape(identity_logits, [1, 1, self._bert_config.vocab_size])
          identity_logits = identity_logits_tiled
          identity_probs = tf.nn.softmax(identity_logits)

          identity_weight = (self.global_step / tf.cast(self._config.num_train_steps, tf.float32)) * self._config.max_identity_weight
          probs = probs * (1 - identity_weight) + identity_probs * identity_weight
          logits = tf.math.log(probs)  # softmax(log(probs)) = probs
      elif self._config.heuristic_generator:
          synonym_logits = tf.zeros(self._bert_config.vocab_size)
          synonym_logits_tiled = tf.zeros(
              modeling.get_shape_list(inputs.masked_lm_ids) +
              [self._bert_config.vocab_size])
          masked_synonym_weights = tf.reduce_sum(
              tf.one_hot(inputs.masked_synonym_ids, depth=self._bert_config.vocab_size, dtype=tf.float32), -2)
          padded_synonym_mask = tf.concat([tf.zeros([1]), tf.ones([self._bert_config.vocab_size - 1])], 0)
          masked_synonym_weights *= tf.expand_dims(tf.expand_dims(padded_synonym_mask, 0), 0)
          synonym_logits_tiled += 25.0 * masked_synonym_weights
          synonym_logits_tiled += tf.reshape(synonym_logits, [1, 1, self._bert_config.vocab_size])
          synonym_logits = synonym_logits_tiled
          synonym_probs = tf.nn.softmax(synonym_logits)

          if self._config.synonym_scheduler_type == 'linear':
              synonym_weight = (self.global_step / tf.cast(self._config.num_train_steps, tf.float32)) * self._config.max_synonym_weight
              probs = probs * (1 - synonym_weight) + synonym_probs * synonym_weight
              logits = tf.math.log(probs)  # softmax(log(probs)) = probs

      log_probs = tf.nn.log_softmax(logits)
      label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)

      numerator = tf.reduce_sum(inputs.masked_lm_weights * label_log_probs)
      denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
      loss = numerator / denominator
      preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

      MLMOutput = collections.namedtuple(
          "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"])
      return MLMOutput(
          logits=logits, probs=probs, per_example_loss=label_log_probs,
          loss=loss, preds=preds)
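A NumPy sketch of the probability mixing used by the identity/heuristic generators above (hypothetical logits and weight): because both distributions are normalized, their mixture is normalized too, so taking log(probs) as the new logits preserves it under a later softmax.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

model_probs = softmax(np.array([1.0, 2.0, 0.5]))
identity_probs = softmax(np.array([25.0, 0.0, 0.0]))  # sharp spike on the original token
identity_weight = 0.3                                  # grows linearly with the training step
mixed = model_probs * (1 - identity_weight) + identity_probs * identity_weight
np.testing.assert_allclose(softmax(np.log(mixed)), mixed)  # softmax(log(probs)) == probs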
Exemple #15
0
    def build_model(self, cate_list):
        """モデルの構築"""

        # Variable definitions
        # Matrix that stores the item embeddings [|I|, di]
        # (2-D lookup table)
        item_emb_w = tf.get_variable(
            "item_emb_w",
            [self.config['item_count'], self.config['itemid_embedding_size']])
        # Bias vector for the similarity scores [|I|]
        item_b = tf.get_variable("item_b", [
            self.config['item_count'],
        ],
                                 initializer=tf.constant_initializer(0.0))
        # Matrix that stores the category embeddings [|A|, da]
        # (2-D lookup table)
        cate_emb_w = tf.get_variable(
            "cate_emb_w",
            [self.config['cate_count'], self.config['cateid_embedding_size']])
        # Map (list) from each item ID to its category ID [|I|]
        cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int32)

        # Concatenate the item, category, and time embeddings and project them with a Dense layer
        # Paper: p.3 left, u_ij = h_emb
        # Embedding of the item to be predicted [B, di+da]
        i_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.i),
            tf.nn.embedding_lookup(cate_emb_w, tf.gather(cate_list, self.i)),
        ], 1)
        # Bias weight of the item to be predicted [B]
        i_b = tf.gather(item_b, self.i)

        # Embeddings of each history item in the input [B, T, di+da]
        # embedding_lookup fetches the matching embeddings from the lookup tables
        h_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.hist_i),
            tf.nn.embedding_lookup(cate_emb_w, tf.gather(
                cate_list, self.hist_i)),
            self.im,
            self.r,
        ], 2)

        if self.config['concat_time_emb'] == True:
            # Concatenate the time embedding [B, T, di+da+dt]
            t_emb = tf.one_hot(self.hist_t, 12, dtype=tf.float32)
            h_emb = tf.concat([h_emb, t_emb], -1)
            h_emb = tf.layers.dense(h_emb, self.config['hidden_units'])
        else:
            # Use the time embedding as a positional encoding (PE) [B, T, di+da]
            t_emb = tf.layers.dense(tf.expand_dims(self.hist_t, -1),
                                    self.config['hidden_units'],
                                    activation=tf.nn.tanh)
            h_emb += t_emb

        # Number of stacked attention blocks
        num_blocks = self.config['num_blocks']
        num_heads = self.config['num_heads']
        dropout_rate = self.config['dropout']
        # Size after projecting Q/K/V with Dense: C = di+da+dt or di+da
        num_units = h_emb.get_shape().as_list()[-1]

        # Transformer
        # Paper: p.4 left, Eq. (3)
        # u_emb [B, C]
        u_emb, self.att, self.stt = attention_net(
            # uij
            h_emb,
            # Length of the user's history
            self.sl,
            # Input to the decoder
            i_emb,
            num_units,
            num_heads,
            num_blocks,
            dropout_rate,
            self.is_training,
            False)

        # Prediction
        # Paper: p.4 right, equations (7) & (8): f(h_t, et_u), reduce_sum([B, C]) -> [B]
        self.logits = i_b + tf.reduce_sum(tf.multiply(u_emb, i_emb), 1)

        # ============== Eval ===============
        self.eval_logits = self.logits

        # Step variable
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.global_epoch_step = \
            tf.Variable(0, trainable=False, name='global_epoch_step')
        self.global_epoch_step_op = \
            tf.assign(self.global_epoch_step, self.global_epoch_step+1)

        # Loss
        # L2 regularization
        l2_norm = tf.add_n([
            tf.nn.l2_loss(u_emb),
            tf.nn.l2_loss(i_emb),
        ])

        # Loss definition: sigmoid cross-entropy
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self.logits,
                labels=self.y)) + self.config['regulation_rate'] * l2_norm

        self.train_summary = tf.summary.merge([
            tf.summary.histogram('embedding/1_item_emb', item_emb_w),
            tf.summary.histogram('embedding/2_cate_emb', cate_emb_w),
            tf.summary.histogram('embedding/3_time_raw', self.hist_t),
            tf.summary.histogram('embedding/3_time_dense', t_emb),
            tf.summary.histogram('embedding/4_final', h_emb),
            tf.summary.histogram('attention_output', u_emb),
            tf.summary.scalar('L2_norm Loss', l2_norm),
            tf.summary.scalar('Training Loss', self.loss),
        ])
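A toy sketch of the concat_time_emb branch above, with made-up shapes: each history position's time bucket in 0..11 becomes a 12-dim one-hot vector, is concatenated to a stand-in history embedding, and the result is projected to hidden_units with a dense layer.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

B, T, D, hidden_units = 2, 4, 8, 16                  # toy batch, history length, emb size
h_emb = tf.random_normal([B, T, D])                  # stand-in for the item+category history embeddings
hist_t = tf.constant([[0, 3, 7, 11], [1, 1, 2, 5]])  # time buckets in 0..11

t_emb = tf.one_hot(hist_t, 12, dtype=tf.float32)     # [B, T, 12]
h_emb = tf.concat([h_emb, t_emb], -1)                # [B, T, D + 12]
h_emb = tf.layers.dense(h_emb, hidden_units)         # [B, T, hidden_units]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(tf.shape(h_emb)))                 # [ 2  4 16]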
Exemple #16
0
def dot_product_mpnn_attention(q,
                               k,
                               v,
                               adjacency_matrix,
                               num_edge_types,
                               num_transforms=None,
                               use_weighted_sum=False,
                               name=None):
    """Dot product attention with edge vectors.

  Let B be the number of batches.
  Let N be the number of nodes in the graph.
  Let K be the size of the attention keys/queries.
  Let V be the size of the attention values.
  Let T be the total number of transforms (num_transforms).

  Args:
    q: The query Tensor of shape [B, N, K].
    k: The key Tensor of shape [B, T, N, K].
    v: The value Tensor of shape [B, T, N, V].
    adjacency_matrix: A Tensor of shape [B, N, N, T]. An entry at
      indices b, i, j, k is the indicator of the edge
      from node j to node i in batch b. A standard adjacency matrix will only
      have one edge type while a multigraph will have multiple edge types.
    num_edge_types: An integer specifying number of edge types.
    num_transforms: An integer indicating number of transforms (T). If None,
      then num_transforms will be equal to num_edge_types.
    use_weighted_sum: If False, will only use a single transform per edge type.
      Otherwise, use a learned weighted sum of transforms per edge type.
    name: A string.

  Returns:
    A Tensor of shape [B, N, V] storing the result of computing attention
    weights using the queries and keys and combining the values according to
    those weights.

  Raises:
    ValueError: if num_transforms doesn't equal num_edge_types and not using
      weighted sum.
  """
    with tf.variable_scope(name,
                           default_name="dot_product_mpnn_attention",
                           values=[q, k, v, adjacency_matrix, num_edge_types]):
        # If not explicitly set, use num_transforms set to num_edge_types.
        num_transforms = (num_edge_types
                          if num_transforms is None else num_transforms)

        if not use_weighted_sum and num_transforms != num_edge_types:
            raise ValueError("num_transforms must equal num_edge_types unless "
                             "use_weighted_sum is True")

        # Computes the raw dot-product attention values between each query and
        # the corresponding keys it needs to consider.
        #
        # This operation takes the dot product of (the query for
        # each node) and (the key for each node for each possible edge type),
        # creating an N x N matrix for each edge type. The entry at index (i, j)
        # is the dot-product for the edge from node i to node j of the appropriate
        # type. These dot products will eventually become attention weights
        # specifying how much node i weights an edge of that type coming from node
        # j.
        all_edge_logits = tf.matmul(tf.tile(tf.expand_dims(q, axis=1),
                                            [1, num_edge_types, 1, 1]),
                                    k,
                                    transpose_b=True)

        # The adjacency matrix assumes there is only one directed edge (i <- j) for
        # each pair of nodes. If such an edge exists, it contains the integer
        # type of that edge at position (i, j) of the adjacency matrix.
        #
        # Construct edge_vectors of shape [B, N, N, T].
        if use_weighted_sum:
            # Use dense representation for edge vectors.
            edge_vectors = make_edge_vectors(adjacency_matrix, num_edge_types,
                                             num_transforms)
        else:
            # Generate one-hot vectors based on edge types.
            # If there is an edge from node j to node i of type t, then index t of the
            # last dimension is 1 for entry (i, j) of the second and third dimensions.
            edge_vectors = tf.one_hot(adjacency_matrix, num_transforms)

        # Rearranging the dimensions to match the shape of all_edge_logits.
        edge_vectors = tf.transpose(edge_vectors, [0, 3, 1, 2])

        # Element-wise multiplies all_edge_logits and edge_vectors.
        #
        # In other words: all_edge_logits contains N x N matrices of query-key
        # products. This element-wise multiplication zeroes out entries that do not
        # correspond to actual edges in the graph of the appropriate edge type.
        # all_edge_logits retains shape [B, T, N, N].
        all_edge_logits *= edge_vectors

        # Since there can only be one edge from node A to node B, we can collapse
        # the T different adjacency matrices containing key-query pairs into one
        # adjacency matrix. logits is [B, N, N].
        # TODO(dbieber): Use a reshape instead of reduce sum to attend over all
        # edges instead of over all neighboring nodes to handle the multigraph case.
        logits = tf.reduce_sum(all_edge_logits, axis=1)

        # For pairs of nodes with no edge between them, add a large negative
        # bias so that those positions receive effectively zero weight after
        # the softmax.
        bias = tf.to_float(
            tf.equal(tf.reduce_sum(adjacency_matrix, axis=-1), 0)) * -1e9
        logits += bias

        # Turn the raw key-query products into a probability distribution (or,
        # in terms of attention, weights). The softmax is computed across the
        # last dimension of logits.
        compatibility = tf.nn.softmax(logits)  # Shape [B, N, N].

        # Computes a summary showing the attention matrix as an image. Does not do
        # any work toward actually performing attention.
        common_attention.attention_image_summary(
            tf.expand_dims(compatibility, axis=1), None)

        # Repeats the attention matrix T times for each batch, producing
        # a tensor with shape [B, T, N, N] where the [N, N] component is T
        # repeats of the values found in compatibility.
        edge_compatibility = tf.tile(tf.expand_dims(compatibility, axis=1),
                                     [1, num_edge_types, 1, 1])

        # Zeroes out the entries in edge_compatibility that do not correspond to
        # actual edges.
        edge_compatibility *= edge_vectors  # Shape [B, T, N, N].

        output = compute_values(edge_compatibility, v)
        return output
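A small NumPy sketch (toy graph, random logits) of the masking steps above: one-hot edge-type vectors zero out query-key products for node pairs that are not connected by an edge of that type, pairs with no edge at all receive a large negative bias, and the row-wise softmax then concentrates each node's attention on its actual incoming edges.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

B, N, T = 1, 3, 2                                       # toy batch, nodes, edge types
rng = np.random.default_rng(0)
all_edge_logits = rng.normal(size=(B, T, N, N))         # query-key products per edge type

# adjacency[b, i, j, t] = 1 iff there is an edge of type t from node j to node i.
adjacency = np.zeros((B, N, N, T))
adjacency[0, 0, 1, 0] = 1                               # edge 1 -> 0, type 0
adjacency[0, 1, 2, 0] = 1                               # edge 2 -> 1, type 0
adjacency[0, 2, 0, 1] = 1                               # edge 0 -> 2, type 1

edge_vectors = np.transpose(adjacency, (0, 3, 1, 2))    # [B, T, N, N]
logits = (all_edge_logits * edge_vectors).sum(axis=1)   # zero out non-edges, drop T
logits += (adjacency.sum(axis=-1) == 0) * -1e9          # bias for pairs with no edge
attention_weights = softmax(logits)                     # each row sums to 1
print(attention_weights.round(3))                       # weight ~1 on each node's single edge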
Exemple #17
0
def batch_loss(model, batch):
    # Mean cross-entropy of a linear softmax classifier over a 10-class batch.
    predicted_y = tf.nn.softmax(tf.matmul(batch.x, model.weights) + model.bias)
    return -tf.reduce_mean(
        tf.reduce_sum(tf.one_hot(batch.y, 10) * tf.log(predicted_y), axis=[1]))
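Taking the log of an explicit softmax as above can underflow for confident predictions; a sketch of an equivalent but numerically safer formulation (the model and batch fields are assumed to match the snippet) computes the same mean cross-entropy directly from the logits.

import tensorflow.compat.v1 as tf

def batch_loss_stable(model, batch):
    # Same mean cross-entropy as above, computed from logits in one fused, stable op.
    logits = tf.matmul(batch.x, model.weights) + model.bias
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(batch.y, 10), logits=logits))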
Exemple #18
0
    def gating_internel(self, inputs, total_token_num):
        logits = tf.einsum('GSM,ME->GSE', inputs, self.gating_weight)  # G'SE
        raw_gates = tf.nn.softmax(logits)  # along E dim, G'SE
        tf.logging.info("raw_gates:{}".format(raw_gates))

        while self.expert_capacity_dim % 4:
            self.expert_capacity_dim += 1
            tf.logging.info(
                'Setting expert_capacity_dim=%r ('
                'num_experts=%r name_scope=%r)', self.expert_capacity_dim,
                self.num_experts,
                tf.get_default_graph().get_name_scope())

        # First top gate idx and gate val
        top_gate_index_1 = tf.math.argmax(raw_gates,
                                          axis=-1,
                                          output_type=tf.int32)  # G'S
        #tf.summary.tensor_summary('top_gate_index_1', top_gate_index_1)
        mask_1 = tf.one_hot(top_gate_index_1, self.num_experts,
                            dtype=tffloat)  # G'SE
        density_1_proxy = raw_gates
        importance = tf.ones_like(mask_1[:, :, 0])
        gate_1 = tf.einsum('GSE,GSE->GS', raw_gates, mask_1)  # G'S

        # Second top gate idx and gate val
        gates_without_top_1 = raw_gates * (1.0 - mask_1)
        top_gate_index_2 = tf.math.argmax(gates_without_top_1,
                                          axis=-1,
                                          output_type=tf.int32)  # G'S
        #tf.summary.tensor_summary('top_gate_index_2', top_gate_index_2)
        mask_2 = tf.one_hot(top_gate_index_2, self.num_experts,
                            dtype=tffloat)  # G'SE
        gate_2 = tf.einsum('GSE,GSE->GS', gates_without_top_1, mask_2)  # G'S

        # We reshape the mask as [X*S, E], and compute cumulative sums of
        # assignment indicators for each expert index e \in 0..E-1 independently.
        # First occurrence of assignment indicator is excluded, see exclusive=True
        # flag below.
        position_in_expert_1 = tf.cumsum(mask_1, exclusive=True, axis=1)

        # GS Tensor
        capacity = tf.cast(self.expert_capacity_dim,
                           dtype=position_in_expert_1.dtype)

        # GE Tensor (reducing S out of GSE tensor mask_1)
        # density_1[:, e] represents assignment ratio (num assigned / total) to
        # expert e as top_1 expert without taking capacity into account.
        density_denom = tf.reduce_mean(importance, axis=(1))[:,
                                                             tf.newaxis] + 1e-6
        density_1 = tf.reduce_mean(mask_1, axis=(1)) / density_denom
        # density_1_proxy[:, e] represents mean of raw_gates for expert e, including
        # those of examples not assigned to e with top_k.
        density_1_proxy = tf.reduce_mean(density_1_proxy,
                                         axis=1) / density_denom

        with tf.name_scope('aux_loss'):
            # The MoE paper (https://arxiv.org/pdf/1701.06538.pdf) uses an aux loss of
            # reduce_mean(density_1_proxy * density_1_proxy). Here we replace one of
            # the density_1_proxy with the discrete density_1 following mesh_tensorflow.
            aux_loss = tf.reduce_mean(density_1_proxy *
                                      density_1)  # element-wise
            aux_loss *= self.num_experts * self.num_experts  # const coefficient

        mask_1 *= tf.cast(tf.less(position_in_expert_1, capacity),
                          dtype=mask_1.dtype)
        position_in_expert_1 = tf.einsum('GSE,GSE->GS', position_in_expert_1,
                                         mask_1)

        # How many examples in this sequence go to this expert
        mask_1_count = tf.einsum('GSE->GE', mask_1)
        # [batch, group] - mostly ones, but zeros where something didn't fit
        mask_1_flat = tf.einsum('GSE->GS', mask_1)

        if self.second_expert_policy == 'all':
            pass
        elif self.second_expert_policy == 'random':
            # gate_2 is between 0 and 1, reminder:
            #
            #   raw_gates = tf.nn.softmax(logits)
            #   index_1 = tf.math.argmax(raw_gates, axis=-1, output_type=tf.int32)
            #   mask_1 = tf.one_hot(index_1, num_experts, dtype=tffloat)
            #   gate_1 = tf.einsum('GSE,GSE->GS', raw_gates, mask_1)
            #
            # E.g. if gate_2 exceeds second_expert_threshold, then we definitely
            # dispatch to second-best expert. Otherwise we dispatch with probability
            # proportional to (gate_2 / threshold).
            #
            sampled_2 = tf.less(
                tf.random.uniform(gate_2.shape, dtype=gate_2.dtype),
                (gate_2 / max(self.second_expert_threshold, 1e-9)))
            gate_2 *= tf.cast(sampled_2, gate_2.dtype)
            mask_2 *= tf.cast(tf.expand_dims(sampled_2, -1), mask_2.dtype)
        else:
            raise ValueError(self.second_expert_policy)

        # Sum token count of first and second top gate.
        position_in_expert_2 = tf.cumsum(
            mask_2, exclusive=True, axis=1) + tf.expand_dims(mask_1_count, 1)

        mask_2 *= tf.cast(tf.less(position_in_expert_2, capacity),
                          mask_2.dtype)
        position_in_expert_2 = tf.einsum('GSE,GSE->GS', position_in_expert_2,
                                         mask_2)
        mask_2_flat = tf.reduce_sum(mask_2, axis=-1)

        gate_1 *= mask_1_flat
        gate_2 *= mask_2_flat

        # Normalize top-k gates.
        denom = gate_1 + gate_2
        # To avoid divide by 0.
        denom = tf.where(denom > 0, denom, tf.ones_like(denom))
        gate_1 /= denom
        gate_2 /= denom

        # First top gate as first part of combine tensor
        b = tf.one_hot(tf.cast(position_in_expert_1, dtype=tf.int32),
                       self.expert_capacity_dim,
                       dtype=tffloat,
                       name='one_hot_b_0')  # G'SE
        a = tf.expand_dims(gate_1 * mask_1_flat, -1) * tf.one_hot(
            top_gate_index_1, self.num_experts, dtype=tffloat)  # G'SE
        first_part_of_combine_tensor = tf.einsum(
            'GSE,GSC->GSEC', a, b,
            name='first_part_of_combine_tensor')  # G'SEC

        # Second top gate as first part of combine tensor
        b = tf.one_hot(tf.cast(position_in_expert_2, dtype=tf.int32),
                       self.expert_capacity_dim,
                       dtype=tffloat,
                       name='one_hot_b_1')  # G'SE
        a = tf.expand_dims(gate_2 * mask_2_flat, -1) * tf.one_hot(
            top_gate_index_2, self.num_experts, dtype=tffloat)  # G'SE
        second_part_of_combine_tensor = tf.einsum(
            'GSE,GSC->GSEC', a, b,
            name='second_part_of_combine_tensor')  # G'SEC

        # Combine tensors of two parts.
        combine_tensor = tf.math.add(first_part_of_combine_tensor,
                                     second_part_of_combine_tensor,
                                     name='combine_tensor')  # G'SEC
        dispatch_mask = tf.cast(tf.cast(combine_tensor, tf.bool),
                                tffloat,
                                name='dispatch_mask')  # G'SEC

        return aux_loss, combine_tensor, dispatch_mask
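A toy NumPy sketch of the capacity bookkeeping used above: an exclusive cumulative sum over the one-hot expert assignments gives every token its position inside its expert's buffer, and tokens whose position reaches the capacity are dropped, which is what the tf.cumsum(..., exclusive=True) / tf.less(position, capacity) pair does.

import numpy as np

num_experts, capacity = 2, 2
# One group of 5 tokens, each routed (top-1) to an expert.
expert_index = np.array([0, 0, 1, 0, 1])
mask = np.eye(num_experts)[expert_index]               # [S, E] one-hot, like tf.one_hot

# Exclusive cumsum along the token axis: how many earlier tokens chose the same expert.
position_in_expert = np.cumsum(mask, axis=0) - mask    # equivalent to exclusive=True
position_in_expert = (position_in_expert * mask).sum(-1)
print(position_in_expert)                              # [0. 1. 0. 2. 1.]

keep = position_in_expert < capacity                   # the 4th token overflows expert 0
print(keep)                                            # [ True  True  True False  True]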
Exemple #19
0
def skew_elements_right(tensor: tf.Tensor,
                        axis: int,
                        pad_value=0,
                        name: Optional[Text] = None) -> tf.Tensor:
  """Skews successive elements right along the given `axis`.

  This changes an input like
  [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
  ]
  into the following:
  [
    [1, 2, 3, 0, 0],
    [0, 4, 5, 6, 0],
    [0, 0, 7, 8, 9]
  ]

  Args:
    tensor: Tensor of shape [..., num_rows, axis_len, ...].
    axis: A valid axis in `tensor` to skew along. It must not be the first axis
      in `tensor`.
    pad_value: The scalar pad value to use. Defaults to 0. Must be the same type
      as `tensor`.
    name: A name for the operation (optional).

  Returns:
    Tensor of shape [..., num_rows, axis_len + num_rows - 1, ...].
  """
  with tf.name_scope(name or 'skew_elements_right'):
    tensor = tf.convert_to_tensor(tensor)

    rank = tensor.shape.rank
    num_rows = get_shape_list(tensor)[axis - 1]
    axis_len = get_shape_list(tensor)[axis]

    if rank is None:
      raise ValueError('Static rank of `tensor` must be known.')
    if axis < 0:
      axis += rank
    if axis <= 0 or axis >= rank:
      raise ValueError('`axis` out of bounds for `tensor` rank.')

    output_len = axis_len + num_rows - 1

    paddings = num_rows * tf.one_hot([-1, axis], rank, axis=0, dtype=tf.int32)

    # [..., num_rows, axis_len + num_rows, ...]
    padded_tensor = tf.pad(tensor, paddings, constant_values=pad_value)

    # [..., num_rows * (axis_len + num_rows), ...]
    flat_tensor = flatten_dims(padded_tensor, first_dim=axis - 1, last_dim=axis)

    padded_tensor2 = pad_to_multiple(
        flat_tensor,
        factor=output_len,
        axis=axis - 1,
        constant_values=pad_value)

    # [..., num_rows + 1, output_len, ...]
    new_shape = tf.concat([
        tf.shape(tensor)[:(axis - 1)], [num_rows + 1, output_len],
        tf.shape(tensor)[(axis + 1):]
    ], 0)
    reshaped_tensor = tf.reshape(padded_tensor2, new_shape)

    # [..., num_rows, output_len, ...]
    output_shape = new_shape - tf.one_hot(axis - 1, depth=rank, dtype=tf.int32)
    return tf.slice(
        reshaped_tensor, begin=tf.zeros_like(output_shape), size=output_shape)
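A NumPy sketch of the same pad-flatten-reshape trick on the 3x3 example from the docstring (the snippet's helper functions are replaced here by plain NumPy calls):

import numpy as np

x = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])
num_rows, axis_len = x.shape
output_len = axis_len + num_rows - 1                   # 5

padded = np.pad(x, [(0, 0), (0, num_rows)])            # pad each row with num_rows zeros
flat = padded.reshape(-1)                              # num_rows * (axis_len + num_rows) elements
flat = np.pad(flat, (0, (-flat.size) % output_len))    # pad up to a multiple of output_len
skewed = flat.reshape(-1, output_len)[:num_rows]       # drop the trailing padding row
print(skewed)
# [[1 2 3 0 0]
#  [0 4 5 6 0]
#  [0 0 7 8 9]]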
Exemple #20
0
    def gating_internel(self, inputs, total_token_num):
        if self.is_training:
            policy = self.switch_policy_train
            capacity_factor = self.capacity_factor_train
        else:
            policy = self.switch_policy_eval
            capacity_factor = self.capacity_factor_eval

        if not self.expert_capacity_dim:
            num_experts = self.num_experts
            capacity = float(int(total_token_num) /
                             int(num_experts)) * float(capacity_factor)
            int_capacity = int(capacity)
            offset = 1 if capacity > float(int_capacity) else 0
            self.expert_capacity_dim = int(offset) + int_capacity
        self.expert_capacity_dim = max(self.expert_capacity_dim,
                                       self.min_expert_capacity)
        tf.logging.info(
            'Setting expert_capacity_dim=%r ('
            'num_experts=%r name_scope=%r)', self.expert_capacity_dim,
            self.num_experts,
            tf.get_default_graph().get_name_scope())

        if self.is_training and policy == "input_dropout":
            inputs = tf.nn.dropout(inputs, 1.0 - self.switch_dropout)

        logits = tf.einsum('GSM,ME->GSE', inputs, self.gating_weight)  # G'SE
        raw_gates = tf.nn.softmax(logits)  # along E dim, G'SE

        if policy in ["argmax", "input_dropout"]:
            _, expert_index = tf.math.top_k(raw_gates, k=1)
            expert_index = tf.squeeze(expert_index, [2])
        else:
            raise ValueError("Unknown Switch gating policy %s" % policy)

        expert_mask = tf.one_hot(expert_index, self.num_experts,
                                 dtype=tffloat)  # G'SE
        density_1_proxy = raw_gates  # G'SE
        importance = tf.ones_like(expert_mask[:, :, 0])  # G'S
        gate_1 = tf.einsum('GSE,GSE->GS', raw_gates, expert_mask)  # G'S

        # We reshape the mask as [X*S, E], and compute cumulative sums of
        # assignment indicators for each expert index e \in 0..E-1 independently.
        # First occurrence of assignment indicator is excluded, see exclusive=True
        # flag below.
        position_in_expert_1 = tf.cumsum(expert_mask, exclusive=True, axis=1)

        # GS Tensor
        capacity = tf.cast(self.expert_capacity_dim,
                           dtype=position_in_expert_1.dtype)

        # GE Tensor (reducing S out of GSE tensor mask_1)
        # density_1[:, e] represents assignment ratio (num assigned / total) to
        # expert e as top_1 expert without taking capacity into account.
        density_denom = tf.reduce_mean(importance, axis=(1))[:,
                                                             tf.newaxis] + 1e-6
        density_1 = tf.reduce_mean(expert_mask, axis=(1)) / density_denom
        # density_1_proxy[:, e] represents mean of raw_gates for expert e, including
        # those of examples not assigned to e with top_k.
        density_1_proxy = tf.reduce_mean(density_1_proxy,
                                         axis=1) / density_denom

        with tf.name_scope('aux_loss'):
            # The MoE paper (https://arxiv.org/pdf/1701.06538.pdf) uses an aux loss of
            # reduce_mean(density_1_proxy * density_1_proxy). Here we replace one of
            # the density_1_proxy with the discrete density_1 following mesh_tensorflow.
            aux_loss = tf.reduce_mean(density_1_proxy *
                                      density_1)  # element-wise
            aux_loss *= self.num_experts * self.num_experts * self.loss_coef  # const coefficient

        expert_mask *= tf.cast(tf.less(position_in_expert_1, capacity),
                               dtype=expert_mask.dtype)
        position_in_expert_1 = tf.einsum('GSE,GSE->GS', position_in_expert_1,
                                         expert_mask)

        # [batch, group] - mostly ones, but zeros where something didn't fit
        mask_1_flat = tf.einsum('GSE->GS', expert_mask)

        gate_1 *= mask_1_flat

        # First top gate as first part of combine tensor
        b = tf.one_hot(tf.cast(position_in_expert_1, dtype=tf.int32),
                       self.expert_capacity_dim,
                       dtype=tffloat,
                       name='one_hot_b_0')  # G'SE
        a = tf.expand_dims(gate_1 * mask_1_flat, -1) * tf.one_hot(
            expert_index, self.num_experts, dtype=tffloat)  # G'SE
        combine_tensor = tf.einsum(
            'GSE,GSC->GSEC', a, b,
            name='first_part_of_combine_tensor')  # G'SEC

        dispatch_mask = tf.cast(tf.cast(combine_tensor, tf.bool),
                                tffloat,
                                name='dispatch_mask')  # G'SEC

        return aux_loss, combine_tensor, dispatch_mask
Exemple #21
0
def compute_label_loss(logits, labels):
    one_hot_labels = tf.one_hot(labels, depth=5, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    loss = -tf.reduce_mean(
        tf.reduce_sum(one_hot_labels * log_probs, axis=-1))
    return loss
Exemple #22
0
def test_train(args):
    """Trains the model."""

    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Create input data pipeline.
    with tf.device("/cpu:0"):
        train_files = glob.glob(args.train_glob)
        if not train_files:
            raise RuntimeError(
                "No training images found with glob '{}'.".format(
                    args.train_glob))
        train_dataset = tf.data.Dataset.from_tensor_slices(train_files)
        train_dataset = train_dataset.shuffle(
            buffer_size=len(train_files)).repeat()
        train_dataset = train_dataset.map(
            read_png, num_parallel_calls=args.preprocess_threads)
        train_dataset = train_dataset.map(
            lambda x: tf.random_crop(x, (args.patchsize, args.patchsize, 3)))
        train_dataset = train_dataset.batch(args.batchsize)
        train_dataset = train_dataset.prefetch(32)

    num_pixels = args.batchsize * args.patchsize**2

    # Get training patch from dataset.
    x = train_dataset.make_one_shot_iterator().get_next()

    lmbda_level = tf.random_uniform([], minval=0, maxval=64, dtype=tf.int32)
    lmbda_onehot = tf.one_hot(tf.reshape(lmbda_level, [1]), depth=64)
    lmbda = 0.1 * tf.pow(2.0, tf.cast(lmbda_level, tf.float32) / 8.0 - 7.0)

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters, lmbda_onehot)
    synthesis_transform = SynthesisTransform(args.num_filters, lmbda_onehot)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters,
                                                      lmbda_onehot)
    hyper_synthesis_transform = HyperSynthesisTransform(
        args.num_filters, lmbda_onehot)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Build autoencoder and hyperprior.
    y = analysis_transform(x)
    z = hyper_analysis_transform(abs(y))
    z_tilde, z_likelihoods = entropy_bottleneck(z, training=True)
    sigma = hyper_synthesis_transform(z_tilde)
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma, scale_table)
    y_tilde, y_likelihoods = conditional_bottleneck(y, training=True)
    x_tilde = synthesis_transform(y_tilde)

    # Total number of bits divided by number of pixels.
    train_bpp = (tf.reduce_sum(tf.log(y_likelihoods)) + tf.reduce_sum(
        tf.log(z_likelihoods))) / (-np.log(2) * num_pixels)

    # Mean squared error across pixels.
    train_mse = tf.reduce_mean(tf.squared_difference(x, x_tilde))
    # Multiply by 255^2 to correct for rescaling.
    train_mse *= 255**2

    # The rate-distortion cost.
    train_loss = lmbda * train_mse + train_bpp

    # Minimize loss and auxiliary loss, and execute update op.
    step = tf.train.create_global_step()
    main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
    main_step = main_optimizer.minimize(train_loss, global_step=step)

    aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    aux_step = aux_optimizer.minimize(entropy_bottleneck.losses[0])

    train_op = tf.group(main_step, aux_step, entropy_bottleneck.updates[0])

    tf.summary.scalar("loss", train_loss)
    tf.summary.scalar("bpp", train_bpp)
    tf.summary.scalar("mse", train_mse)
    tf.summary.scalar("lambda", lmbda_level)

    tf.summary.image("original", quantize_image(x))
    tf.summary.image("reconstruction", quantize_image(x_tilde))

    hooks = [
        tf.train.StopAtStepHook(last_step=args.last_step),
        tf.train.NanTensorHook(train_loss),
    ]
    with tf.train.MonitoredTrainingSession(hooks=hooks,
                                           checkpoint_dir=args.checkpoint_dir,
                                           save_checkpoint_secs=300,
                                           save_summaries_secs=60) as sess:
        while not sess.should_stop():
            sess.run(train_op)
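A one-line sketch of the rate-distortion weight schedule used above: lmbda = 0.1 * 2^(level / 8 - 7) maps the 64 integer levels to lambdas spanning roughly a 235x range.

import numpy as np

levels = np.arange(64)
lmbda = 0.1 * 2.0 ** (levels / 8.0 - 7.0)
print(lmbda[0], lmbda[32], lmbda[63])   # ~0.00078, 0.0125, ~0.18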
Exemple #23
0
def detection_loss(cls_outputs, box_outputs, labels, params):
  """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.
  Args:
    cls_outputs: an OrderDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderDict with keys representing levels and values
      representing box regression targets in
      [batch_size, height, width, num_anchors * 4].
    labels: the dictionary that returned from dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in
      default_hparams function in this file.
  Returns:
    total_loss: a float tensor representing total loss reduced from
      class and box losses from all levels.
    cls_loss: a float tensor representing total class loss.
    box_loss: a float tensor representing total box regression loss.
  """
  # Sum all positives in a batch for normalization and avoid zero
  # num_positives_sum, which would lead to inf loss during training
  num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
  levels = cls_outputs.keys()

  cls_losses = []
  box_losses = []
  for level in levels:
    if params['data_format'] == 'channels_first':
      labels['cls_targets_%d' % level] = tf.transpose(
          labels['cls_targets_%d' % level], [0, 3, 1, 2])
      labels['box_targets_%d' % level] = tf.transpose(
          labels['box_targets_%d' % level], [0, 3, 1, 2])
    # Onehot encoding for classification labels.
    cls_targets_at_level = tf.one_hot(
        labels['cls_targets_%d' % level],
        params['num_classes'])
    if params['data_format'] == 'channels_first':
      bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list()
      cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                        [bs, -1, width, height])
    else:
      bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list()
      cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                        [bs, width, height, -1])
    box_targets_at_level = labels['box_targets_%d' % level]
    cls_loss = _classification_loss(
        cls_outputs[level],
        cls_targets_at_level,
        num_positives_sum,
        alpha=params['alpha'],
        gamma=params['gamma'])
    if params['data_format'] == 'channels_first':
      cls_loss = tf.reshape(cls_loss,
                            [bs, -1, width, height, params['num_classes']])
    else:
      cls_loss = tf.reshape(cls_loss,
                            [bs, width, height, -1, params['num_classes']])
    cls_loss *= tf.cast(tf.expand_dims(
        tf.not_equal(labels['cls_targets_%d' % level], -2), -1), tf.float32)
    cls_losses.append(tf.reduce_sum(cls_loss))
    box_losses.append(
        _box_loss(
            box_outputs[level],
            box_targets_at_level,
            num_positives_sum,
            delta=params['delta']))

  # Sum per level losses to total loss.
  cls_loss = tf.add_n(cls_losses)
  box_loss = tf.add_n(box_losses)
  total_loss = cls_loss + params['box_loss_weight'] * box_loss
  return total_loss, cls_loss, box_loss
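A tiny sketch of the label handling above: tf.one_hot maps out-of-range targets (such as the -2 'ignore' marker) to all-zero rows, and the explicit not_equal(-2) mask then removes those anchors from the class loss.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

num_classes = 3
targets = tf.constant([1, 0, -2])                    # -2 marks an ignored anchor
one_hot = tf.one_hot(targets, num_classes)           # out-of-range index -> all-zero row
ignore_mask = tf.cast(tf.not_equal(targets, -2), tf.float32)

with tf.Session() as sess:
    print(sess.run(one_hot))      # [[0. 1. 0.] [1. 0. 0.] [0. 0. 0.]]
    print(sess.run(ignore_mask))  # [1. 1. 0.]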
Exemple #24
0
def test_compress(args):
    """Compresses an image."""

    # Load input image and add batch dimension.
    fn = tf.placeholder(tf.string, [])

    x = read_png(fn)
    x = tf.expand_dims(x, 0)
    x.set_shape([1, None, None, 3])
    x_shape = tf.shape(x)

    lmbda_level = tf.random_uniform([], minval=0, maxval=64, dtype=tf.int32)
    lmbda_onehot = tf.one_hot(tf.reshape(lmbda_level, [1]), depth=64)
    lmbda = 0.1 * tf.pow(2.0, tf.cast(lmbda_level, tf.float32) / 8.0 - 7.0)

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters, lmbda_onehot)
    synthesis_transform = SynthesisTransform(args.num_filters, lmbda_onehot)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters,
                                                      lmbda_onehot)
    hyper_synthesis_transform = HyperSynthesisTransform(
        args.num_filters, lmbda_onehot)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Transform and compress the image.
    y = analysis_transform(x)
    y_shape = tf.shape(y)
    z = hyper_analysis_transform(abs(y))
    z_hat, z_likelihoods = entropy_bottleneck(z, training=False)
    sigma = hyper_synthesis_transform(z_hat)
    sigma = sigma[:, :y_shape[1], :y_shape[2], :]
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma, scale_table)
    side_string = entropy_bottleneck.compress(z)
    string = conditional_bottleneck.compress(y)

    # Transform the quantized image back (if requested).
    y_hat, y_likelihoods = conditional_bottleneck(y, training=False)
    x_hat = synthesis_transform(y_hat)
    x_hat = x_hat[:, :x_shape[1], :x_shape[2], :]

    num_pixels = tf.cast(tf.reduce_prod(tf.shape(x)[:-1]), dtype=tf.float32)

    # Total number of bits divided by number of pixels.
    eval_bpp = (tf.reduce_sum(tf.log(y_likelihoods)) + tf.reduce_sum(
        tf.log(z_likelihoods))) / (-np.log(2) * num_pixels)

    # Bring both images back to 0..255 range.
    x *= 255
    x_hat = tf.clip_by_value(x_hat, 0, 1)
    x_hat = tf.round(x_hat * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_hat))
    psnr = tf.squeeze(tf.image.psnr(x_hat, x, 255))
    msssim = tf.squeeze(tf.image.ssim_multiscale(x_hat, x, 255))

    with tf.Session() as sess:
        # Load the latest model checkpoint, get the compressed string and the tensor
        # shapes.
        latest = tf.train.latest_checkpoint(checkpoint_dir=args.checkpoint_dir)
        tf.train.Saver().restore(sess, save_path=latest)

        f = open("f6.csv", "w")
        print("level, fn, bpp, mse, np", file=f)
        for i in np.arange(0, 64):
            for filename in glob.glob("kodak/*.png"):

                v_lmbda_level, v_eval_bpp, v_mse, v_num_pixels = sess.run(
                    [lmbda_level, eval_bpp, mse, num_pixels],
                    feed_dict={
                        fn: filename,
                        lmbda_level: i
                    })

                print(
                    "%.2f, %s, %.4f, %.4f, %d" %
                    (v_lmbda_level, filename, v_eval_bpp, v_mse, v_num_pixels),
                    file=f)
        f.close()
Exemple #25
0
    def onehot_labels(self):
        return tf.one_hot(self.labels, NUM_CLASSES)
Exemple #26
0
    def __init__(self,
                 env,
                 q_func,
                 optimizer_spec,
                 session,
                 exploration=LinearSchedule(1000000, 0.1),
                 stopping_criterion=None,
                 replay_buffer_size=1000000,
                 batch_size=32,
                 gamma=0.99,
                 learning_starts=50000,
                 learning_freq=4,
                 frame_history_len=4,
                 target_update_freq=10000,
                 grad_norm_clipping=10,
                 rew_file=None,
                 lander=False):
        """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing a random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        self.target_update_freq = target_update_freq
        self.optimizer_spec = optimizer_spec
        self.batch_size = batch_size
        self.learning_freq = learning_freq
        self.learning_starts = learning_starts
        self.stopping_criterion = stopping_criterion
        self.env = env
        self.session = session
        self.exploration = exploration
        self.rew_file = str(
            uuid.uuid4()) + '.pkl' if rew_file is None else rew_file

        ###############
        # BUILD MODEL #
        ###############

        if len(self.env.observation_space.shape) == 1:
            # This means we are running on low-dimensional observations (e.g. RAM)
            input_shape = self.env.observation_space.shape
        else:
            img_h, img_w, img_c = self.env.observation_space.shape
            input_shape = (img_h, img_w, frame_history_len * img_c)
        self.num_actions = self.env.action_space.n

        # set up placeholders
        # placeholder for current observation (or state)
        self.obs_t_ph = tf.placeholder(tf.float32 if lander else tf.uint8,
                                       [None] + list(input_shape))
        # placeholder for current action
        self.act_t_ph = tf.placeholder(tf.int32, [None])
        # placeholder for current reward
        self.rew_t_ph = tf.placeholder(tf.float32, [None])
        # placeholder for next observation (or state)
        self.obs_tp1_ph = tf.placeholder(tf.float32 if lander else tf.uint8,
                                         [None] + list(input_shape))
        # placeholder for end of episode mask
        # this value is 1 if the next state corresponds to the end of an episode,
        # in which case there is no Q-value at the next state; at the end of an
        # episode, only the current state reward contributes to the target, not the
        # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
        self.done_mask_ph = tf.placeholder(tf.float32, [None])

        # casting to float on GPU ensures lower data transfer times.
        if lander:
            obs_t_float = self.obs_t_ph
            obs_tp1_float = self.obs_tp1_ph
        else:
            obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
            obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0

        # Here, you should fill in your own code to compute the Bellman error. This requires
        # evaluating the current and next Q-values and constructing the corresponding error.
        # TensorFlow will differentiate this error for you, you just need to pass it to the
        # optimizer. See assignment text for details.
        # Your code should produce one scalar-valued tensor: total_error
        # This will be passed to the optimizer in the provided code below.
        # Your code should also produce two collections of variables:
        # q_func_vars
        # target_q_func_vars
        # These should hold all of the variables of the Q-function network and target network,
        # respectively. A convenient way to get these is to make use of TF's "scope" feature.
        # For example, you can create your Q-function network with the scope "q_func" like this:
        # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
        # And then you can obtain the variables like this:
        # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
        # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"
        # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error
        ######

        # YOUR CODE HERE

        self.q_t = q_func(obs_t_float,
                          self.num_actions,
                          scope="q_func",
                          reuse=False)
        self.q_tp1 = q_func(obs_tp1_float,
                            self.num_actions,
                            scope="target_q_func",
                            reuse=False)

        # get Q-value based on the chosen action
        max_q_t = tf.reduce_sum(self.q_t *
                                tf.one_hot(self.act_t_ph, self.num_actions),
                                axis=1)

        # Target max Q-value: max over next-state Q-values.
        max_q = tf.reduce_max(self.q_tp1, axis=1)

        target = self.rew_t_ph + gamma * (1.0 - self.done_mask_ph) * max_q
        self.total_error = tf.reduce_mean(huber_loss(target - max_q_t))

        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope="q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope="target_q_func")

        # construct optimization op (with gradient clipping)
        self.learning_rate = tf.placeholder(tf.float32, (),
                                            name="learning_rate")
        optimizer = self.optimizer_spec.constructor(
            learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
        self.train_fn = minimize_and_clip(optimizer,
                                          self.total_error,
                                          var_list=q_func_vars,
                                          clip_val=grad_norm_clipping)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_fn = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_fn.append(var_target.assign(var))
        self.update_target_fn = tf.group(*update_target_fn)

        # construct the replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size,
                                          frame_history_len,
                                          lander=lander)
        self.replay_buffer_idx = None

        ###############
        # RUN ENV     #
        ###############
        self.model_initialized = False
        self.num_param_updates = 0
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
        self.last_obs = self.env.reset()
        self.log_every_n_steps = 10000

        self.start_time = time.time()
        self.t = 0
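A NumPy sketch (made-up numbers) of the Bellman-error pieces above: multiplying the Q-values by a one-hot of the taken action and summing selects Q(s, a), and the target bootstraps from the next state's max Q unless the episode ended.

import numpy as np

gamma = 0.99
q_t   = np.array([[1.0, 2.0, 3.0],      # Q(s_t, .) for a batch of 2
                  [0.5, 0.1, 0.4]])
q_tp1 = np.array([[0.0, 1.5, 1.0],      # Q(s_{t+1}, .) from the target network
                  [2.0, 0.2, 0.3]])
actions = np.array([2, 0])
rewards = np.array([1.0, -1.0])
done    = np.array([0.0, 1.0])          # second transition ends the episode

num_actions = q_t.shape[1]
q_taken = (q_t * np.eye(num_actions)[actions]).sum(axis=1)    # [3.0, 0.5]
target = rewards + gamma * (1.0 - done) * q_tp1.max(axis=1)   # [2.485, -1.0]
td_error = target - q_taken
print(q_taken, target, td_error)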
Exemple #27
0
    def lstm_decoder_infer(self,
                           inputs,
                           sequence_length,
                           hparams,
                           clss,
                           train,
                           initial_state=None,
                           bottleneck=None):
        # IN PREDICT MODE, RUN tf.while RNN
        max_decode_length = 51
        batch_size = common_layers.shape_list(inputs)[0]
        zero_pad, logits_so_far = self.create_initial_input_for_decode(
            batch_size)

        layers = rnn.MultiRNNCell([
            self.lstm_cell(hparams, train)
            for _ in range(hparams.num_hidden_layers)
        ])

        if initial_state is None:
            raise Exception('initial state should be init from bottleneck!')

        # append one-hot class to bottleneck, which will be given per step
        clss = tf.reshape(clss, [-1])
        if not hparams.use_cls:
            clss = tf.zeros_like(clss)
        if hparams.condition_on_sln:
            sln = tf.reshape(sequence_length, [-1])
            bottleneck = tf.concat(
                (bottleneck, tf.one_hot(clss, hparams.num_categories),
                 tf.one_hot(sln, max_decode_length)), -1)
        else:
            bottleneck = tf.concat(
                (bottleneck, tf.one_hot(clss, hparams.num_categories)), -1)

        def infer_step(logits_so_far, current_hidden):
            """Inference step of LSTM while loop."""
            # unflatten hidden:
            current_hidden = tuple(
                rnn.LSTMStateTuple(c=s[0], h=s[1]) for s in current_hidden)

            # put logits_so_far through top
            tm = self._problem_hparams.modality['targets']
            # need to reuse top params
            reset_scope = tf.variable_scope(tf.VariableScope(
                tf.AUTO_REUSE, ''),
                                            reuse=tf.AUTO_REUSE,
                                            auxiliary_name_scope=False)
            top_scope = tf.variable_scope('svg_decoder/{}_modality'.format(tm),
                                          reuse=tf.AUTO_REUSE)
            with reset_scope, top_scope:
                samples_so_far = self.hparams.top['targets'](
                    logits_so_far, None, self.hparams,
                    self.problem_hparams.vocab_size)
            # append a zero pad to the samples. this effectively shifts the samples
            # right, but, unlike shift_right, by not removing the last element, we
            # allow an empty samples_so_far to not be empty after padding
            samples_so_far = tf.concat([zero_pad, samples_so_far], axis=1)
            shifted_targets = common_layers.flatten4d3d(samples_so_far)
            # now take the very last one here, will be the actual input to the rnn
            shifted_targets = shifted_targets[:, -1:, :]

            # tile and append the bottleneck to inputs
            sln_offset = 0
            if hparams.condition_on_sln:
                sln_offset = 51
            pre_tile_y = tf.reshape(bottleneck, [
                common_layers.shape_list(bottleneck)[0], 1,
                hparams.bottleneck_bits + hparams.num_categories + sln_offset
            ])
            overlay_x = tf.tile(
                pre_tile_y,
                [1, common_layers.shape_list(shifted_targets)[1], 1])
            inputs = tf.concat([shifted_targets, overlay_x], -1)

            seq_len_batch = tf.ones([common_layers.shape_list(inputs)[0]])

            # RUN PRE-LSTM LAYER
            with tf.variable_scope('pre_decoder', reuse=tf.AUTO_REUSE):
                inputs = tf.layers.dense(inputs,
                                         hparams.hidden_size,
                                         name='bottom')
                inputs = tf.nn.tanh(inputs)

            # RUN LSTM
            with tf.variable_scope('lstm_decoder', reuse=tf.AUTO_REUSE):
                next_step, next_state = tf.nn.dynamic_rnn(
                    layers,
                    inputs,
                    seq_len_batch,
                    initial_state=current_hidden,
                    dtype=tf.float32,
                    time_major=False)

            next_step = tf.expand_dims(next_step, [1])
            logits_so_far = tf.concat([logits_so_far, next_step], 1)

            # flatten state
            next_state = tuple((s.c, s.h) for s in next_state)

            return logits_so_far, next_state

        def while_exit_cond(logits_so_far, unused_current_hidden):
            length = common_layers.shape_list(logits_so_far)[1]
            return length < max_decode_length

        # passing state must be flattened:
        initial_state = tuple((s.c, s.h) for s in initial_state)

        # actually run tf.while:
        logits, final_state = tf.while_loop(
            while_exit_cond,
            infer_step, [logits_so_far, initial_state],
            shape_invariants=[
                tf.TensorShape([None, None, 1, hparams.hidden_size]),
                tuple((s[0].get_shape(), s[1].get_shape())
                      for s in initial_state),
            ],
            back_prop=False,
            parallel_iterations=1)

        # logits should be returned in 3d mode:
        logits = common_layers.flatten4d3d(logits)

        return logits, final_state
Exemple #28
0
import tensorflow.compat.v1 as tf
import numpy as np

path = 'https://raw.githubusercontent.com/hunkim/DeepLearningZeroToAll/master/data-04-zoo.csv'
xy = np.genfromtxt(path, delimiter=',', dtype=np.float32)
x_data = xy[:, 0:-1]
y_data = xy[:, [-1]]

nb_classes = 7  # 0 ~ 6

X = tf.placeholder(tf.float32, [None, 16])
Y = tf.placeholder(tf.int32, [None, 1])   # 0 ~ 6

Y_one_hot = tf.one_hot(Y, nb_classes)   # one hot
Y_one_hot = tf.reshape(Y_one_hot, [-1, nb_classes])

W = tf.Variable(tf.random_normal([16, nb_classes]), name='weight')
b = tf.Variable(tf.random_normal([nb_classes]), name='bias')

# tf.nn.softmax compute softmax activations
# softmax = exp(logits) / reduce_sum(exp(logits), dim)
logits = tf.matmul(X, W) + b
hypothesis = tf.nn.softmax(logits)

# Cross entropy cost/loss
cost_i = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y_one_hot)
cost = tf.reduce_mean(cost_i)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

prediction = tf.argmax(hypothesis, 1)
correct_prediction = tf.equal(prediction, tf.argmax(Y_one_hot, 1))
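The excerpt above stops before the training loop; a minimal sketch of one common way to finish it (the accuracy node and the loop below are assumptions, not part of the original script):

# Hypothetical continuation of the script above.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(2001):
        _, c, a = sess.run([optimizer, cost, accuracy],
                           feed_dict={X: x_data, Y: y_data})
        if step % 400 == 0:
            print("step: {:5d}\tcost: {:.3f}\tacc: {:.2%}".format(step, c, a))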
Exemple #29
0
def train(flags):
  """Training entry point."""
  log_dir = flags.log_dir
  flags.pretrained_model_dir = log_dir
  log_dir = os.path.join(log_dir, 'train')
  flags.eval_interval_secs = 0
  with tf.Graph().as_default():
    global_step = tf.Variable(
        0, trainable=False, name='global_step', dtype=tf.int64)
    global_step_confidence = tf.Variable(
        0, trainable=False, name='global_step_confidence', dtype=tf.int64)

    model = build_model(flags)
    images_query_pl, labels_query_pl, \
    images_support_pl, labels_support_pl = \
      build_episode_placeholder(flags)

    # Augments the input.
    if flags.dataset == 'cifar10' or flags.dataset == 'cifar100':
      images_query_pl_aug = data_loader.augment_cifar(
          images_query_pl, is_training=True)
      images_support_pl_aug = data_loader.augment_cifar(
          images_support_pl, is_training=True)
    elif flags.dataset == 'tinyimagenet':
      images_query_pl_aug = data_loader.augment_tinyimagenet(
          images_query_pl, is_training=True)
      images_support_pl_aug = data_loader.augment_tinyimagenet(
          images_support_pl, is_training=True)

    logits, logits_z = build_proto_train_graph(
        images_query=images_query_pl_aug,
        images_support=images_support_pl_aug,
        flags=flags,
        is_training=True,
        model=model)
    # Losses and optimizer
    ## Classification loss
    loss_classification = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits,
            labels=tf.one_hot(labels_query_pl, flags.num_classes_train)))

    # Confidence loss
    _, top_k_indices = tf.nn.top_k(logits, k=1)
    pred = tf.squeeze(top_k_indices)
    incorrect_mask = tf.math.logical_not(tf.math.equal(pred, labels_query_pl))
    incorrect_logits_z = tf.boolean_mask(logits_z, incorrect_mask)
    incorrect_labels_z = tf.boolean_mask(labels_query_pl, incorrect_mask)
    signal_variance = tf.math.reduce_sum(tf.cast(incorrect_mask, tf.int32))
    loss_variance_incorrect = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=incorrect_logits_z,
            labels=tf.one_hot(incorrect_labels_z, flags.num_classes_train)))
    loss_variance_zero = 0.0
    loss_confidence = tf.cond(
        tf.greater(signal_variance, 0), lambda: loss_variance_incorrect,
        lambda: loss_variance_zero)

    regu_losses = tf.losses.get_regularization_losses()
    loss = tf.add_n([loss_classification] + regu_losses)

    # Learning rate
    if flags.lr_anneal == 'const':
      learning_rate = flags.init_learning_rate
    elif flags.lr_anneal == 'pwc':
      learning_rate = get_pwc_learning_rate(global_step, flags)
    elif flags.lr_anneal == 'exp':
      lr_decay_step = flags.number_of_steps // flags.n_lr_decay
      learning_rate = tf.train.exponential_decay(
          flags.init_learning_rate,
          global_step,
          lr_decay_step,
          1.0 / flags.lr_decay_rate,
          staircase=True)
    else:
      raise Exception('Not implemented')

    # Optimizer
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=0.9)
    optimizer_confidence = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=0.9)

    train_op = contrib_slim.learning.create_train_op(
        total_loss=loss,
        optimizer=optimizer,
        global_step=global_step,
        clip_gradient_norm=flags.clip_gradient_norm)
    variable_variance = []
    for v in tf.trainable_variables():
      if 'fc_variance' in v.name:
        variable_variance.append(v)
    train_op_confidence = contrib_slim.learning.create_train_op(
        total_loss=loss_confidence,
        optimizer=optimizer_confidence,
        global_step=global_step_confidence,
        clip_gradient_norm=flags.clip_gradient_norm,
        variables_to_train=variable_variance)

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_classification', loss_classification)
    tf.summary.scalar('loss_variance', loss_confidence)
    tf.summary.scalar('regu_loss', tf.add_n(regu_losses))
    tf.summary.scalar('learning_rate', learning_rate)
    # Merges all summaries except for pretrain
    summary = tf.summary.merge(
        tf.get_collection('summaries', scope='(?!pretrain).*'))

    # Gets datasets
    few_shot_data_train, test_dataset, train_dataset = get_train_datasets(flags)
    # Defines session and logging
    summary_writer_train = tf.summary.FileWriter(log_dir, flush_secs=1)
    saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
    print(saver.saver_def.filename_tensor_name)
    print(saver.saver_def.restore_op_name)
    # pylint: disable=unused-variable
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    supervisor = tf.train.Supervisor(
        logdir=log_dir,
        init_feed_dict=None,
        summary_op=None,
        init_op=tf.global_variables_initializer(),
        summary_writer=summary_writer_train,
        saver=saver,
        global_step=global_step,
        save_summaries_secs=flags.save_summaries_secs,
        save_model_secs=0)

    with supervisor.managed_session() as sess:
      checkpoint_step = sess.run(global_step)
      if checkpoint_step > 0:
        checkpoint_step += 1
      eval_interval_steps = flags.eval_interval_steps
      for step in range(checkpoint_step, flags.number_of_steps):
        # Computes the classification loss using a batch of data.
        images_query, labels_query,\
        images_support, labels_support = \
          few_shot_data_train.next_few_shot_batch(
              query_batch_size_per_task=flags.train_batch_size,
              num_classes_per_task=flags.num_classes_train,
              num_supports_per_class=flags.num_shots_train,
              num_tasks=flags.num_tasks_per_batch)

        feed_dict = {
            images_query_pl: images_query.astype(dtype=np.float32),
            labels_query_pl: labels_query,
            images_support_pl: images_support.astype(dtype=np.float32),
            labels_support_pl: labels_support
        }

        t_batch = time.time()
        dt_batch = time.time() - t_batch

        t_train = time.time()
        loss, loss_confidence = sess.run([train_op, train_op_confidence],
                                         feed_dict=feed_dict)
        dt_train = time.time() - t_train

        if step % 100 == 0:
          summary_str = sess.run(summary, feed_dict=feed_dict)
          summary_writer_train.add_summary(summary_str, step)
          summary_writer_train.flush()
          logging.info('step %d, loss : %.4g, dt: %.3gs, dt_batch: %.3gs', step,
                       loss, dt_train, dt_batch)

        if float(step) / flags.number_of_steps > 0.5:
          eval_interval_steps = flags.eval_interval_fine_steps

        if eval_interval_steps > 0 and step % eval_interval_steps == 0:
          saver.save(sess, os.path.join(log_dir, 'model'), global_step=step)
          eval(
              flags=flags,
              train_dataset=train_dataset,
              test_dataset=test_dataset)

        if float(
            step
        ) > 0.5 * flags.number_of_steps + flags.number_of_steps_to_early_stop:
          break
Exemple #30
0
    def get_prediction_module(self, bert_model, features, is_training,
                              percent_done):
        final_hidden = bert_model.get_sequence_output()

        final_hidden_shape = modeling.get_shape_list(final_hidden,
                                                     expected_rank=3)
        batch_size = final_hidden_shape[0]
        seq_length = final_hidden_shape[1]
        # hidden_size = final_hidden_shape[2]

        # lstm_fw = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        # lstm_bw = tf.keras.layers.LSTM(hidden_size, return_sequences=True, go_backwards=True)
        # biLSTM = tf.keras.layers.Bidirectional(lstm_fw, backward_layer=lstm_bw, merge_mode='concat')
        # final_hidden = biLSTM(final_hidden)

        # biLSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True))
        # lstm = tf.keras.layers.LSTM(100, return_sequences=True)
        # linear = tf.keras.layers.Dense(2, activation=None)
        # final_hidden = biLSTM(final_hidden)
        # final_hidden = linear(final_hidden)

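        # Valid answer positions: tokens inside the passage segment
        # (input_mask * segment_ids), plus position 0 ([CLS]), which is used as
        # the no-answer target.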
        answer_mask = tf.cast(features["input_mask"], tf.float32)
        answer_mask *= tf.cast(features["segment_ids"], tf.float32)
        answer_mask += tf.one_hot(0, seq_length)

        start_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)

        start_top_log_probs = tf.zeros([batch_size, self.config.beam_size])
        start_top_index = tf.zeros([batch_size, self.config.beam_size],
                                   tf.int32)
        end_top_log_probs = tf.zeros(
            [batch_size, self.config.beam_size, self.config.beam_size])
        end_top_index = tf.zeros(
            [batch_size, self.config.beam_size, self.config.beam_size],
            tf.int32)
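        # Joint prediction conditions the end-position scores on the start
        # position: the gold start at training time, the top beam_size start
        # candidates at eval time.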
        if self.config.joint_prediction:
            start_logits += 1000.0 * (answer_mask - 1)
            start_log_probs = tf.nn.log_softmax(start_logits)
            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=self.config.beam_size)

            if not is_training:
                # batch, beam, length, hidden
                end_features = tf.tile(tf.expand_dims(final_hidden, 1),
                                       [1, self.config.beam_size, 1, 1])
                # batch, beam, length
                start_index = tf.one_hot(start_top_index,
                                         depth=seq_length,
                                         axis=-1,
                                         dtype=tf.float32)
                # batch, beam, hidden
                start_features = tf.reduce_sum(
                    tf.expand_dims(final_hidden, 1) *
                    tf.expand_dims(start_index, -1),
                    axis=-2)
                # batch, beam, length, hidden
                start_features = tf.tile(tf.expand_dims(start_features, 2),
                                         [1, 1, seq_length, 1])
            else:
                start_index = tf.one_hot(features[self.name +
                                                  "_start_positions"],
                                         depth=seq_length,
                                         axis=-1,
                                         dtype=tf.float32)
                start_features = tf.reduce_sum(
                    tf.expand_dims(start_index, -1) * final_hidden, axis=1)
                start_features = tf.tile(tf.expand_dims(start_features, 1),
                                         [1, seq_length, 1])
                end_features = final_hidden

            final_repr = tf.concat([start_features, end_features], -1)
            final_repr = tf.layers.dense(final_repr,
                                         512,
                                         activation=modeling.gelu,
                                         name="qa_hidden")
            # batch, beam, length (batch, length when training)
            end_logits = tf.squeeze(tf.layers.dense(final_repr, 1),
                                    -1,
                                    name="qa_logits")
            if is_training:
                end_logits += 1000.0 * (answer_mask - 1)
            else:
                end_logits += tf.expand_dims(1000.0 * (answer_mask - 1), 1)

            if not is_training:
                end_log_probs = tf.nn.log_softmax(end_logits)
                end_top_log_probs, end_top_index = tf.nn.top_k(
                    end_log_probs, k=self.config.beam_size)
                end_logits = tf.zeros([batch_size, seq_length])
        else:
            end_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)
            start_logits += 1000.0 * (answer_mask - 1)
            end_logits += 1000.0 * (answer_mask - 1)

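        # Cross-entropy of the gold position under a softmax over all sequence
        # positions, computed per example.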
        def compute_loss(logits, positions):
            one_hot_positions = tf.one_hot(positions,
                                           depth=seq_length,
                                           dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
            return loss

        start_positions = features[self.name + "_start_positions"]
        end_positions = features[self.name + "_end_positions"]

        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)

        losses = (start_loss + end_loss) / 2.0

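        # Optional answerability classifier (e.g. for SQuAD 2.0): scores whether
        # the question is answerable from the [CLS] representation, optionally
        # augmented with a start-probability-weighted pooling of the encoder
        # output.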
        answerable_logit = tf.zeros([batch_size])
        if self.config.answerable_classifier:
            final_repr = final_hidden[:, 0]
            if self.config.answerable_uses_start_logits:
                start_p = tf.nn.softmax(start_logits)
                start_feature = tf.reduce_sum(tf.expand_dims(start_p, -1) *
                                              final_hidden,
                                              axis=1)
                final_repr = tf.concat([final_repr, start_feature], -1)
                final_repr = tf.layers.dense(final_repr,
                                             512,
                                             activation=modeling.gelu)
            answerable_logit = tf.squeeze(tf.layers.dense(final_repr, 1), -1)
            answerable_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.cast(features[self.name + "_is_impossible"],
                               tf.float32),
                logits=answerable_logit)
            losses += answerable_loss * self.config.answerable_weight

        return losses, dict(
            loss=losses,
            start_logits=start_logits,
            end_logits=end_logits,
            answerable_logit=answerable_logit,
            start_positions=features[self.name + "_start_positions"],
            end_positions=features[self.name + "_end_positions"],
            start_top_log_probs=start_top_log_probs,
            start_top_index=start_top_index,
            end_top_log_probs=end_top_log_probs,
            end_top_index=end_top_index,
            eid=features[self.name + "_eid"],
        )
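
The 1000.0 * (answer_mask - 1) terms above implement additive masking in logit
space: disallowed positions get a large negative offset, so they receive
essentially zero probability after the softmax. A minimal NumPy illustration
(values are hypothetical, not part of the example):

import numpy as np

logits = np.array([2.0, 1.0, 0.5, 3.0])        # raw scores for 4 positions
answer_mask = np.array([1.0, 1.0, 0.0, 0.0])   # only the first two are valid

masked = logits + 1000.0 * (answer_mask - 1)   # invalid positions get -1000
probs = np.exp(masked) / np.exp(masked).sum()  # softmax
print(probs.round(4))                          # -> [0.7311 0.2689 0.     0.    ]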