Beispiel #1
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Runs `model` on `features` and, depending on `mode`, builds the prediction
    dictionary, the training op, or the evaluation metrics.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal. It is transposed to
        channels-first when params['data_format'] == 'channels_first'.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated
        from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs. It is
        called as `model(features, config=hparams_config.Config(params))` and
        must return two dictionaries keyed by level.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      In PREDICT mode, a tf.estimator.EstimatorSpec whose predictions hold the
      (possibly transposed) input image and the per-level class/box outputs.
      Otherwise a TPUEstimatorSpec to run training or evaluation.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set. This is only
        checked in TRAIN mode.
    """
    # Switch the input to channels-first layout when the model is configured
    # for it (the transpose assumes the incoming tensor is NHWC).
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs():
        return model(features, config=hparams_config.Config(params))

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            # Cast the outputs back to float32 so everything downstream
            # (losses, metrics, predictions) works on float32 tensors.
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # NOTE: 'image' is `features` after the optional channels-first
        # transpose above, not necessarily the tensor that was passed in.
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/l2_loss', l2loss)
        utils.scalar('trainloss/loss', total_loss)

    # `ema` and `ema_vars` only exist when a non-zero decay is configured;
    # every later use is guarded by the same `moving_average_decay` check.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            # The returned train_op is the EMA update; the control dependency
            # makes it run after the optimizer step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            # On TPU the metric function is given the number of shards times
            # the per-shard batch size.
            if params['use_tpu']:
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'],
                                              **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # The scalar losses are tiled to shape [batch_size, 1] before being
        # handed to metric_fn (presumably because eval metric inputs need a
        # leading batch dimension -- TODO confirm).
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        # Called for its effect on `metric_fn_inputs`; the return value is
        # not used here.
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    # 'ckpt' wins over 'backbone_ckpt' when picking the path; having both set
    # is rejected below (TRAIN mode only).
    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn)
Beispiel #2
0
    def create_graph(self):
        """Build the TF1 graph: placeholders, model, losses and train ops.

        The model embeds the integer features in `tf_x`, expands them into a
        sequence of `MAX_SEQ_LEN` steps and predicts one rate per step, either
        with per-example dense layers (`self.DNN_MODEL`) or with an LSTM.
        Products over prefixes of those per-step rates give the final
        survival rate and the two quantities used by the ANLP loss.

        Attributes set on `self`:
          tf_x, tf_y, tf_bid_len, tf_market_price, tf_control_parameter:
            input placeholders.
          tf_rnn_len: max(bid_len, market_price) + 2, used as the RNN
            sequence length.
          preds, mp_para, rate_result, final_survival_rate, predict:
            intermediate and output tensors.
          anlp_node, cost, com_cost: the ANLP loss, the cross-entropy + L2
            loss, and their alpha/beta weighted combination.
          train_op, anlp_train_op, com_train_op: gradient-clipped Adam
            updates for `cost`, `anlp_node` and `com_cost` respectively.
          accuracy: mean of argmax agreement between `predict` and `tf_y`.

        When `self.add_time_feature` is false the flattened embedding is
        repeated unchanged for every time step (previously the sequence input
        stayed `None` in that case and graph construction crashed).
        """
        BATCH_SIZE = self.BATCH_SIZE
        self.tf_x = tf.placeholder(tf.int32, [BATCH_SIZE, self.FEATURE_SIZE],
                                   name="tf_x")
        self.tf_y = tf.placeholder(tf.float32, [BATCH_SIZE, 2], name="tf_y")
        self.tf_bid_len = tf.placeholder(tf.int32, [BATCH_SIZE], name="tf_len")
        self.tf_market_price = tf.placeholder(tf.int32, [BATCH_SIZE],
                                              name="tf_market_price")
        self.tf_control_parameter = tf.placeholder(tf.float32, [2],
                                                   name="tf_control_parameter")
        # Weights of the two losses in the combined objective.
        alpha = self.tf_control_parameter[0]
        beta = self.tf_control_parameter[1]
        self.tf_rnn_len = tf.maximum(self.tf_bid_len, self.tf_market_price) + 2
        embeddings = tf.Variable(self.init_matrix([self.MAX_DEN,
                                                   self.EMB_DIM]))
        x_emds = tf.nn.embedding_lookup(embeddings, self.tf_x)
        flat_input = tf.reshape(
            x_emds, [BATCH_SIZE, self.FEATURE_SIZE * self.EMB_DIM])

        if self.add_time_feature:
            middle_layer = tf.layers.dense(flat_input,
                                           self.MIDDLE_FEATURE_SIZE,
                                           tf.nn.relu)  # hidden layer

            def add_time(x):
                # Repeat one example's hidden vector for every step and append
                # the step index as an extra float feature.
                y = tf.reshape(tf.tile(x, [self.MAX_SEQ_LEN]),
                               [self.MAX_SEQ_LEN, self.MIDDLE_FEATURE_SIZE])
                t = tf.reshape(tf.range(self.MAX_SEQ_LEN),
                               [self.MAX_SEQ_LEN, 1])
                z = tf.concat([y, tf.cast(t, dtype=tf.float32)], 1)
                return z

            input_x = tf.map_fn(add_time, middle_layer)
        else:
            # Without the time feature, feed the same flattened embedding at
            # every step: [BATCH_SIZE, MAX_SEQ_LEN, FEATURE_SIZE * EMB_DIM].
            input_x = tf.reshape(
                tf.tile(flat_input, [1, self.MAX_SEQ_LEN]),
                [BATCH_SIZE, self.MAX_SEQ_LEN,
                 self.FEATURE_SIZE * self.EMB_DIM])

        if self.DNN_MODEL:
            # NOTE(review): every batch position gets its own dense layer (and
            # therefore its own weights). Kept as-is so existing variables and
            # checkpoints stay compatible -- confirm this is intended.
            outlist = [
                tf.layers.dense(input_x[i], 1, tf.nn.sigmoid)
                for i in range(self.BATCH_SIZE)
            ]
            preds = tf.reshape(tf.stack(outlist, axis=0),
                               [self.BATCH_SIZE, self.MAX_SEQ_LEN],
                               name="preds")
        else:
            rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.STATE_SIZE)

            outputs, _ = tf.nn.dynamic_rnn(
                rnn_cell,  # cell you have chosen
                input_x,  # input
                initial_state=None,  # the initial hidden state
                dtype=tf.float32,  # must given if set initial_state = None
                # False: (batch, time step, input); True: (time step, batch, input)
                time_major=False,
                sequence_length=self.tf_rnn_len)

            new_output = tf.reshape(
                outputs, [self.MAX_SEQ_LEN * BATCH_SIZE, self.STATE_SIZE])

            with tf.variable_scope('softmax'):
                W = tf.get_variable('W', [self.STATE_SIZE, 1])
                b = tf.get_variable('b', [1],
                                    initializer=tf.constant_initializer(0))

            logits = tf.matmul(new_output, W) + b
            # [MAX_SEQ_LEN * BATCH_SIZE, 1] -> transpose -> take row 0, i.e. a
            # flat vector of per-step rates. Op names are left untouched.
            preds = tf.transpose(tf.nn.sigmoid(logits, name="preds"),
                                 name="preds")[0]

        self.preds = preds
        survival_rate = preds
        batch_rnn_survival_rate = tf.reshape(survival_rate,
                                             [BATCH_SIZE, self.MAX_SEQ_LEN])

        # Append bid_len and market_price as two extra float columns so that
        # the per-row function below can read them back out of each row.
        map_parameter = tf.concat([
            batch_rnn_survival_rate,
            tf.cast(tf.reshape(self.tf_bid_len, [BATCH_SIZE, 1]), tf.float32)
        ], 1)
        map_parameter = tf.concat([
            map_parameter,
            tf.cast(tf.reshape(self.tf_market_price, [BATCH_SIZE, 1]),
                    tf.float32)
        ], 1)

        def reduce_mul(x):
            # x = [rate_0 .. rate_{MAX_SEQ_LEN-1}, bid_len, market_price].
            bid_len = tf.cast(x[self.MAX_SEQ_LEN], dtype=tf.int32)
            market_len = tf.cast(x[self.MAX_SEQ_LEN + 1], dtype=tf.int32)
            survival_rate_last_one = tf.reduce_prod(x[0:bid_len])
            anlp_rate_last_one = tf.reduce_prod(x[0:market_len + 1])
            anlp_rate_last_two = tf.reduce_prod(x[0:market_len])
            ret = tf.stack([
                survival_rate_last_one, anlp_rate_last_one, anlp_rate_last_two
            ])
            return ret

        self.mp_para = map_parameter
        rate_result = tf.map_fn(reduce_mul,
                                elems=map_parameter,
                                name="rate_result")
        self.rate_result = rate_result
        # prod(rates[:market_len]) - prod(rates[:market_len + 1]); the 1e-20
        # offset keeps the log finite when the difference is zero.
        log_minus = tf.log(
            tf.add(
                tf.transpose(rate_result)[2] - tf.transpose(rate_result)[1],
                1e-20))  #todo debug

        self.anlp_node = -tf.reduce_sum(
            log_minus) / self.BATCH_SIZE  #todo load name
        self.anlp_node = tf.add(self.anlp_node, 0, name="anlp_node")
        self.final_survival_rate = tf.transpose(rate_result)[0]
        final_dead_rate = tf.subtract(tf.constant(1.0, dtype=tf.float32),
                                      self.final_survival_rate)

        # Two-column output per example: [survival, 1 - survival].
        self.predict = tf.transpose(tf.stack(
            [self.final_survival_rate, final_dead_rate]),
                                    name="predict")
        cross_entropy = -tf.reduce_sum(
            self.tf_y * tf.log(tf.clip_by_value(self.predict, 1e-10, 1.0)))

        tvars = tf.trainable_variables()
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in tvars]) * self.L2_NORM
        cost = tf.add(cross_entropy, lossL2, name="cost") / self.BATCH_SIZE
        self.cost = tf.add(cost, 0, name="cost")
        optimizer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                           beta2=0.99)  #.minimize(cost)
        optimizer_anlp = tf.train.AdamOptimizer(learning_rate=self.ANLP_LR,
                                                beta2=0.99)  #.minimize(cost)

        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars),
            self.GRAD_CLIP,
        )
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  name="train_op")
        tf.add_to_collection('train_op', self.train_op)

        anlp_grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.anlp_node, tvars),
            self.GRAD_CLIP,
        )
        self.anlp_train_op = optimizer_anlp.apply_gradients(
            zip(anlp_grads, tvars), name="anlp_train_op")
        tf.add_to_collection('anlp_train_op', self.anlp_train_op)

        self.com_cost = tf.add(alpha * self.cost, beta * self.anlp_node)
        com_grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.com_cost, tvars),
            self.GRAD_CLIP,
        )

        # NOTE(review): this reuses the op name "train_op" (looks like a
        # copy/paste of the call above). The name is left unchanged so the
        # graph's op names stay as they were; the op is retrievable through
        # the 'com_train_op' collection.
        self.com_train_op = optimizer.apply_gradients(zip(com_grads, tvars),
                                                      name="train_op")
        tf.add_to_collection('com_train_op', self.com_train_op)

        correct_pred = tf.equal(tf.argmax(self.predict, 1),
                                tf.argmax(self.tf_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32),
                                       name="accuracy")
Beispiel #3
0
 def fix_video_dims_and_concat_on_x_axis(x):
     """Rearrange a 5-D tensor into 4-D by merging two of its axes.

     Input axes 1, 3 and 4 are moved to the front and reshaped to
     `batch_size`, `frame_height` and `frame_channels` (names taken from the
     enclosing scope). Input axes 0 and 2 are folded into a single axis, which
     ends up at position 1 of the result.

     Presumably axis 0 is time and axis 2 is frame width, so the frames end up
     side by side -- TODO confirm against the caller.
     """
     # Bring the axes that are kept separate to the front: (1, 3, 4, 0, 2).
     reordered = tf.transpose(x, [1, 3, 4, 0, 2])
     # Collapse the two trailing axes into one.
     merged = tf.reshape(
         reordered, [batch_size, frame_height, frame_channels, -1])
     # Move the merged axis directly after the batch axis.
     return tf.transpose(merged, [0, 3, 1, 2])
Beispiel #4
0
def evolved_transformer_decoder(decoder_input,
                                encoder_output,
                                decoder_self_attention_bias,
                                encoder_decoder_attention_bias,
                                hparams,
                                cache=None,
                                decode_loop_step=None,
                                name="decoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None):
  """Evolved Transformer decoder. See arxiv.org/abs/1901.11117 for more details.

  Args:
    decoder_input: a Tensor.
    encoder_output: a Tensor.
    decoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias()).
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
      layers, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    name: a string.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This is used to mask out
      padding in convolutional layers.  We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not supported.

  Returns:
    Decoder output tensor.
  """
  del losses

  num_trainable_top_decoder_layers = hparams.get(
      "num_trainable_top_decoder_layers", -1)  # -1 means train all weights.

  if num_trainable_top_decoder_layers >= 0:
    encoder_output = tf.stop_gradient(encoder_output)

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name):
    hidden_state = decoder_input

    num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers
    for layer in range(num_layers):
      if num_trainable_top_decoder_layers == num_layers - layer:
        hidden_state = tf.stop_gradient(hidden_state)
      layer_name = "layer_%d" % layer
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):

        with tf.variable_scope(_SIXTEEN_HEAD_ATTENTION_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          attention_cache = layer_cache[
              _SIXTEEN_HEAD_ATTENTION_NAME] if layer_cache is not None else None
          left_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              _capped_double_heads(hparams.num_heads),
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=attention_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

        if encoder_output is not None:
          with tf.variable_scope(_FIRST_ATTEND_TO_ENCODER_NAME):
            attention_cache = (
                layer_cache[_FIRST_ATTEND_TO_ENCODER_NAME]
                if layer_cache is not None else None)
            right_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=attention_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))

            left_state = tf.nn.dropout(left_state,
                                       1 - hparams.layer_prepostprocess_dropout)
            right_state = tf.nn.dropout(
                right_state, 1 - hparams.layer_prepostprocess_dropout)

            hidden_state = residual_state + left_state + right_state

        else:
          hidden_state = common_layers.layer_postprocess(
              residual_state, left_state, hparams)

        with tf.variable_scope(_CONV_BRANCHES_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
            hidden_state *= mask

          if layer_cache:
            if decode_loop_step is None:
              hidden_state = layer_cache[
                  _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.concat(
                      [
                          layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME],
                          hidden_state
                      ],
                      axis=1)[:, -1 * _DECODER_LEFT_CONV_PADDING - 1:, :]
              left_state = hidden_state
              right_state = hidden_state[:, _DECODER_LEFT_CONV_PADDING -
                                         _DECODER_RIGHT_CONV_PADDING:, :]

            else:
              # Inplace update is required for inference on TPU.
              # Inplace_ops only supports inplace_update on the first dimension.
              tmp = tf.transpose(
                  layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME], perm=[1, 0, 2])
              tmp = tf.expand_dims(tmp, axis=1)
              tmp = inplace_ops.alias_inplace_update(
                  tmp,
                  decode_loop_step * tf.shape(hidden_state)[1] +
                  _DECODER_LEFT_CONV_PADDING,
                  tf.transpose(hidden_state, perm=[1, 0, 2]))
              tmp = tf.squeeze(tmp, axis=1)
              hidden_state = layer_cache[
                  _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.transpose(
                      tmp, perm=[1, 0, 2])

              batch_size = hidden_state.shape.as_list()[0]
              left_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [
                  batch_size, _DECODER_LEFT_CONV_PADDING + 1,
                  hparams.hidden_size
              ])
              right_state = tf.slice(hidden_state, [
                  0, decode_loop_step + _DECODER_LEFT_CONV_PADDING -
                  _DECODER_RIGHT_CONV_PADDING, 0
              ], [
                  batch_size, _DECODER_RIGHT_CONV_PADDING + 1,
                  hparams.hidden_size
              ])

          else:  # No caching.
            left_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_LEFT_CONV_PADDING, 0], [0, 0]])
            right_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_RIGHT_CONV_PADDING, 0], [0, 0]])

          left_output_dim = int(hparams.hidden_size * 2)
          separable_conv_11x1 = tf.layers.SeparableConv1D(
              left_output_dim,
              11,
              padding="VALID",
              name="separable_conv11x1",
              activation=tf.nn.relu)
          left_state = separable_conv_11x1.apply(left_state)
          left_state = tf.nn.dropout(left_state,
                                     1 - hparams.layer_prepostprocess_dropout)

          right_output_dim = int(hparams.hidden_size / 2)
          separable_conv_7x1_1 = tf.layers.SeparableConv1D(
              right_output_dim, 7, padding="VALID", name="separable_conv_7x1_1")
          right_state = separable_conv_7x1_1.apply(right_state)
          right_state = tf.nn.dropout(right_state,
                                      1 - hparams.layer_prepostprocess_dropout)
          right_state = tf.pad(
              right_state,
              [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]],
              constant_values=0)

          hidden_state = left_state + right_state

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)
          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size * 2])
            hidden_state *= mask

          if layer_cache:
            if decode_loop_step is None:
              hidden_state = layer_cache[
                  _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.concat(
                      [
                          layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME],
                          hidden_state
                      ],
                      axis=1)[:, -1 * _DECODER_FINAL_CONV_PADDING - 1:, :]

            else:
              # Inplace update is required for inference on TPU.
              # Inplace_ops only supports inplace_update on the first dimension.
              tmp = tf.transpose(
                  layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME], perm=[1, 0, 2])
              tmp = tf.expand_dims(tmp, axis=1)
              tmp = inplace_ops.alias_inplace_update(
                  tmp, (decode_loop_step + _DECODER_FINAL_CONV_PADDING) *
                  tf.shape(hidden_state)[1],
                  tf.transpose(hidden_state, perm=[1, 0, 2]))
              tmp = tf.squeeze(tmp, axis=1)
              hidden_state = layer_cache[
                  _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.transpose(
                      tmp, perm=[1, 0, 2])

              batch_size = hidden_state.shape.as_list()[0]
              hidden_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [
                  batch_size, _DECODER_FINAL_CONV_PADDING + 1,
                  hparams.hidden_size * 2
              ])
          else:
            hidden_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_FINAL_CONV_PADDING, 0], [0, 0]])

          separable_conv_7x1_2 = tf.layers.SeparableConv1D(
              hparams.hidden_size,
              7,
              padding="VALID",
              name="separable_conv_7x1_2")
          hidden_state = separable_conv_7x1_2.apply(hidden_state)

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope(_VANILLA_ATTENTION_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          attention_cache = layer_cache[
              _VANILLA_ATTENTION_NAME] if layer_cache is not None else None
          hidden_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=attention_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        if encoder_output is not None:
          with tf.variable_scope(_SECOND_ATTEND_TO_ENCODER_NAME):
            residual_state = hidden_state
            hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

            attention_cache = (
                layer_cache[_SECOND_ATTEND_TO_ENCODER_NAME]
                if layer_cache is not None else None)
            hidden_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=attention_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))
            hidden_state = common_layers.layer_postprocess(
                residual_state, hidden_state, hparams)

        with tf.variable_scope("dense_layers"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = tf.layers.dense(
              hidden_state,
              int(hparams.hidden_size * 4),
              activation=tf.nn.swish)
          hidden_state = tf.nn.dropout(hidden_state,
                                       1 - hparams.layer_prepostprocess_dropout)

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = tf.layers.dense(hidden_state, hparams.hidden_size)
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

    decoder_output = common_layers.layer_preprocess(hidden_state, hparams)
    if num_trainable_top_decoder_layers == 0:
      decoder_output = tf.stop_gradient(decoder_output)
    return decoder_output
Beispiel #5
0
    def discrete_bottleneck(self, x):
        """Discretization bottleneck for latent variables.

        Looks `x` up against `self.means`, optionally refreshes the means
        with exponential moving averages, derives an integer code for every
        position and feeds the quantized input through two dense layers.

        Args:
            x: Input to the discretization bottleneck.

        Returns:
            A dict with the keys:
              "dense": output of the "vch2"/"vcfin" dense layers applied to
                `x + stop_gradient(x_means - x)`.
              "discrete": integer codes reshaped to the shape of `x` without
                its last dimension.
              "loss": the accumulated bottleneck loss.
              "embed": `partial(self.embed)`.
        """
        hp = self.hparams
        sliced = self.slice_hidden(x)
        loss = 0
        one_hot, quantized, q_loss, e_loss = self.embedding_lookup(
            sliced, self.means)

        if not hp.ema:
            # Use a gradient based loss for learning the cluster centers
            loss = loss + (q_loss + hp.beta * e_loss)
        else:
            tf.logging.info("Using EMA with beta = {}".format(hp.beta))

            # Moving average of how often each code is selected per block.
            block_counts = tf.reduce_sum(
                tf.reshape(
                    one_hot,
                    shape=[-1, hp.num_blocks, hp.block_v_size]),
                axis=0)
            ema_count = moving_averages.assign_moving_average(
                self.ema_count, block_counts, hp.decay, zero_debias=False)

            # Moving average of the inputs assigned to each code.
            dw = tf.matmul(tf.transpose(one_hot, perm=[1, 2, 0]),
                           tf.transpose(sliced, perm=[1, 0, 2]))
            ema_means = moving_averages.assign_moving_average(
                self.ema_means, dw, hp.decay, zero_debias=False)

            # Smooth the counts with epsilon before normalizing the means.
            n = tf.reduce_sum(ema_count, axis=-1, keep_dims=True)
            ema_count = (
                (ema_count + hp.epsilon) /
                (n + 2**hp.z_size * hp.epsilon) * n)
            ema_means = ema_means / tf.expand_dims(ema_count, axis=-1)

            # The loss term is only produced after the means were assigned.
            with tf.control_dependencies([e_loss]):
                update_means = tf.assign(self.means, ema_means)
                with tf.control_dependencies([update_means]):
                    loss = loss + hp.beta * e_loss

        # Index of the selected code, then its bit representation.
        code_idx = tf.argmax(one_hot, axis=-1)
        bits_per_block = int(hp.z_size // hp.num_blocks)
        code_bits = self.int_to_bit(code_idx, num_bits=bits_per_block, base=2)
        x_discrete = self.bit_to_int(tf.to_int32(code_bits),
                                     num_bits=hp.z_size,
                                     base=2)

        # Bring codes and quantized values back to the layout of `x`.
        full_shape = common_layers.shape_list(x)
        x_discrete = tf.reshape(x_discrete, full_shape[:-1])
        quantized = tf.reshape(quantized, shape=full_shape)
        # Forward pass uses the quantized values; the gradient of the
        # difference is stopped.
        passthrough = x + tf.stop_gradient(quantized - x)

        hidden = tf.layers.dense(tf.nn.relu(passthrough),
                                 hp.filter_size,
                                 name="vch2")
        projected = tf.layers.dense(tf.nn.relu(hidden),
                                    hp.hidden_size,
                                    name="vcfin")
        return {
            "dense": projected,
            "discrete": x_discrete,
            "loss": loss,
            "embed": partial(self.embed),
        }
def resample_feature_map(feat,
                         name,
                         target_height,
                         target_width,
                         target_num_channels,
                         apply_bn=False,
                         is_training=None,
                         conv_after_downsample=False,
                         use_native_resize_op=False,
                         pooling_type=None,
                         use_tpu=False,
                         data_format='channels_last'):
  """Resample a feature map to the target spatial size and channel count.

  Args:
    feat: input feature map; its static shape is unpacked into four entries
      laid out according to `data_format`.
    name: suffix of the `resample_{name}` variable scope.
    target_height: desired output height.
    target_width: desired output width.
    target_num_channels: desired number of output channels.
    apply_bn: whether the 1x1 projection is followed by batch norm.
    is_training: forwarded to batch norm; required when `apply_bn` is set.
    conv_after_downsample: when downsampling, run the 1x1 projection after
      pooling instead of before it.
    use_native_resize_op: force `tf.image.resize_nearest_neighbor` for
      upsampling.
    pooling_type: 'max', 'avg' or None (None behaves like 'max').
    use_tpu: forwarded to `utils.batch_norm_act`.
    data_format: 'channels_first' or anything else for channels-last.

  Returns:
    The resampled feature map.

  Raises:
    ValueError: if a static spatial/channel dimension is unknown, if BN is
      requested without `is_training`, if `pooling_type` is unknown, or if
      one spatial dimension would grow while the other shrinks.
  """
  channels_first = data_format == 'channels_first'
  static_shape = feat.get_shape().as_list()
  if channels_first:
    _, num_channels, height, width = static_shape
  else:
    _, height, width, num_channels = static_shape

  if any(dim is None for dim in (height, width, num_channels)):
    raise ValueError(
        'shape[1] or shape[2] or shape[3] of feat is None (shape:{}).'.format(
            feat.shape))
  if apply_bn and is_training is None:
    raise ValueError('If BN is applied, need to provide is_training')

  def _project_channels(tensor):
    """Apply a 1x1 conv (and optional BN) when the channel count differs."""
    if num_channels == target_num_channels:
      return tensor
    tensor = tf.layers.conv2d(
        tensor,
        filters=target_num_channels,
        kernel_size=(1, 1),
        padding='same',
        data_format=data_format)
    if not apply_bn:
      return tensor
    return utils.batch_norm_act(
        tensor,
        is_training_bn=is_training,
        act_type=None,
        data_format=data_format,
        use_tpu=use_tpu,
        name='bn')

  with tf.variable_scope('resample_{}'.format(name)):
    if height > target_height and width > target_width:
      # Downsampling: the 1x1 projection runs either before or after pooling,
      # depending on conv_after_downsample.
      if not conv_after_downsample:
        feat = _project_channels(feat)
      strides = [
          int((height - 1) // target_height + 1),
          int((width - 1) // target_width + 1),
      ]
      if pooling_type is None or pooling_type == 'max':
        # Max pooling is the default.
        pool_fn = tf.layers.max_pooling2d
      elif pooling_type == 'avg':
        pool_fn = tf.layers.average_pooling2d
      else:
        raise ValueError('Unknown pooling type: {}'.format(pooling_type))
      feat = pool_fn(
          inputs=feat,
          pool_size=[stride + 1 for stride in strides],
          strides=strides,
          padding='SAME',
          data_format=data_format)
      if conv_after_downsample:
        feat = _project_channels(feat)
    elif height <= target_height and width <= target_width:
      feat = _project_channels(feat)
      if height < target_height or width < target_width:
        height_scale = target_height // height
        width_scale = target_width // width
        if (not use_native_resize_op and target_height % height == 0 and
            target_width % width == 0):
          # Integer scale factors: use the project's nearest_upsampling helper.
          feat = nearest_upsampling(
              feat,
              height_scale=height_scale,
              width_scale=width_scale,
              data_format=data_format)
        else:
          # The resize op is fed channels-last data, so transpose around it.
          if channels_first:
            feat = tf.transpose(feat, [0, 2, 3, 1])
          feat = tf.image.resize_nearest_neighbor(feat,
                                                  [target_height, target_width])
          if channels_first:
            feat = tf.transpose(feat, [0, 3, 1, 2])
    else:
      raise ValueError(
          'Incompatible target feature map size: target_height: {},'
          'target_width: {}'.format(target_height, target_width))

  return feat
    def update_placeholder_shape_and_add_transpose(node: Node):
        """
        Rewrite the shapes of 4D placeholders from NHWC to NCHW and insert a
        transpose after each of them; placeholders of any other rank are left
        as they are. The shape that ends up on every placeholder is appended
        to node['real_input_dims'].
        :param node: node to operate on.
        :return: None
        """
        try:
            import tensorflow.compat.v1 as tf_v1
            # disable eager execution of TensorFlow 2 environment immediately
            tf_v1.disable_eager_execution()
        except ImportError:
            import tensorflow as tf_v1
        from openvino.tools.mo.front.common.layout import convert_shape, nhwc_to_nchw_permute, nchw_to_nhwc_permute
        from openvino.tools.mo.front.tf.extractors.utils import tf_tensor_shape
        from openvino.tools.mo.front.tf.partial_infer.tf import add_node_def_to_subgraph, update_input_in_pbs

        tf_v1.reset_default_graph()

        # (old input name, new input name) pairs, applied once all nodes exist
        renames = []

        # permutation constants used by the inserted transposes
        to_nhwc_perm = tf_v1.constant(nchw_to_nhwc_permute,
                                      dtype=tf_v1.int32,
                                      name=nchw_to_nhwc_constant_name)
        to_nchw_perm = tf_v1.constant(nhwc_to_nchw_permute,
                                      dtype=tf_v1.int32,
                                      name=nhwc_to_nchw_constant_name)

        for input_name in node['input_nodes_names']:
            # Stand-in input for the transpose; its name is unique per
            # placeholder and is swapped for the placeholder name afterwards.
            dummy = tf_v1.constant(value=[[[[1]]]],
                                   dtype=tf_v1.float32,
                                   name='random_dummy_name_' + input_name)

            placeholder = node['pbs'][input_name]
            cur_shape = tf_tensor_shape(placeholder.attr['shape'].shape)

            # TODO think about better check that transpose is required
            if len(cur_shape) != 4:
                node['real_input_dims'].append(cur_shape)
                continue

            nchw_shape = convert_shape(cur_shape, nhwc_to_nchw_permute)
            for axis in range(len(cur_shape)):
                placeholder.attr['shape'].shape.dim[axis].size = nchw_shape[axis]

            transpose_name = placeholder.name + '_transpose'
            # NCHW -> NHWC
            transpose = tf_v1.transpose(dummy, to_nhwc_perm, transpose_name)

            # add transpose operations to GraphDef after placeholders
            add_node_def_to_subgraph(node, transpose.op.node_def,
                                     transpose_name,
                                     len(node['input_nodes_names']))
            renames.extend([
                (placeholder.name, transpose_name),
                (dummy.name, placeholder.name),
            ])
            node['real_input_dims'].append(nchw_shape)

        add_node_def_to_subgraph(node, to_nhwc_perm.op.node_def)
        add_node_def_to_subgraph(node, to_nchw_perm.op.node_def)

        # update initial input names to a transposed ones
        for old_name, new_name in renames:
            update_input_in_pbs(node, old_name, new_name)
Beispiel #8
0
def random_mask2(shape, k):
  """Build a random 0/1 float mask of the given shape.

  Gaussian noise of `shape` is transposed, and in every row of the transpose
  the entries that are >= that row's k-th largest value are set to 1.0 and
  all others to 0.0. The mask is transposed back before it is returned.
  The `[:, k - 1]` indexing means `shape` has to be 2-D.

  Args:
    shape: shape of the mask to generate.
    k: rank of the threshold value taken from `tf.nn.top_k`.

  Returns:
    A float tensor of shape `shape` holding zeros and ones.
  """
  noise_t = tf.transpose(tf.random_normal(shape=shape))
  top_values, _ = tf.nn.top_k(noise_t, k)
  threshold = tf.expand_dims(top_values[:, k - 1], 1)
  selected = tf.greater_equal(noise_t, threshold)
  return tf.transpose(tf.to_float(selected))
def create_de_model(bert_config, is_training, input_ids_1, input_mask_1,
                    segment_ids_1, input_ids_2, input_masks_2, segment_ids_2,
                    num_candidates, labels, use_one_hot_embeddings):
    """Creates a ranking model using cosine and dual encoder representations.

    Side 1 (the query) and side 2 (the candidate passages) are encoded
    separately with `encode_block`. Two scores are then built from the
    resulting vectors, both as "max over passage vectors, summed over query
    vectors" of masked dot products:
      * `logits`: each query against its own `num_candidates` passages.
      * the loss input: each query against every passage in the batch (on
        TPU during training, the passages gathered from all shards).

    Args:
        bert_config: forwarded to `encode_block`.
        is_training: enables dropout and, with `FLAGS.use_tpu`, the
            cross-shard gathering.
        input_ids_1, input_mask_1, segment_ids_1: query inputs, reshaped to
            [-1, FLAGS.max_seq_length_query].
        input_ids_2, input_masks_2, segment_ids_2: passage inputs, reshaped
            to [-1, FLAGS.max_seq_length - FLAGS.max_seq_length_query].
        num_candidates: number of candidate passages per query.
        labels: relevance labels, cast to float32.
        use_one_hot_embeddings: forwarded to `encode_block`.

    Returns:
        A tuple of the value returned by `tf.losses.softmax_cross_entropy`
        and the per-candidate `logits`.
    """
    query_len = FLAGS.max_seq_length_query
    passage_len = FLAGS.max_seq_length - FLAGS.max_seq_length_query

    query_ids = tf.reshape(input_ids_1, [-1, query_len])
    query_segments = tf.reshape(segment_ids_1, [-1, query_len])
    query_input_mask = tf.reshape(input_mask_1, [-1, query_len])
    batch_size = tf.shape(query_input_mask)[0]

    passage_ids = tf.reshape(input_ids_2, [-1, passage_len])
    passage_segments = tf.reshape(segment_ids_2, [-1, passage_len])
    passage_input_mask = tf.reshape(input_masks_2, [-1, passage_len])

    # [batch_size, num_candidates]
    labels = tf.dtypes.cast(labels, tf.float32)

    # [batch_size, num_vec_query, hidden_size], [batch_size, num_vec_query]
    query_vecs, query_mask = encode_block(bert_config, query_ids,
                                          query_input_mask, query_segments,
                                          use_one_hot_embeddings,
                                          FLAGS.num_vec_query, is_training)

    passage_vecs, passage_mask = encode_block(bert_config, passage_ids,
                                              passage_input_mask,
                                              passage_segments,
                                              use_one_hot_embeddings,
                                              FLAGS.num_vec_passage,
                                              is_training)

    # Block-diagonal selector: row i keeps only the candidate slots that
    # belong to query i once the labels are tiled across the batch.
    own_candidates = tf.expand_dims(tf.eye(batch_size), axis=2)
    own_candidates = tf.tile(own_candidates, [1, 1, num_candidates])
    own_candidates = tf.reshape(own_candidates, [batch_size, -1])
    own_candidates = tf.cast(own_candidates, tf.float32)
    labels = tf.multiply(tf.tile(labels, [1, batch_size]), own_candidates)

    # Per-query scores against the query's own candidates.
    grouped_passage_vecs = tf.reshape(
        passage_vecs,
        [batch_size, num_candidates, FLAGS.num_vec_passage, -1])
    grouped_passage_mask = tf.reshape(
        passage_mask, [batch_size, num_candidates, FLAGS.num_vec_passage])
    pair_mask = tf.einsum("BQ,BCP->BCQP", tf.cast(query_mask, tf.float32),
                          tf.cast(grouped_passage_mask, tf.float32))

    logits = tf.einsum("BQH,BCPH->BCQP", query_vecs, grouped_passage_vecs)
    logits = tf.multiply(logits, pair_mask)
    logits = tf.reduce_sum(tf.reduce_max(logits, axis=-1), axis=-1)

    if FLAGS.use_tpu and is_training:
        # Gather passages from all shards and lay the padded labels out to
        # match the gathered passage order.
        num_shards = tpu_utils.num_tpu_shards()
        passage_vecs = tpu_utils.cross_shard_concat(passage_vecs)
        passage_mask = tpu_utils.cross_shard_concat(
            tf.cast(passage_mask, tf.float32))
        passage_mask = tf.cast(passage_mask, tf.bool)
        labels = tpu_utils.cross_shard_pad(labels)
        tf.logging.info("Global batch size: %s", tensor_utils.shape(labels, 0))
        tf.logging.info("Num shards: %s", num_shards)
        tf.logging.info("Number of candidates in batch: %s",
                        tensor_utils.shape(passage_vecs, 0))
        labels = tf.reshape(labels, [num_shards, batch_size, -1])
        labels = tf.transpose(labels, perm=[1, 0, 2])
        labels = tf.reshape(labels, [batch_size, -1])

    with tf.variable_scope("loss"):
        if is_training:
            # NOTE(review): FLAGS.dropout is passed as keep_prob - confirm
            # the flag holds a keep probability rather than a drop rate.
            query_vecs = tf.nn.dropout(query_vecs, keep_prob=FLAGS.dropout)
            passage_vecs = tf.nn.dropout(passage_vecs,
                                         keep_prob=FLAGS.dropout)

        # Every query (A) against every passage (B).
        similarity = tf.einsum("AQH,BPH->ABQP", query_vecs, passage_vecs)
        query_mask_4d = tf.expand_dims(tf.expand_dims(query_mask, 2), 1)
        passage_mask_4d = tf.expand_dims(tf.expand_dims(passage_mask, 1), 0)
        valid_pairs = tf.cast(
            tf.logical_and(query_mask_4d, passage_mask_4d), tf.float32)
        similarity = tf.multiply(similarity, valid_pairs)
        similarity = tf.reduce_max(similarity, axis=-1)
        similarity = tf.reduce_sum(similarity, axis=-1)
        per_example_loss = tf.losses.softmax_cross_entropy(labels, similarity)

        return (per_example_loss, logits)
    def loss_som(self):
        """Computes the SOM loss.

        For every node of the `som_dim[0] x som_dim[1]` grid, the
        probabilities in `self.q_ng` of its four wrap-around neighbours
        (up, down, right, left) are gathered. Their log values are summed
        per node, weighted by `self.q` (entries not above 0.1 are zeroed,
        and no gradient flows through the weights), summed over nodes and
        averaged; the negated mean is returned.

        Nodes are addressed by a flat row-major index
        `k = row * som_dim[1] + col`. Using `som_dim[1]` as the stride (and
        as the column wrap-around bound) keeps the arithmetic valid for
        non-square maps; for square maps the result is unchanged.

        `self.q_ng` must be rank 2 (the neighbour tensors are concatenated on
        axis 2 after a trailing axis is added); presumably both `self.q_ng`
        and `self.q` are [batch, num_nodes] - confirm against the caller.

        Returns:
            Scalar tensor with the SOM loss.
        """
        n_rows, n_cols = self.som_dim[0], self.som_dim[1]
        n_nodes = n_rows * n_cols

        # Split the flat index into (row, col) using the row-major layout.
        k = tf.range(n_nodes)
        k_1 = k // n_cols
        k_2 = k % n_cols

        k1_not_top = tf.less(k_1, tf.constant(n_rows - 1, dtype=tf.int32))
        k1_not_bottom = tf.greater(k_1, tf.constant(0, dtype=tf.int32))
        k2_not_right = tf.less(k_2, tf.constant(n_cols - 1, dtype=tf.int32))
        k2_not_left = tf.greater(k_2, tf.constant(0, dtype=tf.int32))

        # Neighbour coordinates; stepping off an edge wraps to the other side.
        k1_up = tf.where(k1_not_top, tf.add(k_1, 1),
                         tf.zeros(tf.shape(k_1), dtype=tf.dtypes.int32))
        k1_down = tf.where(
            k1_not_bottom, tf.subtract(k_1, 1),
            tf.ones(tf.shape(k_1), dtype=tf.dtypes.int32) * (n_rows - 1))
        k2_right = tf.where(k2_not_right, tf.add(k_2, 1),
                            tf.zeros(tf.shape(k_2), dtype=tf.dtypes.int32))
        k2_left = tf.where(
            k2_not_left, tf.subtract(k_2, 1),
            tf.ones(tf.shape(k_2), dtype=tf.dtypes.int32) * (n_cols - 1))

        # Back to flat indices with the same row-major stride.
        k_up = k1_up * n_cols + k_2
        k_down = k1_down * n_cols + k_2
        k_right = k_1 * n_cols + k2_right
        k_left = k_1 * n_cols + k2_left

        q_t = tf.transpose(self.q_ng)

        def _neighbour_probs(flat_idx):
            """Reorder the node axis of `self.q_ng` by `flat_idx`; add a last axis."""
            picked = tf.gather_nd(q_t, tf.reshape(flat_idx, [n_nodes, 1]))
            return tf.expand_dims(tf.transpose(picked), -1)

        q_neighbours = tf.concat(
            [_neighbour_probs(idx)
             for idx in (k_up, k_down, k_right, k_left)],
            axis=2)
        q_neighbours = tf.reduce_sum(tf.math.log(q_neighbours), axis=-1)

        # Only assignments above 0.1 contribute; they act as constant weights.
        mask = tf.greater(self.q, 0.1 * tf.ones_like(self.q))
        new_q = tf.multiply(self.q, tf.cast(mask, tf.float32))
        q_n = tf.math.multiply(q_neighbours, tf.stop_gradient(new_q))
        q_n = tf.reduce_sum(q_n, axis=-1)
        qq = tf.math.negative(tf.reduce_mean(q_n))

        return qq
Beispiel #11
0
def joint_extraction_model_fn(features, labels, mode, params):
    """Runs the node-level sequence labeling model."""
    logging.info("joint_extraction_model_fn")
    inputs = features  # Arg "features" is the overall inputs.

    # Read vocabs and inputs.
    dropout = params["dropout"]
    if params["circle_features"]:
        nnodes, friend_has_label, (words, nwords), (
            prev_text_words,
            n_prev_text_words), (chars_list, chars_len_list), (
                partner_words, _), (friends_words, n_friends_words), (
                    friends_fix,
                    friends_var), (leaf_type_list, goldmine_feat_list), (
                        _, _), (node_xpath_list, node_xpath_len_list), (
                            attributes,
                            attributes_plus_none), (position_list) = inputs
    else:
        nnodes, (words, nwords), (prev_text_words, n_prev_text_words), (
            chars_list,
            chars_len_list), (leaf_type_list, goldmine_feat_list), (_, _), (
                node_xpath_list,
                node_xpath_len_list), (attributes), (position_list) = inputs

    # nnodes, the number of nodes in each page;
    #    shape is [?]; length is the number of pages.
    # words, nwords are the node_text feature, shape is [?, ?, ?]
    #    the first two dimension is the batch * pages,
    #    the last one is the maximum length of the word lists
    # prev_text_words, n_prev_text_words, similar as above for previous nodes'text
    # chars_list, chars_len_list, shape is [?,?,?,?] also for node_text features
    #    the additional dim is for the length of the character sequences.
    # friends_words, shape is [?, ?, ?], gathers all the words from different
    #    friends of one node.
    # friends_fix, friends_var, shapes are [?, ?, ?, ?]
    #    the first two dimension is the batch * pages,
    #    the last two are the maximum length of friend nodes and words.

    nnodes = merge_first_two_dims(nnodes)
    training = (mode == tf_estimator.ModeKeys.TRAIN)
    vocab_words = _index_table_from_file(
        params["words"], num_oov_buckets=params["num_oov_buckets"])
    with tf.gfile.Open(params["tags"]) as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != "none"]
        num_tags = len(
            indices) + 1  # Make "None" as the tag with the last index.

    # NodeText Char Embeddings.
    with tf.gfile.Open(params["chars"]) as f:
        num_chars = sum(1 for _ in f) + params["num_oov_buckets"]
    vocab_chars = _index_table_from_file(
        params["chars"], num_oov_buckets=params["num_oov_buckets"])
    char_ids = vocab_chars.lookup(chars_list)
    variable = tf.get_variable("chars_embeddings",
                               [num_chars + 1, params["dim_chars"]],
                               tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(char_embeddings,
                                        rate=dropout,
                                        training=training)
    logging.info("char_embeddings.shape: %s", char_embeddings.shape)
    # Char 1d convolution.
    weights = tf.sequence_mask(chars_len_list)
    char_embeddings = masked_conv1d_and_max(char_embeddings, weights,
                                            params["filters"],
                                            params["kernel_size"])
    logging.info("char_embeddings.shape after CNN: %s", char_embeddings.shape)

    # Word Embeddings.
    word_ids = vocab_words.lookup(words)
    glove = np.load(tf.gfile.Open(params["glove"],
                                  "rb"))["embeddings"]  # np.array
    variable = np.vstack([glove, [[0.] * params["dim_word_embedding"]]])
    # To finetune the GloVe embedding by setting trainable as True.
    variable = tf.Variable(variable, dtype=tf.float32, trainable=True)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    logging.info("word_embeddings.shape: %s", word_embeddings.shape)

    # Prev_Text Representations.
    prev_text_word_ids = vocab_words.lookup(prev_text_words)
    prev_text_word_embeddings = tf.nn.embedding_lookup(variable,
                                                       prev_text_word_ids)
    if params["use_prev_text_lstm"]:
        # PREV_text LSTM.
        logging.info("prev_text_representation using lstm")

        prev_t = merge_first_two_dims(prev_text_word_embeddings)
        # Seq * batch * input
        prev_t = tf.transpose(prev_t, perm=[1, 0, 2])  # Need time-major.
        prev_output_fw, prev_output_bw = _bidirectional_lstm(
            prev_t, params["lstm_size"],
            merge_first_two_dims(n_prev_text_words))
        prev_output = tf.concat([prev_output_fw, prev_output_bw], axis=-1)
        prev_output = tf.reduce_mean(prev_output, 0)
        prev_output = tf.layers.dropout(prev_output,
                                        rate=dropout,
                                        training=training)
        logging.info("prev_output.shape (after reduce_mean): %s",
                     prev_output.shape)
        context_representation = split_first_two_dims_by_example(
            prev_output, prev_text_word_embeddings)
        logging.info("context_representation.shape (after split): %s",
                     context_representation.shape)

    else:
        logging.info("prev_text_word_embeddings.shape: %s",
                     prev_text_word_embeddings.shape)
        context_representation = tf.reduce_mean(prev_text_word_embeddings, 2)
        logging.info("context_representation.shape: %s",
                     context_representation.shape)

    if params["circle_features"]:
        partner_embeddings, circle_representation = circle_feature_modeling(
            variable, vocab_words, partner_words, friends_words,
            n_friends_words, friends_fix, friends_var, word_embeddings,
            dropout, training, params)
        context_representation = circle_representation

        if params["use_friend_semantic"]:
            friends_ids = vocab_words.lookup(friends_words)
            friend_embeddings = tf.nn.embedding_lookup(variable, friends_ids)

    if params["use_xpath_lstm"]:
        h_output = xpath_feature_modeling(node_xpath_list, node_xpath_len_list,
                                          training, params)
        context_representation = tf.concat([h_output, context_representation],
                                           axis=2)

    if params["use_position_embedding"]:
        position_representation = position_modeling(position_list, params)
        context_representation = tf.concat(
            [context_representation, position_representation], axis=2)

    # Text Embeddings: Concatenate Word and Char and Feature Embeddings.
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    logging.info("embeddings.shape: %s", embeddings.shape)

    # LSTM inside node texts.
    t = merge_first_two_dims(embeddings)
    t = tf.transpose(t, perm=[1, 0, 2])  # Need time-major.
    output_fw, output_bw = _bidirectional_lstm(t, params["lstm_size"],
                                               merge_first_two_dims(nwords))
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.reduce_mean(output, 0)
    output = tf.layers.dropout(output, rate=dropout, training=training)
    logging.info("output.shape (after reduce_mean): %s", output.shape)
    output = split_first_two_dims_by_example(output, embeddings)
    logging.info("output.shape (after split): %s", output.shape)

    node_seq_input = tf.concat([output, context_representation], axis=2)
    logging.info("output.shape (after + prev): %s", node_seq_input.shape)

    # Leaf Type Features.
    if params["add_leaf_types"]:
        with tf.gfile.Open(params["leaf_types"]) as f:
            num_leaf_types = sum(1 for _ in f) + params["num_oov_buckets"]
        vocab_leaf_types = _index_table_from_file(
            params["leaf_types"], num_oov_buckets=params["num_oov_buckets"])
        leaf_type_ids = vocab_leaf_types.lookup(leaf_type_list)
        leaf_variable = tf.get_variable(
            "leaf_type_embeddings",
            [num_leaf_types + 1, params["dim_leaf_type"]], tf.float32)
        leaf_type_embeddings = tf.nn.embedding_lookup(leaf_variable,
                                                      leaf_type_ids)
        leaf_type_embeddings = tf.layers.dropout(leaf_type_embeddings,
                                                 rate=dropout,
                                                 training=training)
        logging.info("leaf_type_embeddings.shape: %s", char_embeddings.shape)
        logging.info("node_seq_input.shape before leaf: %s",
                     node_seq_input.shape)
        node_seq_input = tf.concat([node_seq_input, leaf_type_embeddings],
                                   axis=2)
        logging.info("node_seq_input.shape after leaf: %s",
                     node_seq_input.shape)

    # Goldmine Feat Embeddings.
    if params["add_goldmine"]:
        vocab_goldmine_features = _index_table_from_file(
            params["goldmine_features"], num_oov_buckets=1)
        goldmine_feature_variable = tf.get_variable(
            "goldmine_feature_embeddings", [8 + 1, params["dim_goldmine"]],
            tf.float32)
        goldmine_feat_ids = vocab_goldmine_features.lookup(goldmine_feat_list)
        goldmine_feat_embeddings = tf.nn.embedding_lookup(
            goldmine_feature_variable, goldmine_feat_ids)
        goldmine_feat_embeddings = tf.reduce_sum(goldmine_feat_embeddings, 2)
        logging.info("goldmine_feat_embeddings.shape: %s",
                     goldmine_feat_embeddings.shape)
        node_seq_input = tf.concat([node_seq_input, goldmine_feat_embeddings],
                                   axis=2)
        logging.info("node_seq_input.shape after goldmine: %s",
                     node_seq_input.shape)

    # Node-level LSTM modeling.
    if params["node_encoder"] == "lstm":
        # Node-Sequence-LSTM.
        n_t = tf.transpose(node_seq_input, perm=[1, 0, 2])  # Need time-major.
        node_output_fw, node_output_bw = _bidirectional_lstm(
            n_t, params["node_lstm_size"], nnodes)
        node_seq_output = tf.concat([node_output_fw, node_output_bw], axis=-1)
        node_seq_output = tf.transpose(node_seq_output, perm=[1, 0, 2])
    elif params["node_encoder"] == "cnn":
        node_weights = tf.sequence_mask(nnodes)
        node_seq_output = masked_conv1d_and_max(node_seq_input,
                                                node_weights,
                                                params["node_filters"],
                                                params["node_kernel_size"],
                                                reducemax=False)
    elif params["node_encoder"] == "transformer":
        # Node-Sequence-Transformer.
        node_seq_output = transformer_encoding(node_seq_input, nnodes, params,
                                               mode)
    else:
        node_seq_output = node_seq_input

    logging.info("node_seq_input.shape after encoder: %s",
                 node_seq_output.shape)

    if params["node_encoder"] != "transformer":
        # Add the dropout layer if the encoder is not a transformer.
        node_seq_output = tf.layers.dropout(node_seq_output,
                                            rate=dropout,
                                            training=training)

    if params["use_friends_discrete_feature"] and params["circle_features"]:
        friend_has_label = tf.expand_dims(friend_has_label, axis=-1)
        node_seq_output = tf.concat([node_seq_output, friend_has_label],
                                    axis=-1)
        logging.info("node_seq_input.shape after friend_has_label: %s",
                     node_seq_output.shape)
        node_seq_output = tf.layers.dense(node_seq_output,
                                          params["last_hidden_layer_size"])

    logits = tf.layers.dense(node_seq_output, num_tags, name="label_dense_1")

    if params["semantic_encoder"] and params["circle_features"]:

        partner_similarity_emb = semantic_similarity(variable, vocab_words,
                                                     partner_embeddings,
                                                     attributes, params)
        node_seq_output = tf.concat(
            [node_seq_output,
             tf.nn.softmax(partner_similarity_emb)], axis=-1)
        logging.info("node_seq_output.shape after semantic encoder: %s",
                     node_seq_output.shape)

        if params["use_friend_semantic"]:
            friends_similarity_emb = semantic_similarity(
                variable, vocab_words, friend_embeddings, attributes, params)

            node_seq_output = tf.concat(
                [node_seq_output, friends_similarity_emb], axis=-1)

        if params["objective"] == "classification":
            node_seq_output = tf.layers.dense(node_seq_output,
                                              params["dim_word_embedding"],
                                              activation="relu")
            node_seq_output = tf.layers.dense(node_seq_output,
                                              params["last_hidden_layer_size"])
            logging.info("node_seq_output.shape after semantic encoder: %s",
                         node_seq_output.shape)
            logits = tf.layers.dense(node_seq_output,
                                     num_tags,
                                     name="label_dense_2")

        elif params["objective"] == "semantic_scorer":
            logits = semantic_scorer(attributes_plus_none, node_seq_output,
                                     params)

        elif params["objective"] == "binary_scorer":
            logits = binary_scorer(attributes_plus_none, node_seq_output,
                                   training, params)

    if params["use_crf"]:
        # CRF Layer.
        logging.info("logits.shape: %s", logits.shape)
        crf_params = tf.get_variable("crf", [num_tags, num_tags],
                                     dtype=tf.float32)
        pred_ids, _ = tfa.text.crf.crf_decode(logits, crf_params, nnodes)
        logging.info("pred_ids.shape: %s", pred_ids.shape)
    else:
        pred_ids = tf.argmax(logits, 2)
        logging.info("pred_ids.shape: %s", pred_ids.shape)
    # Predict for new sentences in target set.
    if mode == tf_estimator.ModeKeys.PREDICT:
        reverse_vocab_tags = _index_table_from_file(params["tags"], 1)
        pred_strings = reverse_vocab_tags.lookup(
            tf.strings.as_string(pred_ids))
        predictions = {
            "pred_ids": pred_ids,
            "tags": pred_strings,
            "scores": tf.nn.softmax(logits),
            "raw_scores": logits,
        }
        # Store the intermediate weights.
        if params["semantic_encoder"]:
            predictions["similarity"] = partner_similarity_emb
        if params["friend_encoder"]:
            predictions["friends_embs"] = circle_representation
        if params["extract_node_emb"]:
            predictions["node_embs"] = node_seq_output
        return tf_estimator.EstimatorSpec(mode, predictions=predictions)

    vocab_tags = _index_table_from_file(params["tags"], 1)
    tags = vocab_tags.lookup(labels)
    logging.info("tags.shape: %s", logits.shape)

    logging.info(
        "Parameter size: %s",
        np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]))

    if params["use_crf"]:
        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
            logits, tags, nnodes, crf_params)
        loss = tf.reduce_mean(-log_likelihood)
    else:
        loss = tf.losses.sparse_softmax_cross_entropy(labels=tags,
                                                      logits=logits)
    #  Processing the metrics.
    weights = tf.sequence_mask(nnodes)
    metrics = {
        "acc":
        tf.metrics.accuracy(tags, pred_ids, weights),
        "precision":
        seq_tagging_metric_util.precision(tags, pred_ids, num_tags, indices,
                                          weights),
        "recall":
        seq_tagging_metric_util.recall(tags, pred_ids, num_tags, indices,
                                       weights),
        "f1":
        seq_tagging_metric_util.f1(tags, pred_ids, num_tags, indices, weights),
    }
    for metric_name, op in metrics.items():
        tf.summary.scalar(metric_name, op[1])

    if mode == tf_estimator.ModeKeys.TRAIN:
        with tf.name_scope("train_scope"):
            optimizer = tf.train.AdamOptimizer()
            train_op = optimizer.minimize(
                loss, global_step=tf.train.get_or_create_global_step())
        return tf_estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
    return tf_estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=metrics)
Beispiel #12
0
    def _up_convolve_transpose_explicit(self, inputs, kernel, prepadding):
        """Upsampled convolution via a transpose convolution with explicit padding.

        The padding of the equivalent forward convolution is handed straight to
        the transpose-convolution op, so only the output values that are
        actually needed get computed (no cropping afterwards).

        Args:
          inputs: Input tensor. If its dtype is an integer type it is cast to
            float32 for the op, and the result is rounded and cast to
            `self.accum_dtype`.
          kernel: Convolution kernel. Its last two dimensions are swapped
            before it is passed to the op.
          prepadding: Indexed as `prepadding[i][0]` / `prepadding[i][1]` for
            spatial dimension `i`; only read when `self.padding` is not
            "valid".

        Returns:
          The resulting tensor, additionally subsampled by `self.strides_down`
          if any of those strides is greater than one.

        Only rank 1 and rank 2 without channel separability are handled here;
        everything else is delegated to `self._raise_notimplemented()`.
        """
        rank = self._rank
        cast_needed = inputs.dtype.is_integer

        def _transpose_conv2d(signal, filt, out_shape, op_strides, op_padding,
                              op_format):
            # Integer signals are processed in float32 and rounded back to the
            # accumulation dtype afterwards.
            if cast_needed:
                signal = tf.cast(signal, tf.float32)
            result = tf.nn.conv2d_backprop_input(out_shape,
                                                 filt,
                                                 signal,
                                                 strides=op_strides,
                                                 padding=op_padding,
                                                 data_format=op_format)
            if cast_needed:
                result = tf.cast(tf.math.round(result), self.accum_dtype)
            return result

        # conv2d_backprop_input wants the kernel's output and input channel
        # dimensions in reversed order, so swap the last two kernel axes.
        swapped_order = list(range(rank))
        swapped_order += [rank + 1, rank]
        kernel = tf.transpose(kernel, swapped_order)

        # Work out the explicit padding of the equivalent conv2d call and the
        # resulting output shape, including any pre-padding.
        input_shape = tf.shape(inputs)
        padding = [(0, 0) for _ in range(rank + 2)]
        output_shape = [input_shape[0]] + [None] * (rank + 1)
        if self.data_format == "channels_last":
            spatial_axes = range(1, rank + 1)
            output_shape[-1] = self.filters
        else:
            spatial_axes = range(2, rank + 2)
            output_shape[1] = self.filters

        for i, axis in enumerate(spatial_axes):
            support = self.kernel_support[i]
            stride = self.strides_up[i]
            if self.padding == "valid":
                padding[axis] = (support - 1, support - 1)
            else:  # same
                padding[axis] = (
                    prepadding[i][0] * stride + support // 2,
                    prepadding[i][1] * stride + (support - 1) // 2,
                )
            pad_total = sum(padding[axis])
            if self.extra_pad_end:
                surplus = (support - 1) - pad_total
            else:
                surplus = (support - 1) - (stride - 1) - pad_total
            output_shape[axis] = input_shape[axis] * stride + surplus

        data_format = self._op_data_format
        strides = self._padded_tuple(self.strides_up, 1)

        # Run the transpose convolution.
        if rank == 1 and not self.channel_separable:
            # conv2d_backprop_input has no 1D counterpart: add a dummy spatial
            # dimension, run the 2D op, and drop the dimension again.
            extradim = dict(channels_first=2,
                            channels_last=1)[self.data_format]
            data_format = data_format.replace("W", "HW")
            strides = (strides[:extradim] + (strides[extradim], ) +
                       strides[extradim:])
            padding.insert(extradim, (0, 0))
            output_shape.insert(extradim, 1)
            kernel = tf.expand_dims(kernel, 0)
            inputs = tf.expand_dims(inputs, extradim)
            outputs = _transpose_conv2d(inputs, kernel, output_shape, strides,
                                        padding, data_format)
            outputs = tf.squeeze(outputs, [extradim])
        elif rank == 2 and not self.channel_separable:
            outputs = _transpose_conv2d(inputs, kernel, output_shape, strides,
                                        padding, data_format)
        else:
            self._raise_notimplemented()

        # Optional downsampling by strided slicing.
        if any(step > 1 for step in self.strides_down):
            strided = self._padded_tuple(
                tuple(slice(None, None, step) for step in self.strides_down),
                slice(None))
            outputs = outputs[strided]

        return outputs
Beispiel #13
0
    def _up_convolve_transpose_valid(self, inputs, kernel, prepadding):
        """Upsampled convolution via a transpose convolution in VALID mode.

        The transpose convolution is computed over the full VALID extent into
        a temporary tensor, and the boundary values that are not wanted are
        cropped away afterwards. This is comparatively wasteful because many
        of the computed boundary values are discarded.

        Args:
          inputs: Input tensor.
          kernel: Convolution kernel. Unless the layer is channel separable,
            its last two dimensions are swapped before use.
          prepadding: Indexed as `prepadding[i][0]` / `prepadding[i][1]` for
            spatial dimension `i`; only read when `self.padding` is not
            "valid".

        Returns:
          The cropped (and, per `self.strides_down`, subsampled) result.

        Configurations without a matching op are delegated to
        `self._raise_notimplemented()`.
        """
        rank = self._rank
        separable = self.channel_separable

        def _backprop_2d(shape, filt, signal, op_strides, op_format):
            # Depthwise op for channel-separable layers, regular op otherwise.
            op = (tf.nn.depthwise_conv2d_native_backprop_input
                  if separable else tf.nn.conv2d_backprop_input)
            return op(shape,
                      filt,
                      signal,
                      strides=op_strides,
                      padding="VALID",
                      data_format=op_format)

        # Transpose convolutions want the kernel's output and input channel
        # dimensions reversed. For channel-separable convolutions only one
        # filter per channel is supported, which already is the layout the op
        # expects, so the swap is skipped in that case.
        if not separable:
            kernel = tf.transpose(kernel,
                                  list(range(rank)) + [rank + 1, rank])

        # Shape of the temporary (uncropped) result.
        input_shape = tf.shape(inputs)
        channels_last = self.data_format == "channels_last"
        channel_axis = -1 if channels_last else 1
        first_spatial = 1 if channels_last else 2
        spatial_axes = range(first_spatial, first_spatial + rank)
        temp_shape = [input_shape[0]] + [None] * (rank + 1)
        temp_shape[channel_axis] = (input_shape[channel_axis]
                                    if separable else self.filters)
        for i, axis in enumerate(spatial_axes):
            stride = self.strides_up[i]
            support = self.kernel_support[i]
            if self.extra_pad_end:
                tail = support - 1
            else:
                tail = (support - 1) - (stride - 1)
            temp_shape[axis] = input_shape[axis] * stride + tail

        data_format = self._op_data_format
        strides = self._padded_tuple(self.strides_up, 1)

        # Run the transpose convolution.
        if rank == 1 and (not separable or self.filters == 1):
            # Neither 2D backprop op has a 1D counterpart: add a dummy spatial
            # dimension, run the 2D op, and drop the dimension again.
            extradim = dict(channels_first=2,
                            channels_last=1)[self.data_format]
            data_format = data_format.replace("W", "HW")
            strides = (strides[:extradim] + (strides[extradim], ) +
                       strides[extradim:])
            temp_shape.insert(extradim, 1)
            kernel = tf.expand_dims(kernel, 0)
            inputs = tf.expand_dims(inputs, extradim)
            outputs = _backprop_2d(temp_shape, kernel, inputs, strides,
                                   data_format)
            outputs = tf.squeeze(outputs, [extradim])
        elif rank == 2 and (not separable or
                            (self.filters == 1
                             and self.strides_up[0] == self.strides_up[1])):
            outputs = _backprop_2d(temp_shape, kernel, inputs, strides,
                                   data_format)
        elif rank == 3 and not separable:
            outputs = tf.nn.conv3d_transpose(inputs,
                                             kernel,
                                             temp_shape,
                                             strides=strides,
                                             padding="VALID",
                                             data_format=data_format)
        else:
            self._raise_notimplemented()

        # Crop the temporary, accounting for any pre-padding that was applied.
        slices = [slice(None)] * (rank + 2)
        for i, axis in enumerate(spatial_axes):
            support = self.kernel_support[i]
            if self.padding == "valid":
                # Drop `kernel_support - 1` samples on both sides, keeping
                # only samples that were computed without any padding.
                start = stop = support - 1
            else:  # same
                # Drop half the kernel size plus the pre-padding on each side.
                up = self.strides_up[i]
                start = prepadding[i][0] * up + support // 2
                stop = prepadding[i][1] * up + (support - 1) // 2
            step = self.strides_down[i]
            slices[axis] = slice(start if start > 0 else None,
                                 -stop if stop > 0 else None,
                                 step if step > 1 else None)
        # Skip the slicing op entirely when every axis is a full slice.
        if any(s.start is not None or s.stop is not None
               or s.step is not None for s in slices):
            outputs = outputs[tuple(slices)]

        return outputs
                     shape=[1, 1],
                     dtype=tf.float32,
                     initializer=tf.zeros_initializer())

## Forward prop
# Three affine layers; the first two are followed by ReLU, the last one is
# left linear so that Z3 can be used as logits by the cost below.
# NOTE(review): W1..W3, b1..b3, x and y are defined above this excerpt. The
# matmul(W, x) ordering suggests one sample per column -- confirm.
Z1 = tf.add(tf.matmul(W1, x), b1)
A1 = tf.nn.relu(Z1)

Z2 = tf.add(tf.matmul(W2, A1), b2)
A2 = tf.nn.relu(Z2)

Z3 = tf.add(tf.matmul(W3, A2), b3)

## Compute Cost
# Mean sigmoid cross-entropy; logits and labels are both transposed before
# being compared.
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.transpose(Z3),
                                            labels=tf.transpose(y)))

## Backward Prop
# Plain gradient descent with a fixed learning rate of 0.01.
back_prop = tf.train.GradientDescentOptimizer(learning_rate=.01).minimize(cost)

# Initialise all variables, then run 2000 training steps on the full
# training set (X_train / y_train are fed in one piece every step).
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    # NOTE(review): cost_main is never appended to within the visible lines;
    # presumably cost_iter is meant to be collected here -- confirm.
    cost_main = []
    for i in range(2000):
        _, cost_iter = sess.run([back_prop, cost],
                                feed_dict={
                                    x: X_train,
                                    y: y_train
                                })
Beispiel #15
0
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            min_score_thresh=MIN_SCORE_THRESH,
                            max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=True):
  """Turns top-k model outputs and anchors into a detection table.

  The class logits are passed through a sigmoid, the box regression outputs
  are decoded against the selected anchors, non-max suppression is applied
  (native TF op or the custom `nms_tf`), and the surviving boxes are rescaled
  by `image_scale`.

  Args:
    cls_outputs: class logits of the N selected top-k anchors. The custom NMS
      path adds axis 1 before concatenating with the [N, 4] boxes, so this is
      expected to be rank 1 with shape [N].
    box_outputs: box regression outputs of the selected anchors, shape [N, 4].
    anchor_boxes: anchors over all feature levels; rows are picked with
      `indices`.
    indices: indices of the top-k selection, shape [N].
    classes: class prediction for every selected anchor, shape [N].
    image_id: the image id, repeated once per kept detection.
    image_scale: scale between the original image and the detector input;
      multiplied onto the box coordinates and sizes.
    min_score_thresh: score threshold below which boxes are removed. Only
      used by the native NMS path.
    max_boxes_to_draw: maximum number of boxes to keep. Only used by the
      native NMS path.
    soft_nms_sigma: Soft-NMS sigma (Bodla et al,
      https://arxiv.org/abs/1704.04503); `0.0` means standard (hard) NMS.
      Only used by the native NMS path.
    iou_threshold: IOU above which two boxes count as overlapping too much.
    use_native_nms: whether to use `tf.image.non_max_suppression_with_scores`
      instead of the custom `nms_tf`.

  Returns:
    detections: a float tensor with one row per kept box, laid out as
      [image_id, y, x, height, width, score, class], where class is the
      predicted class index plus one.
  """
  logging.info('Using tf version of post-processing.')
  selected_anchors = tf.gather(anchor_boxes, indices)

  scores = tf.math.sigmoid(cls_outputs)
  # Decode the regression outputs relative to their anchors.
  boxes = decode_box_outputs_tf(
      tf.transpose(box_outputs, [1, 0]),
      tf.transpose(selected_anchors, [1, 0]))

  if not use_native_nms:
    logging.info('Using customized nms.')
    scores = tf.expand_dims(scores, axis=1)
    boxes_with_scores = tf.concat([boxes, scores], axis=1)
    top_detection_idx = nms_tf(boxes_with_scores, iou_threshold)
    kept = tf.gather(boxes_with_scores, top_detection_idx)
    scores = kept[:, 4]
    boxes = kept[:, :4]
  else:
    logging.info('Using native nms.')
    top_detection_idx, scores = tf.image.non_max_suppression_with_scores(
        boxes,
        scores,
        max_boxes_to_draw,
        iou_threshold=iou_threshold,
        score_threshold=min_score_thresh,
        soft_nms_sigma=soft_nms_sigma)
    boxes = tf.gather(boxes, top_detection_idx)

  height = boxes[:, 2] - boxes[:, 0]
  width = boxes[:, 3] - boxes[:, 1]

  # Assemble the output columns in their final order.
  image_ids = tf.cast(
      tf.repeat(image_id, tf.size(top_detection_idx)), tf.float32)
  y_scaled = boxes[:, 0] * image_scale
  x_scaled = boxes[:, 1] * image_scale
  height_scaled = height * image_scale
  width_scaled = width * image_scale
  # Class ids are shifted by one relative to the predicted class index.
  class_ids = tf.cast(tf.gather(classes, top_detection_idx) + 1, tf.float32)

  return tf.stack([
      image_ids, y_scaled, x_scaled, height_scaled, width_scaled, scores,
      class_ids
  ], axis=1)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
  """Creates a BERT classification model with label-relation regularizers.

  A linear classifier is placed on top of the pooled BERT output. The loss is
  the mean cross-entropy (sigmoid for multilabel, softmax otherwise) plus
  three penalty terms. Each penalty is the mean of the pairwise squared
  differences between predicted label probabilities, multiplied elementwise
  by a relation matrix and scaled by a weight.

  Args:
    bert_config: configuration passed to `modeling.BertModel`.
    is_training: whether the model is built for training; enables 0.1 dropout
      on the pooled output.
    input_ids: token ids passed to BERT.
    input_mask: input mask passed to BERT.
    segment_ids: token type ids passed to BERT.
    labels: label tensor; cast to float32 and used as the cross-entropy
      target (presumably [batch, num_labels] -- confirm against the caller).
    num_labels: number of output labels.
    multilabel: if true use sigmoid cross-entropy, otherwise softmax
      cross-entropy.
    sent_rels: sentiment relation matrix applied to the pairwise distances.
    sentiment: weight of the sentiment regularizer.
    entailment_rels: entailment relation matrix.
    entailment: weight of the entailment regularizer.
    corr_rels: correlation relation matrix.
    correlation: weight of the correlation regularizer.

  Returns:
    A tuple `(loss, per_example_loss, logits, probabilities)`.
  """
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)

  # Classification is done on the whole segment, hence the pooled output.
  # For token-level output, model.get_sequence_output() would be used instead.
  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9 corresponds to a dropout rate of 0.1.
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.nn.bias_add(
        tf.matmul(output_layer, output_weights, transpose_b=True),
        output_bias)

    # The same float labels serve both the single- and multilabel losses.
    labels = tf.cast(labels, tf.float32)

    if not multilabel:
      probabilities = tf.nn.softmax(logits, axis=-1)
      per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)
    else:
      probabilities = tf.nn.sigmoid(logits)
      tf.logging.info("num_labels:{};logits:{};labels:{}".format(
          num_labels, logits, labels))
      per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
    loss = tf.reduce_mean(per_example_loss)

    # Pairwise squared differences between the predicted probabilities.
    # The probabilities are tiled into rows and subtracted from themselves as
    # a column, e.g. for probabilities [0.1, 0.2, 0.3] (batch size 1) the
    # differences before squaring and the final transpose are
    #     [0.0] [-.1] [-.2]
    #     [0.1] [0.0] [-.1]
    #     [0.2] [0.1] [0.0]
    # so that each row relates one target to all other targets.
    expanded = tf.expand_dims(probabilities, 1)
    tiled = tf.tile(expanded, [1, num_labels, 1])
    dists = tf.square(
        tf.subtract(tiled, tf.transpose(expanded, perm=[0, 2, 1])))
    dists = tf.transpose(dists, perm=[0, 2, 1])

    def _relation_penalty(strength, relations, summary_name):
      """Weighted mean of the relation-masked distances, with a summary."""
      weight = tf.constant(strength)
      masked = tf.multiply(dists, tf.constant(relations, dtype=tf.float32))
      penalty = tf.multiply(weight, tf.reduce_mean(masked))
      tf.summary.scalar(summary_name, penalty)
      return penalty

    # Sentiment-, entailment- and correlation-based regularization, in order.
    regularizers = (
        (sentiment, sent_rels, "sentiment_regularization"),
        (entailment, entailment_rels, "entailment_regularization"),
        (correlation, corr_rels, "correlation_regularization"),
    )
    for strength, relations, summary_name in regularizers:
      loss += _relation_penalty(strength, relations, summary_name)

    tf.summary.scalar("loss", loss)

    return (loss, per_example_loss, logits, probabilities)
Beispiel #17
0
def attention_layer(x_flat,
                    attention_mask,
                    batch_size,
                    seq_length,
                    size_per_head=512,
                    num_attention_heads=1,
                    *,
                    cache=None,
                    initializer_range=0.02,
                    hidden_dropout_prob=0.1,
                    attention_probs_dropout_prob=0.1,
                    do_cache=False):
    """Multi-head left-to-right attention over a flattened sequence batch.

    :param x_flat: Tensor input of shape [batch_size*seq_length, dim]
    :param attention_mask: Attention mask of size
                [seq_length, seq_length+cached_length]
    :param batch_size: Batch size used to un-flatten and re-flatten the input
    :param seq_length: Sequence length used to un-flatten and re-flatten
    :param size_per_head: dim = size_per_head * num_attention_heads
    :param num_attention_heads: dim = size_per_head * num_attention_heads
    :param cache: Optionally past (cached) keys and values of size
                [batch, 2, heads, sequence, features], where 2 is [k, v]
    :param initializer_range: Passed to the projection initializers
    :param hidden_dropout_prob: Dropout applied to the projected output
    :param attention_probs_dropout_prob: Currently unused; dropout on the
                attention probabilities is disabled in this implementation
    :param do_cache: True if the keys/values of this call should be returned
    :return: The projected context, reshaped to
                [batch_size*seq_length, num_attention_heads*size_per_head]
             before the final dense layer, together with
             "cached_keys_and_values": the keys and values of this call
             stacked on axis 1 (None unless do_cache is set)
    :raises ValueError: if dim != size_per_head * num_attention_heads
    """
    flat_rows, dim = get_shape_list(x_flat, expected_rank=2)

    # NOTE: flat_rows is deliberately not validated against
    # batch_size * seq_length; that check had to go because of the
    # generation script. The shape of `cache` is not validated either.
    if dim != size_per_head * num_attention_heads:
        raise ValueError(
            "passed in a tensor of shape {} when size_per_head={} and num_attention_heads={}"
            .format((flat_rows, dim), size_per_head, num_attention_heads))

    def _project(layer_name):
        # -> [batch_size, num_attention_heads, seq_length, size_per_head]
        return _attention_projection_and_transpose(
            x_flat,
            batch_size=batch_size,
            seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            name=layer_name,
            initializer_range=initializer_range)

    query = _project('query_layer')
    key = _project('key_layer')
    value = _project('value_layer')

    # Only the keys/values computed in this call go into the new cache.
    cached_keys_and_values = None
    if do_cache:
        cached_keys_and_values = tf.stack([key, value], axis=1)

    # Prepend whatever was cached from earlier positions.
    if cache is not None:
        past_key, past_value = tf.unstack(cache, axis=1)
        key = tf.concat([past_key, key], axis=-2)
        value = tf.concat([past_value, value], axis=-2)

    # [batch_size, num_attention_heads, seq_length, size_per_head] x
    # [batch_size, num_attention_heads, size_per_head, seq_length+cached_length]
    # -> [batch_size, num_attention_heads, seq_length, seq_length+cached_length]
    scale = 1.0 / math.sqrt(float(size_per_head))
    attention_scores = tf.multiply(
        tf.matmul(query, key, transpose_b=True), scale)
    attention_scores = mask_attention_for_ltr(attention_scores, attention_mask)
    attention_probs = tf.nn.softmax(attention_scores)

    # No dropout is applied to attention_probs here (the original Transformer
    # drops entire tokens to attend to; that is intentionally left out).

    # [batch_size, num_attention_heads, seq_length, seq_length+cached_length] x
    # [batch_size, num_attention_heads, seq_length+cached_length, size_per_head]
    # -> [batch_size, num_attention_heads, seq_length, size_per_head]
    context_layer = tf.matmul(attention_probs, value)

    # -> [batch_size, seq_length, num_attention_heads, size_per_head],
    # then flattened back to two dimensions.
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    flat_shape = [batch_size * seq_length, num_attention_heads * size_per_head]
    context_layer = tf.reshape(context_layer, flat_shape)

    context_layer_projected = tf.layers.dense(
        context_layer,
        num_attention_heads * size_per_head,
        kernel_initializer=create_initializer(initializer_range),
        name='context_projection_layer')
    context_layer_projected = dropout(context_layer_projected,
                                      hidden_dropout_prob)

    return context_layer_projected, cached_keys_and_values
Beispiel #18
0
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            num_classes,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=False):
  """Generates detections with model outputs and anchors.

  For each class, the selected anchors predicted as that class are decoded
  into boxes, filtered with non-maximum suppression and appended to a running
  detections tensor. The accumulated rows are finally sorted by score and
  truncated to `max_boxes_to_draw`.

  Args:
    cls_outputs: class logits of the selected top-k anchors on all feature
      levels; a sigmoid is applied to turn them into scores.
      NOTE(review): earlier documentation gave the shape as [N, 1], but the
      gathered scores are passed directly to
      `tf.image.non_max_suppression_with_scores` and expanded on axis 1 before
      being concatenated with the boxes, which suggests shape [N] -- confirm.
    box_outputs: an array with shape [N, 4], which stacks box regression
      outputs on all feature levels. The N is the number of selected top-k
      total anchors on all levels. (k being MAX_DETECTION_POINTS)
    anchor_boxes: an array with shape [M, 4], which stacks anchors on all
      feature levels; it is gathered with `indices` before use.
    indices: an array with shape [N], which is the indices from top-k
      selection.
    classes: an array with shape [N], which represents the class
      prediction on all selected anchors from top-k selection.
    image_id: an integer number to specify the image id.
    image_scale: a float tensor representing the scale between original image
      and input image for the detector. It is used to rescale detections for
      evaluating with the original groundtruth annotations.
    num_classes: a integer that indicates the number of classes.
    min_score_thresh: A float representing the threshold for deciding when to
      remove boxes based on score. Only forwarded to the native NMS op, i.e.
      it has an effect only when `use_native_nms` is True.
    max_boxes_to_draw: Max number of boxes to keep; used as the per-class
      limit of the native NMS op and as the final cut-off after sorting.
    soft_nms_sigma: A scalar float representing the Soft NMS sigma parameter;
      See Bodla et al, https://arxiv.org/abs/1704.04503).  When
      `soft_nms_sigma=0.0` (which is default), we fall back to standard (hard)
      NMS. Only forwarded to the native NMS op.
    iou_threshold: A float representing the threshold for deciding whether boxes
      overlap too much with respect to IOU.
    use_native_nms: a bool that indicates whether to use native nms
      (`tf.image.non_max_suppression_with_scores`) instead of `nms_tf`.

  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, y, x, height, width, score, class]. The box values are
      multiplied by `image_scale`, the class value is the 0-based class index
      plus one, and the rows are ordered by descending score.
  """
  anchor_boxes = tf.gather(anchor_boxes, indices)

  scores = tf.math.sigmoid(cls_outputs)
  # apply bounding box regression to anchors
  boxes = decode_box_outputs_tf(
      tf.transpose(box_outputs, [1, 0]), tf.transpose(anchor_boxes, [1, 0]))

  def _else(detections, class_id, indices_cls):
    """Appends the NMS-filtered detections of one class to `detections`."""
    boxes_cls = tf.gather(boxes, indices_cls)
    scores_cls = tf.gather(scores, indices_cls)
    # Select top-scoring boxes in each class and apply non-maximum suppression
    # (nms) for boxes in the same class. The selected boxes from each class are
    # then concatenated for the final detection outputs.

    if use_native_nms:
      top_detection_idx, scores_cls = tf.image.non_max_suppression_with_scores(
          boxes_cls,
          scores_cls,
          max_boxes_to_draw,
          iou_threshold=iou_threshold,
          score_threshold=min_score_thresh,
          soft_nms_sigma=soft_nms_sigma)
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      boxes_cls = tf.gather(boxes_cls, top_detection_idx)
      top_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
    else:
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      all_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
      top_detection_idx = nms_tf(all_detections_cls, iou_threshold)
      top_detections_cls = tf.gather(all_detections_cls, top_detection_idx)

    # Columns 0..3 are the box corners and column 4 is the score; convert the
    # corners to (origin, extent) form and rescale by `image_scale`.
    height = top_detections_cls[:, 2] - top_detections_cls[:, 0]
    width = top_detections_cls[:, 3] - top_detections_cls[:, 1]
    top_detections_cls = tf.stack([top_detections_cls[:, 0] * image_scale,
                                   top_detections_cls[:, 1] * image_scale,
                                   height * image_scale, width * image_scale,
                                   top_detections_cls[:, 4]], axis=-1)

    # Prepend the image id and append the 1-based class id to every row.
    num_kept = tf.size(top_detection_idx)
    top_detections_cls = tf.stack(
        [
            tf.cast(tf.repeat(image_id, num_kept), tf.float32),
            *tf.unstack(top_detections_cls, 5, axis=1),
            tf.repeat(class_id + 1.0, num_kept)
        ],
        axis=1)

    return tf.concat([detections, top_detections_cls], axis=0)

  detections = tf.constant([], tf.float32, [0, 7])
  for c in range(num_classes):
    indices_cls = tf.squeeze(tf.where_v2(tf.equal(classes, c)), axis=-1)
    # Skip the class when none of the selected anchors was predicted as it.
    # The check must look at the per-class indices: the global `indices` are
    # non-empty whenever any anchor was selected at all.
    detections = tf.cond(
        tf.equal(tf.size(indices_cls), 0),
        lambda: detections,
        lambda class_id=c, class_indices=indices_cls: _else(
            detections, class_id, class_indices))

  # Column -2 holds the score; keep the highest scoring rows across classes.
  indices_final = tf.argsort(detections[:, -2], direction='DESCENDING')
  detections = tf.gather(
      detections, indices_final[:max_boxes_to_draw], name='detection')
  return detections
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

    Builds the ResNet v1 network, and depending on `mode` returns either a
    prediction spec, or a spec carrying the loss together with the training op
    (TRAIN) or the evaluation metrics (EVAL).

    Args:
      features: `Tensor` of batched images. If transpose_input is enabled, it is
        transposed to device layout and reshaped to 1D tensor. A dict is also
        accepted, in which case its 'feature' entry is used.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

    Returns:
      A `TPUEstimatorSpec` for the model (a plain `EstimatorSpec` in PREDICT
      mode).

    Raises:
      ValueError: if `params['dropblock_groups']` contains a group outside
        1..4, if `params['precision']` is neither 'bfloat16' nor 'float32',
        if `params['enable_lars']` is set, or if `FLAGS.optimizer` is not a
        supported optimizer.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if params['data_format'] == 'channels_first':
        assert not params['transpose_input']  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if params['transpose_input'] and mode != tf_estimator.ModeKeys.PREDICT:
        image_size = tf.sqrt(tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
        features = tf.reshape(features, [image_size, image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    # DropBlock keep_prob for the 4 block groups of ResNet architecture.
    # None means applying no DropBlock at the corresponding block group.
    dropblock_keep_probs = [None] * 4
    if params['dropblock_groups']:
        # Scheduled keep_prob for DropBlock: decreases linearly from 1 towards
        # params['dropblock_keep_prob'] as training progresses.
        train_steps = tf.cast(params['train_steps'], tf.float32)
        current_step = tf.cast(tf.train.get_global_step(), tf.float32)
        current_ratio = current_step / train_steps
        dropblock_keep_prob = (1 - current_ratio *
                               (1 - params['dropblock_keep_prob']))

        # Computes DropBlock keep_prob for different block groups of ResNet.
        dropblock_groups = [
            int(x) for x in params['dropblock_groups'].split(',')
        ]
        for block_group in dropblock_groups:
            if block_group < 1 or block_group > 4:
                raise ValueError(
                    'dropblock_groups should be a comma separated list of integers '
                    'between 1 and 4 (dropblcok_groups: {}).'.format(
                        params['dropblock_groups']))
            # The drop rate is divided by 4 for every group before the last.
            dropblock_keep_probs[block_group - 1] = 1 - (
                (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        network = resnet_model.resnet_v1(
            resnet_depth=params['resnet_depth'],
            num_classes=params['num_label_classes'],
            dropblock_size=params['dropblock_size'],
            dropblock_keep_probs=dropblock_keep_probs,
            data_format=params['data_format'])
        return network(inputs=features,
                       is_training=(mode == tf_estimator.ModeKeys.TRAIN))

    if params['precision'] == 'bfloat16':
        with contrib_tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif params['precision'] == 'float32':
        logits = build_network()
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `logits` further down.
        raise ValueError(
            "params['precision'] must be 'bfloat16' or 'float32', got "
            '{!r}.'.format(params['precision']))

    if mode == tf_estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf_estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf_estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    host_call = None
    if mode == tf_estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from global_step.
        global_step = tf.train.get_global_step()
        steps_per_epoch = params['num_train_images'] / params[
            'train_batch_size']
        current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
        # LARS is a large batch optimizer. LARS enables higher accuracy at batch 16K
        # and larger batch sizes.
        if params['enable_lars']:
            # LARS is deliberately rejected here; the optimizer construction
            # is kept so the branch can be re-enabled by removing the raise.
            learning_rate = 0.0
            optimizer = lars_util.init_lars_optimizer(current_epoch, params)
            raise ValueError(
                'LARS unexpected in the context of IGT experiments.')
        else:
            learning_rate = linear_learning_rate_schedule(params, global_step)

            if FLAGS.optimizer == 'momentum':
                tf.logging.info('Using MomentumOptimizer ({}).'.format(
                    params['momentum']))
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=learning_rate,
                    momentum=params['momentum'],
                    use_nesterov=False)

            elif FLAGS.optimizer == 'adam':
                tf.logging.info('Using AdamOptimizer')
                optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

            elif FLAGS.optimizer == 'eigt':
                tf.logging.info('Using ExpIgtOptimizer {} tail: {}'.format(
                    FLAGS.igt_optimizer, FLAGS.tail_fraction))
                optimizer = exp_igt_optimizer.ExpIgtOptimizer(
                    learning_rate,
                    tail_fraction=FLAGS.tail_fraction,
                    optimizer=FLAGS.igt_optimizer)

            else:
                raise ValueError('{} is not a supported optimizer'.format(
                    FLAGS.optimizer))

        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not params['skip_host_call']:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call.

                Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor` with shape `[batch]` for the global_step
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times after
                # one TPU loop is finished, setting max_queue value to the same as
                # number of iterations will make the summary writer only flush the data
                # to storage once per loop.
                with summary.create_file_writer(
                        get_model_dir(params),
                        max_queue=params['iterations_per_loop']).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', loss[0], step=gs)
                        summary.scalar('learning_rate', lr[0], step=gs)
                        summary.scalar('current_epoch', ce[0], step=gs)

                        return summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    scaffold_fn = None
    if mode == tf_estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

            Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

        if FLAGS.mode == 'eval_igt' and FLAGS.igt_eval_mode == 'true':
            tf.logging.info('Using true param loading saver.')

            def scaffold_fn_true_params():
                """Returns a scaffold that loads the true values into vars."""
                # Trainable variables are restored from the checkpoint entry
                # named '<var>/true_param'; all others from their own name.
                var_mapping = {}
                trainable_vars = set(tf.trainable_variables())
                for var in tf.global_variables():
                    if var in trainable_vars:
                        var_mapping[var.op.name + '/true_param'] = var
                    else:
                        var_mapping[var.op.name] = var

                tf.logging.info('Mapping: {}'.format(var_mapping))
                saver = tf.train.Saver(var_list=var_mapping, sharded=True)
                return tf.train.Scaffold(saver=saver)

            scaffold_fn = scaffold_fn_true_params

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics,
                                        scaffold_fn=scaffold_fn)
Beispiel #20
0
    def _build_network(self, layers):
        """Build the network described by `layers` on top of `self.input_tensor`.

        Every entry of `layers` is a mapping whose "type" key selects the kind
        of layer to append; the remaining keys supply that layer's settings.
        Layers are applied in the order given.

        Args:
          layers: sequence of layer specification mappings.

        Returns:
          The tensor produced by the last layer.

        Raises:
          ValueError: if a layer specification has an unsupported "type".
        """

        def _penalty(spec):
            # Regularization settings shared by all weighted layer types.
            return {
                "regularizer": spec["regularizer"],
                "weight_decay": spec["weight_decay"],
            }

        # [batch, assets, window, features]
        net = tf.transpose(self.input_tensor, [0, 2, 3, 1])
        # Divide by feature 0 taken at the last window position of each asset.
        net = net / net[:, :, -1, 0, None, None]

        for depth, spec in enumerate(layers):
            kind = spec["type"]

            if kind == "DenseLayer":
                net = tflearn.layers.core.fully_connected(
                    net,
                    int(spec["neuron_number"]),
                    spec["activation_function"],
                    **_penalty(spec))
                self.add_layer_to_dict(kind, net)

            elif kind == "DropOut":
                net = tflearn.layers.core.dropout(net,
                                                  spec["keep_probability"])

            elif kind == "EIIE_Dense":
                # The kernel spans the whole of axis 2, collapsing it to 1.
                span = net.get_shape()[2]
                net = tflearn.layers.conv_2d(
                    net,
                    int(spec["filter_number"]),
                    [1, span],
                    [1, 1],
                    "valid",
                    spec["activation_function"],
                    **_penalty(spec))
                self.add_layer_to_dict(kind, net)

            elif kind == "ConvLayer":
                net = tflearn.layers.conv_2d(
                    net,
                    int(spec["filter_number"]),
                    allint(spec["filter_shape"]),
                    allint(spec["strides"]),
                    spec["padding"],
                    spec["activation_function"],
                    **_penalty(spec))
                self.add_layer_to_dict(kind, net)

            elif kind == "MaxPooling":
                net = tflearn.layers.conv.max_pool_2d(net, spec["strides"])

            elif kind == "AveragePooling":
                net = tflearn.layers.conv.avg_pool_2d(net, spec["strides"])

            elif kind == "LocalResponseNormalization":
                net = tflearn.layers.normalization.local_response_normalization(
                    net)

            elif kind == "EIIE_Output":
                span = net.get_shape()[2]
                net = tflearn.layers.conv_2d(
                    net,
                    1,
                    [1, span],
                    padding="valid",
                    **_penalty(spec))
                self.add_layer_to_dict(kind, net)
                net = net[:, :, 0, 0]
                # A constant column of ones is prepended before the softmax.
                bias_column = tf.ones((self.input_num, 1))
                self.add_layer_to_dict(kind, net)
                net = tf.concat([bias_column, net], 1)
                net = tflearn.layers.core.activation(net,
                                                     activation="softmax")
                self.add_layer_to_dict(kind, net, weights=False)

            elif kind == "Output_WithW":
                flat = tflearn.flatten(net)
                joined = tf.concat([flat, self.previous_w], axis=1)
                net = tflearn.fully_connected(
                    joined,
                    self._rows + 1,
                    activation="softmax",
                    **_penalty(spec))

            elif kind == "EIIE_Output_WithW":
                static_shape = net.get_shape()
                rows = static_shape[1]
                span = static_shape[2]
                channels = static_shape[3]
                # Fold axes 2 and 3 into one channel axis, then append the
                # previous weights as an extra channel.
                net = tf.reshape(
                    net,
                    [self.input_num, int(rows), 1, int(span * channels)])
                prev_w = tf.reshape(self.previous_w, [-1, int(rows), 1, 1])
                net = tf.concat([net, prev_w], axis=3)
                net = tflearn.layers.conv_2d(
                    net,
                    1,
                    [1, 1],
                    padding="valid",
                    **_penalty(spec))
                self.add_layer_to_dict(kind, net)
                net = net[:, :, 0, 0]
                # A single trainable scalar, tiled across the batch, is
                # prepended as the first column.
                bias_column = tf.get_variable("btc_bias", [1, 1],
                                              dtype=tf.float32,
                                              initializer=tf.zeros_initializer)
                bias_column = tf.tile(bias_column, [self.input_num, 1])
                net = tf.concat([bias_column, net], 1)
                self.voting = net
                self.add_layer_to_dict('voting', net, weights=False)
                net = tflearn.layers.core.activation(net,
                                                     activation="softmax")
                self.add_layer_to_dict('softmax_layer', net, weights=False)

            elif kind in ("EIIE_LSTM", "EIIE_RNN"):
                net = tf.transpose(net, [0, 2, 3, 1])
                if kind == "EIIE_LSTM":
                    make_cell, scope_prefix = tflearn.layers.lstm, "lstm"
                else:
                    make_cell, scope_prefix = tflearn.layers.simple_rnn, "rnn"
                scope_name = scope_prefix + str(depth)
                # One recurrent pass per row, all in the same scope; `reuse`
                # is off only for the first row.
                per_row = [
                    make_cell(
                        net[:, :, :, row],
                        int(spec["neuron_number"]),
                        dropout=spec["dropouts"],
                        scope=scope_name,
                        reuse=(row > 0))
                    for row in range(self._rows)
                ]
                net = tf.stack(per_row)
                net = tf.transpose(net, [1, 0, 2])
                net = tf.reshape(
                    net, [-1, self._rows, 1,
                          int(spec["neuron_number"])])

            else:
                raise ValueError("the layer {} not supported.".format(kind))

        return net
Beispiel #21
0
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
  """Transform function used by tf.data.experimental.scan to process an example.

  This is written as a stateless function rather than a class method because we
  trace it with AutoGraph (in order to simplify the conditional), and this way
  we don't have to worry about handling re-tracing semantics.

  Each call places `example` into one slot of a rotating queue of partially
  packed buffers. If at least one slot has room for every sequence of the
  example, the example goes into the first such slot in rotation order starting
  at `top_index`, and a dummy output is emitted. Otherwise the slot at
  `top_index` is emitted as the output, reset, reused for the new example, and
  `top_index` advances by one (modulo `queue_size`).

  Args:
    See the SequenceDatasetPacker class. In terms of what this function does
    with them:
    state: tuple `(availability, contents, top_index)`. The first two are
      accessed through `read`/`write`/`stack`/`gather`/`scatter`.
    example: iterable of sequences; their shapes are concatenated into
      `lengths` (presumably one rank-1 token tensor per packed feature --
      TODO confirm against the caller).
    packed_length: number of positions in one queue slot.
    queue_size: number of slots in the queue.
    spacing: extra positions subtracted from a slot's availability after an
      example is placed.
    num_sequences: number of sequences per example.
    token_dtype: dtype used for the dummy output and the reset contents.

  Returns:
    The updated queue state, and either a packed example or a dummy sequence
    which will be filtered out downstream. The second element is the tuple
    `(tf.logical_not(any_can_fit), output_contents)`.
  """

  # Unpack the queue state; the pieces are rebound below and repacked at the
  # end.
  availability, contents, top_index = state

  lengths = tf.concat([tf.shape(i) for i in example], axis=0)
  start_availability = availability.stack()
  # A slot can take the example only if every sequence fits.
  can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths), axis=1)
  any_can_fit = tf.reduce_any(can_fit, axis=0)

  # AutoGraph will convert this block to a tf.cond
  if any_can_fit:
    # This indicates where in the FFD queue rotation a given index sits
    shifted_range = (
        tf.range(queue_size, dtype=INDEX_DTYPE) - top_index) % queue_size

    # Mark any indices which cannot accommodate the current example. Adding
    # queue_size pushes them past every valid rotation position, so the
    # reduce_min below never selects them.
    exclusion_mask = tf.cast(tf.logical_not(can_fit), INDEX_DTYPE) * queue_size

    # Index in [0, queue_size) in which to place the sample. Note, this index
    # is the position in the actual TensorArray, not the index of the FFD queue.
    queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                   top_index) % queue_size

    # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
    output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

  else:
    # Nothing fits: emit the slot at the top of the queue.
    index_range = top_index * packed_length + tf.range(packed_length)
    output_contents = contents.gather(index_range)

    # Reset the queue state.
    availability = availability.write(
        top_index, packed_length * tf.ones((num_sequences,), dtype=INDEX_DTYPE))
    empty_contents = tf.zeros((packed_length, num_sequences * 2),
                              dtype=token_dtype)
    contents = contents.scatter(index_range, empty_contents)

    # The freshly emptied slot receives the current example, and the slot
    # after it becomes the new top of the queue.
    queue_index = top_index
    top_index = (top_index + 1) % queue_size

  # Charge the chosen slot for the example plus the configured spacing.
  pre_assign_availability = availability.read(queue_index)
  space_left = pre_assign_availability - lengths - spacing
  availability = availability.write(queue_index, space_left)

  # ============================================================================
  # == Update contents =========================================================
  # ============================================================================
  # Consider the following case for a seq-to-seq packing:
  #   (padding is represented as underscores)
  #
  #   Queue starting state:
  #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  #   Examples:
  #     [4, 2, 4], [3]
  #
  #   Desired new queue state:
  #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
  #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
  #
  # This could be accomplished by creating a TensorArray for each of the two
  # sequences, and scattering into the respective arrays. However TensorArray
  # writes are extremely expensive relative to other operations. So instead we
  # store the contents in a single TensorArray of shape (packed_length, 2), and
  # we pad and concatenate the examples such that they can be added in a single
  # assign:
  #
  #              [_, _, _, _, 4, 2, 4]
  #              [3, _, _, _, _, _, _]
  #                        +
  #  [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #  [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  # And in practice, the extra work of padding is negligible compared to
  # the gain from vectorizing the TensorArray assign. We also store a bit mask
  # denoting where sequences start which is used to compute segment and
  # position metadata:
  #
  #              [_, _, _, _, 1, _, _]
  #              [1, _, _, _, _, _, _]
  #                        +
  #  [1, _, _, _, _, _, _, _, _, _, _, ...]
  #  [1, _, _, _, _, _, _, _, _, _, _, ...]
  #
  # Both the contents and the mask are concatenated in the same TensorArray
  # for performance.

  # Per-sequence write window inside the slot, and the smallest common window
  # [leftmost, rightmost) that covers all of them.
  start_index = packed_length - pre_assign_availability
  end_index = start_index + lengths
  leftmost = tf.reduce_min(start_index, axis=0)
  rightmost = tf.reduce_max(end_index, axis=0)
  delta = rightmost - leftmost
  # (left, right) padding that stretches each sequence to the common window.
  pad_indices = [tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
                 for i in range(num_sequences)]

  padded_examples = [tf.pad(ex, padding[tf.newaxis, :])
                     for ex, padding in zip(example, pad_indices)]
  padded_examples = tf.transpose(tf.stack(padded_examples))
  # One-hot marker at the row where each sequence starts within the window.
  mask_update = tf.one_hot(start_index - leftmost, delta,
                           dtype=contents.dtype, axis=0)

  content_update = tf.concat([padded_examples, mask_update], axis=1)

  index_range = (queue_index * packed_length +  # Offset into the right section.
                 tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
  # Add the update onto the rows already stored for the window.
  contents = contents.scatter(index_range, contents.gather(index_range) +
                              content_update)

  state = (availability, contents, top_index)
  return state, (tf.logical_not(any_can_fit), output_contents)
    def _forward(self, input_indxs, outpt_indxs, scores, weights):
        """Build the graph for the forward pass.

        Creates the word, bias and (optionally) covariate parameters, looks up
        the rows selected by `input_indxs` / `outpt_indxs`, and combines them
        into an estimated score: the sum of the input and output biases, the
        dot product of the input and output word vectors, and, when
        `self._covariate_size > 0`, the dot product of the covariate vectors.

        Args:
          input_indxs: int32 or int64 tensor for input labels
          outpt_indxs: int32 or int64 tensor for outpt labels
          scores: float32 tensor for co-occurrence score
          weights: float32 tensor for loss weights

        Returns:
          loss: a univariate tensor giving the loss from the batch. It is the
            result of `self._compute_w2v_loss(input_indxs)` when
            `self._use_w2v` is set, otherwise half the weighted sum of squared
            differences between the estimated and the given scores.
        """
        # Initialize input/outpt word (node) parameters
        self._default_scope = tf.get_variable_scope()
        init_width = 0.5 / (self._vector_size + self._covariate_size)
        self._word['input'] = self._weight_initializer('word_input',
                                                       init_width,
                                                       self._vocab_size,
                                                       self._vector_size)
        self._word['outpt'] = self._weight_initializer('word_outpt',
                                                       init_width,
                                                       self._vocab_size,
                                                       self._vector_size)

        # Initialize input/outpt bias parameters
        self._bias['input'] = self._weight_initializer('bias_input',
                                                       init_width,
                                                       self._vocab_size, 1)
        self._bias['outpt'] = self._weight_initializer('bias_outpt',
                                                       init_width,
                                                       self._vocab_size, 1)

        if self._covariate_size > 0:
            # Initialize input/outpt cvrt transformation parameters
            self._cvrt_transformation['input'] = self._weight_initializer(
                'cvrt_input', init_width, self._covariate_data.shape[1],
                self._covariate_size)
            self._cvrt_transformation['outpt'] = self._weight_initializer(
                'cvrt_outpt', init_width, self._covariate_data.shape[1],
                self._covariate_size)

            # Project the covariate data with the transformation parameters
            self._cvrt['input'] = tf.matmul(self._covariate_data_tensor,
                                            self._cvrt_transformation['input'])
            self._cvrt['outpt'] = tf.matmul(self._covariate_data_tensor,
                                            self._cvrt_transformation['outpt'])

            if self._use_monet:
                # Compute covariate svd
                _, self._u, _ = tf.linalg.svd(self._cvrt['input'] +
                                              self._cvrt['outpt'])

                # Project base word vecs and get word vecs: subtract
                # `_db_level` times the component lying in the span of `_u`.
                # No gradient flows through the projected vectors.
                self._projected_word_input = tf.stop_gradient(
                    self._word['input'] - self._db_level * tf.matmul(
                        self._u,
                        tf.matmul(tf.transpose(self._u), self._word['input'])))
                self._projected_word_outpt = tf.stop_gradient(
                    self._word['outpt'] - self._db_level * tf.matmul(
                        self._u,
                        tf.matmul(tf.transpose(self._u), self._word['outpt'])))

        # Get loss input word vectors
        # NOTE(review): the projected vectors are only created above when
        # `_covariate_size > 0`; `_use_monet` presumably implies that --
        # confirm against the constructor.
        if self._use_monet:
            self._input_word_vecs = tf.nn.embedding_lookup(
                self._projected_word_input, input_indxs)
            self._outpt_word_vecs = tf.nn.embedding_lookup(
                self._projected_word_outpt, outpt_indxs)
        else:
            self._input_word_vecs = tf.nn.embedding_lookup(
                self._word['input'], input_indxs)
            self._outpt_word_vecs = tf.nn.embedding_lookup(
                self._word['outpt'], outpt_indxs)

        # Get loss input bias vectors
        self._input_bias_vecs = tf.nn.embedding_lookup(self._bias['input'],
                                                       input_indxs)
        self._outpt_bias_vecs = tf.nn.embedding_lookup(self._bias['outpt'],
                                                       outpt_indxs)

        # Word and bias contributions to the estimated score. Each term is
        # built exactly once so no duplicate ops are added to the graph.
        self._word_pred = tf.reduce_sum(tf.multiply(self._input_word_vecs,
                                                    self._outpt_word_vecs),
                                        axis=1)
        self._bias_pred = tf.reduce_sum(self._input_bias_vecs +
                                        self._outpt_bias_vecs,
                                        axis=1)
        estimated_score = self._bias_pred + self._word_pred

        # Add covariate terms
        if self._covariate_size > 0:
            self._input_cvrt_vecs = tf.nn.embedding_lookup(
                self._cvrt['input'], input_indxs)
            self._outpt_cvrt_vecs = tf.nn.embedding_lookup(
                self._cvrt['outpt'], outpt_indxs)
            self._cvrt_pred = tf.reduce_sum(tf.multiply(
                self._input_cvrt_vecs, self._outpt_cvrt_vecs),
                                            axis=1)
            estimated_score += self._cvrt_pred
        else:
            self._cvrt_pred = tf.constant(0.0)

        self._scores = scores
        self._est_score = estimated_score
        if self._use_w2v:
            loss = self._compute_w2v_loss(input_indxs)
        else:
            diff = estimated_score - scores
            self._diff = diff
            loss = tf.reduce_sum(tf.multiply(weights, tf.square(diff))) / 2
        return loss
Beispiel #23
0
def get_reg_loss(tfs):
    """Return ``tfs.loss`` plus every regularization term that is enabled.

    A term is enabled by the presence of its key in
    ``tfs.sys_para.reg_coeffs``. Each coefficient is divided by
    ``tfs.sys_para.steps`` before it scales its penalty.

    Supported keys: ``'amplitude'``, ``'envelope'``, ``'dwdt'``, ``'d2wdt2'``,
    ``'bandpass'`` (also reads ``'band'``), ``'forbidden_coeff_list'`` (also
    reads ``'states_forbidden_list'`` and, optionally, ``'forbid_dressed'``)
    and ``'speed_up'``.

    Args:
        tfs: graph-holder object. This function reads ``tfs.loss``,
            ``tfs.ops_weight`` and ``tfs.sys_para`` and, depending on the
            enabled terms, ``tfs.tf_one_minus_gaussian_envelope``,
            ``tfs.inter_vecs``, ``tfs.inter_vecs_packed``, ``tfs.target_vecs``
            and ``tfs.get_inner_product_3D``.

    Returns:
        The regularized loss tensor.

    Raises:
        ValueError: if ``'bandpass'`` is requested while
            ``tfs.sys_para.use_gpu`` is false.
    """
    sys_para = tfs.sys_para
    reg_coeffs = sys_para.reg_coeffs

    # Regularizer
    with tf.name_scope('reg_errors'):

        reg_loss = tfs.loss

        # amplitude
        if 'amplitude' in reg_coeffs:
            amp_reg_alpha_coeff = reg_coeffs['amplitude']
            amp_reg_alpha = amp_reg_alpha_coeff / float(sys_para.steps)
            reg_loss = reg_loss + amp_reg_alpha * tf.nn.l2_loss(tfs.ops_weight)

        # gaussian envelope
        if 'envelope' in reg_coeffs:
            reg_alpha_coeff = reg_coeffs['envelope']
            reg_alpha = reg_alpha_coeff / float(sys_para.steps)
            reg_loss = reg_loss + reg_alpha * tf.nn.l2_loss(
                tf.multiply(tfs.tf_one_minus_gaussian_envelope,
                            tfs.ops_weight))

        # Both derivative penalties work on the control weights padded with
        # two zero columns on each side of axis 1. Build the padded tensor
        # whenever either penalty is enabled: previously it was only created
        # in the 'dwdt' branch, so enabling 'd2wdt2' alone raised NameError.
        # NOTE(review): the slicing below assumes ops_weight has shape
        # [ops_len, steps] -- confirm against the caller.
        if 'dwdt' in reg_coeffs or 'd2wdt2' in reg_coeffs:
            zeros_for_training = tf.zeros([sys_para.ops_len, 2])
            new_weights = tf.concat([tfs.ops_weight, zeros_for_training], 1)
            new_weights = tf.concat([zeros_for_training, new_weights], 1)

        # Limiting the dwdt of control pulse
        if 'dwdt' in reg_coeffs:
            dwdt_reg_alpha_coeff = reg_coeffs['dwdt']
            dwdt_reg_alpha = dwdt_reg_alpha_coeff / float(sys_para.steps)
            # First-order finite difference along axis 1.
            reg_loss = reg_loss + dwdt_reg_alpha * tf.nn.l2_loss(
                (new_weights[:, 1:] - new_weights[:, :sys_para.steps + 3])
                / sys_para.dt)

        # Limiting the d2wdt2 of control pulse
        if 'd2wdt2' in reg_coeffs:
            d2wdt2_reg_alpha_coeff = reg_coeffs['d2wdt2']
            d2wdt2_reg_alpha = d2wdt2_reg_alpha_coeff / float(sys_para.steps)
            # Second-order central finite difference along axis 1.
            second_diff = (new_weights[:, 2:]
                           - 2 * new_weights[:, 1:sys_para.steps + 3]
                           + new_weights[:, :sys_para.steps + 2])
            reg_loss = reg_loss + d2wdt2_reg_alpha * tf.nn.l2_loss(
                second_diff / (sys_para.dt ** 2))

        # bandpass filter on the control
        if 'bandpass' in reg_coeffs:
            # currently does not support bandpass reg for CPU
            # (no CPU kernel for FFT)
            if not sys_para.use_gpu:
                raise ValueError(
                    'currently does not support bandpass reg for CPU (no CPU kernel for FFT)'
                )

            bandpass_reg_alpha_coeff = reg_coeffs['bandpass']
            bandpass_reg_alpha = bandpass_reg_alpha_coeff / float(
                sys_para.steps)

            tf_u = tf.cast(tfs.ops_weight, dtype=tf.complex64)

            # tf.abs returns the magnitude of complex inputs; the former
            # tf.complex_abs is not part of the TF >= 1.0 Python API that the
            # rest of this function relies on.
            tf_fft = tf.abs(tf.fft(tf_u))

            band = np.array(reg_coeffs['band'])

            # Convert the band limits to integer FFT bin indices.
            band_id = (band * sys_para.total_time).astype(int)
            half_id = int(sys_para.steps / 2)

            # Penalize spectral magnitude below band[0] and between band[1]
            # and the half-spectrum index.
            fft_loss = bandpass_reg_alpha * (
                tf.reduce_sum(tf_fft[:, 0:band_id[0]]) +
                tf.reduce_sum(tf_fft[:, band_id[1]:half_id]))

            reg_loss = reg_loss + fft_loss

        # Limiting the access to forbidden states
        if 'forbidden_coeff_list' in reg_coeffs:

            if sys_para.is_dressed:
                v_sorted = tf.constant(c_to_r_mat(
                    np.reshape(
                        sort_ev(sys_para.v_c, sys_para.dressed_id), [
                            len(sys_para.dressed_id),
                            len(sys_para.dressed_id)
                        ])),
                                       dtype=tf.float32)

            for inter_vec in tfs.inter_vecs:
                # v_sorted only exists when is_dressed is true, which this
                # condition checks first.
                if sys_para.is_dressed and (
                        'forbid_dressed' in reg_coeffs
                        and reg_coeffs['forbid_dressed']):
                    inter_vec = tf.matmul(tf.transpose(v_sorted), inter_vec)
                for inter_reg_alpha_coeff, state in zip(
                        reg_coeffs['forbidden_coeff_list'],
                        reg_coeffs['states_forbidden_list']):
                    inter_reg_alpha = inter_reg_alpha_coeff / float(
                        sys_para.steps)
                    # Sum of squares of rows `state` and `state_num + state`
                    # (presumably the real and imaginary parts of the
                    # amplitude -- confirm against c_to_r_mat).
                    forbidden_state_pop = (
                        tf.square(inter_vec[state, :]) +
                        tf.square(inter_vec[sys_para.state_num + state, :]))
                    reg_loss = reg_loss + inter_reg_alpha * tf.nn.l2_loss(
                        forbidden_state_pop)

        # Speeding up the gate time
        if 'speed_up' in reg_coeffs:
            speed_up_reg_alpha_coeff = reg_coeffs['speed_up']
            speed_up_reg_alpha = speed_up_reg_alpha_coeff / float(
                sys_para.steps)

            # Repeat the target vectors along a new axis of length steps + 1.
            target_vecs_all_timestep = tf.tile(
                tf.reshape(
                    tfs.target_vecs,
                    [2 * sys_para.state_num, 1,
                     len(tfs.inter_vecs)]), [1, sys_para.steps + 1, 1])

            target_vecs_inner_product = tfs.get_inner_product_3D(
                tfs.inter_vecs_packed, target_vecs_all_timestep)
            reg_loss = reg_loss + speed_up_reg_alpha * tf.nn.l2_loss(
                sys_para.steps + 1 - target_vecs_inner_product)

        return reg_loss
Beispiel #24
0
def compute_cost(Z3, Y):
    """Return the mean softmax cross-entropy between `Z3` and `Y`.

    Both arguments are transposed before being handed to
    `tf.nn.softmax_cross_entropy_with_logits`.

    Args:
        Z3: logits tensor (transposed before use).
        Y: labels tensor (transposed before use).

    Returns:
        Scalar tensor: the mean of the per-example cross-entropy values.
    """
    transposed_logits = tf.transpose(Z3)
    transposed_labels = tf.transpose(Y)
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=transposed_logits, labels=transposed_labels)
    return tf.reduce_mean(per_example_loss)
Beispiel #25
0
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Builds the network selected by `FLAGS.model_name`, then, depending on
    `mode`, the prediction outputs, the training op (optionally with an
    exponential moving average of the weights) or the evaluation metrics.
    Most hyperparameters are read from the module-level `FLAGS`.

    Args:
        features: `Tensor` of batched images, or a dict holding that tensor
            under the key 'feature'.
        labels: `Tensor` of one hot labels for the data samples
        mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
        params: `dict` of parameters passed to the model from the TPUEstimator,
            `params['batch_size']` is always provided and should be used as the
            effective batch size. This function also reads
            `params['use_bfloat16']` and, when training,
            `params['steps_per_epoch']`.

    Returns:
        A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
        `TPUEstimatorSpec` for the model.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if FLAGS.data_format == 'channels_first':
        assert not FLAGS.transpose_input  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        # Shape given to the mean/stddev constants in normalize_features.
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    # Fall back to the model's own input size when the flag is unset.
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        input_image_size = model_builder_factory.get_model_input_size(
            FLAGS.model_name)

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.reshape(features,
                              [input_image_size, input_image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info('Using open-source implementation.')
    # Collect the model overrides from flags. The first four are forwarded
    # whenever they are not None; the rest only when truthy.
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.survival_prob is not None:
        override_params['survival_prob'] = FLAGS.survival_prob
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        # The constants take `stats_shape` from the enclosing scope, which
        # was chosen above according to the data format.
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = model_builder_factory.get_model_builder(
            FLAGS.model_name)
        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        # Only the first element of the builder's return value is used.
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    # Under bfloat16 the model is built inside the bfloat16 scope and its
    # logits are cast back to float32 for everything that follows.
    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    # PREDICT returns early with a plain EstimatorSpec; no loss is built.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    # Variables are excluded by name: any whose name contains
    # 'batch_normalization' is skipped.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    # `ema` and `ema_vars` exist only when moving-average decay is enabled;
    # every later use is guarded by the same flag.
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        # Scale the base learning rate linearly with the training batch size,
        # relative to a batch of 256.
        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        logging.info('base_learning_rate = %f', FLAGS.base_learning_rate)
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=FLAGS.lr_decay_epoch,
            warmup_epochs=FLAGS.lr_warmup_epochs,
            decay_factor=FLAGS.lr_decay_factor,
            lr_decay_type=FLAGS.lr_schedule,
            total_steps=FLAGS.train_steps)
        optimizer = utils.build_optimizer(
            learning_rate,
            optimizer_name=FLAGS.optimizer,
            lars_weight_decay=FLAGS.lars_weight_decay,
            lars_epsilon=FLAGS.lars_epsilon)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        # Replace train_op with the EMA update, which depends on the
        # optimizer step, so running it performs both.
        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor` with shape `[batch]` for the global_step
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                # Only the first element of each batched input is used.
                gs = gs[0]
                # Host call fns are executed FLAGS.iterations_per_loop times after one
                # TPU loop is finished, setting max_queue value to the same as number of
                # iterations will make the summary writer only flush the data to storage
                # once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=FLAGS.iterations_per_loop).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar('learning_rate', lr[0], step=gs)
                        tf2.summary.scalar('current_epoch', ce[0], step=gs)

                        return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, lr_t, ce_t])

    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            # Convert one-hot labels to class indices before comparing.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    # Log the total element count of all trainable variables.
    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info('number of trainable parameters: %d', num_params)

    def _scaffold_fn():
        # Saver built from restore_vars_dict, which is only populated in the
        # non-training branch when moving-average decay is enabled.
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        scaffold_fn = _scaffold_fn
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
Beispiel #26
0
# Plot the values in `errors` (collected earlier in the script, presumably one
# per training epoch -- confirm) and save the figure to disk.
plt.plot(errors)
plt.ylabel('Root Mean Squared Error')
plt.xlabel('Number of Epochs')
plt.savefig('pics/result.png')

# We can now predict movies that an arbitrarily selected user might like by feeding in the user's watched
# movie preferences into the RBM and then reconstructing the input

# Selecting the input user: row 850 of trX, wrapped in a list to form a
# batch of one.
inputUser = [trX[850]]

# Feed in the user and reconstruct the input:
#   hh0 = sigmoid(v0 . W + hb), then vv1 = sigmoid(hh0 . W^T + vb).
# The values prv_w / prv_hb / prv_vb are fed in for W / hb / vb; the second
# run feeds the computed hh0 values back in directly.
hh0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb)
vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb)
feed = sess.run(hh0, feed_dict={v0: inputUser, W: prv_w, hb: prv_hb})
rec = sess.run(vv1, feed_dict={hh0: feed, W: prv_w, vb: prv_vb})

# We can then list the 25 most recommended movies for our mock user by sorting it by their scores given by our model
# NOTE(review): this assignment does not copy, so the new column is added to
# movies_df itself as well -- confirm that is intended.
scored_movies_df_850 = movies_df
scored_movies_df_850["Recommendation Score"] = rec[0]
print("\n")
print(scored_movies_df_850.sort_values(["Recommendation Score"], ascending=False).head(25))

# Print row 850 of merged_df (presumably to look up the mock user's UserID
# for the filter below -- confirm).
print("\n")
print(merged_df.iloc[850])

# Select every merged_df row whose UserID is 2562.
# NOTE(review): 2562 is hard-coded; presumably it is the UserID shown in the
# row printed above -- verify.
movies_df_850 = merged_df[merged_df['UserID'] == 2562]
Beispiel #27
0
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    Queries are projected from `from_tensor`, keys and values from
    `to_tensor`; the per-head attention itself is delegated to
    `dot_product_attention`.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) Tensor that is reshaped to
        [batch_size, 1, to_seq_length, 1] before being handed to
        `dot_product_attention`, so it must hold exactly
        batch_size * to_seq_length values (e.g. shape
        [batch_size, to_seq_length]). How the mask is applied is defined by
        `dot_product_attention`.
      num_attention_heads: int. Number of attention heads.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. If the input is 2D, this might be the batch size
        of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `to_tensor`.

    Returns:
      The output of `dot_product_attention` with axes 1 and 2 swapped; per the
      shape notes below this is a float Tensor of shape
      [batch_size, from_seq_length, num_attention_heads, size_per_head].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Derive the head size only after the rank checks, and from the last
    # dimension. Indexing `from_shape[2]` up front raised IndexError for every
    # rank-2 input, so the ValueErrors above could never fire for that case.
    # For rank-3 inputs the last dimension is the same index 2 as before.
    size_per_head = int(from_shape[-1] / num_attention_heads)

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), query_act,
                       "query")

    # `key_layer` = [B, T, N, H]
    k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), key_act, "key")
    # `value_layer` = [B, T, N, H]
    v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), value_act,
                       "value")
    # Move the head axis ahead of the sequence axis: [B, N, seq, H].
    q = tf.transpose(q, [0, 2, 1, 3])
    k = tf.transpose(k, [0, 2, 1, 3])
    v = tf.transpose(v, [0, 2, 1, 3])
    if attention_mask is not None:
        attention_mask = tf.reshape(attention_mask,
                                    [batch_size, 1, to_seq_length, 1])

    # `new_embeddings` = [B, N, F, H]
    new_embeddings = dot_product_attention(q, k, v, attention_mask,
                                           attention_probs_dropout_prob)

    # Back to [B, F, N, H].
    return tf.transpose(new_embeddings, [0, 2, 1, 3])
Beispiel #28
0
def _model_output(inputs, data_format):
    """Maybe convert from channels_first (NCHW) back to channels_last (NHWC)."""
    if data_format == 'channels_first':
        return tf.transpose(a=inputs, perm=[0, 2, 3, 1])
    else:
        return inputs
Beispiel #29
0
    def output(self):
        """Build the scoring, loss, metric and summary ops of the model.

        Inside the 'CrossEntropyLoss' name scope this method:
          * scores `self.predict_behavior_emb` against the transposed item
            embedding table and adds a reconsume bonus, gated by
            `self.predict_is_reconsume`, for the items in `self.item_list`;
          * stores the final scores in `self.item_result` and the top-k
            indices in `self.indices1/5/10/30/50`;
          * sets `self.loss_origin`, `self.loss_reconsume`, `self.precision`,
            `self.recall` and the combined `self.loss`;
          * registers scalar summaries.

        Finally it calls `self.cal_gradient` on all trainable variables.
        """
        with tf.name_scope('CrossEntropyLoss'):
            # L2 penalty over the embedding tensors.
            l2_terms = [
                tf.nn.l2_loss(self.item_list_emb),
                tf.nn.l2_loss(self.category_list_emb),
                tf.nn.l2_loss(self.position_list_emb),
                tf.nn.l2_loss(self.user_embedding),
                tf.nn.l2_loss(self.reconsume_lst_embedding),
            ]
            l2_norm = tf.add_n(l2_terms)
            regulation_rate = self.FLAGS.regulation_rate
            item_table_t = tf.transpose(self.embedding.item_emb_lookup_table)
            # Disabled alternative kept for reference: project
            # predict_behavior_emb through a learned [num_units, num_units]
            # `output_w` variable instead of the item table.

            item_logits = tf.matmul(self.predict_behavior_emb, item_table_t)

            # Row index of every (sequence, position) pair, flattened to a
            # single column.
            batch_rows = tf.range(0, self.now_bacth_data_size, delta=1)
            row_idx = tf.reshape(batch_rows, [-1, 1])
            row_idx = tf.tile(row_idx, [1, self.max_len])
            row_idx = tf.reshape(row_idx, [-1, 1])

            # Positions beyond seq_length take the id `item_count`.
            masks = tf.sequence_mask(self.seq_length, maxlen=self.max_len)
            padding_ids = (1 - tf.to_int32(masks)) * self.embedding.item_count
            mask_item_list = tf.where(masks, self.item_list, padding_ids)
            col_idx = tf.reshape(mask_item_list, [-1, 1])

            # Scatter the per-position reconsume scores into a dense
            # [batch, item_count + 3] matrix.
            scatter_indices = tf.concat([row_idx, col_idx], axis=1)
            scatter_values = tf.reshape(self.reconsume_scores, [-1])
            dense_shape = (self.now_bacth_data_size,
                           self.embedding.item_count + 3)
            reconsume_scores = tf.sparse_to_dense(
                sparse_indices=scatter_indices,
                sparse_values=scatter_values,
                output_shape=dense_shape,
                validate_indices=False)

            # TODO: re-score
            reconsume_gate = tf.expand_dims(self.predict_is_reconsume, axis=-1)
            logits = item_logits + reconsume_gate * reconsume_scores

            self.item_result = logits  # TODO for speed
            for k in (1, 5, 10, 30, 50):
                setattr(self, 'indices%d' % k,
                        tf.nn.top_k(self.item_result, k).indices)

            # Item loss: negative log-probability of the target item.
            log_probs = tf.nn.log_softmax(logits)
            label_ids = tf.reshape(self.target[0], [-1])
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=self.embedding.item_count + 3,
                                        dtype=tf.float32)
            self.loss_origin = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])

            # Reconsume loss: two-column [1 - p, p] scores against the
            # one-hot `is_reconsume` labels.
            reconsume_prob = tf.reshape(self.predict_is_reconsume, [-1, 1])
            two_class_scores = tf.concat(
                [1 - reconsume_prob, reconsume_prob], axis=-1)

            reconsume_labels = tf.one_hot(tf.to_int32(self.is_reconsume),
                                          depth=2,
                                          dtype=tf.float32)
            self.loss_reconsume = tf.nn.softmax_cross_entropy_with_logits(
                labels=reconsume_labels, logits=two_class_scores)

            predictions = tf.argmax(two_class_scores,
                                    axis=-1,
                                    output_type=tf.int32)

            self.precision = tf.metrics.precision(labels=self.is_reconsume,
                                                  predictions=predictions)
            self.recall = tf.metrics.recall(labels=self.is_reconsume,
                                            predictions=predictions)

            # Total loss = L2 penalty + mean item loss + mean reconsume loss.
            self.loss = (regulation_rate * l2_norm
                         + tf.reduce_mean(self.loss_origin)
                         + tf.reduce_mean(self.loss_reconsume))

            tf.summary.scalar('Training Cross Entropy Loss',
                              tf.reduce_mean(self.loss_origin))
            tf.summary.scalar('Training Reconsume Loss',
                              tf.reduce_mean(self.loss_reconsume))
            tf.summary.scalar('normalized Training Loss', self.loss)
            tf.summary.scalar('l2_norm', l2_norm)
            tf.summary.scalar('Learning_rate', self.learning_rate)
        self.cal_gradient(tf.trainable_variables())
Beispiel #30
0
def add_metric_fn_inputs(params,
                         cls_outputs,
                         box_outputs,
                         metric_fn_inputs,
                         max_detection_points=anchors.MAX_DETECTION_POINTS):
    """Selects top-k predictions and adds the selected to metric_fn_inputs.

    The per-level outputs are flattened and concatenated, then either pruned
    to the `max_detection_points` highest class scores or reduced to the best
    class per anchor. Four entries are written into `metric_fn_inputs`:
    'cls_outputs_all', 'box_outputs_all', 'indices_all' and 'classes_all'.

    Note: when `params['data_format']` is 'channels_first', the entries of
    `cls_outputs` and `box_outputs` are replaced in place by their
    channels-last transposes.

    Args:
      params: a parameter dictionary that includes `min_level`, `max_level`,
        `batch_size`, `num_classes` and `data_format`.
      cls_outputs: an OrderDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderDict with keys representing levels and values
        representing box regression targets in
        [batch_size, height, width, num_anchors * 4].
      metric_fn_inputs: a dictionary that will hold the top-k selections.
      max_detection_points: an integer specifying the maximum detection points
        to keep before NMS. Keep all anchors if max_detection_points <= 0.
    """
    num_classes = params['num_classes']

    # Flatten every level to [batch, anchors, ...] and collect the pieces.
    flat_cls_per_level = []
    flat_box_per_level = []
    for level in range(params['min_level'], params['max_level'] + 1):
        if params['data_format'] == 'channels_first':
            cls_outputs[level] = tf.transpose(cls_outputs[level], [0, 2, 3, 1])
            box_outputs[level] = tf.transpose(box_outputs[level], [0, 2, 3, 1])

        level_cls = tf.reshape(cls_outputs[level],
                               [params['batch_size'], -1, num_classes])
        flat_cls_per_level.append(level_cls)
        level_box = tf.reshape(box_outputs[level],
                               [params['batch_size'], -1, 4])
        flat_box_per_level.append(level_box)
    all_cls = tf.concat(flat_cls_per_level, 1)
    all_box = tf.concat(flat_box_per_level, 1)

    if max_detection_points > 0:
        # Prune anchors and detections to only keep max_detection_points.
        # Due to some issues, top_k is currently slow in graph model.
        scores_per_image = tf.reshape(all_cls, [params['batch_size'], -1])
        _, topk_flat_idx = tf.math.top_k(scores_per_image,
                                         k=max_detection_points,
                                         sorted=False)
        # A flat index encodes (anchor, class) as anchor * num_classes + class.
        indices = topk_flat_idx // num_classes
        classes = topk_flat_idx % num_classes
        anchor_class_pairs = tf.stack([indices, classes], axis=2)
        selected_cls = tf.gather_nd(all_cls, anchor_class_pairs, batch_dims=1)
        selected_box = tf.gather_nd(all_box,
                                    tf.expand_dims(indices, 2),
                                    batch_dims=1)
    else:
        # Keep every anchor, but only its highest-scoring class.
        best_class = tf.math.argmax(all_cls, axis=-1)
        num_anchors = all_cls.shape[1]

        classes = best_class
        tiled_anchor_ids = tf.tile(tf.range(num_anchors),
                                   [params['batch_size']])
        indices = tf.reshape(tiled_anchor_ids, [-1, num_anchors])
        selected_cls = tf.reduce_max(all_cls, -1)
        selected_box = all_box

    metric_fn_inputs['cls_outputs_all'] = selected_cls
    metric_fn_inputs['box_outputs_all'] = selected_box
    metric_fn_inputs['indices_all'] = indices
    metric_fn_inputs['classes_all'] = classes