Example #1
def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets,
                   params):
  """Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.

  This function implements the classification and box regression loss of the
  Fast-RCNN branch in Mask-RCNN. Since `box_outputs` produces `num_classes`
  boxes for each RoI, the reference model expands `box_targets` to match the
  shape of `box_outputs` and selects only the target with which the RoI has the
  maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py)  # pylint: disable=line-too-long
  Instead, this function selects `box_outputs` by `class_targets` so that it
  doesn't need to expand `box_targets`.

  The loss computation has two parts: (1) classification loss is softmax on all
  RoIs. (2) box loss is smooth L1-loss on only positive samples of RoIs.
  Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long


  Args:
    class_outputs: a float tensor representing the class prediction for each box
      with a shape of [batch_size, num_boxes, num_classes].
    box_outputs: a float tensor representing the box prediction for each box
      with a shape of [batch_size, num_boxes, num_classes * 4].
    class_targets: a float tensor representing the class label for each box
      with a shape of [batch_size, num_boxes].
    box_targets: a float tensor representing the box label for each box
      with a shape of [batch_size, num_boxes, 4].
    params: the dictionary including training parameters specified in the
      default_hparams function in this file.
  Returns:
    total_loss: a float tensor representing total loss reducing from
      class and box losses from all levels.
    cls_loss: a float tensor representing total class loss.
    box_loss: a float tensor representing total box regression loss.
  """
  with tf.name_scope('fast_rcnn_loss'):
    class_targets = tf.to_int32(class_targets)
    class_targets_one_hot = tf.one_hot(class_targets, params['num_classes'])
    class_loss = _fast_rcnn_class_loss(
        class_outputs, class_targets_one_hot)

    # Select the box from `box_outputs` that corresponds to `class_targets`,
    # i.e. the class with which the RoI has the maximum overlap.
    batch_size, num_rois, _ = box_outputs.get_shape().as_list()
    box_outputs = tf.reshape(box_outputs,
                             [batch_size, num_rois, params['num_classes'], 4])

    box_indices = tf.reshape(
        class_targets + tf.tile(
            tf.expand_dims(
                tf.range(batch_size) * num_rois * params['num_classes'], 1),
            [1, num_rois]) + tf.tile(
                tf.expand_dims(tf.range(num_rois) * params['num_classes'], 0),
                [batch_size, 1]), [-1])

    box_outputs = tf.matmul(
        tf.one_hot(
            box_indices,
            batch_size * num_rois * params['num_classes'],
            dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
    box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])

    box_loss = (params['fast_rcnn_box_loss_weight'] *
                _fast_rcnn_box_loss(box_outputs, box_targets, class_targets))
    total_loss = class_loss + box_loss
    return total_loss, class_loss, box_loss
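The one-hot matmul above is just a per-RoI gather of the 4 box coordinates predicted for the target class. A minimal NumPy sketch of the same selection, with dummy shapes chosen only for illustration:

import numpy as np

# Hypothetical sizes; the real ones come from `params` and the RoI sampler.
batch_size, num_rois, num_classes = 2, 3, 5
box_outputs = np.random.randn(batch_size, num_rois, num_classes * 4).astype(np.float32)
class_targets = np.random.randint(0, num_classes, size=(batch_size, num_rois))

# For each RoI keep only the box predicted for its target class.
boxes = box_outputs.reshape(batch_size, num_rois, num_classes, 4)
selected = np.take_along_axis(
    boxes, class_targets[:, :, None, None], axis=2)[:, :, 0, :]
print(selected.shape)  # (batch_size, num_rois, 4), same as the gathered box_outputs above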
Example #2
def GetTargetSpec(
        name,
        num_dims=100,
        t_dof=1.0,
        regression_dataset="covertype",
        regression_num_points=0,
        regression_normalize=False,
        regression_hier_type="none",  # none, centered, non_centered
        regression_beta_prior="normal",  # normal, student_t
        regression_type="regular",  # regular, gamma_scales
        regression_use_beta_scales=True,
        eig_source="linear",
        batch_size=0,
        regression_stochastic_points=0,
        gamma_shape=0.5,
        precomputed_stats_path=None,
        **kwargs):
    if name == "funnel":
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-4.0,
                          x_max=4.0,
                          y_min=-10.0,
                          y_max=10.0,
                          stats=None,
                          bijector=None)

        def funnel_forward(x):
            shift = tf.zeros_like(x)
            log_scale = tf.concat([
                tf.zeros_like(x[Ellipsis, :1]),
                tf.tile(x[Ellipsis, :1], [1, num_dims - 1])
            ], -1)
            return shift, log_scale

        mg = tfd.MultivariateNormalDiag(loc=tf.zeros(num_dims),
                                        scale_identity_multiplier=1.0)
        target = tfd.TransformedDistribution(
            mg, bijector=tfb.MaskedAutoregressiveFlow(funnel_forward))
    elif name == "ill_cond_gaussian":
        # For backwards compatibility with earlier experiments.
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-5.0,
                          x_max=5.0,
                          y_min=-5.0,
                          y_max=5.0,
                          stats=None,
                          bijector=None)
        rng = np.random.RandomState(seed=10)
        diag_precisions = np.linspace(1., 1000., num_dims)**-1
        q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))
        scg_prec = (q * diag_precisions).dot(q.T)
        scg_prec = scg_prec.astype(np.float32)
        scg_var = np.linalg.inv(scg_prec) / 1000.0
        target = tfd.MultivariateNormalFullCovariance(
            loc=tf.zeros(num_dims), covariance_matrix=scg_var)
    elif name == "new_ill_cond_gaussian":
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-5.0,
                          x_max=5.0,
                          y_min=-5.0,
                          y_max=5.0,
                          stats=None,
                          bijector=None)
        rng = np.random.RandomState(seed=10)
        if eig_source == "linear":
            eigenvalues = np.linspace(1., 1000., num_dims)**-1
        elif eig_source == "gamma":
            eigenvalues = np.sort(
                rng.gamma(shape=gamma_shape, scale=1.,
                          size=num_dims)).astype(np.float32)
        q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))
        covariance = (q * eigenvalues**-1).dot(q.T).astype(np.float32)
        target = tfd.MultivariateNormalFullCovariance(
            loc=tf.zeros(num_dims), covariance_matrix=covariance)
    elif name == "ill_cond_t":
        # For backwards compatibility with earlier experiments.
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-10.0,
                          x_max=10.0,
                          y_min=-10.0,
                          y_max=10.0,
                          stats=None,
                          bijector=None)
        rng = np.random.RandomState(seed=10)
        diag_precisions = np.linspace(1., 1000., num_dims)**-1
        q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))
        scg_prec = (q * diag_precisions).dot(q.T)
        scg_prec = scg_prec.astype(np.float32)
        scg_var = np.linalg.inv(scg_prec) / 1000.0

        scale = tf.linalg.LinearOperatorFullMatrix(scg_var)
        target = tfd.MultivariateStudentTLinearOperator(loc=tf.zeros(num_dims),
                                                        scale=scale,
                                                        df=t_dof)
    elif name == "new_ill_cond_t":
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-5.0,
                          x_max=5.0,
                          y_min=-5.0,
                          y_max=5.0,
                          stats=None,
                          bijector=None)
        rng = np.random.RandomState(seed=10)
        if eig_source == "linear":
            eigenvalues = np.linspace(1., 1000., num_dims)**-1
        elif eig_source == "gamma":
            eigenvalues = np.sort(rng.gamma(shape=0.5, scale=1.,
                                            size=num_dims)).astype(np.float32)
        q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))
        covariance = (q * eigenvalues**-1).dot(q.T).astype(np.float32)

        scale = tf.linalg.LinearOperatorFullMatrix(covariance)
        target = tfd.MultivariateStudentTLinearOperator(loc=tf.zeros(num_dims),
                                                        scale=scale,
                                                        df=t_dof)
    elif name == "logistic_reg":
        if regression_hier_type == "none":
            extra_dims = 0
        else:
            extra_dims = 2

        if regression_dataset == "covertype":
            x, y = utils.LoadCovertype()
            if regression_num_points > 0:
                rng = np.random.RandomState(seed=10)
                chosen_rows = rng.choice(x.shape[0],
                                         regression_num_points,
                                         replace=False)
                x = x[chosen_rows]
                y = y[chosen_rows]

            num_features = x.shape[-1] + 1
            num_classes = 7
            num_dims = num_features * num_classes + extra_dims

            x = tf.to_float(x)
            y = tf.to_int32(y)
        elif regression_dataset == "german":
            x, y = utils.LoadGerman()

            num_features = int(x.shape[-1]) + 1
            num_classes = 2
            num_dims = num_features * num_classes + extra_dims

            x = tf.to_float(x)
            y = tf.to_int32(y)

            if regression_num_points > 0:
                rng = np.random.RandomState(seed=10)
                chosen_rows = rng.choice(x.shape[0],
                                         regression_num_points,
                                         replace=False)
                x = tf.gather(x, chosen_rows)
                y = tf.gather(y, chosen_rows)

        if regression_stochastic_points > 0:
            chosen_rows = tf.random.uniform(
                [int(regression_stochastic_points)],
                0,
                int(x.shape[0]),
                dtype=tf.int32)
            x = tf.gather(x, chosen_rows)
            y = tf.gather(y, chosen_rows)

        if regression_normalize:
            x_min = tf.reduce_min(x, 0, keep_dims=True)
            x_max = tf.reduce_max(x, 0, keep_dims=True)

            x /= (x_max - x_min)
            x = 2.0 * x - 1.0

        x = tf.concat([x, tf.ones([int(x.shape[0]), 1])], -1)

        def regular_log_prob_fn(params):
            if regression_hier_type == "none":
                beta = params
                beta_scaled = beta
            elif regression_hier_type == "centered":
                mu_0 = params[Ellipsis, -1]
                tau_0 = tf.nn.softplus(params[Ellipsis, -2])
                beta = params[Ellipsis, :-2]
                beta_scaled = beta
            elif regression_hier_type == "non_centered":
                mu_0 = params[Ellipsis, -1]
                tau_0 = tf.nn.softplus(params[Ellipsis, -2])
                beta = params[Ellipsis, :-2]
                beta_scaled = beta / tf.expand_dims(
                    tau_0, -1) + tf.expand_dims(mu_0, -1)
            else:
                raise ValueError("Unknown regression_hier_type:" +
                                 regression_hier_type)

            if batch_size:

                def body(_, i):
                    y_dist = tfd.Categorical(logits=tf.einsum(
                        "ij,kjm->kim", x[i:i + batch_size],
                        tf.reshape(beta_scaled,
                                   [-1, num_features, num_classes])))
                    return tf.reduce_sum(y_dist.log_prob(y[i:i + batch_size]),
                                         -1)

                log_prob = tf.reduce_sum(
                    tf.scan(body,
                            tf.range(0, x.shape[0], batch_size),
                            initializer=tf.zeros(tf.shape(params)[:1]),
                            parallel_iterations=1), 0)
            else:
                y_dist = tfd.Categorical(logits=tf.einsum(
                    "ij,kjm->kim", x,
                    tf.reshape(beta_scaled, [-1, num_features, num_classes])))
                log_prob = tf.reduce_sum(y_dist.log_prob(y), -1)

            def make_beta_dist(loc, scale):
                if regression_beta_prior == "normal":
                    return tfd.Normal(loc=loc, scale=scale)
                else:
                    if tf.convert_to_tensor(loc).shape.ndims == 0:
                        loc = tf.fill(
                            tf.stack([
                                tf.shape(params)[0], num_features * num_classes
                            ]), loc)
                    if tf.convert_to_tensor(scale).shape.ndims == 0:
                        scale = tf.fill(
                            tf.stack([
                                tf.shape(params)[0], num_features * num_classes
                            ]), scale)

                    scale = tf.linalg.LinearOperatorDiag(scale)
                    return tfd.MultivariateStudentTLinearOperator(loc=loc,
                                                                  scale=scale,
                                                                  df=t_dof)

            if regression_hier_type == "none":
                beta_dist = make_beta_dist(loc=0.0, scale=10.0)
            else:
                mu_0_dist = tfd.Normal(loc=0.0, scale=10.0)
                tau_0_dist = tfd.Gamma(2.0, 1.0)
                log_prob += mu_0_dist.log_prob(mu_0) + tau_0_dist.log_prob(
                    tau_0)

                if regression_hier_type == "centered":
                    mu_0 = tf.tile(tf.expand_dims(mu_0, -1),
                                   [1, num_features * num_classes])
                    tau_0 = tf.tile(tf.expand_dims(tau_0, -1),
                                    [1, num_features * num_classes])
                    beta_dist = make_beta_dist(loc=mu_0, scale=1.0 / tau_0)
                elif regression_hier_type == "non_centered":
                    beta_dist = make_beta_dist(loc=0.0, scale=1.0)
            log_prob += tf.reduce_sum(beta_dist.log_prob(beta), -1)
            return log_prob

        def gamma_scales_log_prob_fn(params):
            assert num_classes == 2

            def unmarshal(params):
                results = []
                n_dimensions_used = 0
                if regression_use_beta_scales:
                    dim_list = [num_features, num_features, 1]
                else:
                    dim_list = [num_features, 1]
                for n_to_add in dim_list:
                    results.append(
                        params[Ellipsis,
                               n_dimensions_used:n_dimensions_used + n_to_add])
                    n_dimensions_used += n_to_add
                return tuple(results)

            log_prob = 0.
            if regression_use_beta_scales:
                beta, beta_log_scales, overall_log_scale = unmarshal(params)
                # p(per-variable scales)
                log_prob += tf.reduce_sum(
                    tfd.TransformedDistribution(
                        tfd.Gamma(0.5, 0.5),
                        tfb.Invert(tfb.Exp())).log_prob(beta_log_scales), -1)
            else:
                beta, overall_log_scale = unmarshal(params)
                beta_log_scales = 0.0
            # p(overall scale)
            log_prob += tf.reduce_sum(
                tfd.Normal(0., 10.).log_prob(overall_log_scale), -1)
            # p(beta)
            log_prob += tf.reduce_sum(tfd.Normal(0., 1.).log_prob(beta), -1)
            # p(y | x, beta)
            scaled_beta = beta * tf.exp(overall_log_scale) * tf.exp(
                beta_log_scales)
            if batch_size:

                def body(_, i):
                    logits = tf.einsum("nd,md->mn", x[i:i + batch_size],
                                       scaled_beta)
                    return tf.reduce_sum(
                        tfd.Bernoulli(logits=logits).log_prob(
                            y[i:i + batch_size]), -1)

                log_prob += tf.reduce_sum(
                    tf.scan(body,
                            tf.range(0, x.shape[0], batch_size),
                            initializer=tf.zeros(tf.shape(params)[:1]),
                            parallel_iterations=1), 0)
            else:
                logits = tf.einsum("nd,md->mn", x, scaled_beta)
                log_prob += tf.reduce_sum(
                    tfd.Bernoulli(logits=logits).log_prob(y), -1)
            return log_prob

        def horseshoe_log_prob_fn(params):
            assert num_classes == 2

            (z, r1_local, r2_local, r1_global, r2_global) = tf.split(
                params, [num_features, num_features, num_features, 1, 1],
                axis=-1)

            def indep(d):
                return tfd.Independent(d, 1)

            zero = tf.zeros(num_features)
            one = tf.ones(num_features)
            half = 0.5 * one

            p_z = indep(tfd.Normal(zero, one))
            p_r1_local = indep(tfd.HalfNormal(one))
            p_r2_local = indep(tfd.InverseGamma(half, half))

            p_r1_global = indep(tfd.HalfNormal([1.]))
            p_r2_global = indep(tfd.InverseGamma([0.5], [0.5]))

            log_prob = (p_z.log_prob(z) + p_r1_local.log_prob(r1_local) +
                        p_r2_local.log_prob(r2_local) +
                        p_r1_global.log_prob(r1_global) +
                        p_r2_global.log_prob(r2_global))

            lambda_ = r1_local * tf.sqrt(r2_local)
            tau = r1_global * tf.sqrt(r2_global)
            beta = z * lambda_ * tau

            if batch_size:

                def body(_, i):
                    logits = tf.einsum("nd,md->mn", x[i:i + batch_size], beta)
                    return tfd.Independent(tfd.Bernoulli(logits=logits),
                                           1).log_prob(y[i:i + batch_size])

                log_prob += tf.reduce_sum(
                    tf.scan(body,
                            tf.range(0, x.shape[0], batch_size),
                            initializer=tf.zeros(tf.shape(params)[:1]),
                            parallel_iterations=1), 0)
            else:
                logits = tf.einsum("nd,md->mn", x, beta)
                log_prob += tfd.Independent(tfd.Bernoulli(logits=logits),
                                            1).log_prob(y)
            return log_prob

        def gamma_scales2_log_prob_fn(params):
            assert num_classes == 2

            (z, local_scale,
             global_scale) = tf.split(params, [num_features, num_features, 1],
                                      axis=-1)

            def indep(d):
                return tfd.Independent(d, 1)

            zero = tf.zeros(num_features)
            one = tf.ones(num_features)
            half = 0.5 * one

            p_z = indep(tfd.Normal(zero, one))
            p_local_scale = indep(tfd.Gamma(half, half))
            p_global_scale = indep(tfd.Gamma([0.5], [0.5]))

            log_prob = (p_z.log_prob(z) + p_local_scale.log_prob(local_scale) +
                        p_global_scale.log_prob(global_scale))

            beta = z * local_scale * global_scale

            if batch_size:

                def body(_, i):
                    logits = tf.einsum("nd,md->mn", x[i:i + batch_size], beta)
                    return tfd.Independent(tfd.Bernoulli(logits=logits),
                                           1).log_prob(y[i:i + batch_size])

                log_prob += tf.reduce_sum(
                    tf.scan(body,
                            tf.range(0, x.shape[0], batch_size),
                            initializer=tf.zeros(tf.shape(params)[:1]),
                            parallel_iterations=1), 0)
            else:
                logits = tf.einsum("nd,md->mn", x, beta)
                log_prob += tfd.Independent(tfd.Bernoulli(logits=logits),
                                            1).log_prob(y)
            return log_prob

        bijector = None
        if regression_type == "regular":
            log_prob_fn = regular_log_prob_fn
        elif regression_type == "gamma_scales":
            log_prob_fn = gamma_scales_log_prob_fn
            num_dims = num_features + 1
            if regression_use_beta_scales:
                num_dims += num_features
        elif regression_type == "horseshoe":
            log_prob_fn = horseshoe_log_prob_fn
            num_dims = num_features * 3 + 2
            bijector = tfb.Blockwise(
                [tfb.Identity(), tfb.Exp()],
                [num_features, num_features * 2 + 2])
        elif regression_type == "gamma_scales2":
            log_prob_fn = gamma_scales2_log_prob_fn
            num_dims = num_features * 2 + 1
            bijector = tfb.Blockwise(
                [tfb.Identity(), tfb.Exp()], [num_features, num_features + 1])

        target = utils.LogProbDist(num_dims=num_dims, log_prob_fn=log_prob_fn)
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=0.10,
                          x_max=0.15,
                          y_min=0.10,
                          y_max=0.15,
                          stats=None,
                          bijector=bijector)
    elif name == "mog":
        comp_1 = tfd.MultivariateNormalDiag(loc=[-1., 1.] + [0.] *
                                            (num_dims - 2),
                                            scale_identity_multiplier=2.)
        comp_2 = tfd.MultivariateNormalDiag(loc=[1., 1.] + [0.] *
                                            (num_dims - 2),
                                            scale_identity_multiplier=4.)
        comp_3 = tfd.MultivariateNormalDiag(loc=[0., 0.] + [0.] *
                                            (num_dims - 2),
                                            scale_identity_multiplier=2.)
        cat = tfd.Categorical(logits=[0] * 3)
        target = tfd.Mixture(cat=cat, components=[comp_1, comp_2, comp_3])
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-2.,
                          x_max=2.,
                          y_min=-2.,
                          y_max=2.,
                          stats=None,
                          bijector=None)
    elif name == "easy_gaussian":
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=-5.0,
                          x_max=5.0,
                          y_min=-5.0,
                          y_max=5.0,
                          stats=None,
                          bijector=None)
        rng = np.random.RandomState(seed=10)
        eigenvalues = np.linspace(0.5, 2., num_dims)**-1
        q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))
        covariance = (q * eigenvalues**-1).dot(q.T).astype(np.float32)
        target = tfd.MultivariateNormalFullCovariance(
            loc=tf.zeros(num_dims), covariance_matrix=covariance)
    elif name == "gp_reg":
        x, y = utils.LoadCloud()

        if regression_num_points > 0:
            rng = np.random.RandomState(seed=10)
            chosen_rows = rng.choice(x.shape[0],
                                     regression_num_points,
                                     replace=False)
            x = x[chosen_rows]
            y = y[chosen_rows]

        x = tf.convert_to_tensor(x, dtype=tf.float32)
        y = tf.convert_to_tensor(y, dtype=tf.float32)

        num_features = int(x.shape[-1])
        num_dims = num_features + 2

        def log_prob_fn(params):
            rho, alpha, sigma = tf.split(params, [num_features, 1, 1], -1)

            one = tf.ones(num_features)

            def indep(d):
                return tfd.Independent(d, 1)

            p_rho = indep(tfd.InverseGamma(5. * one, 5. * one))
            p_alpha = indep(tfd.HalfNormal([1.]))
            p_sigma = indep(tfd.HalfNormal([1.]))

            rho_shape = tf.shape(rho)
            alpha_shape = tf.shape(alpha)

            x1 = tf.expand_dims(x, -2)
            x2 = tf.expand_dims(x, -3)
            exp = -0.5 * tf.squared_difference(x1, x2)
            exp /= tf.reshape(
                tf.square(rho),
                tf.concat([rho_shape[:1], [1, 1], rho_shape[1:]], 0))
            exp = tf.reduce_sum(exp, -1, keep_dims=True)
            exp += 2. * tf.reshape(
                tf.log(alpha),
                tf.concat([alpha_shape[:1], [1, 1], alpha_shape[1:]], 0))
            exp = tf.exp(exp[Ellipsis, 0])
            exp += tf.matrix_diag(
                tf.tile(tf.square(sigma), [1, int(x.shape[0])]) + 1e-6)
            exp = tf.check_numerics(exp, "exp 2 has NaNs")
            with tf.control_dependencies([tf.print(exp[0], summarize=99999)]):
                exp = tf.identity(exp)

            p_y = tfd.MultivariateNormalFullCovariance(covariance_matrix=exp)

            log_prob = (p_rho.log_prob(rho) + p_alpha.log_prob(alpha) +
                        p_sigma.log_prob(sigma) + p_y.log_prob(y))

            return log_prob

        bijector = tfb.Softplus()  #tfb.Exp()
        target = utils.LogProbDist(num_dims=num_dims, log_prob_fn=log_prob_fn)
        spec = TargetSpec(name=name,
                          num_dims=num_dims,
                          x_min=0.10,
                          x_max=0.15,
                          y_min=0.10,
                          y_max=0.15,
                          stats=None,
                          bijector=bijector)

    if precomputed_stats_path is not None:
        with tf.gfile.Open(precomputed_stats_path) as f:
            stats = simplejson.load(f)
            stats = {k: np.array(v) for k, v in stats.items()}
            spec = spec._replace(stats=stats)

    return target, spec._replace(**kwargs)
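The ill-conditioned Gaussian targets above all follow the same recipe: draw eigenvalues, rotate them with a random orthogonal factor from a QR decomposition, and use the result as a covariance. A standalone NumPy sketch of that construction (same seed and linear eigenvalue spacing as the code; the condition-number check is added here only for illustration):

import numpy as np

num_dims = 100
rng = np.random.RandomState(seed=10)

eigenvalues = np.linspace(1., 1000., num_dims) ** -1        # precisions 1/1 .. 1/1000
q, _ = np.linalg.qr(rng.randn(num_dims, num_dims))          # random orthogonal rotation
covariance = (q * eigenvalues ** -1).dot(q.T).astype(np.float32)

# Covariance eigenvalues run from 1 to 1000, so the condition number is ~1000.
print(np.linalg.cond(covariance))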
Example #3
def resampler_with_unstacked_warp(data,
                                  warp_x,
                                  warp_y,
                                  safe=True,
                                  name='resampler'):
    """Resamples input data at user defined coordinates.

    Args:
      data: Tensor of shape `[batch_size, data_height, data_width,
        data_num_channels]` containing 2D data that will be resampled.
      warp_x: Tensor of shape `[batch_size, dim_0, ... , dim_n]` containing the x
        coordinates at which resampling will be performed.
      warp_y: Tensor of the same shape as warp_x containing the y coordinates at
        which resampling will be performed.
      safe: A boolean, if True, warp_x and warp_y will be clamped to their bounds.
        Disable only if you know they are within bounds, otherwise a runtime
        exception will be thrown.
      name: Optional name of the op.

    Returns:
       Tensor of resampled values from `data`. The output tensor shape is
      `[batch_size, dim_0, ... , dim_n, data_num_channels]`.

    Raises:
      ValueError: If warp_x, warp_y and data have incompatible shapes.
    """

    with tf.name_scope(name):
        warp_x = tf.convert_to_tensor(warp_x)
        warp_y = tf.convert_to_tensor(warp_y)
        data = tf.convert_to_tensor(data)
        if not warp_x.shape.is_compatible_with(warp_y.shape):
            raise ValueError(
                'warp_x and warp_y are of incompatible shapes: %s vs %s ' %
                (str(warp_x.shape), str(warp_y.shape)))
        warp_shape = tf.shape(warp_x)
        if warp_x.shape[0] != data.shape[0]:
            raise ValueError(
                '\'warp_x\' and \'data\' must have compatible first '
                'dimension (batch size), but their shapes are %s and %s ' %
                (str(warp_x.shape[0]), str(data.shape[0])))
        # Compute the four points closest to warp with integer value.
        warp_floor_x = tf.floor(warp_x)
        warp_floor_y = tf.floor(warp_y)
        # Compute the weight for each point.
        right_warp_weight = warp_x - warp_floor_x
        down_warp_weight = warp_y - warp_floor_y

        warp_floor_x = tf.to_int32(warp_floor_x)
        warp_floor_y = tf.to_int32(warp_floor_y)
        warp_ceil_x = tf.to_int32(tf.ceil(warp_x))
        warp_ceil_y = tf.to_int32(tf.ceil(warp_y))

        left_warp_weight = tf.subtract(
            tf.convert_to_tensor(1.0, right_warp_weight.dtype),
            right_warp_weight)
        up_warp_weight = tf.subtract(
            tf.convert_to_tensor(1.0, down_warp_weight.dtype),
            down_warp_weight)

        # Extend warps from [batch_size, dim_0, ... , dim_n, 2] to
        # [batch_size, dim_0, ... , dim_n, 3] with the first element in last
        # dimension being the batch index.

        # A shape like warp_shape but with all sizes except the first set to 1:
        warp_batch_shape = tf.concat(
            [warp_shape[0:1], tf.ones_like(warp_shape[1:])], 0)

        warp_batch = tf.reshape(tf.range(warp_shape[0], dtype=tf.int32),
                                warp_batch_shape)

        # Broadcast to match shape:
        warp_batch += tf.zeros_like(warp_y, dtype=tf.int32)
        left_warp_weight = tf.expand_dims(left_warp_weight, axis=-1)
        down_warp_weight = tf.expand_dims(down_warp_weight, axis=-1)
        up_warp_weight = tf.expand_dims(up_warp_weight, axis=-1)
        right_warp_weight = tf.expand_dims(right_warp_weight, axis=-1)

        up_left_warp = tf.stack([warp_batch, warp_floor_y, warp_floor_x],
                                axis=-1)
        up_right_warp = tf.stack([warp_batch, warp_floor_y, warp_ceil_x],
                                 axis=-1)
        down_left_warp = tf.stack([warp_batch, warp_ceil_y, warp_floor_x],
                                  axis=-1)
        down_right_warp = tf.stack([warp_batch, warp_ceil_y, warp_ceil_x],
                                   axis=-1)

        def gather_nd(params, indices):
            return (safe_gather_nd if safe else tf.gather_nd)(params, indices)

        # gather data then take weighted average to get resample result.
        result = ((gather_nd(data, up_left_warp) * left_warp_weight +
                   gather_nd(data, up_right_warp) * right_warp_weight) *
                  up_warp_weight +
                  (gather_nd(data, down_left_warp) * left_warp_weight +
                   gather_nd(data, down_right_warp) * right_warp_weight) *
                  down_warp_weight)
        result_shape = (warp_x.get_shape().as_list() +
                        data.get_shape().as_list()[-1:])
        result.set_shape(result_shape)
        return result
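The four-neighbour weighting above is ordinary bilinear interpolation. A small NumPy sketch of the same computation at one fractional coordinate (toy data, illustrative only):

import numpy as np

data = np.arange(16, dtype=np.float32).reshape(4, 4)   # toy [height, width] image
warp_x, warp_y = 1.3, 2.6                               # fractional sample point

x0, y0 = int(np.floor(warp_x)), int(np.floor(warp_y))
x1, y1 = int(np.ceil(warp_x)), int(np.ceil(warp_y))
right_w, down_w = warp_x - x0, warp_y - y0
left_w, up_w = 1.0 - right_w, 1.0 - down_w

# Weighted average of the four integer neighbours, as in the op above.
value = (up_w * (left_w * data[y0, x0] + right_w * data[y0, x1]) +
         down_w * (left_w * data[y1, x0] + right_w * data[y1, x1]))
print(value)   # 11.7 for this toy image, i.e. 4 * warp_y + warp_x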
Example #4
def unit(w, sparsity):
  """Unit-level magnitude pruning."""
  w_shape = common_layers.shape_list(w)
  count = tf.to_int32(w_shape[-1] * sparsity)
  mask = common_layers.unit_targeting(w, count)
  return (1 - mask) * w
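common_layers.unit_targeting is assumed here to flag the `count` output units (last-axis columns) with the smallest L2 norm, so (1 - mask) * w zeroes those columns. A hedged NumPy sketch of unit-level magnitude pruning under that assumption:

import numpy as np

def unit_prune(w, sparsity):
    # Assumed semantics: drop the lowest-L2-norm output units.
    count = int(w.shape[-1] * sparsity)
    norms = np.linalg.norm(w.reshape(-1, w.shape[-1]), axis=0)
    pruned_units = np.argsort(norms)[:count]
    w = w.copy()
    w[..., pruned_units] = 0.0
    return w

w = np.random.randn(8, 6).astype(np.float32)
print(unit_prune(w, sparsity=0.5))   # 3 of the 6 columns are zeroed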
Example #5
    def get_scheduled_sample_func(self, batch_size):
        """Creates a function for scheduled sampling based on given hparams."""
        with tf.variable_scope("scheduled_sampling_func", reuse=tf.AUTO_REUSE):
            iter_num = self.get_iteration_num()

            # Simple function to bypass scheduled sampling in gt or pred only modes.
            def scheduled_sampling_simple(ground_truth_x, generated_x,
                                          batch_size, scheduled_sample_var):
                del batch_size
                if scheduled_sample_var:
                    return ground_truth_x
                return generated_x

            mode = self.hparams.scheduled_sampling_mode
            if mode == "ground_truth_only":
                scheduled_sampling_func = scheduled_sampling_simple
                scheduled_sampling_func_var = True
            elif mode == "prediction_only":
                scheduled_sampling_func = scheduled_sampling_simple
                scheduled_sampling_func_var = False
            elif mode == "prob":
                decay_steps = self.hparams.scheduled_sampling_decay_steps
                probability = tf.train.polynomial_decay(
                    1.0, iter_num, decay_steps, 0.0)
                scheduled_sampling_func = common_video.scheduled_sample_prob
                scheduled_sampling_func_var = probability
            elif mode == "prob_inverse_exp":
                decay_steps = self.hparams.scheduled_sampling_decay_steps
                probability = common_layers.inverse_exp_decay(decay_steps,
                                                              step=iter_num)
                probability *= self.hparams.scheduled_sampling_max_prob
                probability = 1.0 - probability
                scheduled_sampling_func = common_video.scheduled_sample_prob
                scheduled_sampling_func_var = probability
            elif mode == "prob_inverse_lin":
                decay_steps = self.hparams.scheduled_sampling_decay_steps
                probability = common_layers.inverse_exp_decay(
                    decay_steps // 4, step=iter_num)  # Very low at start.
                probability *= common_layers.inverse_lin_decay(decay_steps,
                                                               step=iter_num)
                probability *= self.hparams.scheduled_sampling_max_prob
                probability = 1.0 - probability
                scheduled_sampling_func = common_video.scheduled_sample_prob
                scheduled_sampling_func_var = probability
            elif mode == "count":
                # Calculate number of ground-truth frames to pass in.
                k = self.hparams.scheduled_sampling_k
                num_ground_truth = tf.to_int32(
                    tf.round(
                        tf.to_float(batch_size) *
                        (k /
                         (k + tf.exp(tf.to_float(iter_num) / tf.to_float(k)))))
                )
                scheduled_sampling_func = common_video.scheduled_sample_count
                scheduled_sampling_func_var = num_ground_truth
            else:
                raise ValueError("unknown scheduled sampling method: %s" %
                                 mode)

            if isinstance(scheduled_sampling_func_var, tf.Tensor):
                tf.summary.scalar("scheduled_sampling_var",
                                  scheduled_sampling_func_var)
            partial_func = functools.partial(
                scheduled_sampling_func,
                batch_size=batch_size,
                scheduled_sample_var=scheduled_sampling_func_var)
            return partial_func
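In the "count" mode above, the expected fraction of ground-truth frames follows the inverse-sigmoid decay k / (k + exp(step / k)). A plain-Python sketch of that schedule (k = 1000 is a hypothetical value, not a hparam from the code):

import math

def ground_truth_fraction(step, k=1000.0):
    # Inverse-sigmoid decay used by the "count" mode above.
    return k / (k + math.exp(step / k))

for step in [0, 1000, 5000, 10000]:
    print(step, round(ground_truth_fraction(step), 3))
# Starts near 1.0 and decays towards 0.0 as training progresses.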
Example #6
def hier_homography_fmask_estimator(color_inputs, num_param=8, num_layer=7,
                                    num_level=3, dropout_keep_prob=0.8,
                                    reuse=None, is_training=True,
                                    trainable=True,
                                    scope='hier_hmg'):
  """A hierarchical neural network with mask for homograhy estimation.

  Args:
    color_inputs: batch of input image pairs of data type float32 and of shape
      [batch_size, height, width, 6]
    num_param: the number of parameters for homography (default 8)
    num_layer: the number of convolutional layers in the motion feature network
    num_level: the number of hierarchical levels
    dropout_keep_prob: the percentage of activation values that are kept
    reuse: whether to reuse this network weights
    is_training: whether used for training or testing
    trainable: whether this network is to be trained or not
    scope: the scope of variables in this function

  Returns:
    a list of homographies at each level and motion feature maps if
    final_endpoint='mfeature'; otherwise a list of images warped by the list of
    corresponding homographies
  """
  _, h_input, w_input = color_inputs.get_shape().as_list()[0 : 3]
  vgg_inputs = (color_inputs[Ellipsis, 3 : 6] * 256 + 128) - VGG_MEANS

  with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
    with slim.arg_scope([slim.conv2d, slim.fully_connected], trainable=False):
      with slim.arg_scope([slim.conv2d], normalizer_fn=None):
        with slim.arg_scope(contrib_slim_nets_vgg.vgg_arg_scope()):
          sfeature, _ = contrib_slim_nets_vgg.vgg_16(
              vgg_inputs,
              1000,
              predictions_fn=slim.softmax,
              global_pool=False,
              is_training=False,
              reuse=reuse,
              spatial_squeeze=True,
              final_endpoint='pool5',
              scope='vgg_16')

  gray_image1 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 0 : 3])
  gray_image2 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 3 : 6])
  inputs = tf.concat([gray_image1, gray_image2], 3)

  hmgs_list = []
  warped_list = []
  with tf.variable_scope(scope, [inputs], reuse=reuse):
    for level_index in range(num_level):
      scale = 2 ** (num_level - 1 - level_index)
      h = tf.to_float(tf.floordiv(h_input, scale))
      w = tf.to_float(tf.floordiv(w_input, scale))
      inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
      if level_index == 0:
        mfeature = hier_base_layers(inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        hmgs_list.append(hmgs_il)
      else:
        warped, _ = hmg_util.homography_scale_warp_per_batch(
            inputs_il[:, :, :, 0], w / 2, h / 2, hmgs_list[level_index - 1])
        pre_warped_inputs_il = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
        warped_list.append(pre_warped_inputs_il)
        mfeature = hier_base_layers(pre_warped_inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        if level_index == num_level - 1:
          mfeature = fmask_layers_semantic(mfeature, sfeature, level_index,
                                           is_training=is_training,
                                           trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        new_hmgs_il = hmg_util.homography_shift_mult_batch(
            hmgs_list[level_index - 1], w / 2, h / 2, hmgs_il, w, h, w, h)
        hmgs_list.append(new_hmgs_il)
  return hmgs_list, warped_list
Example #7
 def DecodeLabelAndImage(r):
     r = tf.decode_raw(r, tf.uint8)
     return tf.to_float(
         tf.transpose(tf.reshape(r[1:], [3, 32, 32]),
                      [1, 2, 0])) / 255.0, tf.to_int32(r[0])
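The decoder above assumes a CIFAR-10-style binary record: one label byte followed by 3 * 32 * 32 pixel bytes in channel-major order. A NumPy sketch of the same decoding on a synthetic record (illustrative only):

import numpy as np

record = np.random.randint(0, 256, size=1 + 3 * 32 * 32, dtype=np.uint8)

label = int(record[0])
# Pixels are stored as [3, 32, 32]; transpose to HWC and scale to [0, 1].
image = record[1:].reshape(3, 32, 32).transpose(1, 2, 0).astype(np.float32) / 255.0
print(label, image.shape)   # (32, 32, 3)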
Example #8
    def compute_loss(self, y_true, y_pred):
        """Compute mutlibox loss.

        # Arguments
            y_true: Ground truth targets,
                tensor of shape (?, num_boxes, 4 + num_classes + 8),
                priors in ground truth are fictitious,
                y_true[:, :, -8] has 1 if prior should be penalized
                    or in other words is assigned to some ground truth box,
                y_true[:, :, -7:] are all 0.
            y_pred: Predicted logits,
                tensor of shape (?, num_boxes, 4 + num_classes + 8).

        # Returns
            loss: Loss for prediction, tensor of shape (?,).
        """
        batch_size = tf.shape(y_true)[0]
        num_boxes = tf.to_float(tf.shape(y_true)[1])

        # loss for all priors
        conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

        # get positives loss
        num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
        pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], axis=1)
        pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], axis=1)

        # get negatives loss, we penalize only confidence here
        num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
        pos_num_neg_mask = tf.greater(num_neg, 0)
        has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
        num_neg = tf.concat(
            axis=0,
            values=[num_neg, [(1 - has_min) * self.negatives_for_hard]])
        num_neg_batch = tf.reduce_min(
            tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
        num_neg_batch = tf.to_int32(num_neg_batch)
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1
        max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], axis=2)
        _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                                 k=num_neg_batch)
        batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
        batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
        full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                        tf.reshape(indices, [-1]))
        # full_indices = tf.concat(2, [tf.expand_dims(batch_idx, 2),
        #                              tf.expand_dims(indices, 2)])
        # neg_conf_loss = tf.gather_nd(conf_loss, full_indices)
        neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), full_indices)
        neg_conf_loss = tf.reshape(neg_conf_loss, [batch_size, num_neg_batch])
        neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

        # loss is sum of positives and negatives
        total_loss = pos_conf_loss + neg_conf_loss
        total_loss /= (num_pos + tf.to_float(num_neg_batch))
        num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                           tf.ones_like(num_pos))
        total_loss += (self.alpha * pos_loc_loss) / num_pos
        return total_loss
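The index arithmetic above (batch index times num_boxes plus the within-image top-k index) lets a single tf.gather pull the hard-negative losses out of the flattened [batch_size * num_boxes] confidence loss. A NumPy sketch of that hard-negative selection with toy numbers:

import numpy as np

batch_size, num_boxes, num_neg = 2, 5, 2
conf_loss = np.random.rand(batch_size, num_boxes).astype(np.float32)
max_confs = np.random.rand(batch_size, num_boxes).astype(np.float32)   # per-box max class score
positive_mask = np.zeros((batch_size, num_boxes), np.float32)
positive_mask[:, 0] = 1.0                     # pretend box 0 is a matched positive

# Most confident negatives per image (mirrors tf.nn.top_k above).
indices = np.argsort(-(max_confs * (1.0 - positive_mask)), axis=1)[:, :num_neg]

# (image, box) pairs -> indices into the flattened loss tensor.
batch_idx = np.repeat(np.arange(batch_size), num_neg)
full_indices = batch_idx * num_boxes + indices.reshape(-1)
neg_conf_loss = conf_loss.reshape(-1)[full_indices].reshape(batch_size, num_neg)
print(neg_conf_loss.sum(axis=1))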
Example #9
    def body(self,
             features,
             decode_step=None,
             cache=None,
             decoding_stats=None,
             add_summary=True):
        encoder_output = None
        extra_losses = []
        padding_bias = None
        if not self.hparams.fast_decode:
            decode_step = None
        if "inputs" in features:
            inputs = features["inputs"]
            # remove the last two dimensions that are always 1.
            inputs = tf.reshape(
                inputs,
                utils.shape_list(inputs)[:2] + [self.hidden_size])
            # Padding bias only used for seq2seq models.
            padding_bias = utils.embedding_to_padding(inputs)
            # Mask random positions
            shape = utils.shape_list(inputs)
            if self.hparams.input_dropout:
                inputs = tf.where(
                    tf.random.uniform(shape) < self.hparams.input_dropout,
                    tf.zeros_like(inputs), inputs)
            if self.hparams.add_timing_signal:
                inputs += utils.get_timing_signal_1d(self.hparams.max_length,
                                                     self.hidden_size)
            if cache is not None and -1 in cache:
                encoder_output = cache[-1]
            else:
                encoder_output = utils.transformer_encoder_layers(
                    inputs=inputs,
                    num_layers=self.num_encoder_layers,
                    hparams=self.hparams,
                    losses=extra_losses,
                    name="encoder",
                    token_bias=features.get("token_bias_inputs"),
                    padding_bias=padding_bias)
            if cache is not None and -1 not in cache:
                cache[-1] = encoder_output
        targets = tf.to_int32(features["targets"])
        # remove the last two dimensions that are always 1.
        targets = tf.reshape(targets, utils.shape_list(targets)[:2])
        # Clamp targets to max_target_length
        targets = targets[:, :self.hparams.max_target_length]
        if self.is_decode:
            targets = self.process_partial_targets_decoding(targets)
        decoder_input = self.prepare_decoder(targets)

        decoder_output = utils.transformer_decoder_layers(
            inputs=decoder_input,
            num_layers=self.num_decoder_layers,
            hparams=self.hparams,
            encoder_output=encoder_output,
            decode_step=decode_step,
            losses=extra_losses,
            cache=cache,
            name="decoder",
            decoding_stats=decoding_stats,
            token_bias_inputs=features.get("token_bias_inputs"),
            token_bias_targets=features.get("token_bias_targets"),
            padding_bias=padding_bias)
        logits = self.produce_output(decoder_output)

        # Return logits as-is in decoding mode
        if self.is_decode:
            return logits

        # Add cross entropy loss
        one_hot_targets = tf.one_hot(tf.cast(targets, dtype=tf.int32),
                                     self.vocab_size)
        x_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_targets, logits=logits)
        weights = tf.to_float(tf.not_equal(targets, 0))
        loss = tf.reduce_sum(x_entropy * weights) / tf.reduce_sum(weights)
        if add_summary:
            tf.summary.scalar("losses/weight", tf.reduce_sum(weights))
            tf.summary.scalar("losses/x_entropy",
                              tf.reduce_sum(x_entropy * weights))

        loss_dict = {"training": loss}
        if extra_losses:
            loss_dict["extra_loss"] = tf.add_n(extra_losses)
        # hack for T2T metrics
        logits = tf.reshape(
            logits,
            utils.shape_list(logits)[:2] + [1, 1] +
            utils.shape_list(logits)[-1:])
        return logits, loss_dict
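The training loss at the end of body() is a padding-weighted cross entropy: positions whose target id is 0 get zero weight and the sum is renormalised by the number of real tokens. A compact NumPy sketch (toy logits and a hypothetical vocabulary size):

import numpy as np

vocab_size = 7
targets = np.array([[3, 5, 0, 0]])                     # 0 = padding
logits = np.random.randn(1, 4, vocab_size).astype(np.float32)

log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))   # log softmax
x_entropy = -np.take_along_axis(log_probs, targets[..., None], axis=-1)[..., 0]

weights = (targets != 0).astype(np.float32)
loss = (x_entropy * weights).sum() / weights.sum()
print(loss)   # averaged over the two non-padding positions only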
Example #10
def randomly_crop_and_resize(image,
                             masks,
                             boxes,
                             keypoints,
                             image_size,
                             probability=0.5):
    """
    Arguments:
        image: a float tensor with shape [height, width, 3].
        masks: a float tensor with shape [height / DOWNSAMPLE, width / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
        image_size: a tuple of integers (h, w).
        probability: a float number.
    Returns:
        image: a float tensor with shape [h, w, 3].
        masks: a float tensor with shape [h / DOWNSAMPLE, w / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_remaining, 4].
        keypoints: an int tensor with shape [num_remaining, 17, 3].
    """

    shape = tf.to_float(tf.shape(image))
    height, width = shape[0], shape[1]
    scaler = tf.stack([height, width, height, width])
    boxes /= scaler  # to the [0, 1] range

    def crop(image, boxes, keypoints):
        """
        Arguments:
            image: a float tensor with shape [height, width, 3].
            boxes: a float tensor with shape [num_persons, 4].
            keypoints: an int tensor with shape [num_persons, 17, 3].
        Returns:
            image: a float tensor with shape [None, None, 3].
            boxes: a float tensor with shape [num_remaining, 4].
            keypoints: an int tensor with shape [num_remaining, 17, 3].
            window: a float tensor with shape [4].
        """

        image, boxes, window, keep_indices = random_image_crop(
            image,
            boxes,
            min_object_covered=0.9,
            aspect_ratio_range=(0.95, 1.05),
            area_range=(0.5, 1.0),
            overlap_threshold=OVERLAP_THRESHOLD)

        keypoints = tf.gather(keypoints, keep_indices)
        # it has shape [num_remaining, 17, 3]

        ymin, xmin, ymax, xmax = tf.unstack(window * scaler)
        points, v = tf.split(keypoints, [2, 1], axis=2)
        points = tf.to_float(points)  # shape [num_remaining, 17, 2]

        translation = tf.stack([ymin, xmin])
        points = tf.to_int32(tf.round(points - translation))
        keypoints = tf.concat([points, v], axis=2)

        # note that after this some keypoints will be invisible,
        # so we need to modify the `v` vector later

        return image, boxes, keypoints, window

    whole_image_window = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32)
    do_it = tf.less(tf.random_uniform([]), probability)

    image, boxes, keypoints, window = tf.cond(
        do_it, lambda: crop(image, boxes, keypoints), lambda:
        (image, boxes, keypoints, whole_image_window))

    def correct_keypoints(image_shape, keypoints):
        """
        Arguments:
            image_shape: an int tensor with shape [3].
            keypoints: an int tensor with shape [num_persons, 17, 3].
        Returns:
            an int tensor with shape [num_persons, 17, 3].
        """
        y, x, v = tf.split(keypoints, 3, axis=2)

        height = image_shape[0]
        width = image_shape[1]

        coordinate_violations = tf.concat([
            tf.less(y, 0),
            tf.less(x, 0),
            tf.greater_equal(y, height),
            tf.greater_equal(x, width)
        ],
                                          axis=2)  # shape [num_persons, 17, 4]

        valid_indicator = tf.logical_not(
            tf.reduce_any(coordinate_violations, axis=2))
        valid_indicator = tf.expand_dims(valid_indicator, 2)
        # it has shape [num_persons, 17, 1]

        v *= tf.to_int32(valid_indicator)
        keypoints = tf.concat([y, x, v], axis=2)
        return keypoints

    def rescale(boxes, keypoints, old_shape, new_shape):
        """
        Arguments:
            boxes: a float tensor with shape [num_persons, 4].
            keypoints: an int tensor with shape [num_persons, 17, 3].
            old_shape, new_shape: int tensors with shape [3].
        Returns:
            a float tensor with shape [num_persons, 4].
            an int tensor with shape [num_persons, 17, 3].
        """
        points, v = tf.split(keypoints, [2, 1], axis=2)
        points = tf.to_float(points)

        old_shape = tf.to_float(old_shape)
        new_shape = tf.to_float(new_shape)
        old_height, old_width = old_shape[0], old_shape[1]
        new_height, new_width = new_shape[0], new_shape[1]

        scaler = tf.stack([new_height / old_height, new_width / old_width])
        points *= scaler

        scaler = tf.stack([new_height, new_width])
        scaler = tf.concat(2 * [scaler], axis=0)
        boxes *= scaler

        new_height = tf.to_int32(new_height)
        new_width = tf.to_int32(new_width)

        points = tf.to_int32(tf.round(points))
        y, x = tf.split(points, 2, axis=2)
        y = tf.clip_by_value(y, 0, new_height - 1)
        x = tf.clip_by_value(x, 0, new_width - 1)
        keypoints = tf.concat([y, x, v], axis=2)
        return boxes, keypoints

    old_shape = tf.shape(image)
    keypoints = correct_keypoints(old_shape, keypoints)

    h, w = image_size  # image size that will be used for training
    image = tf.image.resize_images(image, [h, w], method=RESIZE_METHOD)

    masks_height = tf.to_int32(tf.ceil(h / DOWNSAMPLE))
    masks_width = tf.to_int32(tf.ceil(w / DOWNSAMPLE))

    masks = tf.image.crop_and_resize(image=tf.expand_dims(masks, 0),
                                     boxes=tf.expand_dims(window, 0),
                                     box_indices=tf.constant([0],
                                                             dtype=tf.int32),
                                     crop_size=[masks_height, masks_width],
                                     method='nearest')
    masks = masks[0]

    boxes, keypoints = rescale(boxes, keypoints, old_shape, tf.shape(image))
    return image, masks, boxes, keypoints
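After a crop, keypoints that fall outside the new image stay in the tensor but have their visibility flag `v` zeroed (the correct_keypoints helper above). A NumPy sketch of that correction on toy keypoints:

import numpy as np

height, width = 100, 120
# [1 person, 3 keypoints, (y, x, v)]; toy values, the real tensors have 17 keypoints.
keypoints = np.array([[[10, 15, 2], [-3, 40, 2], [50, 130, 1]]])

y, x, v = keypoints[..., 0], keypoints[..., 1], keypoints[..., 2]
inside = (y >= 0) & (x >= 0) & (y < height) & (x < width)
v = v * inside.astype(keypoints.dtype)      # out-of-frame points become invisible
print(np.stack([y, x, v], axis=-1))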
Example #11
def ae_transformer_internal(inputs,
                            targets,
                            target_space,
                            hparams,
                            cache=None):
    """Main step used for training."""
    # Encoder.
    inputs = common_layers.flatten4d3d(inputs)
    inputs, ed = encode(inputs, target_space, hparams, "input_enc")

    # Autoencoding.
    losses = {"extra": tf.constant(0.0), "latent_pred": tf.constant(0.0)}

    max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1)
    targets, _ = common_layers.pad_to_same_length(
        targets,
        max_targets_len_from_inputs,
        final_length_divisible_by=2**hparams.num_compress_steps)
    targets_c = compress(targets, hparams, "compress")
    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
        # Compress and bottleneck.
        latents_discrete_hot, extra_loss = vq_discrete_bottleneck(
            x=targets_c, hparams=hparams)
        latents_dense = vq_discrete_unbottleneck(latents_discrete_hot,
                                                 hparams=hparams)
        latents_dense = targets_c + tf.stop_gradient(latents_dense - targets_c)
        latents_discrete = tf.argmax(latents_discrete_hot, axis=-1)
        tf.summary.histogram("codes",
                             tf.reshape(latents_discrete[:, 0, :], [-1]))
        losses["extra"] = extra_loss

        # Extra loss predicting latent code from input.
        latents_pred = decode_transformer(inputs, ed, latents_dense, hparams,
                                          "extra")
        latent_pred_loss = get_latent_pred_loss(latents_pred,
                                                latents_discrete_hot, hparams)
        losses["latent_pred"] = tf.reduce_mean(latent_pred_loss)
    else:
        latent_len = common_layers.shape_list(targets_c)[1]
        embed = functools.partial(vq_discrete_unbottleneck, hparams=hparams)
        latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :])
        if cache is None:
            cache = ae_latent_sample_beam(latents_dense, inputs, ed, embed,
                                          hparams)
        cache_hot = tf.one_hot(cache, depth=2**hparams.bottleneck_bits)
        latents_dense = embed(cache_hot)

    # Postprocess.
    d = latents_dense
    pos = tf.get_variable("pos", [1, 1000, 1, hparams.hidden_size])
    pos = pos[:, :common_layers.shape_list(latents_dense)[1] + 1, :, :]
    latents_dense = tf.pad(latents_dense,
                           [[0, 0], [1, 0], [0, 0], [0, 0]]) + pos

    # Decompressing the dense latents
    for i in range(hparams.num_compress_steps):
        j = hparams.num_compress_steps - i - 1
        d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j)
        d = decompress_step(d, hparams, i > 0, "decompress_%d" % j)

    masking = common_layers.inverse_lin_decay(hparams.mask_startup_steps)
    masking *= common_layers.inverse_exp_decay(hparams.mask_startup_steps //
                                               4)  # Not much at start.
    masking = tf.minimum(tf.maximum(masking, 0.0), 1.0)
    if hparams.mode == tf.estimator.ModeKeys.PREDICT:
        masking = 1.0
    mask = tf.less(masking,
                   tf.random_uniform(common_layers.shape_list(targets)[:-1]))
    mask = tf.expand_dims(tf.to_float(mask), 3)

    # targets is always [batch, length, 1, depth]
    targets = mask * targets + (1.0 - mask) * d

    res = decode_transformer(inputs, ed, targets, hparams, "decoder")
    latent_time = tf.less(hparams.mask_startup_steps,
                          tf.to_int32(tf.train.get_global_step()))
    losses["latent_pred"] *= tf.to_float(latent_time)
    return res, losses, cache
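The line latents_dense = targets_c + tf.stop_gradient(latents_dense - targets_c) is the straight-through estimator: the forward pass uses the quantised latents while gradients flow back into targets_c as if the quantisation were the identity. A tiny self-contained sketch of the pattern (TF 2.x eager, for illustration only, with tf.round standing in for the bottleneck):

import tensorflow as tf

x = tf.Variable([0.3, -1.2, 0.7])
with tf.GradientTape() as tape:
    quantized = tf.round(x)                       # non-differentiable step
    y = x + tf.stop_gradient(quantized - x)       # forward: y == quantized
    loss = tf.reduce_sum(tf.square(y))
print(tape.gradient(loss, x).numpy())             # gradient passes straight through the rounding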
Example #12
 def scale(x):
     unpadded_x = tf.to_int32(tf.round(tf.to_float(x) * scale_factor))
     x = tf.to_int32(tf.ceil(unpadded_x / divisor))
     pad = divisor * x - unpadded_x
     return (unpadded_x, pad)
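A worked plain-Python example of the helper above: it rescales a side length and returns the padding needed to make the padded side divisible by `divisor` (toy numbers):

import math

def scale(x, scale_factor, divisor):
    unpadded_x = int(round(x * scale_factor))
    pad = divisor * math.ceil(unpadded_x / divisor) - unpadded_x
    return unpadded_x, pad

print(scale(600, scale_factor=1.28, divisor=128))   # (768, 0)   -> 768 is already divisible
print(scale(613, scale_factor=1.28, divisor=128))   # (785, 111) -> 785 + 111 = 896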
Example #13
def resize_keeping_aspect_ratio(image, masks, boxes, keypoints, min_dimension,
                                divisor):
    """
    This function resizes and possibly pads with zeros.
    When using a usual FPN, divisor must be equal to 128.

    Arguments:
        image: a float tensor with shape [height, width, 3].
        masks: a float tensor with shape [height / DOWNSAMPLE, width / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
        min_dimension, divisor: integers.
    Returns:
        image: a float tensor with shape [h, w, 3],
            where `min_dimension = min(h, w)`,
            `h` and `w` are divisible by `divisor`.
        masks: a float tensor with shape [h / DOWNSAMPLE, w / DOWNSAMPLE, 2].
        boxes: a float tensor with shape [num_persons, 4].
        keypoints: an int tensor with shape [num_persons, 17, 3].
    """

    assert min_dimension % divisor == 0
    min_dimension = tf.constant(min_dimension, dtype=tf.int32)
    divisor = tf.constant(divisor, dtype=tf.int32)

    shape = tf.shape(image)
    height, width = shape[0], shape[1]

    original_min_dim = tf.minimum(height, width)
    scale_factor = tf.to_float(min_dimension / original_min_dim)

    # RESIZE AND PAD IMAGE

    def scale(x):
        unpadded_x = tf.to_int32(tf.round(tf.to_float(x) * scale_factor))
        x = tf.to_int32(tf.ceil(unpadded_x / divisor))
        pad = divisor * x - unpadded_x
        return (unpadded_x, pad)

    zero = tf.constant(0, dtype=tf.int32)
    new_height, pad_height, new_width, pad_width = tf.cond(
        tf.greater_equal(height, width), lambda: scale(height) +
        (min_dimension, zero), lambda: (min_dimension, zero) + scale(width))

    # final image size
    h = new_height + pad_height
    w = new_width + pad_width

    # resize keeping aspect ratio
    image = tf.image.resize_images(image, [new_height, new_width],
                                   method=RESIZE_METHOD)

    # pad image at the bottom or at the right
    image = tf.image.pad_to_bounding_box(image,
                                         offset_height=0,
                                         offset_width=0,
                                         target_height=h,
                                         target_width=w)

    # RESIZE AND PAD MASKS

    # new size of masks with padding
    map_height = tf.to_int32(tf.ceil(h / DOWNSAMPLE))
    map_width = tf.to_int32(tf.ceil(w / DOWNSAMPLE))

    # new size of only masks without padding
    map_only_height = tf.to_int32(tf.ceil(new_height / DOWNSAMPLE))
    map_only_width = tf.to_int32(tf.ceil(new_width / DOWNSAMPLE))

    masks = tf.image.resize_images(
        masks, [map_only_height, map_only_width],
        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    masks = tf.image.pad_to_bounding_box(masks,
                                         offset_height=0,
                                         offset_width=0,
                                         target_height=map_height,
                                         target_width=map_width)

    # TRANSFORM KEYPOINTS

    keypoint_scaler = tf.stack([new_height / height, new_width / width])
    keypoint_scaler = tf.to_float(keypoint_scaler)

    points, v = tf.split(keypoints, [2, 1], axis=2)
    points = tf.to_int32(tf.round(tf.to_float(points) * keypoint_scaler))
    y, x = tf.split(points, 2, axis=2)
    y = tf.clip_by_value(y, 0, h - 1)
    x = tf.clip_by_value(x, 0, w - 1)
    keypoints = tf.concat([y, x, v], axis=2)

    # TRANSFORM BOXES

    box_scaler = tf.concat(2 * [keypoint_scaler], axis=0)
    boxes *= box_scaler

    return image, masks, boxes, keypoints
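# A hypothetical usage sketch for resize_keeping_aspect_ratio (not from the
# original file). It assumes the module-level constants are defined, e.g.
# DOWNSAMPLE = 4 and RESIZE_METHOD = tf.image.ResizeMethod.BILINEAR.
image = tf.random_uniform([480, 640, 3], dtype=tf.float32)
masks = tf.zeros([120, 160, 2], dtype=tf.float32)
boxes = tf.constant([[10.0, 20.0, 200.0, 300.0]])
keypoints = tf.zeros([1, 17, 3], dtype=tf.int32)
image, masks, boxes, keypoints = resize_keeping_aspect_ratio(
    image, masks, boxes, keypoints, min_dimension=512, divisor=128)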
Example #14
0
    def parse(self, example_proto):
        """
        Returns:
            image: a float tensor with shape [height, width, 3],
                an RGB image with pixel values in the range [0, 1].
            masks: a float tensor with shape [height / DOWNSAMPLE, width / DOWNSAMPLE, 2].
            boxes: a float tensor with shape [num_persons, 4], in absolute coordinates.
            keypoints: an int tensor with shape [num_persons, 17, 3], in absolute coordinates.
        """
        features = {
            'image':
            tf.FixedLenFeature([], tf.string),
            'num_persons':
            tf.FixedLenFeature([], tf.int64),
            'boxes':
            tf.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
            'keypoints':
            tf.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
            'masks':
            tf.FixedLenFeature([], tf.string)
        }
        parsed_features = tf.parse_single_example(example_proto, features)

        # get an image
        image = tf.image.decode_jpeg(parsed_features['image'], channels=3)
        image = tf.image.convert_image_dtype(image, tf.float32)
        # now pixel values are scaled to the [0, 1] range

        # get number of people on the image
        num_persons = tf.to_int32(parsed_features['num_persons'])
        # it is assumed that num_persons > 0

        # get groundtruth boxes, they are in absolute coordinates
        boxes = tf.reshape(parsed_features['boxes'], [num_persons, 4])
        # they are used to guide the data augmentation (when doing a random crop)
        # and to choose sigmas for gaussian blobs

        # get keypoints, they are in absolute coordinates
        keypoints = tf.to_int32(parsed_features['keypoints'])
        keypoints = tf.reshape(keypoints, [num_persons, 17, 3])

        # get size of masks, they are downsampled
        shape = tf.shape(image)
        image_height, image_width = shape[0], shape[1]
        masks_height = tf.to_int32(tf.ceil(image_height / DOWNSAMPLE))
        masks_width = tf.to_int32(tf.ceil(image_width / DOWNSAMPLE))
        # (we use the 'SAME' padding in the networks)

        # get masks (loss and segmentation masks)
        masks = tf.decode_raw(parsed_features['masks'], tf.uint8)
        # unpack bits (reverse np.packbits)
        b = tf.constant([128, 64, 32, 16, 8, 4, 2, 1], dtype=tf.uint8)
        masks = tf.reshape(tf.bitwise.bitwise_and(masks[:, None], b), [-1])
        masks = masks[:(masks_height * masks_width * 2)]
        masks = tf.cast(masks > 0, tf.uint8)

        # reshape to the initial form
        masks = tf.reshape(masks, [masks_height, masks_width, 2])
        masks = tf.to_float(masks)  # it has binary values only

        return image, masks, boxes, keypoints
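# A hypothetical input-pipeline sketch (not from the original file): `decoder` is
# assumed to be an instance of the class that defines parse() above, and the
# TFRecord path is only a placeholder.
dataset = tf.data.TFRecordDataset('/path/to/train.tfrecord')
dataset = dataset.map(decoder.parse, num_parallel_calls=4)
image, masks, boxes, keypoints = dataset.make_one_shot_iterator().get_next()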
Example #15
0
    def subsample(self, indicator, batch_size, labels, scope=None):
        """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be sampled.
      batch_size: desired batch size. If None, keeps all positive samples and
        randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is True.
      labels: boolean tensor of shape [N] denoting positive(=True) and negative
          (=False) examples.
      scope: name scope.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries which
        are sampled.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
        if len(indicator.get_shape().as_list()) != 1:
            raise ValueError(
                'indicator must be 1 dimensional, got a tensor of '
                'shape %s' % indicator.get_shape())
        if len(labels.get_shape().as_list()) != 1:
            raise ValueError('labels must be 1 dimensional, got a tensor of '
                             'shape %s' % labels.get_shape())
        if labels.dtype != tf.bool:
            raise ValueError('labels should be of type bool. Received: %s' %
                             labels.dtype)
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be of type bool. Received: %s' %
                             indicator.dtype)
        with tf.name_scope(scope, 'BalancedPositiveNegativeSampler'):
            if self._is_static:
                return self._static_subsample(indicator, batch_size, labels)

            else:
                # Only sample from indicated samples
                negative_idx = tf.logical_not(labels)
                positive_idx = tf.logical_and(labels, indicator)
                negative_idx = tf.logical_and(negative_idx, indicator)

                # Sample positive and negative samples separately
                if batch_size is None:
                    max_num_pos = tf.reduce_sum(tf.to_int32(positive_idx))
                else:
                    max_num_pos = int(self._positive_fraction * batch_size)
                sampled_pos_idx = self.subsample_indicator(
                    positive_idx, max_num_pos)
                num_sampled_pos = tf.reduce_sum(
                    tf.cast(sampled_pos_idx, tf.int32))
                if batch_size is None:
                    negative_positive_ratio = (
                        1 - self._positive_fraction) / self._positive_fraction
                    max_num_neg = tf.to_int32(negative_positive_ratio *
                                              tf.to_float(num_sampled_pos))
                else:
                    max_num_neg = batch_size - num_sampled_pos
                sampled_neg_idx = self.subsample_indicator(
                    negative_idx, max_num_neg)

                return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
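# A hypothetical call sketch: `sampler` is assumed to be an instance of the class
# defining subsample() above (a balanced positive/negative sampler with, say,
# _positive_fraction = 0.5).
indicator = tf.constant([True, True, True, True, False])
labels = tf.constant([True, False, True, False, False])
sampled = sampler.subsample(indicator, batch_size=4, labels=labels)
# `sampled` is a boolean [N] tensor marking the chosen positive and negative entries.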
Example #16
0
def add_distance_loss_to_center(labels, logits, groundtruth_coords):
    """Add distance loss function for ClickRegression."""
    weights = tf.to_int32(
        tf.not_equal(
            labels,
            model_input.dataset_descriptors[FLAGS.dataset].ignore_label))
    labels *= weights

    # Use GT box to get center if it exists. Less computation required.
    # Otherwise, calculate from label mask.
    if FLAGS.use_groundtruth_box:
        center_x = (groundtruth_coords['xmin'] +
                    groundtruth_coords['xmax']) / 2.0
        center_y = (groundtruth_coords['ymin'] +
                    groundtruth_coords['ymax']) / 2.0
        center = tf.stack([center_y, center_x], axis=1)
    else:
        # Make array of coordinates (each row contains three coordinates)
        ii, jj = tf.meshgrid(tf.range(FLAGS.image_size),
                             tf.range(FLAGS.image_size),
                             indexing='ij')
        coords = tf.stack([tf.reshape(ii, (-1, )),
                           tf.reshape(jj, (-1, ))],
                          axis=-1)
        coords = tf.cast(coords, tf.int32)

        # Rearrange input into one vector per volume
        volumes_flat = tf.reshape(
            labels, [-1, FLAGS.image_size * FLAGS.image_size * 1, 1])
        # Compute total mass for each volume. Add 0.00001 to prevent division by 0
        total_mass = tf.cast(tf.reduce_sum(volumes_flat, axis=1),
                             tf.float32) + ZERO_DIV_OFFSET
        # Compute centre of mass
        center = tf.cast(tf.reduce_sum(volumes_flat * coords, axis=1),
                         tf.float32) / total_mass
        center = center / FLAGS.image_size

    # Normalize coordinates by size of image
    logits = logits / FLAGS.image_size

    # Calculate loss based on the distance metric specified
    # Loss added later in model_fn by tf.losses.get_total_loss()
    if FLAGS.distance_metric == 'mse':
        tf.losses.mean_squared_error(center, logits)
    elif FLAGS.distance_metric in [
            'euclidean', 'euclidean_sqrt', 'euclidean_iter'
    ]:
        distance_to_center = tf.sqrt(
            tf.reduce_sum(tf.square(logits - center), axis=-1) +
            ZERO_DIV_OFFSET)
        if FLAGS.ratio_box_distance:
            distance_to_box = calc_distance_to_edge(groundtruth_coords, logits)
            box_distance_to_center = (tf.to_float(distance_to_center) -
                                      distance_to_box)
            loss = distance_to_center / (box_distance_to_center +
                                         ZERO_DIV_OFFSET)
        else:
            loss = distance_to_center

        if FLAGS.distance_metric == 'euclidean_sqrt':
            loss = tf.sqrt(loss)
        if FLAGS.distance_metric == 'euclidean_iter':
            iter_num = tf.to_float(tf.train.get_or_create_global_step())
            step = (iter_num // FLAGS.euclidean_step) + 1.0
            loss = tf.pow(loss, tf.to_float(1.0 / step))
        tf.losses.compute_weighted_loss(loss)
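# A short NumPy illustration (an assumption, mirroring the centre-of-mass branch
# above) of how a label mask is reduced to a single normalized (y, x) center.
import numpy as np

image_size = 4
labels = np.zeros((1, image_size, image_size, 1), np.float32)
labels[0, 1:3, 2, 0] = 1.0  # a small vertical blob
ii, jj = np.meshgrid(np.arange(image_size), np.arange(image_size), indexing='ij')
coords = np.stack([ii.reshape(-1), jj.reshape(-1)], axis=-1)      # [H*W, 2]
volumes_flat = labels.reshape(-1, image_size * image_size, 1)     # [B, H*W, 1]
total_mass = volumes_flat.sum(axis=1) + 1e-5
center = (volumes_flat * coords).sum(axis=1) / total_mass         # [B, 2]
print(center / image_size)  # ~[[0.375, 0.5]]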
Example #17
0
def hier_homography_estimator(inputs, num_param=8, num_layer=7, num_level=3,
                              dropout_keep_prob=0.8, reuse=None,
                              is_training=True, trainable=True,
                              final_endpoint=None, scope='hier_hmg'):
  """A hierarchical VGG-style neural network for homograhy estimation.

  Args:
    inputs: batch of input image pairs of data type float32 and of shape
      [batch_size, height, width, 2]
    num_param: the number of parameters for homography (default 8)
    num_layer: the number of convolutional layers in the motion feature network
    num_level: the number of hierarchical levels
    dropout_keep_prob: the percentage of activation values that are kept
    reuse: whether to reuse the network weights
    is_training: whether used for training or testing
    trainable: whether this network is to be trained or not
    final_endpoint: specifies the endpoint to construct the network up to
    scope: the scope of variables in this function

  Returns:
    a list of homographies at each level and motion feature maps if
    final_endpoint='mfeature'; otherwise a list of images warped by the list of
    corresponding homographies
  """
  _, h_input, w_input = inputs.get_shape().as_list()[0:3]
  hmgs_list = []
  warped_list = []
  with tf.variable_scope(scope, [inputs], reuse=reuse):
    for level_index in range(num_level):
      scale = 2 ** (num_level - 1 - level_index)
      h = tf.to_float(tf.floordiv(h_input, scale))
      w = tf.to_float(tf.floordiv(w_input, scale))
      inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
      if level_index == 0:
        mfeature = hier_base_layers(inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        hmgs_list.append(hmgs_il)
      else:
        warped, _ = hmg_util.homography_scale_warp_per_batch(
            inputs_il[:, :, :, 0], w / 2, h / 2, hmgs_list[level_index - 1])
        pre_warped_inputs_il = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
        warped_list.append(pre_warped_inputs_il)
        if level_index == num_level - 1 and final_endpoint == 'mfeature':
          mfeature = hier_base_layers(pre_warped_inputs_il,
                                      num_layer - num_level + level_index,
                                      level_index, is_training=is_training,
                                      trainable=trainable)
          return hmgs_list, mfeature
        else:
          mfeature = hier_base_layers(pre_warped_inputs_il,
                                      num_layer + 1 - num_level + level_index,
                                      level_index, is_training=is_training,
                                      trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        new_hmgs_il = hmg_util.homography_shift_mult_batch(
            hmgs_list[level_index - 1], w / 2, h / 2, hmgs_il, w, h, w, h)
        hmgs_list.append(new_hmgs_il)
  return hmgs_list, warped_list
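# A hypothetical call sketch (not from the original file): a batch of grayscale
# image pairs stacked along the last axis, as described in the docstring. Each
# entry of hmgs_list is presumably a [batch_size, num_param] tensor of homography
# parameters for the corresponding level.
image_pairs = tf.random_uniform([8, 128, 128, 2], dtype=tf.float32)
hmgs_list, warped_list = hier_homography_estimator(
    image_pairs, num_param=8, num_layer=7, num_level=3, is_training=False)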
Example #18
0
  def _serving_model_fn(features, labels, mode, params):
    """Builds the serving model_fn."""
    del labels  # unused.
    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError('To build the serving model_fn, set '
                       'mode = `tf.estimator.ModeKeys.PREDICT`')

    model_params = params_dict.ParamsDict(params)

    images = features['images']
    _, height, width, _ = images.get_shape().as_list()

    model_fn = factory.model_generator(model_params)
    outputs = model_fn.build_outputs(
        features['images'], labels=None, mode=mode_keys.PREDICT)

    logits = tf.image.resize_bilinear(
        outputs['logits'], tf.shape(images)[1:3], align_corners=False)

    original_image_size = tf.squeeze(features['image_info'][:, 0:1, :])
    height = original_image_size[0]
    width = original_image_size[1]
    offset_height = tf.zeros_like(height, dtype=tf.int32)
    offset_width = tf.zeros_like(width, dtype=tf.int32)

    # Clip the predictions to original image size.
    logits = tf.image.crop_to_bounding_box(logits, offset_height, offset_width,
                                           tf.cast(height, dtype=tf.int32),
                                           tf.cast(width, dtype=tf.int32))
    probabilities = tf.nn.softmax(logits)

    score_threshold_placeholder = features['score_thresholds']
    key_placeholder = features['key']

    score_threshold_pred_expanded = score_threshold_placeholder
    for _ in range(0, logits.shape.ndims - 1):
      score_threshold_pred_expanded = tf.expand_dims(
          score_threshold_pred_expanded, -1)

    scores = tf.where(probabilities > score_threshold_pred_expanded,
                      probabilities, tf.zeros_like(probabilities))
    scores = tf.reduce_max(scores, 3)
    scores = tf.expand_dims(scores, -1)
    scores = tf.cast(tf.minimum(scores * 255.0, 255), tf.uint8)
    categories = tf.to_int32(tf.expand_dims(tf.argmax(probabilities, 3), -1))

    # Generate images for scores and categories.
    score_bytes = tf.map_fn(
        tf.image.encode_png, scores, back_prop=False, dtype=tf.string)
    category_bytes = tf.map_fn(
        tf.image.encode_png,
        tf.cast(categories, tf.uint8),
        back_prop=False,
        dtype=tf.string)

    predictions = {}

    predictions['category_bytes'] = tf.identity(
        category_bytes, name='category_bytes')
    predictions['score_bytes'] = tf.identity(score_bytes, name='score_bytes')
    predictions['key'] = tf.identity(key_placeholder, name='key')
    if output_image_info:
      predictions['image_info'] = tf.identity(
          features['image_info'], name='image_info')

    if export_tpu_model:
      return tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode, predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
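# A small hypothetical illustration of the threshold broadcasting used above: a
# per-example scalar threshold is reshaped so it can be compared against
# [batch, height, width, num_classes] probabilities.
thresholds = tf.constant([0.5, 0.3])              # one threshold per example
expanded = tf.reshape(thresholds, [-1, 1, 1, 1])
probs = tf.random_uniform([2, 4, 4, 3])
masked = tf.where(probs > expanded, probs, tf.zeros_like(probs))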
Example #19
0
 def DecodeLabel(label):
     label = tf.decode_raw(label, tf.uint8)
     label = tf.reshape(label, [])
     return tf.to_int32(label)
Example #20
0
def get_center_index(response):
    """Gets the index of the center in the response map."""
    shape = tf.shape(response)
    c1 = tf.to_int32((shape[1] - 1) / 2)
    c2 = tf.to_int32((shape[2] - 1) / 2)
    return c1, c2
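# Quick hypothetical check: for a [1, 17, 17, 1] response map the center is (8, 8).
response = tf.zeros([1, 17, 17, 1])
c1, c2 = get_center_index(response)  # both evaluate to 8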
Example #21
0
def apply_cmap(brightness, cmap):
    indices = tf.to_int32(tf.round(brightness * 255))
    cm = matplotlib.cm.get_cmap(cmap)
    colors = tf.constant(cm.colors, dtype=tf.float32)
    return tf.gather(colors, indices)
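# A hypothetical usage sketch: map a normalized grayscale tensor through the
# matplotlib 'viridis' colormap (cm.colors requires a listed colormap such as
# 'viridis', so this is an assumption about the intended cmap argument).
brightness = tf.random_uniform([64, 64], minval=0.0, maxval=1.0)
rgb = apply_cmap(brightness, 'viridis')  # float32 tensor of shape [64, 64, 3]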
Example #22
0
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.

  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to waveform to prevent quantization
      artefacts
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to prevent numeric overflow in log
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `signals`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1

    # Find the wave length: the largest index for which the value is !=0
    # note that waveforms samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    wav_lens = tf.reduce_max(
        tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
        tf.to_int32(tf.not_equal(waveforms, 0.0)),
        axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    stfts = tf.contrib.signal.stft(waveforms,
                                   frame_length=frame_length,
                                   frame_step=frame_step,
                                   fft_length=fft_length,
                                   window_fn=window_fn,
                                   pad_end=True)

    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    masks = tf.to_float(
        tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
                      tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257].
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(num_mel_bins,
                                                      num_spectrogram_bins,
                                                      sample_rate,
                                                      lower_edge_hertz,
                                                      upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
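# A hypothetical call sketch: one second of 16 kHz audio per example. With the
# defaults above this yields roughly 100 frames of 80 mel bins per example, i.e.
# a tensor of shape [4, ~100, 80, 1].
waveforms = tf.random_normal([4, 16000])
mel_features = compute_mel_filterbank_features(waveforms, sample_rate=16000)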
Example #23
0
hidden.append(
    Conv2D(nClasses, (1), padding='same', activation='softmax')(hidden[-1]))
print('layer', len(hidden) - 1, ':', hidden[-1].shape, 'output')

sm = hidden[-1]
y0 = Input((imSize, imSize, nClasses))
toCrop = int((y0.shape[1] - sm.shape[1]) // 2)
y = Cropping2D(toCrop)(y0)
cropSize = y.shape[1]

l = []
# nl = []
for iClass in range(nClasses):
    labels0 = tf.reshape(
        tf.to_int32(tf.slice(y, [0, 0, 0, iClass], [-1, -1, -1, 1])),
        [batchSize, cropSize, cropSize])
    predict0 = tf.reshape(tf.to_int32(tf.equal(tf.argmax(sm, 3), iClass)),
                          [batchSize, cropSize, cropSize])
    correct = tf.multiply(labels0, predict0)
    nCorrect0 = tf.reduce_sum(correct)
    nLabels0 = tf.reduce_sum(labels0)
    l.append(tf.to_float(nCorrect0) / tf.to_float(nLabels0))
    # nl.append(nLabels0)
acc = tf.add_n(l) / nClasses

loss = -tf.reduce_sum(tf.multiply(y, tf.log(sm)))
updateOps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
optimizer = tf.train.AdamOptimizer(learningRate)
with tf.control_dependencies(updateOps):
    optOp = optimizer.minimize(loss)
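# A hypothetical training-step sketch (not part of the original snippet): `x0` is
# assumed to be the Input tensor feeding hidden[0], and xBatch / yBatch are NumPy
# arrays with matching shapes.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, lossVal, accVal = sess.run([optOp, loss, acc],
                                  feed_dict={x0: xBatch, y0: yBatch})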
Example #24
0
    def __init__(self,
                 num_emb,
                 batch_size,
                 emb_dim,
                 hidden_dim,
                 sequence_length,
                 start_token,
                 learning_rate=0.01,
                 reward_gamma=0.95):
        self.num_emb = num_emb
        self.batch_size = batch_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        self.start_token = tf.constant([start_token] * self.batch_size,
                                       dtype=tf.int32)
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.reward_gamma = reward_gamma
        self.g_params = []
        self.d_params = []
        self.temperature = 1.0
        self.grad_clip = 5.0

        self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

        self.g_embeddings = tf.Variable(
            self.init_matrix([self.num_emb, self.emb_dim]))
        self.g_params.append(self.g_embeddings)
        self.g_recurrent_unit = self.create_recurrent_unit(
            self.g_params)  # maps h_tm1 to h_t for generator
        self.g_output_unit = self.create_output_unit(
            self.g_params)  # maps h_t to o_t (output token logits)

        # placeholder definition
        self.x = tf.placeholder(tf.int32,
                                shape=[
                                    self.batch_size, self.sequence_length
                                ])  # sequence of tokens generated by generator
        self.rewards = tf.placeholder(
            tf.float32, shape=[self.batch_size, self.sequence_length
                               ])  # get from rollout policy and discriminator

        # processed for batch

        self.processed_x = tf.transpose(
            tf.nn.embedding_lookup(self.g_embeddings, self.x),
            perm=[1, 0, 2])  # seq_length x batch_size x emb_dim

        # Initial states
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        self.h0 = tf.stack([self.h0, self.h0])

        gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)
        gen_x = tensor_array_ops.TensorArray(dtype=tf.int32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)

        def _g_recurrence(i, x_t, h_tm1, gen_o, gen_x):
            h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
            o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob
            log_prob = tf.log(tf.nn.softmax(o_t))
            next_token = tf.cast(
                tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]),
                tf.int32)
            x_tp1 = tf.nn.embedding_lookup(self.g_embeddings,
                                           next_token)  # batch x emb_dim
            gen_o = gen_o.write(
                i,
                tf.reduce_sum(
                    tf.multiply(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),
                                tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
            gen_x = gen_x.write(i, next_token)  # indices, batch_size
            return i + 1, x_tp1, h_t, gen_o, gen_x

        _, _, _, self.gen_o, self.gen_x = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
            body=_g_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), self.h0,
                       gen_o, gen_x))

        self.gen_x = self.gen_x.stack()  # seq_length x batch_size
        self.gen_x = tf.transpose(self.gen_x,
                                  perm=[1, 0])  # batch_size x seq_length

        # supervised pretraining for generator
        g_predictions = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                     size=self.sequence_length,
                                                     dynamic_size=False,
                                                     infer_shape=True)

        ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                size=self.sequence_length)
        ta_emb_x = ta_emb_x.unstack(self.processed_x)

        def _pretrain_recurrence(i, x_t, h_tm1, g_predictions):
            h_t = self.g_recurrent_unit(x_t, h_tm1)
            o_t = self.g_output_unit(h_t)
            g_predictions = g_predictions.write(
                i, tf.nn.softmax(o_t))  # batch x vocab_size
            x_tp1 = ta_emb_x.read(i)
            return i + 1, x_tp1, h_t, g_predictions

        _, _, _, self.g_predictions = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3: i < self.sequence_length,
            body=_pretrain_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), self.h0,
                       g_predictions))

        self.g_predictions = tf.transpose(
            self.g_predictions.stack(),
            perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

        # pretraining loss
        self.pretrain_loss = -tf.reduce_sum(
            tf.one_hot(tf.to_int32(tf.reshape(
                self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                    tf.clip_by_value(
                        tf.reshape(self.g_predictions, [-1, self.num_emb]),
                        1e-20, 1.0))) / (self.sequence_length *
                                         self.batch_size)

        # training updates
        pretrain_opt = self.g_optimizer(self.learning_rate)

        self.pretrain_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.pretrain_loss, self.g_params), self.grad_clip)
        self.pretrain_updates = pretrain_opt.apply_gradients(
            zip(self.pretrain_grad, self.g_params))

        #######################################################################################################
        #  Unsupervised Training
        #######################################################################################################
        self.g_loss = -tf.reduce_sum(
            tf.reduce_sum(
                tf.one_hot(tf.to_int32(tf.reshape(
                    self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                        tf.clip_by_value(
                            tf.reshape(self.g_predictions, [-1, self.num_emb]),
                            1e-20, 1.0)), 1) * tf.reshape(self.rewards, [-1]))

        g_opt = self.g_optimizer(self.learning_rate)

        self.g_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.g_loss, self.g_params), self.grad_clip)
        self.g_updates = g_opt.apply_gradients(zip(self.g_grad, self.g_params))
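# A hypothetical pretraining-step sketch; `Generator` is an assumed name for the
# class whose __init__ is shown above.
import numpy as np

gen = Generator(num_emb=5000, batch_size=64, emb_dim=32, hidden_dim=32,
                sequence_length=20, start_token=0)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.randint(0, 5000, size=(64, 20))
    _, loss_val = sess.run([gen.pretrain_updates, gen.pretrain_loss],
                           feed_dict={gen.x: batch})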
Example #25
0
def parse_fn(filename, output_sequence_length=IMAGES_PER_SEQUENCE):
    """Read data from single files stored in directories.

  Args:
    filename: the filename of the set of files to be loaded.
    output_sequence_length: Length of the output sequence. If less than
      IMAGES_PER_SEQUENCE, only the first `output_sequence_length` frames will
      be kept.

  Returns:
    A dictionary that maps strings to tf.Tensors:

    'rgb': a stack of `output_sequence_length` RGB frames, each with shape
           [height, width, 3] and channel values in the range [0.0, 1.0].
    'intrinsics': the intrinsics vector, repeated once per output frame.
    'mask': a stack of segmentation masks whose three uint8 channels are packed
            into a single int32 value per pixel.
  """
    if output_sequence_length > IMAGES_PER_SEQUENCE or output_sequence_length < 1:
        raise ValueError(
            'Invalid output_sequence_length %d: must be within [1, '
            '%d].' % (output_sequence_length, IMAGES_PER_SEQUENCE))
    image_file = tf.strings.join([filename, '.png'])
    intrinsics_file = tf.strings.join([filename, '_cam.txt'])
    mask_file = tf.strings.join([filename, '-fseg.png'])

    # Read files.
    encoded_image = tf.io.read_file(image_file)
    encoded_mask = tf.io.read_file(mask_file)
    intrinsics_content = tf.io.read_file(intrinsics_file)
    content_is_empty = tf.math.equal(intrinsics_content, '')
    filename_matches = tf.strings.regex_full_match(
        filename, '.*%s$' % KITTI_CORRUPT_FILE)
    file_is_corrupt = tf.math.logical_and(content_is_empty, filename_matches)

    intrinsics_content = tf.cond(file_is_corrupt,
                                 lambda: KITTI_CORRUPT_FILE_INTRINSICS,
                                 lambda: intrinsics_content)

    # Parse intrinsics data to a tensor representing a 3x3 matrix.
    intrinsics = tf.strings.split([intrinsics_content], ',').values
    intrinsics = tf.strings.to_number(intrinsics)
    intrinsics.set_shape([9])

    fx, _, x0, _, fy, y0, _, _, _ = tf.unstack(intrinsics)
    intrinsics = tf.stack([IMAGE_WIDTH, IMAGE_HEIGHT, fx, fy, x0, y0])

    # Decode and normalize images.
    decoded_image = tf.image.decode_png(encoded_image, channels=3)
    decoded_image = tf.to_float(decoded_image) * (1 / 255.0)
    split_image_sequence = tf.split(decoded_image, IMAGES_PER_SEQUENCE, axis=1)

    decoded_mask = tf.image.decode_png(encoded_mask, channels=3)
    mask_r, mask_g, mask_b = tf.unstack(tf.to_int32(decoded_mask), axis=-1)
    # Since TPU does not support images of type uint8, we encode the 3 RGB uint8
    # values into one int32 value.
    mask = mask_r * (256 * 256) + mask_g * 256 + mask_b
    # All images in our pipeline have 3 dimensions (height, width, channels), so
    # we add a third dimension to the mask too.
    mask = tf.expand_dims(mask, -1)
    split_mask_sequence = tf.split(mask, IMAGES_PER_SEQUENCE, axis=1)

    return {
        'rgb': tf.stack(split_image_sequence[:output_sequence_length]),
        'intrinsics': tf.stack([intrinsics] * output_sequence_length),
        'mask': tf.stack(split_mask_sequence[:output_sequence_length]),
    }
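# A hypothetical pipeline sketch for parse_fn; the file path is a placeholder and
# the module constants (IMAGES_PER_SEQUENCE, IMAGE_WIDTH, KITTI_CORRUPT_FILE, ...)
# are assumed to be defined elsewhere in the module.
filenames = tf.data.Dataset.from_tensor_slices(['/data/kitti/sequence_0001/000001'])
dataset = filenames.map(parse_fn).batch(4)
features = dataset.make_one_shot_iterator().get_next()
# features['rgb'] stacks the first `output_sequence_length` frames per example.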
Example #26
0
def average_bag_of_embeds(embeddings, mask, use_bigrams=False,
                          bigram_embed_scope=None, append_start_end=False):
  """Averages a bag of embeds.

  Args:
    embeddings: a float Tensor of shape [None, length, depth]
    mask: a boolean Tensor of shape [None, length]
    use_bigrams: whether to use bigrams.
    bigram_embed_scope: the variable scope.
    append_start_end: whether to append start and end tokens.
  Returns:
    word_embed: a Tensor of shape [None, embed_size]
  """
  if bigram_embed_scope is None:
    var_scope = "average_bow"
  else:
    var_scope = bigram_embed_scope
  with tf.variable_scope(var_scope, reuse=tf.AUTO_REUSE):
    with tf.control_dependencies([
        tf.assert_equal(tf.rank(embeddings), 3, summarize=100),
        tf.assert_equal(tf.rank(mask), 2, summarize=100),
    ]):
      lengths = tf.cast(
          tf.reduce_sum(tf.cast(mask, tf.int32), -1, keepdims=True), tf.float32)
    batch_size = common_layers.shape_list(embeddings)[0]
    length = common_layers.shape_list(embeddings)[1]
    depth = common_layers.shape_list(embeddings)[2]
    embeddings = tf.where(
        tf.tile(tf.expand_dims(mask, 2), [1, 1, depth]), embeddings,
        tf.zeros_like(embeddings))
    if use_bigrams:
      if append_start_end:
        span_start_embed = tf.get_variable(name="span_start_embed",
                                           shape=[depth])
        span_end_embed = tf.get_variable(name="span_end_embed",
                                         shape=[depth])
        span_end_embed = tf.expand_dims(tf.expand_dims(span_end_embed, 0), 0)
        start = tf.expand_dims(
            tf.tile(tf.expand_dims(span_start_embed, 0), [batch_size, 1]), 1)
        # Prefix the start
        embeddings = tf.concat([start, embeddings], axis=1)
        # Pad for the end slot
        embeddings = tf.pad(embeddings, [[0, 0], [0, 1], [0, 0]])
        span_end_embed = tf.tile(span_end_embed, [batch_size, length + 2, 1])
        mask_with_start = tf.pad(
            tf.pad(tf.to_int32(mask), [[0, 0], [1, 0]],
                   constant_values=1), [[0, 0], [0, 1]],
            constant_values=0)
        mask_with_end = tf.pad(mask_with_start, [[0, 0], [1, 0]],
                               constant_values=1)[:, :-1]
        mask = tf.cast(mask_with_end, tf.bool)
        mask_of_end = tf.expand_dims(mask_with_end - mask_with_start, 2)
        embeddings = embeddings + span_end_embed * tf.to_float(mask_of_end)
      bigram_embeddings = tf.layers.dense(
          tf.concat([embeddings[:, :-1, :], embeddings[:, 1:, :]], axis=-1),
          units=depth)
      bigram_mask = tf.to_float(tf.expand_dims(mask[:, 1:], 2))
      masked_bigram_embeddings = bigram_embeddings * bigram_mask
      embeddings = tf.concat(
          [embeddings, masked_bigram_embeddings], axis=1)
      lengths = lengths + lengths - 1
    avg_embeddings = tf.div(tf.reduce_sum(embeddings, axis=1),
                            tf.maximum(lengths, 1.0))
  return avg_embeddings
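# A hypothetical call sketch: average the first few token embeddings of each
# example, ignoring masked positions.
embeddings = tf.random_normal([2, 5, 16])
mask = tf.constant([[True, True, True, False, False],
                    [True, True, False, False, False]])
avg = average_bag_of_embeds(embeddings, mask)  # float tensor of shape [2, 16]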
Example #27
0
    def __init__(self,
                 train_batch_size=4096,
                 test_chain_batch_size=4096,
                 bijector="iaf",
                 log_dir="/tmp/neutra",
                 base_learning_rate=1e-3,
                 q_base_scale=1.,
                 learning_rate_schedule=[[6000, 1e-1]]):
        target, target_spec = GetTargetSpec()
        self.target = target
        self.target_spec = target_spec
        with gin.config_scope("train"):
            train_target, train_target_spec = GetTargetSpec()
            self.train_target = train_target
            self.train_target_spec = train_target_spec

        if bijector == "rnvp":
            bijector_fn = tf.make_template("bijector",
                                           MakeRNVPBijectorFn,
                                           num_dims=self.target_spec.num_dims)
        elif bijector == "iaf":
            bijector_fn = tf.make_template("bijector",
                                           MakeIAFBijectorFn,
                                           num_dims=self.target_spec.num_dims)
        elif bijector == "affine":
            bijector_fn = tf.make_template("bijector",
                                           MakeAffineBijectorFn,
                                           num_dims=self.target_spec.num_dims)
        else:
            bijector_fn = lambda *args, **kwargs: tfb.Identity()

        self.train_bijector = bijector_fn(train=True)
        self.bijector = bijector_fn(train=False)
        if train_target_spec.bijector is not None:
            print("Using train target bijector")
            self.train_bijector = tfb.Chain(
                [train_target_spec.bijector, self.train_bijector])
        if target_spec.bijector is not None:
            print("Using target bijector")
            self.bijector = tfb.Chain([target_spec.bijector, self.bijector])

        q_base = tfd.Independent(
            tfd.Normal(loc=tf.zeros(self.target_spec.num_dims),
                       scale=q_base_scale *
                       tf.ones(self.target_spec.num_dims)), 1)
        self.q_x_train = tfd.TransformedDistribution(q_base,
                                                     self.train_bijector)
        self.q_x = tfd.TransformedDistribution(q_base, self.bijector)

        # Params
        self.train_batch_size = int(train_batch_size)
        self.test_chain_batch_size = tf.placeholder_with_default(
            test_chain_batch_size, [], "test_chain_batch_size")
        self.test_batch_size = tf.placeholder_with_default(
            16384 * 8, [], "test_batch_size")
        self.test_num_steps = tf.placeholder_with_default(
            1000, [], "test_num_steps")
        self.test_num_leapfrog_steps = tf.placeholder_with_default(
            tf.to_int32(2), [], "test_num_leapfrog_steps")
        self.test_step_size = tf.placeholder_with_default(
            0.1, [], "test_step_size")

        # Test
        self.neutra_outputs = MakeNeuTra(
            target=self.target,
            q=self.q_x,
            batch_size=self.test_chain_batch_size,
            num_steps=self.test_num_steps,
            num_leapfrog_steps=self.test_num_leapfrog_steps,
            step_size=self.test_step_size,
        )
        self.z_chain = tf.reshape(
            self.bijector.inverse(
                tf.reshape(self.neutra_outputs.x_chain,
                           [-1, self.target_spec.num_dims])),
            tf.shape(self.neutra_outputs.x_chain))
        self.target_samples = self.target.sample(self.test_batch_size)
        self.target_z = self.bijector.inverse(self.target_samples)
        self.q_samples = self.q_x.sample(self.test_batch_size)

        self.target_cov = utils.Covariance(self.target_samples)
        self.target_eigvals, self.target_eigvecs = tf.linalg.eigh(
            self.target_cov)

        self.cached_target_eigvals = tf.get_local_variable(
            "cached_target_eigvals",
            self.target_eigvals.shape,
            initializer=tf.zeros_initializer())
        self.cached_target_eigvecs = tf.get_local_variable(
            "cached_target_eigvecs",
            self.target_eigvecs.shape,
            initializer=tf.zeros_initializer())
        self.cached_target_stats_update_op = [
            self.cached_target_eigvals.assign(self.target_eigvals),
            self.cached_target_eigvecs.assign(self.target_eigvecs),
            tf.print("Assigning target stats")
        ]

        def variance(x):
            x -= tf.reduce_mean(x, 0, keep_dims=True)
            x = tf.square(x)
            return x

        def rotated_variance(x):
            x2 = tf.reshape(x, [-1, self.target_spec.num_dims])
            x2 -= tf.reduce_mean(x2, 0, keep_dims=True)
            x2 = tf.matmul(x2, self.cached_target_eigvecs)
            x2 = tf.square(x2)
            return tf.reshape(x2, tf.shape(x))

        functions = [
            ("mean", tf.identity),
            #        ("var", variance),
            ("square", tf.square),
            #        ("rot_square", rot_square),
            #        ("rot_var", rotated_variance),
        ]

        self.cached_target_mean = {}
        self.cached_target_mean_update_op = [
            tf.print("Assigning target means.")
        ]
        self.neutra_stats = {}
        self.q_stats = {}

        for name, f in functions:
            target_mean = tf.reduce_mean(f(self.target_samples), 0)
            cached_target_mean = tf.get_local_variable(name + "_cached_mean",
                                                       target_mean.shape)
            if self.target_spec.stats is not None:
                self.cached_target_mean_update_op.append(
                    cached_target_mean.assign(self.target_spec.stats[name]))
            else:
                self.cached_target_mean_update_op.append(
                    cached_target_mean.assign(target_mean))

            self.cached_target_mean[name] = cached_target_mean
            self.q_stats[name] = ComputeQStats(f(self.q_samples),
                                               cached_target_mean)
            self.neutra_stats[name] = ComputeChainStats(
                f(self.neutra_outputs.x_chain), cached_target_mean,
                self.test_num_leapfrog_steps)

        # Training
        self.train_q_samples = self.q_x_train.sample(self.train_batch_size)
        self.train_log_q_x = self.q_x_train.log_prob(self.train_q_samples)
        self.kl_q_p = tf.reduce_mean(
            self.train_log_q_x - self.target.log_prob(self.train_q_samples))

        loss = self.kl_q_p
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        if reg_losses:
            tf.logging.info("Regularizing.")
            loss += tf.add_n(reg_losses)
        self.loss = tf.check_numerics(loss, "Loss has NaNs")

        self.global_step = tf.train.get_or_create_global_step()
        steps, factors = list(zip(*learning_rate_schedule))
        learning_rate = base_learning_rate * tf.train.piecewise_constant(
            self.global_step, steps, [1.0] + list(factors))

        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.train_op = opt.minimize(self.loss, global_step=self.global_step)

        tf.summary.scalar("kl_q_p", self.kl_q_p)
        tf.summary.scalar("loss", self.loss)

        self.init = [
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            tf.print("Initializing variables")
        ]

        self.saver = tf.train.Saver()
        self.log_dir = log_dir
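# A hypothetical training-loop sketch; `NeuTraExperiment` is an assumed name for
# the class whose __init__ is shown above.
exp = NeuTraExperiment(bijector="iaf", log_dir="/tmp/neutra")
with tf.Session() as sess:
    sess.run(exp.init)
    for _ in range(1000):
        _, kl = sess.run([exp.train_op, exp.kl_q_p])
    exp.saver.save(sess, exp.log_dir + "/model.ckpt")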
Example #28
0
  def _build_sampler(self):
    """Build the sampler ops and the log_prob ops."""
    hidden_size = self.params.controller_hidden_size
    num_layers = self.params.controller_num_layers

    arc_seq = []
    sample_log_probs = []
    sample_entropy = []
    all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)]
    all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)]

    # sampler ops
    inputs = self.g_emb
    prev_c = tf.zeros([1, hidden_size], dtype=tf.float32)
    prev_h = tf.zeros([1, hidden_size], dtype=tf.float32)

    inputs = self.g_emb
    for layer_id in range(1, num_layers+1):
      next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
      prev_c, prev_h = next_c, next_h
      all_h.append(next_h)
      all_h_w.append(tf.matmul(next_h, self.attn_w_1))

      query = tf.matmul(next_h, self.attn_w_2)
      query = query + tf.concat(all_h_w[:-1], axis=0)
      query = tf.tanh(query)
      logits = tf.matmul(query, self.attn_v)
      logits = tf.reshape(logits, [1, layer_id])

      if self.params.controller_temperature:
        logits /= self.params.controller_temperature
      if self.params.controller_tanh_constant:
        logits = self.params.controller_tanh_constant * tf.tanh(logits)
      diff = tf.to_float(layer_id - tf.range(0, layer_id)) ** 2
      logits -= tf.reshape(diff, [1, layer_id]) / 6.0

      skip_index = tf.multinomial(logits, 1)
      skip_index = tf.to_int32(skip_index)
      skip_index = tf.reshape(skip_index, [1])
      arc_seq.append(skip_index)

      log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=skip_index)
      sample_log_probs.append(log_prob)

      entropy = log_prob * tf.exp(-log_prob)
      sample_entropy.append(tf.stop_gradient(entropy))

      inputs = tf.nn.embedding_lookup(
          tf.concat(all_h[:-1], axis=0), skip_index)
      inputs /= (0.1 + tf.to_float(layer_id - skip_index))

      next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
      prev_c, prev_h = next_c, next_h
      logits = tf.matmul(next_h, self.w_emb, transpose_b=True)
      if self.params.controller_temperature:
        logits /= self.params.controller_temperature
      if self.params.controller_tanh_constant:
        logits = self.params.controller_tanh_constant * tf.tanh(logits)
      func = tf.multinomial(logits, 1)
      func = tf.to_int32(func)
      func = tf.reshape(func, [1])
      arc_seq.append(func)
      log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=func)
      sample_log_probs.append(log_prob)
      entropy = log_prob * tf.exp(-log_prob)
      sample_entropy.append(tf.stop_gradient(entropy))
      inputs = tf.nn.embedding_lookup(self.w_emb, func)

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = arc_seq

    self.sample_log_probs = tf.concat(sample_log_probs, axis=0)
    self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs))

    sample_entropy = tf.concat(sample_entropy, axis=0)
    self.sample_entropy = tf.reduce_sum(sample_entropy)

    self.all_h = all_h
Example #29
0
    def _enas_layer(self, layer_id, prev_layers, arc, out_filters):
        """
    Args:
      layer_id: current layer id.
      prev_layers: cache of previous layers, used for skip connections.
      arc: the sampled architecture for this layer, a flat sequence with 4
        entries (two input indices and two op ids) per cell.
      out_filters: number of output filters.
    """

        assert len(prev_layers) == 2, "need exactly 2 inputs"
        layers = [prev_layers[0], prev_layers[1]]
        layers = self._maybe_calibrate_size(layers,
                                            out_filters,
                                            is_training=True)
        used = []
        for cell_id in range(self.num_cells):
            prev_layers = tf.stack(layers, axis=0)
            with tf.variable_scope("cell_{0}".format(cell_id)):
                with tf.variable_scope("x"):
                    x_id = arc[4 * cell_id]
                    x_op = arc[4 * cell_id + 1]
                    x = prev_layers[x_id, :, :, :, :]
                    x = self._enas_cell(x, cell_id, x_id, x_op, out_filters)
                    x_used = tf.one_hot(x_id,
                                        depth=self.num_cells + 2,
                                        dtype=tf.int32)

                with tf.variable_scope("y"):
                    y_id = arc[4 * cell_id + 2]
                    y_op = arc[4 * cell_id + 3]
                    y = prev_layers[y_id, :, :, :, :]
                    y = self._enas_cell(y, cell_id, y_id, y_op, out_filters)
                    y_used = tf.one_hot(y_id,
                                        depth=self.num_cells + 2,
                                        dtype=tf.int32)

                out = x + y
                used.extend([x_used, y_used])
                layers.append(out)

        used = tf.add_n(used)
        indices = tf.where(tf.equal(used, 0))
        indices = tf.to_int32(indices)
        indices = tf.reshape(indices, [-1])
        num_outs = tf.size(indices)
        out = tf.stack(layers, axis=0)
        out = tf.gather(out, indices, axis=0)

        inp = prev_layers[0]
        if self.data_format == "NHWC":
            N = tf.shape(inp)[0]
            H = tf.shape(inp)[1]
            W = tf.shape(inp)[2]
            C = tf.shape(inp)[3]
            out = tf.transpose(out, [1, 2, 3, 0, 4])
            out = tf.reshape(out, [N, H, W, num_outs * out_filters])
        elif self.data_format == "NCHW":
            N = tf.shape(inp)[0]
            C = tf.shape(inp)[1]
            H = tf.shape(inp)[2]
            W = tf.shape(inp)[3]
            out = tf.transpose(out, [1, 0, 2, 3, 4])
            out = tf.reshape(out, [N, num_outs * out_filters, H, W])
        else:
            raise ValueError("Unknown data_format '{0}'".format(
                self.data_format))

        with tf.variable_scope("final_conv"):
            w = create_weight("w",
                              [self.num_cells + 2, out_filters * out_filters])
            w = tf.gather(w, indices, axis=0)
            w = tf.reshape(w, [1, 1, num_outs * out_filters, out_filters])
            out = tf.nn.relu(out)
            out = tf.nn.conv2d(out,
                               w,
                               strides=[1, 1, 1, 1],
                               padding="SAME",
                               data_format=self.data_format)
            out = batch_norm(out,
                             is_training=True,
                             data_format=self.data_format)

        out = tf.reshape(out, tf.shape(prev_layers[0]))

        return out
Example #30
0
  def build_train_graph(self,
                        inputs,
                        min_depth,
                        max_depth,
                        num_mpi_planes,
                        learning_rate=0.0002,
                        beta1=0.9,
                        vgg_model_file=None,
                        global_step=0):
    """Construct the training computation graph.

    Args:
      inputs: dictionary of tensors (see 'input_data' below) needed for training
      min_depth: minimum depth for the PSV and MPI planes
      max_depth: maximum depth for the PSV and MPI planes
      num_mpi_planes: number of MPI planes to infer
      learning_rate: learning rate
      beta1: hyperparameter for Adam
      vgg_model_file: path to vgg weights (needed when vgg loss is used)
      global_step: current optimization step
    Returns:
      A train_op to be used for training.
    """
    print("starting to build graph")
    with tf.name_scope("input_size_randomization"):
      dim_choices = tf.constant([[1, 16], [2, 32], [4, 32], [4, 64], [4, 128],
                                 [8, 32], [8, 64], [8, 128]],
                                dtype=tf.int32)
      rand_dim = tf.random_shuffle(dim_choices)[0, :]
      height_div = rand_dim[0]
      width_div = rand_dim[0]
      num_mpi_planes = rand_dim[1]
      tf.summary.scalar("num_mpi_planes", num_mpi_planes)

    with tf.name_scope("setup"):
      mpi_planes = self.inv_depths(min_depth, max_depth, num_mpi_planes)

    with tf.name_scope("input_data"):
      raw_tgt_image = inputs["tgt_image"]
      raw_ref_image = inputs["ref_image"]
      raw_src_images = inputs["src_images"]

      _, img_height, img_width, _ = raw_src_images.get_shape().as_list(
      )
      img_height = img_height // height_div
      img_width = img_width // width_div

      raw_tgt_image = tf.image.convert_image_dtype(
          raw_tgt_image, dtype=tf.float32)
      raw_ref_image = tf.image.convert_image_dtype(
          raw_ref_image, dtype=tf.float32)
      raw_src_images = tf.image.convert_image_dtype(
          raw_src_images, dtype=tf.float32)
      raw_tgt_image = tf.image.resize_area(raw_tgt_image,
                                           [img_height, img_width])
      raw_ref_image = tf.image.resize_area(raw_ref_image,
                                           [img_height, img_width])
      raw_src_images = tf.image.resize_area(raw_src_images,
                                            [img_height, img_width])

      tgt_pose = inputs["tgt_pose"]
      ref_pose = inputs["ref_pose"]
      src_poses = inputs["src_poses"]
      intrinsics = inputs["intrinsics"]

      # Scale intrinsics based on size randomization
      intrinsics = tf.concat([
          intrinsics[:, 0:1, :] / tf.to_float(width_div),
          intrinsics[:, 1:2, :] / tf.to_float(height_div), intrinsics[:, 2:3, :]
      ],
                             axis=1)
      inputs["intrinsics"] = intrinsics

      _, num_source, _, _ = src_poses.get_shape().as_list()

    with tf.name_scope("inference"):
      print("setting up MPI inference")
      num_mpi_planes = tf.shape(mpi_planes)[0]
      pred = self.infer_mpi(raw_src_images, raw_ref_image, ref_pose, src_poses,
                            intrinsics, num_mpi_planes,
                            mpi_planes)
      rgba_layers = pred["rgba_layers"]
      rgba_layers_refine = pred["rgba_layers_refine"]
      stuff_behind = pred["stuff_behind"]
      refine_input_mpi = pred["refine_input_mpi"]
      psv = pred["psv"]

    with tf.name_scope("synthesis"):
      print("setting up rendering")
      rel_pose = tf.matmul(tgt_pose, tf.matrix_inverse(ref_pose))
      output_image, output_layers = self.mpi_render_view(
          rgba_layers, rel_pose, mpi_planes, intrinsics)
      output_alpha = output_layers[Ellipsis, -1]
      output_image_refine, _ = self.mpi_render_view(
          rgba_layers_refine, rel_pose, mpi_planes, intrinsics)

    with tf.name_scope("loss"):
      print("computing losses")
      # Mask loss for pixels outside reference frustum
      loss_mask = tf.where(
          tf.equal(
              tf.reduce_min(
                  tf.abs(tf.reduce_sum(output_layers, axis=-1)),
                  axis=3,
                  keep_dims=True), 0.0),
          tf.zeros_like(output_alpha[:, :, :, 0:1]),
          tf.ones_like(output_alpha[:, :, :, 0:1]))
      loss_mask = tf.stop_gradient(loss_mask)
      tf.summary.image("loss_mask", loss_mask)

      # Helper functions for loss
      def compute_error(real, fake, mask):
        return tf.reduce_mean(mask * tf.abs(fake - real))

      # Normalized VGG loss (from
      # https://github.com/CQFIO/PhotographicImageSynthesis)

      downsample = lambda tensor, ds: tf.nn.avg_pool(tensor, [1, ds, ds, 1],
                                                     [1, ds, ds, 1], "SAME")

      def vgg_loss(raw_tgt_image, output_image, loss_mask):
        """Compute VGG loss."""

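        # raw_tgt_image is in [0, 1] after convert_image_dtype; the rendered
        # output is assumed to lie in [-1, 1], so both are rescaled to the
        # [0, 255] range expected by VGG-19.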
        vgg_real = build_vgg19(raw_tgt_image * 255.0, vgg_model_file)
        rescaled_output_image = (output_image + 1.)/2. * 255.0
        vgg_fake = build_vgg19(
            rescaled_output_image, vgg_model_file, reuse=True)
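        # Masked L1 distances between VGG-19 activations at several layers;
        # the divisors are empirical per-layer weights, and the loss mask is
        # average-pooled to match each feature map's resolution.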
        p0 = compute_error(vgg_real["input"], vgg_fake["input"], loss_mask)
        p1 = compute_error(vgg_real["conv1_2"],
                           vgg_fake["conv1_2"],
                           loss_mask)/2.6
        p2 = compute_error(vgg_real["conv2_2"],
                           vgg_fake["conv2_2"],
                           downsample(loss_mask, 2))/4.8
        p3 = compute_error(vgg_real["conv3_2"],
                           vgg_fake["conv3_2"],
                           downsample(loss_mask, 4))/3.7
        p4 = compute_error(vgg_real["conv4_2"],
                           vgg_fake["conv4_2"],
                           downsample(loss_mask, 8))/5.6
        p5 = compute_error(vgg_real["conv5_2"],
                           vgg_fake["conv5_2"],
                           downsample(loss_mask, 16))*10/1.5
        total_loss = p0+p1+p2+p3+p4+p5
        return total_loss, vgg_real, vgg_fake

      vgg_loss_initial, _, _ = vgg_loss(raw_tgt_image, output_image, loss_mask)
      tf.summary.scalar("vgg_loss_initial", vgg_loss_initial)
      total_loss = vgg_loss_initial

      vgg_loss_refine, _, _ = vgg_loss(raw_tgt_image, output_image_refine,
                                       loss_mask)
      tf.summary.scalar("vgg_loss_refine", vgg_loss_refine)
      total_loss += vgg_loss_refine

    with tf.name_scope("train_op"):
      print("setting up train op")
      train_vars = tf.trainable_variables()
      optim = tf.train.AdamOptimizer(learning_rate, beta1)
      grads_and_vars = optim.compute_gradients(total_loss, var_list=train_vars)
      train_op = [optim.apply_gradients(grads_and_vars)]

    # Summaries
    tf.summary.scalar("total_loss", total_loss)
    # Source images
    for i in range(num_source):
      src_image = raw_src_images[:, :, :, i*3:(i+1)*3]
      tf.summary.image("src_image_%d" % i, src_image)
    # Output image
    tf.summary.image("output_image", self.deprocess_image(output_image))
    # Refined output image
    tf.summary.image("output_image_refine",
                     self.deprocess_image(output_image_refine))
    # Target image
    tf.summary.image("tgt_image", raw_tgt_image)
    # Ref image
    tf.summary.image("ref_image", raw_ref_image)
    # Predicted color and alpha layers, and PSV
    num_summ = 16  # Number of plane summaries to show in tensorboard
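    # Plane indices are sampled evenly across the MPI volume.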
    for i in range(num_summ):
      ind = tf.to_int32(i * num_mpi_planes/num_summ)
      rgb = rgba_layers[:, :, :, ind, :3]
      alpha = rgba_layers[:, :, :, ind, -1:]
      ref_plane = psv[:, :, :, ind, 3:6]
      source_plane = psv[:, :, :, ind, :3]
      output_rgb = output_layers[:, :, :, ind, :3]
      tf.summary.image("rgb_layer_%d" % i, self.deprocess_image(rgb))
      tf.summary.image("alpha_layer_%d" % i, alpha)
      tf.summary.image("rgba_layer_%d" % i, self.deprocess_image(rgb * alpha))
      tf.summary.image("psv_avg_%d" % i,
                       (self.deprocess_image(0.5*ref_plane + 0.5*source_plane)))
      tf.summary.image("output_rgb_%d" % i,
                       self.deprocess_image(output_rgb))
      tf.summary.image("psv_ref_%d" % i, self.deprocess_image(ref_plane))
      tf.summary.image("psv_source_%d" % i, self.deprocess_image(source_plane))

    # Cumulative rendered images and refined MPI
    for i in range(num_summ):
      ind = tf.to_int32(i * num_mpi_planes/num_summ)
      rgb = rgba_layers_refine[:, :, :, ind, :3]
      alpha = rgba_layers_refine[:, :, :, ind, 3:]
      render = stuff_behind[:, :, :, ind, :3]
      input_colors = refine_input_mpi[:, :, :, ind, :3]
      tf.summary.image("rgb_layer_refine_%d" % i, self.deprocess_image(rgb))
      tf.summary.image("alpha_layer_refine_%d" % i, alpha)
      tf.summary.image("rgba_layer_refine_%d" % i,
                       self.deprocess_image(rgb * alpha))
      tf.summary.image("cumulative_render_%d" % i, self.deprocess_image(render))
      tf.summary.image("input_colors_refine_%d" % i,
                       self.deprocess_image(input_colors))

    return train_op