Beispiel #1
0
def conv2d(input_,
           output_dim,
           kernel=4,
           stride=2,
           use_sp=False,
           padding='SAME',
           scope="conv2d",
           use_bias=True):
    """Apply a square 2-D convolution, optionally spectrally normalized.

    Args:
        input_: Input tensor. Its last dimension is used as the number of
            input channels (presumably NHWC layout -- confirm with callers).
        output_dim: Number of output channels.
        kernel: Side length of the square kernel.
        stride: Stride applied to the two middle dimensions of the input.
        use_sp: When truthy, the kernel is passed through ``spectral_norm``
            before being used in the convolution.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.
        scope: Variable scope under which ``w`` and ``biases`` are created.
        use_bias: When truthy, a zero-initialized bias is added to the output.

    Returns:
        The output tensor of the convolution (with bias added if requested).
    """
    with tf.variable_scope(scope):
        w = tf.get_variable(
            'w', [kernel, kernel,
                  input_.get_shape()[-1], output_dim],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            regularizer=l2_regularizer(scale=0.0001))

        # Only the kernel differs between the plain and the spectrally
        # normalized mode, so select it once instead of duplicating the
        # convolution call.
        kernel_weights = spectral_norm(w) if use_sp else w
        conv = tf.nn.conv2d(input_,
                            kernel_weights,
                            strides=[1, stride, stride, 1],
                            padding=padding)

        if use_bias:
            biases = tf.get_variable('biases', [output_dim],
                                     initializer=tf.constant_initializer(0.0))
            # The reshape targets the tensor's own shape; it is kept so the
            # resulting graph stays the same as before.
            conv = tf.reshape(tf.nn.bias_add(conv, biases), tf.shape(conv))

        return conv
Beispiel #2
0
def fully_connect(input_,
                  output_size,
                  scope=None,
                  use_sp=False,
                  bias_start=0.0,
                  with_w=False):
    """Compute the affine projection ``input_ @ Matrix + bias``.

    Args:
        input_: Input tensor; its second dimension gives the number of rows
            of the weight matrix.
        output_size: Number of output units.
        scope: Variable scope name; ``"Linear"`` is used when falsy.
        use_sp: When truthy, the weight matrix is passed through
            ``spectral_norm`` before the multiplication.
        bias_start: Constant used to initialize the bias.
        with_w: When truthy, the weight matrix and bias variables are
            returned alongside the output.

    Returns:
        The projected tensor, or ``(output, matrix, bias)`` if ``with_w``.
    """
    in_dims = input_.get_shape().as_list()
    with tf.variable_scope(scope or "Linear"):
        weights = tf.get_variable(
            "Matrix", [in_dims[1], output_size],
            tf.float32,
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            regularizer=l2_regularizer(0.0001))
        offset = tf.get_variable(
            "bias", [output_size],
            tf.float32,
            initializer=tf.constant_initializer(bias_start))

        effective_weights = spectral_norm(weights) if use_sp else weights
        projected = tf.matmul(input_, effective_weights)
        result = projected + offset

        return (result, weights, offset) if with_w else result
Beispiel #3
0
 def _linear(self, input_tensor, output_nums, l2_reg, activation_fn=None):
     """Build a fully connected layer with optional L2 regularization.

     Args:
         input_tensor: Input to the layer.
         output_nums: Number of output units.
         l2_reg: L2 scale. When ``l2_reg <= 0`` no regularizers are passed;
             otherwise both weights and biases get an L2 regularizer with
             this scale.
         activation_fn: Activation forwarded to ``layers.fully_connected``.

     Returns:
         Whatever ``layers.fully_connected`` returns for these arguments.
     """
     unregularized = l2_reg <= 0
     shared_kwargs = dict(
         activation_fn=activation_fn,
         weights_initializer=layers.xavier_initializer(),
         biases_initializer=layers.xavier_initializer(),
     )
     if unregularized:
         return layers.fully_connected(input_tensor, output_nums,
                                       **shared_kwargs)

     return layers.fully_connected(
         input_tensor,
         output_nums,
         weights_regularizer=layers.l2_regularizer(l2_reg),
         biases_regularizer=layers.l2_regularizer(l2_reg),
         **shared_kwargs)
Beispiel #4
0
def encode_phrases(args, phrase_plh, train_phase_plh, num_phrases_plh, phrase_feature_dim, phrase_denom_plh, vecs):
    """Embed phrases and compute the concept-weight branch.

    Args:
        args: Options object; reads ``dim_embed``, ``embedding_ft``,
            ``language_model``, ``cca_parameters`` and ``num_embeddings``.
        phrase_plh: Word-index tensor; reshaped here to
            ``[-1, num_phrases_plh, phrase_feature_dim]``. Indices ``> 0``
            are counted as words (presumably 0 is padding -- confirm).
        train_phase_plh: Training-mode indicator forwarded to the helper
            layers.
        num_phrases_plh: Number of phrases per example.
        phrase_feature_dim: Number of word slots per phrase.
        phrase_denom_plh: Denominator used to normalize the concept loss.
        vecs: Initial word-embedding matrix (array with a ``shape``).

    Returns:
        Tuple ``(phrase_embed, concept_weights, concept_loss, embed_l2reg)``:
        the phrase embedding, the softmax of the concept branch output, the
        summed L1 norm (axis 2) of the pre-softmax concept output divided by
        ``phrase_denom_plh``, and the L2 distance of the word embeddings
        from ``vecs`` (``tf.zeros(1)`` when embeddings are not finetuned).
    """
    final_embed = args.dim_embed
    embed_dim = final_embed * 4
    phrase_plh = tf.reshape(phrase_plh, [-1, num_phrases_plh, phrase_feature_dim])
    # sometimes finetuning word embedding helps (with l2 reg), but often doesn't
    # seem to make a big difference
    word_embeddings = tf.get_variable('word_embeddings', vecs.shape, initializer=tf.constant_initializer(vecs), trainable = args.embedding_ft)
    embedded_words = tf.nn.embedding_lookup(word_embeddings, phrase_plh)

    # if you do finetune, penalize drifting away from the initial vectors
    embed_l2reg = tf.zeros(1)
    if args.embedding_ft:
        embed_l2reg = tf.nn.l2_loss(word_embeddings - vecs)

    eps = 1e-10
    if args.language_model == 'gru':
        # Flatten to one phrase per row so the RNN sees a plain batch.
        phrase_plh = tf.reshape(phrase_plh, [-1, phrase_feature_dim])
        source_sequence_length = tf.reduce_sum(tf.cast(phrase_plh > 0, tf.int32), 1)
        embedded_words = tf.reshape(embedded_words, [-1, phrase_feature_dim, vecs.shape[1]])
        encoder_cell = tf.nn.rnn_cell.GRUCell(final_embed)
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell, embedded_words, dtype=encoder_cell.dtype,
            sequence_length=source_sequence_length)
        # Pick the output at index (length - 1) for every phrase.
        final_outputs = extract_axis_1(encoder_outputs, source_sequence_length-1)
        phrase_input = tf.reshape(final_outputs, [-1, num_phrases_plh, final_embed])

        outputs = fully_connected(phrase_input, embed_dim, activation_fn = None,
                                  weights_regularizer = tf.contrib.layers.l2_regularizer(0.005),
                                  scope = 'phrase_encoder')
        phrase_embed = tf.nn.l2_normalize(outputs, 2, epsilon=eps)
    else:
        # Average the word vectors; eps avoids dividing by zero when a phrase
        # has no index > 0.
        num_words = tf.reduce_sum(tf.to_float(phrase_plh > 0), 2, keep_dims=True) + eps
        phrase_input = tf.nn.l2_normalize(tf.reduce_sum(embedded_words, 2) / num_words, 2)
        if args.language_model == 'attend':
            context_vector = tf.tile(tf.expand_dims(phrase_input, 2), (1, 1, phrase_feature_dim, 1))
            attention_inputs = tf.concat((context_vector, embedded_words), 3)
            attention_weights = fully_connected(attention_inputs, 1,
                                                weights_regularizer = l2_regularizer(0.0005),
                                                scope = 'self_attend')
            # Squeeze only the trailing unit axis: an axis-less squeeze would
            # also drop a batch or phrase dimension of size 1 and break the
            # expand_dims(..., 3) below.
            attention_weights = tf.nn.softmax(tf.squeeze(attention_weights, [3]))
            phrase_input = tf.nn.l2_normalize(tf.reduce_sum(embedded_words * tf.expand_dims(attention_weights, 3), 2), 2)
            phrase_input = tf.reshape(phrase_input, [-1, num_phrases_plh, vecs.shape[1]])

        if args.cca_parameters:
            # NOTE(review): pickle executes arbitrary code on load; only point
            # args.cca_parameters at trusted files.
            with open(args.cca_parameters, 'rb') as param_file:
                parameters = pickle.load(param_file)
            phrase_embed = setup_initialize_fc_layers(args, phrase_input, parameters, 'lang', train_phase_plh, norm_axis=2)
        else:
            phrase_embed = embedding_branch(phrase_input, embed_dim, train_phase_plh, 'phrase', norm_axis=2)

    concept_weights = embedding_branch(phrase_input, embed_dim, train_phase_plh, 'concept_weight',
                                       do_l2norm = False, outdim = args.num_embeddings)
    # The L1 penalty is taken on the raw branch output, before the softmax.
    concept_loss = tf.reduce_sum(tf.norm(concept_weights, axis=2, ord=1)) / phrase_denom_plh
    concept_weights = tf.nn.softmax(concept_weights)
    return phrase_embed, concept_weights, concept_loss, embed_l2reg
Beispiel #5
0
    def get_phrase_scores(self, phrase_embed, region_embed, concept_weights):
        """Score every phrase/region pair.

        Args:
            phrase_embed: Phrase embeddings; expanded at axis 2 to broadcast
                against the regions.
            region_embed: Region embeddings; expanded at axis 1 to broadcast
                against the phrases.
            concept_weights: Weights forwarded to each ``concept_layer``.

        Returns:
            Tuple ``(region_prob, joint_embed_3)``: the logistic of the
            scores and the raw scores themselves.
        """
        phrase_side = tf.expand_dims(phrase_embed, 2)
        region_side = tf.expand_dims(region_embed, 1)
        pairwise = phrase_side * region_side

        hidden = add_fc(pairwise, self.embed_dim, self.train_phase,
                        'joint_embed_1')

        # Concept ids are 1-based; the first layer seeds the accumulator and
        # the remaining ones are summed onto it.
        mixed = concept_layer(hidden, self.final_embed, self.train_phase, 1,
                              concept_weights)
        last_concept = self.args.num_embeddings
        for concept_id in range(2, last_concept + 1):
            mixed += concept_layer(hidden, self.final_embed,
                                   self.train_phase, concept_id,
                                   concept_weights)

        scores = fully_connected(mixed,
                                 1,
                                 activation_fn=None,
                                 weights_regularizer=l2_regularizer(0.005),
                                 scope='joint_embed_3')
        # Drop the trailing unit axis produced by the single-output layer.
        scores = tf.squeeze(scores, [3])
        region_prob = 1. / (1. + tf.exp(-scores))
        return region_prob, scores
Beispiel #6
0
def get_phrase_scores(args, phrase_embed, region_embed, train_phase_plh, concept_weights=None):
    """Score regions against phrases.

    Args:
        args: Options object; reads ``two_branch``, ``dim_embed`` and
            ``num_embeddings``.
        phrase_embed: Phrase embeddings.
        region_embed: Region embeddings.
        train_phase_plh: Training-mode indicator forwarded to the helper
            layers.
        concept_weights: Weights forwarded to each ``concept_layer``; not
            used in two-branch mode.

    Returns:
        In two-branch mode a single tensor: the product of the embeddings
        summed over axis 2. Otherwise the tuple
        ``(region_prob, joint_embed_3)`` holding the logistic of the scores
        and the raw scores. Note the differing return arity.
    """
    if args.two_branch:
        weighted = region_embed * tf.expand_dims(phrase_embed, 1)
        return tf.reduce_sum(weighted, 2)

    out_dim = args.dim_embed
    hidden_dim = out_dim * 4

    phrase_side = tf.expand_dims(phrase_embed, 2)
    region_side = tf.expand_dims(region_embed, 1)
    hidden = add_fc(phrase_side * region_side, hidden_dim, train_phase_plh,
                    'joint_embed_1')

    # Concept ids are 1-based; the first layer seeds the accumulator and the
    # remaining ones are summed onto it.
    mixed = concept_layer(hidden, out_dim, train_phase_plh, 1, concept_weights)
    for concept_id in range(2, args.num_embeddings + 1):
        mixed += concept_layer(hidden, out_dim, train_phase_plh,
                               concept_id, concept_weights)

    scores = fully_connected(mixed,
                             1,
                             activation_fn=None,
                             weights_regularizer=l2_regularizer(0.005),
                             scope='joint_embed_3')
    # Drop the trailing unit axis produced by the single-output layer.
    scores = tf.squeeze(scores, [3])
    region_prob = 1. / (1. + tf.exp(-scores))
    return region_prob, scores
Beispiel #7
0
    def construct_model(self):
        """Build the mixture-density training graph inside ``self.graph``.

        Creates the placeholders, the mixture distribution, the loss
        (negative log-likelihood plus L2 term) and the Adam training op,
        then initializes all global variables in ``self.session`` and
        creates a ``Saver``.

        Sets on ``self``: ``random``, ``global_step``, ``is_training``,
        ``x``, ``y``, ``T``, ``C``, ``coefs``, ``neg_log_pr``, ``train``,
        ``loss`` and ``saver``.
        """
        with self.graph.as_default():
            # Seed a NumPy generator from self.seed and derive the TF
            # graph-level seed from it.
            self.random = np.random.RandomState(self.seed)
            tf.compat.v1.set_random_seed(
                self.random.randint(1e10, dtype=np.int64))

            self.global_step = tf.Variable(0,
                                           trainable=False,
                                           name='global_step')
            # Defaults to False when not fed.
            self.is_training = tf.compat.v1.placeholder_with_default(
                False, [], name='is_training')

            x = self.x = tf.compat.v1.placeholder(dtype=tf.float32,
                                                  shape=[None, self.n_in],
                                                  name='x')
            y = self.y = tf.compat.v1.placeholder(dtype=tf.float32,
                                                  shape=[None, self.n_pred],
                                                  name='y')
            # T and C are only created and stored here; they are not used
            # in the rest of this method.
            T = self.T = tf.compat.v1.placeholder(dtype=tf.float32,
                                                  shape=None,
                                                  name='T')
            C = self.C = tf.compat.v1.placeholder(dtype=tf.float32,
                                                  shape=None,
                                                  name='C')
            estimate = self.forward(x)

            # NOTE(review): _debug_nan presumably returns NaN-check ops;
            # the coefficients are only computed after they have run.
            with tf.control_dependencies(
                    self._debug_nan([estimate, x], names=['estim', 'x'])):
                self.coefs = prior, mu, sigma = self.get_coefs(estimate)

            # Component distribution is looked up by name on tfd and mixed
            # with categorical weights given by prior.
            dist = getattr(tfd, self.distribution)(mu, sigma)
            prob = tfd.Categorical(probs=prior)
            mix = tfd.MixtureSameFamily(prob, dist)

            def impute():
                # Replace NaN targets with samples from the mixture and
                # average the log-probability over self.imputations draws.
                return tf.reduce_mean([
                    mix.log_prob(
                        tf.compat.v2.where(tf.math.is_nan(y), mix.sample(), y))
                    for _ in range(self.imputations)
                ], 0)

            # Only take the imputation path when y contains a NaN.
            likelihood = tf.compat.v2.cond(tf.reduce_any(tf.math.is_nan(y)),
                                           impute, lambda: mix.log_prob(y))
            neg_log_pr = tf.reduce_mean(-likelihood)
            # No weights list is passed, so which variables are penalized
            # depends on apply_regularization's default -- TODO confirm.
            l2_loss = tf_layers.apply_regularization(
                tf_layers.l2_regularizer(scale=self.l2))
            total_loss = neg_log_pr + l2_loss

            self.neg_log_pr = neg_log_pr

            # Make the training ops depend on everything registered in the
            # UPDATE_OPS collection.
            with tf.control_dependencies(
                    tf.compat.v1.get_collection(
                        tf.compat.v1.GraphKeys.UPDATE_OPS)):
                learn_rate = self.lr
                # learn_rate  = tf.train.polynomial_decay(self.lr, self.global_step, decay_steps=self.n_iter, end_learning_rate=self.lr/10)
                # Despite its name, train_op holds the optimizer object.
                train_op = tf.compat.v1.train.AdamOptimizer(learn_rate)
                grads, var = zip(*train_op.compute_gradients(total_loss))

                # Gate the update on the debug ops for every gradient and
                # the loss, named after the corresponding variable.
                with tf.control_dependencies(
                        self._debug_nan(
                            list(grads) + [total_loss],
                            names=[v.name.split(':')[0]
                                   for v in var] + ['loss'])):
                    self.train = train_op.apply_gradients(
                        zip(grads, var),
                        global_step=self.global_step,
                        name='train_op')
                    self.loss = tf.identity(total_loss, name='model_loss')

            tf.compat.v1.global_variables_initializer().run(
                session=self.session)
            self.saver = tf.compat.v1.train.Saver(max_to_keep=1,
                                                  save_relative_paths=True)
Beispiel #8
0
def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh,
                num_boxes_plh, num_phrases_plh, region_feature_dim,
                phrase_feature_dim, phrase_denom_plh):
    """Build the phrase-region scoring graph and its losses.

    Arguments:
    args -- options object; reads batch_size, dim_embed, decov,
            num_embeddings and embed_l1
    phrase_plh -- tensor containing the phrase features
    region_plh -- tensor containing the region features
    train_phase_plh -- indicator whether model is in training mode
    labels_plh -- indicates positive (1), negative (-1), or ignore (0)
    num_boxes_plh -- number of boxes per example in the batch
    num_phrases_plh -- number of phrases per example in the batch
    region_feature_dim -- dimensions of the region features
    phrase_feature_dim -- dimensions of the phrase features
    phrase_denom_plh -- denominator used to normalize the concept loss

    Returns:
    total_loss -- region loss + embed_l1 * concept loss + decov loss
    region_loss -- logistic loss for phrase-region prediction
    concept_loss -- L1 loss for the output of the concept weight branch
    region_prob -- each row contains the probability a region is associated with a phrase
    decov_loss -- sum of the collected decov terms
    """
    batch = args.batch_size
    labels_plh = tf.reshape(labels_plh,
                            [batch, num_phrases_plh, num_boxes_plh])
    region_plh = tf.reshape(region_plh,
                            [batch, num_boxes_plh, region_feature_dim])
    phrase_plh = tf.reshape(phrase_plh,
                            [batch, num_phrases_plh, phrase_feature_dim])

    out_dim = args.dim_embed
    hidden_dim = out_dim * 4
    decov_sites = args.decov

    # The two embedding branches thread the decov-loss list through: each
    # call receives the list gathered so far and returns the updated one.
    decov_terms, phrase_embed = embedding_branch(phrase_plh,
                                                 hidden_dim,
                                                 train_phase_plh,
                                                 'phrase',
                                                 norm_axis=2,
                                                 decov_loc=decov_sites,
                                                 decov_losses=[])
    decov_terms, region_embed = embedding_branch(region_plh,
                                                 hidden_dim,
                                                 train_phase_plh,
                                                 'region',
                                                 norm_axis=2,
                                                 decov_loc=decov_sites,
                                                 decov_losses=decov_terms)
    _, concept_weights = embedding_branch(phrase_plh,
                                          hidden_dim,
                                          train_phase_plh,
                                          'concept_weight',
                                          do_l2norm=False,
                                          outdim=args.num_embeddings)

    # The L1 penalty is taken on the raw branch output, before the softmax.
    concept_l1 = tf.norm(concept_weights, axis=2, ord=1)
    concept_loss = tf.reduce_sum(concept_l1) / phrase_denom_plh
    concept_weights = tf.nn.softmax(concept_weights)

    phrase_side = tf.expand_dims(phrase_embed, 2)
    region_side = tf.expand_dims(region_embed, 1)
    joint_embed_1 = add_fc(phrase_side * region_side, hidden_dim,
                           train_phase_plh, 'joint_embed_1')

    # Concept ids are 1-based; the first layer seeds the accumulator and the
    # remaining ones are summed onto it.
    joint_embed_2 = concept_layer(joint_embed_1, out_dim, train_phase_plh,
                                  1, concept_weights)
    for concept_id in range(2, args.num_embeddings + 1):
        joint_embed_2 += concept_layer(joint_embed_1, out_dim,
                                       train_phase_plh, concept_id,
                                       concept_weights)

    scores = fully_connected(joint_embed_2,
                             1,
                             activation_fn=None,
                             weights_regularizer=l2_regularizer(0.005),
                             scope='joint_embed_3')
    scores = tf.squeeze(scores, [3])
    region_prob = 1. / (1. + tf.exp(-scores))

    # |label| is 1 for positive/negative pairs and 0 for ignored ones, so it
    # doubles as the mask and (summed) as the normalizer.
    label_mask = tf.abs(labels_plh)
    num_samples = tf.reduce_sum(label_mask) + 0.00001
    pair_loss = tf.log(1 + tf.exp(-scores * labels_plh))
    region_loss = tf.reduce_sum(pair_loss * label_mask) / num_samples

    for site in decov_sites:
        if site == 'joint_embed_1':
            decov_terms.append(decov(joint_embed_1))
        elif site == 'joint_embed_2':
            decov_terms.append(decov(joint_embed_2))
    decov_loss = sum(decov_terms)

    weighted_concept = concept_loss * args.embed_l1
    total_loss = region_loss + weighted_concept + decov_loss
    return total_loss, region_loss, concept_loss, region_prob, decov_loss
Beispiel #9
0
    def encode_phrases(self):
        """Embed the phrases and compute the concept-weight branch.

        Reads ``self.phrases``, ``self.phrases_per_image``,
        ``self.phrase_length``, ``self.embeddings``, ``self.phrase_layer_dim``,
        ``self.parameters``, ``self.train_phase``, ``self.phrase_count`` and
        ``self.args`` (``embedding_ft``, ``language_model``,
        ``num_embeddings``). If ``self.train_phase`` is ``None`` the phrase
        placeholders are created first via ``set_phrase_placeholders``.

        Returns:
            Tuple ``(phrase_embed, concept_weights, concept_loss,
            embed_l2reg)``: the phrase embedding, the softmax of the concept
            branch output, the summed L1 norm (axis 2) of the pre-softmax
            concept output divided by ``self.phrase_count``, and the L2
            distance of the word embeddings from their initial values (a
            scalar zero when embeddings are not finetuned).
        """
        if self.train_phase is None:
            self.set_phrase_placeholders()

        phrase_plh = tf.reshape(
            self.phrases, [-1, self.phrases_per_image, self.phrase_length])

        # sometimes finetuning word embedding helps (with l2 reg), but often doesn't
        # seem to make a big difference
        word_embeddings = tf.get_variable('word_embeddings',
                                          self.embeddings.shape,
                                          initializer=tf.constant_initializer(
                                              self.embeddings),
                                          trainable=self.args.embedding_ft)

        embedded_words = tf.nn.embedding_lookup(word_embeddings, phrase_plh)

        embed_l2reg = tf.squeeze(tf.zeros(1))
        if self.args.embedding_ft:
            # Penalize drift from the initial vectors. These live in
            # self.embeddings; the previous bare name `vecs` is not defined
            # in this method.
            embed_l2reg = tf.nn.l2_loss(word_embeddings - self.embeddings)

        eps = 1e-10
        if self.args.language_model == 'gru':
            # Flatten to one phrase per row so the RNN sees a plain batch.
            phrases = tf.reshape(self.phrases, [-1, self.phrase_length])
            source_sequence_length = tf.reduce_sum(
                tf.cast(phrases > 0, tf.int32), 1)
            embedded_words = tf.reshape(
                embedded_words,
                [-1, self.phrase_length, self.embeddings.shape[1]])
            encoder_cell = tf.nn.rnn_cell.GRUCell(self.phrase_layer_dim[0])
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                embedded_words,
                dtype=encoder_cell.dtype,
                sequence_length=source_sequence_length)
            # Pick the output at index (length - 1) for every phrase.
            final_outputs = extract_axis_1(encoder_outputs,
                                           source_sequence_length - 1)
            phrase_input = tf.reshape(
                final_outputs,
                [-1, self.phrases_per_image, self.phrase_layer_dim[0]])

            outputs = fully_connected(
                phrase_input,
                self.phrase_layer_dim[1],
                activation_fn=None,
                weights_regularizer=tf.contrib.layers.l2_regularizer(0.005),
                scope='phrase_encoder')
            phrase_embed = tf.nn.l2_normalize(outputs, 2, epsilon=eps)
        else:
            # Average the word vectors; eps avoids dividing by zero when a
            # phrase has no index > 0.
            num_words = tf.reduce_sum(
                tf.to_float(phrase_plh > 0), 2, keep_dims=True) + eps
            phrase_input = tf.nn.l2_normalize(
                tf.reduce_sum(embedded_words, 2) / num_words, 2)
            if self.args.language_model == 'attend':
                context_vector = tf.tile(tf.expand_dims(phrase_input, 2),
                                         (1, 1, self.phrase_length, 1))
                attention_inputs = tf.concat((context_vector, embedded_words),
                                             3)
                attention_weights = fully_connected(
                    attention_inputs,
                    1,
                    weights_regularizer=l2_regularizer(0.0005),
                    scope='self_attend')
                # Squeeze only the trailing unit axis: an axis-less squeeze
                # would also drop a batch or phrase dimension of size 1 and
                # break the expand_dims(..., 3) below.
                attention_weights = tf.nn.softmax(
                    tf.squeeze(attention_weights, [3]))
                phrase_input = tf.nn.l2_normalize(
                    tf.reduce_sum(
                        embedded_words * tf.expand_dims(attention_weights, 3),
                        2), 2)
                phrase_input = tf.reshape(
                    phrase_input,
                    [-1, self.phrases_per_image, self.embeddings.shape[1]])

            if self.parameters is not None:
                phrase_embed = setup_initialize_fc_layers(
                    self.args, phrase_input, self.parameters, 'lang',
                    self.train_phase)
            else:
                phrase_embed = embedding_branch(phrase_input,
                                                self.phrase_layer_dim[0],
                                                self.phrase_layer_dim[1],
                                                self.train_phase, 'phrase')

        concept_weights = embedding_branch(phrase_input,
                                           self.phrase_layer_dim[0],
                                           self.args.num_embeddings,
                                           self.train_phase,
                                           'concept_weight',
                                           do_l2norm=False)

        # The L1 penalty is taken on the raw branch output, before the
        # softmax.
        concept_loss = tf.reduce_sum(tf.norm(concept_weights, axis=2,
                                             ord=1)) / self.phrase_count
        concept_weights = tf.nn.softmax(concept_weights)
        return phrase_embed, concept_weights, concept_loss, embed_l2reg