def conv2d(input_, output_dim, kernel=4, stride=2, use_sp=False,
           padding='SAME', scope="conv2d", use_bias=True):
    """Apply a square 2-D convolution, optionally with a spectrally normalized kernel.

    Args:
        input_: Input tensor. Its last static dimension is used as the
            number of input channels of the kernel.
        output_dim: Number of output channels.
        kernel: Height and width of the (square) kernel.
        stride: Stride used for the two middle dimensions; the strides
            passed to ``tf.nn.conv2d`` are ``[1, stride, stride, 1]``.
        use_sp: When truthy, the kernel is passed through
            ``spectral_norm`` before the convolution.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.
        scope: Variable scope in which ``w`` and ``biases`` are created.
        use_bias: When truthy, a zero-initialized bias is added.

    Returns:
        The convolution output tensor (with bias added if requested).
    """
    with tf.variable_scope(scope):
        w = tf.get_variable(
            'w',
            [kernel, kernel, input_.get_shape()[-1], output_dim],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            regularizer=l2_regularizer(scale=0.0001))

        # Plain truthiness instead of `use_sp != True`: any truthy flag now
        # enables spectral normalization, as in fully_connect.
        kernel_weights = spectral_norm(w) if use_sp else w
        conv = tf.nn.conv2d(input_, kernel_weights,
                            strides=[1, stride, stride, 1],
                            padding=padding)

        if use_bias:
            biases = tf.get_variable(
                'biases', [output_dim],
                initializer=tf.constant_initializer(0.0))
            # The reshape targets the tensor's own dynamic shape, so values
            # are unchanged; it is kept so the returned op stays the same.
            conv = tf.reshape(tf.nn.bias_add(conv, biases), tf.shape(conv))
        return conv
def fully_connect(input_, output_size, scope=None, use_sp=False,
                  bias_start=0.0, with_w=False):
    """Affine projection ``input_ @ Matrix + bias``.

    Args:
        input_: Input tensor; its second static dimension gives the number
            of rows of the weight matrix.
        output_size: Number of output units.
        scope: Variable scope name; ``"Linear"`` is used when falsy.
        use_sp: When truthy, the weight matrix is passed through
            ``spectral_norm`` before the matmul.
        bias_start: Constant used to initialize the bias.
        with_w: When truthy, the weight matrix and bias variables are
            returned alongside the output.

    Returns:
        The projected tensor, or ``(projected, matrix, bias)`` when
        ``with_w`` is truthy.
    """
    in_features = input_.get_shape().as_list()[1]

    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable(
            "Matrix",
            [in_features, output_size],
            tf.float32,
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            regularizer=l2_regularizer(0.0001))
        bias = tf.get_variable(
            "bias",
            [output_size],
            tf.float32,
            initializer=tf.constant_initializer(bias_start))

        # Only the multiplication uses the normalized weights; the raw
        # variable is what gets handed back to the caller.
        weights = spectral_norm(matrix) if use_sp else matrix
        projected = tf.matmul(input_, weights) + bias

        if not with_w:
            return projected
        return projected, matrix, bias
def _linear(self, input_tensor, output_nums, l2_reg, activation_fn=None):
    """Build a fully connected layer with Xavier-initialized weights and biases.

    Args:
        input_tensor: Tensor fed to ``layers.fully_connected``.
        output_nums: Number of output units.
        l2_reg: L2 regularization scale. When it is ``<= 0`` no regularizer
            is attached; otherwise both weights and biases are regularized
            with this scale.
        activation_fn: Activation forwarded to the layer (``None`` by
            default).

    Returns:
        The output tensor of ``layers.fully_connected``.
    """
    # Evaluate the threshold test first, exactly as the original branch did.
    skip_regularization = l2_reg <= 0

    layer_kwargs = {
        'activation_fn': activation_fn,
        'weights_initializer': layers.xavier_initializer(),
        'biases_initializer': layers.xavier_initializer(),
    }
    if not skip_regularization:
        layer_kwargs['weights_regularizer'] = layers.l2_regularizer(l2_reg)
        layer_kwargs['biases_regularizer'] = layers.l2_regularizer(l2_reg)

    return layers.fully_connected(input_tensor, output_nums, **layer_kwargs)
def encode_phrases(args, phrase_plh, train_phase_plh, num_phrases_plh,
                   phrase_feature_dim, phrase_denom_plh, vecs):
    """Embed phrases and compute the concept-weight branch.

    Args:
        args: Options object; reads ``dim_embed``, ``embedding_ft``,
            ``language_model``, ``cca_parameters`` and ``num_embeddings``.
        phrase_plh: Tensor of word indices; reshaped here to
            ``[-1, num_phrases_plh, phrase_feature_dim]``. Indices ``> 0``
            are counted as real words.
        train_phase_plh: Training-mode indicator forwarded to the helper
            layers.
        num_phrases_plh: Number of phrases per example used in reshapes.
        phrase_feature_dim: Number of word slots per phrase.
        phrase_denom_plh: Denominator used to normalize the concept L1 loss.
        vecs: Pretrained word-embedding matrix; initializes the
            ``word_embeddings`` variable and anchors the fine-tuning penalty.

    Returns:
        Tuple ``(phrase_embed, concept_weights, concept_loss, embed_l2reg)``.
    """
    final_embed = args.dim_embed
    embed_dim = final_embed * 4
    phrase_plh = tf.reshape(phrase_plh,
                            [-1, num_phrases_plh, phrase_feature_dim])

    # sometimes finetuning word embedding helps (with l2 reg), but often
    # doesn't seem to make a big difference
    word_embeddings = tf.get_variable(
        'word_embeddings', vecs.shape,
        initializer=tf.constant_initializer(vecs),
        trainable=args.embedding_ft)
    embedded_words = tf.nn.embedding_lookup(word_embeddings, phrase_plh)

    # Penalty that keeps fine-tuned embeddings near the pretrained vectors;
    # stays zero when the embeddings are frozen.
    embed_l2reg = tf.zeros(1)
    if args.embedding_ft:
        embed_l2reg = tf.nn.l2_loss(word_embeddings - vecs)

    eps = 1e-10
    if args.language_model == 'gru':
        phrase_plh = tf.reshape(phrase_plh, [-1, phrase_feature_dim])
        source_sequence_length = tf.reduce_sum(
            tf.cast(phrase_plh > 0, tf.int32), 1)
        embedded_words = tf.reshape(
            embedded_words, [-1, phrase_feature_dim, vecs.shape[1]])
        encoder_cell = tf.nn.rnn_cell.GRUCell(final_embed)
        encoder_outputs, _ = tf.nn.dynamic_rnn(
            encoder_cell, embedded_words,
            dtype=encoder_cell.dtype,
            sequence_length=source_sequence_length)
        # Pick the output at the last real word of every phrase.
        final_outputs = extract_axis_1(encoder_outputs,
                                       source_sequence_length - 1)
        phrase_input = tf.reshape(final_outputs,
                                  [-1, num_phrases_plh, final_embed])

        outputs = fully_connected(
            phrase_input, embed_dim,
            activation_fn=None,
            weights_regularizer=tf.contrib.layers.l2_regularizer(0.005),
            scope='phrase_encoder')
        phrase_embed = tf.nn.l2_normalize(outputs, 2, epsilon=eps)
    else:
        # Average the word vectors; eps guards against empty phrases.
        num_words = tf.reduce_sum(
            tf.to_float(phrase_plh > 0), 2, keep_dims=True) + eps
        phrase_input = tf.nn.l2_normalize(
            tf.reduce_sum(embedded_words, 2) / num_words, 2)

        if args.language_model == 'attend':
            context_vector = tf.tile(tf.expand_dims(phrase_input, 2),
                                     (1, 1, phrase_feature_dim, 1))
            attention_inputs = tf.concat((context_vector, embedded_words), 3)
            attention_weights = fully_connected(
                attention_inputs, 1,
                weights_regularizer=l2_regularizer(0.0005),
                scope='self_attend')
            # Squeeze only the trailing unit axis: an unqualified squeeze
            # would also drop a batch/phrase axis of size 1 and break the
            # expand_dims(..., 3) below.
            attention_weights = tf.nn.softmax(
                tf.squeeze(attention_weights, [3]))
            phrase_input = tf.nn.l2_normalize(
                tf.reduce_sum(
                    embedded_words * tf.expand_dims(attention_weights, 3), 2),
                2)

        phrase_input = tf.reshape(phrase_input,
                                  [-1, num_phrases_plh, vecs.shape[1]])
        if args.cca_parameters:
            # NOTE(security): pickle executes arbitrary code on load; only
            # point args.cca_parameters at trusted files.
            with open(args.cca_parameters, 'rb') as param_file:
                parameters = pickle.load(param_file)
            phrase_embed = setup_initialize_fc_layers(
                args, phrase_input, parameters, 'lang', train_phase_plh,
                norm_axis=2)
        else:
            phrase_embed = embedding_branch(
                phrase_input, embed_dim, train_phase_plh, 'phrase',
                norm_axis=2)

    concept_weights = embedding_branch(
        phrase_input, embed_dim, train_phase_plh, 'concept_weight',
        do_l2norm=False, outdim=args.num_embeddings)
    # L1 penalty on the raw concept logits, taken before the softmax.
    concept_loss = tf.reduce_sum(
        tf.norm(concept_weights, axis=2, ord=1)) / phrase_denom_plh
    concept_weights = tf.nn.softmax(concept_weights)
    return phrase_embed, concept_weights, concept_loss, embed_l2reg
def get_phrase_scores(self, phrase_embed, region_embed, concept_weights):
    """Score every phrase/region pair with the concept-weighted joint network.

    Args:
        phrase_embed: Phrase embeddings; expanded at axis 2 before pairing.
        region_embed: Region embeddings; expanded at axis 1 before pairing.
        concept_weights: Per-phrase concept weights forwarded to every
            ``concept_layer`` call.

    Returns:
        Tuple ``(region_prob, joint_embed_3)``: the logistic of the scores
        and the raw scores (last axis squeezed away).
    """
    # Broadcasting the two expanded tensors yields one product per pair.
    phrase_side = tf.expand_dims(phrase_embed, 2)
    region_side = tf.expand_dims(region_embed, 1)
    pair_features = add_fc(phrase_side * region_side, self.embed_dim,
                           self.train_phase, 'joint_embed_1')

    # Concept 1 is always built; the remaining concepts are accumulated.
    concept_mix = concept_layer(pair_features, self.final_embed,
                                self.train_phase, 1, concept_weights)
    for concept_id in range(2, self.args.num_embeddings + 1):
        concept_mix = concept_mix + concept_layer(
            pair_features, self.final_embed, self.train_phase,
            concept_id, concept_weights)

    scores = fully_connected(concept_mix, 1,
                             activation_fn=None,
                             weights_regularizer=l2_regularizer(0.005),
                             scope='joint_embed_3')
    scores = tf.squeeze(scores, [3])

    region_prob = 1. / (1. + tf.exp(-scores))
    return region_prob, scores
def get_phrase_scores(args, phrase_embed, region_embed, train_phase_plh,
                      concept_weights=None):
    """Score phrase/region pairs.

    Args:
        args: Options object; reads ``two_branch``, ``dim_embed`` and
            ``num_embeddings``.
        phrase_embed: Phrase embeddings.
        region_embed: Region embeddings.
        train_phase_plh: Training-mode indicator forwarded to helper layers.
        concept_weights: Concept weights forwarded to ``concept_layer``
            (unused on the two-branch path).

    Returns:
        When ``args.two_branch`` is truthy, a single tensor holding the
        summed elementwise product over axis 2. Otherwise the tuple
        ``(region_prob, joint_embed_3)``.
    """
    if args.two_branch:
        # Two-branch shortcut: the score is a plain inner product.
        paired = region_embed * tf.expand_dims(phrase_embed, 1)
        return tf.reduce_sum(paired, 2)

    final_embed = args.dim_embed
    embed_dim = final_embed * 4

    phrase_side = tf.expand_dims(phrase_embed, 2)
    region_side = tf.expand_dims(region_embed, 1)
    pair_features = add_fc(phrase_side * region_side, embed_dim,
                           train_phase_plh, 'joint_embed_1')

    # Concept 1 is always built; the remaining concepts are accumulated.
    concept_mix = concept_layer(pair_features, final_embed, train_phase_plh,
                                1, concept_weights)
    for concept_id in range(2, args.num_embeddings + 1):
        concept_mix = concept_mix + concept_layer(
            pair_features, final_embed, train_phase_plh,
            concept_id, concept_weights)

    scores = fully_connected(concept_mix, 1,
                             activation_fn=None,
                             weights_regularizer=l2_regularizer(0.005),
                             scope='joint_embed_3')
    scores = tf.squeeze(scores, [3])

    region_prob = 1. / (1. + tf.exp(-scores))
    return region_prob, scores
def construct_model(self):
    """Build the mixture-model graph, loss, training op, and saver.

    Everything is created inside ``self.graph``. The method sets
    ``self.random``, ``self.global_step``, ``self.is_training``, the
    placeholders ``self.x``/``self.y``/``self.T``/``self.C``,
    ``self.coefs``, ``self.neg_log_pr``, ``self.train``, ``self.loss`` and
    ``self.saver``, and runs the global variable initializer in
    ``self.session``.

    NOTE(review): the block nesting below was reconstructed from a
    flattened source; confirm which statements belong inside each
    ``control_dependencies`` context.
    """
    with self.graph.as_default():
        # Graph-level seed is drawn from a RandomState seeded by self.seed.
        self.random = np.random.RandomState(self.seed)
        tf.compat.v1.set_random_seed(
            self.random.randint(1e10, dtype=np.int64))
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.is_training = tf.compat.v1.placeholder_with_default(
            False, [], name='is_training')
        # Input features and targets.
        x = self.x = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.n_in], name='x')
        y = self.y = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.n_pred], name='y')
        # T and C are exposed as attributes; the locals are not used below.
        T = self.T = tf.compat.v1.placeholder(dtype=tf.float32, shape=None, name='T')
        C = self.C = tf.compat.v1.placeholder(dtype=tf.float32, shape=None, name='C')
        estimate = self.forward(x)
        # Make the coefficient ops depend on the debug checks from _debug_nan.
        with tf.control_dependencies(
                self._debug_nan([estimate, x], names=['estim', 'x'])):
            self.coefs = prior, mu, sigma = self.get_coefs(estimate)
            # Component distribution class is looked up by name on tfd.
            dist = getattr(tfd, self.distribution)(mu, sigma)
            prob = tfd.Categorical(probs=prior)
            mix = tfd.MixtureSameFamily(prob, dist)

        def impute():
            # Replace NaN targets with samples from the mixture and average
            # the log-probability over self.imputations draws.
            return tf.reduce_mean([
                mix.log_prob(
                    tf.compat.v2.where(tf.math.is_nan(y), mix.sample(), y))
                for _ in range(self.imputations)
            ], 0)

        # Only take the imputation path when y actually contains NaNs.
        likelihood = tf.compat.v2.cond(tf.reduce_any(tf.math.is_nan(y)),
                                       impute, lambda: mix.log_prob(y))
        neg_log_pr = tf.reduce_mean(-likelihood)
        # No weights list is passed, so apply_regularization falls back to
        # its default collection of weights.
        l2_loss = tf_layers.apply_regularization(
            tf_layers.l2_regularizer(scale=self.l2))
        total_loss = neg_log_pr + l2_loss
        self.neg_log_pr = neg_log_pr
        # Run the graph's UPDATE_OPS before the training step.
        with tf.control_dependencies(
                tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.UPDATE_OPS)):
            learn_rate = self.lr
            # learn_rate = tf.train.polynomial_decay(self.lr, self.global_step, decay_steps=self.n_iter, end_learning_rate=self.lr/10)
            # Despite its name, train_op holds the optimizer object.
            train_op = tf.compat.v1.train.AdamOptimizer(learn_rate)
            grads, var = zip(*train_op.compute_gradients(total_loss))
            # Check every gradient and the loss before applying the update.
            with tf.control_dependencies(
                    self._debug_nan(
                        list(grads) + [total_loss],
                        names=[v.name.split(':')[0] for v in var] + ['loss'])):
                self.train = train_op.apply_gradients(
                    zip(grads, var), global_step=self.global_step, name='train_op')
                self.loss = tf.identity(total_loss, name='model_loss')
        tf.compat.v1.global_variables_initializer().run(
            session=self.session)
        self.saver = tf.compat.v1.train.Saver(max_to_keep=1, save_relative_paths=True)
def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh,
                num_boxes_plh, num_phrases_plh, region_feature_dim,
                phrase_feature_dim, phrase_denom_plh):
    """Describes the computational graph and returns the losses and outputs.

    Arguments:
    args -- command line arguments passed into the main function
    phrase_plh -- tensor containing the phrase features
    region_plh -- tensor containing the region features
    train_phase_plh -- indicator whether model is in training mode
    labels_plh -- indicates positive (1), negative (-1), or ignore (0)
    num_boxes_plh -- number of boxes per example in the batch
    num_phrases_plh -- number of phrases per example in the batch
    region_feature_dim -- dimensions of the region features
    phrase_feature_dim -- dimensions of the phrase features
    phrase_denom_plh -- denominator used to normalize the concept loss

    Returns:
    total_loss -- weighted combination of the region, concept and decov loss
    region_loss -- logistic loss for phrase-region prediction
    concept_loss -- L1 loss for the output of the concept weight branch
    region_prob -- each row contains the probability a region is associated
                   with a phrase
    decov_loss -- sum of the collected decov losses (0 when none requested)
    """
    labels_plh = tf.reshape(
        labels_plh, [args.batch_size, num_phrases_plh, num_boxes_plh])
    region_plh = tf.reshape(
        region_plh, [args.batch_size, num_boxes_plh, region_feature_dim])
    phrase_plh = tf.reshape(
        phrase_plh, [args.batch_size, num_phrases_plh, phrase_feature_dim])

    final_embed = args.dim_embed
    embed_dim = final_embed * 4
    decov_locations = args.decov
    decov_losses = []

    # The phrase and region branches thread the running decov loss list.
    decov_losses, phrase_embed = embedding_branch(
        phrase_plh, embed_dim, train_phase_plh, 'phrase', norm_axis=2,
        decov_loc=decov_locations, decov_losses=decov_losses)
    decov_losses, region_embed = embedding_branch(
        region_plh, embed_dim, train_phase_plh, 'region', norm_axis=2,
        decov_loc=decov_locations, decov_losses=decov_losses)
    __, concept_weights = embedding_branch(
        phrase_plh, embed_dim, train_phase_plh, 'concept_weight',
        do_l2norm=False, outdim=args.num_embeddings)

    # L1 penalty on the raw concept logits, taken before the softmax.
    concept_loss = tf.reduce_sum(
        tf.norm(concept_weights, axis=2, ord=1)) / phrase_denom_plh
    concept_weights = tf.nn.softmax(concept_weights)

    elementwise_prod = (tf.expand_dims(phrase_embed, 2)
                        * tf.expand_dims(region_embed, 1))
    joint_embed_1 = add_fc(elementwise_prod, embed_dim, train_phase_plh,
                           'joint_embed_1')
    joint_embed_2 = concept_layer(joint_embed_1, final_embed,
                                  train_phase_plh, 1, concept_weights)
    for concept_id in range(2, args.num_embeddings + 1):
        joint_embed_2 += concept_layer(joint_embed_1, final_embed,
                                       train_phase_plh, concept_id,
                                       concept_weights)

    joint_embed_3 = fully_connected(
        joint_embed_2, 1,
        activation_fn=None,
        weights_regularizer=l2_regularizer(0.005),
        scope='joint_embed_3')
    joint_embed_3 = tf.squeeze(joint_embed_3, [3])
    region_prob = 1. / (1. + tf.exp(-joint_embed_3))

    # |label| masks out ignored (0) entries; the small constant avoids a
    # division by zero when every label is ignored.
    ind_labels = tf.abs(labels_plh)
    num_samples = tf.reduce_sum(ind_labels) + 0.00001
    # softplus(z) == log(1 + exp(z)) but does not overflow to inf for
    # large z, which the explicit log/exp form does.
    region_loss = tf.reduce_sum(
        tf.nn.softplus(-joint_embed_3 * labels_plh) * ind_labels
    ) / num_samples

    for loc in decov_locations:
        if loc == 'joint_embed_1':
            decov_losses.append(decov(joint_embed_1))
        if loc == 'joint_embed_2':
            decov_losses.append(decov(joint_embed_2))

    decov_loss = sum(decov_losses)
    total_loss = region_loss + (concept_loss * args.embed_l1) + decov_loss
    return total_loss, region_loss, concept_loss, region_prob, decov_loss
def encode_phrases(self):
    """Embed the phrases held on the instance and build the concept branch.

    Reads ``self.phrases``, ``self.phrases_per_image``,
    ``self.phrase_length``, ``self.embeddings``, ``self.phrase_layer_dim``,
    ``self.parameters``, ``self.phrase_count``, ``self.train_phase`` and
    ``self.args``. If ``self.train_phase`` is ``None`` the phrase
    placeholders are created first via ``set_phrase_placeholders``.

    Returns:
        Tuple ``(phrase_embed, concept_weights, concept_loss, embed_l2reg)``.
    """
    if self.train_phase is None:
        self.set_phrase_placeholders()

    phrase_plh = tf.reshape(
        self.phrases, [-1, self.phrases_per_image, self.phrase_length])

    # sometimes finetuning word embedding helps (with l2 reg), but often
    # doesn't seem to make a big difference
    word_embeddings = tf.get_variable(
        'word_embeddings', self.embeddings.shape,
        initializer=tf.constant_initializer(self.embeddings),
        trainable=self.args.embedding_ft)
    embedded_words = tf.nn.embedding_lookup(word_embeddings, phrase_plh)

    embed_l2reg = tf.squeeze(tf.zeros(1))
    if self.args.embedding_ft:
        # Penalize drift from the pretrained vectors. The original used the
        # undefined name `vecs`; the pretrained matrix is self.embeddings.
        embed_l2reg = tf.nn.l2_loss(word_embeddings - self.embeddings)

    eps = 1e-10
    if self.args.language_model == 'gru':
        phrases = tf.reshape(self.phrases, [-1, self.phrase_length])
        source_sequence_length = tf.reduce_sum(
            tf.cast(phrases > 0, tf.int32), 1)
        embedded_words = tf.reshape(
            embedded_words,
            [-1, self.phrase_length, self.embeddings.shape[1]])
        encoder_cell = tf.nn.rnn_cell.GRUCell(self.phrase_layer_dim[0])
        encoder_outputs, _ = tf.nn.dynamic_rnn(
            encoder_cell, embedded_words,
            dtype=encoder_cell.dtype,
            sequence_length=source_sequence_length)
        # Pick the output at the last real word of every phrase.
        final_outputs = extract_axis_1(encoder_outputs,
                                       source_sequence_length - 1)
        phrase_input = tf.reshape(
            final_outputs,
            [-1, self.phrases_per_image, self.phrase_layer_dim[0]])

        outputs = fully_connected(
            phrase_input, self.phrase_layer_dim[1],
            activation_fn=None,
            weights_regularizer=tf.contrib.layers.l2_regularizer(0.005),
            scope='phrase_encoder')
        phrase_embed = tf.nn.l2_normalize(outputs, 2, epsilon=eps)
    else:
        # Average the word vectors; eps guards against empty phrases.
        num_words = tf.reduce_sum(
            tf.to_float(phrase_plh > 0), 2, keep_dims=True) + eps
        phrase_input = tf.nn.l2_normalize(
            tf.reduce_sum(embedded_words, 2) / num_words, 2)

        if self.args.language_model == 'attend':
            context_vector = tf.tile(tf.expand_dims(phrase_input, 2),
                                     (1, 1, self.phrase_length, 1))
            attention_inputs = tf.concat((context_vector, embedded_words), 3)
            attention_weights = fully_connected(
                attention_inputs, 1,
                weights_regularizer=l2_regularizer(0.0005),
                scope='self_attend')
            # Squeeze only the trailing unit axis: an unqualified squeeze
            # would also drop a batch/phrase axis of size 1 and break the
            # expand_dims(..., 3) below.
            attention_weights = tf.nn.softmax(
                tf.squeeze(attention_weights, [3]))
            phrase_input = tf.nn.l2_normalize(
                tf.reduce_sum(
                    embedded_words * tf.expand_dims(attention_weights, 3), 2),
                2)

        phrase_input = tf.reshape(
            phrase_input,
            [-1, self.phrases_per_image, self.embeddings.shape[1]])
        if self.parameters is not None:
            phrase_embed = setup_initialize_fc_layers(
                self.args, phrase_input, self.parameters, 'lang',
                self.train_phase)
        else:
            phrase_embed = embedding_branch(
                phrase_input, self.phrase_layer_dim[0],
                self.phrase_layer_dim[1], self.train_phase, 'phrase')

    concept_weights = embedding_branch(
        phrase_input, self.phrase_layer_dim[0], self.args.num_embeddings,
        self.train_phase, 'concept_weight', do_l2norm=False)
    # L1 penalty on the raw concept logits, taken before the softmax.
    concept_loss = tf.reduce_sum(
        tf.norm(concept_weights, axis=2, ord=1)) / self.phrase_count
    concept_weights = tf.nn.softmax(concept_weights)
    return phrase_embed, concept_weights, concept_loss, embed_l2reg