def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
    """Build the question-generator ("qgen") graph.

    At every step the decoder LSTM reads a word embedding concatenated
    with a reduced image embedding; the logits are scored against the
    dialogue shifted by one token (next-token prediction).

    Args:
        config: mapping read for ``config['image']['dim']``,
            ``'num_lstm_units'``, ``'picture_embedding_size'`` and
            ``'word_embedding_size'``.
        num_words: vocabulary size; used for the word embedding and as the
            width of the output layer.
        policy_gradient: when truthy, the baseline and the policy-gradient
            loss are also built, and the latter replaces ``self.loss``.
        device: forwarded to ``AbstractNetwork.__init__``.
        reuse: forwarded to the variable scope and to the LSTM cell.
    """
    AbstractNetwork.__init__(self, "qgen", device=device)

    # Create the scope for this graph
    with tf.variable_scope(self.scope_name, reuse=reuse):

        batch_dim = None  # the batch dimension is left dynamic

        # Picture
        self.images = tf.placeholder(
            tf.float32, [batch_dim] + config['image']["dim"], name='images')

        # Question
        self.dialogues = tf.placeholder(
            tf.int32, [batch_dim, None], name='dialogues')
        # 1 if keep and (1 q/a 1) for (START q/a STOP)
        self.answer_mask = tf.placeholder(
            tf.float32, [batch_dim, None], name='answer_mask')
        self.padding_mask = tf.placeholder(
            tf.float32, [batch_dim, None], name='padding_mask')
        self.seq_length = tf.placeholder(
            tf.int32, [batch_dim], name='seq_length')

        # Rewards
        self.cum_rewards = tf.placeholder(
            tf.float32, shape=[batch_dim, None], name='cum_reward')

        # DECODER hidden state (for beam search)
        lstm_units = config['num_lstm_units']
        state_shape = [batch_dim, lstm_units]
        # The default LSTM state is a zero vector, tiled to the runtime
        # batch size read from the image placeholder.
        default_state = tf.zeros([1, lstm_units])
        default_state = tf.tile(default_state, [tf.shape(self.images)[0], 1])

        self.decoder_zero_state_c = tf.placeholder_with_default(
            default_state, state_shape, name="state_c")
        self.decoder_zero_state_h = tf.placeholder_with_default(
            default_state, state_shape, name="state_h")
        initial_state = tf.contrib.rnn.LSTMStateTuple(
            c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

        # Misc
        self.is_training = tf.placeholder(tf.bool, name='is_training')
        self.greedy = tf.placeholder_with_default(
            False, shape=(), name="greedy")  # use for graph
        self.samples = None

        # Decoder inputs: drop the last token
        decoder_tokens = self.dialogues[:, :-1]
        decoder_lengths = self.seq_length - 1

        # Targets/rewards: drop the first token (= start token)
        rewards = self.cum_rewards[:, 1:]
        next_tokens = self.dialogues[:, 1:]

        # to understand the padding:
        # input
        #   <start>  is   it   a    blue   <?>   <yes>   is   it  a    car  <?>   <no>   <stop_dialogue>
        # target
        #    is      it   a   blue   <?>    -      is    it   a   car  <?>   -   <stop_dialogue>  -

        # Image processing: a rank-1 image "dim" is used as-is, anything
        # else goes through get_attention first.
        if len(config["image"]["dim"]) == 1:
            self.image_out = self.images
        else:
            # TODO: improve by using the previous lstm state?
            self.image_out = get_attention(self.images, None, "none")

        # Reduce the embedding size of the image
        with tf.variable_scope('picture_embedding'):
            self.picture_emb = utils.fully_connected(
                self.image_out, config['picture_embedding_size'])
            # Repeat the image embedding along the time axis so it can be
            # concatenated with every word embedding.
            tiled_picture = tf.expand_dims(self.picture_emb, 1)
            tiled_picture = tf.tile(
                tiled_picture, [1, tf.shape(decoder_tokens)[1], 1])

        # Compute the question embedding
        word_vectors = utils.get_embedding(
            decoder_tokens,
            n_words=num_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding")

        # Concat word embedding and picture embedding
        step_inputs = tf.concat(
            [word_vectors, tiled_picture], axis=2, name="concat_full_embedding")

        # Encode one word+picture
        lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            lstm_units,
            layer_norm=False,
            dropout_keep_prob=1.0,
            reuse=reuse)

        # TODO: use multi-layer RNN
        self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
            cell=lstm_cell,
            inputs=step_inputs,
            dtype=tf.float32,
            initial_state=initial_state,
            sequence_length=decoder_lengths,
            scope="word_decoder")

        longest = tf.reduce_max(self.seq_length)

        # Compute the softmax for evaluation
        with tf.variable_scope('decoder_output'):
            flat_states = tf.reshape(
                self.decoder_output, [-1, lstm_cell.output_size])
            flat_logits = utils.fully_connected(flat_states, num_words)

            # Retrieve the batch/dialogue format (ignore the STOP token)
            logits = tf.reshape(
                flat_logits,
                [tf.shape(self.seq_length)[0], longest - 1, num_words])

            self.softmax_output = tf.nn.softmax(logits, name="softmax")
            self.argmax_output = tf.argmax(logits, axis=2)

            self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=next_tokens)

        # Compute the maximum likelihood loss
        with tf.variable_scope('ml_loss'):
            token_loss = tf.identity(self.cross_entropy_loss)
            # remove answers (ignore the <stop> token)
            token_loss = token_loss * self.answer_mask[:, 1:]
            # remove padding (ignore the <start> token)
            token_loss = token_loss * self.padding_mask[:, 1:]

            # Count number of unmasked elements:
            # no_unpad - no_qa - START token
            unpadded = tf.reduce_sum(self.padding_mask)
            answers = tf.reduce_sum(1 - self.answer_mask[:, :-1])
            count = unpadded - answers - 1

            summed_loss = tf.reduce_sum(token_loss, axis=1)   # over dialogue dimension
            summed_loss = tf.reduce_sum(summed_loss, axis=0)  # over minibatch dimension
            self.ml_loss = summed_loss / count  # normalize

            self.loss = self.ml_loss

        # Compute policy gradient
        if policy_gradient:

            with tf.variable_scope('rl_baseline'):
                # Take the LSTM output (and stop the gradient!)
                frozen_states = tf.stop_gradient(self.decoder_output)

                flat_frozen = tf.reshape(
                    frozen_states, [-1, lstm_cell.output_size])
                baseline_hidden = utils.fully_connected(
                    flat_frozen, n_out=100, activation='relu',
                    scope='baseline_hidden')
                flat_baseline = utils.fully_connected(
                    baseline_hidden, 1, activation='relu', scope='baseline_out')

                baseline = tf.reshape(
                    flat_baseline, [tf.shape(self.seq_length)[0], longest - 1])
                baseline = baseline * self.answer_mask[:, 1:]
                baseline = baseline * self.padding_mask[:, 1:]
                self.baseline = baseline

            with tf.variable_scope('policy_gradient_loss'):

                # Compute log_prob
                log_policy = tf.identity(self.cross_entropy_loss)
                # remove answers (<=> predicted answer has maximum reward)
                # (ignore the START token in the mask)
                self.log_of_policy = log_policy * self.answer_mask[:, 1:]
                # No need to use padding mask as the discounted_reward is
                # already zero once the episode terminated

                # Policy gradient loss
                rewards = rewards * self.answer_mask[:, 1:]
                self.score_function = tf.multiply(
                    self.log_of_policy, rewards - self.baseline)  # score function

                self.baseline_loss = tf.reduce_sum(
                    tf.square(rewards - self.baseline))

                # Sum over the dialogue trajectory, then average the batch.
                trajectory_loss = tf.reduce_sum(self.score_function, axis=1)
                self.policy_gradient_loss = tf.reduce_mean(
                    trajectory_loss, axis=0)

                self.loss = self.policy_gradient_loss
def get_image_features(image, question, is_training, scope_name, config, dropout_keep=1., reuse=False, att=True):
    """Turn an image input into features according to ``config["image_input"]``.

    * ``"fc8"`` / ``"fc7"`` / ``"dummy"``: ``image`` is returned as-is, or
      L2-normalised along dim 1 when ``config["normalize"]`` is truthy.
    * ``"conv*"``: ``image`` is used directly as feature maps.
    * ``"raw*"``: a ResNet is built on ``image`` (``create_resnet``) with a
      ``ConditionalBatchNorm`` object; the maps are L2-normalised over
      dims ``[1, 2, 3]`` when ``config["normalize"]`` is truthy.

    For the last two cases the maps go through ``get_attention`` when
    ``att`` is true, otherwise they are returned unchanged.

    Args:
        image: image tensor (or precomputed features).
        question: passed to ``CBNfromLSTM`` and ``get_attention``.
        is_training: passed to ``ConditionalBatchNorm`` and ``create_resnet``.
        scope_name: scope handed to ``create_resnet``.
        config: mapping read for ``"image_input"``, ``"normalize"``,
            ``"cbn"``, ``"resnet_version"``, ``"resnet_out"`` and
            ``"attention"``.
        dropout_keep: forwarded to ``get_attention``.
        reuse: forwarded to ``get_attention``.
        att: whether to apply ``get_attention`` to the feature maps.

    Returns:
        The resulting image features.

    Raises:
        AssertionError: if ``config["image_input"]`` is not a supported type.
    """
    image_input_type = config["image_input"]

    # Extract feature from 1D-image features
    if image_input_type in ("fc8", "fc7", "dummy"):

        image_out = image
        if config.get('normalize', False):
            image_out = tf.nn.l2_normalize(image, dim=1, name="fc_normalization")

    elif image_input_type.startswith(("conv", "raw")):

        # Extract feature from raw images
        if image_input_type.startswith("raw"):

            # Create CBN
            cbn = None
            if "cbn" in config and config["cbn"].get("use_cbn", False) and question is not None:
                cbn_factory = CBNfromLSTM(question, no_units=config['cbn']["cbn_embedding_size"])

                excluded_scopes = config["cbn"].get('excluded_scope_names', [])
                cbn = ConditionalBatchNorm(cbn_factory, excluded_scope_names=excluded_scopes,
                                           is_training=is_training)
            else:
                # Classic batchnorm is buggy with slim networks
                # (https://github.com/tensorflow/tensorflow/issues/4887), so a
                # CBN that excludes every scope ('*') is always built instead,
                # i.e. the equivalent of the config
                # 'cbn': {'use_cbn': true, 'excluded_scope_names': ['*']}.
                # NOTE(review): this branch still reads config['cbn'] and
                # `question`, although it is reached precisely when "cbn" may
                # be missing or `question` may be None -- confirm that callers
                # always provide both.
                cbn_factory = CBNfromLSTM(question, no_units=config['cbn']["cbn_embedding_size"])
                excluded_scopes = ["*"]
                cbn = ConditionalBatchNorm(cbn_factory, excluded_scope_names=excluded_scopes,
                                           is_training=is_training)

            # Create ResNet
            resnet_version = config['resnet_version']
            image_feature_maps = create_resnet(image,
                                               is_training=is_training,
                                               scope=scope_name,
                                               cbn=cbn,
                                               resnet_version=resnet_version,
                                               resnet_out=config.get('resnet_out', "block4"))

            if config.get('normalize', False):
                image_feature_maps = tf.nn.l2_normalize(image_feature_maps, dim=[1, 2, 3])

        # Extract feature from 3D-image features
        else:
            image_feature_maps = image

        # Apply attention
        if att:
            image_out = get_attention(image_feature_maps, question,
                                      config=config["attention"],
                                      dropout_keep=dropout_keep,
                                      reuse=reuse)
        else:
            image_out = image_feature_maps

    else:
        # Raised explicitly: a bare `assert False` disappears under
        # `python -O` and would leave `image_out` unbound at the return.
        raise AssertionError("Wrong input type for image")

    return image_out
def __init__(self, config, num_words, num_answers, reuse=False, device=''):
    """Build the "clevr" FiLM graph: question RNN, FiLM stack, pooling, classifier.

    Args:
        config: mapping read for ``"dropout_keep_prob"``, ``"question"``,
            ``"image"``, ``"film_block"``, ``"pooling"`` and
            ``"classifier"``.
        num_words: vocabulary size of the word embedding.
        num_answers: number of output units of the final layer.
        reuse: forwarded to the variable scopes and layers.
        device: forwarded to ``ResnetModel.__init__``.
    """
    ResnetModel.__init__(self, "clevr", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse):

        batch_dim = None  # the batch dimension is left dynamic
        self._is_training = tf.placeholder(tf.bool, name="is_training")

        # Dropout is only active while training; keep-prob is 1.0 otherwise.
        train_keep_prob = float(config["dropout_keep_prob"])

        def _keep_prob_train():
            return tf.constant(train_keep_prob)

        def _keep_prob_eval():
            return tf.constant(1.0)

        dropout_keep = tf.cond(self._is_training, _keep_prob_train, _keep_prob_eval)

        #####################
        #   QUESTION
        #####################

        self._question = tf.placeholder(tf.int32, [batch_dim, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [batch_dim], name='seq_length')
        self._answer = tf.placeholder(tf.int64, [batch_dim], name='answer')

        question_cfg = config["question"]

        word_vectors = tfc_layers.embed_sequence(
            ids=self._question,
            vocab_size=num_words,
            embed_dim=question_cfg["word_embedding_dim"],
            scope="word_embedding",
            reuse=reuse)

        # Optionally append 300-d GloVe vectors fed from outside.
        if question_cfg['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_vectors = tf.concat([word_vectors, self._glove], axis=2)

        word_vectors = tf.nn.dropout(word_vectors, dropout_keep)

        _, question_state = rnn.rnn_factory(
            inputs=word_vectors,
            seq_length=self._seq_length,
            cell=question_cfg["cell"],
            num_hidden=question_cfg["rnn_state_size"],
            bidirectional=question_cfg["bidirectional"],
            max_pool=question_cfg["max_pool"],
            layer_norm=question_cfg["layer_norm"],
            reuse=reuse)

        question_state = tf.nn.dropout(question_state, dropout_keep)

        #####################
        #   IMAGES
        #####################

        self._image = tf.placeholder(
            tf.float32, [batch_dim] + config['image']["dim"], name='image')

        image_maps = get_image_features(
            image=self._image,
            is_training=self._is_training,
            config=config['image'])

        # Modulate the image features with the question state.
        with tf.variable_scope("image_film_stack", reuse=reuse):
            stack = FiLM_Stack(
                image=image_maps,
                film_input=question_state,
                is_training=self._is_training,
                config=config["film_block"],
                reuse=reuse)

            image_maps = stack.get()

        # Pool Image Features
        with tf.variable_scope("image_pooling"):
            pooled = get_attention(
                image_maps, question_state,
                is_training=self._is_training,
                config=config["pooling"],
                dropout_keep=dropout_keep,
                reuse=reuse)

        with tf.variable_scope("classifier"):
            batch_norm_params = {
                "center": True,
                "scale": True,
                "decay": 0.9,
                "is_training": self._is_training,
                "reuse": reuse,
            }
            self.hidden_state = tfc_layers.fully_connected(
                pooled,
                num_outputs=config["classifier"]["no_mlp_units"],
                normalizer_fn=tfc_layers.batch_norm,
                normalizer_params=batch_norm_params,
                activation_fn=tf.nn.relu,
                reuse=reuse,
                scope="classifier_hidden_layer")

            self.out = tfc_layers.fully_connected(
                self.hidden_state,
                num_outputs=num_answers,
                activation_fn=None,
                reuse=reuse,
                scope="classifier_softmax_layer")

        #####################
        #   Loss
        #####################

        self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.out, labels=self._answer, name='cross_entropy')
        self.loss = tf.reduce_mean(self.cross_entropy)

        self.softmax = tf.nn.softmax(self.out, name='answer_prob')
        # The argmax is taken on the logits: no need to compute the softmax.
        self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer')

        with tf.variable_scope('accuracy'):
            hits = tf.equal(self.prediction, self._answer)
            self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

        tf.summary.scalar('accuracy', self.accuracy)

        print('Model... build!')
def __init__(self, config, num_words, device='', reuse=False):
    """Build the "guesser" graph: score every candidate object against the dialogue.

    The dialogue is encoded by an RNN, optionally fused with image
    features, and compared (dot product) with an embedding of each
    candidate object built from its category and spatial features.

    Args:
        config: mapping read for ``"dropout_keep_prob"``, ``"question"``,
            ``"inputs"``, ``"image"``, ``"pooling"``, ``"fusion"``,
            ``"category"`` and ``"object"``.
        num_words: vocabulary size of the word embedding.
        device: forwarded to ``AbstractNetwork.__init__``.
        reuse: forwarded to the variable scopes and layers.
    """
    AbstractNetwork.__init__(self, "guesser", device=device)

    batch_size = None  # the batch dimension is left dynamic

    with tf.variable_scope(self.scope_name, reuse=reuse):

        self._is_training = tf.placeholder(tf.bool, name="is_training")

        # Dropout is only active while training; keep-prob is 1.0 otherwise.
        dropout_keep_scalar = float(config["dropout_keep_prob"])
        dropout_keep = tf.cond(self._is_training,
                               lambda: tf.constant(dropout_keep_scalar),
                               lambda: tf.constant(1.0))

        #####################
        #   DIALOGUE
        #####################

        self._dialogue = tf.placeholder(tf.int32, [batch_size, None], name='dialogue')
        self._seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length_dialogue')

        word_emb = tfc_layers.embed_sequence(
            ids=self._dialogue,
            vocab_size=num_words,
            embed_dim=config["question"]["word_embedding_dim"],
            scope="word_embedding",
            reuse=reuse)

        # Optionally append 300-d GloVe vectors fed from outside.
        if config["question"]['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_emb = tf.concat([word_emb, self._glove], axis=2)

        _, self.dialogue_embedding = rnn.rnn_factory(
            inputs=word_emb,
            seq_length=self._seq_length,
            cell=config['question']["cell"],
            num_hidden=config['question']["rnn_units"],
            bidirectional=config["question"]["bidirectional"],
            max_pool=config["question"]["max_pool"],
            layer_norm=config["question"]["layer_norm"],
            reuse=reuse)

        #####################
        #   IMAGE
        #####################

        self.img_embedding = None
        if config['inputs']['image']:

            self._image = tf.placeholder(tf.float32, [batch_size] + config['image']["dim"], name='image')

            # get image
            self.img_embedding = get_image_features(
                image=self._image,
                is_training=self._is_training,
                config=config['image'])

            # pool image feature if needed
            if len(self.img_embedding.get_shape()) > 2:
                with tf.variable_scope("image_pooling"):
                    self.img_embedding = get_attention(
                        self.img_embedding, self.dialogue_embedding,
                        is_training=self._is_training,
                        config=config["pooling"],
                        dropout_keep=dropout_keep,
                        reuse=reuse)

            # fuse vision/language
            # `config` is indexed directly: `config.get[...]` would subscript
            # the bound method itself and raise TypeError.
            self.visdiag_embedding = get_fusion_mechanism(
                input1=self.dialogue_embedding,
                input2=self.img_embedding,
                config=config["fusion"],
                dropout_keep=dropout_keep)
        else:
            self.visdiag_embedding = self.dialogue_embedding

        # Static size of the last axis; the object embeddings must match it.
        visdiag_dim = int(self.visdiag_embedding.get_shape()[-1])

        #####################
        #   OBJECTS
        #####################

        self._num_object = tf.placeholder(tf.int32, [batch_size], name='obj_seq_length')
        self._obj_cats = tf.placeholder(tf.int32, [batch_size, None], name='obj_cat')
        self._obj_spats = tf.placeholder(tf.float32, [batch_size, None, 8], name='obj_spat')

        cats_emb = tfc_layers.embed_sequence(
            ids=self._obj_cats,
            vocab_size=config['category']["n_categories"] + 1,  # we add the unknown category
            embed_dim=config['category']["embedding_dim"],
            scope="cat_embedding",
            reuse=reuse)

        # The spatial features are used raw (no upsampling layer).
        spatial_emb = self._obj_spats

        self.objects_input = tf.concat([cats_emb, spatial_emb], axis=2)

        with tf.variable_scope('obj_mlp'):
            h1 = tfc_layers.fully_connected(
                self.objects_input,
                num_outputs=config["object"]['no_mlp_units'],
                activation_fn=tf.nn.relu,
                scope='l1')

            obj_embeddings = tfc_layers.fully_connected(
                h1,
                num_outputs=visdiag_dim,
                activation_fn=tf.nn.relu,
                scope='l2')

        #####################
        #   SCORES
        #####################

        # Dot product between each object embedding and the dialogue
        # (+image) embedding, broadcast over the object axis.
        self.scores = obj_embeddings * tf.expand_dims(self.visdiag_embedding, axis=1)
        self.scores = tf.reduce_sum(self.scores, axis=2)

        # remove max for stability (trick)
        self.scores -= tf.reduce_max(self.scores, axis=1, keep_dims=True)

        with tf.variable_scope('object_mask', reuse=reuse):
            # Positions beyond the number of objects get a score of -inf.
            object_mask = tf.sequence_mask(self._num_object)
            score_mask_values = float("-inf") * tf.ones_like(self.scores)

            self.score_masked = tf.where(object_mask, self.scores, score_mask_values)

        #####################
        #   LOSS
        #####################

        # Targets
        self._targets = tf.placeholder(tf.int32, [batch_size], name="target_index")

        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self._targets, logits=self.score_masked)
        self.loss = tf.reduce_mean(self.loss)

        self.selected_object = tf.argmax(self.score_masked, axis=1)
        self.softmax = tf.nn.softmax(self.score_masked)

        with tf.variable_scope('accuracy'):
            # tf.argmax yields int64, hence the cast of the int32 targets.
            self.accuracy = tf.equal(self.selected_object, tf.cast(self._targets, tf.int64))
            self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32))
def __init__(self, config, no_words, no_answers, reuse=False, device=''):
    """Build the "oracle" graph: question RNN, side inputs, visual inputs, classifier.

    Args:
        config: mapping read for ``"dropout_keep_prob"``, ``"question"``,
            ``"category"``, ``"spatial"``, ``"inputs"``, ``"image"`` /
            ``"crop"``, ``"film_input"``, ``"film_block"``, ``"pooling"``
            and ``"classifier"``.
        no_words: vocabulary size of the word embedding.
        no_answers: number of answer classes (width of the answer
            placeholder and of the output layer).
        reuse: forwarded to the variable scopes and layers.
        device: forwarded to ``ResnetModel.__init__``.
    """
    ResnetModel.__init__(self, "oracle", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse):

        self.batch_size = None  # the batch dimension is left dynamic
        self._is_training = tf.placeholder(tf.bool, name="is_training")

        # Dropout is only active while training; keep-prob is 1.0 otherwise.
        train_keep_prob = float(config["dropout_keep_prob"])

        def _keep_prob_train():
            return tf.constant(train_keep_prob)

        def _keep_prob_eval():
            return tf.constant(1.0)

        dropout_keep = tf.cond(self._is_training, _keep_prob_train, _keep_prob_eval)

        #####################
        #   QUESTION
        #####################

        self._question = tf.placeholder(tf.int32, [self.batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [self.batch_size], name='seq_length')
        self._answer = tf.placeholder(tf.int64, [self.batch_size, no_answers], name='answer')

        question_cfg = config["question"]

        word_vectors = tfc_layers.embed_sequence(
            ids=self._question,
            vocab_size=no_words,
            embed_dim=question_cfg["word_embedding_dim"],
            scope="word_embedding",
            reuse=reuse)

        # Optionally append 300-d GloVe vectors fed from outside.
        if question_cfg['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_vectors = tf.concat([word_vectors, self._glove], axis=2)

        word_vectors = tf.nn.dropout(word_vectors, dropout_keep)

        self.rnn_states, self.last_rnn_states = rnn.rnn_factory(
            inputs=word_vectors,
            seq_length=self._seq_length,
            cell=question_cfg["cell"],
            num_hidden=question_cfg["rnn_state_size"],
            bidirectional=question_cfg["bidirectional"],
            max_pool=question_cfg["max_pool"],
            layer_norm=question_cfg["layer_norm"],
            reuse=reuse)

        #####################
        #   SIDE INPUTS
        #####################

        # Category: only built when some "category" entry of the config is set.
        cat_emb = None
        if any(get_recursively(config, "category", no_field_recursive=True)):
            self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')

            cat_emb = tfc_layers.embed_sequence(
                ids=self._category,
                vocab_size=config['category']["n_categories"] + 1,
                embed_dim=config['category']["embedding_dim"],
                scope="category_embedding",
                reuse=reuse)
            cat_emb = tf.nn.dropout(cat_emb, dropout_keep)

        # Spatial: only built when some "spatial" entry of the config is set.
        spatial_emb = None
        if any(get_recursively(config, "spatial", no_field_recursive=True)):
            self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')

            spatial_emb = tfc_layers.fully_connected(
                self._spatial,
                num_outputs=config["spatial"]["no_mlp_units"],
                activation_fn=tf.nn.relu,
                reuse=reuse,
                scope="spatial_upsampling")
            spatial_emb = tf.nn.dropout(spatial_emb, dropout_keep)

        self.classifier_input = []

        #####################
        #   IMAGES / CROP
        #####################

        for visual_name in ["image", "crop"]:

            # Skip the visual inputs that are switched off.
            if not config["inputs"][visual_name]:
                continue

            # Load Image Features
            visual_features = tf.placeholder(
                tf.float32,
                shape=[self.batch_size] + config[visual_name]["dim"],
                name=visual_name)

            with tf.variable_scope(visual_name, reuse=reuse):
                visual_features = get_image_features(
                    image=visual_features,
                    config=config[visual_name],
                    is_training=self._is_training)

            # Modulate Image Features
            if "film_input" in config:

                # Retrieve configuration
                film_config = config["film_input"]
                block_config = config["film_block"]

                # Load object mask: it shares the first three static dims
                # of the feature maps, plus a trailing axis of size 1.
                mask = tf.placeholder(
                    tf.float32,
                    visual_features.get_shape()[:3],
                    name='{}_mask'.format(visual_name))
                mask = tf.expand_dims(mask, axis=-1)

                # Perform the actual modulation
                with tf.variable_scope("{}_modulation".format(visual_name)):

                    # Extra vectors handed to the FiLM stack as film_input.
                    extra_context = []
                    with tf.variable_scope("{}_film_input".format(visual_name), reuse=reuse):

                        if film_config["category"]:
                            extra_context.append(cat_emb)

                        if film_config["spatial"]:
                            extra_context.append(spatial_emb)

                        if film_config["mask"]:
                            map_shape = visual_features.get_shape()
                            mask_dim = int(map_shape[1]) * int(map_shape[2])
                            flat_mask = tf.reshape(mask, shape=[-1, mask_dim])
                            extra_context.append(flat_mask)

                    with tf.variable_scope("{}_reading_cell".format(visual_name)):

                        reading_unit = create_reading_unit(
                            last_state=self.last_rnn_states,
                            states=self.rnn_states,
                            seq_length=self._seq_length,
                            config=film_config["reading_unit"],
                            reuse=reuse)

                        film_layer_fct = create_film_layer_with_reading_unit(reading_unit)

                    with tf.variable_scope("{}_film_stack".format(visual_name), reuse=reuse):

                        def append_extra_features(features, config):
                            # add the pixel location as two additional feature map
                            if config["spatial_location"]:
                                features = ft_utils.append_spatial_location(features)
                            # add the mask on the object as one additional feature map
                            if config["mask"]:
                                features = tf.concat([features, mask], axis=3)
                            return features

                        film_stack = FiLM_Stack(
                            image=visual_features,
                            film_input=extra_context,
                            film_layer_fct=film_layer_fct,
                            is_training=self._is_training,
                            config=block_config,
                            append_extra_features=append_extra_features,
                            reuse=reuse)

                        visual_features = film_stack.get()

            # Pool Image Features (only when they are still feature maps)
            if len(visual_features.get_shape()) > 2:
                with tf.variable_scope("{}_pooling".format(visual_name)):
                    visual_features = get_attention(
                        visual_features, self.last_rnn_states,
                        is_training=self._is_training,
                        config=config["pooling"],
                        dropout_keep=dropout_keep,
                        reuse=reuse)

            self.classifier_input.append(visual_features)

        #####################
        #   FINAL LAYER
        #####################

        with tf.variable_scope("classifier", reuse=reuse):

            classifier_cfg = config["classifier"]
            enabled = classifier_cfg["inputs"]

            if enabled["question"]:
                self.classifier_input.append(self.last_rnn_states)

            if enabled["category"]:
                self.classifier_input.append(cat_emb)

            if enabled["spatial"]:
                self.classifier_input.append(spatial_emb)

            assert self.classifier_input, "Please provide some inputs for the classifier!!!"
            self.classifier_input = tf.concat(self.classifier_input, axis=1)

            hidden = tfc_layers.fully_connected(
                self.classifier_input,
                num_outputs=classifier_cfg["no_mlp_units"],
                activation_fn=tf.nn.relu,
                reuse=reuse,
                scope="classifier_hidden_layer")
            self.hidden_state = hidden

            self.hidden_state = tf.nn.dropout(self.hidden_state, dropout_keep)
            self.out = tfc_layers.fully_connected(
                self.hidden_state,
                num_outputs=no_answers,
                activation_fn=None,
                reuse=reuse,
                scope="classifier_softmax_layer")

        #####################
        #   Loss
        #####################

        self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.out, labels=self._answer, name='cross_entropy')
        self.loss = tf.reduce_mean(self.cross_entropy)

        self.softmax = tf.nn.softmax(self.out, name='answer_prob')
        # The argmax is taken on the logits: no need to compute the softmax.
        self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer')

        self.success = tf.equal(self.prediction, tf.argmax(self._answer, axis=1))

        with tf.variable_scope('accuracy'):
            hits = tf.equal(self.prediction, tf.argmax(self._answer, axis=1))
            self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

        tf.summary.scalar('accuracy', self.accuracy)

        print('Model... build!')
def __init__(self, config, num_words, num_answers, device='', reuse=False):
    """Build the "clevr" baseline graph: question RNN, image features, fusion, MLP.

    Args:
        config: mapping read for ``"dropout_keep_prob"``, ``"question"``,
            ``"image"``, ``"cbn"`` (optional), ``"pooling"``, ``"fusion"``
            and ``"classifier"``.
        num_words: vocabulary size of the word embedding.
        num_answers: number of answer classes (width of the answer
            placeholder and of the output layer).
        device: forwarded to ``ResnetModel.__init__``.
        reuse: forwarded to the variable scopes and layers.
    """
    ResnetModel.__init__(self, "clevr", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse):

        batch_size = None  # the batch dimension is left dynamic
        self._is_training = tf.placeholder(tf.bool, name="is_training")

        # Dropout is only active while training; keep-prob is 1.0 otherwise.
        dropout_keep_scalar = float(config["dropout_keep_prob"])
        dropout_keep = tf.cond(self._is_training,
                               lambda: tf.constant(dropout_keep_scalar),
                               lambda: tf.constant(1.0))

        #####################
        #   QUESTION
        #####################

        self._question = tf.placeholder(tf.int32, [batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length')
        self._answer = tf.placeholder(tf.int64, [batch_size, num_answers], name='answer')

        word_emb = tfc_layers.embed_sequence(
            ids=self._question,
            vocab_size=num_words,
            embed_dim=config["question"]["word_embedding_dim"],
            scope="word_embedding",
            reuse=reuse)

        # Optionally append 300-d GloVe vectors fed from outside.
        if config["question"]['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_emb = tf.concat([word_emb, self._glove], axis=2)

        word_emb = tf.nn.dropout(word_emb, dropout_keep)

        _, last_rnn_state = rnn.rnn_factory(
            inputs=word_emb,
            seq_length=self._seq_length,
            cell=config["question"]["cell"],
            num_hidden=config["question"]["rnn_state_size"],
            bidirectional=config["question"]["bidirectional"],
            max_pool=config["question"]["max_pool"],
            layer_norm=config["question"]["layer_norm"],
            reuse=reuse)

        #####################
        #   IMAGES
        #####################

        self._image = tf.placeholder(tf.float32, [batch_size] + config['image']["dim"], name='image')

        # Conditional batch-norm is only built when configured.
        cbn = None
        if "cbn" in config:
            cbn = get_cbn(config["cbn"], last_rnn_state, dropout_keep, self._is_training)

        self.image_out = get_image_features(image=self._image,
                                            is_training=self._is_training,
                                            config=config['image'],
                                            cbn=cbn)

        # Pool the image features when they are still feature maps.
        if len(self.image_out.get_shape()) > 2:
            with tf.variable_scope("image_pooling"):
                self.image_out = get_attention(
                    self.image_out, last_rnn_state,
                    is_training=self._is_training,
                    config=config["pooling"],
                    dropout_keep=dropout_keep,
                    reuse=reuse)

        #####################
        #   FUSION
        #####################

        # `config` is indexed directly: `config.get[...]` would subscript
        # the bound method itself and raise TypeError.
        self.visdiag_embedding = get_fusion_mechanism(
            input1=self.image_out,
            input2=last_rnn_state,
            config=config["fusion"],
            dropout_keep=dropout_keep)

        #####################
        #   CLASSIFIER
        #####################

        with tf.variable_scope('mlp'):
            num_hiddens = config['classifier']['no_mlp_units']

            self.out = tfc_layers.fully_connected(self.visdiag_embedding, num_hiddens, activation_fn=tf.nn.relu)
            self.out = tfc_layers.fully_connected(self.out, num_answers, activation_fn=None)

        self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.out, labels=self._answer)
        self.loss = tf.reduce_mean(self.cross_entropy)

        self.softmax = tf.nn.softmax(self.out, name='answer_prob')
        # The argmax is taken on the logits: no need to compute the softmax.
        self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer')

        self.success = tf.equal(self.prediction, tf.argmax(self._answer, axis=1))

        with tf.variable_scope('accuracy'):
            self.accuracy = tf.equal(self.prediction, tf.argmax(self._answer, axis=1))
            self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32))

        print('Model... CLEVR (baseline) build!')
def get_image_features(image, question, is_training, scope_name, scope_feature, config, dropout_keep=1., reuse=False, co_attention=True):
    """Turn an image input into features according to ``config["image_input"]``.

    * ``"fc8"`` / ``"fc7"`` / ``"dummy"``: ``image`` is returned as-is, or
      L2-normalised along dim 1 when ``config["normalize"]`` is truthy.
    * ``"conv*"``: ``image`` is used directly as feature maps.
    * ``"raw*"``: a ResNet is built on ``image`` (``create_resnet``),
      optionally with conditional batch-norm; the maps are L2-normalised
      over dims ``[1, 2, 3]`` when ``config["normalize"]`` is truthy.

    For the last two cases the raw feature maps are returned when
    ``co_attention`` is true; otherwise they go through ``get_attention``.
    The function prints several debug traces on stdout.

    Args:
        image: image tensor (or precomputed features).
        question: passed to ``CBNfromLSTM`` and ``get_attention``.
        is_training: passed to ``ConditionalBatchNorm`` and ``create_resnet``.
        scope_name: ``scope`` argument of ``create_resnet``.
        scope_feature: ``scope_feature`` argument of ``create_resnet``.
        config: mapping read for ``"image_input"``, ``"normalize"``,
            ``"cbn"``, ``"resnet_version"``, ``"resnet_out"`` and
            ``"attention"``.
        dropout_keep: forwarded to ``get_attention``.
        reuse: forwarded to ``get_attention``.
        co_attention: when true, skip ``get_attention`` and return the
            feature maps themselves.

    Returns:
        The resulting image features.

    Raises:
        AssertionError: if ``config["image_input"]`` is not a supported type.
    """
    image_input_type = config["image_input"]

    # Extract feature from 1D-image features
    if image_input_type in ("fc8", "fc7", "dummy"):

        image_out = image
        if config.get('normalize', False):
            image_out = tf.nn.l2_normalize(image, dim=1, name="fc_normalization")

    elif image_input_type.startswith(("conv", "raw")):

        print("---------------------------------- Before IF")

        # Extract feature from raw images
        if image_input_type.startswith("raw"):

            # Create CBN
            cbn = None
            if config["cbn"].get("use_cbn", False):
                cbn_factory = CBNfromLSTM(
                    question, no_units=config['cbn']["cbn_embedding_size"])

                excluded_scopes = config["cbn"].get('excluded_scope_names', [])
                cbn = ConditionalBatchNorm(
                    cbn_factory,
                    excluded_scope_names=excluded_scopes,
                    is_training=is_training)

                # Debug traces stay inside this branch: `cbn_factory` and
                # `excluded_scopes` are only bound when CBN is enabled.
                print("Image = {} ".format(image))
                print("cbn_factory = {} ".format(cbn_factory))
                print("excluded_scopes = {} ".format(excluded_scopes))
                print("cbn = {} ".format(cbn))

            # Create ResNet
            resnet_version = config['resnet_version']
            # create_resnet returns a pair here; only the first item is used.
            image_feature_maps, _ = create_resnet(
                image,
                is_training=is_training,
                scope=scope_name,
                scope_feature=scope_feature,
                cbn=cbn,
                resnet_version=resnet_version,
                resnet_out=config.get('resnet_out', "block4"))

            print("-- image_feature_maps = {}".format(image_feature_maps))
            print("---------------------------------- After resnet_version")

            if config.get('normalize', False):
                image_feature_maps = tf.nn.l2_normalize(image_feature_maps, dim=[1, 2, 3])

        # Extract feature from 3D-image features
        else:
            image_feature_maps = image

        # Apply attention only when co-attention is not requested.
        image_out = image_feature_maps
        print("image_out 1= {}".format(image_out))

        if not co_attention:
            image_out = get_attention(image_feature_maps, question,
                                      config=config["attention"],
                                      dropout_keep=dropout_keep,
                                      reuse=reuse)

    else:
        # Raised explicitly: a bare `assert False` disappears under
        # `python -O` and would leave `image_out` unbound at the return.
        raise AssertionError("Wrong input type for image")

    print("---------------------------------- Finish image_out")

    return image_out