def call(self, xs, mask=None):
    assert len(xs) == 2
    # separate out input matrices
    # x1, x2: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    x1, x2 = xs
    # build alignment matrix
    # alpha: (BATCH_SIZE, MAX_TIMESTEPS, MAX_TIMESTEPS)
    alpha = K.softmax(K.batch_dot(
        K.dot(x2, self.W), K.permute_dimensions(x1, (0, 2, 1))))
    # build context vectors
    # c1, c2: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    c1 = K.repeat_elements(
        K.sum(K.batch_dot(alpha, x2), axis=1, keepdims=True),
        self.max_timesteps, axis=1)
    c2 = K.repeat_elements(
        K.sum(K.batch_dot(K.permute_dimensions(alpha, (0, 2, 1)), x1),
              axis=1, keepdims=True),
        self.max_timesteps, axis=1)
    # build attention vectors
    # o1t, o2t: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    o1t = K.tanh(K.dot(K.concatenate([c1, x1], axis=2), self.U1))
    o2t = K.tanh(K.dot(K.concatenate([c2, x2], axis=2), self.U2))
    # masking
    if mask is not None and mask[0] is not None:
        o1t *= K.cast(
            K.repeat_elements(K.expand_dims(mask[0], axis=2),
                              o1t.shape[2], 2),
            K.floatx())
    if mask is not None and mask[1] is not None:
        # the second output must be masked with mask[1], not mask[0]
        o2t *= K.cast(
            K.repeat_elements(K.expand_dims(mask[1], axis=2),
                              o2t.shape[2], 2),
            K.floatx())
    # sum over timesteps
    # o1, o2: (BATCH_SIZE, EMBED_SIZE)
    o1 = K.sum(o1t, axis=1)
    o2 = K.sum(o2t, axis=1)
    # merge the attention vectors according to merge_mode
    if self.merge_mode == "concat":
        return concatenate([o1, o2], axis=1)
    elif self.merge_mode == "diff":
        return add([o1, -o2])
    elif self.merge_mode == "prod":
        return multiply([o1, o2])
    elif self.merge_mode == "avg":
        return average([o1, o2])
    else:  # max
        return maximum([o1, o2])
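# A plain-numpy shape sketch of the alignment step above (illustration only;
# the layer itself operates on backend tensors). B, T, E are hypothetical sizes.
import numpy as np

B, T, E = 2, 5, 8
x1 = np.random.rand(B, T, E)
x2 = np.random.rand(B, T, E)
W = np.random.rand(E, E)

# scores[b, t, s] = (x2 W)[b, t] . x1[b, s], mirroring
# K.batch_dot(K.dot(x2, self.W), K.permute_dimensions(x1, (0, 2, 1)))
scores = np.einsum('bte,ef,bsf->bts', x2, W, x1)
alpha = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)  # softmax over last axis
assert alpha.shape == (B, T, T)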
def Model_sent2tag_MLP_1(sentvocabsize, tagvocabsize, sent_W, tag_W,
                         s2v_k, tag2v_k):
    input_sent = Input(shape=(1,), dtype='int32')
    sent_embedding = Embedding(input_dim=sentvocabsize,
                               output_dim=s2v_k,
                               input_length=1,
                               mask_zero=False,
                               trainable=False,
                               weights=[sent_W])(input_sent)
    input_tag = Input(shape=(1,), dtype='int32')
    tag_embedding = Embedding(input_dim=tagvocabsize,
                              output_dim=tag2v_k,
                              input_length=1,
                              mask_zero=False,
                              trainable=False,
                              weights=[tag_W])(input_tag)

    x1_1 = Flatten()(sent_embedding)
    x2_0 = Flatten()(tag_embedding)
    # x1_1 = Dense(100, activation='tanh')(x1_0)

    # element-wise comparison features (require s2v_k == tag2v_k)
    sub = subtract([x2_0, x1_1])
    mul = multiply([x2_0, x1_1])
    maxi = maximum([x2_0, x1_1])  # renamed from `max` to avoid shadowing the builtin
    avg = average([x2_0, x1_1])

    class_input = concatenate([x2_0, x1_1, sub, mul, maxi, avg], axis=-1)
    # class_input = Flatten()(class_input)
    class_mlp1 = Dense(200, activation='tanh')(class_input)
    class_mlp1 = Dropout(0.5)(class_mlp1)
    class_mlp2 = Dense(2)(class_mlp1)
    class_output = Activation('softmax', name='CLASS')(class_mlp2)

    # distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([mlp_x1_2, x2_0])
    # distance = dot([x1_0, x2_0], axes=-1, normalize=True)

    mymodel = Model([input_sent, input_tag], class_output)
    mymodel.compile(loss='categorical_crossentropy',
                    optimizer=optimizers.Adam(lr=0.001),
                    metrics=['acc'])
    return mymodel
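# Minimal construction sketch for the model above (hypothetical sizes; random
# tables stand in for pre-trained embeddings). Note s2v_k must equal tag2v_k,
# since the subtract/multiply/maximum/average merges are element-wise.
import numpy as np

sentvocab, tagvocab, k = 1000, 50, 100
sent_W = np.random.rand(sentvocab, k)
tag_W = np.random.rand(tagvocab, k)
m = Model_sent2tag_MLP_1(sentvocab, tagvocab, sent_W, tag_W, k, k)
m.summary()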
def call(self, xs, mask=None):
    assert len(xs) == 2
    # separate out input matrices
    # x1.shape == (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    # x2.shape == (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    x1, x2 = xs
    # build alignment matrix
    alpha = K.softmax(K.batch_dot(x1, x2, axes=(2, 2)))
    # align inputs
    # a1t, a2t: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    a1t = K.batch_dot(alpha, x2, axes=(1, 1))
    a2t = K.batch_dot(alpha, x1, axes=(2, 1))
    # produce aligned outputs
    # o1t, o2t: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
    o1t = K.tanh(K.dot(x1, self.U1) + K.dot(a1t, self.V1))
    o2t = K.tanh(K.dot(x2, self.U2) + K.dot(a2t, self.V2))
    # masking
    if mask is not None and mask[0] is not None:
        o1t *= K.cast(
            K.repeat_elements(K.expand_dims(mask[0], axis=2),
                              o1t.shape[2], 2),
            K.floatx())
    if mask is not None and mask[1] is not None:
        o2t *= K.cast(
            K.repeat_elements(K.expand_dims(mask[1], axis=2),
                              o2t.shape[2], 2),
            K.floatx())
    # average over timesteps
    # o1, o2: (BATCH_SIZE, EMBED_SIZE)
    o1 = K.mean(o1t, axis=1)
    o2 = K.mean(o2t, axis=1)
    # merge the attention vectors according to merge_mode
    if self.merge_mode == "concat":
        return concatenate([o1, o2], axis=1)
    elif self.merge_mode == "diff":
        return add([o1, -o2])
    elif self.merge_mode == "prod":
        return multiply([o1, o2])
    elif self.merge_mode == "avg":
        return average([o1, o2])
    else:  # max
        return maximum([o1, o2])
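# numpy sketch of the three batch_dot contractions above (hypothetical sizes),
# useful for checking which axes each K.batch_dot call contracts:
import numpy as np

B, T, E = 2, 4, 6
x1, x2 = np.random.rand(B, T, E), np.random.rand(B, T, E)
alpha = np.einsum('bie,bje->bij', x1, x2)    # K.batch_dot(x1, x2, axes=(2, 2))
a1t = np.einsum('bij,bie->bje', alpha, x2)   # K.batch_dot(alpha, x2, axes=(1, 1))
a2t = np.einsum('bij,bje->bie', alpha, x1)   # K.batch_dot(alpha, x1, axes=(2, 1))
assert a1t.shape == a2t.shape == (B, T, E)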
def CompoundNet_VGG19(include_top=True, weights=None,
                      input_tensor=None, input_shape=None,
                      fusion_strategy='concatenate', mode='fine_tuning',
                      pooling_mode='avg', classes=9,
                      data_augm_enabled=False):
    """Instantiates the CompoundNet VGG19 architecture, fine-tuned (in 2 steps) on the Human Rights Archive dataset.

    Optionally loads weights pre-trained on the Human Rights Archive database.

    # Arguments
        include_top: whether to include the 3 fully-connected layers at the top of the network.
        weights: one of `None` (random initialization),
            'HRA' (pre-training on Human Rights Archive),
            or the path to the weights file to be loaded.
        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
            to use as image input for the model.
        input_shape: optional shape tuple, only to be specified
            if `include_top` is False (otherwise the input shape
            has to be `(224, 224, 3)` (with `channels_last` data format)
            or `(3, 224, 224)` (with `channels_first` data format)).
            It should have exactly 3 input channels,
            and width and height should be no smaller than 48.
            E.g. `(200, 200, 3)` would be one valid value.
        fusion_strategy: one of `concatenate` (feature vectors of different sources are concatenated into one super-vector),
            `average` (the feature set is averaged)
            or `maximum` (selects the highest value from the corresponding features).
        mode: one of `feature_extraction` (freeze all but the penultimate layer and re-train the last Dense layer)
            or `fine_tuning` (unfreeze the lower convolutional layers and retrain more layers).
        pooling_mode: optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model will be
                the 4D tensor output of the last convolutional layer.
            - `avg` means that global average pooling will be applied
                to the output of the last convolutional layer, and thus
                the output of the model will be a 2D tensor.
            - `max` means that global max pooling will be applied.
        classes: optional number of classes to classify images into,
            only to be specified if `weights` argument is `None`.
        data_augm_enabled: whether to use the augmented samples during training.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `weights`.
    """
    if not (weights in {'HRA', None} or os.path.exists(weights)):
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization), `HRA` '
                         '(pre-training on Human Rights Archive), '
                         'or the path to the weights file to be loaded.')

    if not (fusion_strategy in {'concatenate', 'average', 'maximum'}):
        raise ValueError(
            'The `fusion_strategy` argument should be either '
            '`concatenate` (feature vectors of different sources are concatenated into one super-vector), '
            '`average` (the feature set is averaged) '
            'or `maximum` (selects the highest value from the corresponding features).')

    if not (pooling_mode in {'avg', 'max', 'flatten'}):
        raise ValueError('The `pooling_mode` argument should be either '
                         '`avg` (GlobalAveragePooling2D), `max` '
                         '(GlobalMaxPooling2D), '
                         'or `flatten` (Flatten).')

    if weights == 'HRA' and classes != 9:
        raise ValueError('If using `weights` as Human Rights Archive, '
                         '`classes` should be 9.')

    cache_subdir = 'HRA_models'

    # Determine proper input shape
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=224,
                                      min_size=48,
                                      data_format=K.image_data_format(),
                                      require_flatten=include_top,
                                      weights=weights)

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    # note: `input_tensor` is re-created here, so both base networks are fed
    # through this fresh 224x224x3 input regardless of the branch above
    input_tensor = Input(shape=(224, 224, 3))

    object_centric_model = VGG19(input_tensor=input_tensor,
                                 weights='imagenet',
                                 include_top=False)
    scene_centric_model = VGG16_Places365(input_tensor=input_tensor,
                                          weights='places',
                                          include_top=False)

    # retrieve the outputs
    object_model_output = object_centric_model.output
    scene_model_output = scene_centric_model.output

    # feed the extracted features to a merging layer
    if fusion_strategy == 'concatenate':
        merged = concatenate([object_model_output, scene_model_output])
    elif fusion_strategy == 'average':
        merged = average([object_model_output, scene_model_output])
    else:
        merged = maximum([object_model_output, scene_model_output])

    if include_top:
        if pooling_mode == 'avg':
            x = GlobalAveragePooling2D(name='GAP')(merged)
        elif pooling_mode == 'max':
            x = GlobalMaxPooling2D(name='GMP')(merged)
        elif pooling_mode == 'flatten':
            x = Flatten(name='FLATTEN')(merged)

        x = Dense(256, activation='relu', name='FC1')(x)  # add a fully-connected layer

        # When random init is enabled we want to include Dropout;
        # when loading a pre-trained HRA model we omit the Dropout layer
        # so the visualisations are done properly (there is an issue if it is included).
        if weights is None:
            x = Dropout(0.5, name='DROPOUT')(x)
        # and a logistic layer with the number of classes defined by the `classes` argument
        x = Dense(classes, activation='softmax', name='PREDICTIONS')(x)  # new softmax layer

    # Ensure that the model takes into account any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    # this is the transfer learning model we will train
    model = Model(inputs=inputs, outputs=x, name='CompoundNet-VGG19')

    # load weights
    if weights == 'HRA':
        if include_top:
            if mode == 'feature_extraction':
                for layer in object_centric_model.layers:
                    layer.trainable = False
                for layer in scene_centric_model.layers:
                    layer.trainable = False
            elif mode == 'fine_tuning':
                for layer in model.layers[:36]:
                    layer.trainable = False
                for layer in model.layers[36:]:
                    layer.trainable = True

            model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
                          loss='categorical_crossentropy')

            # The released weight-file constants follow a systematic naming
            # scheme, [AUGM_]{FEATURE_EXTRACTION|FINE_TUNING}_<FUSION>_FUSION_
            # <POOL>{_fname|_WEIGHTS_PATH}, so the matching pair is resolved
            # by name instead of enumerating every fusion/pooling branch.
            pool_tag = {'avg': 'AVG_POOL',
                        'max': 'MAX_POOL',
                        'flatten': 'FLATTEN'}[pooling_mode]
            base = '%s%s_%s_FUSION_%s' % ('AUGM_' if data_augm_enabled else '',
                                          mode.upper(),
                                          fusion_strategy.upper(),
                                          pool_tag)
            weights_path = get_file(globals()[base + '_fname'],
                                    globals()[base + '_WEIGHTS_PATH'],
                                    cache_subdir=cache_subdir)
        else:
            base = 'FINE_TUNING_%s_FUSION' % fusion_strategy.upper()
            weights_path = get_file(globals()[base + '_NO_TOP_fname'],
                                    globals()[base + '_WEIGHTS_PATH_NO_TOP'],
                                    cache_subdir=cache_subdir)

        model.load_weights(weights_path)

    return model
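# Usage sketch: build the full HRA-fine-tuned compound model (downloads the
# ImageNet/Places base weights and the matching HRA weight file on first use):
model = CompoundNet_VGG19(weights='HRA',
                          mode='fine_tuning',
                          fusion_strategy='concatenate',
                          pooling_mode='avg')
model.summary()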
def compoundNet_feature_extraction(object_centric_model='VGG16',
                                   scene_centric_model='VGG16_Places365',
                                   fusion_strategy='concatenate',
                                   pooling_mode='avg',
                                   classes=9,
                                   data_augm_enabled=False):
    """ConvNet as fixed feature extractor: take the convolutional base of a
    previously-trained network, run the new data through it, and train a new
    classifier on top of the output (i.e. train only the randomly initialised
    top layers while freezing all convolutional layers of the original model).

    # Arguments
        object_centric_model: one of `VGG16`, `VGG19` or `ResNet50`
        scene_centric_model: `VGG16_Places365`
        fusion_strategy: one of `concatenate` (feature vectors of different sources are concatenated into one super-vector),
            `average` (the feature set is averaged)
            or `maximum` (selects the highest value from the corresponding features).
        pooling_mode: optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model will be
                the 4D tensor output of the last convolutional layer.
            - `avg` means that global average pooling will be applied
                to the output of the last convolutional layer, and thus
                the output of the model will be a 2D tensor.
            - `max` means that global max pooling will be applied.
        classes: optional number of classes to classify images into,
            only to be specified if `weights` argument is `None`.
        data_augm_enabled: whether to use the augmented samples during training.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `object_centric_model`,
            `pooling_mode`, `fusion_strategy`, `scene_centric_model`
            or invalid input shape.
    """
    if not (object_centric_model in {'VGG16', 'VGG19', 'ResNet50'}):
        raise ValueError('The `object_centric_model` argument should be either '
                         '`VGG16`, `VGG19` or `ResNet50`. '
                         'Other models will be supported in future releases.')

    if not (pooling_mode in {'avg', 'max', 'flatten'}):
        raise ValueError('The `pooling_mode` argument should be either '
                         '`avg` (GlobalAveragePooling2D), `max` '
                         '(GlobalMaxPooling2D), '
                         'or `flatten` (Flatten).')

    if not (fusion_strategy in {'concatenate', 'average', 'maximum'}):
        raise ValueError(
            'The `fusion_strategy` argument should be either '
            '`concatenate` (feature vectors of different sources are concatenated into one super-vector), '
            '`average` (the feature set is averaged) '
            'or `maximum` (selects the highest value from the corresponding features).')

    if not (scene_centric_model in {'VGG16_Places365'}):
        raise ValueError('The `scene_centric_model` argument should be '
                         '`VGG16_Places365`. '
                         'Other models will be supported in future releases.')

    # Define the name of the model and its weights
    weights_name = 'compoundNet_feature_extraction_' \
                   + object_centric_model + '_' \
                   + fusion_strategy + '_fusion_' \
                   + pooling_mode + '_pool_weights_tf_dim_ordering_tf_kernels.h5'

    augm_samples_weights_name = 'augm_compoundNet_feature_extraction_' \
                                + object_centric_model + '_' \
                                + fusion_strategy + '_fusion_' \
                                + pooling_mode + '_pool_weights_tf_dim_ordering_tf_kernels.h5'

    model_log = logs_dir + 'compoundNet_feature_extraction_' \
                + object_centric_model + '_' \
                + fusion_strategy + '_fusion_' \
                + pooling_mode + '_pool_log.csv'
    csv_logger = CSVLogger(model_log, append=True, separator=',')

    input_tensor = Input(shape=(224, 224, 3))

    # create the base object_centric_model pre-trained model for warm-up
    if object_centric_model == 'VGG16':
        object_base_model = VGG16(input_tensor=input_tensor,
                                  weights='imagenet',
                                  include_top=False)
    elif object_centric_model == 'VGG19':
        object_base_model = VGG19(input_tensor=input_tensor,
                                  weights='imagenet',
                                  include_top=False)
    elif object_centric_model == 'ResNet50':
        tmp_model = ResNet50(input_tensor=input_tensor,
                             weights='imagenet',
                             include_top=False)
        object_base_model = Model(
            inputs=tmp_model.input,
            outputs=tmp_model.get_layer('activation_48').output)

    print('\n \n')
    print('The plain, object-centric `' + object_centric_model +
          '` pre-trained convnet was successfully initialised.\n')

    scene_base_model = VGG16_Places365(input_tensor=input_tensor,
                                       weights='places',
                                       include_top=False)
    print('The plain, scene-centric `' + scene_centric_model +
          '` pre-trained convnet was successfully initialised.\n')

    # retrieve the outputs
    object_base_model_output = object_base_model.output
    scene_base_model_output = scene_base_model.output

    # feed the extracted features to a merging layer
    if fusion_strategy == 'concatenate':
        merged = concatenate([object_base_model_output, scene_base_model_output])
    elif fusion_strategy == 'average':
        merged = average([object_base_model_output, scene_base_model_output])
    else:
        merged = maximum([object_base_model_output, scene_base_model_output])

    if pooling_mode == 'avg':
        x = GlobalAveragePooling2D(name='GAP')(merged)
    elif pooling_mode == 'max':
        x = GlobalMaxPooling2D(name='GMP')(merged)
    elif pooling_mode == 'flatten':
        x = Flatten(name='FLATTEN')(merged)

    x = Dense(256, activation='relu', name='FC1')(x)  # add a fully-connected layer

    # When random init is enabled we want to include Dropout;
    # when loading a pre-trained HRA model we omit the Dropout layer
    # so the visualisations are done properly (there is an issue if it is included).
    x = Dropout(0.5, name='DROPOUT')(x)
    # and a logistic layer with the number of classes defined by the `classes` argument
    predictions = Dense(classes, activation='softmax', name='PREDICTIONS')(x)  # new softmax layer

    # this is the transfer learning model we will train
    model = Model(inputs=object_base_model.input, outputs=predictions)

    print('Randomly initialised classifier was successfully added '
          'on top of the merged outputs. \n')

    print('Number of trainable weights before freezing the conv. bases '
          'of the respective original models: ' + str(len(model.trainable_weights)))

    # first: train only the top layers (which were randomly initialised),
    # i.e. freeze all convolutional layers of the preliminary base model
    for layer in object_base_model.layers:
        layer.trainable = False
    for layer in scene_base_model.layers:
        layer.trainable = False

    print('Number of trainable weights after freezing the conv. bases '
          'of the respective original models: ' + str(len(model.trainable_weights)))
    print('\n')

    # compile the model (should be done *after* setting layers to non-trainable)
    model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # The attribute model.metrics_names will give you the display labels
    # for the scalar outputs.
    # print(model.metrics_names)

    if data_augm_enabled:
        print('Using augmented samples for training. This may take a while! \n')
        t = now()
        history = model.fit_generator(augmented_train_generator,
                                      steps_per_epoch=nb_train_samples // batch_size,
                                      epochs=feature_extraction_epochs,
                                      callbacks=[csv_logger],
                                      class_weight=class_weight)
        print('Training time for re-training the last Dense layer '
              'using augmented samples: %s' % (now() - t))
        model.save_weights(feature_extraction_dir + augm_samples_weights_name)
        print('Model weights using augmented samples were saved as `'
              + augm_samples_weights_name + '`')
        print('\n')
    else:
        t = now()
        history = model.fit_generator(train_generator,
                                      steps_per_epoch=nb_train_samples // batch_size,
                                      epochs=feature_extraction_epochs,
                                      callbacks=[csv_logger],
                                      class_weight=class_weight)
        print('Training time for re-training the last Dense layer: %s' % (now() - t))
        model.save_weights(feature_extraction_dir + weights_name)
        print('Model weights were saved as `' + weights_name + '`')
        print('\n')

    return model
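# Usage sketch: run the feature-extraction warm-up. This assumes the
# module-level `logs_dir`, `train_generator`/`augmented_train_generator`,
# `batch_size`, `nb_train_samples`, `feature_extraction_epochs`,
# `class_weight` and `feature_extraction_dir` are configured as in the
# surrounding training script.
warm_up = compoundNet_feature_extraction(object_centric_model='VGG19',
                                         fusion_strategy='average',
                                         pooling_mode='avg',
                                         data_augm_enabled=False)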
def ParallelDenseNet121(num_inputs=4, input_size=224, nchannels=3,
                        nb_dense_block=4, growth_rate=32, nb_filter=64,
                        reduction=0.0, dropout_rate=0.0, weight_decay=1e-4,
                        classes=1000, num_gpu=1):
    '''Instantiate the DenseNet-121 architecture with `num_inputs` parallel input branches.

    # Arguments
        nb_dense_block: number of dense blocks to add to end
        growth_rate: number of filters to add per dense block
        nb_filter: initial number of filters
        reduction: reduction factor of transition blocks
        dropout_rate: dropout rate
        weight_decay: weight decay factor
        classes: optional number of classes to classify images
        num_gpu: number of GPUs to spread the parallel branches across

    # Returns
        A Keras model instance.
    '''
    eps = 1.1e-5

    # compute compression factor
    compression = 1.0 - reduction

    # Handle dimension ordering for different backends
    global concat_axis
    if K.image_dim_ordering() == 'tf':
        concat_axis = 3
        img_inputs = [
            Input(shape=(input_size, input_size, nchannels), name=f'input_{i}')
            for i in range(num_inputs)
        ]
    else:
        concat_axis = 1
        # each input needs a unique name, otherwise Keras raises on duplicates
        img_inputs = [
            Input(shape=(nchannels, input_size, input_size), name=f'input_{i}')
            for i in range(num_inputs)
        ]

    # From the architecture for ImageNet (Table 1 in the paper)
    nb_layers = [6, 12, 24, 16]  # For DenseNet-121

    # Initial convolution, shared across all input branches
    init_conv_layers = [
        ZeroPadding2D((3, 3)),
        Convolution2D(nb_filter, 7, 7, subsample=(2, 2), bias=False),
        BatchNormalization(epsilon=eps, axis=concat_axis),
        Scale(axis=concat_axis),
        Activation('relu'),
        ZeroPadding2D((1, 1)),
        MaxPooling2D((3, 3), strides=(2, 2), name='pool1')
    ]

    x = img_inputs.copy()
    x = allocate_layers(x=x, layers=init_conv_layers, num_gpu=num_gpu)
    # equivalent to:
    # for j in range(len(x)):
    #     for layer in init_conv_layers:
    #         x[j] = layer(x[j])

    # Add dense blocks
    for block_idx in range(nb_dense_block - 1):
        stage = block_idx + 2
        x, nb_filter = dense_block(x, stage, nb_layers[block_idx], nb_filter,
                                   growth_rate, dropout_rate=dropout_rate,
                                   weight_decay=weight_decay, num_gpu=num_gpu)

        # Add transition_block
        x = transition_block(x, stage, nb_filter, compression=compression,
                             dropout_rate=dropout_rate,
                             weight_decay=weight_decay, num_gpu=num_gpu)
        nb_filter = int(nb_filter * compression)

    final_stage = stage + 1
    x, nb_filter = dense_block(x, final_stage, nb_layers[-1], nb_filter,
                               growth_rate, dropout_rate=dropout_rate,
                               weight_decay=weight_decay, num_gpu=num_gpu)

    top_activation = 'softmax' if classes > 1 else 'sigmoid'
    finl_conv_layers = [
        BatchNormalization(epsilon=eps, axis=concat_axis),
        Scale(axis=concat_axis),
        Activation('relu'),
        GlobalAveragePooling2D(name='pool' + str(final_stage)),
        Dense(classes),
        Activation(top_activation, name='prob')
    ]

    x = allocate_layers(x=x, layers=finl_conv_layers, num_gpu=num_gpu)
    # equivalent to:
    # for j in range(len(x)):
    #     for layer in finl_conv_layers:
    #         x[j] = layer(x[j])

    # merge the parallel branches by element-wise maximum
    x = merge.maximum(x)

    model = Model(img_inputs, x, name='parallel_dense')

    return model
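# Construction sketch (hypothetical settings): four image inputs merged by
# element-wise maximum, with a sigmoid head since classes == 1. Requires the
# custom Scale layer and the dense_block/transition_block/allocate_layers
# helpers defined alongside this function.
model = ParallelDenseNet121(num_inputs=4, input_size=224, nchannels=3,
                            classes=1, num_gpu=1)
model.summary()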
def Model3_LSTM_BiLSTM_LSTM(wordvocabsize, targetvocabsize, charvobsize,
                            word_W, char_W,
                            input_fragment_lenth, input_leftcontext_lenth,
                            input_rightcontext_lenth, input_maxword_length,
                            w2v_k, c2v_k,
                            hidden_dim=200, batch_size=32,
                            optimizer='rmsprop'):
    hidden_dim = 100  # note: overrides the `hidden_dim` argument

    word_input_fragment = Input(shape=(input_fragment_lenth,), dtype='int32')
    word_embedding_fragment = Embedding(input_dim=wordvocabsize + 1,
                                        output_dim=w2v_k,
                                        input_length=input_fragment_lenth,
                                        mask_zero=False,
                                        trainable=True,
                                        weights=[word_W])(word_input_fragment)
    word_embedding_fragment = Dropout(0.5)(word_embedding_fragment)

    char_input_fragment = Input(shape=(input_fragment_lenth,
                                       input_maxword_length),
                                dtype='int32')
    char_embedding_fragment = TimeDistributed(
        Embedding(input_dim=charvobsize,
                  output_dim=c2v_k,
                  batch_input_shape=(batch_size, input_fragment_lenth,
                                     input_maxword_length),
                  mask_zero=False,
                  trainable=True,
                  weights=[char_W]))(char_input_fragment)
    char_cnn_fragment = TimeDistributed(
        Conv1D(50, 3, activation='relu', padding='valid'))
    char_embedding_fragment = char_cnn_fragment(char_embedding_fragment)
    char_embedding_fragment = TimeDistributed(
        GlobalMaxPooling1D())(char_embedding_fragment)
    char_embedding_fragment = Dropout(0.25)(char_embedding_fragment)

    word_input_leftcontext = Input(shape=(input_leftcontext_lenth,),
                                   dtype='int32')
    word_embedding_leftcontext = Embedding(
        input_dim=wordvocabsize + 1,
        output_dim=w2v_k,
        input_length=input_leftcontext_lenth,
        mask_zero=True,
        trainable=True,
        weights=[word_W])(word_input_leftcontext)
    word_embedding_leftcontext = Dropout(0.5)(word_embedding_leftcontext)

    # the character-level context inputs are part of the model signature
    # but are not embedded further below
    char_input_leftcontext = Input(shape=(input_leftcontext_lenth,
                                          input_maxword_length),
                                   dtype='int32')
    char_input_rightcontext = Input(shape=(input_rightcontext_lenth,
                                           input_maxword_length),
                                    dtype='int32')

    word_input_rightcontext = Input(shape=(input_rightcontext_lenth,),
                                    dtype='int32')
    word_embedding_rightcontext = Embedding(
        input_dim=wordvocabsize + 1,
        output_dim=w2v_k,
        input_length=input_rightcontext_lenth,
        mask_zero=True,
        trainable=True,
        weights=[word_W])(word_input_rightcontext)
    word_embedding_rightcontext = Dropout(0.5)(word_embedding_rightcontext)

    embedding_fragment = concatenate(
        [word_embedding_fragment, char_embedding_fragment], axis=-1)
    embedding_leftcontext = word_embedding_leftcontext
    embedding_rightcontext = word_embedding_rightcontext

    LSTM_leftcontext = LSTM(hidden_dim, go_backwards=False,
                            activation='tanh')(embedding_leftcontext)
    Rep_LSTM_leftcontext = RepeatVector(input_fragment_lenth)(LSTM_leftcontext)

    LSTM_rightcontext = LSTM(hidden_dim, go_backwards=True,
                             activation='tanh')(embedding_rightcontext)
    Rep_LSTM_rightcontext = RepeatVector(input_fragment_lenth)(LSTM_rightcontext)

    BiLSTM_fragment = Bidirectional(LSTM(hidden_dim // 2,
                                         activation='tanh',
                                         return_sequences=True),
                                    merge_mode='concat')(embedding_fragment)

    # use the repeated (timestep-aligned) context vectors so the shapes
    # match the per-timestep BiLSTM output
    context_ADD = add([Rep_LSTM_leftcontext, BiLSTM_fragment,
                       Rep_LSTM_rightcontext])
    context_subtract_l = subtract([BiLSTM_fragment, Rep_LSTM_leftcontext])
    context_subtract_r = subtract([BiLSTM_fragment, Rep_LSTM_rightcontext])
    context_average = average([Rep_LSTM_leftcontext, BiLSTM_fragment,
                               Rep_LSTM_rightcontext])
    context_maximum = maximum([Rep_LSTM_leftcontext, BiLSTM_fragment,
                               Rep_LSTM_rightcontext])

    embedding_mix = concatenate([embedding_fragment, BiLSTM_fragment,
                                 context_ADD, context_subtract_l,
                                 context_subtract_r, context_average,
                                 context_maximum], axis=-1)

    # BiLSTM_fragment = Bidirectional(LSTM(hidden_dim // 2, activation='tanh'),
    #                                 merge_mode='concat')(embedding_fragment)

    # multi-width convolutional decoders over the mixed representation
    decoderlayer1 = Conv1D(50, 1, activation='relu', strides=1,
                           padding='same')(embedding_mix)
    decoderlayer2 = Conv1D(50, 2, activation='relu', strides=1,
                           padding='same')(embedding_mix)
    decoderlayer3 = Conv1D(50, 3, activation='relu', strides=1,
                           padding='same')(embedding_mix)
    decoderlayer4 = Conv1D(50, 4, activation='relu', strides=1,
                           padding='same')(embedding_mix)

    CNNs_fragment = concatenate(
        [decoderlayer1, decoderlayer2, decoderlayer3, decoderlayer4], axis=-1)
    CNNs_fragment = Dropout(0.5)(CNNs_fragment)
    CNNs_fragment = GlobalMaxPooling1D()(CNNs_fragment)

    concat = Dropout(0.3)(CNNs_fragment)

    output = Dense(targetvocabsize, activation='softmax')(concat)

    Models = Model([word_input_fragment, word_input_leftcontext,
                    word_input_rightcontext, char_input_fragment,
                    char_input_leftcontext, char_input_rightcontext], output)
    Models.compile(loss='categorical_crossentropy',
                   optimizer=optimizers.RMSprop(lr=0.001),
                   metrics=['acc'])
    return Models
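# Minimal construction sketch for the model above (hypothetical sizes; random
# tables stand in for pre-trained word/char embeddings):
import numpy as np

wv, tv, cv, w2v_k, c2v_k = 5000, 10, 80, 100, 50
word_W = np.random.rand(wv + 1, w2v_k)
char_W = np.random.rand(cv, c2v_k)
m = Model3_LSTM_BiLSTM_LSTM(wv, tv, cv, word_W, char_W,
                            input_fragment_lenth=6,
                            input_leftcontext_lenth=20,
                            input_rightcontext_lenth=20,
                            input_maxword_length=15,
                            w2v_k=w2v_k, c2v_k=c2v_k)
m.summary()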