class TiedGraphAutoencoderFP(Layer):

    def __init__(self, inner_layer_arg, activ, bias, init,
                 original_atom_bond_features, tied_to=None, encode=False,
                 decode=False, activity_reg=None, **kwargs):
        # Initialise
        self.tied_to = tied_to
        self.encode = encode
        self.decode = decode
        self.original_atom_bond_features = original_atom_bond_features
        self.bias = bias
        self.reg = activity_reg

        if isinstance(inner_layer_arg, (int, np.int64)):
            self.fp_length = inner_layer_arg
            self.create_inner_layer_fn = lambda: DenseTied(
                self.fp_length,
                activation=activ,
                use_bias=bias,
                kernel_initializer=init,
                tied_to=self.tied_to,
                idx=None,
                activity_regularizer=self.reg,
                **kwargs)
        else:
            raise ValueError(
                'TiedGraphAutoencoderFP has to be initialised with fp_length.')

        super(TiedGraphAutoencoderFP, self).__init__(**kwargs)

    def build(self, inputs_shape):
        # Set the index for the DenseTied weight values
        # Import dimensions
        (max_atoms, _, num_atom_features, num_bond_features,
         _) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        # Add the dense layer that contains the trainable parameters,
        # initialised with the specified params (kwargs) and name
        self.trainable_weights = []
        self.non_trainable_weights = []
        inner_layer = self.create_inner_layer_fn()
        inner_layer_type = inner_layer.__class__.__name__.lower()
        inner_layer.name = self.name + '_inner_' + inner_layer_type

        # Initialise a TimeDistributed wrapper in order to parallelise the
        # dense layer across atoms
        inner_3D_layer_name = self.name + '_inner_timedistributed'
        self.inner_3D_layer = TimeDistributed(inner_layer,
                                              name=inner_3D_layer_name)

        # Build the TimeDistributed layer (which will build the Dense layer)
        if self.encode:
            self.inner_3D_layer.build(
                (None, max_atoms, num_atom_features + num_bond_features))
        else:
            self.inner_3D_layer.build((None, max_atoms, self.fp_length))

        # Store the inner 3D layer and its weights
        if self.tied_to is not None:
            self.non_trainable_weights.append(self.inner_3D_layer.layer.kernel)
            if self.bias:
                self.trainable_weights.append(self.inner_3D_layer.layer.bias)
        else:
            self.trainable_weights = self.inner_3D_layer.trainable_weights

    def call(self, inputs, mask=None):
        if self.encode:
            return self.encoder(inputs)
        elif self.decode:
            return self.decoder(inputs)

    def encoder(self, inputs):
        atoms, bonds, edges = inputs
        final_fp_out = self.process_through_layers(atoms, bonds, edges)
        return final_fp_out

    def decoder(self, inputs):
        fp_out, _, _ = inputs
        vxi_dot = self.inner_3D_layer(fp_out)
        return vxi_dot

    def process_through_layers(self, atoms, bonds, edges):
        # Create a matrix that stores, for each atom, its degree, and use it
        # to create a general atom mask (unused atoms are zero padded).
        # We have to use the edge vector for this because, in theory, a
        # convolution could lead to a zero vector for an atom that is present
        # in the molecule.
        atom_degrees = K.sum(K.cast(K.not_equal(edges, -1), dtype='float32'),
                             axis=-1, keepdims=True)
        general_atom_mask = K.cast(K.not_equal(atom_degrees, 0), K.floatx())

        # Sum the edge features for each atom
        summed_bond_features = K.sum(bonds, axis=-2)

        # Concatenate the summed atom and bond features
        atoms_bonds_features = keras.layers.Concatenate(axis=-1)(
            [atoms, summed_bond_features])

        # Compute fingerprint
        fingerprint_out_unmasked = self.inner_3D_layer(atoms_bonds_features)

        # Do explicit masking because TimeDistributed does not support masking
        fingerprint_out_masked = fingerprint_out_unmasked * general_atom_mask

        final_fp_out = fingerprint_out_masked
        # Sum across all atoms
        # final_fp_out = K.sum(fingerprint_out_masked, axis=-2, keepdims=False)
        return final_fp_out

    def compute_output_shape(self, inputs_shape):
        # Import dimensions
        (max_atoms, _, _, _,
         num_samples) = mol_shapes_to_dims(mol_shapes=inputs_shape)
        if self.encode:
            return (num_samples, max_atoms, self.fp_length)
        else:
            return (num_samples, max_atoms, self.original_atom_bond_features)

    def get_config(self):
        config = super(TiedGraphAutoencoderFP, self).get_config()
        # Store the config of the inner layer of the 3D wrapper
        inner_layer = self.inner_3D_layer.layer
        config['inner_layer_config'] = dict(
            config=inner_layer.get_config(),
            class_name=inner_layer.__class__.__name__)
        return config
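# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal sketch of how an encoder/decoder fingerprint pair could be tied
# together, assuming graph input tensors (atoms, bonds, edges) and the
# DenseTied helper defined elsewhere in this repository. The shapes and the
# exact object expected by `tied_to` depend on that helper, so treat this as
# illustrative only:
#
#   fp_encoder = TiedGraphAutoencoderFP(
#       fp_length, activ='relu', bias=True, init='glorot_uniform',
#       original_atom_bond_features=num_atom_features + num_bond_features,
#       encode=True)
#   fingerprint = fp_encoder([atoms, bonds, edges])
#
#   # The decoder reuses the encoder's kernel (non-trainable) via `tied_to`
#   # and only trains its own bias.
#   fp_decoder = TiedGraphAutoencoderFP(
#       fp_length, activ='relu', bias=True, init='glorot_uniform',
#       original_atom_bond_features=num_atom_features + num_bond_features,
#       tied_to=fp_encoder, decode=True)
#   reconstruction = fp_decoder([fingerprint, bonds, edges])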
class TiedGraphAutoencoder(Layer):

    def __init__(self, inner_layer_arg, activ, bias, init,
                 original_atom_bond_features=None, tied_to=None,
                 encode_only=False, decode_only=False, activity_reg=None,
                 **kwargs):
        # Initialise inner dense layers using the convolution width
        self.tied_to = tied_to
        self.encode_only = encode_only
        self.decode_only = decode_only
        self.bias = bias
        self.original_atom_bond_features = original_atom_bond_features
        self.activ = activ
        self.init = init
        self.reg = activity_reg

        # Case 1: check if conv_width is given
        if isinstance(inner_layer_arg, (int, np.int64)):
            self.conv_width = inner_layer_arg
            self.create_inner_layer_fn = lambda: DenseTied(
                self.conv_width,
                activation=self.activ,
                use_bias=bias,
                kernel_initializer=init,
                tied_to=self.tied_to,
                idx=self.idx,
                activity_regularizer=self.reg,
                **kwargs)

        # Case 2: check if an initialised keras layer is given
        elif isinstance(inner_layer_arg, Layer):
            assert inner_layer_arg.built == False, \
                'When initialising with a keras layer, it cannot be built.'
            _, self.conv_width = inner_layer_arg.get_output_shape_for(
                (None, None))
            # layer_from_config will mutate the config dict, therefore create a get fn
            self.create_inner_layer_fn = lambda: layer_from_config(
                dict(class_name=inner_layer_arg.__class__.__name__,
                     config=inner_layer_arg.get_config()))
        else:
            raise ValueError(
                'TiedGraphAutoencoder has to be initialised with 1) an int '
                'conv_width, 2) a keras layer instance, or 3) a function '
                'returning a keras layer instance.')

        super(TiedGraphAutoencoder, self).__init__(**kwargs)

    def build(self, inputs_shape):
        # Import dimensions
        (max_atoms, max_degree, num_atom_features, num_bond_features,
         _) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        # Add the dense layers (that contain trainable params)
        # (for each degree we convolve with a different weight matrix)
        self.trainable_weights = []
        self.non_trainable_weights = []
        self.inner_3D_layers = []
        self.all_layers = []

        self.idx = max_degree
        self_layer = self.create_inner_layer_fn()
        self_layer_type = self_layer.__class__.__name__.lower()
        self_layer.name = self.name + '_self_' + self_layer_type + '_'

        # TimeDistributed layer wrapper
        self.self_3D_layer_name = self.name + '_self_timedistributed'
        self.self_3D_layer = TimeDistributed(self_layer,
                                             name=self.self_3D_layer_name)
        if self.encode_only:
            self.self_3D_layer.build(
                (None, max_atoms, num_atom_features + num_bond_features))
        else:
            self.self_3D_layer.build((None, max_atoms, self.conv_width))

        for degree in range(max_degree):
            self.idx = degree
            # Initialise inner layer, and rename it
            inner_layer = self.create_inner_layer_fn()
            inner_layer_type = inner_layer.__class__.__name__.lower()
            inner_layer.name = (self.name + '_inner_' + inner_layer_type +
                                '_' + str(degree))

            # Initialise TimeDistributed layer wrapper in order to parallelise
            # dense layer across atoms (3D)
            inner_3D_layer_name = (self.name + '_inner_timedistributed_' +
                                   str(degree))
            inner_3D_layer = TimeDistributed(inner_layer,
                                             name=inner_3D_layer_name)

            # Build the TimeDistributed layer (which will build the Dense layer)
            if self.encode_only:
                inner_3D_layer.build(
                    (None, max_atoms, num_atom_features + num_bond_features))
            else:
                inner_3D_layer.build((None, max_atoms, self.conv_width))

            # Store the inner_3D_layer and its weights
            self.inner_3D_layers.append(inner_3D_layer)
            self.all_layers.append(inner_3D_layer)
            if self.tied_to is not None:
                self.non_trainable_weights.append(inner_3D_layer.layer.kernel)
                if self.bias:
                    self.trainable_weights.append(inner_3D_layer.layer.bias)
            else:
                self.trainable_weights += inner_3D_layer.trainable_weights

        if self.tied_to is not None:
            self.trainable_weights.append(self.self_3D_layer.layer.bias)
            self.non_trainable_weights.append(self.self_3D_layer.layer.kernel)
        else:
            self.trainable_weights += self.self_3D_layer.trainable_weights
        self.all_layers.append(self_layer)

    def call(self, inputs, mask=None):
        atoms, bonds, edges = inputs
        if self.encode_only:
            return self.encode(inputs)
        elif self.decode_only:
            return self.decode(atoms, bonds, edges)
        else:
            return self.decode(self.encode(inputs), bonds, edges)

    def encode(self, inputs):
        atoms, bonds, edges = inputs

        # Import dimensions
        max_atoms = atoms._keras_shape[1]
        num_atom_features = atoms._keras_shape[-1]
        num_bond_features = bonds._keras_shape[-1]
        max_degree = 5

        # Look up the neighbours, sum the edge features and create vni
        summed_features, atom_degrees = self.mask_atoms_by_degree(
            atoms, edges, bonds)
        new_features_by_degree = self.create_layer_by_deg(
            max_degree, atom_degrees,
            (max_atoms, num_atom_features, num_bond_features),
            summed_features)
        zni = add(new_features_by_degree)

        summed_bonds = K.sum(bonds, axis=-2)
        vxi = K.concatenate([atoms, summed_bonds], axis=-1)
        zxi = self.self_3D_layer(vxi)

        vxi_plus_one = keras.layers.add([zni, zxi])
        return vxi_plus_one

    def decode(self, vxi_plus_one, bonds, edges):
        atoms = vxi_plus_one

        # Import dimensions
        max_atoms = atoms.shape[1]
        num_atom_features = atoms.shape[-1]
        num_bond_features = bonds._keras_shape[-1]
        max_degree = 5

        _, atom_degrees = self.mask_atoms_by_degree(atoms, edges, bonds=None)
        td_denses_by_degree = self.create_layer_by_deg(
            max_degree, atom_degrees,
            [max_atoms, num_atom_features, num_bond_features], vxi_plus_one)
        vni_dot = keras.layers.add(td_denses_by_degree)
        vxi_dot = self.self_3D_layer(vxi_plus_one)
        return [vni_dot, vxi_dot]

    def mask_atoms_by_degree(self, atoms, edges, bonds=None):
        # Create a matrix that stores, for each atom, its degree
        atom_degrees = K.sum(K.cast(K.not_equal(edges, -1), dtype='float32'),
                             axis=-1, keepdims=True)

        # For each atom, look up the features of its neighbours
        neighbour_atom_features = neighbour_lookup(atoms, edges,
                                                   include_self=False)

        # Sum along the degree axis to get summed neighbour features
        summed_atom_features = K.sum(neighbour_atom_features, axis=-2)

        # Sum the edge features for each atom and concatenate the summed atom
        # and bond features
        if bonds is not None:
            summed_bond_features = K.sum(bonds, axis=-2)
            summed_features = K.concatenate(
                [summed_atom_features, summed_bond_features], axis=-1)
        else:
            summed_features = summed_atom_features
        return summed_features, atom_degrees

    def create_layer_by_deg(self, max_deg, atom_degrees, inputs,
                            summed_features):
        # For each degree we convolve with a different weight matrix
        [max_atoms, num_atom_features, num_bond_features] = inputs
        new_features_by_degree = []
        for degree in range(max_deg):
            # Create mask for this degree
            atom_masks_this_degree = K.cast(K.equal(atom_degrees, degree),
                                            K.floatx())

            # Multiply with the hidden merge layer (use TimeDistributed
            # because we are dealing with 2D input / 3D including batches).
            # Add _keras_shape to let keras know the dimensions.
            if self.encode_only:
                summed_features._keras_shape = (
                    None, max_atoms, num_atom_features + num_bond_features)
            else:
                summed_features._keras_shape = (None, max_atoms,
                                                self.conv_width)
            new_unmasked_features = self.inner_3D_layers[degree](
                summed_features)

            # Do explicit masking because TimeDistributed does not support
            # masking
            new_masked_features = (new_unmasked_features *
                                   atom_masks_this_degree)
            new_features_by_degree.append(new_masked_features)
        return new_features_by_degree

    def compute_output_shape(self, inputs_shape):
        # Import dimensions
        inputs_shape[0] = (None, int(inputs_shape[0][1]), inputs_shape[0][2])
        (max_atoms, _, _, _,
         num_samples) = mol_shapes_to_dims(mol_shapes=inputs_shape)
        if self.encode_only:
            return (num_samples, max_atoms, self.conv_width)
        else:
            return [(num_samples, max_atoms, self.original_atom_bond_features),
                    (num_samples, max_atoms, self.original_atom_bond_features)]
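# --- Usage sketch (added for illustration; not part of the original source) ---
# How the tied convolutional autoencoder could be used, assuming the same
# (atoms, bonds, edges) graph tensors as above. With encode_only=True the
# layer returns the hidden atom representation; with decode_only=True it
# returns the [vni_dot, vxi_dot] reconstruction pair. What `tied_to` expects
# is determined by DenseTied elsewhere in this repository, so this is
# illustrative only:
#
#   conv_encoder = TiedGraphAutoencoder(conv_width, activ='relu', bias=True,
#                                       init='glorot_uniform',
#                                       encode_only=True)
#   hidden = conv_encoder([atoms, bonds, edges])
#
#   conv_decoder = TiedGraphAutoencoder(
#       conv_width, activ='relu', bias=True, init='glorot_uniform',
#       original_atom_bond_features=num_atom_features + num_bond_features,
#       tied_to=conv_encoder, decode_only=True)
#   vni_dot, vxi_dot = conv_decoder([hidden, bonds, edges])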
def __init__(self,
             p=None,
             h=None,
             include_word_vectors=True,
             word_embedding_weights=None,
             train_word_embeddings=True,
             include_chars=True,
             chars_per_word=16,
             char_embedding_size=8,
             char_conv_filters=100,
             char_conv_kernel_size=5,
             include_syntactical_features=True,
             syntactical_feature_size=50,
             include_exact_match=True,
             dropout_initial_keep_rate=1.,
             dropout_decay_rate=0.977,
             dropout_decay_interval=10000,
             first_scale_down_ratio=0.3,
             transition_scale_down_ratio=0.5,
             growth_rate=20,
             layers_per_dense_block=8,
             nb_dense_blocks=3,
             nb_labels=3,
             inputs=None,
             outputs=None,
             name='DIIN'):
    """
    :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-
    :param p: sequence length of premise
    :param h: sequence length of hypothesis
    :param include_word_vectors: whether or not to include word vectors in the model
    :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
    :param train_word_embeddings: whether or not to modify word embeddings while training
    :param include_chars: whether or not to include character embeddings in the model
    :param chars_per_word: how many chars there are per word (a fixed number)
    :param char_embedding_size: output size of the character-embedding layer
    :param char_conv_filters: number of conv filters applied on character embeddings
    :param char_conv_kernel_size: size of the kernel applied on character embeddings
    :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
    :param syntactical_feature_size: size of the syntactical feature vector for each word
    :param include_exact_match: whether or not to include exact-match features in the model
    :param dropout_initial_keep_rate: initial keep rate of dropout
    :param dropout_decay_rate: how much to change dropout at each interval
    :param dropout_decay_interval: how many steps to wait between dropout updates
    :param first_scale_down_ratio: first scale-down ratio in DenseNet
    :param transition_scale_down_ratio: transition scale-down ratio in DenseNet
    :param growth_rate: growth rate in DenseNet
    :param layers_per_dense_block: number of layers in one dense block
    :param nb_dense_blocks: number of dense blocks in DenseNet
    :param nb_labels: number of labels (3 by default: entailment, contradiction, neutral)
    """
    if inputs or outputs:
        super(DIIN, self).__init__(inputs=inputs, outputs=outputs, name=name)
        return

    if include_word_vectors:
        assert word_embedding_weights is not None

    inputs = []
    premise_embeddings = []
    hypothesis_embeddings = []

    '''Embedding layer'''
    # 1. Word embedding input
    if include_word_vectors:
        premise_word_input = Input(shape=(p, ), dtype='int64',
                                   name='PremiseWordInput')
        hypothesis_word_input = Input(shape=(h, ), dtype='int64',
                                      name='HypothesisWordInput')
        inputs.append(premise_word_input)
        inputs.append(hypothesis_word_input)

        word_embedding = Embedding(
            input_dim=word_embedding_weights.shape[0],
            output_dim=word_embedding_weights.shape[1],
            weights=[word_embedding_weights],
            trainable=train_word_embeddings,
            name='WordEmbedding')
        premise_word_embedding = word_embedding(premise_word_input)
        hypothesis_word_embedding = word_embedding(hypothesis_word_input)

        premise_word_embedding = DecayingDropout(
            initial_keep_rate=dropout_initial_keep_rate,
            decay_interval=dropout_decay_interval,
            decay_rate=dropout_decay_rate,
            name='PremiseWordEmbeddingDropout')(premise_word_embedding)
        hypothesis_word_embedding = DecayingDropout(
            initial_keep_rate=dropout_initial_keep_rate,
            decay_interval=dropout_decay_interval,
            decay_rate=dropout_decay_rate,
            name='HypothesisWordEmbeddingDropout')(hypothesis_word_embedding)
        premise_embeddings.append(premise_word_embedding)
        hypothesis_embeddings.append(hypothesis_word_embedding)

    # 2. Character input
    if include_chars:
        premise_char_input = Input(shape=(p, chars_per_word),
                                   name='PremiseCharInput')
        hypothesis_char_input = Input(shape=(h, chars_per_word),
                                      name='HypothesisCharInput')
        inputs.append(premise_char_input)
        inputs.append(hypothesis_char_input)

        # Share weights of character-level embedding for premise and hypothesis
        character_embedding_layer = TimeDistributed(
            Sequential([
                Embedding(input_dim=100,
                          output_dim=char_embedding_size,
                          input_length=chars_per_word),
                Conv1D(filters=char_conv_filters,
                       kernel_size=char_conv_kernel_size),
                GlobalMaxPooling1D()
            ]),
            name='CharEmbedding')
        character_embedding_layer.build(input_shape=(None, None,
                                                     chars_per_word))
        premise_char_embedding = character_embedding_layer(premise_char_input)
        hypothesis_char_embedding = character_embedding_layer(
            hypothesis_char_input)
        premise_embeddings.append(premise_char_embedding)
        hypothesis_embeddings.append(hypothesis_char_embedding)

    # 3. Syntactical features
    if include_syntactical_features:
        premise_syntactical_input = Input(
            shape=(p, syntactical_feature_size),
            name='PremiseSyntacticalInput')
        hypothesis_syntactical_input = Input(
            shape=(h, syntactical_feature_size),
            name='HypothesisSyntacticalInput')
        inputs.append(premise_syntactical_input)
        inputs.append(hypothesis_syntactical_input)
        premise_embeddings.append(premise_syntactical_input)
        hypothesis_embeddings.append(hypothesis_syntactical_input)

    # 4. One-hot exact match feature
    if include_exact_match:
        premise_exact_match_input = Input(shape=(p, ),
                                          name='PremiseExactMatchInput')
        hypothesis_exact_match_input = Input(shape=(h, ),
                                             name='HypothesisExactMatchInput')
        premise_exact_match = Reshape(
            target_shape=(p, 1))(premise_exact_match_input)
        hypothesis_exact_match = Reshape(
            target_shape=(h, 1))(hypothesis_exact_match_input)
        inputs.append(premise_exact_match_input)
        inputs.append(hypothesis_exact_match_input)
        premise_embeddings.append(premise_exact_match)
        hypothesis_embeddings.append(hypothesis_exact_match)

    # Concatenate all features
    premise_embedding = Concatenate(
        name='PremiseEmbedding')(premise_embeddings)
    hypothesis_embedding = Concatenate(
        name='HypothesisEmbedding')(hypothesis_embeddings)
    d = K.int_shape(hypothesis_embedding)[-1]

    '''Encoding layer'''
    # Now we have the embedded premise [p x d] along with the embedded
    # hypothesis [h x d]
    premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
    hypothesis_encoding = Encoding(
        name='HypothesisEncoding')(hypothesis_embedding)

    '''Interaction layer'''
    interaction = Interaction(name='Interaction')(
        [premise_encoding, hypothesis_encoding])

    '''Feature Extraction layer'''
    feature_extractor_input = Conv2D(
        filters=int(d * first_scale_down_ratio),
        kernel_size=1,
        activation=None,
        name='FirstScaleDown')(interaction)
    feature_extractor = DenseNet(
        include_top=False,
        input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
        nb_dense_block=nb_dense_blocks,
        nb_layers_per_block=layers_per_dense_block,
        compression=transition_scale_down_ratio,
        growth_rate=growth_rate)(feature_extractor_input)

    '''Output layer'''
    features = DecayingDropout(
        initial_keep_rate=dropout_initial_keep_rate,
        decay_interval=dropout_decay_interval,
        decay_rate=dropout_decay_rate,
        name='Features')(feature_extractor)
    out = Dense(units=nb_labels, activation='softmax', name='Output')(features)

    super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)
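# --- Usage sketch (added for illustration; not part of the original source) ---
# Instantiating the DIIN model with pre-trained word vectors. The variable
# `embedding_matrix` (an ndarray of shape [vocab_size, embedding_dim]) and the
# sequence lengths are placeholders:
#
#   model = DIIN(p=32, h=32,
#                word_embedding_weights=embedding_matrix,
#                include_chars=True, chars_per_word=16,
#                nb_labels=3)
#   model.compile(optimizer='adam',
#                 loss='categorical_crossentropy',
#                 metrics=['accuracy'])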
def build_model(cfg, summary=False, word_embedding_matrix=None):

    def _get_model(base_dir, cfg_=None):
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
        if not os.path.exists(config_file):
            config_file = os.path.join(base_dir, 'bert_config_large.json')
            checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')
        print(config_file, checkpoint_file)
        # model = load_trained_model_from_checkpoint(config_file, checkpoint_file,
        #                                            training=True,
        #                                            seq_len=cfg_['maxlen'])
        model = load_trained_model_from_checkpoint(
            config_file,
            checkpoint_file,
            training=False,
            trainable=cfg_["bert_trainable"],
            output_layer_num=cfg["cls_num"],
            seq_len=cfg_['maxlen'])
        return model

    def get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
        if cfg["opt"].lower() == "nadam":
            opt = Nadam(lr=lr)
        else:
            total_steps, warmup_steps = calc_train_steps(
                num_example=num_example,
                batch_size=B_SIZE,
                epochs=MAX_EPOCH,
                warmup_proportion=warmup_proportion,
            )
            opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)
        return opt

    model1 = _get_model(cfg["base_dir"], cfg)
    # model1 = Model(inputs=model1.inputs[:2], outputs=model1.layers[-7].output)
    model1 = Model(inputs=model1.inputs[:2], outputs=model1.layers[-7].output)

    if word_embedding_matrix is not None:
        embed_layer = Embedding(input_dim=word_embedding_matrix.shape[0],
                                output_dim=word_embedding_matrix.shape[1],
                                weights=[word_embedding_matrix],
                                trainable=cfg["trainable"],
                                name="embed_layer")

    inp_token1 = Input(shape=(None, ), dtype=np.int32,
                       name="query_token_input")
    inp_segm1 = Input(shape=(None, ), dtype=np.float32,
                      name="query_segm_input")
    # inp_token2 = Input(shape=(None, ), dtype=np.int32)
    # inp_segm2 = Input(shape=(None, ), dtype=np.float32)
    inp_image = Input(shape=(None, 2048), dtype=np.float32,
                      name="image_input")
    inp_image_mask = Input(shape=(None, ), dtype=np.float32,
                           name="image_mask_input")
    inp_pos = Input(shape=(None, 5), dtype=np.float32, name="image_pos_input")
    inp_image_char = Input(shape=(None, cfg["max_char"]), dtype=np.int32,
                           name='image_char_input')

    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'),
                  name="token_mask")(inp_token1)
    word_embed = embed_layer(inp_token1)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [word_embed, mask])
    word_embed = Bidirectional(LSTM(cfg["unit1_1"], return_sequences=True),
                               merge_mode="sum")(word_embed)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [word_embed, mask])

    sequence_output = model1([inp_token1, inp_segm1])
    sequence_output = Concatenate(axis=-1)([sequence_output, word_embed])
    text_pool = Lambda(lambda x: x[:, 0, :])(sequence_output)

    # Share weights of character-level embedding for premise and hypothesis
    character_embedding_layer = TimeDistributed(
        Sequential([
            embed_layer,
            # Embedding(input_dim=100, output_dim=char_embedding_size,
            #           input_length=chars_per_word),
            Conv1D(filters=128, kernel_size=3, name="char_embed_conv1d"),
            GlobalMaxPooling1D()
        ]),
        name='CharEmbedding')
    character_embedding_layer.build(input_shape=(None, None, cfg["max_char"]))
    image_char_embed = character_embedding_layer(inp_image_char)

    image_embed = Concatenate(axis=-1)([image_char_embed, inp_image])
    image_embed = Dense(512, activation='relu',
                        name='image_embed')(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [image_embed, inp_image_mask])

    pos_embed = Dense(512, activation='relu', name='pos_embed')(inp_pos)
    pos_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [pos_embed, inp_image_mask])

    embed = Add()([image_embed, pos_embed])  # batch, maxlen(10), 1024+128
    image_embed = Bidirectional(LSTM(1152, return_sequences=True),
                                merge_mode="sum")(embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [image_embed, inp_image_mask])
    image_pool = Lambda(lambda x: x[:, 0, :])(image_embed)

    pool = Concatenate(axis=-1)([image_pool, text_pool])
    pool = Dense(2048, activation="relu")(pool)
    pool = Dense(512, activation="relu")(pool)
    pool = Dense(128, activation="relu")(pool)
    output = Dense(2, activation='softmax', name='output')(pool)

    opt = get_opt(num_example=cfg["num_example"], lr=cfg["lr"],
                  min_lr=cfg['min_lr'])
    model = Model(inputs=[
        inp_token1, inp_segm1, inp_image, inp_image_mask, inp_pos,
        inp_image_char
    ], outputs=[output])
    # model.compile(optimizer=opt,
    #               loss={'output': 'sparse_categorical_crossentropy'},
    #               metrics=['accuracy'])
    if summary:
        model.summary()
    return model
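# --- Usage sketch (added for illustration; not part of the original source) ---
# The cfg keys below mirror the ones read inside build_model; all values are
# placeholders, and B_SIZE / MAX_EPOCH are expected as module-level globals:
#
#   cfg = {
#       "base_dir": "/path/to/bert_checkpoint",
#       "bert_trainable": True, "cls_num": 1, "maxlen": 128,
#       "opt": "nadam", "lr": 2e-5, "min_lr": 1e-6,
#       "trainable": True, "x_pad": 0, "max_char": 8,
#       "unit1_1": 128, "num_example": 100000,
#   }
#   model = build_model(cfg, summary=True,
#                       word_embedding_matrix=word_embedding_matrix)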
def __init__(self,
             p=None,
             h=None,
             use_word_embedding=True,
             word_embedding_weights=None,
             train_word_embeddings=False,
             dropout_init_keep_rate=1.0,
             dropout_decay_interval=10000,
             dropout_decay_rate=0.977,
             use_chars=False,
             chars_per_word=16,
             char_input_dim=100,
             char_embedding_size=8,
             char_conv_filters=100,
             char_conv_kernel_size=5,
             use_syntactical_features=False,
             syntactical_feature_size=50,
             use_exact_match=False,
             first_scale_down_ratio=0.3,
             nb_dense_blocks=3,
             layers_per_dense_block=8,
             nb_labels=3,
             growth_rate=20,
             transition_scale_down_ratio=0.5,
             inputs=None,
             outputs=None,
             name="DIIN"):
    """Densely Interactive Inference Network (DIIN) model from the paper
    `Natural Language Inference over Interaction Space`
    (https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-)

    :param p: sequence length of premise
    :param h: sequence length of hypothesis
    :param use_word_embedding: whether or not to include word vectors in the model
    :param use_chars: whether or not to include character embeddings in the model
    :param use_syntactical_features: whether or not to include syntactical features (POS tags) in the model
    :param use_exact_match: whether or not to include exact-match features in the model
    :param word_embedding_weights: matrix of weights for word embeddings (pre-trained vectors)
    :param train_word_embeddings: whether or not to modify word embeddings while training
    :param dropout_init_keep_rate: initial keep rate of dropout
    :param dropout_decay_interval: number of steps to wait before the next update (steps are single batches, not epochs)
    :param dropout_decay_rate: how much to change dropout at each interval
    :param chars_per_word: how many chars there are per word
    :param char_input_dim: number of unique characters
    :param char_embedding_size: output size of the character-embedding layer
    :param char_conv_filters: number of filters of the kernel applied on character embeddings
    :param char_conv_kernel_size: size of the kernel applied on character embeddings
    :param syntactical_feature_size: size of the syntactical feature vector for each word
    :param first_scale_down_ratio: scale-down ratio of the feature map used as input to the first DenseNet block
    :param nb_dense_blocks: number of dense blocks in DenseNet
    :param layers_per_dense_block: number of layers in one dense block
    :param nb_labels: number of labels
    :param growth_rate: growth rate in DenseNet
    :param transition_scale_down_ratio: transition scale-down ratio in DenseNet
    :param inputs: inputs of keras models
    :param outputs: outputs of keras models
    :param name: name of the model
    """
    if inputs or outputs:
        super(DIINModel, self).__init__(inputs=inputs, outputs=outputs,
                                        name=name)
        return

    if use_word_embedding:
        assert word_embedding_weights is not None, \
            "Word embedding weights are needed"

    inputs = []
    premise_features = []
    hypothesis_features = []

    """Embedding layer"""
    # Input: word embedding
    if use_word_embedding:
        premise_word_input = Input(shape=(p, ), dtype="int64",
                                   name="premise_word_input")
        hypothesis_word_input = Input(shape=(h, ), dtype="int64",
                                      name="hypothesis_word_input")
        inputs.append(premise_word_input)
        inputs.append(hypothesis_word_input)

        word_embedding = Embedding(
            input_dim=word_embedding_weights.shape[0],
            output_dim=word_embedding_weights.shape[1],
            weights=[word_embedding_weights],
            trainable=train_word_embeddings,
            name="word_embedding")
        premise_word_embedding = word_embedding(premise_word_input)
        hypothesis_word_embedding = word_embedding(hypothesis_word_input)

        premise_word_embedding = DecayingDropout(
            init_keep_rate=dropout_init_keep_rate,
            decay_interval=dropout_decay_interval,
            decay_rate=dropout_decay_rate,
            name="premise_word_dropout")(premise_word_embedding)
        hypothesis_word_embedding = DecayingDropout(
            init_keep_rate=dropout_init_keep_rate,
            decay_interval=dropout_decay_interval,
            decay_rate=dropout_decay_rate,
            name="hypothesis_word_dropout")(hypothesis_word_embedding)
        premise_features.append(premise_word_embedding)
        hypothesis_features.append(hypothesis_word_embedding)

    # Input: character embedding
    if use_chars:
        premise_char_input = Input(shape=(p, chars_per_word), dtype="int64",
                                   name="premise_char_input")
        hypothesis_char_input = Input(shape=(h, chars_per_word),
                                      dtype="int64",
                                      name="hypothesis_char_input")
        inputs.append(premise_char_input)
        inputs.append(hypothesis_char_input)

        # Share weights of character-level embedding for premise and hypothesis
        character_embedding = TimeDistributed(
            Sequential([
                Embedding(input_dim=char_input_dim,
                          output_dim=char_embedding_size,
                          input_length=chars_per_word),
                Conv1D(filters=char_conv_filters,
                       kernel_size=char_conv_kernel_size),
                GlobalMaxPooling1D(),
            ]),
            name="char_embedding")
        character_embedding.build(
            input_shape=(None, None, chars_per_word))  # Set input shape
        premise_char_embedding = character_embedding(premise_char_input)
        hypothesis_char_embedding = character_embedding(hypothesis_char_input)
        premise_features.append(premise_char_embedding)
        hypothesis_features.append(hypothesis_char_embedding)

    # Input: syntactical features
    if use_syntactical_features:
        premise_syntactical_input = Input(
            shape=(p, syntactical_feature_size),
            name="premise_syntactical_input")
        hypothesis_syntactical_input = Input(
            shape=(h, syntactical_feature_size),
            name="hypothesis_syntactical_input")
        inputs.append(premise_syntactical_input)
        inputs.append(hypothesis_syntactical_input)
        premise_features.append(premise_syntactical_input)
        hypothesis_features.append(hypothesis_syntactical_input)

    # Input: one-hot exact match feature
    if use_exact_match:
        premise_exact_match_input = Input(shape=(p, ),
                                          name='premise_exact_match_input')
        hypothesis_exact_match_input = Input(
            shape=(h, ), name='hypothesis_exact_match_input')
        inputs.append(premise_exact_match_input)
        inputs.append(hypothesis_exact_match_input)
        premise_exact_match = Reshape(
            target_shape=(p, 1))(premise_exact_match_input)
        hypothesis_exact_match = Reshape(
            target_shape=(h, 1))(hypothesis_exact_match_input)
        premise_features.append(premise_exact_match)
        hypothesis_features.append(hypothesis_exact_match)

    # Concatenate all features
    if len(premise_features) > 1:
        premise_embedding = Concatenate()(premise_features)
        hypothesis_embedding = Concatenate()(hypothesis_features)
    else:
        premise_embedding = premise_features[0]
        hypothesis_embedding = hypothesis_features[0]
    d = K.int_shape(premise_embedding)[-1]

    """Encoding layer"""
    premise_encoding = Encoding(name="premise_encoding")(premise_embedding)
    hypothesis_encoding = Encoding(
        name="hypothesis_encoding")(hypothesis_embedding)

    """Interaction layer"""
    interaction = Interaction(name="interaction")(
        [premise_encoding, hypothesis_encoding])

    """Feature extraction layer"""
    feature_extractor_input = Conv2D(
        filters=int(d * first_scale_down_ratio),
        kernel_size=1,
        activation=None,
        name="bottleneck")(interaction)  # Bottleneck layer
    feature_extractor = DenseNet(
        input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
        include_top=False,
        nb_dense_block=nb_dense_blocks,
        nb_layers_per_block=layers_per_dense_block,
        growth_rate=growth_rate,
        compression=transition_scale_down_ratio)(feature_extractor_input)

    """Output layer"""
    features = DecayingDropout(
        init_keep_rate=dropout_init_keep_rate,
        decay_interval=dropout_decay_interval,
        decay_rate=dropout_decay_rate,
        name="features")(feature_extractor)
    if nb_labels == 2:
        out = Dense(1, activation="sigmoid", name="output")(features)
    else:
        out = Dense(nb_labels, activation="softmax", name="output")(features)

    super(DIINModel, self).__init__(inputs=inputs, outputs=out, name=name)
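# --- Usage sketch (added for illustration; not part of the original source) ---
# Instantiating DIINModel with only word embeddings enabled (the default
# feature flags); `embedding_matrix` is a placeholder for pre-trained vectors:
#
#   model = DIINModel(p=32, h=32,
#                     use_word_embedding=True,
#                     word_embedding_weights=embedding_matrix,
#                     nb_labels=3)
#   model.compile(optimizer='adam',
#                 loss='categorical_crossentropy',
#                 metrics=['accuracy'])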