# Imports needed by the snippets below. Module-level constants (SHUFFLE_TRAINING_BATCHES,
# RANDOM_SEED) and project-local helpers (get_training_batch, repeat_vector,
# softmax_with_temperature) are assumed to be defined elsewhere in the repo.
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Concatenate, Dense, Dropout, Input, Lambda, Reshape
from keras.models import Model


def reweight(y_true, y_pred, tp_weight=0.3, tn_weight=0.3, fp_weight=4, fn_weight=0.7):
    """Binary cross-entropy with a separate weight for each confusion-matrix cell."""
    # Get hard class predictions from the sigmoid outputs
    y_pred_classes = K.greater_equal(y_pred, 0.5)
    y_pred_classes_float = K.cast(y_pred_classes, K.floatx())

    # Get misclassified examples
    wrongly_classified = K.not_equal(y_true, y_pred_classes_float)
    wrongly_classified_float = K.cast(wrongly_classified, K.floatx())

    # Get correctly classified examples
    correctly_classified = K.equal(y_true, y_pred_classes_float)
    correctly_classified_float = K.cast(correctly_classified, K.floatx())

    # Get tp, tn, fp, fn masks.
    # fp: a negative example (y_true == 0) predicted as positive;
    # fn: a positive example (y_true == 1) predicted as negative.
    tp = correctly_classified_float * y_true
    tn = correctly_classified_float * (1 - y_true)
    fp = wrongly_classified_float * (1 - y_true)
    fn = wrongly_classified_float * y_true

    # Build per-example weights and apply them to the element-wise loss
    weight_tensor = tp_weight * tp + fp_weight * fp + tn_weight * tn + fn_weight * fn
    loss = K.binary_crossentropy(y_true, y_pred)
    weighted_loss = loss * weight_tensor
    return weighted_loss
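# A minimal sketch of plugging the loss above into a Keras model. The model
# architecture here is an illustrative assumption, not part of the source;
# Keras accepts any callable with the (y_true, y_pred) signature as a loss.
from keras.models import Sequential

clf = Sequential([
    Dense(64, activation='relu', input_shape=(20, )),
    Dense(1, activation='sigmoid'),
])
clf.compile(optimizer='adam', loss=reweight, metrics=['accuracy'])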
def call(self, x, mask=None):
    uit = dot_product(x, self.W)

    if self.bias:
        uit += self.b

    uit = K.tanh(uit)

    ait = dot_product(uit, self.u)
    # ait = K.dot(uit, self.u)

    a = K.exp(ait)

    # Apply the mask after the exp; the weights will be re-normalized next
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in Theano
        a *= K.cast(mask, K.floatx())

    # In some cases, especially in the early stages of training, the sum may be
    # almost zero, which results in NaNs. A workaround is to add a very small
    # positive number ε to the sum.
    # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a = K.expand_dims(a)
    weighted_input = x * a
    return K.sum(weighted_input, axis=1)
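# The dot_product helper used above is not defined in this snippet. A common
# backend-agnostic implementation (an assumption here, matching the widely
# circulated Keras attention-layer gists) looks like this:
def dot_product(x, kernel):
    """Dot product between a 3D tensor and a 1D kernel that behaves the same
    under the TensorFlow and Theano backends."""
    if K.backend() == 'tensorflow':
        # (batch, steps, dim) . (dim, 1) -> (batch, steps, 1) -> (batch, steps)
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    return K.dot(x, kernel)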
def predict_prob(self, context_tokens_ids, response_tokens_ids, condition_id, temperature=1.0):
    """
    :param context_tokens_ids: shape == (batch_size, context_size, seq_len), int32
    :param response_tokens_ids: shape == (batch_size, seq_len), int32
    :param condition_id: shape == (batch_size, 1), int32
    :param temperature: float32
    :return: tokens_probs: shape == (batch_size, seq_len - 1, vocab_size), float32
    """
    # Remove the last token but keep the first one to match the shape of the
    # seq2seq decoder's input
    response_tokens_ids = response_tokens_ids[:, :-1]
    # shape == (batch_size, seq_len - 1)

    init_dec_hs = np.zeros(
        shape=(context_tokens_ids.shape[0], self._decoder_depth, self._params.hidden_layer_dim),
        dtype=K.floatx())
    # shape == (batch_size, decoder_depth, hidden_layer_dim)

    temperature = np.full_like(condition_id, temperature, dtype=np.float32)
    # shape == (batch_size, 1)

    tokens_probs = self._models['seq2seq'].predict(
        [context_tokens_ids, response_tokens_ids, condition_id, init_dec_hs, temperature])
    # shape == (batch_size, seq_len - 1, vocab_size)

    return tokens_probs
def call(self, x, mask=None):
    # x: (batch_size, seq_len, attention_dim)
    # self.u: learned context vector, shape (attention_dim, 1)
    # uit = tanh(xW + b)
    uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
    uit = tf.matmul(x, uit)
    uit = K.tanh(K.bias_add(uit, self.b))

    ait = K.dot(uit, self.u)
    ait = K.squeeze(ait, -1)
    ait = K.exp(ait)

    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in Theano
        ait *= K.cast(mask, K.floatx())

    # Add epsilon so a near-zero sum of weights doesn't produce NaNs
    ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    ait = K.expand_dims(ait)
    weighted_input = x * ait
    output = K.sum(weighted_input, axis=1)
    return output
def call(self, x, mask=None):
    features_dim = self.features_dim
    step_dim = self.step_dim

    # eij = x . W, computed as a 2D matmul over the flattened batch/step axes
    eij = K.reshape(
        K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))),
        (-1, step_dim))

    if self.bias:
        eij += self.b

    eij = K.tanh(eij)
    a = K.exp(eij)

    # Apply the mask after the exp; the weights are re-normalized next
    if mask is not None:
        a *= K.cast(mask, K.floatx())

    # Add epsilon so a near-zero sum of weights doesn't produce NaNs
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a = K.expand_dims(a)
    weighted_input = x * a
    return K.sum(weighted_input, axis=1)
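# A hypothetical usage sketch for an attention layer like the ones above,
# assuming the full Layer subclass (here called Attention, with a build() that
# creates W, b and u, and a step_dim constructor argument) exists in the repo.
# Names and sizes are illustrative assumptions:
from keras.layers import Bidirectional, Embedding, LSTM

max_len, vocab_size = 100, 20000
att_inputs = Input(shape=(max_len, ))
emb = Embedding(vocab_size, 128)(att_inputs)
rnn_out = Bidirectional(LSTM(64, return_sequences=True))(emb)  # (batch, steps, 128)
att_out = Attention(step_dim=max_len)(rnn_out)  # collapses the steps axis -> (batch, 128)
att_model = Model(att_inputs, Dense(1, activation='sigmoid')(att_out))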
def _get_training_batch_generator(self):
    # Set a unique random seed per worker so batches are processed correctly
    # in multi-GPU training
    horovod_seed = self._horovod.rank() if self._horovod else 0
    epoch_id = 0

    while True:  # infinite batches generator
        epoch_id += 1

        for train_batch in get_training_batch(
                self._training_data,
                self._params.train_batch_size,
                random_permute=SHUFFLE_TRAINING_BATCHES,
                random_seed=RANDOM_SEED * epoch_id + horovod_seed):
            context_tokens_ids, response_tokens_ids, condition_id = train_batch
            # Response tokens are wrapped with _start_ and _end_ tokens;
            # shape == (batch_size, seq_len)

            # Get input response ids by removing the last sequence token (_end_)
            input_response_tokens_ids = response_tokens_ids[:, :-1]
            # output shape == (batch_size, seq_len - 1)

            # Get target response ids by removing the first (_start_) token of the sequence
            target_response_tokens_ids = response_tokens_ids[:, 1:]
            # output shape == (batch_size, seq_len - 1)

            # Workaround for using the sparse_categorical_crossentropy loss, see
            # https://github.com/tensorflow/tensorflow/issues/17150#issuecomment-399776510
            target_response_tokens_ids = np.expand_dims(target_response_tokens_ids, axis=-1)
            # output shape == (batch_size, seq_len - 1, 1)

            init_dec_hs = np.zeros(
                shape=(context_tokens_ids.shape[0], self._decoder_depth,
                       self._params.hidden_layer_dim),
                dtype=K.floatx())

            yield ([context_tokens_ids, input_response_tokens_ids, condition_id, init_dec_hs],
                   target_response_tokens_ids)
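# A minimal sketch of consuming the generator above for training. Everything
# except the generator itself and self._models['seq2seq'] (which appears
# elsewhere in these snippets) is a placeholder assumption:
def _train_sketch(self, batches_per_epoch, num_epochs):
    batch_generator = self._get_training_batch_generator()
    self._models['seq2seq'].fit_generator(
        batch_generator,
        steps_per_epoch=batches_per_epoch,  # assumed to be computed by the caller
        epochs=num_epochs)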
def _decoder(self, tokens_emb_model, condition_emb_model):
    self._logger.info('Building decoder...')

    thought_vector = Input(
        shape=(self._params.hidden_layer_dim, ), dtype=K.floatx(), name='dec_thought_vector')
    # output shape == (batch_size, hidden_layer_dim)

    response_tokens_ids = tokens_emb_model.inputs[0]
    # output shape == (batch_size, seq_len)

    condition_id = condition_emb_model.inputs[0]
    # output shape == (batch_size, 1)

    temperature = Input(shape=(1, ), dtype='float32', name='dec_temperature')
    # output shape == (batch_size, 1)

    # Hardcode the decoder's depth here: a general solution for any number of
    # stacked RNN layers' hidden states is too bulky and we don't need it, so
    # keep it simple, stupid
    self._decoder_depth = 2

    # Keep the inputs for the decoder's RNN hidden states globally accessible to all
    # model layers that use them; otherwise you may hit a Keras bug that affects
    # stateful RNN models. Related discussion:
    # https://github.com/keras-team/keras/issues/9385#issuecomment-365464721
    self._dec_hs_input = Input(
        shape=(self._decoder_depth, self._params.hidden_layer_dim),
        dtype=K.floatx(),
        name='dec_hs')
    # shape == (batch_size, decoder_depth, hidden_layer_dim)

    response_tokens_embeddings = tokens_emb_model(response_tokens_ids)
    # output shape == (batch_size, seq_len, token_emb_size)

    condition_embedding = condition_emb_model(condition_id)
    # output shape == (batch_size, cond_emb_size)

    conditioned_tv = Concatenate(name='conditioned_tv')([thought_vector, condition_embedding])
    # output shape == (batch_size, hidden_layer_dim + cond_emb_size)

    # Temporary solution: use a custom lambda function for repeating the vector and
    # set output_shape manually, otherwise the subsequent Concatenate layer won't work
    repeated_conditioned_tv = Lambda(
        function=repeat_vector,
        mask=lambda inputs, inputs_masks: inputs_masks[0],  # take the mask of the first input
        output_shape=(None, self._params.hidden_layer_dim + self._condition_embedding_dim),
        name='repeated_conditioned_tv')([conditioned_tv, response_tokens_ids])
    # output shape == (batch_size, seq_len, hidden_layer_dim + cond_emb_size)

    decoder_input = Concatenate(name='concat_emb_cond_tv')(
        [response_tokens_embeddings, repeated_conditioned_tv])
    # output shape == (batch_size, seq_len, token_emb_size + hidden_layer_dim + cond_emb_size)

    # Unpack hidden states to tensors
    dec_hs_0 = Lambda(
        function=lambda x: x[:, 0, :],
        output_shape=(self._params.hidden_layer_dim, ),
        name='dec_hs_0')(self._dec_hs_input)
    dec_hs_1 = Lambda(
        function=lambda x: x[:, 1, :],
        output_shape=(self._params.hidden_layer_dim, ),
        name='dec_hs_1')(self._dec_hs_input)

    outputs_seq_0, updated_hs_seq_0 = self._rnn_class(
        units=self._params.hidden_layer_dim,
        return_sequences=True,
        return_state=True,
        name='decoder_0')(decoder_input, initial_state=dec_hs_0)
    # outputs_seq_0 shape == (batch_size, seq_len, hidden_layer_dim)
    # updated_hs_seq_0 shape == (batch_size, hidden_layer_dim)

    outputs_seq_1, updated_hs_seq_1 = self._rnn_class(
        units=self._params.hidden_layer_dim,
        return_sequences=True,
        return_state=True,
        name='decoder_1')(outputs_seq_0, initial_state=dec_hs_1)
    # outputs_seq_1 shape == (batch_size, seq_len, hidden_layer_dim)
    # updated_hs_seq_1 shape == (batch_size, hidden_layer_dim)

    outputs_dropout = Dropout(rate=self._params.dense_dropout_ratio)(outputs_seq_1)
    # output shape == (batch_size, seq_len, hidden_layer_dim)

    tokens_logits = Dense(self._vocab_size)(outputs_dropout)
    # output shape == (batch_size, seq_len, vocab_size)

    tokens_probs = softmax_with_temperature(tokens_logits, temperature)
    # output shape == (batch_size, seq_len, vocab_size)

    # Pack the updated hidden states into one tensor
    updated_hs = Concatenate(axis=1, name='updated_hs')([
        Reshape((1, self._params.hidden_layer_dim))(updated_hs_seq_0),
        Reshape((1, self._params.hidden_layer_dim))(updated_hs_seq_1)
    ])
    # output shape == (batch_size, decoder_depth, hidden_layer_dim)

    decoder_training_model = Model(
        inputs=[thought_vector, response_tokens_ids, condition_id, self._dec_hs_input],
        outputs=[tokens_logits],
        name='decoder_training_model')

    decoder_model = Model(
        inputs=[thought_vector, response_tokens_ids, condition_id, self._dec_hs_input,
                temperature],
        outputs=[tokens_probs, updated_hs],
        name='decoder_model')

    return decoder_training_model, decoder_model
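# A hypothetical single-step decoding sketch using decoder_model: at each step,
# feed the last sampled token in and thread the updated hidden states back
# through. All variable and function names here are illustrative assumptions:
def decode_step(decoder_model, thought_vec, cur_token_id, cond_id, dec_hs, temperature_arr):
    # cur_token_id shape == (batch_size, 1); dec_hs matches the dec_hs input's shape
    tokens_probs, updated_hs = decoder_model.predict(
        [thought_vec, cur_token_id, cond_id, dec_hs, temperature_arr])
    # Greedy choice over the last position's distribution
    next_token_id = np.argmax(tokens_probs[:, -1, :], axis=-1)
    return next_token_id, updated_hs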