def mycrossentropy(y_true, y_pred, e=0.1):
    """Blend the regular cross-entropy with one against a uniform target.

    # Arguments
        y_true: target tensor.
        y_pred: prediction tensor.
        e: weight given to the uniform-target term; the regular term is
            weighted by `1 - e`.

    # Returns
        `(1 - e) * CE(y_true, y_pred) + e * CE(uniform, y_pred)`, where
        every entry of `uniform` equals `1 / nb_classes`.
    """
    target_term = K.categorical_crossentropy(y_true, y_pred)
    # `nb_classes` is not a parameter; it is looked up outside this function.
    uniform_target = K.ones_like(y_pred) / nb_classes
    uniform_term = K.categorical_crossentropy(uniform_target, y_pred)
    return (1 - e) * target_term + e * uniform_term
def get_constants(self, inputs, training=None):
    """Assemble the dropout masks and pass the raw inputs along.

    # Arguments
        inputs: input tensor. It is indexed as `inputs[:, 0, 0]`, so it
            must have at least three axes.
        training: forwarded to `K.in_train_phase`.

    # Returns
        A list of three items: four input-dropout masks, four
        recurrent-dropout masks, and `inputs` itself. When a dropout is
        not active, its four masks are the scalar `1.` instead.
    """
    def build_masks(width, rate):
        # One column of ones per sample, tiled out to `width` columns.
        base = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
        base = K.tile(base, (1, width))

        def dropped_inputs():
            return K.dropout(base, rate)

        return [
            K.in_train_phase(dropped_inputs, base, training=training)
            for _ in range(4)
        ]

    def unit_masks():
        return [K.cast_to_floatx(1.) for _ in range(4)]

    # Input dropout is additionally skipped when implementation == 0.
    if self.implementation != 0 and 0 < self.dropout < 1:
        input_dim = K.int_shape(inputs)[-1]
        input_masks = build_masks(int(input_dim), self.dropout)
    else:
        input_masks = unit_masks()

    if 0 < self.recurrent_dropout < 1:
        recurrent_masks = build_masks(self.units, self.recurrent_dropout)
    else:
        recurrent_masks = unit_masks()

    # The inputs are appended as the last constant for use later.
    return [input_masks, recurrent_masks, inputs]
def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.

    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: optional dropout rate; dropout is applied only when the
            value lies strictly between 0 and 1, with the same mask used
            for every temporal slice of the input.
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.

    # Returns
        Output tensor.
    """
    # Anything not supplied by the caller is read from the tensors.
    input_dim = input_dim or K.shape(x)[2]
    timesteps = timesteps or K.shape(x)[1]
    output_dim = output_dim or K.int_shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # Build one mask from the first timestep and repeat it along time,
        # so every timestep sees the same dropout pattern.
        first_step = K.reshape(x[:, 0, :], (-1, input_dim))
        mask = K.dropout(K.ones_like(first_step), dropout)
        mask_over_time = K.repeat(mask, timesteps)
        x = K.in_train_phase(x * mask_over_time, x, training=training)

    # Merge the batch and time axes so a single matrix product covers all
    # temporal slices.
    flat = K.reshape(x, (-1, input_dim))
    projected = K.dot(flat, w)
    if b is not None:
        projected = K.bias_add(projected, b)

    # Restore the 3D layout.
    if K.backend() != 'tensorflow':
        return K.reshape(projected, (-1, timesteps, output_dim))

    restored = K.reshape(projected, K.stack([-1, timesteps, output_dim]))
    restored.set_shape([None, None, output_dim])
    return restored
def call(self, inputs, states, training=None):
    """Run one LSTM-style step built on `input_conv` / `recurrent_conv`.

    Dropout masks are created on first use and cached on the instance in
    `_dropout_mask` / `_recurrent_dropout_mask`. A mask stays `None` while
    its dropout rate is not strictly between 0 and 1; in that case the mask
    is neither inspected nor applied.

    # Arguments
        inputs: input tensor for the current step.
        states: sequence holding the previous memory state at index 0 and
            the previous carry state at index 1.
        training: forwarded to `self._generate_dropout_mask`. When it is
            None and any dropout rate is positive, the output is flagged
            with `_uses_learning_phase = True`.

    # Returns
        Tuple `(h, [h, c])`: the new output and the new state list.
    """
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = self._generate_dropout_mask(
            K.ones_like(inputs),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = self._generate_dropout_mask(
            K.ones_like(states[1]),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    # Reconcile a cached mask with the current input by keeping only the
    # first half of the mask's batch axis. The four masks are passed to
    # tf.slice together, so axis 0 of begin/size indexes the masks.
    # NOTE(review): presumably the masks were generated for a batch twice
    # the current size - confirm against the caller.
    #
    # The `is not None` guards are the fix: with dropout disabled the mask
    # is None and indexing it used to raise TypeError.
    # `not (a == b)` is kept on purpose instead of `a != b`, because shape
    # objects may define the two operators differently.
    if dp_mask is not None:
        shape = dp_mask[0].shape
        if not (inputs.shape == shape):
            dp_mask = tf.slice(
                dp_mask,
                [0, 0, 0, 0, 0],
                [4, shape[0] // 2, shape[1], shape[2], shape[3]])

    if rec_dp_mask is not None:
        shape = rec_dp_mask[0].shape
        # Only the leading axis is compared for the recurrent mask.
        if not (inputs.shape[0] == shape[0]):
            rec_dp_mask = tf.slice(
                rec_dp_mask,
                [0, 0, 0, 0, 0],
                [4, shape[0] // 2, shape[1], shape[2], shape[3]])

    if 0 < self.dropout < 1.:
        inputs_i = inputs * dp_mask[0]
        inputs_f = inputs * dp_mask[1]
        inputs_c = inputs * dp_mask[2]
        inputs_o = inputs * dp_mask[3]
    else:
        inputs_i = inputs
        inputs_f = inputs
        inputs_c = inputs
        inputs_o = inputs

    if 0 < self.recurrent_dropout < 1.:
        h_tm1_i = h_tm1 * rec_dp_mask[0]
        h_tm1_f = h_tm1 * rec_dp_mask[1]
        h_tm1_c = h_tm1 * rec_dp_mask[2]
        h_tm1_o = h_tm1 * rec_dp_mask[3]
    else:
        h_tm1_i = h_tm1
        h_tm1_f = h_tm1
        h_tm1_c = h_tm1
        h_tm1_o = h_tm1

    # Input contributions, one per gate.
    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
                          padding=self.padding)
    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
                          padding=self.padding)
    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
                          padding=self.padding)
    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
                          padding=self.padding)

    # Recurrent contributions, one per gate.
    h_i = self.recurrent_conv(h_tm1_i, self.recurrent_kernel_i)
    h_f = self.recurrent_conv(h_tm1_f, self.recurrent_kernel_f)
    h_c = self.recurrent_conv(h_tm1_c, self.recurrent_kernel_c)
    h_o = self.recurrent_conv(h_tm1_o, self.recurrent_kernel_o)

    i = self.recurrent_activation(x_i + h_i)
    f = self.recurrent_activation(x_f + h_f)
    c = f * c_tm1 + i * self.activation(x_c + h_c)
    o = self.recurrent_activation(x_o + h_o)
    h = o * self.activation(c)

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True

    return h, [h, c]
def __init__(self, n_classes, input_dims, lr, top_rnns=True,
             metrics_eval_discard_first_classes=2):
    """Build the model, wrap it for multi-GPU if possible, and compile it.

    # Arguments
        n_classes: number of units of the softmax output layer.
        input_dims: size of the last axis of the model input.
        lr: learning rate handed to the Adam optimizer.
        top_rnns: when true, two `get_bi_lstm()` layers are stacked
            between the input and the dense output layer.
        metrics_eval_discard_first_classes: number of leading classes
            left out of the mask shared by the `tp`, `fp`, `fn` and
            `total_count` metrics.

    Side effects: stores the plain model in `self.model_save`, the model
    used for training in `self.model`, and writes `model.png`.
    """
    self.train_history = None

    def kept_classes_mask(y_true):
        # 1) gold values of every class after the discarded leading ones
        #    are summed along the class axis
        # 2) the resulting tensor is repeated n_classes times on a new
        #    last axis, bringing it back to the format of y_true
        kept = K.sum(y_true[:, :, metrics_eval_discard_first_classes:],
                     axis=-1)
        return K.stack([kept] * n_classes, axis=-1)

    encodings = Input(shape=(None, input_dims), dtype='float32',
                      name='bert_encodings')
    hidden = encodings
    if top_rnns:
        for _ in range(2):
            hidden = get_bi_lstm()(hidden)
    probabilities = Dense(n_classes, activation='softmax')(hidden)
    self.model_save = Model(encodings, probabilities)

    # Count GPUs from the device info by hand: virtual devices
    # (e.g. XLA_GPU) would otherwise give a wrong number.
    devices = device_lib.list_local_devices()
    gpus = sum(1 for device in devices if device.device_type == 'GPU')
    if gpus > 1:
        self.model = multi_gpu_model(self.model_save, gpus=gpus,
                                     cpu_relocation=True)
        logging.info(f"Training using {gpus} GPUs...")
    else:
        self.model = self.model_save
        logging.info("Training using single GPU or CPU...")

    optimizer = Adam(lr=lr)

    # Each counter receives a tuple of conditions; the last entry is always
    # the mask that leaves out the discarded leading classes.
    metrics = [
        ANDCounter(
            conditions_and=lambda y_true, y_pred: (
                y_true,
                K.round(y_pred),
                kept_classes_mask(y_true),
            ),
            name='tp'),
        ANDCounter(
            conditions_and=lambda y_true, y_pred: (
                K.abs(y_true - K.ones_like(y_true)),
                K.round(y_pred),
                kept_classes_mask(y_true),
            ),
            name='fp'),
        ANDCounter(
            conditions_and=lambda y_true, y_pred: (
                y_true,
                K.abs(K.round(y_pred) - K.ones_like(y_pred)),
                kept_classes_mask(y_true),
            ),
            name='fn'),
        ANDCounter(
            conditions_and=lambda y_true, y_pred: (
                y_true,
                kept_classes_mask(y_true),
            ),
            name='total_count'),
        'acc',
    ]

    self.model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=metrics)
    plot_model(self.model, to_file='model.png', show_shapes=True)