Code example #1
    def DNNclassifier_crps(self, p, num_cut, optimizer, seeding):
        tf.set_random_seed(seeding)
        inputs = Input(shape=(p,))

        # Accept either an optimizer name or an optimizer instance; an
        # instance is cloned by rebuilding it from its configuration.
        if isinstance(optimizer, str):
            opt = optimizer
        else:
            opt_name = optimizer.__class__.__name__
            opt_config = optimizer.get_config()
            opt_class = getattr(optimizers, opt_name)
            opt = opt_class(**opt_config)

        for i, n_neuron in enumerate(self.hidden_list):
            if i == 0:
                net = Dense(n_neuron, kernel_initializer='he_uniform')(inputs)
            else:
                net = Dense(n_neuron, kernel_initializer='he_uniform')(net)
            net = Activation(activation='elu')(net)
            net = BatchNormalization()(net)
            net = Dropout(rate=self.dropout_list[i])(net)

        softmaxlayer = Dense(num_cut + 1, activation='softmax',
                             kernel_initializer='he_uniform')(net)

        # The cumulative sum turns the softmax output into a CDF, which is
        # what the CRPS loss expects.
        output = Lambda(self.tf_cumsum)(softmaxlayer)
        model = Model(inputs=[inputs], outputs=[output])
        model.compile(optimizer=opt, loss=self.crps_loss)

        return model
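
A hedged usage sketch: `clf` stands for an instance of the owning class, which must provide hidden_list, dropout_list, tf_cumsum and crps_loss; the shapes and hyper-parameters below are illustrative only.

# `clf` is an instance of the (not shown) owning class.
model = clf.DNNclassifier_crps(p=20, num_cut=9, optimizer='adam', seeding=42)
model.fit(X_train, y_train, epochs=10, batch_size=64)  # training data assumed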
Code example #2
def build_model(units,
                inputs_dim,
                output="regression",
                sparse_dim=[],
                with_ts=False,
                ts_maxlen=0):
    assert output == "regression" or output == "binary_clf", "This output type is not supported."
    assert len(sparse_dim) == inputs_dim[1], "Dimension not match."

    # Inputs for basic features.
    inputs1 = Input(shape=(inputs_dim[0], ), name="basic_input")
    x1 = Dense(units, kernel_regularizer='l2', activation="relu")(inputs1)

    # Inputs for long one-hot features.
    inputs2 = Input(shape=(inputs_dim[1], ), name="one_hot_input")
    for i in range(len(sparse_dim)):
        # `slice` is not defined in this snippet; a Lambda that extracts
        # column i of the one-hot input is assumed here. mask_zero=True in
        # the original is dropped: nothing downstream consumes the mask and
        # the reshape below would reject a masked input.
        col = Lambda(lambda t, idx=i: t[:, idx:idx + 1])(inputs2)
        if i == 0:
            x2 = Embedding(sparse_dim[i], units)(col)
        else:
            tmp = Embedding(sparse_dim[i], units)(col)
            x2 = Concatenate()([x2, tmp])
    # A Reshape layer replaces the raw tf.reshape call, which is not a Keras
    # layer and breaks the functional graph in older Keras versions.
    x2 = Reshape((units * inputs_dim[1],))(x2)
    x = Concatenate()([x1, x2])

    if with_ts:
        inputs3 = Input(shape=(
            None,
            inputs_dim[2],
        ), name="ts_input")
        x3 = LSTM(units,
                  input_shape=(ts_maxlen, inputs_dim[2]),
                  return_sequences=False)(inputs3)
        x = Concatenate()([x, x3])

    x = Dense(units, kernel_regularizer='l2', activation="relu")(x)
    x = Dropout(0.5)(x)
    x = Dense(units, kernel_regularizer='l2', activation="relu")(x)
    x = Dropout(0.5)(x)

    if output == "regression":
        x = Dense(1, kernel_regularizer='l2')(x)
        model = Model(inputs=[inputs1, inputs2], outputs=x)
        if with_ts:
            model = Model(inputs=[inputs1, inputs2, inputs3], outputs=x)
        model.compile(optimizer='adam', loss='mean_squared_error')

    elif output == "binary_clf":
        x = Dense(1, kernel_regularizer='l2', activation="sigmoid")(x)
        model = Model(inputs=[inputs1, inputs2], outputs=x)
        if with_ts:
            model = Model(inputs=[inputs1, inputs2, inputs3], outputs=x)
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['acc'])

    #model.summary()
    return model
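
A hypothetical call with illustrative dimensions: ten dense features plus three sparse columns whose vocabulary sizes are listed in sparse_dim (the assert requires len(sparse_dim) == inputs_dim[1]).

# Illustrative dimensions only.
model = build_model(units=16,
                    inputs_dim=[10, 3],
                    output="binary_clf",
                    sparse_dim=[50, 20, 30])
model.summary()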
Code example #3
File: models.py Project: MichaelAlthof/USC
def gru_keras(max_features,
              maxlen,
              bidirectional,
              dropout_rate,
              embed_dim,
              rec_units,
              mtype='GRU',
              reduction=None,
              classes=4,
              lr=0.001):

    # K.backend is a function and must be called; comparing the function
    # object itself to a string is always False.
    if K.backend() == 'tensorflow':
        K.clear_session()

    input_layer = Input(shape=(maxlen, ))
    embedding_layer = Embedding(max_features,
                                output_dim=embed_dim,
                                trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(embedding_layer)

    if reduction:
        if mtype == 'GRU':
            if bidirectional:
                x = Bidirectional(
                    CuDNNGRU(units=rec_units, return_sequences=True))(x)
            else:
                x = CuDNNGRU(units=rec_units, return_sequences=True)(x)
        elif mtype == 'LSTM':
            if bidirectional:
                x = Bidirectional(
                    CuDNNLSTM(units=rec_units, return_sequences=True))(x)
            else:
                x = CuDNNLSTM(units=rec_units, return_sequences=True)(x)

        if reduction == 'average':
            x = GlobalAveragePooling1D()(x)
        elif reduction == 'maximum':
            x = GlobalMaxPool1D()(x)
    else:
        if mtype == 'GRU':
            if bidirectional:
                x = Bidirectional(
                    CuDNNGRU(units=rec_units, return_sequences=False))(x)
            else:
                x = CuDNNGRU(units=rec_units, return_sequences=False)(x)
        elif mtype == 'LSTM':
            if bidirectional:
                x = Bidirectional(
                    CuDNNLSTM(units=rec_units, return_sequences=False))(x)
            else:
                x = CuDNNLSTM(units=rec_units, return_sequences=False)(x)

    # softmax, not sigmoid, is the standard pairing with
    # categorical_crossentropy for single-label multi-class output
    output_layer = Dense(classes, activation="softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=lr, clipvalue=1, clipnorm=1),
                  metrics=['acc'])
    return model
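
A hypothetical invocation for a 4-class text classifier; the vocabulary size and sequence length are placeholders, and the CuDNN recurrent layers require a GPU.

# Placeholder hyper-parameters; CuDNNGRU/CuDNNLSTM need a GPU to run.
model = gru_keras(max_features=20000, maxlen=100, bidirectional=True,
                  dropout_rate=0.3, embed_dim=128, rec_units=64,
                  mtype='GRU', reduction='average')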
Code example #4
    def create_model(self):
        base_model = Xception(weights=None,
                              include_top=False,
                              input_shape=(IM_HEIGHT, IM_WIDTH, 3))

        x = base_model.output
        x = GlobalAveragePooling2D()(x)

        predictions = Dense(3, activation="linear")(x)
        model = Model(inputs=base_model.input, outputs=predictions)
        model.compile(loss="mse",
                      optimizer=Adam(lr=0.001),
                      metrics=["accuracy"])
        # model.enable_eager_execution()
        return model
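
A hedged sketch of calling this method: IM_HEIGHT and IM_WIDTH are module-level constants not shown in the snippet (299x299 is Xception's native size), and `obj` stands for an instance of the owning class.

# IM_HEIGHT / IM_WIDTH are assumed module-level constants.
IM_HEIGHT, IM_WIDTH = 299, 299
model = obj.create_model()
model.fit(images, targets, epochs=5)  # images: (N, 299, 299, 3); targets: (N, 3)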
Code example #5
def build_model(hidden_size):
    inputs = Input(shape=(28, 28))
    x1 = Flatten()(inputs)
    x2 = Dense(hidden_size, activation=tf.nn.relu)(x1)
    x3 = Dropout(0.2)(x2)
    x4 = Dense(10, activation=tf.nn.softmax)(x3)
    model = Model(inputs=inputs, outputs=x4)

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Train and evaluate; x_train, y_train, x_test and y_test are assumed to
    # be defined globally (e.g. loaded from keras.datasets.mnist).
    model.fit(x_train, y_train, epochs=5)
    [loss, acc] = model.evaluate(x_test, y_test)
    return [model, acc]
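
The function relies on four globals; a minimal way to provide them, assuming the standard MNIST digits dataset:

# One way to supply the globals build_model expects (standard MNIST).
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # scale pixels to [0, 1]

model, acc = build_model(hidden_size=128)
print('test accuracy:', acc)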
Code example #6
    def create_critic_network(self):

        # parallel 1
        state_input = Input(shape=[self.obs_dim])
        w1 = Dense(self.hidden_dim, activation='relu')(state_input)
        h1 = Dense(self.hidden_dim, activation='linear')(w1)

        # parallel 2
        action_input = Input(shape=[self.act_dim], name='action2')
        a1 = Dense(self.hidden_dim, activation='linear')(action_input)

        # merge
        # h2 = concatenate([h1, a1], mode='sum')  # old Keras merge API
        h2 = concatenate([h1, a1])
        h3 = Dense(self.hidden_dim, activation='relu')(h2)
        value_out = Dense(self.act_dim, activation='linear')(h3)

        model = Model(inputs=[state_input, action_input], outputs=[value_out])
        adam = Adam(self.lr)
        model.compile(loss='mse', optimizer=adam)

        return model, action_input, state_input
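
A hedged sketch of how such a critic is queried in an actor-critic loop; `agent` and the batch arrays are assumptions, not part of the snippet.

# `agent` is an instance of the owning class (hypothetical).
model, action_input, state_input = agent.create_critic_network()
q_values = model.predict([states, actions])  # states: (N, obs_dim); actions: (N, act_dim)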
Code example #7
File: models.py Project: MichaelAlthof/USC
def cnn_keras(max_features,
              maxlen,
              dropout_rate,
              embed_dim,
              num_filters=300,
              classes=4,
              lr=0.001):
    # K.backend is a function and must be called (see gru_keras above)
    if K.backend() == 'tensorflow':
        K.clear_session()
    input_layer = Input(shape=(maxlen, ))
    embedding_layer = Embedding(max_features,
                                output_dim=embed_dim,
                                trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(embedding_layer)
    x = Conv1D(num_filters, 7, activation='relu', padding='same')(x)
    x = GlobalMaxPooling1D()(x)
    # softmax, not sigmoid, pairs with categorical_crossentropy (see gru_keras)
    output_layer = Dense(classes, activation="softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=lr, clipvalue=1, clipnorm=1),
                  metrics=['acc'])
    return model
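
As with gru_keras, a placeholder call:

# Placeholder hyper-parameters, mirroring the gru_keras sketch above.
model = cnn_keras(max_features=20000, maxlen=100,
                  dropout_rate=0.3, embed_dim=128)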
Code example #8
def keras_build_fn(num_feature,
                   num_output,
                   is_sparse,
                   embedding_dim=-1,
                   num_hidden_layer=2,
                   hidden_layer_dim=512,
                   activation='elu',
                   learning_rate=1e-3,
                   dropout=0.5,
                   l1=0.0,
                   l2=0.0,
                   loss='categorical_crossentropy'):
    """Initializes and compiles a Keras DNN model using the Adam optimizer.

  Args:
    num_feature: number of features
    num_output: number of outputs (targets, e.g., classes))
    is_sparse: boolean whether input data is in sparse format
    embedding_dim: int number of nodes in embedding layer; if value is <= 0 then
      no embedding layer will be present in the model
    num_hidden_layer: number of hidden layers
    hidden_layer_dim: int number of nodes in the hidden layer(s)
    activation: string
      activation function for hidden layers; see https://keras.io/activations/
    learning_rate: float learning rate for Adam
    dropout: float proportion of nodes to dropout; values in [0, 1]
    l1: float strength of L1 regularization on weights
    l2: float strength of L2 regularization on weights
    loss: string
      loss function; see https://keras.io/losses/

  Returns:
    model: Keras.models.Model
      compiled Keras model
  """
    assert num_hidden_layer >= 1

    inputs = Input(shape=(num_feature, ), sparse=is_sparse)

    activation_func_args = ()
    if activation.lower() == 'prelu':
        activation_func = PReLU
    elif activation.lower() == 'leakyrelu':
        activation_func = LeakyReLU
    elif activation.lower() == 'elu':
        activation_func = ELU
    elif activation.lower() == 'thresholdedrelu':
        activation_func = ThresholdedReLU
    else:
        activation_func = Activation
        # the trailing comma matters: without it this is a plain string and
        # *activation_func_args would unpack it character by character
        activation_func_args = (activation,)

    if l1 > 0 and l2 > 0:
        reg_init = lambda: regularizers.l1_l2(l1, l2)
    elif l1 > 0:
        reg_init = lambda: regularizers.l1(l1)
    elif l2 > 0:
        reg_init = lambda: regularizers.l2(l2)
    else:
        reg_init = lambda: None

    if embedding_dim > 0:
        # linear embedding layer in front of the first hidden layer
        x = Dense(embedding_dim)(inputs)
    else:
        x = inputs

    x = Dense(hidden_layer_dim, kernel_regularizer=reg_init())(x)
    x = activation_func(*activation_func_args)(x)
    x = Dropout(dropout)(x)

    # add additional hidden layers
    for _ in range(num_hidden_layer - 1):
        x = Dense(hidden_layer_dim, kernel_regularizer=reg_init())(x)
        x = activation_func(*activation_func_args)(x)
        x = Dropout(dropout)(x)

    x = Dense(num_output)(x)
    preds = Activation('softmax')(x)

    model = Model(inputs=inputs, outputs=preds)
    model.compile(optimizer=Adam(lr=learning_rate), loss=loss)

    return model
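
The signature suggests this function is meant as a build_fn for the scikit-learn wrapper; a hedged sketch (the wrapper's import path varies across Keras/TensorFlow versions, and it was removed in recent releases):

# Hedged: KerasClassifier forwards the keyword arguments to keras_build_fn.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

clf = KerasClassifier(build_fn=keras_build_fn,
                      num_feature=100, num_output=10, is_sparse=False,
                      epochs=10, batch_size=32)
clf.fit(X, y)  # X: (N, 100); y: integer labels, one-hot encoded by the wrapper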
Code example #9
File: ml_seq2seq.py Project: jayanti-prasad/BigCode
class seq2seq_train:
    def __init__(self, cfg):
        self.cfg = cfg

        self.enc_inp = None
        self.enc_outp = None
        self.dec_inp = None
        self.dec_outp = None
        self.enc_model = None
        self.model = None

        self.__get_model__()

    def __get_model__(self):

        self.enc_inp = Input(shape=(self.cfg.input_seq_len(), ),
                             name="Encoder-Input")

        embd = Embedding(self.cfg.num_input_tokens(),
                         self.cfg.latent_dim(),
                         name='Encoder-Embedding',
                         mask_zero=False)

        embd_outp = embd(self.enc_inp)

        x = BatchNormalization(name='Encoder-Batchnorm-1')(embd_outp)

        _, state_h = GRU(self.cfg.latent_dim(),
                         return_state=True,
                         name='Encoder-Last-GRU')(x)

        self.enc_model = Model(inputs=self.enc_inp,
                               outputs=state_h,
                               name='Encoder-Model')

        self.enc_outp = self.enc_model(self.enc_inp)

        self.cfg.logger.info("********** Encoder Model summary **************")
        self.cfg.logger.info(self.enc_model.summary())

        # get the decoder

        self.dec_inp = Input(shape=(None, ), name='Decoder-Input')

        dec_emb = Embedding(self.cfg.num_output_tokens(),
                            self.cfg.latent_dim(),
                            name='Decoder-Embedding',
                            mask_zero=False)(self.dec_inp)

        dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

        decoder_gru = GRU(self.cfg.latent_dim(),
                          return_state=True,
                          return_sequences=True,
                          name='Decoder-GRU')

        decoder_gru_output, _ = decoder_gru(dec_bn,
                                            initial_state=self.enc_outp)

        x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

        dec_dense = Dense(self.cfg.num_output_tokens(),
                          activation='softmax',
                          name='Final-Output-Dense')

        self.dec_outp = dec_dense(x)

        model_inp = [self.enc_inp, self.dec_inp]

        self.model = Model(model_inp, self.dec_outp)

        self.cfg.logger.info("********** Full Model summary **************")

        self.cfg.logger.info(str(self.model.summary()))

        plot_model(self.model,
                   to_file=self.cfg.scratch_dir() + os.sep + "seq2seq.png")

    def fit_model(self, input_vecs, output_vecs):

        input_data = [input_vecs, output_vecs[:, :-1]]
        output_data = output_vecs[:, 1:]

        self.model.compile(optimizer=optimizers.Nadam(lr=0.001),
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])

        model_checkpoint = ModelCheckpoint(self.cfg.output_dir() + os.sep +
                                           'model.hdf5',
                                           monitor='val_loss',
                                           save_best_only=True,
                                           period=1)

        csv_logger = CSVLogger(self.cfg.log_dir() + os.sep + 'history.csv')
        tb_dir = self.cfg.log_dir() + os.sep + "tensorboard"

        # tb_dir is a directory, so check with isdir; the original isfile
        # test could never match and stale logs were never removed
        if os.path.isdir(tb_dir):
            rmtree(tb_dir)

        tensorboard = TensorBoard(log_dir=tb_dir,
                                  histogram_freq=10,
                                  batch_size=self.cfg.batch_size(),
                                  write_graph=True,
                                  write_grads=False,
                                  write_images=False,
                                  embeddings_freq=0,
                                  embeddings_layer_names=None,
                                  embeddings_metadata=None,
                                  embeddings_data=None)

        history = self.model.fit(
            input_data,
            np.expand_dims(output_data, -1),
            batch_size=self.cfg.batch_size(),
            epochs=self.cfg.nepochs(),
            validation_split=self.cfg.validation_split(),
            callbacks=[csv_logger, model_checkpoint, tensorboard])

        return history
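
A hedged sketch of the intended call pattern; cfg is a project-specific configuration object exposing the accessors used above (input_seq_len(), latent_dim(), batch_size(), ...), and the vectors are integer token ids.

# cfg and the token-id arrays are assumptions from the surrounding project.
trainer = seq2seq_train(cfg)
history = trainer.fit_model(input_vecs, output_vecs)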
Code example #10
class SiameseNet(object):
    """Class for Siamese Network."""
    def __init__(self, inputs, arch, siam_reg, main_path, y_true):
        self.orig_inputs = inputs
        # set up inputs
        self.inputs = {
            'A': inputs['Unlabeled'],
            'B': Input(shape=inputs['Unlabeled'].get_shape().as_list()[1:]),
            'Labeled': inputs['Labeled'],
        }

        self.main_path = os.path.join(main_path, 'siamese/')
        self.y_true = y_true

        # generate layers
        self.layers = []
        self.layers += util.make_layer_list(arch, 'siamese', siam_reg)

        # create the siamese net
        self.outputs = stack_layers(self.inputs, self.layers)

        # add the distance layer
        self.distance = Lambda(affinities.euclidean_distance,
                               output_shape=affinities.eucl_dist_output_shape)(
                                   [self.outputs['A'], self.outputs['B']])

        # create the distance model for training
        self.net = Model([self.inputs['A'], self.inputs['B']], self.distance)

        # compile the siamese network
        self.net.compile(loss=affinities.get_contrastive_loss(m_neg=1,
                                                              m_pos=0.05),
                         optimizer='rmsprop')

    def train(self,
              pairs_train,
              dist_train,
              pairs_val,
              dist_val,
              lr,
              drop,
              patience,
              num_epochs,
              batch_size,
              dset,
              load=True):
        """Train the Siamese Network."""
        if load:
            # load weights into model
            output_path = os.path.join(self.main_path, dset)
            load_model(self.net, output_path, '_siamese')
            return
        # create handler for early stopping and learning rate scheduling
        self.lh = util.LearningHandler(lr=lr,
                                       drop=drop,
                                       lr_tensor=self.net.optimizer.lr,
                                       patience=patience)

        # initialize the training generator
        train_gen_ = util.train_gen(pairs_train, dist_train, batch_size)

        # format the validation data for keras
        validation_data = ([pairs_val[:, 0], pairs_val[:, 1]], dist_val)

        # compute the steps per epoch
        steps_per_epoch = int(len(pairs_train) / batch_size)

        # train the network
        self.net.fit_generator(train_gen_,
                               epochs=num_epochs,
                               validation_data=validation_data,
                               steps_per_epoch=steps_per_epoch,
                               callbacks=[self.lh])

        model_json = self.net.to_json()
        output_path = os.path.join(self.main_path, dset)
        save_model(self.net, model_json, output_path, '_siamese')

    def predict(self, x, batch_sizes):
        # compute the siamese embeddings of the input data
        return train.predict(self.outputs['A'],
                             x_unlabeled=x,
                             inputs=self.orig_inputs,
                             y_true=self.y_true,
                             batch_sizes=batch_sizes)
Code example #11
# Fragment: the lines above are missing from the source (`x`, `inputs` and
# `labels` are defined there). Judging by the trailing arguments, the
# truncated call is presumably a BatchNormalization layer:
x = BatchNormalization(epsilon=1e-3,
                       center=True,
                       scale=True)(x)

# Flatten layer
# x = Flatten()(x)

# Dense Layer 1
x = Dense(256, activation='relu')(x)
outputs = Dense(len(labels), activation="softmax")(x)

model = Model(inputs, outputs)
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           verbose=1,
                           patience=10,
                           min_delta=0.0001)
checkpoint = ModelCheckpoint('speech2text_model.hdf5',
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

hist = model.fit(x=x_train,
                 y=y_train,
                 epochs=100,
                 # the call is truncated in the source; validation data and the
                 # callbacks defined above are plausible remaining arguments
                 validation_data=(x_valid, y_valid),
                 callbacks=[early_stop, checkpoint])
Code example #12
File: networks.py Project: klinamen/SpectralNet
class SiameseNet:
    def __init__(self, inputs, arch, siam_reg, y_true):
        self.orig_inputs = inputs
        # set up inputs
        self.inputs = {
            'A': inputs['Unlabeled'],
            'B': Input(shape=inputs['Unlabeled'].get_shape().as_list()[1:]),
            'Labeled': inputs['Labeled'],
        }

        self.y_true = y_true

        # generate layers
        self.layers = []
        self.layers += make_layer_list(arch, 'siamese', siam_reg)

        # create the siamese net
        self.outputs = stack_layers(self.inputs, self.layers)

        # add the distance layer
        self.distance = Lambda(costs.euclidean_distance,
                               output_shape=costs.eucl_dist_output_shape)(
                                   [self.outputs['A'], self.outputs['B']])

        # create the distance model for training
        self.net = Model([self.inputs['A'], self.inputs['B']], self.distance)

        # compile the siamese network
        self.net.compile(loss=costs.get_contrastive_loss(m_neg=1, m_pos=0.05),
                         optimizer='rmsprop')

    def train(self, pairs_train, dist_train, pairs_val, dist_val, lr, drop,
              patience, num_epochs, batch_size):
        # create handler for early stopping and learning rate scheduling
        self.lh = LearningHandler(lr=lr,
                                  drop=drop,
                                  lr_tensor=self.net.optimizer.lr,
                                  patience=patience)

        # initialize the training generator
        train_gen_ = train_gen(pairs_train, dist_train, batch_size)

        # format the validation data for keras
        validation_data = ([pairs_val[:, 0], pairs_val[:, 1]], dist_val)

        # compute the steps per epoch
        steps_per_epoch = int(len(pairs_train) / batch_size)

        # train the network
        hist = self.net.fit_generator(train_gen_,
                                      epochs=num_epochs,
                                      validation_data=validation_data,
                                      steps_per_epoch=steps_per_epoch,
                                      callbacks=[self.lh])

        return hist

    def predict(self, x, batch_sizes):
        # compute the siamese embeddings of the input data
        return train.predict(self.outputs['A'],
                             x_unlabeled=x,
                             inputs=self.orig_inputs,
                             y_true=self.y_true,
                             batch_sizes=batch_sizes)
Code example #13
# Fragment: baseModel (a pre-trained CNN base), headModel, and the constants
# DROP, INIT_LR, DEC, BS and EPOCHS are defined earlier in the source.
headModel = LeakyReLU(alpha=0.2)(headModel)
headModel = Dropout(DROP)(headModel)
headModel = Dense(2, activation="softmax")(headModel)

# place the head FC model on top of the base model (this will become the actual model we will train)
model = Model(inputs=baseModel.input, outputs=headModel)
model.summary()

# loop over all layers in the base model and freeze them so they will *not* be updated during the first training process
for layer in baseModel.layers:
    layer.trainable = False

# compile our model
print("[INFO] compiling model...")
opt = Adam(lr=INIT_LR, decay=DEC)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

# train the head of the network
print("[INFO] training head...")
H = model.fit_generator(trainAug.flow(trainX, trainY, batch_size=BS),
                        steps_per_epoch=len(trainX) // BS,
                        validation_data=(testX, testY),
                        validation_steps=len(testX) // BS,
                        epochs=EPOCHS)

# make predictions on the testing set
print("[INFO] evaluating network...")
predIdxs = model.predict(testX, batch_size=BS)

# for each image in the testing set, find the index of the label with the
# largest predicted probability
predIdxs = np.argmax(predIdxs, axis=1)
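
Evaluation of this kind typically ends with a per-class report; a hedged continuation assuming one-hot testY and scikit-learn:

# Hedged continuation: assumes one-hot testY and scikit-learn available.
from sklearn.metrics import classification_report
print(classification_report(testY.argmax(axis=1), predIdxs))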
Code example #14
    def train(self, dataset):
        # Transform data into format to be fed into model

        # Below code is more suitable for run mode than train mode
        '''
    (X, Y, X_valid, Y_valid) = dataset.load_as_list()
    X = self.trainable_model.encode_input(X)
    Y = self.trainable_model.encode_output(Y)
    X_valid = self.trainable_model.encode_input(X_valid)
    Y_valid = self.trainable_model.encode_output(Y_valid)
    '''

        # If using multi-GPU, we could save model/log files in a different
        # directory than the single-GPU one. (Changed to save the multi-gpu
        # model at the same path as the single-gpu model.)
        dir_suffix = ''
        gpu_count = len(self.get_available_gpus())
        #if self.multi_gpu and gpu_count > 1:
        #    dir_suffix = '_' + str(gpu_count) + 'gpus'
        print('Training on ' + str(gpu_count) + ' GPU(s)')

        # In train mode we can load the data in a way that exploits the caching
        # feature. Input and output are loaded by separate calls because they
        # use different transformation approaches.
        (X, Y, X_valid,
         Y_valid) = self.trainable_model.load_encoded_data(dataset)
        print(len(X[0]))
        print(len(Y))
        print(len(X_valid[0]))
        print(len(Y_valid))
        '''
    xx = X[0:5]
    yy = Y[0:5]
    print('xx')
    print(xx)
    print('yy')
    print(yy)
    '''

        training_data_count = 0
        if self.input_transform.get_data_dimension() > 1:
            training_data_count = X[0].shape[0]
        else:
            training_data_count = X.shape[0]
        print('Training data count = ' + str(training_data_count))
        batch_count = int(training_data_count /
                          self.training_config['batch_size'])
        print('Batch count = ' + str(batch_count))
        training_data_count = int(batch_count *
                                  self.training_config['batch_size'])
        print('Training data used = ' + str(training_data_count))
        epochs_count = int(self.training_config['epochs'])
        if 'final_epochs' in self.training_config:  # federated learning overrides this value
            epochs_count = int(self.training_config['final_epochs'])
        training_steps = int(batch_count) * epochs_count
        training_batch_count = batch_count

        validation_data_count = 0
        if self.input_transform.get_data_dimension() > 1:
            validation_data_count = X_valid[0].shape[0]
        else:
            validation_data_count = X_valid.shape[0]
        print('Validation data count = ' + str(validation_data_count))
        batch_count = int(validation_data_count /
                          self.training_config['batch_size'])
        print('Batch count = ' + str(batch_count))
        validation_data_count = int(batch_count *
                                    self.training_config['batch_size'])
        print('Validation data used = ' + str(validation_data_count))

        if self.input_transform.get_data_dimension() > 1:
            X = [a[0:training_data_count] for a in X]
            X_valid = [a[0:validation_data_count] for a in X_valid]
            print('>>> X len = ' + str(len(X[0])))
            print('>>> X_valid len = ' + str(len(X_valid[0])))
        else:
            X = X[0:training_data_count]
            X_valid = X_valid[0:validation_data_count]
            print('>>>> X len = ' + str(X.shape[0]))
            print('>>>> X_valid len = ' + str(X_valid.shape[0]))

        if self.output_transform.get_data_dimension() > 1:
            Y = [a[0:training_data_count] for a in Y]
            Y_valid = [a[0:validation_data_count] for a in Y_valid]
            print('>>> Y len = ' + str(len(Y[0])))
            print('>>> Y_valid len = ' + str(len(Y_valid[0])))
        else:
            Y = Y[0:training_data_count]
            Y_valid = Y_valid[0:validation_data_count]
            print('>>>> Y len = ' + str(Y.shape[0]))
            print('>>>> Y_valid len = ' + str(Y_valid.shape[0]))

        # If multi-model, wrap it as Data Parallel trainable model
        if gpu_count > 1:
            with tf.device('/cpu'):
                [input_tensors,
                 output_tensors] = self.trainable_model.get_forward_tensors()
                print("=== INPUT_TENSOR ===")
                print(input_tensors)
                print("=== OUTPUT_TENSOR ===")
                print(output_tensors)
                model = Model(input_tensors, output_tensors)
            print("=== CPU TEMPLATE MODEL ===")
            model.summary()
            single_gpu_model = model  # For saving weight
            model = multi_gpu_model(model, gpus=gpu_count)
            print("=== MULTI-GPU MODEL ===")
            model.summary()

        elif gpu_count == 1:
            with tf.device('/gpu'):
                [input_tensors,
                 output_tensors] = self.trainable_model.get_forward_tensors()
            model = Model(input_tensors, output_tensors)
            single_gpu_model = model

        elif gpu_count == 0:
            with tf.device('/cpu'):
                [input_tensors,
                 output_tensors] = self.trainable_model.get_forward_tensors()
            model = Model(input_tensors, output_tensors)
            single_gpu_model = model

        current_epoch_wrapper = LogCurrentEpochWrapper(self.training_config,
                                                       dir_suffix)
        initial_epoch = 0
        if 'resume_if_possible' in self.training_config and self.training_config[
                'resume_if_possible'] == True:
            initial_epoch = current_epoch_wrapper.get_current_epoch()

        # Home of output directory (support multi-OS)
        output_dir = os.path.join(
            *re.split('/|\\\\', self.training_config['output_dir']))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        optimizer = self.training_config['optimizer']
        if optimizer == 'adam':
            optimizer_params = self.training_config['optimizer_params']
            optimizer = Adam(optimizer_params[0],
                             optimizer_params[1],
                             optimizer_params[2],
                             epsilon=optimizer_params[3])
        elif optimizer == 'bert_adam':
            optimizer_params = self.training_config['optimizer_params']
            # Calculate the total step count and use it as decay_steps (the learning rate reaches 0 at the very end)
            total_steps = batch_count * self.training_config['epochs']
            print('[INFO] Training with BERT Optimizer with decay_steps = ' +
                  str(total_steps))
            from NLP_LIB.optimizer.bert_optimizer import BERTOptimizer
            optimizer = BERTOptimizer(
                decay_steps=total_steps,  # 100000,
                warmup_steps=optimizer_params[2],  # 10000,
                learning_rate=optimizer_params[0],  # 1e-4,
                weight_decay=optimizer_params[1],  # 0.01,
                weight_decay_pattern=[
                    'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
                ],
            )
        elif optimizer == 'bert':
            optimizer_params = self.training_config['optimizer_params']
            from NLP_LIB.ext.bert.optimization import AdamWeightDecayOptimizer
            print('initial_epoch = ' + str(initial_epoch))
            print('training_batch_count = ' + str(training_batch_count))
            initial_step = initial_epoch * training_batch_count
            print('initial_step = ' + str(initial_step))
            optimizer = AdamWeightDecayOptimizer(
                initial_step=
                initial_step,  # Start from current epoch to keep model running with correct LR
                learning_rate=optimizer_params[0],  # 0.0001,
                num_train_steps=training_steps,  # 100,
                warmup_steps=optimizer_params[4],  # 10,
                lr_decay_power=optimizer_params[5],
                weight_decay_rate=optimizer_params[6],
                beta_1=optimizer_params[1],  # 0.9,
                beta_2=optimizer_params[2],  # 0.999,
                epsilon=optimizer_params[3],  # 1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        # Add model metric names and tensors to tracking list
        metric_names = self.trainable_model.get_metric_names()
        metric_funcs = self.trainable_model.get_metric_functions()
        '''
    metric_names = self.trainable_model.get_metric_names()
    metric_tensors = self.trainable_model.get_metric_tensors()
    for metric_name, metric_tensor in zip(metric_names, metric_tensors):
      print('Add Metric: ' + metric_name)
      model.metrics_names.append(metric_name)
      model.metrics_tensors.append(metric_tensor)
    '''

        model.compile(optimizer=optimizer,
                      loss=self.trainable_model.get_loss_function(),
                      metrics=metric_funcs)

        model.summary()

        if self.input_transform.get_data_dimension() > 1:
            x_feed = X
            x_valid_feed = X_valid
        else:
            x_feed = [X]
            x_valid_feed = [X_valid]
            #exit(0)

        if self.output_transform.get_data_dimension() > 1:
            y_feed = Y
            y_valid_feed = Y_valid
        else:
            y_feed = [Y]
            y_valid_feed = [Y_valid]

        # If model is sequence model, we have to feed prev_output too.
        # TODO: Can we embed the flow to generate input list into the data transformation class?
        if isinstance(self.trainable_model, SequenceModelWrapper):
            print('OH NOOO!!!')
            #exit(0)

            x_feed.append(Y)
            x_valid_feed.append(Y_valid)

            # Also, if we are running a Sequence Model, the output will be logits
            # while the labels are sparse values. Keras loss functions need labels
            # and output to have the same dimensionality, so the labels must be
            # converted to dense values too. The conversion to dense is done in a
            # custom loss function in the model, but we need to "prepare" an
            # additional dimension on the sparse labels here.
            y_feed = [np.expand_dims(Y, axis=2)]
            y_valid_feed = [np.expand_dims(Y_valid, axis=2)]

        class CustomTensorBoard(TensorBoard):
            def __init__(
                    self, log_dir,
                    **kwargs):  # add other arguments to __init__ if you need
                super().__init__(log_dir=log_dir, **kwargs)

            def on_epoch_end(self, epoch, logs=None):
                logs = logs or {}
                # If there is learning_rate_tensor in the optimizer, we want to log it too.
                if hasattr(optimizer, 'learning_rate_tensor'):
                    logs.update({
                        'learning_rate':
                        K.eval(optimizer.learning_rate_tensor)
                    })
                '''
        # Also add gradient norm as a default metric
        # Get a "l2 norm of gradients" tensor
        def get_gradient_norm(model):
          with K.name_scope('gradient_norm'):
            grads = K.gradients(model.total_loss, model.trainable_weights)
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
          return norm
        logs.update({'gradient_norm': K.eval(get_gradient_norm(model))})
        '''
                super().on_epoch_end(epoch, logs)

        # Tensorboard log directory
        tboard_log_dir = os.path.join(output_dir, 'tboard_log' + dir_suffix)
        if not os.path.exists(tboard_log_dir):
            os.makedirs(tboard_log_dir)
        tboard_log_saver = CustomTensorBoard(tboard_log_dir,
                                             write_graph=False,
                                             write_images=False)

        # For saving weight history along with accuracy in each epoch (May use a lot of disk)
        verbose_model_saver = None
        if self.training_config['save_weight_history']:
            verbose_log_dir = os.path.join(output_dir,
                                           'weight_history' + dir_suffix)
            if not os.path.exists(verbose_log_dir):
                os.makedirs(verbose_log_dir)
            verbose_weight_history_filepath = os.path.join(
                verbose_log_dir, 'weights.{epoch:02d}-{' +
                self.training_config['watch_metric'] + ':.4f}.h5')

            # If there is an option specifying how often (in epochs) to save weights
            if 'save_weight_every' in self.training_config:
                save_weight_every = self.training_config['save_weight_every']
                print('[INFO] Save weight every = ' + str(save_weight_every))
                verbose_model_saver = RefModelCheckpoint(
                    verbose_weight_history_filepath,
                    single_gpu_model,
                    save_best_only=False,
                    save_weights_only=True,
                    period=save_weight_every)
            else:
                verbose_model_saver = RefModelCheckpoint(
                    verbose_weight_history_filepath,
                    single_gpu_model,
                    save_best_only=False,
                    save_weights_only=True)

        model.summary()

        # Initialize all variables, including local variables created by metrics calculations and optimizers.
        sess = K.get_session()
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)

        #####
        ## DEBUG Print some training variable before loading checkpoint
        #global_vars = tf.global_variables()
        #print('[DEBUG]: First Weight Name = ' + str(global_vars[0].name))
        #print('[DEBUG]: First Weight = ' + str(sess.run(global_vars[0])))

        # Callback to model after finish variable initialization, init_from_checkpoint is loaded here.
        self.trainable_model.on_after_init(single_gpu_model)

        # If resume training, load latest checkpoint
        # Checkpoint saving directory
        checkpoint_dir = os.path.join(output_dir, 'checkpoint')
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        last_checkpoint_filepath = os.path.join(
            checkpoint_dir, 'last_weight' + dir_suffix + '.h5')
        if 'resume_if_possible' in self.training_config and self.training_config[
                'resume_if_possible'] == True:
            print('Init model ' + str(self) + ' from epoch: ' +
                  str(initial_epoch))
            if os.path.exists(last_checkpoint_filepath):
                print('Init model ' + str(self) + ' from checkpoint: ' +
                      last_checkpoint_filepath)
                single_gpu_model.load_weights(last_checkpoint_filepath)

        self.training_config['initial_epoch'] = initial_epoch

        checkpoint_filepath = os.path.join(checkpoint_dir,
                                           'best_weight' + dir_suffix + '.h5')
        model_saver = RefModelCheckpoint(checkpoint_filepath,
                                         single_gpu_model,
                                         save_best_only=True,
                                         save_weights_only=True)

        # Also always save lastest model for continue training
        last_model_saver = RefModelCheckpoint(last_checkpoint_filepath,
                                              single_gpu_model,
                                              save_best_only=False,
                                              save_weights_only=True)

        # Construct all training callbacks
        training_callbacks = [model_saver, last_model_saver, tboard_log_saver]
        if verbose_model_saver is not None:
            training_callbacks.append(verbose_model_saver)
        if self.callback_list is not None:
            for callback in self.callback_list:
                training_callbacks.append(callback.get_keras_callback())

        # Save current epoch
        training_callbacks.append(current_epoch_wrapper.get_keras_callback())

        #####
        ## DEBUG Print some training variable before after checkpoint
        #global_vars = tf.global_variables()
        #print('[DEBUG]: First Weight Name = ' + str(global_vars[0].name))
        #print('[DEBUG]: First Weight = ' + str(sess.run(global_vars[0])))

        print('Start training.')
        '''
    with tf.Session(config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=False)) as sess:
      init = tf.global_variables_initializer()
      sess.run(init)      

      model.fit(x=x_feed, y=y_feed,
        batch_size=self.training_config['batch_size'],
        epochs=self.training_config['epochs'],
        validation_data=(x_valid_feed, y_valid_feed),
        callbacks=training_callbacks,
        initial_epoch=initial_epoch
      )
    '''

        # print(model.trainable_weights)

        model.fit(x=x_feed,
                  y=y_feed,
                  batch_size=self.training_config['batch_size'],
                  epochs=self.training_config['epochs'],
                  validation_data=(x_valid_feed, y_valid_feed),
                  callbacks=training_callbacks,
                  initial_epoch=initial_epoch)

        print('Finished training.')

        # Return trained model (single_gpu_model) and validation set as output.
        # They are used for further benchmarking like in federated training.
        return (single_gpu_model, x_valid_feed, y_valid_feed)