Example #1
    def build_bert(self, verbose=True):
        """
        Build a BERT (+ optional CRF) model for sequence labeling.
        """
        # bert inputs
        bert_word_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name="bert_word_input")
        bert_mask_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name='bert_mask_input')
        bert_segment_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name="bert_segment_input")
        
        inputs = [bert_word_ids, bert_mask_ids, bert_segment_ids]

        bert_out = BertLayer(n_fine_tune_layers=self._params.n_fine_tune_layers,
                             bert_path=self._params.bert_path,
                             name="bert_layer")([bert_word_ids, bert_mask_ids, bert_segment_ids])

        features = bert_out

        if self._params.use_dict:
            if verbose: logging.info("use user dict features")
            dict_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype='int32', name='dict_input')
            inputs.append(dict_ids)

            dict_embeddings = Embedding(input_dim=self._params.dict_vocab_size,
                                        output_dim=self._params.dict_embedding_dim,
                                        mask_zero=True,
                                        name='dict_embedding')(dict_ids)

            features = Concatenate(name="bert_and_dict_features")([features, dict_embeddings])

        z = Dense(self._params.fc_dim, activation='relu', name="fc_dense")(features)

        if self._params.use_crf:
            if verbose: logging.info('use crf decode layer')
            crf = CRF(self._params.num_labels, sparse_target=False,
                        learn_mode='marginal', test_mode='marginal', name='crf_out')
            loss = crf.loss_function
            pred = crf(z)
        else:
            loss = 'categorical_crossentropy'
            pred = Dense(self._params.num_labels, activation='softmax', name='softmax_out')(z)

        model = Model(inputs=inputs, outputs=pred)
        model.summary(print_fn=lambda x: logging.info(x + '\n'))

        # It is recommended that you use this optimizer for fine tuning, since this
        # is how the model was trained (note that the Adam m/v variables are NOT
        # loaded from init_checkpoint.)
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=1e-5,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
        
        model.compile(loss=loss, optimizer=optimizer)

        self.model = model
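A minimal sketch of the hyperparameter object this method reads; SimpleNamespace and every field value below are illustrative placeholders, not the project's real params class.

# Hypothetical params for build_bert; the field names are the ones the
# method actually reads, the values are placeholders.
from types import SimpleNamespace

params = SimpleNamespace(
    max_sent_len=128,        # fixed length of all three BERT inputs
    n_fine_tune_layers=3,    # top BERT layers that BertLayer unfreezes
    bert_path="<path or TF-Hub URL to a BERT module>",
    use_dict=False,          # True concatenates user-dict embeddings
    dict_vocab_size=10000,
    dict_embedding_dim=32,
    fc_dim=256,              # width of the fc_dense projection
    use_crf=True,            # CRF decoder instead of plain softmax
    num_labels=9)            # e.g. BIO tags for NER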
Example #2
def get_optimizer(network_config, default_optimizer=None):
    """
    Return the optimizer given by the input network configuration, or a default optimizer.
    :param network_config: network configuration
    :param default_optimizer: fallback optimizer (Adadelta with learning rate 1.0 when None)
    :return: configured optimizer
    """
    try:
        optimizer = network_config.optimizer
    except KeyError:
        logging.info("Using Adadelta as default optimizer.")
        # Build the fallback lazily rather than as a mutable default argument,
        # which would construct a TF optimizer at import time.
        return default_optimizer or train.AdadeltaOptimizer(learning_rate=1.0)
    if isinstance(optimizer.lr, numbers.Number):
        lr = optimizer.lr
    else:
        optimizer.lr.num_train_steps = network_config.max_steps
        optimizer.lr.steps_per_epoch = network_config.steps_per_epoch
        lr = get_learning_rate(optimizer.lr, train.get_global_step())

    name = optimizer.name
    params = optimizer.params
    if "Adadelta" == name:
        opt = train.AdadeltaOptimizer(lr, **params)
    elif "Adam" == name:
        opt = train.AdamOptimizer(lr, **params)
    elif "LazyAdam" == name:
        opt = LazyAdamOptimizer(lr, **params)
    elif "LazyNadam" == name:
        opt = LazyNadamOptimizer(lr, **params)
    elif "SGD" == name:
        opt = train.GradientDescentOptimizer(lr)
    elif "Momentum" == name:
        opt = train.MomentumOptimizer(lr, **params)
    elif "Nadam" == name:
        opt = NadamOptimizerSparse(lr, **params)
    elif "bert" == name:
        opt = AdamWeightDecayOptimizer(
            lr,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Invalid optimizer name: {}".format(name))
    return opt
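Since the function catches KeyError on attribute access, network_config is presumably a dict-like object whose attributes map to keys; a hypothetical sketch of both paths, where _Config stands in for the project's real configuration class.

# Hypothetical usage: attribute access on _Config raises KeyError for
# missing keys, which is what the default-optimizer fallback relies on.
class _Config(dict):
    __getattr__ = dict.__getitem__

opt = get_optimizer(_Config(optimizer=_Config(name="Adam", lr=1e-3, params={})))
fallback = get_optimizer(_Config())  # no 'optimizer' key -> default Adadelta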
Example #3
def create_model(bert_module_path,
                 learning_rate=2e-5,
                 max_seq_length=256,
                 n_tune_layers=3,
                 n_classes=20,
                 optimizer="adam"):
    adam = Adam(learning_rate=learning_rate,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                amsgrad=True)
    adamW = AdamWeightDecayOptimizer(
        learning_rate,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    input_ids = Input(shape=(max_seq_length, ), name="input_ids")
    input_mask = Input(shape=(max_seq_length, ), name="input_masks")
    input_segment = Input(shape=(max_seq_length, ), name="segment_ids")
    bert_inputs = [input_ids, input_mask, input_segment]

    bert = BertLayer(
        bert_module_path,
        seq_len=max_seq_length,
        pooling='cls',  # 'cls' returns the pooled output; anything else returns the sequence
        n_tune_layers=n_tune_layers,
        use_layers=12,
        trainable=True,
        verbose=True)
    dropout = Dropout(0.1)

    preds = Dense(n_classes, activation='softmax')(dropout(bert(bert_inputs)))
    model = Model(inputs=bert_inputs, outputs=preds)

    model.compile(loss='categorical_crossentropy',
                  optimizer=adam if optimizer == "adam" else adamW,
                  metrics=['acc'])
    model.summary()

    return model
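A hypothetical call; the TF-Hub module path is a placeholder, and any optimizer value other than "adam" selects the AdamWeightDecayOptimizer branch.

# Hypothetical usage of create_model; the module path is a placeholder.
model = create_model(bert_module_path="<TF-Hub BERT module path>",
                     learning_rate=2e-5,
                     max_seq_length=128,
                     n_classes=20,
                     optimizer="adamw")  # anything but "adam" -> adamW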
Example #4
def build_model(maxlen):
    bert_path = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1'

    input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    segment_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)

    bert_layer = hub.KerasLayer(bert_path, trainable=True)
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    hid = tf.keras.layers.Reshape((maxlen, 768))(sequence_output)
    hid = tf.keras.layers.Flatten()(hid)
    hid = tf.keras.layers.Dense(128)(hid)

    # softmax so the span heads emit probabilities, which the string loss
    # 'sparse_categorical_crossentropy' expects (it assumes from_logits=False)
    out_begin = tf.keras.layers.Dense(maxlen, activation='softmax', name='Begin')(hid)
    out_end = tf.keras.layers.Dense(maxlen, activation='softmax', name='End')(hid)

    model = tf.keras.models.Model([input_word_ids, input_mask, segment_ids], [out_begin, out_end])
    opt = AdamWeightDecayOptimizer(learning_rate=3e-5)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.summary()

    return model
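The two heads predict answer-span begin and end indices over the maxlen token positions, so the labels are integer positions; a hypothetical call with placeholder array names.

# Hypothetical usage; the input and label arrays are placeholders.
model = build_model(maxlen=384)
# model.fit([word_ids, input_masks, segment_ids],
#           [start_positions, end_positions], ...)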
Example #5
def create_custom_optimizer(tvars, loss, bert_init_lr, task_init_lr,
                            num_train_steps, num_warmup_steps, use_tpu,
                            global_step=None, freeze=-1, task_opt='adam',
                            eps=1e-6):
  """Creates an optimizer training op."""
  if global_step is None:
    global_step = tf.train.get_or_create_global_step()

  bert_learning_rate = tf.constant(value=bert_init_lr, shape=[], dtype=tf.float32)
  task_learning_rate = tf.constant(value=task_init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  bert_learning_rate = tf.train.polynomial_decay(
      bert_learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)
  task_learning_rate = tf.train.polynomial_decay(
      task_learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    bert_warmup_learning_rate = bert_init_lr * warmup_percent_done
    task_warmup_learning_rate = task_init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    bert_learning_rate = (
        (1.0 - is_warmup) * bert_learning_rate + is_warmup * bert_warmup_learning_rate)
    # Blend the task schedule the same way, so both heads warm up together.
    task_learning_rate = (
        (1.0 - is_warmup) * task_learning_rate + is_warmup * task_warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  bert_optimizer = AdamWeightDecayOptimizer(
      learning_rate=bert_learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=eps,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  if task_opt == 'adam_weight_decay':
    task_optimizer = AdamWeightDecayOptimizer(
          learning_rate=task_learning_rate,
          weight_decay_rate=0.01,
          beta_1=0.9,
          beta_2=0.999,
          epsilon=eps
    )
  elif task_opt == 'adam':
    task_optimizer = tf.train.AdamOptimizer(
      learning_rate=task_learning_rate)
  else:
    raise NotImplementedError('Check optimizer. {} is invalid.'.format(task_opt))

  # tvars = tf.trainable_variables()
  bert_vars, task_vars = [], []
  for var in tvars:
    if var.name.startswith('bert'):
      can_optimize = False
      if (var.name.startswith('bert/encoder/layer_') and
          int(var.name.split('/')[2][len('layer_'):]) >= freeze):
        can_optimize = True
      if freeze == -1 or can_optimize:
        bert_vars.append(var)
    else:
      task_vars.append(var)
  print('bert:task', len(bert_vars), len(task_vars))
  grads = tf.gradients(loss, bert_vars + task_vars)
  bert_grads = grads[:len(bert_vars)]
  task_grads = grads[len(bert_vars):]

  # This is how the model was pre-trained.
  (bert_grads, _) = tf.clip_by_global_norm(bert_grads, clip_norm=1.0)
  (task_grads, _) = tf.clip_by_global_norm(task_grads, clip_norm=1.0)

  # global_step1 = tf.Print(global_step, [global_step], 'before')
  bert_train_op = bert_optimizer.apply_gradients(
      zip(bert_grads, bert_vars), global_step=global_step)
  task_train_op = task_optimizer.apply_gradients(
      zip(task_grads, task_vars), global_step=global_step)
  if task_opt == 'adam_weight_decay':
    new_global_step = global_step + 1
    train_op = tf.group(bert_train_op, task_train_op, [global_step.assign(new_global_step)])
  else:
    train_op = tf.group(bert_train_op, task_train_op)
  return train_op
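The freeze logic above keys off variable names; here is a standalone restatement of the same test, assuming BERT-style names such as "bert/encoder/layer_3/...".

# Pure-Python restatement of the layer-freezing test. With freeze == -1
# every BERT variable trains; otherwise only encoder layers with index
# >= freeze do (embeddings then stay frozen).
def is_tuned(var_name, freeze):
    if not var_name.startswith('bert'):
        return True  # task variables always train
    if freeze == -1:
        return True
    return (var_name.startswith('bert/encoder/layer_') and
            int(var_name.split('/')[2][len('layer_'):]) >= freeze)

assert is_tuned('bert/encoder/layer_11/output/dense/kernel', freeze=9)
assert not is_tuned('bert/embeddings/word_embeddings', freeze=9)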
Example #6
def create_optimizer_bplayer(loss, init_lr, num_train_steps, num_warmup_steps,
                             use_tpu, bplayer):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    # tvars = tf.trainable_variables()
    # grads = tf.gradients(loss, tvars)
    with tf.variable_scope("backward_gradients"):
        grads_vals = bplayer.backward_gradients()
    grads, tvars = zip(*grads_vals)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
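The schedule built above reduces to one line of arithmetic per branch; a framework-free sketch, useful for sanity-checking values at a given step.

# Pure-Python restatement of the schedule: linear warmup from 0 to init_lr
# over num_warmup_steps, then linear (polynomial power=1.0, no cycling)
# decay down to 0.0 at num_train_steps.
def scheduled_lr(step, init_lr, num_train_steps, num_warmup_steps):
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    return init_lr * (1.0 - min(step, num_train_steps) / num_train_steps)

assert scheduled_lr(0, 5e-5, 1000, 100) == 0.0
assert scheduled_lr(100, 5e-5, 1000, 100) == 5e-5 * 0.9  # decay takes over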
Example #7
def train(parameters, train_ds, val_ds, wordvec, class_weights):
    tf.enable_eager_execution()
    tf.logging.set_verbosity(tf.logging.ERROR)

    random_seed.set_random_seed(parameters['seed'])

    (device, data_format) = ('/gpu:0', 'channels_first')
    if parameters['no_gpu'] > 0 or not tf.test.is_gpu_available():
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    model = HAN(wordvec, parameters)

    optimizer = AdamWeightDecayOptimizer(
        learning_rate=parameters['learning_rate'],
        weight_decay_rate=0.0,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    timestamp = datetime.now().strftime(' %d%m%y %H%M%S')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(parameters['model_dir'], 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)

    best_acc_ep = (0.0, -1, float('inf'))  # acc, epoch, loss
    patience = 0

    with tf.device(device):
        for ep in range(parameters['train_epochs']):
            start = time.time()
            train_step(model, optimizer, train_ds, step_counter, ep,
                       class_weights, parameters, parameters['log_interval'])

            val_acc, val_loss = test(model,
                                     val_ds,
                                     class_weights,
                                     ds_name='Val')

            end = time.time()
            print('\n Epoch: {} \tTime: {:.6f}'.format(ep + 1, end - start))

            parameters['val_losses'].append(val_loss)

            if val_loss.numpy() < best_acc_ep[2]:
                best_acc_ep = (val_acc.numpy(), ep, val_loss.numpy())
                print('Save checkpoint', checkpoint_prefix)
                checkpoint.save(checkpoint_prefix)


            # Early stopping is disabled in this version (see Example #9):
            # else:
            #     if patience == parameters['patience']:
            #         print('Apply early stopping')
            #         break
            #     patience += 1
            #     print('patience {}/{}'.format(patience, parameters['patience']))

        print('Min loss {:.6f}, dev acc. {:.3f}%, ep {} \n'.format(
            best_acc_ep[2], best_acc_ep[0] * 100., best_acc_ep[1] + 1))

    model._name = "Hybrid Attention Network"
    model.summary()

    plt.ylabel('Training/Validation Loss')
    plt.xlabel('Number of Epochs')
    plt.plot(parameters['train_losses'], label="Train Loss")
    plt.plot(parameters['val_losses'], label="Validation Loss")
    plt.legend()
    plt.savefig('han_training_curve.png')  # save before show(); some backends clear the figure on close
    plt.show()
    plt.gcf().clear()
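Example #9 below keeps the patience-based early stopping that is disabled here; a minimal framework-free sketch of that loop logic.

# Standalone sketch of the patience rule: stop once validation loss fails
# to improve for patience_limit + 1 consecutive epochs.
def should_stop(val_losses, patience_limit):
    best, patience = float('inf'), 0
    for loss in val_losses:
        if loss < best:
            best, patience = loss, 0
        elif patience == patience_limit:
            return True
        else:
            patience += 1
    return False

assert should_stop([1.0, 0.9, 0.95, 0.96, 0.97], patience_limit=2)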
Example #8
    tf.logging.set_verbosity(tf.logging.ERROR)

    print('Load dataset..', params['data_path'])
    dataset = pickle.load(open(params['data_path'], 'rb'))
    train_ds, val_ds, test_ds = dataset.get_dataset(params['batch_size'],
                                                    params['max_date_len'],
                                                    params['max_news_len'])

    train(params, train_ds, val_ds, dataset.wordvec, dataset.class_weights)

    model = HAN(dataset.wordvec, params)

    optimizer = AdamWeightDecayOptimizer(
        learning_rate=params['learning_rate'],
        weight_decay_rate=0.0,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    timestamp = datetime.now().strftime(' %d%m%y %H%M%S')

    checkpoint_prefix = os.path.join(params['model_dir'], 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)

    latest_checkpoint = tf.train.latest_checkpoint(params['model_dir'])
    print('Load the last checkpoint..', latest_checkpoint)
    checkpoint.restore(latest_checkpoint)
Example #9
def run(flags_obj):
    tf.enable_eager_execution()

    random_seed.set_random_seed(flags_obj.seed)

    # Automatically determine device and data_format
    (device, data_format) = ('/gpu:0', 'channels_first')
    if flags_obj.no_gpu > 0 or not tf.test.is_gpu_available():
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    print('Load dataset..', flags_obj.pickle_path)
    dataset = pickle.load(open(flags_obj.pickle_path, 'rb'))
    train_ds, dev_ds, test_ds = dataset.get_dataset(flags_obj.batch_size,
                                                    flags_obj.max_date_len,
                                                    flags_obj.max_news_len)

    model = HAN(dataset.wordvec, flags_obj)

    # optimizer = tf.train.AdamOptimizer(learning_rate=flags_obj.learning_rate)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=flags_obj.learning_rate,
        weight_decay_rate=0.0,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    # Create file writers for writing TensorBoard summaries.
    timestamp = datetime.now().strftime(' %d%m%y %H%M%S')
    if flags_obj.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(flags_obj.output_dir, 'han train' + timestamp)
        dev_dir = os.path.join(flags_obj.output_dir, 'han dev' + timestamp)
        test_dir = os.path.join(flags_obj.output_dir, 'han test' + timestamp)
        tf.gfile.MakeDirs(flags_obj.output_dir)
    else:
        train_dir = None
        dev_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    dev_summary_writer = tf.contrib.summary.create_file_writer(
        dev_dir, flush_millis=10000, name='dev')
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(flags_obj.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)

    best_acc_ep = (0.0, -1, 9999.9)  # acc, epoch, loss
    patience = 0
    with tf.device(device):
        for ep in range(flags_obj.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter, ep,
                      dataset.class_weights, flags_obj.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (%d total steps): %.3f sec' %
                  (ep + 1, step_counter.numpy(), end - start))

            with dev_summary_writer.as_default():
                dev_acc, dev_loss = test(model,
                                         dev_ds,
                                         dataset.class_weights,
                                         ds_name='Dev')

            if dev_loss.numpy() < best_acc_ep[2]:
                best_acc_ep = (dev_acc.numpy(), ep, dev_loss.numpy())
                print('Save checkpoint', checkpoint_prefix)
                checkpoint.save(checkpoint_prefix)
            else:
                if patience == flags_obj.patience:
                    print('Apply early stopping')
                    break

                patience += 1
                print('patience {}/{}'.format(patience, flags_obj.patience))

            print('Min loss {:.6f}, dev acc. {:.3f}%, ep {} \n'.format(
                best_acc_ep[2], best_acc_ep[0] * 100., best_acc_ep[1] + 1))

        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        print('Load the last checkpoint..', latest_checkpoint)
        checkpoint.restore(latest_checkpoint)

        with test_summary_writer.as_default():
            test_acc, test_loss = test(model,
                                       test_ds,
                                       dataset.class_weights,
                                       show_classification_report=True)
        return (test_acc, test_loss, best_acc_ep[1] + 1,
                get_num_trainable_params(model))