Example #1
    def test_adam(self):
        with self.test_session() as sess:
            w = tf.get_variable(
                "w",
                shape=[3],
                initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
            x = tf.constant([0.4, 0.2, -0.5])
            loss = tf.reduce_mean(tf.square(x - w))
            tvars = tf.trainable_variables()
            grads = tf.gradients(loss, tvars)
            global_step = tf.train.get_or_create_global_step()
            optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
            train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())
            sess.run(init_op)
            for _ in range(100):
                sess.run(train_op)
            w_np = sess.run(w)
            self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
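This test method is shown out of context; a minimal sketch of the surrounding module it assumes (TensorFlow 1.x in graph mode, with the BERT repo's optimization.py importable) could look like this:

import tensorflow as tf      # assumes TensorFlow 1.x (e.g. 1.15)
import optimization          # assumes optimization.py from google-research/bert is on the path


class OptimizationTest(tf.test.TestCase):

    # ... test_adam() from Example #1 goes here ...
    pass


if __name__ == "__main__":
    tf.test.main()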
Example #2
    def __init__(self, bert_config, num_labels, seq_length, init_checkpoint):
        self.bert_config = bert_config
        self.num_labels = num_labels
        self.seq_length = seq_length
        self.tower_grads = []
        self.losses = []

        self.input_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                        name='input_ids')
        self.input_mask = tf.placeholder(tf.int32, [None, self.seq_length],
                                         name='input_mask')
        self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                          name='segment_ids')
        self.labels = tf.placeholder(tf.int32, [None], name='labels')
        self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
        self.is_training = tf.placeholder(tf.bool,
                                          shape=[],
                                          name='is_training')
        print(self.batch_size)
        self.gpu_step = self.batch_size // gpu_nums

        global_step = tf.train.get_or_create_global_step()

        learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

        optimizer = optimization.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
            pred = []
            label = []
            for d in range(gpu_nums):
                with tf.device("/gpu:%s" % d), tf.name_scope("%s_%s" %
                                                             ("tower", d)):
                    self.model = modeling.BertModel(
                        config=self.bert_config,
                        is_training=self.is_training,
                        input_ids=self.input_ids[d * self.gpu_step:(d + 1) *
                                                 self.gpu_step],
                        input_mask=self.input_mask[d * self.gpu_step:(d + 1) *
                                                   self.gpu_step],
                        token_type_ids=self.segment_ids[d *
                                                        self.gpu_step:(d + 1) *
                                                        self.gpu_step])
                    print("GPU:", d)

                    tvars = tf.trainable_variables()
                    initialized_variable_names = {}
                    if init_checkpoint:
                        (assignment_map, initialized_variable_names
                         ) = modeling.get_assignment_map_from_checkpoint(
                             tvars, init_checkpoint)
                        tf.train.init_from_checkpoint(init_checkpoint,
                                                      assignment_map)

                    logging.info("**** Trainable Variables ****")
                    for var in tvars:
                        init_string = ""
                        if var.name in initialized_variable_names:
                            init_string = ", *INIT_FROM_CKPT*"
                        logging.info("  name = %s, shape = %s%s", var.name,
                                     var.shape, init_string)

                    output_layer = self.model.get_pooled_output()
                    logging.info(output_layer)

                    # `is_training` is a placeholder tensor, so a Python `if`
                    # would never take this branch; gate the dropout in-graph.
                    output_layer = tf.cond(
                        self.is_training,
                        lambda: tf.nn.dropout(output_layer, keep_prob=0.9),
                        lambda: output_layer)

                    match_1 = tf.strided_slice(output_layer, [0],
                                               [self.gpu_step], [2])
                    match_2 = tf.strided_slice(output_layer, [1],
                                               [self.gpu_step], [2])

                    match = tf.concat([match_1, match_2], 1)

                    self.logits = tf.layers.dense(match,
                                                  self.num_labels,
                                                  name='fc',
                                                  reuse=tf.AUTO_REUSE)

                    # predicted labels
                    self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits),
                                                1,
                                                name="pred")
                    logging.info(self.y_pred_cls)

                    # ground-truth labels
                    self.r_labels = tf.strided_slice(
                        self.labels[d * self.gpu_step:(d + 1) * self.gpu_step],
                        [0], [self.gpu_step], [2])
                    logging.info(self.r_labels)

                    one_hot_labels = tf.one_hot(self.r_labels,
                                                depth=self.num_labels,
                                                dtype=tf.float32)

                    log_probs = tf.nn.log_softmax(self.logits, axis=-1)
                    # Class-weighted cross entropy (per-class weights 30/9/2/2/9).
                    per_example_loss = -(30 * one_hot_labels[:, 0] * log_probs[:, 0]) \
                                       - (9 * one_hot_labels[:, 1] * log_probs[:, 1]) \
                                       - (2 * one_hot_labels[:, 2] * log_probs[:, 2]) \
                                       - (2 * one_hot_labels[:, 3] * log_probs[:, 3]) \
                                       - (9 * one_hot_labels[:, 4] * log_probs[:, 4]) \
                                       + 1e-10

                    self.loss = tf.reduce_mean(per_example_loss)

                    #self.optim = optimization.create_optimizer(self.loss, learning_rate, num_train_steps, num_warmup_steps, False)

                    tvars = tf.trainable_variables()
                    grads = tf.gradients(self.loss, tvars)

                    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

                    self.tower_grads.append(list(zip(grads, tvars)))
                    self.losses.append(self.loss)
                    label.append(self.r_labels)
                    pred.append(self.y_pred_cls)
                outer_scope.reuse_variables()

        with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
            gradients = self.average_gradients(self.tower_grads)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            new_global_step = global_step + 1
            self.train_op = tf.group(train_op,
                                     [global_step.assign(new_global_step)])
            self.losses = tf.reduce_mean(self.losses)
            self.pred = tf.concat(pred, 0)
            self.label = tf.concat(label, 0)
            logging.info(self.pred)
            logging.info(self.label)
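The constructor above calls `self.average_gradients(self.tower_grads)`, which is not included in this snippet. The sketch below shows the usual multi-tower gradient-averaging helper such a call refers to; it is an assumption, not the author's code:

    def average_gradients(self, tower_grads):
        # tower_grads: one list of (gradient, variable) pairs per GPU tower.
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
            grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
            # Variables are shared across towers, so take the one from tower 0.
            average_grads.append((grad, grad_and_vars[0][1]))
        return average_grads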
Example #3
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = optimization.AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  print(tvars)
  tvars = [v for v in tvars if "bert" not in v.name]
  print("no bert")
  print(tvars)
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
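For context, a hedged usage sketch: `create_optimizer` is normally called from an Estimator `model_fn` once the loss has been built (`build_loss` below is a hypothetical helper and the hyperparameters are illustrative only):

def model_fn(features, labels, mode, params):
  loss = build_loss(features, labels)  # hypothetical loss-construction helper
  train_op = create_optimizer(
      loss,
      init_lr=5e-5,
      num_train_steps=10000,
      num_warmup_steps=1000,
      use_tpu=False)
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)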
Example #4
    def compile(self):
        """Define operations for loss, measures, optimization.
        and create session, initialize variables.
        """
        config = self.config
        # define operations for loss, measures, optimization
        self.loss = self.__compute_loss()
        self.accuracy, self.precision, self.recall, self.f1 = self.__compute_measures()
        with tf.variable_scope('optimization'):
            self.global_step = tf.train.get_or_create_global_step()
            if 'bert' in config.emb_class:
                from bert import optimization
                if config.use_bert_optimization:
                    self.learning_rate = tf.constant(value=config.starter_learning_rate, shape=[], dtype=tf.float32)
                    self.train_op = optimization.create_optimizer(self.loss,
                                                                  config.starter_learning_rate,
                                                                  config.num_train_steps,
                                                                  config.num_warmup_steps,
                                                                  False)
                else:
                    # exponential decay of the learning rate
                    self.learning_rate = tf.train.exponential_decay(config.starter_learning_rate,
                                                                    self.global_step,
                                                                    config.decay_steps,
                                                                    config.decay_rate,
                                                                    staircase=True)
                    # linear warmup, if global_step < num_warmup_steps, then
                    # learning rate = (global_step / num_warmup_steps) * starter_learning_rate
                    global_steps_int = tf.cast(self.global_step, tf.int32)
                    warmup_steps_int = tf.constant(config.num_warmup_steps, dtype=tf.int32)
                    global_steps_float = tf.cast(global_steps_int, tf.float32)
                    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
                    warmup_percent_done = global_steps_float / warmup_steps_float
                    warmup_learning_rate = config.starter_learning_rate * warmup_percent_done
                    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
                    self.learning_rate = ((1.0 - is_warmup) * self.learning_rate + is_warmup * warmup_learning_rate)
                    # Adam optimizer with correct L2 weight decay
                    optimizer = optimization.AdamWeightDecayOptimizer(
                        learning_rate=self.learning_rate,
                        weight_decay_rate=0.01,
                        beta_1=0.9,
                        beta_2=0.999,
                        epsilon=1e-6,
                        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
                    tvars = tf.trainable_variables()
                    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                    new_global_step = self.global_step + 1
                    self.train_op = tf.group(train_op, [self.global_step.assign(new_global_step)])
            else:
                # exponential decay of the learning rate
                self.learning_rate = tf.train.exponential_decay(config.starter_learning_rate,
                                                                self.global_step,
                                                                config.decay_steps,
                                                                config.decay_rate,
                                                                staircase=True)
                # linear warmup, if global_step < num_warmup_steps, then
                # learning rate = (global_step / num_warmup_steps) * starter_learning_rate
                global_steps_int = tf.cast(self.global_step, tf.int32)
                warmup_steps_int = tf.constant(config.num_warmup_steps, dtype=tf.int32)
                global_steps_float = tf.cast(global_steps_int, tf.float32)
                warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
                warmup_percent_done = global_steps_float / warmup_steps_float
                warmup_learning_rate = config.starter_learning_rate * warmup_percent_done
                is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
                self.learning_rate = ((1.0 - is_warmup) * self.learning_rate + is_warmup * warmup_learning_rate)
                # Adam optimizer
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                '''
                # Adam optimizer with cyclical learning rate
                import clr # https://github.com/mhmoodlan/cyclic-learning-rate
                self.learning_rate = clr.cyclic_learning_rate(global_step=self.global_step,
                                                              learning_rate=config.starter_learning_rate * 0.3, # 0.0003
                                                              max_lr=config.starter_learning_rate,              # 0.001
                                                              step_size=5000,
                                                              mode='triangular')
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                '''

        # create session, initialize variables. this should be placed at the end of graph definitions.
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      inter_op_parallelism_threads=0,
                                      intra_op_parallelism_threads=0)
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        feed_dict = {self.wrd_embeddings_init: config.embvec.wrd_embeddings}
        sess.run(tf.global_variables_initializer(), feed_dict=feed_dict) # feed large embedding data
        sess.run(tf.local_variables_initializer()) # for tf_metrics
        self.sess = sess
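A hedged sketch of the `config` attributes that `compile()` reads (the attribute names come from the code above; the values are purely illustrative):

class Config:
    emb_class = 'bert'                  # or a non-BERT embedding class
    use_bert_optimization = False
    starter_learning_rate = 2e-5
    decay_steps = 2000
    decay_rate = 0.9
    num_train_steps = 10000
    num_warmup_steps = 1000
    clip_norm = 1.0
    embvec = None                       # object exposing .wrd_embeddings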
Example #5
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     exclude_bert):
    """Creates an optimizer training op, optionally excluding BERT vars."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = tf.estimator.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    if exclude_bert:
        bert_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "bert")
        tvars = [vv for vv in tvars if vv not in bert_vars]

    tf.logging.info("Training the following variables:")
    for vv in tvars:
        tf.logging.info(vv.name)

    grads = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Example #6
def main(_):

    logging.set_verbosity(logging.INFO)
    bert_config = bert_modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf_contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf_contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf_contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf_contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    '''if FLAGS.do_train:
    with tf.gfile.Open(FLAGS.record_count_file, "r") as f:
      num_train_features = int(f.read().strip())
    num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                          FLAGS.num_train_epochs)
    logging.info("record_count_file: %s", FLAGS.record_count_file)
    logging.info("num_records (features): %d", num_train_features)
    logging.info("num_train_epochs: %d", FLAGS.num_train_epochs)
    logging.info("train_batch_size: %d", FLAGS.train_batch_size)
    logging.info("num_train_steps: %d", num_train_steps)

    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = tydi_modeling.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this falls back to normal Estimator on CPU or GPU.
  estimator = tf_contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)'''

    if FLAGS.do_train:

        with tf.gfile.Open(FLAGS.record_count_file, "r") as f:
            num_train_features = int(f.read().strip())
        num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        logging.info("record_count_file: %s", FLAGS.record_count_file)
        logging.info("num_records (features): %d", num_train_features)
        logging.info("num_train_epochs: %d", FLAGS.num_train_epochs)
        logging.info("train_batch_size: %d", FLAGS.train_batch_size)
        logging.info("num_train_steps: %d", num_train_steps)

        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        logging.info("Running training on precomputed features")
        logging.info("  Num split examples = %d", num_train_features)
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_filenames = tf.gfile.Glob(FLAGS.train_records_file)

        model_fn = tydi_modeling.model_fn_builder(
            bert_config=bert_config,
            init_checkpoint=FLAGS.init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu)

        def input_fn_builder(input_file, seq_length, is_training,
                             drop_remainder):
            """Creates an `input_fn` closure to be passed to TPUEstimator."""

            # This needs to be kept in sync with `FeatureWriter`.
            name_to_features = {
                "language_id": tf.FixedLenFeature([], tf.int64),
                "unique_ids": tf.FixedLenFeature([], tf.int64),
                "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
                "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
                "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
            }

            if is_training:
                name_to_features["start_positions"] = tf.FixedLenFeature(
                    [], tf.int64)
                name_to_features["end_positions"] = tf.FixedLenFeature(
                    [], tf.int64)
                name_to_features["answer_types"] = tf.FixedLenFeature([],
                                                                      tf.int64)

            def _decode_record(record, name_to_features):
                example = tf.parse_single_example(record, name_to_features)

                # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
                # So cast all int64 to int32.
                for name in list(example.keys()):
                    t = example[name]
                    if t.dtype == tf.int64:
                        t = tf.to_int32(t)
                        example[name] = t

                return example

            # For training, we want a lot of parallel reading and shuffling.
            # For eval, we want no shuffling and parallel reading doesn't matter.
            d = tf.data.TFRecordDataset(input_file)
            d = d.shuffle(buffer_size=100)
            d = d.map(lambda record: _decode_record(record, name_to_features))
            return d

        def split_train_dev(dataset, train_port, num_dev):
            train_size = int(DATASET_SIZE * train_port)
            sub_dev_size = int(DATASET_SIZE * (1 - train_port) / num_dev)

            full_dataset = dataset.shuffle(100)
            train_dataset = full_dataset.take(train_size)
            test_dataset = full_dataset.skip(train_size)

            shard_devs = []
            for i in range(num_dev):
                shard_devs.append(
                    test_dataset.shard(num_shards=num_dev, index=i))
            return train_dataset, shard_devs

        def count_dataset(dataset):
            cnt = 0
            for i in dataset.repeat(1).make_one_shot_iterator():
                # if cnt % 2000==0:
                # print(cnt)
                cnt += 1

            return cnt

        def split_langs(dataset):
            def dataset_fn(ds, i):
                return ds.filter(lambda x: tf.equal(x['language_id'], i))

            data_set_lst = []
            for i in range(11):
                dataset_filter_lang = dataset.apply(lambda x: dataset_fn(x, i))
                data_set_lst.append(dataset_filter_lang)
            return data_set_lst

        DATASET_SIZE = num_train_features

        NUM_LANGS = 11

        tf_dataset = input_fn_builder(FLAGS.train_records_file, 512, True,
                                      False)

        train_set, dev_shards = split_train_dev(tf_dataset, FLAGS.train_size,
                                                FLAGS.num_dev_sets)
        total_num_train_samples = FLAGS.train_size * DATASET_SIZE
        total_num_dev_samples = DATASET_SIZE - total_num_train_samples

        logging.info("Nums of examples in train dataset = %d",
                     total_num_train_samples)
        logging.info("Total numbers of examples in dev set = %d",
                     total_num_dev_samples)

        train_set_langs = split_langs(train_set)
        dev_set_langs = []

        for div_set in dev_shards:
            dev_set_langs.append(split_langs(div_set))

        # train_set_langs is a 1-D list and dev_set_langs is a 2-D list; the
        # corresponding language ids can be found in the data file.

        def sample_lang_id(lang_freq):
            #print(lang_freq)
            #print(list(range(NUM_LANGS)))
            return choices(list(range(NUM_LANGS)), lang_freq)

        # count the number of training samples in each language

        lang_sample_dist = []
        for lang in train_set_langs:
            lang_cnt = count_dataset(lang)
            print(lang_cnt)
            lang_sample_dist.append(lang_cnt / total_num_train_samples)

        train_samplers = list(
            map(lambda x: x.repeat().batch(1).make_one_shot_iterator(),
                train_set_langs))

        dev_samplers = []
        for dev_set in dev_set_langs:
            dev_samplers.append(
                list(
                    map(
                        lambda x: iter(x.repeat().batch(1).
                                       make_one_shot_iterator()), dev_set)))

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.constant(value=FLAGS.learning_rate,
                                    shape=[],
                                    dtype=tf.float32)
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        #uncomment this to enable warm-up steps
        '''if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = FLAGS.learning_rate * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
                (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)'''

        optimizer = opt.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # MultiDDS training: initialize the language-sampling distribution
        # parameters phi
        phi = tf.get_variable(
            "phi", [11],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        opt_scorer = tf.train.AdamOptimizer(learning_rate=0.001,
                                            beta1=0.9,
                                            beta2=0.999,
                                            epsilon=1e-08,
                                            use_locking=False,
                                            name='Adam')

        while global_step < num_train_steps:
            if not tf.equal(global_step, 0):
                lang_sample_dist = list(tf.nn.softmax(phi).numpy())

            #load training data with phi
            logging.info('We are sampling from train data')
            data_lst = []
            while len(data_lst) < FLAGS.M:
                #choose a langue to sample
                cur_lang = sample_lang_id(lang_sample_dist)
                data_lst.append(train_samplers[cur_lang[0]].get_next())

            logging.info('Train mBert for multiple steps')
            for data in data_lst:
                with tf.GradientTape() as tape:
                    tvars, loss = model_fn(data, _,
                                           tf.estimator.ModeKeys.TRAIN, _,
                                           global_step)
                    #print(loss)

                grads = tape.gradient(loss, tvars)
                (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
                optimizer.apply_gradients(zip(grads, tvars),
                                          global_step=global_step)

            logging.info('Estimate the effect of each language')
            rewards = []
            for i in range(NUM_LANGS):
                gradient_dev = 0
                gradient_train = 0
                #Some languages might not have samples
                try:
                    train_test = train_samplers[i].get_next()
                    with tf.GradientTape() as tape:
                        tvars, loss = model_fn(data, _,
                                               tf.estimator.ModeKeys.TRAIN, _,
                                               global_step)

                    grads = tape.gradient(loss, tvars)
                    gradient_train = grads
                    #Not sure whether to add this line or not
                    #TODO: modify me to allow functions
                    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
                    optimizer.apply_gradients(zip(grads, tvars),
                                              global_step=global_step)

                    logging.info("Testing effect on other languages")

                    for k in range(len(dev_samplers)):
                        for j in range(NUM_LANGS):
                            try:
                                dev_data = dev_samplers[k][j].get_next()
                                with tf.GradientTape() as tape:
                                    tvars, loss = model_fn(
                                        data, _, tf.estimator.ModeKeys.TRAIN,
                                        _, global_step)
                                grads = tape.gradient(loss, tvars)
                                gradient_dev += grads

                            except:
                                print(j, 'language not exist in dataset', k)
                except:
                    print("No data in this train language!!!")

                #append scores of each language to reward list
                print(gradient_train, gradient_dev)
                normalize_a = tf.nn.l2_normalize(gradient_dev, 0)
                normalize_b = tf.nn.l2_normalize(gradient_train, 0)
                cos_similarity = tf.reduce_sum(
                    tf.multiply(normalize_a, normalize_b))
                rewards.append(cos_similarity)

            logging.info("Optimize phi!")
            grad_phi = 0
            for i in range(NUM_LANGS):
                # Compute the log-softmax inside the tape so its gradient
                # w.r.t. phi is recorded.
                with tf.GradientTape() as tape:
                    log_i = tf.log(tf.nn.softmax(phi))[i]
                grads = tape.gradient(log_i, phi)
                grad_phi += grads * rewards[i]
            opt_scorer.apply_gradients([(grad_phi, phi)],
                                       global_step=global_step)

            new_global_step = global_step + 1
            global_step.assign(new_global_step)

        #estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        if not FLAGS.precomputed_predict_file:
            predict_examples_iter = preproc.read_tydi_examples(
                input_file=FLAGS.predict_file,
                is_training=False,
                max_passages=FLAGS.max_passages,
                max_position=FLAGS.max_position,
                fail_on_invalid=FLAGS.fail_on_invalid,
                open_fn=tf_io.gopen)
            shards_iter = write_tf_feature_files(predict_examples_iter)
        else:
            # Uses zeros for example and feature counts since they're unknown, and
            # we only use them for logging anyway.
            shards_iter = enumerate(
                ((f, 0, 0)
                 for f in tf.gfile.Glob(FLAGS.precomputed_predict_file)), 1)

        # Accumulates all of the prediction results to be written to the output.
        full_tydi_pred_dict = {}
        total_num_examples = 0
        for shard_num, (shard_filename, shard_num_examples,
                        shard_num_features) in shards_iter:
            total_num_examples += shard_num_examples
            logging.info(
                "Shard %d: Running prediction for %s; %d examples, %d features.",
                shard_num, shard_filename, shard_num_examples,
                shard_num_features)

            # Runs the model on the shard and stores the individual results.
            # If running predict on TPU, you will need to specify the number of steps.
            predict_input_fn = tf_io.input_fn_builder(
                input_file=[shard_filename],
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)
            all_results = []
            for result in estimator.predict(predict_input_fn,
                                            yield_single_examples=True):
                if len(all_results) % 10000 == 0:
                    logging.info("Shard %d: Predicting for feature %d/%s",
                                 shard_num, len(all_results),
                                 shard_num_features)
                unique_id = int(result["unique_ids"])
                start_logits = [float(x) for x in result["start_logits"].flat]
                end_logits = [float(x) for x in result["end_logits"].flat]
                answer_type_logits = [
                    float(x) for x in result["answer_type_logits"].flat
                ]
                all_results.append(
                    tydi_modeling.RawResult(
                        unique_id=unique_id,
                        start_logits=start_logits,
                        end_logits=end_logits,
                        answer_type_logits=answer_type_logits))

            # Reads the prediction candidates from the (entire) prediction input file.
            candidates_dict = read_candidates(FLAGS.predict_file)
            predict_features = [
                tf.train.Example.FromString(r)
                for r in tf.python_io.tf_record_iterator(shard_filename)
            ]
            logging.info("Shard %d: Post-processing predictions.", shard_num)
            logging.info(
                "  Num candidate examples loaded (includes all shards): %d",
                len(candidates_dict))
            logging.info("  Num candidate features loaded: %d",
                         len(predict_features))
            logging.info("  Num prediction result features: %d",
                         len(all_results))
            logging.info("  Num shard features: %d", shard_num_features)

            tydi_pred_dict = postproc.compute_pred_dict(
                candidates_dict,
                predict_features, [r._asdict() for r in all_results],
                candidate_beam=FLAGS.candidate_beam)

            logging.info("Shard %d: Post-processed predictions.", shard_num)
            logging.info("  Num shard examples: %d", shard_num_examples)
            logging.info("  Num post-processed results: %d",
                         len(tydi_pred_dict))
            if shard_num_examples != len(tydi_pred_dict):
                logging.warning("  Num missing predictions: %d",
                                shard_num_examples - len(tydi_pred_dict))
            for key, value in tydi_pred_dict.items():
                if key in full_tydi_pred_dict:
                    logging.warning(
                        "ERROR: '%s' already in full_tydi_pred_dict!", key)
                full_tydi_pred_dict[key] = value

        logging.info("Prediction finished for all shards.")
        logging.info("  Total input examples: %d", total_num_examples)
        logging.info("  Total output predictions: %d",
                     len(full_tydi_pred_dict))

        with tf.gfile.Open(FLAGS.output_prediction_file, "w") as output_file:
            for prediction in full_tydi_pred_dict.values():
                output_file.write((json.dumps(prediction) + "\n").encode())
Example #7
    def model_fn(features, labels, mode, params):

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (loss, per_example_loss, logits,
         predicted_labels) = create_model(bert_config, is_training, features,
                                          num_labels)

        # Used for initializing the BERT model from the checkpoint
        tvars = tf.trainable_variables()
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            tf.logging.info("**** Loading from checkpoint ****")

        output_spec = None

        if mode == tf.estimator.ModeKeys.TRAIN:

            # learning rate: linear warmup with exponential decay
            global_step = tf.train.get_or_create_global_step()
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step,
                                                       2000,
                                                       0.9,
                                                       staircase=True)

            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = starter_learning_rate * warmup_percent_done
            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

            # Adam optimizer with correct L2 weight decay
            optimizer = optimization.AdamWeightDecayOptimizer(
                learning_rate=learning_rate,
                weight_decay_rate=0.01,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 2.0)
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 global_step=global_step)
            new_global_step = global_step + 1
            train_op = tf.group(train_op,
                                [global_step.assign(new_global_step)])

            tf.summary.scalar('learning_rate', learning_rate)
            logging_hook = tf.train.LoggingTensorHook({"batch_loss": loss},
                                                      every_n_iter=10)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                train_op=train_op,
                training_hooks=[logging_hook])

        else:

            # metric_fn uses tf_metrics to calculate f1, precision and recall
            # for the punctuation classes
            def metric_fn(label_ids, predicted_labels, input_mask, num_labels):

                label_ids = tf.boolean_mask(label_ids, input_mask)
                predicted_labels = tf.boolean_mask(predicted_labels,
                                                   input_mask)

                precision = tf_metrics.precision(label_ids,
                                                 predicted_labels,
                                                 num_labels, [1, 2, 3],
                                                 average="macro")
                recall = tf_metrics.recall(label_ids,
                                           predicted_labels,
                                           num_labels, [1, 2, 3],
                                           average="macro")
                f1 = tf_metrics.f1(label_ids,
                                   predicted_labels,
                                   num_labels, [1, 2, 3],
                                   average="macro")

                return {
                    "eval_precision": precision,
                    "eval_recall": recall,
                    "eval_f": f1
                }

            input_mask = features["input_mask"]
            label_ids = features["label_ids"]
            eval_metrics = metric_fn(label_ids, predicted_labels, input_mask,
                                     num_labels)

            if mode == tf.estimator.ModeKeys.EVAL:
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metrics)
            else:

                predictions = {'labels': predicted_labels}
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, predictions=predictions)

        return output_spec
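This `model_fn` closes over several names (`bert_config`, `num_labels`, `init_checkpoint`, `starter_learning_rate`, `num_warmup_steps`), so it presumably lives inside a builder along these lines (a sketch, not the author's code):

def model_fn_builder(bert_config, num_labels, init_checkpoint,
                     starter_learning_rate, num_warmup_steps):

    def model_fn(features, labels, mode, params):
        ...  # body as in Example #7 above

    return model_fn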
Example #8
if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)
optimizer = optimization.AdamWeightDecayOptimizer(
    learning_rate=learning_rate,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(  # inside this call you can control whether the BERT parameters are trained
    zip(grads, tvars), global_step=global_step)
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])

# train_op = tf.train.AdamOptimizer(lr).minimize(loss)  # this is the conventional optimization

# where the BERT model parameters are initialized
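As the comment on `apply_gradients` above hints, a common way to control whether the BERT parameters are trained is to filter them out of `tvars` before computing gradients, as in Example #3. A sketch, assuming the BERT variables live under the `bert/` scope:

# Train only the task-specific head: exclude BERT variables (assumes "bert/" scope).
tvars = [v for v in tf.trainable_variables() if not v.name.startswith("bert/")]
grads = tf.gradients(loss, tvars)
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)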