Example #1
    def test_adam(self):
        with self.test_session() as sess:
            w = tf.get_variable(
                "w",
                shape=[3],
                initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
            x = tf.constant([0.4, 0.2, -0.5])
            loss = tf.reduce_mean(tf.square(x - w))
            tvars = tf.trainable_variables()
            grads = tf.gradients(loss, tvars)
            global_step = tf.train.get_or_create_global_step()
            optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
            train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())
            sess.run(init_op)
            for _ in range(100):
                sess.run(train_op)
            w_np = sess.run(w)
            self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
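This test method is shown out of context; a minimal sketch of the surrounding module it assumes (TensorFlow 1.x in graph mode, with the BERT repo's optimization.py importable) could look like this:

import tensorflow as tf      # assumes TensorFlow 1.x (e.g. 1.15)
import optimization          # assumes optimization.py from google-research/bert is on the path


class OptimizationTest(tf.test.TestCase):

    # ... test_adam() from Example #1 goes here ...
    pass


if __name__ == "__main__":
    tf.test.main()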
Example #2
    def __init__(self, bert_config, num_labels, seq_length, init_checkpoint):
        self.bert_config = bert_config
        self.num_labels = num_labels
        self.seq_length = seq_length
        self.tower_grads = []
        self.losses = []

        self.input_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                        name='input_ids')
        self.input_mask = tf.placeholder(tf.int32, [None, self.seq_length],
                                         name='input_mask')
        self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                          name='segment_ids')
        self.labels = tf.placeholder(tf.int32, [None], name='labels')
        self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
        self.is_training = tf.placeholder(tf.bool,
                                          shape=[],
                                          name='is_training')
        print(self.batch_size)
        self.gpu_step = self.batch_size // gpu_nums

        global_step = tf.train.get_or_create_global_step()

        learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

        optimizer = optimization.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
            pred = []
            label = []
            for d in range(gpu_nums):
                with tf.device("/gpu:%s" % d), tf.name_scope("%s_%s" %
                                                             ("tower", d)):
                    self.model = modeling.BertModel(
                        config=self.bert_config,
                        is_training=self.is_training,
                        input_ids=self.input_ids[d * self.gpu_step:(d + 1) *
                                                 self.gpu_step],
                        input_mask=self.input_mask[d * self.gpu_step:(d + 1) *
                                                   self.gpu_step],
                        token_type_ids=self.segment_ids[d *
                                                        self.gpu_step:(d + 1) *
                                                        self.gpu_step])
                    print("GPU:", d)

                    tvars = tf.trainable_variables()
                    initialized_variable_names = {}
                    if init_checkpoint:
                        (assignment_map, initialized_variable_names
                         ) = modeling.get_assignment_map_from_checkpoint(
                             tvars, init_checkpoint)
                        tf.train.init_from_checkpoint(init_checkpoint,
                                                      assignment_map)

                    logging.info("**** Trainable Variables ****")
                    for var in tvars:
                        init_string = ""
                        if var.name in initialized_variable_names:
                            init_string = ", *INIT_FROM_CKPT*"
                        logging.info("  name = %s, shape = %s%s", var.name,
                                     var.shape, init_string)

                    output_layer = self.model.get_pooled_output()
                    logging.info(output_layer)

                    # `is_training` is a placeholder tensor, so a Python `if`
                    # would never take this branch; gate the dropout in-graph.
                    output_layer = tf.cond(
                        self.is_training,
                        lambda: tf.nn.dropout(output_layer, keep_prob=0.9),
                        lambda: output_layer)

                    match_1 = tf.strided_slice(output_layer, [0],
                                               [self.gpu_step], [2])
                    match_2 = tf.strided_slice(output_layer, [1],
                                               [self.gpu_step], [2])

                    match = tf.concat([match_1, match_2], 1)

                    self.logits = tf.layers.dense(match,
                                                  self.num_labels,
                                                  name='fc',
                                                  reuse=tf.AUTO_REUSE)

                    # predicted labels
                    self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits),
                                                1,
                                                name="pred")
                    logging.info(self.y_pred_cls)

                    # ground-truth labels
                    self.r_labels = tf.strided_slice(
                        self.labels[d * self.gpu_step:(d + 1) * self.gpu_step],
                        [0], [self.gpu_step], [2])
                    logging.info(self.r_labels)

                    one_hot_labels = tf.one_hot(self.r_labels,
                                                depth=self.num_labels,
                                                dtype=tf.float32)

                    log_probs = tf.nn.log_softmax(self.logits, axis=-1)
                    # Class-weighted cross entropy (per-class weights 30/9/2/2/9).
                    per_example_loss = -(30 * one_hot_labels[:, 0] * log_probs[:, 0]) \
                                       - (9 * one_hot_labels[:, 1] * log_probs[:, 1]) \
                                       - (2 * one_hot_labels[:, 2] * log_probs[:, 2]) \
                                       - (2 * one_hot_labels[:, 3] * log_probs[:, 3]) \
                                       - (9 * one_hot_labels[:, 4] * log_probs[:, 4]) \
                                       + 1e-10

                    self.loss = tf.reduce_mean(per_example_loss)

                    #self.optim = optimization.create_optimizer(self.loss, learning_rate, num_train_steps, num_warmup_steps, False)

                    tvars = tf.trainable_variables()
                    grads = tf.gradients(self.loss, tvars)

                    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

                    self.tower_grads.append(list(zip(grads, tvars)))
                    self.losses.append(self.loss)
                    label.append(self.r_labels)
                    pred.append(self.y_pred_cls)
                outer_scope.reuse_variables()

        with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
            gradients = self.average_gradients(self.tower_grads)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            new_global_step = global_step + 1
            self.train_op = tf.group(train_op,
                                     [global_step.assign(new_global_step)])
            self.losses = tf.reduce_mean(self.losses)
            self.pred = tf.concat(pred, 0)
            self.label = tf.concat(label, 0)
            logging.info(self.pred)
            logging.info(self.label)
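The constructor above calls `self.average_gradients(self.tower_grads)`, which is not included in this snippet. The sketch below shows the usual multi-tower gradient-averaging helper such a call refers to; it is an assumption, not the author's code:

    def average_gradients(self, tower_grads):
        # tower_grads: one list of (gradient, variable) pairs per GPU tower.
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
            grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
            # Variables are shared across towers, so take the one from tower 0.
            average_grads.append((grad, grad_and_vars[0][1]))
        return average_grads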
Example #3
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = optimization.AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  print(tvars)
  tvars = [v for v in tvars if "bert" not in v.name]
  print("no bert")
  print(tvars)
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
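For context, a hedged usage sketch: `create_optimizer` is normally called from an Estimator `model_fn` once the loss has been built (`build_loss` below is a hypothetical helper and the hyperparameters are illustrative only):

def model_fn(features, labels, mode, params):
  loss = build_loss(features, labels)  # hypothetical loss-construction helper
  train_op = create_optimizer(
      loss,
      init_lr=5e-5,
      num_train_steps=10000,
      num_warmup_steps=1000,
      use_tpu=False)
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)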
Example #4
    def compile(self):
        """Define operations for loss, measures, optimization.
        and create session, initialize variables.
        """
        config = self.config
        # define operations for loss, measures, optimization
        self.loss = self.__compute_loss()
        self.accuracy, self.precision, self.recall, self.f1 = self.__compute_measures()
        with tf.variable_scope('optimization'):
            self.global_step = tf.train.get_or_create_global_step()
            if 'bert' in config.emb_class:
                from bert import optimization
                if config.use_bert_optimization:
                    self.learning_rate = tf.constant(value=config.starter_learning_rate, shape=[], dtype=tf.float32)
                    self.train_op = optimization.create_optimizer(self.loss,
                                                                  config.starter_learning_rate,
                                                                  config.num_train_steps,
                                                                  config.num_warmup_steps,
                                                                  False)
                else:
                    # exponential decay of the learning rate
                    self.learning_rate = tf.train.exponential_decay(config.starter_learning_rate,
                                                                    self.global_step,
                                                                    config.decay_steps,
                                                                    config.decay_rate,
                                                                    staircase=True)
                    # linear warmup, if global_step < num_warmup_steps, then
                    # learning rate = (global_step / num_warmup_steps) * starter_learning_rate
                    global_steps_int = tf.cast(self.global_step, tf.int32)
                    warmup_steps_int = tf.constant(config.num_warmup_steps, dtype=tf.int32)
                    global_steps_float = tf.cast(global_steps_int, tf.float32)
                    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
                    warmup_percent_done = global_steps_float / warmup_steps_float
                    warmup_learning_rate = config.starter_learning_rate * warmup_percent_done
                    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
                    self.learning_rate = ((1.0 - is_warmup) * self.learning_rate + is_warmup * warmup_learning_rate)
                    # Adam optimizer with correct L2 weight decay
                    optimizer = optimization.AdamWeightDecayOptimizer(
                        learning_rate=self.learning_rate,
                        weight_decay_rate=0.01,
                        beta_1=0.9,
                        beta_2=0.999,
                        epsilon=1e-6,
                        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
                    tvars = tf.trainable_variables()
                    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                    new_global_step = self.global_step + 1
                    self.train_op = tf.group(train_op, [self.global_step.assign(new_global_step)])
            else:
                # exponential decay of the learning rate
                self.learning_rate = tf.train.exponential_decay(config.starter_learning_rate,
                                                                self.global_step,
                                                                config.decay_steps,
                                                                config.decay_rate,
                                                                staircase=True)
                # linear warmup, if global_step < num_warmup_steps, then
                # learning rate = (global_step / num_warmup_steps) * starter_learning_rate
                global_steps_int = tf.cast(self.global_step, tf.int32)
                warmup_steps_int = tf.constant(config.num_warmup_steps, dtype=tf.int32)
                global_steps_float = tf.cast(global_steps_int, tf.float32)
                warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
                warmup_percent_done = global_steps_float / warmup_steps_float
                warmup_learning_rate = config.starter_learning_rate * warmup_percent_done
                is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
                self.learning_rate = ((1.0 - is_warmup) * self.learning_rate + is_warmup * warmup_learning_rate)
                # Adam optimizer
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                '''
                # Adam optimizer with cyclical learning rate
                import clr # https://github.com/mhmoodlan/cyclic-learning-rate
                self.learning_rate = clr.cyclic_learning_rate(global_step=self.global_step,
                                                              learning_rate=config.starter_learning_rate * 0.3, # 0.0003
                                                              max_lr=config.starter_learning_rate,              # 0.001
                                                              step_size=5000,
                                                              mode='triangular')
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.clip_norm)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)
                '''

        # create session, initialize variables. this should be placed at the end of graph definitions.
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      inter_op_parallelism_threads=0,
                                      intra_op_parallelism_threads=0)
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        feed_dict = {self.wrd_embeddings_init: config.embvec.wrd_embeddings}
        sess.run(tf.global_variables_initializer(), feed_dict=feed_dict) # feed large embedding data
        sess.run(tf.local_variables_initializer()) # for tf_metrics
        self.sess = sess
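A hedged sketch of the `config` attributes that `compile()` reads (the attribute names come from the code above; the values are purely illustrative):

class Config:
    emb_class = 'bert'                  # or a non-BERT embedding class
    use_bert_optimization = False
    starter_learning_rate = 2e-5
    decay_steps = 2000
    decay_rate = 0.9
    num_train_steps = 10000
    num_warmup_steps = 1000
    clip_norm = 1.0
    embvec = None                       # object exposing .wrd_embeddings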
Example #5
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     exclude_bert):
    """Creates an optimizer training op, optionally excluding BERT vars."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = tf.estimator.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    if exclude_bert:
        bert_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "bert")
        tvars = [vv for vv in tvars if vv not in bert_vars]

    tf.logging.info("Training the following variables:")
    for vv in tvars:
        tf.logging.info(vv.name)

    grads = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Example #6
def main(_):

    logging.set_verbosity(logging.INFO)
    bert_config = bert_modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf_contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf_contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf_contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf_contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    '''if FLAGS.do_train:
    with tf.gfile.Open(FLAGS.record_count_file, "r") as f:
      num_train_features = int(f.read().strip())
    num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                          FLAGS.num_train_epochs)
    logging.info("record_count_file: %s", FLAGS.record_count_file)
    logging.info("num_records (features): %d", num_train_features)
    logging.info("num_train_epochs: %d", FLAGS.num_train_epochs)
    logging.info("train_batch_size: %d", FLAGS.train_batch_size)
    logging.info("num_train_steps: %d", num_train_steps)

    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = tydi_modeling.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this falls back to normal Estimator on CPU or GPU.
  estimator = tf_contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)'''

    if FLAGS.do_train:

        with tf.gfile.Open(FLAGS.record_count_file, "r") as f:
            num_train_features = int(f.read().strip())
        num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        logging.info("record_count_file: %s", FLAGS.record_count_file)
        logging.info("num_records (features): %d", num_train_features)
        logging.info("num_train_epochs: %d", FLAGS.num_train_epochs)
        logging.info("train_batch_size: %d", FLAGS.train_batch_size)
        logging.info("num_train_steps: %d", num_train_steps)

        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        logging.info("Running training on precomputed features")
        logging.info("  Num split examples = %d", num_train_features)
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_filenames = tf.gfile.Glob(FLAGS.train_records_file)

        model_fn = tydi_modeling.model_fn_builder(
            bert_config=bert_config,
            init_checkpoint=FLAGS.init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu)

        def input_fn_builder(input_file, seq_length, is_training,
                             drop_remainder):
            """Creates an `input_fn` closure to be passed to TPUEstimator."""

            # This needs to be kept in sync with `FeatureWriter`.
            name_to_features = {
                "language_id": tf.FixedLenFeature([], tf.int64),
                "unique_ids": tf.FixedLenFeature([], tf.int64),
                "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
                "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
                "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
            }

            if is_training:
                name_to_features["start_positions"] = tf.FixedLenFeature(
                    [], tf.int64)
                name_to_features["end_positions"] = tf.FixedLenFeature(
                    [], tf.int64)
                name_to_features["answer_types"] = tf.FixedLenFeature([],
                                                                      tf.int64)

            def _decode_record(record, name_to_features):
                example = tf.parse_single_example(record, name_to_features)

                # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
                # So cast all int64 to int32.
                for name in list(example.keys()):
                    t = example[name]
                    if t.dtype == tf.int64:
                        t = tf.to_int32(t)
                        example[name] = t

                return example

            # For training, we want a lot of parallel reading and shuffling.
            # For eval, we want no shuffling and parallel reading doesn't matter.
            d = tf.data.TFRecordDataset(input_file)
            d = d.shuffle(buffer_size=100)
            d = d.map(lambda record: _decode_record(record, name_to_features))
            return d

        def split_train_dev(dataset, train_port, num_dev):
            train_size = int(DATASET_SIZE * train_port)
            sub_dev_size = int(DATASET_SIZE * (1 - train_port) / num_dev)

            full_dataset = dataset.shuffle(100)
            train_dataset = full_dataset.take(train_size)
            test_dataset = full_dataset.skip(train_size)

            shard_devs = []
            for i in range(num_dev):
                shard_devs.append(
                    test_dataset.shard(num_shards=num_dev, index=i))
            return train_dataset, shard_devs

        def count_dataset(dataset):
            cnt = 0
            for i in dataset.repeat(1).make_one_shot_iterator():
                # if cnt % 2000==0:
                # print(cnt)
                cnt += 1

            return cnt

        def split_langs(dataset):
            def dataset_fn(ds, i):
                return ds.filter(lambda x: tf.equal(x['language_id'], i))

            data_set_lst = []
            for i in range(11):
                dataset_filter_lang = dataset.apply(lambda x: dataset_fn(x, i))
                data_set_lst.append(dataset_filter_lang)
            return data_set_lst

        DATASET_SIZE = num_train_features

        NUM_LANGS = 11

        tf_dataset = input_fn_builder(FLAGS.train_records_file, 512, True,
                                      False)

        train_set, dev_shards = split_train_dev(tf_dataset, FLAGS.train_size,
                                                FLAGS.num_dev_sets)
        total_num_train_samples = FLAGS.train_size * DATASET_SIZE
        total_num_dev_samples = DATASET_SIZE - total_num_train_samples

        logging.info("Nums of examples in train dataset = %d",
                     total_num_train_samples)
        logging.info("Total numbers of examples in dev set = %d",
                     total_num_dev_samples)

        train_set_langs = split_langs(train_set)
        dev_set_langs = []

        for div_set in dev_shards:
            dev_set_langs.append(split_langs(div_set))

        # train_set_langs is a 1-D list and dev_set_langs is a 2-D list; the
        # corresponding language ids can be found in the data file.

        def sample_lang_id(lang_freq):
            #print(lang_freq)
            #print(list(range(NUM_LANGS)))
            return choices(list(range(NUM_LANGS)), lang_freq)

        # count the number of training samples in each language

        lang_sample_dist = []
        for lang in train_set_langs:
            lang_cnt = count_dataset(lang)
            print(lang_cnt)
            lang_sample_dist.append(lang_cnt / total_num_train_samples)

        train_samplers = list(
            map(lambda x: x.repeat().batch(1).make_one_shot_iterator(),
                train_set_langs))

        dev_samplers = []
        for dev_set in dev_set_langs:
            dev_samplers.append(
                list(
                    map(
                        lambda x: iter(x.repeat().batch(1).
                                       make_one_shot_iterator()), dev_set)))

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.constant(value=FLAGS.learning_rate,
                                    shape=[],
                                    dtype=tf.float32)
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        #uncomment this to enable warm-up steps
        '''if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = FLAGS.learning_rate * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
                (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)'''

        optimizer = opt.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # MultiDDS training: initialize the language-sampling distribution
        # parameters phi
        phi = tf.get_variable(
            "phi", [11],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        opt_scorer = tf.train.AdamOptimizer(learning_rate=0.001,
                                            beta1=0.9,
                                            beta2=0.999,
                                            epsilon=1e-08,
                                            use_locking=False,
                                            name='Adam')

        while global_step < num_train_steps:
            if not tf.equal(global_step, 0):
                lang_sample_dist = list(tf.nn.softmax(phi).numpy())

            #load training data with phi
            logging.info('We are sampling from train data')
            data_lst = []
            while len(data_lst) < FLAGS.M:
                #choose a langue to sample
                cur_lang = sample_lang_id(lang_sample_dist)
                data_lst.append(train_samplers[cur_lang[0]].get_next())

            logging.info('Train mBert for multiple steps')
            for data in data_lst:
                with tf.GradientTape() as tape:
                    tvars, loss = model_fn(data, _,
                                           tf.estimator.ModeKeys.TRAIN, _,
                                           global_step)
                    #print(loss)

                grads = tape.gradient(loss, tvars)
                (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
                optimizer.apply_gradients(zip(grads, tvars),
                                          global_step=global_step)

            logging.info('Estimate the effect of each language')
            rewards = []
            for i in range(NUM_LANGS):
                gradient_dev = 0
                gradient_train = 0
                #Some languages might not have samples
                try:
                    train_test = train_samplers[i].get_next()
                    with tf.GradientTape() as tape:
                        tvars, loss = model_fn(data, _,
                                               tf.estimator.ModeKeys.TRAIN, _,
                                               global_step)

                    grads = tape.gradient(loss, tvars)
                    gradient_train = grads
                    #Not sure whether to add this line or not
                    #TODO: modify me to allow functions
                    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
                    optimizer.apply_gradients(zip(grads, tvars),
                                              global_step=global_step)

                    logging.info("Testing effect on other languages")

                    for k in range(len(dev_samplers)):
                        for j in range(NUM_LANGS):
                            try:
                                dev_data = dev_samplers[k][j].get_next()
                                with tf.GradientTape() as tape:
                                    tvars, loss = model_fn(
                                        data, _, tf.estimator.ModeKeys.TRAIN,
                                        _, global_step)
                                grads = tape.gradient(loss, tvars)
                                gradient_dev += grads

                            except:
                                print(j, 'language not exist in dataset', k)
                except:
                    print("No data in this train language!!!")

                #append scores of each language to reward list
                print(gradient_train, gradient_dev)
                normalize_a = tf.nn.l2_normalize(gradient_dev, 0)
                normalize_b = tf.nn.l2_normalize(gradient_train, 0)
                cos_similarity = tf.reduce_sum(
                    tf.multiply(normalize_a, normalize_b))
                rewards.append(cos_similarity)

            logging.info("Optimize phi!")
            grad_phi = 0
            for i in range(NUM_LANGS):
                # Compute the log-softmax inside the tape so its gradient
                # w.r.t. phi is recorded.
                with tf.GradientTape() as tape:
                    log_i = tf.log(tf.nn.softmax(phi))[i]
                grads = tape.gradient(log_i, phi)
                grad_phi += grads * rewards[i]
            opt_scorer.apply_gradients([(grad_phi, phi)],
                                       global_step=global_step)

            new_global_step = global_step + 1
            global_step.assign(new_global_step)

        #estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        if not FLAGS.precomputed_predict_file:
            predict_examples_iter = preproc.read_tydi_examples(
                input_file=FLAGS.predict_file,
                is_training=False,
                max_passages=FLAGS.max_passages,
                max_position=FLAGS.max_position,
                fail_on_invalid=FLAGS.fail_on_invalid,
                open_fn=tf_io.gopen)
            shards_iter = write_tf_feature_files(predict_examples_iter)
        else:
            # Uses zeros for example and feature counts since they're unknown, and
            # we only use them for logging anyway.
            shards_iter = enumerate(
                ((f, 0, 0)
                 for f in tf.gfile.Glob(FLAGS.precomputed_predict_file)), 1)

        # Accumulates all of the prediction results to be written to the output.
        full_tydi_pred_dict = {}
        total_num_examples = 0
        for shard_num, (shard_filename, shard_num_examples,
                        shard_num_features) in shards_iter:
            total_num_examples += shard_num_examples
            logging.info(
                "Shard %d: Running prediction for %s; %d examples, %d features.",
                shard_num, shard_filename, shard_num_examples,
                shard_num_features)

            # Runs the model on the shard and stores the individual results.
            # If running predict on TPU, you will need to specify the number of steps.
            predict_input_fn = tf_io.input_fn_builder(
                input_file=[shard_filename],
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)
            all_results = []
            for result in estimator.predict(predict_input_fn,
                                            yield_single_examples=True):
                if len(all_results) % 10000 == 0:
                    logging.info("Shard %d: Predicting for feature %d/%s",
                                 shard_num, len(all_results),
                                 shard_num_features)
                unique_id = int(result["unique_ids"])
                start_logits = [float(x) for x in result["start_logits"].flat]
                end_logits = [float(x) for x in result["end_logits"].flat]
                answer_type_logits = [
                    float(x) for x in result["answer_type_logits"].flat
                ]
                all_results.append(
                    tydi_modeling.RawResult(
                        unique_id=unique_id,
                        start_logits=start_logits,
                        end_logits=end_logits,
                        answer_type_logits=answer_type_logits))

            # Reads the prediction candidates from the (entire) prediction input file.
            candidates_dict = read_candidates(FLAGS.predict_file)
            predict_features = [
                tf.train.Example.FromString(r)
                for r in tf.python_io.tf_record_iterator(shard_filename)
            ]
            logging.info("Shard %d: Post-processing predictions.", shard_num)
            logging.info(
                "  Num candidate examples loaded (includes all shards): %d",
                len(candidates_dict))
            logging.info("  Num candidate features loaded: %d",
                         len(predict_features))
            logging.info("  Num prediction result features: %d",
                         len(all_results))
            logging.info("  Num shard features: %d", shard_num_features)

            tydi_pred_dict = postproc.compute_pred_dict(
                candidates_dict,
                predict_features, [r._asdict() for r in all_results],
                candidate_beam=FLAGS.candidate_beam)

            logging.info("Shard %d: Post-processed predictions.", shard_num)
            logging.info("  Num shard examples: %d", shard_num_examples)
            logging.info("  Num post-processed results: %d",
                         len(tydi_pred_dict))
            if shard_num_examples != len(tydi_pred_dict):
                logging.warning("  Num missing predictions: %d",
                                shard_num_examples - len(tydi_pred_dict))
            for key, value in tydi_pred_dict.items():
                if key in full_tydi_pred_dict:
                    logging.warning(
                        "ERROR: '%s' already in full_tydi_pred_dict!", key)
                full_tydi_pred_dict[key] = value

        logging.info("Prediction finished for all shards.")
        logging.info("  Total input examples: %d", total_num_examples)
        logging.info("  Total output predictions: %d",
                     len(full_tydi_pred_dict))

        with tf.gfile.Open(FLAGS.output_prediction_file, "w") as output_file:
            for prediction in full_tydi_pred_dict.values():
                output_file.write((json.dumps(prediction) + "\n").encode())
Example #7
    def model_fn(features, labels, mode, params):

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (loss, per_example_loss, logits,
         predicted_labels) = create_model(bert_config, is_training, features,
                                          num_labels)

        # Used for initializing the BERT model from the checkpoint
        tvars = tf.trainable_variables()
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            tf.logging.info("**** Loading from checkpoint ****")

        output_spec = None

        if mode == tf.estimator.ModeKeys.TRAIN:

            # learning rate: linear warmup with exponential decay
            global_step = tf.train.get_or_create_global_step()
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step,
                                                       2000,
                                                       0.9,
                                                       staircase=True)

            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = starter_learning_rate * warmup_percent_done
            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

            # Adam optimizer with correct L2 weight decay
            optimizer = optimization.AdamWeightDecayOptimizer(
                learning_rate=learning_rate,
                weight_decay_rate=0.01,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 2.0)
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 global_step=global_step)
            new_global_step = global_step + 1
            train_op = tf.group(train_op,
                                [global_step.assign(new_global_step)])

            tf.summary.scalar('learning_rate', learning_rate)
            logging_hook = tf.train.LoggingTensorHook({"batch_loss": loss},
                                                      every_n_iter=10)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                train_op=train_op,
                training_hooks=[logging_hook])

        else:

            # metric_fn uses tf_metrics to calculate f1, precision and recall
            # for the punctuation classes
            def metric_fn(label_ids, predicted_labels, input_mask, num_labels):

                label_ids = tf.boolean_mask(label_ids, input_mask)
                predicted_labels = tf.boolean_mask(predicted_labels,
                                                   input_mask)

                precision = tf_metrics.precision(label_ids,
                                                 predicted_labels,
                                                 num_labels, [1, 2, 3],
                                                 average="macro")
                recall = tf_metrics.recall(label_ids,
                                           predicted_labels,
                                           num_labels, [1, 2, 3],
                                           average="macro")
                f1 = tf_metrics.f1(label_ids,
                                   predicted_labels,
                                   num_labels, [1, 2, 3],
                                   average="macro")

                return {
                    "eval_precision": precision,
                    "eval_recall": recall,
                    "eval_f": f1
                }

            input_mask = features["input_mask"]
            label_ids = features["label_ids"]
            eval_metrics = metric_fn(label_ids, predicted_labels, input_mask,
                                     num_labels)

            if mode == tf.estimator.ModeKeys.EVAL:
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metrics)
            else:

                predictions = {'labels': predicted_labels}
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, predictions=predictions)

        return output_spec
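This `model_fn` closes over several names (`bert_config`, `num_labels`, `init_checkpoint`, `starter_learning_rate`, `num_warmup_steps`), so it presumably lives inside a builder along these lines (a sketch, not the author's code):

def model_fn_builder(bert_config, num_labels, init_checkpoint,
                     starter_learning_rate, num_warmup_steps):

    def model_fn(features, labels, mode, params):
        ...  # body as in Example #7 above

    return model_fn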
Example #8
if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)
optimizer = optimization.AdamWeightDecayOptimizer(
    learning_rate=learning_rate,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(  # inside this call you can control whether the BERT parameters are trained
    zip(grads, tvars), global_step=global_step)
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])

# train_op = tf.train.AdamOptimizer(lr).minimize(loss)  # this is the conventional optimization

# where the BERT model parameters are initialized
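As the comment on `apply_gradients` above hints, a common way to control whether the BERT parameters are trained is to filter them out of `tvars` before computing gradients, as in Example #3. A sketch, assuming the BERT variables live under the `bert/` scope:

# Train only the task-specific head: exclude BERT variables (assumes "bert/" scope).
tvars = [v for v in tf.trainable_variables() if not v.name.startswith("bert/")]
grads = tf.gradients(loss, tvars)
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)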